forked from GitHub/gf-core
A Russian dictionary
A Russian dictionary generated from a wordlist created by the FreeLing project. The accompanying converter can be used to convert other wordlists in EAGLES format to GF grammars.
This commit is contained in:
28
contrib/eaglesconv/CollectLemmas.hs
Normal file
28
contrib/eaglesconv/CollectLemmas.hs
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
-- Copyright (C) 2011 Nikita Frolov
|
||||||
|
|
||||||
|
import qualified Data.Text as T
|
||||||
|
import qualified Data.Text.IO as UTF8
|
||||||
|
import System.IO
|
||||||
|
import System.Environment
|
||||||
|
import Control.Monad
|
||||||
|
import Control.Monad.State
|
||||||
|
|
||||||
|
main :: IO ()
|
||||||
|
main = do
|
||||||
|
args <- getArgs
|
||||||
|
forM_ args $ \ f -> do
|
||||||
|
entries <- UTF8.readFile f >>= (return . T.lines)
|
||||||
|
forM_ entries $ \ entry ->
|
||||||
|
do
|
||||||
|
let ws = T.words entry
|
||||||
|
form = head ws
|
||||||
|
tags = toPairs $ tail ws
|
||||||
|
forM_ tags $ \ (lemma, tag) ->
|
||||||
|
do
|
||||||
|
UTF8.putStrLn $ T.concat [lemma, sp, form, sp, tag]
|
||||||
|
where sp = T.singleton ' '
|
||||||
|
|
||||||
|
|
||||||
|
toPairs xs = zip (stride 2 xs) (stride 2 (drop 1 xs))
|
||||||
|
where stride _ [] = []
|
||||||
|
stride n (x:xs) = x : stride n (drop (n-1) xs)
|
||||||
135
contrib/eaglesconv/EaglesConv.hs
Normal file
135
contrib/eaglesconv/EaglesConv.hs
Normal file
@@ -0,0 +1,135 @@
|
|||||||
|
-- Copyright (C) 2011 Nikita Frolov
|
||||||
|
|
||||||
|
-- No, we can't pipeline parsing and generation, because there is no guarantee
|
||||||
|
-- that we have collected all forms for a lemma before we've scanned the
|
||||||
|
-- complete file.
|
||||||
|
|
||||||
|
import qualified Data.Text as T
|
||||||
|
import qualified Data.Text.IO as UTF8
|
||||||
|
import System.IO
|
||||||
|
import System.Environment
|
||||||
|
import Control.Monad
|
||||||
|
import Control.Monad.State
|
||||||
|
import qualified Data.Map as M
|
||||||
|
import Codec.Text.IConv
|
||||||
|
import qualified Data.ByteString.Lazy as BS
|
||||||
|
import qualified Data.ByteString.Internal as BSI
|
||||||
|
|
||||||
|
import EaglesMatcher
|
||||||
|
|
||||||
|
type Lemmas = M.Map T.Text Forms
|
||||||
|
|
||||||
|
main :: IO ()
|
||||||
|
main = do
|
||||||
|
args <- getArgs
|
||||||
|
forM_ args $ \ f -> do
|
||||||
|
entries <- UTF8.readFile f >>= (return . T.lines)
|
||||||
|
lemmas <- return $ execState (collectLemmas entries) (M.empty :: Lemmas)
|
||||||
|
mapM_ generateLin (M.assocs lemmas)
|
||||||
|
|
||||||
|
collectLemmas entries = do
|
||||||
|
forM_ entries $ \ entry -> do
|
||||||
|
let ws = T.words entry
|
||||||
|
lemma = head ws
|
||||||
|
tags = toPairs $ tail ws
|
||||||
|
lemmas <- get
|
||||||
|
forM_ tags $ \ (form, tag) -> do
|
||||||
|
let forms = (case M.lookup lemma lemmas of
|
||||||
|
Just f -> f
|
||||||
|
Nothing -> M.empty) :: Forms
|
||||||
|
if isOpenCat . T.unpack $ tag
|
||||||
|
then put $ M.insert lemma (M.insert tag form forms) lemmas
|
||||||
|
else return ()
|
||||||
|
|
||||||
|
generateLin :: (T.Text, Forms) -> IO ()
|
||||||
|
generateLin (lemma, forms) = do
|
||||||
|
let lemma' = myVeryOwnCyrillicRomanizationIConvSucks lemma
|
||||||
|
UTF8.putStr $ T.concat [T.pack "lin ", lemma']
|
||||||
|
UTF8.putStr $ case T.unpack . head . M.keys $ forms of
|
||||||
|
('N':_:_:_:g:a:'0':_) ->
|
||||||
|
T.concat $ [T.pack "_N = mkN "]
|
||||||
|
++ map (quote . noun forms) [ ('N','S'), ('G','S')
|
||||||
|
, ('D','S'), ('F','S'), ('C','S'), ('O','S')
|
||||||
|
, ('L','S'), ('N','P'), ('G','P'), ('D','P')
|
||||||
|
, ('F','P'), ('C','P'), ('O','P') ]
|
||||||
|
++ [showG g, sp, showAni a, ln]
|
||||||
|
('N':_:c:n:g:a:_) ->
|
||||||
|
T.concat $ [T.pack "_PN = mkPN "
|
||||||
|
, quote $ noun forms ('N', 'S')
|
||||||
|
, showG g, sp
|
||||||
|
, showN n, sp, showAni a, ln]
|
||||||
|
('A':_) ->
|
||||||
|
T.concat $ [T.pack "_A = mkA ", quote $ adj forms 'P',
|
||||||
|
if adj forms 'P' /= adj forms 'C'
|
||||||
|
then quote $ adj forms 'C'
|
||||||
|
else T.pack ""
|
||||||
|
, ln]
|
||||||
|
('V':t) ->
|
||||||
|
let a = case t of
|
||||||
|
(_:_:_:_:'P':_:a':_) -> a'
|
||||||
|
(_:_:_:_:_:a':_) -> a'
|
||||||
|
in
|
||||||
|
T.concat $ [T.pack "_V = mkV ", showAsp a, sp]
|
||||||
|
++ map (quote . verbPres forms) [ ('S','1'), ('S','2')
|
||||||
|
, ('S','3'), ('P','1')
|
||||||
|
, ('P','2'), ('P','3')]
|
||||||
|
++ [ quote $ verbPast forms ('S', 'M')
|
||||||
|
, quote $ verbImp forms, quote $ verbInf forms, ln]
|
||||||
|
('D':_) ->
|
||||||
|
T.concat $ [T.pack "_Adv = mkAdv "
|
||||||
|
, quote . adv $ forms, ln]
|
||||||
|
putStrLn ""
|
||||||
|
hFlush stdout
|
||||||
|
where quote x = T.concat [T.pack "\"", x, T.pack "\" "]
|
||||||
|
showG 'F' = T.pack "Fem"
|
||||||
|
showG 'A' = T.pack "Neut"
|
||||||
|
showG _ = T.pack "Masc"
|
||||||
|
showAni 'I' = T.pack "Inanimate"
|
||||||
|
showAni _ = T.pack "Animate"
|
||||||
|
showN 'P' = T.pack "Pl"
|
||||||
|
showN _ = T.pack "Sg"
|
||||||
|
showAsp 'F' = T.pack "Perfective"
|
||||||
|
showAsp _ = T.pack "Imperfective"
|
||||||
|
sp = T.singleton ' '
|
||||||
|
ln = T.pack " ;"
|
||||||
|
|
||||||
|
toPairs xs = zip (stride 2 xs) (stride 2 (drop 1 xs))
|
||||||
|
where stride _ [] = []
|
||||||
|
stride n (x:xs) = x : stride n (drop (n-1) xs)
|
||||||
|
|
||||||
|
myVeryOwnCyrillicRomanizationIConvSucks s = T.pack . concatMap r . T.unpack $ s
|
||||||
|
where r 'а' = "a"
|
||||||
|
r 'б' = "b"
|
||||||
|
r 'в' = "v"
|
||||||
|
r 'г' = "g"
|
||||||
|
r 'д' = "d"
|
||||||
|
r 'е' = "je"
|
||||||
|
r 'ё' = "jo"
|
||||||
|
r 'ж' = "zh"
|
||||||
|
r 'з' = "z"
|
||||||
|
r 'и' = "i"
|
||||||
|
r 'й' = "jj"
|
||||||
|
r 'к' = "k"
|
||||||
|
r 'л' = "l"
|
||||||
|
r 'м' = "m"
|
||||||
|
r 'н' = "n"
|
||||||
|
r 'о' = "o"
|
||||||
|
r 'п' = "p"
|
||||||
|
r 'р' = "r"
|
||||||
|
r 'с' = "s"
|
||||||
|
r 'т' = "t"
|
||||||
|
r 'у' = "u"
|
||||||
|
r 'ф' = "f"
|
||||||
|
r 'х' = "kh"
|
||||||
|
r 'ц' = "c"
|
||||||
|
r 'ч' = "ch"
|
||||||
|
r 'ш' = "sh"
|
||||||
|
r 'щ' = "shc"
|
||||||
|
r 'ъ' = "yy"
|
||||||
|
r 'ы' = "y"
|
||||||
|
r 'ь' = "q"
|
||||||
|
r 'э' = "e"
|
||||||
|
r 'ю' = "ju"
|
||||||
|
r 'я' = "ja"
|
||||||
|
r '-' = "_"
|
||||||
|
r o = [o]
|
||||||
63
contrib/eaglesconv/EaglesMatcher.hs
Normal file
63
contrib/eaglesconv/EaglesMatcher.hs
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
-- Copyright (C) 2011 Nikita Frolov
|
||||||
|
|
||||||
|
-- The format specification can be found at
|
||||||
|
-- http://devel.cpl.upc.edu/freeling/svn/trunk/doc/tagsets/tagset-ru.html
|
||||||
|
|
||||||
|
-- Bugs in the specification:
|
||||||
|
-- Participle, 2nd field: case, not mood
|
||||||
|
-- Participle, 6th field: field, not person
|
||||||
|
-- Verb, persons can be denoted both with 'Pnumber' or just 'number'
|
||||||
|
-- Noun, 10th field can be absent
|
||||||
|
|
||||||
|
-- No, it wouldn't be simpler to implement this grammar with Parsec or another
|
||||||
|
-- parser combinator library.
|
||||||
|
|
||||||
|
|
||||||
|
module EaglesMatcher where
|
||||||
|
|
||||||
|
import qualified Data.Text as T
|
||||||
|
import Data.List
|
||||||
|
import qualified Data.Map as M
|
||||||
|
|
||||||
|
type Forms = M.Map T.Text T.Text
|
||||||
|
|
||||||
|
isOpenCat ('A':_) = True
|
||||||
|
isOpenCat ('N':_) = True
|
||||||
|
isOpenCat ('V':_) = True
|
||||||
|
isOpenCat ('D':_) = True
|
||||||
|
isOpenCat _ = False
|
||||||
|
|
||||||
|
noun forms (c, n) = findForm (matchNoun . T.unpack) forms
|
||||||
|
where matchNoun ('N':_:c':n':_) = c == c' && n == n'
|
||||||
|
matchNoun _ = False
|
||||||
|
|
||||||
|
adj forms d = findForm (matchAdj . T.unpack) forms
|
||||||
|
where matchAdj ('A':'N':'S':'M':_:'F':d':_) = d == d
|
||||||
|
matchAdj _ = False
|
||||||
|
|
||||||
|
verbPres forms (n, p) = findForm (matchPres . T.unpack) forms
|
||||||
|
where matchPres ('V':'D':n':_:'P':'P':p':_:'A':_) = n == n' && p == p'
|
||||||
|
matchPres ('V':'D':n':_:'F':'P':p':_:'A':_) = n == n' && p == p'
|
||||||
|
matchPres ('V':'D':n':_:'P':'P':p':_) = n == n' && p == p'
|
||||||
|
matchPres ('V':'D':n':_:'F':'P':p':_) = n == n' && p == p'
|
||||||
|
matchPres _ = False
|
||||||
|
|
||||||
|
verbPast forms (n, g) = findForm (matchPast . T.unpack) forms
|
||||||
|
where matchPast ('V':'D':n':g':'S':_:_:'A':_) = n == n' && g == g'
|
||||||
|
matchPast _ = False
|
||||||
|
|
||||||
|
verbImp forms = findForm (matchImp . T.unpack) forms
|
||||||
|
where matchImp ('V':'M':_) = True
|
||||||
|
matchImp _ = False
|
||||||
|
|
||||||
|
verbInf forms = findForm (matchInf . T.unpack) forms
|
||||||
|
where matchInf ('V':'I':_) = True
|
||||||
|
matchInf _ = False
|
||||||
|
|
||||||
|
adv forms = findForm (matchAdv . T.unpack) forms
|
||||||
|
where matchAdv ('D':d:_) = d == 'P'
|
||||||
|
matchAdv _ = False
|
||||||
|
|
||||||
|
findForm match forms = case find match (M.keys forms) of
|
||||||
|
Just tag -> forms M.! tag
|
||||||
|
Nothing -> findForm (\ _ -> True) forms
|
||||||
239
contrib/eaglesconv/EaglesParser.hs
Normal file
239
contrib/eaglesconv/EaglesParser.hs
Normal file
@@ -0,0 +1,239 @@
|
|||||||
|
-- Copyright (C) 2011 Nikita Frolov
|
||||||
|
|
||||||
|
-- An early version of the parser that requires somewhat more memory. Kept for
|
||||||
|
-- nostalgic reasons.
|
||||||
|
|
||||||
|
module EaglesParser where
|
||||||
|
|
||||||
|
import qualified Data.Text as T
|
||||||
|
import Data.List
|
||||||
|
import qualified Data.Map as M
|
||||||
|
|
||||||
|
type Forms = M.Map Tag T.Text
|
||||||
|
|
||||||
|
data Tag = A Case Number Gender Animacy Form Degree Extra Obscene
|
||||||
|
| Adv Degree Extra Obscene
|
||||||
|
| AdvPron Extra
|
||||||
|
| Ord Case Number Gender Animacy
|
||||||
|
| AdjPron Case Number Gender Animacy Extra
|
||||||
|
| Frag Extra
|
||||||
|
| Conj Extra
|
||||||
|
| Inter Extra Obscene
|
||||||
|
| Num Case Number Gender Animacy Extra
|
||||||
|
| Part Extra
|
||||||
|
| Prep Extra
|
||||||
|
| N Case Number Gender Animacy Name Extra Obscene
|
||||||
|
| Pron Case Number Gender Animacy Extra
|
||||||
|
| V Mood Number Gender Tense Person Aspect Voice Trans Extra Obscene
|
||||||
|
| P Case Number Gender Tense Form Aspect Voice Trans Extra Obscene
|
||||||
|
deriving (Show, Ord, Eq)
|
||||||
|
|
||||||
|
parseTag :: T.Text -> Tag
|
||||||
|
parseTag tag = case (T.unpack tag) of {
|
||||||
|
('A':c:n:g:a:f:cmp:e:o:[]) -> A (readCase c) (readNumber n)
|
||||||
|
(readGender g) (readAnimacy a)
|
||||||
|
(readForm f) (readDegree cmp)
|
||||||
|
(readExtra e) (readObscene o) ;
|
||||||
|
('D':cmp:e:o:[]) -> Adv (readDegree cmp)
|
||||||
|
(readExtra e) (readObscene o) ;
|
||||||
|
('P':e:[]) -> AdvPron (readExtra e) ;
|
||||||
|
('Y':c:n:g:a:[]) -> Ord (readCase c) (readNumber n)
|
||||||
|
(readGender g) (readAnimacy a) ;
|
||||||
|
('R':c:n:g:a:e:[]) -> AdjPron (readCase c) (readNumber n)
|
||||||
|
(readGender g) (readAnimacy a) (readExtra e) ;
|
||||||
|
('M':e:[]) -> Frag (readExtra e) ;
|
||||||
|
('C':e:[]) -> Conj (readExtra e) ;
|
||||||
|
('J':e:o:[]) -> Inter (readExtra e) (readObscene o) ;
|
||||||
|
('Z':c:n:g:a:e:[]) -> Num (readCase c) (readNumber n)
|
||||||
|
(readGender g) (readAnimacy a) (readExtra e) ;
|
||||||
|
('T':e:[]) -> Part (readExtra e) ;
|
||||||
|
('B':e:[]) -> Prep (readExtra e) ;
|
||||||
|
('N':_:c:n:g:a:name:e:o:_:[]) -> N (readCase c) (readNumber n)
|
||||||
|
(readGender g) (readAnimacy a)
|
||||||
|
(readName name)
|
||||||
|
(readExtra e) (readObscene o) ;
|
||||||
|
('N':_:c:n:g:a:name:e:o:[]) -> N (readCase c) (readNumber n)
|
||||||
|
(readGender g) (readAnimacy a)
|
||||||
|
(readName name)
|
||||||
|
(readExtra e) (readObscene o) ;
|
||||||
|
('E':c:n:g:a:e:[]) -> Pron (readCase c) (readNumber n)
|
||||||
|
(readGender g) (readAnimacy a) (readExtra e) ;
|
||||||
|
('V':m:n:g:t:'P':p:a:v:tr:e:o:[]) -> V (readMood m) (readNumber n)
|
||||||
|
(readGender g) (readTense t)
|
||||||
|
(readPerson p) (readAspect a)
|
||||||
|
(readVoice v) (readTrans tr)
|
||||||
|
(readExtra e) (readObscene o) ;
|
||||||
|
('V':m:n:g:t:'0':a:v:tr:e:o:[]) -> V (readMood m) (readNumber n)
|
||||||
|
(readGender g) (readTense t)
|
||||||
|
NP (readAspect a)
|
||||||
|
(readVoice v) (readTrans tr)
|
||||||
|
(readExtra e) (readObscene o) ;
|
||||||
|
('V':m:n:g:t:p:a:v:tr:e:o:[]) -> V (readMood m) (readNumber n)
|
||||||
|
(readGender g) (readTense t)
|
||||||
|
(readPerson p) (readAspect a)
|
||||||
|
(readVoice v) (readTrans tr)
|
||||||
|
(readExtra e) (readObscene o) ;
|
||||||
|
('Q':c:n:g:t:f:a:v:tr:e:o:[]) -> P (readCase c) (readNumber n)
|
||||||
|
(readGender g) (readTense t)
|
||||||
|
(readForm f) (readAspect a)
|
||||||
|
(readVoice v) (readTrans tr)
|
||||||
|
(readExtra e) (readObscene o) ;
|
||||||
|
_ -> error $ "Parse error: " ++ T.unpack tag }
|
||||||
|
|
||||||
|
data Case = Nom | Gen | Dat | Acc | Inst | Prepos | Partit | Loc | Voc | NC
|
||||||
|
deriving (Show, Ord, Eq)
|
||||||
|
|
||||||
|
readCase 'N' = Nom
|
||||||
|
readCase 'G' = Gen
|
||||||
|
readCase 'D' = Dat
|
||||||
|
readCase 'F' = Acc
|
||||||
|
readCase 'C' = Inst
|
||||||
|
readCase 'O' = Prepos
|
||||||
|
readCase 'P' = Partit
|
||||||
|
readCase 'L' = Loc
|
||||||
|
readCase 'V' = Voc
|
||||||
|
readCase '0' = NC
|
||||||
|
|
||||||
|
data Number = Sg | Pl | NN deriving (Show, Ord, Eq)
|
||||||
|
|
||||||
|
readNumber 'S' = Sg
|
||||||
|
readNumber 'P' = Pl
|
||||||
|
readNumber '0' = NN
|
||||||
|
|
||||||
|
data Gender = Masc | Fem | Neut | Common | NG deriving (Show, Ord, Eq)
|
||||||
|
|
||||||
|
readGender 'F' = Fem
|
||||||
|
readGender 'M' = Masc
|
||||||
|
readGender 'A' = Neut
|
||||||
|
readGender 'C' = Common
|
||||||
|
readGender '0' = NG
|
||||||
|
|
||||||
|
data Animacy = Animate | Inanimate | NA deriving (Show, Ord, Eq)
|
||||||
|
|
||||||
|
readAnimacy 'A' = Animate
|
||||||
|
readAnimacy 'I' = Inanimate
|
||||||
|
readAnimacy '0' = NA
|
||||||
|
|
||||||
|
data Form = Short | Full | NF deriving (Show, Ord, Eq)
|
||||||
|
|
||||||
|
readForm 'S' = Short
|
||||||
|
readForm 'F' = Full
|
||||||
|
readForm '0' = NF
|
||||||
|
|
||||||
|
data Degree = Pos | Comp | Super | ND deriving (Show, Ord, Eq)
|
||||||
|
|
||||||
|
readDegree 'E' = Super
|
||||||
|
readDegree 'C' = Comp
|
||||||
|
readDegree 'P' = Pos
|
||||||
|
readDegree '0' = ND
|
||||||
|
|
||||||
|
data Extra = Introductory | Difficult | Distorted | Predicative
|
||||||
|
| Colloquial | Rare | Abbreviation | Obsolete | NE deriving (Show, Ord, Eq)
|
||||||
|
|
||||||
|
readExtra 'P' = Introductory
|
||||||
|
readExtra 'D' = Difficult
|
||||||
|
readExtra 'V' = Distorted
|
||||||
|
readExtra 'R' = Predicative
|
||||||
|
readExtra 'I' = Colloquial
|
||||||
|
readExtra 'A' = Rare
|
||||||
|
readExtra 'B' = Abbreviation
|
||||||
|
readExtra 'E' = Obsolete
|
||||||
|
readExtra '0' = NE
|
||||||
|
|
||||||
|
data Obscene = Obscene | NO deriving (Show, Ord, Eq)
|
||||||
|
|
||||||
|
readObscene 'H' = Obscene
|
||||||
|
readObscene '0' = NO
|
||||||
|
|
||||||
|
data Name = Topo | Proper | Patro | Family | NNa deriving (Show, Ord, Eq)
|
||||||
|
|
||||||
|
readName 'G' = Topo
|
||||||
|
readName 'N' = Proper
|
||||||
|
readName 'S' = Patro
|
||||||
|
readName 'F' = Family
|
||||||
|
readName '0' = NNa
|
||||||
|
|
||||||
|
data Mood = Gerund | Inf | Ind | Imp | NM deriving (Show, Ord, Eq)
|
||||||
|
|
||||||
|
readMood 'G' = Gerund
|
||||||
|
readMood 'I' = Inf
|
||||||
|
readMood 'D' = Ind
|
||||||
|
readMood 'M' = Imp
|
||||||
|
readMood '0' = NM
|
||||||
|
|
||||||
|
data Tense = Pres | Fut | Past | NT deriving (Show, Ord, Eq)
|
||||||
|
|
||||||
|
readTense 'P' = Pres
|
||||||
|
readTense 'F' = Fut
|
||||||
|
readTense 'S' = Past
|
||||||
|
readTense '0' = NT
|
||||||
|
|
||||||
|
data Person = P1 | P2 | P3 | NP deriving (Show, Ord, Eq)
|
||||||
|
|
||||||
|
readPerson '1' = P1
|
||||||
|
readPerson '2' = P2
|
||||||
|
readPerson '3' = P3
|
||||||
|
|
||||||
|
data Aspect = Perf | Imperf | NAs deriving (Show, Ord, Eq)
|
||||||
|
|
||||||
|
readAspect 'F' = Perf
|
||||||
|
readAspect 'N' = Imperf
|
||||||
|
readAspect '0' = NAs
|
||||||
|
|
||||||
|
data Voice = Act | Pass | NV deriving (Show, Ord, Eq)
|
||||||
|
|
||||||
|
readVoice 'A' = Act
|
||||||
|
readVoice 'S' = Pass
|
||||||
|
readVoice '0' = NV
|
||||||
|
|
||||||
|
data Trans = Trans | Intrans | NTr deriving (Show, Ord, Eq)
|
||||||
|
|
||||||
|
readTrans 'M' = Trans
|
||||||
|
readTrans 'A' = Intrans
|
||||||
|
readTrans '0' = NTr
|
||||||
|
|
||||||
|
isOpenCat :: Tag -> Bool
|
||||||
|
isOpenCat (A _ _ _ _ _ _ _ _) = True
|
||||||
|
isOpenCat (N _ _ _ _ _ _ _) = True
|
||||||
|
isOpenCat (V _ _ _ _ _ _ _ _ _ _) = True
|
||||||
|
isOpenCat (Adv _ _ _) = True
|
||||||
|
isOpenCat _ = False
|
||||||
|
|
||||||
|
noun :: Forms -> (Case, Number) -> T.Text
|
||||||
|
noun forms (c, n) = findForm matchNoun forms
|
||||||
|
where matchNoun (N c' n' _ _ _ _ _) = c == c' && n == n'
|
||||||
|
matchNoun _ = False
|
||||||
|
|
||||||
|
adj :: Forms -> Degree -> T.Text
|
||||||
|
adj forms d = findForm matchAdj forms
|
||||||
|
where matchAdj (A _ _ _ _ _ d' _ _) = d == d
|
||||||
|
matchAdj _ = False
|
||||||
|
|
||||||
|
verbPres :: Forms -> (Number, Person) -> T.Text
|
||||||
|
verbPres forms (n, p) = findForm matchPres forms
|
||||||
|
where matchPres (V Ind n' _ Pres p' _ Act _ _ _) = n == n' && p == p'
|
||||||
|
matchPres _ = False
|
||||||
|
|
||||||
|
verbPast :: Forms -> (Number, Gender) -> T.Text
|
||||||
|
verbPast forms (n, g) = findForm matchPast forms
|
||||||
|
where matchPast (V Ind n' g' Past _ _ Act _ _ _) = n == n' && g == g'
|
||||||
|
matchPast _ = False
|
||||||
|
|
||||||
|
verbImp :: Forms -> T.Text
|
||||||
|
verbImp forms = findForm matchImp forms
|
||||||
|
where matchImp (V Imp _ _ _ _ _ _ _ _ _) = True
|
||||||
|
matchImp _ = False
|
||||||
|
|
||||||
|
verbInf :: Forms -> T.Text
|
||||||
|
verbInf forms = findForm matchInf forms
|
||||||
|
where matchInf (V Inf _ _ _ _ _ _ _ _ _) = True
|
||||||
|
matchInf _ = False
|
||||||
|
|
||||||
|
adv :: Forms -> T.Text
|
||||||
|
adv forms = findForm matchAdv forms
|
||||||
|
where matchAdv (Adv d _ _) = d == Pos
|
||||||
|
matchAdv _ = False
|
||||||
|
|
||||||
|
findForm match forms = case find match (M.keys forms) of
|
||||||
|
Just tag -> forms M.! tag
|
||||||
|
Nothing -> findForm (\ _ -> True) forms
|
||||||
24
contrib/eaglesconv/README
Normal file
24
contrib/eaglesconv/README
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
How to use:
|
||||||
|
|
||||||
|
1) Sort the wordlist so it can be split into sublists. It is necessary because
|
||||||
|
the converter is quite memory-hungry, and you might not have enough RAM to
|
||||||
|
process the whole wordlist at once.
|
||||||
|
|
||||||
|
./CollectLemmas dicc.src | sort > lemmas.src
|
||||||
|
|
||||||
|
2) Split the sorted wordlist.
|
||||||
|
|
||||||
|
split -l 500000 lemmas.src
|
||||||
|
|
||||||
|
3) Splitting has probably left forms of some lemmas spread across two
|
||||||
|
sublists. Manually edit sublists so all forms for a lemma are present in just
|
||||||
|
one sublist.
|
||||||
|
|
||||||
|
4) Run the converter.
|
||||||
|
|
||||||
|
./run_conv.sh xa*
|
||||||
|
|
||||||
|
5) The converter has produced abstract and concrete syntaxes for the
|
||||||
|
dictionary. You can try them out with GF:
|
||||||
|
|
||||||
|
gf DictRus.gf
|
||||||
8
contrib/eaglesconv/mkAbstract.sh
Normal file
8
contrib/eaglesconv/mkAbstract.sh
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
|
||||||
|
echo "abstract DictRusAbs = Cat ** {
|
||||||
|
"
|
||||||
|
cat $1 | sed 's/^lin/fun/g;s/=.*$//g;s/\_N/\_N : N\;/g;s/\_PN/\_PN : PN\;/g;s/\_A /\_A : A\;/g;s/\_V/\_V : V\;/g;s/\_Adv/\_Adv : Adv\;/g'
|
||||||
|
|
||||||
|
echo "
|
||||||
|
}"
|
||||||
12
contrib/eaglesconv/mkConcrete.sh
Normal file
12
contrib/eaglesconv/mkConcrete.sh
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
|
||||||
|
echo "--# -path=.:../prelude:../abstract:../common
|
||||||
|
|
||||||
|
concrete DictRus of DictRusAbs = CatRus **
|
||||||
|
open ParadigmsRus, Prelude, StructuralRus, MorphoRus in {
|
||||||
|
flags
|
||||||
|
optimize=values ;
|
||||||
|
coding=utf8 ;
|
||||||
|
"
|
||||||
|
cat $1
|
||||||
|
echo "}"
|
||||||
4
contrib/eaglesconv/run_conv.sh
Normal file
4
contrib/eaglesconv/run_conv.sh
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
./EaglesConv "$@" +RTS -K256M -RTS > convtmp
|
||||||
|
./mkConcrete.sh convtmp > DictRus.gf
|
||||||
|
./mkAbstract.sh convtmp > DictRusAbs.gf
|
||||||
135830
lib/src/russian/DictRus.gf
Normal file
135830
lib/src/russian/DictRus.gf
Normal file
File diff suppressed because it is too large
Load Diff
135825
lib/src/russian/DictRusAbs.gf
Normal file
135825
lib/src/russian/DictRusAbs.gf
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user