A Russian dictionary

A Russian dictionary generated from a wordlist created by the FreeLing project. The accompanying converter can be used to convert other wordlists in EAGLES format to GF grammars.
2011-12-31 02:36:24 +00:00
parent 4bf634ffcd
commit 2ff1d34c86
10 changed files with 272168 additions and 0 deletions
--- a/contrib/eaglesconv/CollectLemmas.hs
+++ b/contrib/eaglesconv/CollectLemmas.hs
@@ -0,0 +1,28 @@
 -- Copyright (C) 2011 Nikita Frolov
 import qualified Data.Text as T
 import qualified Data.Text.IO as UTF8
 import System.IO
 import System.Environment
 import Control.Monad
 import Control.Monad.State
 main :: IO ()
 main = do
  args <- getArgs
  forM_ args $ \ f -> do
    entries <- UTF8.readFile f >>= (return . T.lines)
    forM_ entries $ \ entry ->
       do
         let ws = T.words entry
             form = head ws
             tags = toPairs $ tail ws
         forM_ tags $ \ (lemma, tag) ->
             do
               UTF8.putStrLn $ T.concat [lemma, sp, form, sp, tag]
                   where sp = T.singleton ' '
 toPairs xs = zip (stride 2 xs) (stride 2 (drop 1 xs))
    where stride _ [] = []
          stride n (x:xs) = x : stride n (drop (n-1) xs)
--- a/contrib/eaglesconv/EaglesConv.hs
+++ b/contrib/eaglesconv/EaglesConv.hs
@@ -0,0 +1,135 @@
 -- Copyright (C) 2011 Nikita Frolov
 -- No, we can't pipeline parsing and generation, because there is no guarantee
 -- that we have collected all forms for a lemma before we've scanned the
 -- complete file.
 import qualified Data.Text as T
 import qualified Data.Text.IO as UTF8
 import System.IO
 import System.Environment
 import Control.Monad
 import Control.Monad.State
 import qualified Data.Map as M
 import Codec.Text.IConv
 import qualified Data.ByteString.Lazy as BS
 import qualified Data.ByteString.Internal as BSI
 import EaglesMatcher
 type Lemmas = M.Map T.Text Forms
 main :: IO ()
 main = do
  args <- getArgs
  forM_ args $ \ f -> do
    entries <- UTF8.readFile f >>= (return . T.lines)
    lemmas <- return $ execState (collectLemmas entries) (M.empty :: Lemmas)
    mapM_ generateLin (M.assocs lemmas)
 collectLemmas entries = do
  forM_ entries $ \ entry -> do
    let ws = T.words entry
        lemma = head ws
        tags = toPairs $ tail ws
    lemmas <- get
    forM_ tags $ \ (form, tag) -> do
      let forms = (case M.lookup lemma lemmas of
                     Just f -> f
                     Nothing -> M.empty) :: Forms
      if isOpenCat . T.unpack $ tag
       then put $ M.insert lemma (M.insert tag form forms) lemmas
       else return ()
 generateLin :: (T.Text, Forms) -> IO ()
 generateLin (lemma, forms) = do
  let lemma' = myVeryOwnCyrillicRomanizationIConvSucks lemma
  UTF8.putStr $ T.concat [T.pack "lin ", lemma']
  UTF8.putStr $ case T.unpack . head . M.keys $ forms of
                  ('N':_:_:_:g:a:'0':_) ->
                      T.concat $ [T.pack "_N = mkN "]
                               ++ map (quote . noun forms) [ ('N','S'), ('G','S')
                               , ('D','S'), ('F','S'), ('C','S'), ('O','S')
                               , ('L','S'), ('N','P'), ('G','P'), ('D','P')
                               , ('F','P'), ('C','P'), ('O','P') ]
                           ++ [showG g, sp, showAni a, ln]
                  ('N':_:c:n:g:a:_) ->
                      T.concat $ [T.pack "_PN = mkPN "
                       , quote $ noun forms ('N', 'S')
                       , showG g, sp
                       , showN n, sp, showAni a, ln]
                  ('A':_) ->
                      T.concat $ [T.pack "_A = mkA ", quote $ adj forms 'P',
                                   if adj forms 'P' /= adj forms 'C'
                                      then quote $ adj forms 'C'
                                      else T.pack ""
                                   , ln]
                  ('V':t) ->
                      let a = case t of
                                (_:_:_:_:'P':_:a':_) -> a'
                                (_:_:_:_:_:a':_) -> a'
                      in
                        T.concat $ [T.pack "_V = mkV ", showAsp a, sp]
                           ++ map (quote . verbPres forms) [ ('S','1'), ('S','2')
                                                           , ('S','3'), ('P','1')
                                                           , ('P','2'), ('P','3')]
                           ++ [ quote $ verbPast forms ('S', 'M')
                              , quote $ verbImp forms, quote $ verbInf forms, ln]
                  ('D':_) ->
                      T.concat $ [T.pack "_Adv = mkAdv "
                                 , quote . adv $ forms, ln]
  putStrLn ""
  hFlush stdout
      where quote x = T.concat [T.pack "\"", x, T.pack "\" "]
            showG 'F' = T.pack "Fem"
            showG 'A' = T.pack "Neut"
            showG _ = T.pack "Masc"
            showAni 'I' = T.pack "Inanimate"
            showAni _ = T.pack "Animate"
            showN 'P' = T.pack "Pl"
            showN _ = T.pack "Sg"
            showAsp 'F' = T.pack "Perfective"
            showAsp _ = T.pack "Imperfective"
            sp = T.singleton ' '
            ln = T.pack " ;"
 toPairs xs = zip (stride 2 xs) (stride 2 (drop 1 xs))
    where stride _ [] = []
          stride n (x:xs) = x : stride n (drop (n-1) xs)
 myVeryOwnCyrillicRomanizationIConvSucks s = T.pack . concatMap r . T.unpack $ s
    where r 'а' = "a"
          r 'б' = "b"
          r 'в' = "v"
          r 'г' = "g"
          r 'д' = "d"
          r 'е' = "je"
          r 'ё' = "jo"
          r 'ж' = "zh"
          r 'з' = "z"
          r 'и' = "i"
          r 'й' = "jj"
          r 'к' = "k"
          r 'л' = "l"
          r 'м' = "m"
          r 'н' = "n"
          r 'о' = "o"
          r 'п' = "p"
          r 'р' = "r"
          r 'с' = "s"
          r 'т' = "t"
          r 'у' = "u"
          r 'ф' = "f"
          r 'х' = "kh"
          r 'ц' = "c"
          r 'ч' = "ch"
          r 'ш' = "sh"
          r 'щ' = "shc"
          r 'ъ' = "yy"
          r 'ы' = "y"
          r 'ь' = "q"
          r 'э' = "e"
          r 'ю' = "ju"
          r 'я' = "ja"
          r '-' = "_"
          r o = [o]
--- a/contrib/eaglesconv/EaglesMatcher.hs
+++ b/contrib/eaglesconv/EaglesMatcher.hs
@@ -0,0 +1,63 @@
 -- Copyright (C) 2011 Nikita Frolov
 -- The format specification can be found at
 -- http://devel.cpl.upc.edu/freeling/svn/trunk/doc/tagsets/tagset-ru.html
 -- Bugs in the specification:
 -- Participle, 2nd field: case, not mood
 -- Participle, 6th field: field, not person
 -- Verb, persons can be denoted both with 'Pnumber' or just 'number'
 -- Noun, 10th field can be absent
 -- No, it wouldn't be simpler to implement this grammar with Parsec or another
 -- parser combinator library.
 module EaglesMatcher where
 import qualified Data.Text as T
 import Data.List
 import qualified Data.Map as M
 type Forms = M.Map T.Text T.Text
 isOpenCat ('A':_) = True
 isOpenCat ('N':_) = True
 isOpenCat ('V':_) = True
 isOpenCat ('D':_) = True
 isOpenCat _       = False
 noun forms (c, n) = findForm (matchNoun . T.unpack) forms
    where matchNoun ('N':_:c':n':_) = c == c' && n == n'
          matchNoun _ = False
 adj forms d = findForm (matchAdj . T.unpack) forms
    where matchAdj ('A':'N':'S':'M':_:'F':d':_) = d == d
          matchAdj _ = False
 verbPres forms (n, p) = findForm (matchPres . T.unpack) forms
    where matchPres ('V':'D':n':_:'P':'P':p':_:'A':_) = n == n' && p == p'
          matchPres ('V':'D':n':_:'F':'P':p':_:'A':_) = n == n' && p == p'
          matchPres ('V':'D':n':_:'P':'P':p':_) = n == n' && p == p'
          matchPres ('V':'D':n':_:'F':'P':p':_) = n == n' && p == p'
          matchPres _ = False
 verbPast forms (n, g) = findForm (matchPast . T.unpack) forms
    where matchPast ('V':'D':n':g':'S':_:_:'A':_) = n == n' && g == g'
          matchPast _ = False
 verbImp forms = findForm (matchImp . T.unpack) forms
    where matchImp ('V':'M':_) = True
          matchImp _ = False
 verbInf forms = findForm (matchInf . T.unpack) forms
    where matchInf ('V':'I':_) = True
          matchInf _ = False
 adv forms = findForm (matchAdv . T.unpack) forms
    where matchAdv ('D':d:_) = d == 'P'
          matchAdv _ = False
 findForm match forms = case find match (M.keys forms) of
                         Just tag -> forms M.! tag
                         Nothing -> findForm (\ _ -> True) forms
--- a/contrib/eaglesconv/EaglesParser.hs
+++ b/contrib/eaglesconv/EaglesParser.hs
@@ -0,0 +1,239 @@
 -- Copyright (C) 2011 Nikita Frolov
 -- An early version of the parser that requires somewhat more memory. Kept for
 -- nostalgic reasons.
 module EaglesParser where
 import qualified Data.Text as T
 import Data.List
 import qualified Data.Map as M
 type Forms = M.Map Tag T.Text
 data Tag = A Case Number Gender Animacy Form Degree Extra Obscene
    | Adv Degree Extra Obscene
    | AdvPron Extra
    | Ord Case Number Gender Animacy
    | AdjPron Case Number Gender Animacy Extra
    | Frag Extra
    | Conj Extra
    | Inter Extra Obscene
    | Num Case Number Gender Animacy Extra
    | Part Extra
    | Prep Extra
    | N Case Number Gender Animacy Name Extra Obscene
    | Pron Case Number Gender Animacy Extra
    | V Mood Number Gender Tense Person Aspect Voice Trans Extra Obscene
    | P Case Number Gender Tense Form Aspect Voice Trans Extra Obscene
      deriving (Show, Ord, Eq)
 parseTag :: T.Text -> Tag
 parseTag tag = case (T.unpack tag) of {
      ('A':c:n:g:a:f:cmp:e:o:[]) -> A (readCase c) (readNumber n)
                                   (readGender g) (readAnimacy a)
                                   (readForm f) (readDegree cmp)
                                   (readExtra e) (readObscene o) ;
      ('D':cmp:e:o:[]) -> Adv (readDegree cmp)
                         (readExtra e) (readObscene o) ;
      ('P':e:[]) -> AdvPron (readExtra e) ;
      ('Y':c:n:g:a:[]) -> Ord (readCase c) (readNumber n)
                         (readGender g) (readAnimacy a) ;
      ('R':c:n:g:a:e:[]) -> AdjPron (readCase c) (readNumber n)
                           (readGender g) (readAnimacy a) (readExtra e) ;
      ('M':e:[]) -> Frag (readExtra e) ;
      ('C':e:[]) -> Conj (readExtra e) ;
      ('J':e:o:[]) -> Inter (readExtra e) (readObscene o) ;
      ('Z':c:n:g:a:e:[]) -> Num (readCase c) (readNumber n)
                           (readGender g) (readAnimacy a) (readExtra e) ;
      ('T':e:[]) -> Part (readExtra e) ;
      ('B':e:[]) -> Prep (readExtra e) ;
      ('N':_:c:n:g:a:name:e:o:_:[]) -> N (readCase c) (readNumber n)
                                      (readGender g) (readAnimacy a)
                                      (readName name)
                                      (readExtra e) (readObscene o) ;
      ('N':_:c:n:g:a:name:e:o:[]) -> N (readCase c) (readNumber n)
                                      (readGender g) (readAnimacy a)
                                      (readName name)
                                      (readExtra e) (readObscene o) ;
      ('E':c:n:g:a:e:[]) -> Pron (readCase c) (readNumber n)
                          (readGender g) (readAnimacy a) (readExtra e) ;
      ('V':m:n:g:t:'P':p:a:v:tr:e:o:[]) -> V (readMood m) (readNumber n)
                                          (readGender g) (readTense t)
                                          (readPerson p) (readAspect a)
                                          (readVoice v) (readTrans tr)
                                          (readExtra e) (readObscene o) ;
      ('V':m:n:g:t:'0':a:v:tr:e:o:[]) -> V (readMood m) (readNumber n)
                                        (readGender g) (readTense t)
                                        NP (readAspect a)
                                        (readVoice v) (readTrans tr)
                                        (readExtra e) (readObscene o) ;
      ('V':m:n:g:t:p:a:v:tr:e:o:[]) -> V (readMood m) (readNumber n)
                                          (readGender g) (readTense t)
                                          (readPerson p) (readAspect a)
                                          (readVoice v) (readTrans tr)
                                          (readExtra e) (readObscene o) ;
      ('Q':c:n:g:t:f:a:v:tr:e:o:[]) -> P (readCase c) (readNumber n)
                                       (readGender g) (readTense t)
                                       (readForm f) (readAspect a)
                                       (readVoice v) (readTrans tr)
                                       (readExtra e) (readObscene o) ;
      _ -> error $ "Parse error: " ++ T.unpack tag }
 data Case = Nom | Gen | Dat | Acc | Inst | Prepos | Partit | Loc | Voc | NC
      deriving (Show, Ord, Eq)
 readCase 'N' = Nom
 readCase 'G' = Gen
 readCase 'D' = Dat
 readCase 'F' = Acc
 readCase 'C' = Inst
 readCase 'O' = Prepos
 readCase 'P' = Partit
 readCase 'L' = Loc
 readCase 'V' = Voc
 readCase '0' = NC
 data Number = Sg | Pl | NN deriving (Show, Ord, Eq)
 readNumber 'S' = Sg
 readNumber 'P' = Pl
 readNumber '0' = NN
 data Gender = Masc | Fem | Neut | Common | NG deriving (Show, Ord, Eq)
 readGender 'F' = Fem
 readGender 'M' = Masc
 readGender 'A' = Neut
 readGender 'C' = Common
 readGender '0' = NG
 data Animacy = Animate | Inanimate | NA deriving (Show, Ord, Eq)
 readAnimacy 'A' = Animate
 readAnimacy 'I' = Inanimate
 readAnimacy '0' = NA
 data Form = Short | Full | NF deriving (Show, Ord, Eq)
 readForm 'S' = Short
 readForm 'F' = Full
 readForm '0' = NF
 data Degree = Pos | Comp | Super | ND deriving (Show, Ord, Eq)
 readDegree 'E' = Super
 readDegree 'C' = Comp
 readDegree 'P' = Pos
 readDegree '0' = ND
 data Extra = Introductory | Difficult | Distorted | Predicative
           | Colloquial | Rare | Abbreviation | Obsolete | NE deriving (Show, Ord, Eq)
 readExtra 'P' = Introductory
 readExtra 'D' = Difficult
 readExtra 'V' = Distorted
 readExtra 'R' = Predicative
 readExtra 'I' = Colloquial
 readExtra 'A' = Rare
 readExtra 'B' = Abbreviation
 readExtra 'E' = Obsolete
 readExtra '0' = NE
 data Obscene = Obscene | NO deriving (Show, Ord, Eq)
 readObscene 'H' = Obscene
 readObscene '0' = NO
 data Name = Topo | Proper | Patro | Family | NNa deriving (Show, Ord, Eq)
 readName 'G' = Topo
 readName 'N' = Proper
 readName 'S' = Patro
 readName 'F' = Family
 readName '0' = NNa
 data Mood = Gerund | Inf | Ind | Imp | NM deriving (Show, Ord, Eq)
 readMood 'G' = Gerund
 readMood 'I' = Inf
 readMood 'D' = Ind
 readMood 'M' = Imp
 readMood '0' = NM
 data Tense = Pres | Fut | Past | NT deriving (Show, Ord, Eq)
 readTense 'P' = Pres
 readTense 'F' = Fut
 readTense 'S' = Past
 readTense '0' = NT
 data Person = P1 | P2 | P3 | NP deriving (Show, Ord, Eq)
 readPerson '1' = P1
 readPerson '2' = P2
 readPerson '3' = P3
 data Aspect = Perf | Imperf | NAs deriving (Show, Ord, Eq)
 readAspect 'F' = Perf
 readAspect 'N' = Imperf
 readAspect '0' = NAs
 data Voice = Act | Pass | NV deriving (Show, Ord, Eq)
 readVoice 'A' = Act
 readVoice 'S' = Pass
 readVoice '0' = NV
 data Trans = Trans | Intrans | NTr deriving (Show, Ord, Eq)
 readTrans 'M' = Trans
 readTrans 'A' = Intrans
 readTrans '0' = NTr
 isOpenCat :: Tag -> Bool
 isOpenCat (A _ _ _ _ _ _ _ _)     = True
 isOpenCat (N _ _ _ _ _ _ _)       = True
 isOpenCat (V _ _ _ _ _ _ _ _ _ _) = True
 isOpenCat (Adv _ _ _)             = True
 isOpenCat _                       = False
 noun :: Forms -> (Case, Number) -> T.Text
 noun forms (c, n) = findForm matchNoun forms
    where matchNoun (N c' n' _ _ _ _ _) = c == c' && n == n'
          matchNoun _ = False
 adj :: Forms -> Degree -> T.Text
 adj forms d = findForm matchAdj forms
    where matchAdj (A _ _ _ _ _ d' _ _) = d == d
          matchAdj _ = False
 verbPres :: Forms -> (Number, Person) -> T.Text
 verbPres forms (n, p) = findForm matchPres forms
    where matchPres (V Ind n' _ Pres p' _ Act _ _ _) = n == n' && p == p'
          matchPres _ = False
 verbPast :: Forms -> (Number, Gender) -> T.Text
 verbPast forms (n, g) = findForm matchPast forms
    where matchPast (V Ind n' g' Past _ _ Act _ _ _) = n == n' && g == g'
          matchPast _ = False
 verbImp :: Forms -> T.Text
 verbImp forms = findForm matchImp forms
    where matchImp (V Imp _ _ _ _ _ _ _ _ _) = True
          matchImp _ = False
 verbInf :: Forms -> T.Text
 verbInf forms = findForm matchInf forms
    where matchInf (V Inf _ _ _ _ _ _ _ _ _) = True
          matchInf _ = False
 adv :: Forms -> T.Text
 adv forms = findForm matchAdv forms
    where matchAdv (Adv d _ _) = d == Pos
          matchAdv _ = False
 findForm match forms = case find match (M.keys forms) of
                         Just tag -> forms M.! tag
                         Nothing -> findForm (\ _ -> True) forms
--- a/contrib/eaglesconv/README
+++ b/contrib/eaglesconv/README
@@ -0,0 +1,24 @@
 How to use:
 1) Sort the wordlist so it can be split into sublists. It is necessary because
 the converter is quite memory-hungry, and you might not have enough RAM to
 process the whole wordlist at once.
 ./CollectLemmas dicc.src | sort > lemmas.src
 2) Split the sorted wordlist.
 split -l 500000 lemmas.src
 3) Splitting has probably left forms of some lemmas spread across two
 sublists. Manually edit sublists so all forms for a lemma are present in just
 one sublist.
 4) Run the converter.
 ./run_conv.sh xa*
 5) The converter has produced abstract and concrete syntaxes for the
 dictionary. You can try them out with GF:
 gf DictRus.gf
--- a/contrib/eaglesconv/mkAbstract.sh
+++ b/contrib/eaglesconv/mkAbstract.sh
@@ -0,0 +1,8 @@
 #!/bin/sh
 echo "abstract DictRusAbs = Cat ** {
 "
 cat $1 | sed 's/^lin/fun/g;s/=.*$//g;s/\_N/\_N : N\;/g;s/\_PN/\_PN : PN\;/g;s/\_A /\_A : A\;/g;s/\_V/\_V : V\;/g;s/\_Adv/\_Adv : Adv\;/g'
 echo "
 }"
--- a/contrib/eaglesconv/mkConcrete.sh
+++ b/contrib/eaglesconv/mkConcrete.sh
@@ -0,0 +1,12 @@
 #!/bin/sh
 echo "--# -path=.:../prelude:../abstract:../common
 concrete DictRus of DictRusAbs = CatRus ** 
  open ParadigmsRus, Prelude, StructuralRus, MorphoRus in {
 flags 
  optimize=values ;
  coding=utf8 ;
 "
 cat $1
 echo "}"
--- a/contrib/eaglesconv/run_conv.sh
+++ b/contrib/eaglesconv/run_conv.sh
@@ -0,0 +1,4 @@
 #!/bin/sh
 ./EaglesConv "$@" +RTS -K256M -RTS > convtmp
 ./mkConcrete.sh convtmp > DictRus.gf
 ./mkAbstract.sh convtmp > DictRusAbs.gf
--- a/lib/src/russian/DictRus.gf
+++ b/lib/src/russian/DictRus.gf
--- a/lib/src/russian/DictRusAbs.gf
+++ b/lib/src/russian/DictRusAbs.gf