From 8e2c1823ed74b32d980b27ec2bd8972c80488309 Mon Sep 17 00:00:00 2001 From: Nick Frolov Date: Sat, 31 Dec 2011 02:36:24 +0000 Subject: [PATCH] A Russian dictionary A Russian dictionary generated from a wordlist created by the FreeLing project. The accompanying converter can be used to convert other wordlists in EAGLES format to GF grammars. --- contrib/eaglesconv/CollectLemmas.hs | 28 ++++ contrib/eaglesconv/EaglesConv.hs | 135 ++++++++++++++++ contrib/eaglesconv/EaglesMatcher.hs | 63 ++++++++ contrib/eaglesconv/EaglesParser.hs | 239 ++++++++++++++++++++++++++++ contrib/eaglesconv/README | 24 +++ contrib/eaglesconv/mkAbstract.sh | 8 + contrib/eaglesconv/mkConcrete.sh | 12 ++ contrib/eaglesconv/run_conv.sh | 4 + 8 files changed, 513 insertions(+) create mode 100644 contrib/eaglesconv/CollectLemmas.hs create mode 100644 contrib/eaglesconv/EaglesConv.hs create mode 100644 contrib/eaglesconv/EaglesMatcher.hs create mode 100644 contrib/eaglesconv/EaglesParser.hs create mode 100644 contrib/eaglesconv/README create mode 100644 contrib/eaglesconv/mkAbstract.sh create mode 100644 contrib/eaglesconv/mkConcrete.sh create mode 100644 contrib/eaglesconv/run_conv.sh diff --git a/contrib/eaglesconv/CollectLemmas.hs b/contrib/eaglesconv/CollectLemmas.hs new file mode 100644 index 000000000..a63e7e1a8 --- /dev/null +++ b/contrib/eaglesconv/CollectLemmas.hs @@ -0,0 +1,28 @@ +-- Copyright (C) 2011 Nikita Frolov + +import qualified Data.Text as T +import qualified Data.Text.IO as UTF8 +import System.IO +import System.Environment +import Control.Monad +import Control.Monad.State + +main :: IO () +main = do + args <- getArgs + forM_ args $ \ f -> do + entries <- UTF8.readFile f >>= (return . T.lines) + forM_ entries $ \ entry -> + do + let ws = T.words entry + form = head ws + tags = toPairs $ tail ws + forM_ tags $ \ (lemma, tag) -> + do + UTF8.putStrLn $ T.concat [lemma, sp, form, sp, tag] + where sp = T.singleton ' ' + + +toPairs xs = zip (stride 2 xs) (stride 2 (drop 1 xs)) + where stride _ [] = [] + stride n (x:xs) = x : stride n (drop (n-1) xs) diff --git a/contrib/eaglesconv/EaglesConv.hs b/contrib/eaglesconv/EaglesConv.hs new file mode 100644 index 000000000..aa8929496 --- /dev/null +++ b/contrib/eaglesconv/EaglesConv.hs @@ -0,0 +1,135 @@ +-- Copyright (C) 2011 Nikita Frolov + +-- No, we can't pipeline parsing and generation, because there is no guarantee +-- that we have collected all forms for a lemma before we've scanned the +-- complete file. + +import qualified Data.Text as T +import qualified Data.Text.IO as UTF8 +import System.IO +import System.Environment +import Control.Monad +import Control.Monad.State +import qualified Data.Map as M +import Codec.Text.IConv +import qualified Data.ByteString.Lazy as BS +import qualified Data.ByteString.Internal as BSI + +import EaglesMatcher + +type Lemmas = M.Map T.Text Forms + +main :: IO () +main = do + args <- getArgs + forM_ args $ \ f -> do + entries <- UTF8.readFile f >>= (return . T.lines) + lemmas <- return $ execState (collectLemmas entries) (M.empty :: Lemmas) + mapM_ generateLin (M.assocs lemmas) + +collectLemmas entries = do + forM_ entries $ \ entry -> do + let ws = T.words entry + lemma = head ws + tags = toPairs $ tail ws + lemmas <- get + forM_ tags $ \ (form, tag) -> do + let forms = (case M.lookup lemma lemmas of + Just f -> f + Nothing -> M.empty) :: Forms + if isOpenCat . T.unpack $ tag + then put $ M.insert lemma (M.insert tag form forms) lemmas + else return () + +generateLin :: (T.Text, Forms) -> IO () +generateLin (lemma, forms) = do + let lemma' = myVeryOwnCyrillicRomanizationIConvSucks lemma + UTF8.putStr $ T.concat [T.pack "lin ", lemma'] + UTF8.putStr $ case T.unpack . head . M.keys $ forms of + ('N':_:_:_:g:a:'0':_) -> + T.concat $ [T.pack "_N = mkN "] + ++ map (quote . noun forms) [ ('N','S'), ('G','S') + , ('D','S'), ('F','S'), ('C','S'), ('O','S') + , ('L','S'), ('N','P'), ('G','P'), ('D','P') + , ('F','P'), ('C','P'), ('O','P') ] + ++ [showG g, sp, showAni a, ln] + ('N':_:c:n:g:a:_) -> + T.concat $ [T.pack "_PN = mkPN " + , quote $ noun forms ('N', 'S') + , showG g, sp + , showN n, sp, showAni a, ln] + ('A':_) -> + T.concat $ [T.pack "_A = mkA ", quote $ adj forms 'P', + if adj forms 'P' /= adj forms 'C' + then quote $ adj forms 'C' + else T.pack "" + , ln] + ('V':t) -> + let a = case t of + (_:_:_:_:'P':_:a':_) -> a' + (_:_:_:_:_:a':_) -> a' + in + T.concat $ [T.pack "_V = mkV ", showAsp a, sp] + ++ map (quote . verbPres forms) [ ('S','1'), ('S','2') + , ('S','3'), ('P','1') + , ('P','2'), ('P','3')] + ++ [ quote $ verbPast forms ('S', 'M') + , quote $ verbImp forms, quote $ verbInf forms, ln] + ('D':_) -> + T.concat $ [T.pack "_Adv = mkAdv " + , quote . adv $ forms, ln] + putStrLn "" + hFlush stdout + where quote x = T.concat [T.pack "\"", x, T.pack "\" "] + showG 'F' = T.pack "Fem" + showG 'A' = T.pack "Neut" + showG _ = T.pack "Masc" + showAni 'I' = T.pack "Inanimate" + showAni _ = T.pack "Animate" + showN 'P' = T.pack "Pl" + showN _ = T.pack "Sg" + showAsp 'F' = T.pack "Perfective" + showAsp _ = T.pack "Imperfective" + sp = T.singleton ' ' + ln = T.pack " ;" + +toPairs xs = zip (stride 2 xs) (stride 2 (drop 1 xs)) + where stride _ [] = [] + stride n (x:xs) = x : stride n (drop (n-1) xs) + +myVeryOwnCyrillicRomanizationIConvSucks s = T.pack . concatMap r . T.unpack $ s + where r 'а' = "a" + r 'б' = "b" + r 'в' = "v" + r 'г' = "g" + r 'д' = "d" + r 'е' = "je" + r 'ё' = "jo" + r 'ж' = "zh" + r 'з' = "z" + r 'и' = "i" + r 'й' = "jj" + r 'к' = "k" + r 'л' = "l" + r 'м' = "m" + r 'н' = "n" + r 'о' = "o" + r 'п' = "p" + r 'р' = "r" + r 'с' = "s" + r 'т' = "t" + r 'у' = "u" + r 'ф' = "f" + r 'х' = "kh" + r 'ц' = "c" + r 'ч' = "ch" + r 'ш' = "sh" + r 'щ' = "shc" + r 'ъ' = "yy" + r 'ы' = "y" + r 'ь' = "q" + r 'э' = "e" + r 'ю' = "ju" + r 'я' = "ja" + r '-' = "_" + r o = [o] diff --git a/contrib/eaglesconv/EaglesMatcher.hs b/contrib/eaglesconv/EaglesMatcher.hs new file mode 100644 index 000000000..27e76706f --- /dev/null +++ b/contrib/eaglesconv/EaglesMatcher.hs @@ -0,0 +1,63 @@ +-- Copyright (C) 2011 Nikita Frolov + +-- The format specification can be found at +-- http://devel.cpl.upc.edu/freeling/svn/trunk/doc/tagsets/tagset-ru.html + +-- Bugs in the specification: +-- Participle, 2nd field: case, not mood +-- Participle, 6th field: field, not person +-- Verb, persons can be denoted both with 'Pnumber' or just 'number' +-- Noun, 10th field can be absent + +-- No, it wouldn't be simpler to implement this grammar with Parsec or another +-- parser combinator library. + + +module EaglesMatcher where + +import qualified Data.Text as T +import Data.List +import qualified Data.Map as M + +type Forms = M.Map T.Text T.Text + +isOpenCat ('A':_) = True +isOpenCat ('N':_) = True +isOpenCat ('V':_) = True +isOpenCat ('D':_) = True +isOpenCat _ = False + +noun forms (c, n) = findForm (matchNoun . T.unpack) forms + where matchNoun ('N':_:c':n':_) = c == c' && n == n' + matchNoun _ = False + +adj forms d = findForm (matchAdj . T.unpack) forms + where matchAdj ('A':'N':'S':'M':_:'F':d':_) = d == d + matchAdj _ = False + +verbPres forms (n, p) = findForm (matchPres . T.unpack) forms + where matchPres ('V':'D':n':_:'P':'P':p':_:'A':_) = n == n' && p == p' + matchPres ('V':'D':n':_:'F':'P':p':_:'A':_) = n == n' && p == p' + matchPres ('V':'D':n':_:'P':'P':p':_) = n == n' && p == p' + matchPres ('V':'D':n':_:'F':'P':p':_) = n == n' && p == p' + matchPres _ = False + +verbPast forms (n, g) = findForm (matchPast . T.unpack) forms + where matchPast ('V':'D':n':g':'S':_:_:'A':_) = n == n' && g == g' + matchPast _ = False + +verbImp forms = findForm (matchImp . T.unpack) forms + where matchImp ('V':'M':_) = True + matchImp _ = False + +verbInf forms = findForm (matchInf . T.unpack) forms + where matchInf ('V':'I':_) = True + matchInf _ = False + +adv forms = findForm (matchAdv . T.unpack) forms + where matchAdv ('D':d:_) = d == 'P' + matchAdv _ = False + +findForm match forms = case find match (M.keys forms) of + Just tag -> forms M.! tag + Nothing -> findForm (\ _ -> True) forms diff --git a/contrib/eaglesconv/EaglesParser.hs b/contrib/eaglesconv/EaglesParser.hs new file mode 100644 index 000000000..6fc64d3b8 --- /dev/null +++ b/contrib/eaglesconv/EaglesParser.hs @@ -0,0 +1,239 @@ +-- Copyright (C) 2011 Nikita Frolov + +-- An early version of the parser that requires somewhat more memory. Kept for +-- nostalgic reasons. + +module EaglesParser where + +import qualified Data.Text as T +import Data.List +import qualified Data.Map as M + +type Forms = M.Map Tag T.Text + +data Tag = A Case Number Gender Animacy Form Degree Extra Obscene + | Adv Degree Extra Obscene + | AdvPron Extra + | Ord Case Number Gender Animacy + | AdjPron Case Number Gender Animacy Extra + | Frag Extra + | Conj Extra + | Inter Extra Obscene + | Num Case Number Gender Animacy Extra + | Part Extra + | Prep Extra + | N Case Number Gender Animacy Name Extra Obscene + | Pron Case Number Gender Animacy Extra + | V Mood Number Gender Tense Person Aspect Voice Trans Extra Obscene + | P Case Number Gender Tense Form Aspect Voice Trans Extra Obscene + deriving (Show, Ord, Eq) + +parseTag :: T.Text -> Tag +parseTag tag = case (T.unpack tag) of { + ('A':c:n:g:a:f:cmp:e:o:[]) -> A (readCase c) (readNumber n) + (readGender g) (readAnimacy a) + (readForm f) (readDegree cmp) + (readExtra e) (readObscene o) ; + ('D':cmp:e:o:[]) -> Adv (readDegree cmp) + (readExtra e) (readObscene o) ; + ('P':e:[]) -> AdvPron (readExtra e) ; + ('Y':c:n:g:a:[]) -> Ord (readCase c) (readNumber n) + (readGender g) (readAnimacy a) ; + ('R':c:n:g:a:e:[]) -> AdjPron (readCase c) (readNumber n) + (readGender g) (readAnimacy a) (readExtra e) ; + ('M':e:[]) -> Frag (readExtra e) ; + ('C':e:[]) -> Conj (readExtra e) ; + ('J':e:o:[]) -> Inter (readExtra e) (readObscene o) ; + ('Z':c:n:g:a:e:[]) -> Num (readCase c) (readNumber n) + (readGender g) (readAnimacy a) (readExtra e) ; + ('T':e:[]) -> Part (readExtra e) ; + ('B':e:[]) -> Prep (readExtra e) ; + ('N':_:c:n:g:a:name:e:o:_:[]) -> N (readCase c) (readNumber n) + (readGender g) (readAnimacy a) + (readName name) + (readExtra e) (readObscene o) ; + ('N':_:c:n:g:a:name:e:o:[]) -> N (readCase c) (readNumber n) + (readGender g) (readAnimacy a) + (readName name) + (readExtra e) (readObscene o) ; + ('E':c:n:g:a:e:[]) -> Pron (readCase c) (readNumber n) + (readGender g) (readAnimacy a) (readExtra e) ; + ('V':m:n:g:t:'P':p:a:v:tr:e:o:[]) -> V (readMood m) (readNumber n) + (readGender g) (readTense t) + (readPerson p) (readAspect a) + (readVoice v) (readTrans tr) + (readExtra e) (readObscene o) ; + ('V':m:n:g:t:'0':a:v:tr:e:o:[]) -> V (readMood m) (readNumber n) + (readGender g) (readTense t) + NP (readAspect a) + (readVoice v) (readTrans tr) + (readExtra e) (readObscene o) ; + ('V':m:n:g:t:p:a:v:tr:e:o:[]) -> V (readMood m) (readNumber n) + (readGender g) (readTense t) + (readPerson p) (readAspect a) + (readVoice v) (readTrans tr) + (readExtra e) (readObscene o) ; + ('Q':c:n:g:t:f:a:v:tr:e:o:[]) -> P (readCase c) (readNumber n) + (readGender g) (readTense t) + (readForm f) (readAspect a) + (readVoice v) (readTrans tr) + (readExtra e) (readObscene o) ; + _ -> error $ "Parse error: " ++ T.unpack tag } + +data Case = Nom | Gen | Dat | Acc | Inst | Prepos | Partit | Loc | Voc | NC + deriving (Show, Ord, Eq) + +readCase 'N' = Nom +readCase 'G' = Gen +readCase 'D' = Dat +readCase 'F' = Acc +readCase 'C' = Inst +readCase 'O' = Prepos +readCase 'P' = Partit +readCase 'L' = Loc +readCase 'V' = Voc +readCase '0' = NC + +data Number = Sg | Pl | NN deriving (Show, Ord, Eq) + +readNumber 'S' = Sg +readNumber 'P' = Pl +readNumber '0' = NN + +data Gender = Masc | Fem | Neut | Common | NG deriving (Show, Ord, Eq) + +readGender 'F' = Fem +readGender 'M' = Masc +readGender 'A' = Neut +readGender 'C' = Common +readGender '0' = NG + +data Animacy = Animate | Inanimate | NA deriving (Show, Ord, Eq) + +readAnimacy 'A' = Animate +readAnimacy 'I' = Inanimate +readAnimacy '0' = NA + +data Form = Short | Full | NF deriving (Show, Ord, Eq) + +readForm 'S' = Short +readForm 'F' = Full +readForm '0' = NF + +data Degree = Pos | Comp | Super | ND deriving (Show, Ord, Eq) + +readDegree 'E' = Super +readDegree 'C' = Comp +readDegree 'P' = Pos +readDegree '0' = ND + +data Extra = Introductory | Difficult | Distorted | Predicative + | Colloquial | Rare | Abbreviation | Obsolete | NE deriving (Show, Ord, Eq) + +readExtra 'P' = Introductory +readExtra 'D' = Difficult +readExtra 'V' = Distorted +readExtra 'R' = Predicative +readExtra 'I' = Colloquial +readExtra 'A' = Rare +readExtra 'B' = Abbreviation +readExtra 'E' = Obsolete +readExtra '0' = NE + +data Obscene = Obscene | NO deriving (Show, Ord, Eq) + +readObscene 'H' = Obscene +readObscene '0' = NO + +data Name = Topo | Proper | Patro | Family | NNa deriving (Show, Ord, Eq) + +readName 'G' = Topo +readName 'N' = Proper +readName 'S' = Patro +readName 'F' = Family +readName '0' = NNa + +data Mood = Gerund | Inf | Ind | Imp | NM deriving (Show, Ord, Eq) + +readMood 'G' = Gerund +readMood 'I' = Inf +readMood 'D' = Ind +readMood 'M' = Imp +readMood '0' = NM + +data Tense = Pres | Fut | Past | NT deriving (Show, Ord, Eq) + +readTense 'P' = Pres +readTense 'F' = Fut +readTense 'S' = Past +readTense '0' = NT + +data Person = P1 | P2 | P3 | NP deriving (Show, Ord, Eq) + +readPerson '1' = P1 +readPerson '2' = P2 +readPerson '3' = P3 + +data Aspect = Perf | Imperf | NAs deriving (Show, Ord, Eq) + +readAspect 'F' = Perf +readAspect 'N' = Imperf +readAspect '0' = NAs + +data Voice = Act | Pass | NV deriving (Show, Ord, Eq) + +readVoice 'A' = Act +readVoice 'S' = Pass +readVoice '0' = NV + +data Trans = Trans | Intrans | NTr deriving (Show, Ord, Eq) + +readTrans 'M' = Trans +readTrans 'A' = Intrans +readTrans '0' = NTr + +isOpenCat :: Tag -> Bool +isOpenCat (A _ _ _ _ _ _ _ _) = True +isOpenCat (N _ _ _ _ _ _ _) = True +isOpenCat (V _ _ _ _ _ _ _ _ _ _) = True +isOpenCat (Adv _ _ _) = True +isOpenCat _ = False + +noun :: Forms -> (Case, Number) -> T.Text +noun forms (c, n) = findForm matchNoun forms + where matchNoun (N c' n' _ _ _ _ _) = c == c' && n == n' + matchNoun _ = False + +adj :: Forms -> Degree -> T.Text +adj forms d = findForm matchAdj forms + where matchAdj (A _ _ _ _ _ d' _ _) = d == d + matchAdj _ = False + +verbPres :: Forms -> (Number, Person) -> T.Text +verbPres forms (n, p) = findForm matchPres forms + where matchPres (V Ind n' _ Pres p' _ Act _ _ _) = n == n' && p == p' + matchPres _ = False + +verbPast :: Forms -> (Number, Gender) -> T.Text +verbPast forms (n, g) = findForm matchPast forms + where matchPast (V Ind n' g' Past _ _ Act _ _ _) = n == n' && g == g' + matchPast _ = False + +verbImp :: Forms -> T.Text +verbImp forms = findForm matchImp forms + where matchImp (V Imp _ _ _ _ _ _ _ _ _) = True + matchImp _ = False + +verbInf :: Forms -> T.Text +verbInf forms = findForm matchInf forms + where matchInf (V Inf _ _ _ _ _ _ _ _ _) = True + matchInf _ = False + +adv :: Forms -> T.Text +adv forms = findForm matchAdv forms + where matchAdv (Adv d _ _) = d == Pos + matchAdv _ = False + +findForm match forms = case find match (M.keys forms) of + Just tag -> forms M.! tag + Nothing -> findForm (\ _ -> True) forms \ No newline at end of file diff --git a/contrib/eaglesconv/README b/contrib/eaglesconv/README new file mode 100644 index 000000000..e3c84c61d --- /dev/null +++ b/contrib/eaglesconv/README @@ -0,0 +1,24 @@ +How to use: + +1) Sort the wordlist so it can be split into sublists. It is necessary because +the converter is quite memory-hungry, and you might not have enough RAM to +process the whole wordlist at once. + +./CollectLemmas dicc.src | sort > lemmas.src + +2) Split the sorted wordlist. + +split -l 500000 lemmas.src + +3) Splitting has probably left forms of some lemmas spread across two +sublists. Manually edit sublists so all forms for a lemma are present in just +one sublist. + +4) Run the converter. + +./run_conv.sh xa* + +5) The converter has produced abstract and concrete syntaxes for the +dictionary. You can try them out with GF: + +gf DictRus.gf \ No newline at end of file diff --git a/contrib/eaglesconv/mkAbstract.sh b/contrib/eaglesconv/mkAbstract.sh new file mode 100644 index 000000000..d07da18fc --- /dev/null +++ b/contrib/eaglesconv/mkAbstract.sh @@ -0,0 +1,8 @@ +#!/bin/sh + +echo "abstract DictRusAbs = Cat ** { +" +cat $1 | sed 's/^lin/fun/g;s/=.*$//g;s/\_N/\_N : N\;/g;s/\_PN/\_PN : PN\;/g;s/\_A /\_A : A\;/g;s/\_V/\_V : V\;/g;s/\_Adv/\_Adv : Adv\;/g' + +echo " +}" \ No newline at end of file diff --git a/contrib/eaglesconv/mkConcrete.sh b/contrib/eaglesconv/mkConcrete.sh new file mode 100644 index 000000000..170ab9c5e --- /dev/null +++ b/contrib/eaglesconv/mkConcrete.sh @@ -0,0 +1,12 @@ +#!/bin/sh + +echo "--# -path=.:../prelude:../abstract:../common + +concrete DictRus of DictRusAbs = CatRus ** + open ParadigmsRus, Prelude, StructuralRus, MorphoRus in { +flags + optimize=values ; + coding=utf8 ; +" +cat $1 +echo "}" diff --git a/contrib/eaglesconv/run_conv.sh b/contrib/eaglesconv/run_conv.sh new file mode 100644 index 000000000..5ad586834 --- /dev/null +++ b/contrib/eaglesconv/run_conv.sh @@ -0,0 +1,4 @@ +#!/bin/sh +./EaglesConv "$@" +RTS -K256M -RTS > convtmp +./mkConcrete.sh convtmp > DictRus.gf +./mkAbstract.sh convtmp > DictRusAbs.gf