mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-05-06 09:42:50 -06:00
A Russian dictionary
A Russian dictionary generated from a wordlist created by the FreeLing project. The accompanying converter can be used to convert other wordlists in EAGLES format to GF grammars.
This commit is contained in:
135
contrib/eaglesconv/EaglesConv.hs
Normal file
135
contrib/eaglesconv/EaglesConv.hs
Normal file
@@ -0,0 +1,135 @@
|
||||
-- Copyright (C) 2011 Nikita Frolov
|
||||
|
||||
-- No, we can't pipeline parsing and generation, because there is no guarantee
|
||||
-- that we have collected all forms for a lemma before we've scanned the
|
||||
-- complete file.
|
||||
|
||||
import qualified Data.Text as T
|
||||
import qualified Data.Text.IO as UTF8
|
||||
import System.IO
|
||||
import System.Environment
|
||||
import Control.Monad
|
||||
import Control.Monad.State
|
||||
import qualified Data.Map as M
|
||||
import Codec.Text.IConv
|
||||
import qualified Data.ByteString.Lazy as BS
|
||||
import qualified Data.ByteString.Internal as BSI
|
||||
|
||||
import EaglesMatcher
|
||||
|
||||
type Lemmas = M.Map T.Text Forms
|
||||
|
||||
main :: IO ()
|
||||
main = do
|
||||
args <- getArgs
|
||||
forM_ args $ \ f -> do
|
||||
entries <- UTF8.readFile f >>= (return . T.lines)
|
||||
lemmas <- return $ execState (collectLemmas entries) (M.empty :: Lemmas)
|
||||
mapM_ generateLin (M.assocs lemmas)
|
||||
|
||||
collectLemmas entries = do
|
||||
forM_ entries $ \ entry -> do
|
||||
let ws = T.words entry
|
||||
lemma = head ws
|
||||
tags = toPairs $ tail ws
|
||||
lemmas <- get
|
||||
forM_ tags $ \ (form, tag) -> do
|
||||
let forms = (case M.lookup lemma lemmas of
|
||||
Just f -> f
|
||||
Nothing -> M.empty) :: Forms
|
||||
if isOpenCat . T.unpack $ tag
|
||||
then put $ M.insert lemma (M.insert tag form forms) lemmas
|
||||
else return ()
|
||||
|
||||
generateLin :: (T.Text, Forms) -> IO ()
|
||||
generateLin (lemma, forms) = do
|
||||
let lemma' = myVeryOwnCyrillicRomanizationIConvSucks lemma
|
||||
UTF8.putStr $ T.concat [T.pack "lin ", lemma']
|
||||
UTF8.putStr $ case T.unpack . head . M.keys $ forms of
|
||||
('N':_:_:_:g:a:'0':_) ->
|
||||
T.concat $ [T.pack "_N = mkN "]
|
||||
++ map (quote . noun forms) [ ('N','S'), ('G','S')
|
||||
, ('D','S'), ('F','S'), ('C','S'), ('O','S')
|
||||
, ('L','S'), ('N','P'), ('G','P'), ('D','P')
|
||||
, ('F','P'), ('C','P'), ('O','P') ]
|
||||
++ [showG g, sp, showAni a, ln]
|
||||
('N':_:c:n:g:a:_) ->
|
||||
T.concat $ [T.pack "_PN = mkPN "
|
||||
, quote $ noun forms ('N', 'S')
|
||||
, showG g, sp
|
||||
, showN n, sp, showAni a, ln]
|
||||
('A':_) ->
|
||||
T.concat $ [T.pack "_A = mkA ", quote $ adj forms 'P',
|
||||
if adj forms 'P' /= adj forms 'C'
|
||||
then quote $ adj forms 'C'
|
||||
else T.pack ""
|
||||
, ln]
|
||||
('V':t) ->
|
||||
let a = case t of
|
||||
(_:_:_:_:'P':_:a':_) -> a'
|
||||
(_:_:_:_:_:a':_) -> a'
|
||||
in
|
||||
T.concat $ [T.pack "_V = mkV ", showAsp a, sp]
|
||||
++ map (quote . verbPres forms) [ ('S','1'), ('S','2')
|
||||
, ('S','3'), ('P','1')
|
||||
, ('P','2'), ('P','3')]
|
||||
++ [ quote $ verbPast forms ('S', 'M')
|
||||
, quote $ verbImp forms, quote $ verbInf forms, ln]
|
||||
('D':_) ->
|
||||
T.concat $ [T.pack "_Adv = mkAdv "
|
||||
, quote . adv $ forms, ln]
|
||||
putStrLn ""
|
||||
hFlush stdout
|
||||
where quote x = T.concat [T.pack "\"", x, T.pack "\" "]
|
||||
showG 'F' = T.pack "Fem"
|
||||
showG 'A' = T.pack "Neut"
|
||||
showG _ = T.pack "Masc"
|
||||
showAni 'I' = T.pack "Inanimate"
|
||||
showAni _ = T.pack "Animate"
|
||||
showN 'P' = T.pack "Pl"
|
||||
showN _ = T.pack "Sg"
|
||||
showAsp 'F' = T.pack "Perfective"
|
||||
showAsp _ = T.pack "Imperfective"
|
||||
sp = T.singleton ' '
|
||||
ln = T.pack " ;"
|
||||
|
||||
toPairs xs = zip (stride 2 xs) (stride 2 (drop 1 xs))
|
||||
where stride _ [] = []
|
||||
stride n (x:xs) = x : stride n (drop (n-1) xs)
|
||||
|
||||
myVeryOwnCyrillicRomanizationIConvSucks s = T.pack . concatMap r . T.unpack $ s
|
||||
where r 'а' = "a"
|
||||
r 'б' = "b"
|
||||
r 'в' = "v"
|
||||
r 'г' = "g"
|
||||
r 'д' = "d"
|
||||
r 'е' = "je"
|
||||
r 'ё' = "jo"
|
||||
r 'ж' = "zh"
|
||||
r 'з' = "z"
|
||||
r 'и' = "i"
|
||||
r 'й' = "jj"
|
||||
r 'к' = "k"
|
||||
r 'л' = "l"
|
||||
r 'м' = "m"
|
||||
r 'н' = "n"
|
||||
r 'о' = "o"
|
||||
r 'п' = "p"
|
||||
r 'р' = "r"
|
||||
r 'с' = "s"
|
||||
r 'т' = "t"
|
||||
r 'у' = "u"
|
||||
r 'ф' = "f"
|
||||
r 'х' = "kh"
|
||||
r 'ц' = "c"
|
||||
r 'ч' = "ch"
|
||||
r 'ш' = "sh"
|
||||
r 'щ' = "shc"
|
||||
r 'ъ' = "yy"
|
||||
r 'ы' = "y"
|
||||
r 'ь' = "q"
|
||||
r 'э' = "e"
|
||||
r 'ю' = "ju"
|
||||
r 'я' = "ja"
|
||||
r '-' = "_"
|
||||
r o = [o]
|
||||
Reference in New Issue
Block a user