mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-14 15:29:31 -06:00
A Russian dictionary generated from a wordlist created by the FreeLing project. The accompanying converter can be used to convert other wordlists in EAGLES format to GF grammars.
136 lines
4.8 KiB
Haskell
136 lines
4.8 KiB
Haskell
-- Copyright (C) 2011 Nikita Frolov
|
||
|
||
-- No, we can't pipeline parsing and generation, because there is no guarantee
|
||
-- that we have collected all forms for a lemma before we've scanned the
|
||
-- complete file.
|
||
|
||
import qualified Data.Text as T
|
||
import qualified Data.Text.IO as UTF8
|
||
import System.IO
|
||
import System.Environment
|
||
import Control.Monad
|
||
import Control.Monad.State
|
||
import qualified Data.Map as M
|
||
import Codec.Text.IConv
|
||
import qualified Data.ByteString.Lazy as BS
|
||
import qualified Data.ByteString.Internal as BSI
|
||
|
||
import EaglesMatcher
|
||
|
||
type Lemmas = M.Map T.Text Forms
|
||
|
||
main :: IO ()
|
||
main = do
|
||
args <- getArgs
|
||
forM_ args $ \ f -> do
|
||
entries <- UTF8.readFile f >>= (return . T.lines)
|
||
lemmas <- return $ execState (collectLemmas entries) (M.empty :: Lemmas)
|
||
mapM_ generateLin (M.assocs lemmas)
|
||
|
||
collectLemmas entries = do
|
||
forM_ entries $ \ entry -> do
|
||
let ws = T.words entry
|
||
lemma = head ws
|
||
tags = toPairs $ tail ws
|
||
lemmas <- get
|
||
forM_ tags $ \ (form, tag) -> do
|
||
let forms = (case M.lookup lemma lemmas of
|
||
Just f -> f
|
||
Nothing -> M.empty) :: Forms
|
||
if isOpenCat . T.unpack $ tag
|
||
then put $ M.insert lemma (M.insert tag form forms) lemmas
|
||
else return ()
|
||
|
||
generateLin :: (T.Text, Forms) -> IO ()
|
||
generateLin (lemma, forms) = do
|
||
let lemma' = myVeryOwnCyrillicRomanizationIConvSucks lemma
|
||
UTF8.putStr $ T.concat [T.pack "lin ", lemma']
|
||
UTF8.putStr $ case T.unpack . head . M.keys $ forms of
|
||
('N':_:_:_:g:a:'0':_) ->
|
||
T.concat $ [T.pack "_N = mkN "]
|
||
++ map (quote . noun forms) [ ('N','S'), ('G','S')
|
||
, ('D','S'), ('F','S'), ('C','S'), ('O','S')
|
||
, ('L','S'), ('N','P'), ('G','P'), ('D','P')
|
||
, ('F','P'), ('C','P'), ('O','P') ]
|
||
++ [showG g, sp, showAni a, ln]
|
||
('N':_:c:n:g:a:_) ->
|
||
T.concat $ [T.pack "_PN = mkPN "
|
||
, quote $ noun forms ('N', 'S')
|
||
, showG g, sp
|
||
, showN n, sp, showAni a, ln]
|
||
('A':_) ->
|
||
T.concat $ [T.pack "_A = mkA ", quote $ adj forms 'P',
|
||
if adj forms 'P' /= adj forms 'C'
|
||
then quote $ adj forms 'C'
|
||
else T.pack ""
|
||
, ln]
|
||
('V':t) ->
|
||
let a = case t of
|
||
(_:_:_:_:'P':_:a':_) -> a'
|
||
(_:_:_:_:_:a':_) -> a'
|
||
in
|
||
T.concat $ [T.pack "_V = mkV ", showAsp a, sp]
|
||
++ map (quote . verbPres forms) [ ('S','1'), ('S','2')
|
||
, ('S','3'), ('P','1')
|
||
, ('P','2'), ('P','3')]
|
||
++ [ quote $ verbPast forms ('S', 'M')
|
||
, quote $ verbImp forms, quote $ verbInf forms, ln]
|
||
('D':_) ->
|
||
T.concat $ [T.pack "_Adv = mkAdv "
|
||
, quote . adv $ forms, ln]
|
||
putStrLn ""
|
||
hFlush stdout
|
||
where quote x = T.concat [T.pack "\"", x, T.pack "\" "]
|
||
showG 'F' = T.pack "Fem"
|
||
showG 'A' = T.pack "Neut"
|
||
showG _ = T.pack "Masc"
|
||
showAni 'I' = T.pack "Inanimate"
|
||
showAni _ = T.pack "Animate"
|
||
showN 'P' = T.pack "Pl"
|
||
showN _ = T.pack "Sg"
|
||
showAsp 'F' = T.pack "Perfective"
|
||
showAsp _ = T.pack "Imperfective"
|
||
sp = T.singleton ' '
|
||
ln = T.pack " ;"
|
||
|
||
toPairs xs = zip (stride 2 xs) (stride 2 (drop 1 xs))
|
||
where stride _ [] = []
|
||
stride n (x:xs) = x : stride n (drop (n-1) xs)
|
||
|
||
myVeryOwnCyrillicRomanizationIConvSucks s = T.pack . concatMap r . T.unpack $ s
|
||
where r 'а' = "a"
|
||
r 'б' = "b"
|
||
r 'в' = "v"
|
||
r 'г' = "g"
|
||
r 'д' = "d"
|
||
r 'е' = "je"
|
||
r 'ё' = "jo"
|
||
r 'ж' = "zh"
|
||
r 'з' = "z"
|
||
r 'и' = "i"
|
||
r 'й' = "jj"
|
||
r 'к' = "k"
|
||
r 'л' = "l"
|
||
r 'м' = "m"
|
||
r 'н' = "n"
|
||
r 'о' = "o"
|
||
r 'п' = "p"
|
||
r 'р' = "r"
|
||
r 'с' = "s"
|
||
r 'т' = "t"
|
||
r 'у' = "u"
|
||
r 'ф' = "f"
|
||
r 'х' = "kh"
|
||
r 'ц' = "c"
|
||
r 'ч' = "ch"
|
||
r 'ш' = "sh"
|
||
r 'щ' = "shc"
|
||
r 'ъ' = "yy"
|
||
r 'ы' = "y"
|
||
r 'ь' = "q"
|
||
r 'э' = "e"
|
||
r 'ю' = "ju"
|
||
r 'я' = "ja"
|
||
r '-' = "_"
|
||
r o = [o]
|