add the bgoffice import program and the produced dictonaries

This commit is contained in:
krasimir
2008-03-13 11:18:32 +00:00
parent 51288c77eb
commit ff7dc2006a
3 changed files with 115861 additions and 0 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,238 @@
import System.Directory
import Data.List
import Data.Char
import qualified Data.Map as Map
import Text.PrettyPrint
import System.Win32.NLS
import System.Win32.Console
main = do
codepage <- getACP
setConsoleCP codepage
setConsoleOutputCP codepage
fns <- fmap (sortByParadigm . concat) (mapM processCats cats)
writeParadigmsBul fns
writeLexiconBul fns
writeLexiconBulAbs fns
processCats ch = do
xs <- getDirectoryContents (dir ch)
let files = [x | x <- xs, take 2 x == "bg"]
mapM (processFile ch) files
sortByParadigm xs = map snd (sortBy (\(k1,_) (k2,_) -> compare k1 k2) ys)
where
ys = [case reads p of
[(n,_)] -> (n :: Int,x)
_ -> error ("Bad paradigm "++p) | x@(p,_,_,_) <- xs]
processFile ch file = do
putStr file
let fpath = dir ch++"/"++file
txt <- readFile fpath
let (suffixes,words) = parse fpath ch (lines txt)
let paradigm = reverse (drop 4 (reverse (drop 2 file)))
base_tmpl = head suffixes
body = text "let" <+> vcat (punctuate (char ';') (text "v0 =" <+> mkGetStem (length (head suffixes))
: [char 'v' <> int i <+> char '=' <+> text "last" <+> parens (mkGetStem suf)
| (i,suf) <- zip [1..] (splitQ base_tmpl)])) $$
text "in" <+> text (conName ch) <+> (vcat (map (parens . mkCalcForm 0 . ('?':)) suffixes) $$
if null (params ch) then empty else text (params ch))
<+> char ';'
putStrLn (" - " ++ show (length suffixes) ++ ", " ++ show (length words) ++ " words")
return (paradigm,ch,body,words)
deUnicode = map deUnicodeChar
where
deUnicodeChar c
| n >= 1040 && n <= 1103
= chr (n-848)
| otherwise = c
where
n = ord c
parse fileName ch ls = (suffixes,words)
where
ls1 = skipTo (deUnicode "Окончания:") ls
(suffixes', ls2) = collect [] ls1
suffixes = [normSuffix (getSuffix fileName suffixes' i) | i <- baseForms ch]
ls3 = skipTo (deUnicode "Думи:") ls2
(words, ls4) = collect [] ls3
getSuffix fileName xs i
| i < length xs = xs !! i
| otherwise = error (fileName++": getSuffix "++unwords xs++" "++show i)
skipTo ll [] = []
skipTo ll (l:ls)
| l == ll = ls
| otherwise = skipTo ll ls
collect acc [] = (reverse acc,[])
collect acc (l:ls)
| null l = collect acc ls
| head l == '#' = collect acc ls
| last l == ':' = (reverse acc,l:ls)
| otherwise = collect (l:acc) ls
normSuffix "0" = ""
normSuffix s = takeWhile (/=',') s
splitQ [] = []
splitQ ('?':cs) = length cs : splitQ cs
splitQ (c:cs) = splitQ cs
mkGetStem n
| n == 0 = text "base"
| otherwise = text "tk" <+> int n <+> text "base"
mkCalcForm i form =
case break (=='?') form of
(cs,[]) -> doubleQuotes (text cs)
(cs,'?':form) -> opt cs (doubleQuotes (text cs) <> char '+') <> char 'v' <> int i <> opt form (char '+' <> mkCalcForm (i+1) form)
where
opt [] doc = empty
opt _ doc = doc
cyr2lats ss =
let xs = concatMap (cyr2lat . chr . (+848) . ord) ss
ys = case xs of
'_':xs -> xs
_ -> xs
zs = case reverse ys of
'_':ys -> reverse ys
_ -> ys
in zs
cyr2lat 'а' = "a"
cyr2lat 'б' = "b"
cyr2lat 'в' = "v"
cyr2lat 'г' = "g"
cyr2lat 'д' = "d"
cyr2lat 'е' = "e"
cyr2lat 'ж' = "_zj_"
cyr2lat 'з' = "z"
cyr2lat 'и' = "i"
cyr2lat 'й' = "j"
cyr2lat 'к' = "k"
cyr2lat 'л' = "l"
cyr2lat 'м' = "m"
cyr2lat 'н' = "n"
cyr2lat 'о' = "o"
cyr2lat 'п' = "p"
cyr2lat 'р' = "r"
cyr2lat 'с' = "s"
cyr2lat 'т' = "t"
cyr2lat 'у' = "u"
cyr2lat 'ф' = "f"
cyr2lat 'х' = "h"
cyr2lat 'ц' = "c"
cyr2lat 'ч' = "_ch_"
cyr2lat 'ш' = "_sh_"
cyr2lat 'щ' = "_sht_"
cyr2lat 'ъ' = "y"
cyr2lat 'ь' = "a"
cyr2lat 'ю' = "_iu_"
cyr2lat 'я' = "_ja_"
cyr2lat x = "("++[x]++show (ord x)++")"
writeParadigmsBul fns = do
putStr "Writing ParadigmsBul ... "
let opers = text "oper" $$ nest 2 (vcat (map mkOper fns))
mkOper (paradigm,ch,body,ws) = decl $$ def
where
fname = "mk"++catName ch++paradigm
def = text fname <+> text "base =" <+> body
decl = text fname <+> text ": Str ->" <+> text (catName ch) <+> text ";"
doc = text "resource ParadigmsBul = MorphoFunsBul ** open" $$
text " Predef," $$
text " Prelude," $$
text " MorphoBul," $$
text " CatBul" $$
text " in {" $$
opers $$
text "}"
writeFile "ParadigmsBul.gf" (show doc)
putStrLn "Done"
writeLexiconBulAbs fns = do
putStr "Writing BGOfficeLexiconAbs ... "
let mkFuns (paradigm,ch,body,ws) =
[(w,text (cyr2lats w++"_"++catName ch) <+> char ':' <+> text (catName ch) <+> char ';') | w <- ws]
doc = text "abstract BGOfficeLexiconAbs = Cat ** {" $$
text "fun" $$
nest 2 (vcat (Map.elems (Map.fromList (concatMap mkFuns fns)))) $$
text "}"
writeFile "BGOfficeLexiconAbs.gf" (show doc)
putStrLn "Done"
writeLexiconBul fns = do
putStr "Writing BGOfficeLexicon ... "
let mkLins (paradigm,ch,body,ws) =
[(w,text (cyr2lats w++"_"++catName ch) <+> char '=' <+> text fname <+> doubleQuotes (text w) <+> char ';') | w <- ws]
where
fname = "mk"++catName ch++paradigm
lins = Map.elems (Map.fromList (concatMap mkLins fns))
doc = text "--# -path=.:prelude:resource/common:resource/abstract:resource/bulgarian" $$
text "" $$
text "concrete BGOfficeLexicon of BGOfficeLexiconAbs = CatBul **" $$
text "open ParadigmsBul, Prelude in {" $$
text "" $$
text "flags" $$
text " optimize=values ;" $$
text "" $$
text "lin" $$
nest 2 (vcat lins) $$
text "}"
writeFile "BGOfficeLexicon.gf" (show doc)
putStrLn "Done"
data CatHints
= CH { dir :: FilePath
, catName :: String
, conName :: String
, params :: String
, baseForms :: [Int]
}
cats = [ CH { dir = "data/verb"
, catName = "V"
, conName = "mkVerb"
, params = ""
, baseForms = [1,3,7,13,21,30,39,48,19]
}
, CH { dir = "data/noun/female"
, catName = "N"
, conName = "mkNoun"
, params = "DFem"
, baseForms = [1,3,3,5]
}
, CH { dir = "data/noun/male"
, catName = "N"
, conName = "mkNoun"
, params = "DMasc"
, baseForms = [1,4,6,7]
}
, CH { dir = "data/noun/personal"
, catName = "N"
, conName = "mkNoun"
, params = "DMascPersonal"
, baseForms = [1,4,6,7]
}
, CH { dir = "data/noun/neutral"
, catName = "N"
, conName = "mkNoun"
, params = "DNeut"
, baseForms = [1,3,3,1]
}
, CH { dir = "data/adjective"
, catName = "A"
, conName = "mkAdjective"
, params = ""
, baseForms = [1,2,3,4,5,6,7,8,9]
}
]