mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-10 13:29:32 -06:00
136 lines
5.6 KiB
Haskell
136 lines
5.6 KiB
Haskell
import qualified Data.Map
|
|
import qualified Data.Set
|
|
import Data.List
|
|
|
|
langs = words "Bul Cat Chi Dut Eng Fin Fre Ger Hin Ita Jpn Mlt Spa Swe Tha"
|
|
|
|
-- apply a function to every line, changing it to a list
|
|
changeLinesLang :: (String -> [String]) -> String -> IO ()
|
|
changeLinesLang f lang = do
|
|
dict <- readFile (gfFile "Dictionary" lang) >>= return . lines
|
|
writeFile (gfFile "tmp/Dictionary" lang) $ unlines $ concatMap f dict
|
|
|
|
createAllConcretes = do
|
|
createAbstract
|
|
mapM_ createConcrete langs
|
|
|
|
createAbstract = do
|
|
bnc <- readFile "bnc-to-check.txt" >>= return . words -- list of BNC funs
|
|
writeFile (gfFile "todo/tmp/TopDictionary" "") $
|
|
unlines $ ["abstract TopDictionary = Cat **{"] ++
|
|
[unwords ("fun":f:":": snd (splitFun f) :[";"]) | f <- bnc] ++ ["}"] -- print inspectable file, to todo/tmp/
|
|
|
|
createConcrete lang = do
|
|
bnc <- readFile "bnc-to-check.txt" >>= return . words -- list of BNC funs
|
|
dict <- readFile (gfFile "Dictionary" lang) >>= return . lines -- current lang lexicon
|
|
let header = getHeader dict
|
|
let dictmap = Data.Map.fromList [(f,unwords ws) | "lin":f:"=":ws <- map words dict] -- lin rules to a map
|
|
let bncdict = [(f,lookupFun f dictmap) | f <- bnc] -- current lang for BNC
|
|
writeFile (gfFile "todo/tmp/TopDictionary" lang) $
|
|
unlines $ toTop header ++ [unwords ("lin":f:"=":[ws]) | (f,ws) <- bncdict] ++ ["}"] -- print inspectable file, to todo/tmp/
|
|
|
|
gfFile body lang = body ++ lang ++ ".gf"
|
|
|
|
mergeDict lang = do
|
|
old <- readFile (gfFile "Dictionary" lang) >>= return . lines -- read old lexicon
|
|
new <- readFile (gfFile "todo/TopDictionary" lang) >>= return . lines -- read corrected and new words
|
|
let header = getHeader new
|
|
let oldmap = Data.Map.fromList [(f,unwords ws) | "lin":f:"=":ws <- map words old]
|
|
let newlist = [(f,unwords ws) | "lin":f:"=":ws <- map words new]
|
|
let newmap = foldr (uncurry Data.Map.insert) oldmap newlist -- insert corrected words
|
|
writeFile (gfFile "tmp/Dictionary" lang) $
|
|
unlines $ fromTop header ++ [unwords ("lin":f:"=":[ws]) | (f,ws) <- Data.Map.assocs newmap] ++ ["}"] -- print revised file to tmp/
|
|
|
|
changeFunNames nameFile lang = do
|
|
ns <- readFile nameFile
|
|
let names = Data.Map.fromList [(old,new) | old:new:_ <- map words (lines ns)] -- format: "old new" on separate lines
|
|
let look w = Data.Map.lookup w names
|
|
dict <- readFile (gfFile "Dictionary" lang) >>= return . lines -- read old lexicon
|
|
let
|
|
change line = case words line of
|
|
"lin":f:ws -> case look f of
|
|
Just g -> unwords $ "lin":g:ws
|
|
_ -> line
|
|
_ -> line
|
|
writeFile (gfFile "tmp/Dictionary" lang) $ unlines $ map change dict
|
|
|
|
|
|
-- get the part of Dict before the first lin rule
|
|
getHeader = takeWhile ((/= "lin") . take 3)
|
|
|
|
toTop = map tt where
|
|
tt s = case s of
|
|
'D':'i':'c':'t':cs -> "TopDict" ++ tt cs
|
|
c:cs -> c : tt cs
|
|
_ -> s
|
|
|
|
fromTop = map tt where
|
|
tt s = case s of
|
|
'T':'o':'p':'D':'i':'c':'t':cs -> "Dict" ++ tt cs
|
|
c:cs -> c : tt cs
|
|
_ -> s
|
|
|
|
-- try to find lin rules by searching first literally, then subcategories in priority order
|
|
|
|
lookupFun f dictmap = case look f of
|
|
Just rule | notEmpty rule -> rule
|
|
_ -> case [r | Just r <- map look (subCats f), notEmpty r] of
|
|
rule:_ -> "variants{}; -- " ++ rule -- cannot return it as such, as probably type incorrect
|
|
_ -> "variants{} ; -- "
|
|
where
|
|
look = flip Data.Map.lookup dictmap
|
|
notEmpty r = take 1 (words r) `notElem` [["variants"],["variants{}"]]
|
|
|
|
subCats f = case splitFun f of
|
|
(fun,cat) -> case cat of
|
|
"V" -> [fun ++ c | c <- words "V2 V3 VS VQ VA VV V2S V2Q V2A V2V"]
|
|
"V2" -> [fun ++ c | c <- words "V3 V2S V2Q V2A V2V V VS VQ VA VV"]
|
|
"VS" -> [fun ++ c | c <- words "VQ V2S V2Q V2 V V2A V2V V3 VA VV"]
|
|
"VQ" -> [fun ++ c | c <- words "VS V2Q V2S V2 V V2A V2V V3 VA VV"]
|
|
"VA" -> [fun ++ c | c <- words "V V2A V2 V3 VS VQ VV V2S V2Q V2V"]
|
|
"VV" -> [fun ++ c | c <- words "V2V V V2 V3 VS VQ VV V2S V2Q V2V"]
|
|
"V3" -> [fun ++ c | c <- words "V2 V2S V2Q V2A V2V V VS VQ VA VV"]
|
|
"V2S" -> [fun ++ c | c <- words "VS VQ V2Q V2A V2V V2 V3 VA V VV"]
|
|
"V2Q" -> [fun ++ c | c <- words "VQ VS V2S V2A V2V V2 V3 VA V VV"]
|
|
"V2A" -> [fun ++ c | c <- words "VA V2 V3 V VS VQ VV V2S V2Q V2V"]
|
|
"V2V" -> [fun ++ c | c <- words "VV V2 V2 V VS VQ VV V2S V2Q V2V"]
|
|
"Adv" -> [fun ++ c | c <- words "AdV Prep"]
|
|
"AdV" -> [fun ++ c | c <- words "Adv Prep"]
|
|
_ -> []
|
|
|
|
splitFun f = case span (/='_') (reverse f) of (tac,nuf) -> (reverse nuf, reverse tac)
|
|
|
|
|
|
------ word statistics
|
|
|
|
isUnchecked line = isInfixOf "--" line -- checked = no comments
|
|
isUnknown line = isInfixOf "{}" line -- known = not variants {}
|
|
|
|
statLang lang = do
|
|
dict <- readFile (gfFile "Dictionary" lang) >>= return . lines
|
|
let lins = filter ((==["lin"]) . take 1 . words) dict
|
|
let nall = length $ filter (not . isUnknown) lins
|
|
let nchecked = length $ filter (not . (\x -> isUnknown x || isUnchecked x)) lins
|
|
putStrLn $ lang ++ "\t" ++ show nall ++ "\t" ++ show nchecked
|
|
|
|
statAll = mapM_ statLang langs
|
|
|
|
|
|
{-
|
|
-- lin f = def ; -- comment
|
|
analyseLine :: String -> (String,String,String,String,String)
|
|
analyseLine s = case words s of
|
|
k:f:s:dc -> (k,f,s)
|
|
|
|
|
|
-------
|
|
|
|
-- handle split senses: remove the unsplit variant ; if the split ones are empty, copy the unsplit to them
|
|
|
|
handleSplit :: Data.Set.Set String -> String -> [String]
|
|
handleSplit sset line =
|
|
let (keyw,fun,sep,def,comment) = analyseLine line
|
|
-}
|
|
|
|
|