gf-core/lib/src/translator/CheckDict.hs

import qualified Data.Map
import qualified Data.Set
import Data.List

langs = words "Bul Cat Chi Dut Eng Fin Fre Ger Hin Ita Jpn Mlt Spa Swe Tha"

-- apply a function to every line, changing it to a list
changeLinesLang :: (String -> [String]) -> String -> IO ()
changeLinesLang f lang = do
  dict <- readFile (gfFile "Dictionary" lang) >>= return . lines
  writeFile (gfFile "tmp/Dictionary" lang) $ unlines $ concatMap f dict

createAllConcretes = do
  createAbstract
  mapM_ createConcrete langs

createAbstract = do
  bnc <- readFile "bnc-to-check.txt" >>= return . words                            -- list of BNC funs
  writeFile (gfFile "todo/tmp/TopDictionary" "") $
    unlines $ ["abstract TopDictionary = Cat **{"] ++
              [unwords ("fun":f:":": snd (splitFun f) :[";"]) | f <- bnc] ++ ["}"] -- print inspectable file, to todo/tmp/

createConcrete lang = do
  bnc <- readFile "bnc-to-check.txt" >>= return . words                                -- list of BNC funs
  dict <- readFile (gfFile "Dictionary" lang) >>= return . lines                       -- current lang lexicon
  let header = getHeader dict
  let dictmap = Data.Map.fromList [(f,unwords ws) | "lin":f:"=":ws <- map words dict]  -- lin rules to a map
  let bncdict = [(f,lookupFun f dictmap) | f <- bnc]                                   -- current lang for BNC
  writeFile (gfFile "todo/tmp/TopDictionary" lang) $
    unlines $ toTop header ++ [unwords ("lin":f:"=":[ws]) | (f,ws) <- bncdict] ++ ["}"] -- print inspectable file, to todo/tmp/

gfFile body lang = body ++ lang ++ ".gf"

mergeDict lang = do
  old <- readFile (gfFile      "Dictionary" lang) >>= return . lines                      -- read old lexicon
  new <- readFile (gfFile "todo/TopDictionary" lang) >>= return . lines                   -- read corrected and new words
  let header = getHeader new
  let oldmap = Data.Map.fromList [(f,unwords ws) | "lin":f:"=":ws <- map words old]
  let newlist = [(f,unwords ws) | "lin":f:"=":ws <- map words new]
  let newmap = foldr (uncurry Data.Map.insert) oldmap newlist                             -- insert corrected words
  writeFile (gfFile "tmp/Dictionary" lang) $
    unlines $ fromTop header ++ [unwords ("lin":f:"=":[ws]) | (f,ws) <- Data.Map.assocs newmap] ++ ["}"]  -- print revised file to tmp/

changeFunNames nameFile lang = do
  ns <- readFile nameFile
  let names = Data.Map.fromList [(old,new) | old:new:_ <- map words (lines ns)]      -- format: "old new" on separate lines
  let look w = Data.Map.lookup w names
  dict <- readFile (gfFile "Dictionary" lang) >>= return . lines                     -- read old lexicon
  let
    change line = case words line of
      "lin":f:ws -> case look f of
        Just g -> unwords $ "lin":g:ws
        _ -> line
      _ -> line
  writeFile (gfFile "tmp/Dictionary" lang) $ unlines $ map change dict


-- get the part of Dict before the first lin rule
getHeader = takeWhile ((/= "lin") . take 3)

toTop = map tt where
  tt s = case s of
    'D':'i':'c':'t':cs -> "TopDict" ++ tt cs
    c:cs               -> c : tt cs
    _ -> s

fromTop = map tt where
  tt s = case s of
    'T':'o':'p':'D':'i':'c':'t':cs -> "Dict" ++ tt cs
    c:cs               -> c : tt cs
    _ -> s

-- try to find lin rules by searching first literally, then subcategories in priority order

lookupFun f dictmap = case look f of
  Just rule | notEmpty rule -> rule
  _ -> case [r | Just r <- map look (subCats f), notEmpty r] of
    rule:_ -> "variants{}; -- " ++ rule  -- cannot return it as such, as probably type incorrect
    _ -> "variants{} ; -- "
 where
  look = flip Data.Map.lookup dictmap
  notEmpty r = take 1 (words r) `notElem` [["variants"],["variants{}"]]

subCats f = case splitFun f of
  (fun,cat) -> case cat of
   "V"   -> [fun ++ c | c <- words "V2 V3 VS VQ VA VV V2S V2Q V2A V2V"]
   "V2"  -> [fun ++ c | c <- words "V3 V2S V2Q V2A V2V V VS VQ VA VV"]
   "VS"  -> [fun ++ c | c <- words "VQ V2S V2Q V2 V V2A V2V V3 VA VV"]
   "VQ"  -> [fun ++ c | c <- words "VS V2Q V2S V2 V V2A V2V V3 VA VV"]
   "VA"  -> [fun ++ c | c <- words "V V2A V2 V3 VS VQ VV V2S V2Q V2V"]
   "VV"  -> [fun ++ c | c <- words "V2V V V2 V3 VS VQ VV V2S V2Q V2V"]
   "V3"  -> [fun ++ c | c <- words "V2 V2S V2Q V2A V2V V VS VQ VA VV"]
   "V2S" -> [fun ++ c | c <- words "VS VQ V2Q V2A V2V V2 V3 VA V VV"]
   "V2Q" -> [fun ++ c | c <- words "VQ VS V2S V2A V2V V2 V3 VA V VV"]
   "V2A" -> [fun ++ c | c <- words "VA V2 V3 V VS VQ VV V2S V2Q V2V"]
   "V2V" -> [fun ++ c | c <- words "VV V2 V2 V VS VQ VV V2S V2Q V2V"]
   "Adv" -> [fun ++ c | c <- words "AdV Prep"]
   "AdV" -> [fun ++ c | c <- words "Adv Prep"]
   _ -> []

splitFun f = case span (/='_') (reverse f) of (tac,nuf) -> (reverse nuf, reverse tac)


------ word statistics

isUnchecked line = isInfixOf "--" line           -- checked = no comments
isUnknown line = isInfixOf "{}" line       -- known = not variants {}

statLang lang = do
  dict <- readFile (gfFile "Dictionary" lang) >>= return . lines
  let lins = filter ((==["lin"]) . take 1 . words) dict
  let nall     = length $ filter (not . isUnknown) lins
  let nchecked = length $ filter (not . (\x -> isUnknown x || isUnchecked x)) lins
  putStrLn $ lang ++ "\t" ++ show nall ++ "\t" ++ show nchecked

statAll = mapM_ statLang langs


{-
-- lin f = def ; -- comment
analyseLine :: String -> (String,String,String,String,String)
analyseLine s = case words s of
  k:f:s:dc -> (k,f,s)


-------

-- handle split senses: remove the unsplit variant ; if the split ones are empty, copy the unsplit to them

handleSplit :: Data.Set.Set String -> String -> [String]
handleSplit sset line =
  let (keyw,fun,sep,def,comment) = analyseLine line
-}