improved MkMorphoDict.hs, still experimental

2021-05-28 11:01:47 +02:00
parent 2ade51ecff
commit 71d7de2577
1 changed files with 87 additions and 46 deletions
--- a/src/morphodict/MkMorphodict.hs
+++ b/src/morphodict/MkMorphodict.hs
@@ -17,86 +17,138 @@ import System.Environment (getArgs)
 --  - functionname = baseform_category, with exceptions
 --     - variant inflection tables: lie_1_V, lie_2_V
 --     - words that have non-ident characters: 'bird\'s-eye_A'
 --     - words that start with non-letters: W_'tween_Adv
 -- example:
 --   gf -make ../english/DictEng.gf
--   runghc MkMorphodict.hs DictEngAbs.pgf MorphoDictEng
+--   runghc
 --   runghc MkMorphodict.hs pgf MorphoDictEng.config DictEngAbs.pgf MorphoDictEng
 -- 64923 ->  56599 functions
-usage = "MkMorphodict <pgf> <outfile>"
+usage = "runghc MkMorphodict (raw|pgf) <configfile> <datafile> <outfile>"
 main = do
-  pgfile:outfile:_ <- getArgs
+  xx <- getArgs
-  pgf <- readPGF pgfile
+  if length xx /= 4
-  config <- readFile (outfile ++ ".config") >>= return . mkConfig
+    then putStrLn usage
    else do
      let mode:configfile:datafile:outfile:_ = xx 
      config <- readFile configfile >>= return . mkConfig
-  let (absrules,cncrules) = mkMorphoDict (MDEnv pgf config (head (languages pgf)))
+      rawdata <- case mode of
        "pgf" -> pgfFile2rawData config datafile
        "raw" -> readFile datafile >>= return . map getRawData . filter (not . null) . lines 
      rawdata2gf config rawdata outfile
 rawdata2gf config rawdata outfile = do
  let env = MDEnv rawdata config
  let (absrules,cncrules) = mkMorphoDict env
  absheader <- readFile (outfile ++ "Abs.header")
  cncheader <- readFile (outfile ++ ".header")
  writeFile (outfile ++ "Abs.gf") absheader
-  appendFile (outfile ++ "Abs.gf") $ unlines absrules
+  appendFile (outfile ++ "Abs.gf") $ unlines $ sort absrules
  appendFile (outfile ++ "Abs.gf") "}"
  writeFile (outfile ++ ".gf") cncheader
-  appendFile (outfile ++ ".gf") $ unlines cncrules
+  appendFile (outfile ++ ".gf") $ unlines $ sort cncrules
  appendFile (outfile ++ ".gf") "}"
 -- one way to get raw data from a dictionary pgf
 -- another way, more controllod, is to write a wrapper grammar with a function, for each category, to generate a RawData entry
 pgfFile2rawData config pgffile = do
  pgf    <- readPGF pgffile
-type Cat  = CId
+  let cats = nub [c | (c,(_,_,_)) <- M.assocs config]
  let lang:_ = languages pgf
  return [
    (cat, map snd lin) |
        cat  <- cats,
        f    <- functionsByCat pgf (mkCId cat),
        lin  <- tabularLinearizes pgf lang (mkApp f [])
    ] 
 type Cat  = String
 type Fun  = String
 type Oper = String
-type Config = M.Map Cat (Cat,Oper,[Int])
+type Config = M.Map Cat (Cat,Oper,([Int],[Int])) -- lin word_Cat = Oper str_i1 str_i2 ... str_in  featj1 ... featjn ;
 data MDEnv = MDEnv {
-  pgf    :: PGF,
+  rawdata :: [RawData],
-  config :: Config,
+  config  :: Config
  lang   :: Language
  }
-mkConfig :: String -> Config
+mkConfig :: String -> Config  -- N : N mkN 0 2 4 6 # 9
 mkConfig ls = M.fromList [(c,i) | Left (c,i) <- map mkOne (lines ls)]
 where
  mkOne s = case words s of
    "--":_                 -> Right s 
-    cat:":":tcat:oper:ints -> Left (mkCId cat,(mkCId tcat,oper,map read ints))
+    cat:":":tcat:oper:ints -> Left (cat,(tcat,oper,mkArgs ints))
    _ -> Right s
  mkArgs ints = case break (=="#") ints of
    (ss,[])   -> (map read ss, [])
    (ss,_:fs) -> (map read ss, map read fs)
-mkMorphoDict :: MDEnv -> ([String],[String])
+getRawData s = case words s of
  c:cs -> (c,cs)
 type RawData  = (String,[String])                              -- old cat name, forms and features
 type RawRule  = (([String],Cat), (Oper, ([String],[String])))  -- parts of fun name, new category, oper, arguments
 type RuleData = ((Fun,    Cat),  (Oper, ([String],[String])))  -- final fun name, cat, oper, args
 mkMorphoDict :: MDEnv -> ([String],[String])  -- fun rules, lin rules
 mkMorphoDict env =
  unzip $
  map splitRule $
-  findCompounds $
+----  findCompounds $ -- let us not care about compounds for the time being, but include them if they are given
  nameFunctions $
  mergeRules $
-  concatMap findRules cats
+  findRules $
  rawdata env
 where
-  splitRule (fun,(cat,lin)) = (unwords ["fun",fun,":",showCId cat,";"], unwords ["lin",fun,"=", unwords lin,";"])
+  splitRule ((fun,cat),(oper,(forms,feats))) =
    (unwords ["fun",fun,":",cat,";"], unwords ["lin",fun,"=", unwords (oper : map quote forms ++ feats),";"])
  cats = nub [c | (c,(_,_,_)) <- M.assocs (config env)]
-  findRules cat = [
+  findRules :: [RawData] -> [RawRule]
-    ([snd (lin !! head ints), showCId c], (c, op : appSig ints (map snd lin))) |  --- head ints is the base form in smart paradigms
+  findRules raws = [
-      f    <- functionsByCat (pgf env) cat,
+    (([lemma],newcat),(oper, appSig sig args)) |
-      lin  <- tabularLinearizes (pgf env) (lang env) (mkApp f []), -- [[(String, String)]]
+        (oldcat,args) <- raws,
-      Just (c,op,ints) <- [M.lookup cat (config env)]
+        Just (newcat, oper, sig) <- [M.lookup oldcat (config env)],
        let lemma = args !! head (fst sig)
   ] 
-  appSig ints forms = [forms !! i | i <- ints]
+  appSig (ints,feats) args = ([args !! i | i <- ints], [args !! i | i <- feats])
  mergeRules :: [RawRule] -> [RawRule]
  mergeRules = map head . groupBy (\x y -> snd x == snd y) . sortOn snd
  nameFunctions :: [RawRule] -> [RuleData]
  nameFunctions = expandNames . sortOn fst
  expandNames :: [RawRule] -> [RuleData]
  expandNames fls = case fls of
-    (f,l):fls2 -> case span ((==f) . fst) fls2 of
+    (f@(w,c),l) : fls2 -> case span ((==f) . fst) fls2 of
-      ([],_) -> (mkFun f,l) : expandNames fls2
+      ([],_) -> ((mkFun (w ++ [c]),c),l) : expandNames fls2
      (fls1,fls3) -> renames ((f,l):fls1) ++ expandNames fls3
    _ -> []
-  renames fls = [(mkFun (init f ++ [show i,last f]),l) | (i,(f,l)) <- zip [1..] fls]
+  renames :: [RawRule] -> [RuleData]
  renames fls = [((mkFun (f ++ [show i,c]),c),l) | (i,((f,c),l)) <- zip [1..] fls] ---- TODO disambiguate with a form, not int
 mkFun = showCId . mkCId . concat . intersperse "_"
 quote s = "\"" ++ s ++ "\""
 {- ---- let us ignore this
  findCompounds :: [RuleData] -> [RuleData] 
  findCompounds = getCompounds . sortOn cat_orthrevforms
  cat_orthrevforms (_,(cat,_:forms)) = (cat,[map (!!i) fss | let fss = map reverse forms, i <- [0..minimum (map length fss) - 1]])
@@ -105,6 +157,7 @@ mkMorphoDict env =
  revstem = head . snd . cat_revforms
  wforms (_,(_,_:forms)) = forms
  getCompounds :: [RuleData] -> [RuleData] 
  getCompounds fls = case fls of
    fl : fls1 | length (revstem fl) < 2 -> markWith fl [] : getCompounds fls1 
    fl : fls2 -> case span (\x -> and [isPrefixOf (reverse w) (reverse w1) | (w,w1) <- zip (wforms fl) (wforms x)]) fls2 of
@@ -117,26 +170,14 @@ mkMorphoDict env =
      True  -> markWith fl1 [";","--","compound",(fst fl)]
      False -> markWith fl1 [";","--","notcompound",(fst fl)]
-  markWith (f,(c,op:ws)) xs = (f,(c,op : map quote ws ++ xs))
+  markWith (f,(c,op:ws)) xs = (f,(c,op : map quote ws ++ xs)) ---- TODO only quote string args, not features
  isPrefixWord x xy =
-    length suff > 1 &&
+    length suff > 1 &&                                ---- compound first part must be at least two letters long
-    any (\c -> elem c "-0123456789aeiouyåäö") suff &&
+    any (\c -> elem c "-0123456789aeiouyåäö") suff && ---- must contain a vowel or a digit 
-    isPrefixOf x xy
+    isPrefixOf x xy                                   ---- and of course be a prefix
   where
     suff = drop (length x) xy
-
+-}
 mkFun = quoteIf . concat . intersperse "_"
 quoteIf s = case s of
  _ | any (\c -> not (isAlphaNum c || elem c "_'")) s -> "'" ++ unSgQuote s ++ "'"
  c:_ | not (isAlpha c) -> "W_" ++ s
  _ -> s
 where
  unSgQuote s = case s of
    '\'':cs -> "\\\'" ++ unSgQuote cs
    c:cs -> c : unSgQuote cs
    _ -> s
 quote s = "\"" ++ s ++ "\""