From 9790e07f6eb714c04840561fa87b5866ae736a29 Mon Sep 17 00:00:00 2001 From: aarneranta Date: Tue, 3 Mar 2020 18:10:02 +0100 Subject: [PATCH] generating MorphoDict Swe from SALDO sources --- src/morphodict/MorphoDictSwe.config | 2 + src/morphodict/MorphoDictSwe.header | 4 +- src/morphodict/MorphoDictSweAbs.header | 2 +- src/morphodict/saldo/Saldo.header | 1 + src/morphodict/saldo/SaldoGF.hs | 97 ++++++++++++++++++++++++++ src/morphodict/saldo/SaldoSwe.header | 50 +++++++++++++ 6 files changed, 152 insertions(+), 4 deletions(-) create mode 100644 src/morphodict/saldo/Saldo.header create mode 100644 src/morphodict/saldo/SaldoGF.hs create mode 100644 src/morphodict/saldo/SaldoSwe.header diff --git a/src/morphodict/MorphoDictSwe.config b/src/morphodict/MorphoDictSwe.config index e3773d89b..b541c0631 100644 --- a/src/morphodict/MorphoDictSwe.config +++ b/src/morphodict/MorphoDictSwe.config @@ -4,3 +4,5 @@ V : V mkV 6 0 4 2 8 10 V2 : V mkV 6 0 4 2 8 10 Adv : Adv mkAdv 0 Prep : Prep mkPrep 0 +PN : PN mkPN 0 + diff --git a/src/morphodict/MorphoDictSwe.header b/src/morphodict/MorphoDictSwe.header index 9ab75fdd4..42fcc0113 100644 --- a/src/morphodict/MorphoDictSwe.header +++ b/src/morphodict/MorphoDictSwe.header @@ -1,10 +1,8 @@ concrete MorphoDictSwe of MorphoDictSweAbs = - CatSwe [N,A,V,Adv,Prep] ** + CatSwe [N,A,V,Adv,Prep,PN] ** open ParadigmsSwe in { -oper mkkN : (apa,apan,apor,aporna,ap : Str) -> N - = \apa,apan,apor,aporna,ap -> changeCompoundN ap (mkN apa apan apor aporna) ; diff --git a/src/morphodict/MorphoDictSweAbs.header b/src/morphodict/MorphoDictSweAbs.header index 2a9508bd9..6d49ccfd5 100644 --- a/src/morphodict/MorphoDictSweAbs.header +++ b/src/morphodict/MorphoDictSweAbs.header @@ -1,4 +1,4 @@ abstract MorphoDictSweAbs = - Cat [N,A,V,Adv,Prep] ** + Cat [N,A,V,Adv,Prep,PN] ** { diff --git a/src/morphodict/saldo/Saldo.header b/src/morphodict/saldo/Saldo.header new file mode 100644 index 000000000..b48dd588f --- /dev/null +++ 
b/src/morphodict/saldo/Saldo.header
@@ -0,0 +1 @@
+abstract Saldo = Cat [N,A,V,PN,Adv,Prep] ** {
diff --git a/src/morphodict/saldo/SaldoGF.hs b/src/morphodict/saldo/SaldoGF.hs
new file mode 100644
index 000000000..0ce45d0ee
--- /dev/null
+++ b/src/morphodict/saldo/SaldoGF.hs
@@ -0,0 +1,127 @@
+import Data.List
+import qualified Data.Map as M
+
+-- AR 2020-03-03
+-- generating GF from preprocessed SALDO (of type Lex by John Camilleri)
+
+-- | Reads the preprocessed SALDO dump and writes one GF rule per line:
+-- "fun" rules to abs.tmp and "lin" rules to cnc.tmp. The generated files
+-- need headers; presumably these are Saldo.header and SaldoSwe.header in
+-- this directory (TODO confirm), which both leave the module body open.
+main :: IO ()
+main = do
+  lexicon <- fmap readLex (readFile "saldom.hsdump") -- this is the preprocessed file
+  let gf = map (mkRules . treatNone) $ mkFuns lexicon
+  writeFile "abs.tmp" $ unlines $ map fst gf -- the generated files need headers
+  writeFile "cnc.tmp" $ unlines $ map snd gf
+
+-- JC's datatypes, using String for simplicity
+
+type Lex = M.Map String Entry -- key is lemgram ID
+
+type Table = [(String,String)]
+
+data Entry = E
+  { ePOS :: String
+  , eTable :: Table -- morphological tags to surface form: ("sg def gen" ,"killens")
+  } deriving (Show, Read)
+
+-- | Parses the dump into (lemgram ID, entry) pairs. A prefix as long as
+-- "fromList" is skipped: presumably the dump is a shown 'Lex', which starts
+-- with that word, so that the rest reads as a plain association list.
+readLex :: String -> [(String,Entry)]
+readLex = read . drop (length "fromList")
+
+-- new code by AR
+
+-- | Builds the abstract "fun" rule and the concrete "lin" rule for one
+-- (function, category, linearization) triple.
+mkRules :: (String,String,String) -> (String,String)
+mkRules (fun,cat,lin) = (rule ["fun",fun,":",cat,";"], rule ["lin",fun,"=",lin,";"])
+  where
+    -- commenting out functions that still have NONE forms
+    rule ws = unwords (marker ++ ws)
+    marker = if "\"NONE\"" `elem` words lin then ["--n"] else []
+
+-- | Converts incomplete paradigms to special mkC constructors, defined in
+-- SaldoSwe.header. Only the arguments are matched (the constructor name is
+-- dropped first); the first match wins and anything else stays unchanged.
+treatNone :: (String,String,String) -> (String,String,String)
+treatNone (f,cat,lin) = case (cat,drop 1 (words lin)) of
+  ("V", "\"NONE\"":"\"NONE\"":v:_)     -> (f, "V",  unwords ["mkVDep",v])
+  ("V", i:d:p:a:b:"\"NONE\"":_)        -> (f, "V",  unwords ["mkVIntr",i,d,p,a,b])
+  ("A", i:"\"NONE\"":p:c:s:_)          -> (f, "A",  unwords ["mkAUtr",i,p,c,s])
+  ("A", i:d:p:"\"NONE\"":"\"NONE\"":_) -> (f, "A",  unwords ["mkAComp",i,d,p])
+  ("N", "\"NONE\"":d:"\"NONE\"":_)     -> (f, "PN", unwords ["mkPNDef",d]) ---
+  ("N", i:"\"NONE\"":"\"NONE\"":_)     -> (f, "PN", unwords ["mkPNIndef",i])
+  ("N", i:d:"\"NONE\"":"\"NONE\"":_)   -> (f, "N",  unwords ["mkNSg",i,d])
+  ("N", "\"NONE\"":"\"NONE\"":i:d:_)   -> (f, "N",  unwords ["mkNPl",i,d])
+  _ -> (f,cat,lin)
+
+--- generating function names for simplicity: the result is fed to ../MkMorphoDict anyway
+mkFuns :: [(String,Entry)] -> [(String,String,String)]
+mkFuns lx =
+  [ ("w" ++ show i, cat, lin)
+  | (i,(cat,lin)) <- zip [1000000 :: Integer ..] (concatMap (entry2lin . snd) lx)
+  ]
+
+-- | One (category, linearization) pair per table that 'manyTables' builds
+-- from the characteristic forms of the entry.
+entry2lin :: Entry -> [(String,String)]
+entry2lin e =
+  [(cat, mkLin cat ws) | ws <- manyTables valuess]
+  where
+    (cat,forms) = formSpec (ePOS e)
+    -- the distinct surface forms found for each characteristic tag
+    valuess = [nub [v | (t,v) <- eTable e, t == f] | f <- forms]
+    mkLin c ws = unwords $ ("mk" ++ c) : ["\"" ++ w ++ "\"" | w <- ws]
+
+-- looking for the characteristic forms for each POS
+
+formSpec :: String -> (String,[String])
+formSpec pos = case pos of
+  "nn" -> ("N",[
+    "sg indef nom",
+    "sg def nom",
+    "pl indef nom",
+    "pl def nom"
+    ])
+  "av" -> ("A",[
+    "pos indef sg u nom",
+    "pos indef sg n nom",
+    "pos indef pl nom",
+    "komp nom",
+    "super indef nom"
+    ])
+  "vb" -> ("V",[
+    "inf aktiv",
+    "pres ind aktiv",
+    "imper",
+    "pret ind aktiv",
+    "sup aktiv",
+    "pret_part indef sg u nom"
+    ])
+  "ab" -> ("Adv",[
+    "invar"
+----    "pos"
+    ])
+  "pp" -> ("Prep",[
+    "invar"
+    ])
+  -- ignoring other POS tags, which are rare anyway: the placeholder tag is not
+  -- expected to occur in any table, so entry2lin yields nothing for them
+  _ -> ("NONE" ++ pos,["NONE" ++ pos])
+
+-- trying to generate a small number of tables from sets of variant forms; seems to work well enough
+-- (table i takes the i-th variant of every form; a form with fewer variants
+-- repeats its first one, a form with no variant at all becomes NONE)
+
+manyTables :: [[String]] -> [[String]]
+manyTables [] = [] -- no forms at all: maximum below would fail on []
+manyTables formss =
+  [ map (pick i) formss | i <- [0 .. maximum (map length formss) - 1] ]
+  where
+    pick _ [] = "NONE"
+    pick i fs@(f:_) = (fs ++ repeat f) !! i
+
+
diff --git a/src/morphodict/saldo/SaldoSwe.header b/src/morphodict/saldo/SaldoSwe.header
new file mode 100644
index 000000000..0d877b076
--- /dev/null
+++ b/src/morphodict/saldo/SaldoSwe.header
@@ -0,0 +1,59 @@
+concrete SaldoSwe of Saldo =
+  CatSwe [N,A,V,Adv,Prep,PN] **
+  open
+  ParadigmsSwe, Prelude
+  in
+  {
+
+-- to deal with incomplete paradigms
+--- the values could be in special categories to avoid overgeneration
+-- (treatNone in SaldoGF.hs decides which of these replaces a plain mkC call)
+
+oper
+  -- verbs whose first two forms are missing: built with depV from the third
+  mkVDep : Str -> V
+    = \v -> case v of {
+      x + "as" => depV (mkV (x + "a")) ;
+      x + "es" => depV (mkV (x + "er")) ;
+      x + "s" => depV (mkV (x + "er")) ;
+      _ => Predef.error (v ++ "not for mkVDep")
+    } ;
+
+  -- verbs whose sixth form is missing: uses the 1st, 4th and 5th form only
+  mkVIntr : (_,_,_,_,_ : Str) -> V
+    = \i,_,_,p,pt -> mkV i p pt ; ---
+
+  -- adjectives whose second form is missing: the first form is passed twice
+  mkAUtr : (_,_,_,_ : Str) -> A
+    = \u,p,c,s -> mkA u u p c s ; ---
+
+  -- adjectives whose fourth and fifth forms are missing; p is not used
+  mkAComp : (_,_,_ : Str) -> A
+    = \u,n,p -> compoundA (mkA u n) ; ---
+
+  -- nouns with the second form only: gender chosen by a final "n"
+  mkPNDef : Str -> PN
+    = \s -> case s of {
+      _ + "n" => mkPN s utrum ;
+      _ => mkPN s neutrum
+    } ;
+
+  -- nouns whose second and third forms are missing: always neutrum
+  mkPNIndef : Str -> PN
+    = \s -> mkPN s neutrum ; ---
+
+  -- nouns without the last two forms: gender chosen by a final "n" of d
+  mkNSg : (_,_ : Str) -> N
+    = \i,d -> case d of {
+      _ + "n" => mkN i utrum ; ---
+      _ => mkN i neutrum ---
+    } ;
+
+  -- nouns without the first two forms: stem guessed from i; d is not used
+  mkNPl : (_,_ : Str) -> N
+    = \i,d -> case i of {
+      s + "or" => mkN (s + "a") ; ---
+      s + ("ar"|"er") => mkN s i ; ---
+      s + "en" => mkN (s + "e") i ; ---
+      _ => mkN i i ---
+    } ;