From 0fd9d1f1a75a6eb6d587338b2a54e5a245380df7 Mon Sep 17 00:00:00 2001 From: aarne Date: Tue, 28 Dec 2010 15:33:52 +0000 Subject: [PATCH] prepared Kotus to produce adjectives and compounds (by heuristics) --- lib/src/finnish/kotus/Kotus.gf | 5 + lib/src/finnish/kotus/Kotus.hs | 170 ++++++++++++++++++++++++--- lib/src/finnish/kotus/Makefile | 15 ++- lib/src/finnish/kotus/prelDictFin | 3 +- lib/src/finnish/kotus/prelDictFinAbs | 2 +- 5 files changed, 171 insertions(+), 24 deletions(-) diff --git a/lib/src/finnish/kotus/Kotus.gf b/lib/src/finnish/kotus/Kotus.gf index e2367df5d..4e007b977 100644 --- a/lib/src/finnish/kotus/Kotus.gf +++ b/lib/src/finnish/kotus/Kotus.gf @@ -297,5 +297,10 @@ oper c101 : Str -> {s : Str} -- pronouns etc = c99 ; -- dummy + +-- compound nouns, latter part inflected + compoundNK : (Str -> NForms) -> Str -> Str -> NForms = \d,x,y -> + let ys = d y in \\v => x + ys ! v ; + } diff --git a/lib/src/finnish/kotus/Kotus.hs b/lib/src/finnish/kotus/Kotus.hs index 061e7bff7..8f41b469c 100644 --- a/lib/src/finnish/kotus/Kotus.hs +++ b/lib/src/finnish/kotus/Kotus.hs @@ -1,17 +1,113 @@ -main = interact (unlines . concatMap mkOne . lines) +import qualified Data.Map as M +import qualified Data.Set as S +import System -mkOne = mkEntry . analyse +-- build DictFin from KOTUS word list. See Makefile. +-- AR 28/12/2010 -mkEntry (w,p,g,h) | elem p [0,50,51] = [] -- no inflection information, or a compound -mkEntry (w,p,g,h) | head w == '-' = mkEntry (tail w,p,g,h) -- suffix only -mkEntry (w,p,g,h) | last w == 't' && notElem p [5,43,47 ] = [] -- plurale tantum --- to do -mkEntry (w,p,g,h) = [mkFun fun cat, mkLin fun par w] where - cat = if p < 50 then catNoun - else if p < 99 then catVerb - else catAdverb - fun = mkId w ++ "_" ++ (if h=="0" then "" else h ++ "_") ++ cat - par = (if p < 52 then "d" else "c") ++ num p ++ (if g == "0" then "" else "A") - num p = if p < 10 then "0" ++ show p else show p +main = do + f:xx <- getArgs + ss <- readFile f >>= return . lines + let dict0 = map getEntry ss + let dictList = [(word e,e) | e <- dict0] + let dictMap = M.fromAscList dictList + let adjList = [(adj,adv) | (adv,e) <- dictList, + cat e == catAdverb, -- kultamitali - kultamitalisti, hence need this + Just adj <- [lookAdj adv dictMap]] +--- mapM_ (\ (adj,adv) -> putStrLn (adv ++ " - " ++ adj)) adjList -- to see the adjectives + let compoundList = [(whole,(end,beg)) | (whole,e) <- dictList, + elem (cat e) [catNoun,catAdjective], + Just (end,beg) <- [lookCompound (whole,e) dictMap]] +--- mapM_ (\ (x,(y,_)) -> putStrLn (unwords [x,"-", word y])) compoundList -- to see compounds + let (adjSet,advSet) = let (adjs,advs) = unzip adjList in (S.fromList adjs,S.fromList advs) + let dictList1 = map (mkAdjAdv adjSet advSet) dictList + let dictList2 = case xx of + "-compounds":_ -> concatMap (mkCompound (M.fromList compoundList)) dictList1 + "-all":_ -> dictList1 ++ concatMap (mkCompound (M.fromList compoundList)) dictList1 + _ -> dictList1 + let dict2 = map snd dictList2 + mapM_ mkRules dict2 + + + + +---------------------------------------------------------------- +-- identify the parts of compounds + +-- longest match, suffix length 3..10, prefix length at least 3, both parts exist +-- this gives 36557 compounds... +--- no recursion to multi-word compounds +lookCompound :: (String, Entry) -> M.Map String Entry -> Maybe (Entry,String) +lookCompound (w,ent) dict = + looks [splitAt i w | let k = length w, i <- [k-3, k-4 .. max (k-10) 3]] + where + looks ws = case [(e,u) | (u,v) <- ws, + Just e <- [M.lookup v dict], cat e == catNoun, + Just _ <- [M.lookup u dict]] of + eu :_ -> return eu + _ -> Nothing + +-- return compounds +mkCompound compMap (w,ent) = + case M.lookup w compMap of + Just (e,b) -> return (w, ent { + word = word e, + tn = tn e, + par = "compoundNK " ++ par e ++ " " ++ quoted b, + isDummy = tn e == 0 + } + ) + _ -> [] + + +---------------------------------------------------------------- +-- +-- for words ending "-sti", look for corresponding adjective. If found, mark the adjective +-- as adjective, and eliminate the adverb. + +mkAdjAdv adjSet advSet (w,e) = + if S.member w adjSet && (cat e == catNoun) + then (w,e{cat = catAdjective, fun = take (length (fun e) - 2) (fun e) ++ "AK"}) + else if S.member w advSet && (cat e == catAdverb) then (w,e{isDerived = True}) + else (w,e) + +lookAdj adv dict = case splitAt (length adv - 3) adv of + (adj,"sti") -> case [e | a <- adjCandidates adj, + Just e <- [M.lookup a dict], cat e == catNoun] of + e :_ -> return $ word e + _ -> Nothing + _ -> Nothing + +adjCandidates adj = case reverse adj of + 'e':'s': w -> [reverse w ++ "nen", adj] -- vihainen - vihaisesti + 'a':'a': 'k' : 'k' : w -> + [reverse w ++ "kas", adj] -- halukas + 'a':'a': w -> + [init adj ++ "s", adj] -- hurskas - hurskaasti, suulas - suulaasti + v : v': 'k' : 'k' : w | v == v' && fromEnum v == 228 -> + [reverse w ++ "käs", adj] -- nenäkäs - nenäkkäästi + v : v': w | v == v' && fromEnum v == 228 -> + [init adj ++ "s", adj] -- ylväs - ylväästi + 'e':'e': w -> let rw = reverse w in + [rw ++ "e", rw ++ "ut", rw ++ "yt", adj] -- terveesti, ahdistuneesti + 'i':'i': w -> [reverse w ++ "is", adj] -- kaunis - kauniisti + 'e':_ -> [init adj ++ "i", adj] -- suuri - suuresti (not: suure - suuresti) + _ -> [adj] + + + +------------------------------------------------- +-- produce rules + +mkRules e = do + putRule $ mkFun (fun e) (cat e) + putRule $ mkLin (fun e) (par e) (word e) + where + putRule + | isDummy e = putStrLn . ("-- " ++) + | isDerived e = putStrLn . ("--+ " ++) + | isPlurTant e = putStrLn . ("--? " ++) + | otherwise = putStrLn mkFun fun cat = unwords ["fun",fun,":",cat,";"] mkLin fun par w = unwords ["lin",fun,"=",par,quoted w,";"] @@ -29,13 +125,32 @@ mkId = concatMap trim where quoted s = "\"" ++ s ++ "\"" -analyse :: String -> (String,Int,String,String) -analyse s = (word,paradigm,gradation,homonym) where - word = tagged "s" x - paradigm = (read (tagged "tn" x) :: Int) - gradation = tagged "av" x - homonym = tagged "hn" x +-- analyse each line in KOTUS by this + +getEntry :: String -> Entry +getEntry s = emptyEntry { + word = w, + tn = p, + av = g, + hn = h, + fun = mkId w ++ "_" ++ (if h=="0" then "" else h ++ "_") ++ c, + cat = c, + par = (if p < 52 then "d" else "c") ++ num p ++ (if g == "0" then "" else "A"), + isSuffix = suff, + isDummy = elem p [0,50,51,101], + isPlurTant = last w == 't' && notElem p [5,43,47,99] + } + where x = getTags s + (w,suff) = let w0 = tagged "s" x in if (head w0 == '-') then (tail w0, True) else (w0,False) + p = (read (tagged "tn" x) :: Int) + h = tagged "hn" x + g = tagged "av" x + c = if p < 50 then catNoun + else if p < 99 then catVerb + else catAdverb + num p = if p < 10 then "0" ++ show p else show p + tagged :: String -> Tags -> String tagged s x = maybe "0" id $ lookup s x @@ -52,6 +167,25 @@ getTags s = case s of type Tags = [(String,String)] +data Entry = Entry { + word :: String, + tn :: Int, + av :: String, + hn :: String, + fun :: String, + cat :: String, + par :: String, + comp1 :: String, -- compound word with this as inflected first part + comp2 :: String, -- compound word with this as last part + isSuffix :: Bool, -- used only as suffix, e.g. "-tekoinen" + isDummy :: Bool, -- no inflection information, or compound, or pron, or adverb from adjective + isPlurTant :: Bool, -- plurale tantum, e.g. "sakset" + isDerived :: Bool -- is derived from other words, e.g. adverb from adjective +} + +emptyEntry = Entry "" 0 "" "" "" "" "" "" "" False False False False + catNoun = "NK" catVerb = "VK" +catAdjective = "AK" catAdverb = "AdvK" diff --git a/lib/src/finnish/kotus/Makefile b/lib/src/finnish/kotus/Makefile index 0b5493e2c..26003d406 100644 --- a/lib/src/finnish/kotus/Makefile +++ b/lib/src/finnish/kotus/Makefile @@ -1,10 +1,17 @@ gf: abs cnc -abs: +src: + runghc Kotus.hs src/kotus-sanalista_v1.xml >src.tmp + +compounds: + runghc Kotus.hs src/kotus-sanalista_v1.xml -compounds >src.tmp + +abs: src cp prelDictFinAbs DictFinAbs.gf - runghc Kotus.hs >DictFinAbs.gf + grep ":" src.tmp >>DictFinAbs.gf echo "}" >>DictFinAbs.gf -cnc: + +cnc: src cp prelDictFin DictFin.gf - runghc Kotus.hs >DictFin.gf echo "}" >>DictFin.gf diff --git a/lib/src/finnish/kotus/prelDictFin b/lib/src/finnish/kotus/prelDictFin index f066b9da8..ddabe432e 100644 --- a/lib/src/finnish/kotus/prelDictFin +++ b/lib/src/finnish/kotus/prelDictFin @@ -4,4 +4,5 @@ concrete DictFin of DictFinAbs = open MorphoFin, Kotus, Prelude in { flags coding = utf8 ; -lincat NK = NForms ; VK = VForms ;AdvK = SS ; +lincat NK = NForms ; AK = NForms ; VK = VForms ; AdvK = SS ; + diff --git a/lib/src/finnish/kotus/prelDictFinAbs b/lib/src/finnish/kotus/prelDictFinAbs index 28a6595da..77eaafd28 100644 --- a/lib/src/finnish/kotus/prelDictFinAbs +++ b/lib/src/finnish/kotus/prelDictFinAbs @@ -1,3 +1,3 @@ abstract DictFinAbs = { -cat NK ; VK ; AdvK ; +cat NK ; AK ; VK ; AdvK ;