prepared Kotus to produce adjectives and compounds (by heuristics)

2010-12-28 15:33:52 +00:00
parent b8842149be
commit 51d19683cf
5 changed files with 171 additions and 24 deletions
--- a/src/finnish/kotus/Kotus.hs
+++ b/src/finnish/kotus/Kotus.hs
@@ -1,17 +1,113 @@
-main = interact (unlines . concatMap mkOne . lines)
+import qualified Data.Map as M
+import qualified Data.Set as S
+import System

-mkOne = mkEntry . analyse
+-- build DictFin from KOTUS word list. See Makefile.
+-- AR 28/12/2010

-mkEntry (w,p,g,h) | elem p [0,50,51] = [] -- no inflection information, or a compound
-mkEntry (w,p,g,h) | head w == '-' = mkEntry (tail w,p,g,h) -- suffix only
-mkEntry (w,p,g,h) | last w == 't' && notElem p [5,43,47 ] = [] -- plurale tantum --- to do
-mkEntry (w,p,g,h) = [mkFun fun cat, mkLin fun par w] where
-  cat = if p < 50 then catNoun 
-        else if p < 99 then catVerb
-        else catAdverb
-  fun = mkId w ++ "_" ++ (if h=="0" then "" else h ++ "_") ++ cat
-  par = (if p < 52 then "d" else "c") ++ num p ++ (if g == "0" then "" else "A")
-  num p = if p < 10 then "0" ++ show p else show p
+main = do
+  f:xx <- getArgs
+  ss  <- readFile f >>= return . lines
+  let dict0 = map getEntry ss
+  let dictList = [(word e,e) | e <- dict0]
+  let dictMap = M.fromAscList dictList
+  let adjList = [(adj,adv) | (adv,e) <- dictList, 
+                          cat e == catAdverb, -- kultamitali - kultamitalisti, hence need this
+                          Just adj <- [lookAdj adv dictMap]]
+---  mapM_ (\ (adj,adv) -> putStrLn (adv ++ " - " ++ adj)) adjList -- to see the adjectives
+  let compoundList = [(whole,(end,beg)) | (whole,e) <- dictList,
+                          elem (cat e) [catNoun,catAdjective],
+                          Just (end,beg) <- [lookCompound (whole,e) dictMap]]
+---  mapM_ (\ (x,(y,_)) -> putStrLn (unwords [x,"-", word y])) compoundList -- to see compounds
+  let (adjSet,advSet) = let (adjs,advs) = unzip adjList in (S.fromList adjs,S.fromList advs)
+  let dictList1 = map (mkAdjAdv adjSet advSet) dictList
+  let dictList2 = case xx of 
+        "-compounds":_ -> concatMap (mkCompound (M.fromList compoundList)) dictList1
+        "-all":_  -> dictList1 ++ concatMap (mkCompound (M.fromList compoundList)) dictList1
+        _ -> dictList1
+  let dict2 = map snd dictList2
+  mapM_ mkRules dict2
+
+
+
+
+----------------------------------------------------------------
+-- identify the parts of compounds
+
+-- longest match, suffix length 3..10, prefix length at least 3, both parts exist
+-- this gives 36557 compounds...
+--- no recursion to multi-word compounds
+lookCompound :: (String, Entry) -> M.Map String Entry -> Maybe (Entry,String)
+lookCompound (w,ent) dict = 
+  looks [splitAt i w | let k = length w, i <- [k-3, k-4 .. max (k-10) 3]] 
+ where
+  looks ws = case [(e,u) | (u,v) <- ws, 
+                       Just e <- [M.lookup v dict], cat e == catNoun,
+                       Just _ <- [M.lookup u dict]] of
+    eu :_ -> return eu
+    _ -> Nothing
+
+-- return compounds
+mkCompound compMap (w,ent) = 
+  case M.lookup w compMap of
+    Just (e,b) -> return (w, ent {
+            word = word e,
+            tn = tn e, 
+            par = "compoundNK " ++ par e ++ " " ++ quoted b,
+            isDummy = tn e == 0
+            }
+          )
+    _ -> []
+
+
+----------------------------------------------------------------
+--
+-- for words ending "-sti", look for corresponding adjective. If found, mark the adjective
+-- as adjective, and eliminate the adverb.
+
+mkAdjAdv adjSet advSet (w,e) = 
+  if S.member w adjSet && (cat e == catNoun) 
+     then (w,e{cat = catAdjective, fun = take (length (fun e) - 2) (fun e) ++ "AK"})
+  else if S.member w advSet && (cat e == catAdverb) then (w,e{isDerived = True})
+  else (w,e)
+
+lookAdj adv dict = case splitAt (length adv - 3) adv of
+  (adj,"sti") -> case [e | a <- adjCandidates adj, 
+                       Just e <- [M.lookup a dict], cat e == catNoun] of
+    e :_ -> return $ word e
+    _ -> Nothing
+  _ -> Nothing
+
+adjCandidates adj = case reverse adj of
+  'e':'s': w -> [reverse w ++ "nen", adj]  -- vihainen - vihaisesti
+  'a':'a': 'k' : 'k' : w -> 
+    [reverse w ++ "kas",  adj]  -- halukas
+  'a':'a': w -> 
+    [init adj ++ "s",  adj]  -- hurskas - hurskaasti, suulas - suulaasti
+  v  : v': 'k' : 'k' : w | v == v' && fromEnum v == 228 -> 
+    [reverse w ++ "käs",  adj] -- nenäkäs - nenäkkäästi
+  v  : v': w | v == v' && fromEnum v == 228 -> 
+    [init adj ++ "s",  adj] -- ylväs - ylväästi
+  'e':'e': w -> let rw = reverse w in 
+                [rw ++ "e", rw ++ "ut", rw ++ "yt", adj]  -- terveesti, ahdistuneesti
+  'i':'i': w -> [reverse w ++ "is",  adj]  -- kaunis - kauniisti
+  'e':_      -> [init adj ++ "i", adj]     -- suuri - suuresti (not: suure - suuresti)
+  _ -> [adj]
+
+
+
+-------------------------------------------------
+-- produce rules
+
+mkRules e = do
+  putRule $ mkFun (fun e) (cat e)
+  putRule $ mkLin (fun e) (par e) (word e)
+ where
+   putRule
+    | isDummy e = putStrLn . ("-- " ++)
+    | isDerived e = putStrLn . ("--+ " ++)
+    | isPlurTant e = putStrLn . ("--? " ++)
+    | otherwise = putStrLn 

 mkFun fun cat = unwords ["fun",fun,":",cat,";"]
 mkLin fun par w = unwords ["lin",fun,"=",par,quoted w,";"]
@@ -29,13 +125,32 @@ mkId = concatMap trim where

 quoted s = "\"" ++ s ++ "\""

-analyse :: String -> (String,Int,String,String)
-analyse s = (word,paradigm,gradation,homonym) where
-  word = tagged "s" x
-  paradigm = (read (tagged "tn" x) :: Int)
-  gradation = tagged "av" x
-  homonym = tagged "hn" x
+-- analyse each line in KOTUS by this
+
+getEntry :: String -> Entry
+getEntry s = emptyEntry {
+  word = w,
+  tn   = p,
+  av   = g,
+  hn   = h,
+  fun  = mkId w ++ "_" ++ (if h=="0" then "" else h ++ "_") ++ c,
+  cat  = c,
+  par  = (if p < 52 then "d" else "c") ++ num p ++ (if g == "0" then "" else "A"),
+  isSuffix = suff,
+  isDummy = elem p [0,50,51,101],
+  isPlurTant = last w == 't' && notElem p [5,43,47,99]
+  } 
+ where
  x = getTags s
+  (w,suff) = let w0 = tagged "s" x in if (head w0 == '-') then (tail w0, True) else (w0,False)
+  p = (read (tagged "tn" x) :: Int)
+  h = tagged "hn" x
+  g = tagged "av" x
+  c = if p < 50 then catNoun 
+      else if p < 99 then catVerb
+      else catAdverb
+  num p = if p < 10 then "0" ++ show p else show p
+

 tagged :: String -> Tags -> String
 tagged s x = maybe "0" id $ lookup s x
@@ -52,6 +167,25 @@ getTags s = case s of

 type Tags = [(String,String)]

+data Entry = Entry {
+  word :: String,
+  tn   :: Int,
+  av   :: String,
+  hn   :: String,
+  fun  :: String,
+  cat  :: String,
+  par  :: String,
+  comp1 :: String,  -- compound word with this as inflected first part
+  comp2 :: String,  -- compound word with this as last part
+  isSuffix :: Bool, -- used only as suffix, e.g. "-tekoinen"
+  isDummy :: Bool,  -- no inflection information, or compound, or pron, or adverb from adjective
+  isPlurTant :: Bool, -- plurale tantum, e.g. "sakset"
+  isDerived :: Bool   -- is derived from other words, e.g. adverb from adjective
+}
+
+emptyEntry = Entry "" 0 "" "" "" "" "" "" "" False False False False
+
 catNoun = "NK"
 catVerb = "VK"
+catAdjective = "AK"
 catAdverb = "AdvK"