mapped DictEng to Fin via Wordnet and Kotus

2026-05-28 04:08:55 -06:00 · 2013-03-28 08:16:45 +00:00
parent 9c7d1328a1
commit 26175a69df
6 changed files with 324776 additions and 21 deletions
--- a/lib/src/finnish/kotus/src/DictEng.senses
+++ b/lib/src/finnish/kotus/src/DictEng.senses
--- a/lib/src/finnish/kotus/src/DictEngFin.senses
+++ b/lib/src/finnish/kotus/src/DictEngFin.senses
--- a/lib/src/finnish/kotus/src/Kotus.hs
+++ b/lib/src/finnish/kotus/src/Kotus.hs
@@ -7,28 +7,56 @@
 -- 25/3: 93,696 words, 2628 not found, 6964 unsure
 -- 26/3: new heuristics for adjectives of type "älyperäinen", 1258 not found

+
 import Data.List
 import Data.Char
 import qualified Data.Map as M

 main = do
-  kotus <- readFile "kotus-sanalista_v1.xml" >>= return . mkKotus . lines
+---  kotus <- readFile "kotus-sanalista_v1.xml" >>= return . mkKotus . lines
 -- do lookup
----  interact $ unlines . map (look kotus) . lines
+---  interact $ unlines . map (look kotus) . lines

 --- just print all words
 ---  mapM_ (putStrLn . fst) $ M.toList kotus
 --- debug
 ---  mapM_ print $ take 60 $ M.toList kotus -- debug

-- analyse fin wordnet senses
-  interact $ unlines . map (annotateWordnet kotus) . lines
+--- analyse fin wordnet senses
+--- wordnet to kotus: runghc Kotus.hs <fiwn-wsenses.tsv >wn-kotus.txt
+--  interact $ unlines . map (annotateWordnet kotus) . lines
+
+
+--- find Finnish words for DictEng: runghc Kotus.hs <DictEng.senses >DictEngFin.senses
+
+  wnkotus <- readFile "wn-kotus.txt" >>= return . mkWNKotus . lines
+  interact $ unlines . map (annotateSenseInflection wnkotus) . lines
+
+
+annotateSenseInflection wnkotus line = case tabs line of
+  w:s:_ -> untabs (w : s : lookupSenseInflection wnkotus s)
+  _ -> "-- " ++ line
+
+type WNKotus = M.Map String [[String]]
+
+mkWNKotus :: [String] -> WNKotus
+mkWNKotus = M.fromListWith (++) . map mkOne where
+  mkOne s = case tabs s of
+    w:ws -> (drop 3 w,[ws]) -- fi: dropped
+    _ -> (s,[])
+
+lookupSenseInflection :: WNKotus -> String -> [String]
+lookupSenseInflection wnkotus synset = case M.lookup synset wnkotus of
+  Just fws@(_:_) -> minimumBy (\x y -> compare (last x) (last y)) fws  -- choose one with the best status
+  _ -> ["NOT_IN_KOTUS"]
+
+


 annotateWordnet :: Kotus -> String -> String
-annotateWordnet kotus line = case cat of
-  "V" -> untabs (take 2 tline ++ [cat, lookk ((iline !! 1) !! 0)]) -- fi:v02758977 sataa lunta 0   -> sataa
-  _   -> untabs (take 2 tline ++ [cat, lookk (last (iline !! 1))]) -- fi:n02769075 näytön taustakuva 0 -> taustakuva
+annotateWordnet kotus line = untabs $ [tline !! 0, cat, tline !! 1] ++ case cat of
+  "V" -> lookk ((iline !! 1) !! 0) -- fi:v02758977 sataa lunta 0   -> sataa
+  _   -> lookk (last (iline !! 1)) -- fi:n02769075 näytön taustakuva 0 -> taustakuva
 where
  iline = map words (init tline)
  tline = tabs line
@@ -37,7 +65,7 @@ annotateWordnet kotus line = case cat of
    'a' -> "A"
    'n' -> "N"
    'r' -> "Adv"
-  lookk = unwords . tail . words . look kotus  -- remove repetition of lemma
+  lookk = look kotus

 tabs :: String -> [String]
 tabs s = case break (=='\t') s of
@@ -47,24 +75,30 @@ tabs s = case break (=='\t') s of
 untabs :: [String] -> String
 untabs = concat . intersperse "\t"

-look :: Kotus -> String -> String
+look :: Kotus -> String -> [String]
 look kotus w = case M.lookup w kotus of
-  Just ["NOPAR"] -> lookCompound "INCOMPOUND" kotus w
-  Just descr -> unwords $ w : descr
-  _ -> lookCompound "OUTCOMPOUND" kotus w
+  Just ["NOPAR"] -> lookCompound "INCOMPOUND" kotus w -- compound in Kotus
+  Just descr -> descr ++ ["FOUND"]
+  _ -> lookCompound "OUTCOMPOUND" kotus w  -- compound not in Kotus

-lookCompound :: String -> Kotus -> String -> String
+lookCompound :: String -> Kotus -> String -> [String]
 lookCompound pref kotus w = case sort (concatMap looks (splits w)) of  -- sort: 1's first
-  (_,descr):_ -> unwords $ w : [descr] 
-  _ -> unwords $ w : ["NOTFOUND",pref]
+  (_,descr):_ -> descr 
+  _ -> ["MK", "XX5_NOTFOUND_"++pref]  -- NOUTFOUND, worst - use smart paradigm
 where
   splits s = reverse [splitAt n s | n <- [3 .. length s - 3]]
   looks (x,y) = case M.lookup y kotus of 
-     Just descr | elem x compPrefixes || any isCompPrefix (compForms x) -> [(1,unwords $ x : y : descr ++ [pref])] -- preferred: 1
-     Just descr -> [(2,unwords $ x : y : descr ++ ["UNSURE", pref])] -- secondary: 2
-     _ | drop (length y - 3) y == "nen" && (elem x compPrefixes || any isCompPrefix (compForms x)) 
-                                      -> [(3,unwords $ x : y : ["38","UNSURE",pref])] -- tertiary: 3
+     Just descr | makesCompPrefix x                           -> comp 1 descr  pref                      x y  -- preferred: 1
+     Just descr                                               -> comp 2 descr  ("XX3_UNSURE_"   ++ pref) x y  -- secondary: 2, UNSURE
+     _ | drop (length y - 3) y == "nen" && makesCompPrefix x  -> comp 3 ["38"] ("XX4_INVENTED_" ++ pref) x y 
+     _ | drop (length y - 3) y == "ttu" && makesCompPrefix x  -> comp 3 ["1"]  ("XX4_INVENTED_" ++ pref) x y 
+     _ | drop (length y - 3) y == "tty" && makesCompPrefix x  -> comp 3 ["1"]  ("XX4_INVENTED_" ++ pref) x y 
+     _ | drop (length y - 3) y == "nut" && makesCompPrefix x  -> comp 3 ["47"] ("XX4_INVENTED_" ++ pref) x y 
+     _ | drop (length y - 3) y == "nyt" && makesCompPrefix x  -> comp 3 ["47"] ("XX4_INVENTED_" ++ pref) x y 
     _ -> []
+   comp status descr pref x y = [(status, [unwords (descr ++ ["C", x ++ "_" ++ y]), pref])]
+
+   makesCompPrefix x = elem x compPrefixes || any isCompPrefix (compForms x) 
   isCompPrefix x = case M.lookup x kotus of
     Just _ -> True
     _ -> case M.lookup (x ++ "-") kotus of
@@ -90,8 +124,8 @@ oneKotusLine s = case untag s of w:ws -> (w,ws)

 untag s = case break (=='<') (drop 7 s) of 
  (w,d) -> case drop 4 d of
-    '<':'t':'>':_ -> [w, takeWhile isDigit (drop 11 d), rest (drop 11 d)]  -- 99, </st>
-    '<':'h':'n':'>':h:'<':'/':'h':'n':'>':'<':'t':'>':_ -> [w, ['H',h], takeWhile isDigit (drop 21 d), rest (drop 21 d)]  -- homonym
+    '<':'t':'>':_ -> [w, takeWhile isDigit (drop 11 d) ++ rest (drop 11 d)]  -- 99, </st>
+    '<':'h':'n':'>':h:'<':'/':'h':'n':'>':'<':'t':'>':_ -> [w, ['H',h], takeWhile isDigit (drop 21 d) ++ rest (drop 21 d)]  -- homonym
    _ -> [w, "NOPAR"]  -- no paradigm given
 where
   rest t = case t of
--- a/lib/src/finnish/kotus/src/LookupWordnet.hs
+++ b/lib/src/finnish/kotus/src/LookupWordnet.hs
@@ -0,0 +1,80 @@
+-- looking up a word from Princeton dict/index files
+
+-- e.g. look for senses of DictEngAbs:
+--  runghc LookupWordnet.hs <~/GF/lib/src/dictEngFuns.txt
+
+import Data.List
+import Data.Char
+import qualified Data.Map as M
+
+main = do
+  indw <- readFile "index.word"
+  let poss  = buildPoss (lines indw)
+  mapM_ print $ take 20 $ drop 1000 $ M.toList poss
+  inds <- readFile "index.sense"
+  let index = buildIndex poss (lines inds)
+  mapM_ print $ take 20 $ drop 1000 $ M.toList index
+
+-- interactive search of synsets for words, sorted by category and descending frequency
+---  interact (unlines . map (look index) . lines)
+-- example:
+-- tailor
+-- n%1%10689564 v%2%00301662 v%2%01666327 v%2%01666717
+
+--  look up the most frequent senses of a GF abstract syntax with entries like
+--  zoomorphism_N : N ;
+  interact (unlines . map (lookGF index) . lines)
+
+
+type Index = M.Map String [String] -- word form to synset id's
+
+look :: Index -> String -> String
+look index str = maybe "NOTFOUND" (unwords . nub . sort) $ M.lookup str index
+
+-- index.sense
+-- academic%1:18:00:: 09759069 1 2
+-- academic%3:01:00:: 02599939 1 18
+
+buildIndex :: Index -> [String] -> Index
+buildIndex poss = M.fromListWith (++) . map mkOne where
+  mkOne s = case words s of 
+    w:i:_ -> let 
+               (word,rank) = span (/='%') w 
+               c = look poss i
+             in (word, [c ++ take 2 rank ++ "%" ++ i])
+    _ -> (s,[])
+
+
+buildPoss :: [String] -> Index
+buildPoss = M.fromListWith (++) . concatMap mkOne where
+  mkOne s = case words s of 
+    w:c:rest -> [(i,[c]) | i <- dropWhile ((<8) . length) rest]
+    _ -> []
+
+
+
+lookGF :: Index -> String -> String
+lookGF index str = case words str of
+  fun:_:cat:_ -> fun ++ " " ++ case look index (init (takeWhile (not . isUpper) fun)) of
+    is -> case lookup (trunc cat) (map entry (words is)) of
+      Just i -> i
+      _ -> is ++ ":" ++ trunc cat
+  _ -> "-- " ++ str
+ where
+  trunc cat = case cat of 
+    "Adv" ->  "r"
+    _ -> map toLower (take 1 cat)  -- "a", "n", "v"
+  entry cfi = let c = take 1 cfi in (c, c ++ drop 4 cfi)
+    
+
+{-
+-- from index.word
+-- aboriginal a 3 3 & \ + 3 0 02599509 01037148 00813589
+
+buildIndex :: [String] -> Index
+buildIndex = M.fromListWith (++) . map mkOne where
+  mkOne s = case words s of 
+    w:c:rest -> (w,[ c++i | i <- dropWhile ((<8) . length) rest])
+    _ -> (s,[])
+-}
+
--- a/lib/src/finnish/kotus/src/Wordnet.hs
+++ b/lib/src/finnish/kotus/src/Wordnet.hs
@@ -0,0 +1,58 @@
+-- data structures for bilingual Eng-Fin wordnet
+-- so far: processing Princeton wordnet
+
+import Data.List
+import Data.Char
+import qualified Data.Map as M
+
+main = do
+  mkPOS "data.adj" "A"
+  mkPOS "data.adv" "Adv"
+  mkPOS "data.noun" "N"
+  mkPOS "data.verb" "V"
+
+mkPOS file cat = do
+  s <- readFile file
+  mapM_ (putStrLn . prEntry . mkEngEntry cat) (dataLines s)
+
+dataLines = filter ((/= " ") . take 1) . lines
+
+type Wordnet = M.Map Ident Entry
+
+data Entry = E {
+  ident   :: Ident,
+  cat     :: Ident,
+  english :: [Lex],
+  finnish :: [Lex]
+  }
+
+entry :: Entry
+entry = E {ident = "000", cat = "XXX", english = [], finnish = []}
+
+prEntry e = unwords $ [ident e, cat e, ";"] ++ intersperse "|" (map prLex (english e)) ++ [";"] ++ intersperse "|" (map prLex (finnish e))
+
+type Ident = String
+
+type Lex = ([String],Ident)  -- parts of multiword, paradigm name
+
+prLex (ws,f) = unwords (f:[unwords (map quote ws)])
+
+quote s = "\"" ++ s ++ "\""
+
+-- English format, in data.adj
+-- 00004615 00 s 02 cut 0 shortened 0 001 & 00004413 a 0000 | with parts removed; "the drastically cut film" 
+
+mkEngEntry :: Ident -> String -> Entry
+mkEngEntry c line = case words line of
+  i:_:_:_:ws -> entry {ident = i, cat = c, english = mkLin (usable ws)}
+ where
+  mkLin ws = [(words (unUnderscore w),"mkW" ++ c) | w <- ws]
+  usable = takeWhile (isAlpha . head) . everyOther
+
+everyOther :: [a] -> [a]
+everyOther xs = case xs of
+  x:_:ys -> x : everyOther ys
+  _ -> []
+
+unUnderscore :: String -> String
+unUnderscore = map (\c -> if c == '_' then ' ' else c)
--- a/lib/src/finnish/kotus/src/wn-kotus.txt
+++ b/lib/src/finnish/kotus/src/wn-kotus.txt