mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-12 22:39:31 -06:00
mapped DictEng to Fin via Wordnet and Kotus
This commit is contained in:
57969
lib/src/finnish/kotus/src/DictEng.senses
Normal file
57969
lib/src/finnish/kotus/src/DictEng.senses
Normal file
File diff suppressed because it is too large
Load Diff
57969
lib/src/finnish/kotus/src/DictEngFin.senses
Normal file
57969
lib/src/finnish/kotus/src/DictEngFin.senses
Normal file
File diff suppressed because it is too large
Load Diff
@@ -7,28 +7,56 @@
|
||||
-- 25/3: 93,696 words, 2628 not found, 6964 unsure
|
||||
-- 26/3: new heuristics for adjectives of type "älyperäinen", 1258 not found
|
||||
|
||||
|
||||
import Data.List
|
||||
import Data.Char
|
||||
import qualified Data.Map as M
|
||||
|
||||
main = do
|
||||
kotus <- readFile "kotus-sanalista_v1.xml" >>= return . mkKotus . lines
|
||||
--- kotus <- readFile "kotus-sanalista_v1.xml" >>= return . mkKotus . lines
|
||||
-- do lookup
|
||||
---- interact $ unlines . map (look kotus) . lines
|
||||
--- interact $ unlines . map (look kotus) . lines
|
||||
|
||||
--- just print all words
|
||||
--- mapM_ (putStrLn . fst) $ M.toList kotus
|
||||
--- debug
|
||||
--- mapM_ print $ take 60 $ M.toList kotus -- debug
|
||||
|
||||
-- analyse fin wordnet senses
|
||||
interact $ unlines . map (annotateWordnet kotus) . lines
|
||||
--- analyse fin wordnet senses
|
||||
--- wordnet to kotus: runghc Kotus.hs <fiwn-wsenses.tsv >wn-kotus.txt
|
||||
-- interact $ unlines . map (annotateWordnet kotus) . lines
|
||||
|
||||
|
||||
--- find Finnish words for DictEng: runghc Kotus.hs <DictEng.senses >DictEngFin.senses
|
||||
|
||||
wnkotus <- readFile "wn-kotus.txt" >>= return . mkWNKotus . lines
|
||||
interact $ unlines . map (annotateSenseInflection wnkotus) . lines
|
||||
|
||||
|
||||
annotateSenseInflection wnkotus line = case tabs line of
|
||||
w:s:_ -> untabs (w : s : lookupSenseInflection wnkotus s)
|
||||
_ -> "-- " ++ line
|
||||
|
||||
type WNKotus = M.Map String [[String]]
|
||||
|
||||
mkWNKotus :: [String] -> WNKotus
|
||||
mkWNKotus = M.fromListWith (++) . map mkOne where
|
||||
mkOne s = case tabs s of
|
||||
w:ws -> (drop 3 w,[ws]) -- fi: dropped
|
||||
_ -> (s,[])
|
||||
|
||||
lookupSenseInflection :: WNKotus -> String -> [String]
|
||||
lookupSenseInflection wnkotus synset = case M.lookup synset wnkotus of
|
||||
Just fws@(_:_) -> minimumBy (\x y -> compare (last x) (last y)) fws -- choose one with the best status
|
||||
_ -> ["NOT_IN_KOTUS"]
|
||||
|
||||
|
||||
|
||||
|
||||
annotateWordnet :: Kotus -> String -> String
|
||||
annotateWordnet kotus line = case cat of
|
||||
"V" -> untabs (take 2 tline ++ [cat, lookk ((iline !! 1) !! 0)]) -- fi:v02758977 sataa lunta 0 -> sataa
|
||||
_ -> untabs (take 2 tline ++ [cat, lookk (last (iline !! 1))]) -- fi:n02769075 näytön taustakuva 0 -> taustakuva
|
||||
annotateWordnet kotus line = untabs $ [tline !! 0, cat, tline !! 1] ++ case cat of
|
||||
"V" -> lookk ((iline !! 1) !! 0) -- fi:v02758977 sataa lunta 0 -> sataa
|
||||
_ -> lookk (last (iline !! 1)) -- fi:n02769075 näytön taustakuva 0 -> taustakuva
|
||||
where
|
||||
iline = map words (init tline)
|
||||
tline = tabs line
|
||||
@@ -37,7 +65,7 @@ annotateWordnet kotus line = case cat of
|
||||
'a' -> "A"
|
||||
'n' -> "N"
|
||||
'r' -> "Adv"
|
||||
lookk = unwords . tail . words . look kotus -- remove repetition of lemma
|
||||
lookk = look kotus
|
||||
|
||||
tabs :: String -> [String]
|
||||
tabs s = case break (=='\t') s of
|
||||
@@ -47,24 +75,30 @@ tabs s = case break (=='\t') s of
|
||||
untabs :: [String] -> String
|
||||
untabs = concat . intersperse "\t"
|
||||
|
||||
look :: Kotus -> String -> String
|
||||
look :: Kotus -> String -> [String]
|
||||
look kotus w = case M.lookup w kotus of
|
||||
Just ["NOPAR"] -> lookCompound "INCOMPOUND" kotus w
|
||||
Just descr -> unwords $ w : descr
|
||||
_ -> lookCompound "OUTCOMPOUND" kotus w
|
||||
Just ["NOPAR"] -> lookCompound "INCOMPOUND" kotus w -- compound in Kotus
|
||||
Just descr -> descr ++ ["FOUND"]
|
||||
_ -> lookCompound "OUTCOMPOUND" kotus w -- compound not in Kotus
|
||||
|
||||
lookCompound :: String -> Kotus -> String -> String
|
||||
lookCompound :: String -> Kotus -> String -> [String]
|
||||
lookCompound pref kotus w = case sort (concatMap looks (splits w)) of -- sort: 1's first
|
||||
(_,descr):_ -> unwords $ w : [descr]
|
||||
_ -> unwords $ w : ["NOTFOUND",pref]
|
||||
(_,descr):_ -> descr
|
||||
_ -> ["MK", "XX5_NOTFOUND_"++pref] -- NOUTFOUND, worst - use smart paradigm
|
||||
where
|
||||
splits s = reverse [splitAt n s | n <- [3 .. length s - 3]]
|
||||
looks (x,y) = case M.lookup y kotus of
|
||||
Just descr | elem x compPrefixes || any isCompPrefix (compForms x) -> [(1,unwords $ x : y : descr ++ [pref])] -- preferred: 1
|
||||
Just descr -> [(2,unwords $ x : y : descr ++ ["UNSURE", pref])] -- secondary: 2
|
||||
_ | drop (length y - 3) y == "nen" && (elem x compPrefixes || any isCompPrefix (compForms x))
|
||||
-> [(3,unwords $ x : y : ["38","UNSURE",pref])] -- tertiary: 3
|
||||
Just descr | makesCompPrefix x -> comp 1 descr pref x y -- preferred: 1
|
||||
Just descr -> comp 2 descr ("XX3_UNSURE_" ++ pref) x y -- secondary: 2, UNSURE
|
||||
_ | drop (length y - 3) y == "nen" && makesCompPrefix x -> comp 3 ["38"] ("XX4_INVENTED_" ++ pref) x y
|
||||
_ | drop (length y - 3) y == "ttu" && makesCompPrefix x -> comp 3 ["1"] ("XX4_INVENTED_" ++ pref) x y
|
||||
_ | drop (length y - 3) y == "tty" && makesCompPrefix x -> comp 3 ["1"] ("XX4_INVENTED_" ++ pref) x y
|
||||
_ | drop (length y - 3) y == "nut" && makesCompPrefix x -> comp 3 ["47"] ("XX4_INVENTED_" ++ pref) x y
|
||||
_ | drop (length y - 3) y == "nyt" && makesCompPrefix x -> comp 3 ["47"] ("XX4_INVENTED_" ++ pref) x y
|
||||
_ -> []
|
||||
comp status descr pref x y = [(status, [unwords (descr ++ ["C", x ++ "_" ++ y]), pref])]
|
||||
|
||||
makesCompPrefix x = elem x compPrefixes || any isCompPrefix (compForms x)
|
||||
isCompPrefix x = case M.lookup x kotus of
|
||||
Just _ -> True
|
||||
_ -> case M.lookup (x ++ "-") kotus of
|
||||
@@ -90,8 +124,8 @@ oneKotusLine s = case untag s of w:ws -> (w,ws)
|
||||
|
||||
untag s = case break (=='<') (drop 7 s) of
|
||||
(w,d) -> case drop 4 d of
|
||||
'<':'t':'>':_ -> [w, takeWhile isDigit (drop 11 d), rest (drop 11 d)] -- 99, </st>
|
||||
'<':'h':'n':'>':h:'<':'/':'h':'n':'>':'<':'t':'>':_ -> [w, ['H',h], takeWhile isDigit (drop 21 d), rest (drop 21 d)] -- homonym
|
||||
'<':'t':'>':_ -> [w, takeWhile isDigit (drop 11 d) ++ rest (drop 11 d)] -- 99, </st>
|
||||
'<':'h':'n':'>':h:'<':'/':'h':'n':'>':'<':'t':'>':_ -> [w, ['H',h], takeWhile isDigit (drop 21 d) ++ rest (drop 21 d)] -- homonym
|
||||
_ -> [w, "NOPAR"] -- no paradigm given
|
||||
where
|
||||
rest t = case t of
|
||||
|
||||
80
lib/src/finnish/kotus/src/LookupWordnet.hs
Normal file
80
lib/src/finnish/kotus/src/LookupWordnet.hs
Normal file
@@ -0,0 +1,80 @@
|
||||
-- looking up a word from Princeton dict/index files
|
||||
|
||||
-- e.g. look for senses of DictEngAbs:
|
||||
-- runghc LookupWordnet.hs <~/GF/lib/src/dictEngFuns.txt
|
||||
|
||||
import Data.List
|
||||
import Data.Char
|
||||
import qualified Data.Map as M
|
||||
|
||||
main = do
|
||||
indw <- readFile "index.word"
|
||||
let poss = buildPoss (lines indw)
|
||||
mapM_ print $ take 20 $ drop 1000 $ M.toList poss
|
||||
inds <- readFile "index.sense"
|
||||
let index = buildIndex poss (lines inds)
|
||||
mapM_ print $ take 20 $ drop 1000 $ M.toList index
|
||||
|
||||
-- interactive search of synsets for words, sorted by category and descending frequency
|
||||
--- interact (unlines . map (look index) . lines)
|
||||
-- example:
|
||||
-- tailor
|
||||
-- n%1%10689564 v%2%00301662 v%2%01666327 v%2%01666717
|
||||
|
||||
-- look up the most frequent senses of a GF abstract syntax with entries like
|
||||
-- zoomorphism_N : N ;
|
||||
interact (unlines . map (lookGF index) . lines)
|
||||
|
||||
|
||||
type Index = M.Map String [String] -- word form to synset id's
|
||||
|
||||
look :: Index -> String -> String
|
||||
look index str = maybe "NOTFOUND" (unwords . nub . sort) $ M.lookup str index
|
||||
|
||||
-- index.sense
|
||||
-- academic%1:18:00:: 09759069 1 2
|
||||
-- academic%3:01:00:: 02599939 1 18
|
||||
|
||||
buildIndex :: Index -> [String] -> Index
|
||||
buildIndex poss = M.fromListWith (++) . map mkOne where
|
||||
mkOne s = case words s of
|
||||
w:i:_ -> let
|
||||
(word,rank) = span (/='%') w
|
||||
c = look poss i
|
||||
in (word, [c ++ take 2 rank ++ "%" ++ i])
|
||||
_ -> (s,[])
|
||||
|
||||
|
||||
buildPoss :: [String] -> Index
|
||||
buildPoss = M.fromListWith (++) . concatMap mkOne where
|
||||
mkOne s = case words s of
|
||||
w:c:rest -> [(i,[c]) | i <- dropWhile ((<8) . length) rest]
|
||||
_ -> []
|
||||
|
||||
|
||||
|
||||
lookGF :: Index -> String -> String
|
||||
lookGF index str = case words str of
|
||||
fun:_:cat:_ -> fun ++ " " ++ case look index (init (takeWhile (not . isUpper) fun)) of
|
||||
is -> case lookup (trunc cat) (map entry (words is)) of
|
||||
Just i -> i
|
||||
_ -> is ++ ":" ++ trunc cat
|
||||
_ -> "-- " ++ str
|
||||
where
|
||||
trunc cat = case cat of
|
||||
"Adv" -> "r"
|
||||
_ -> map toLower (take 1 cat) -- "a", "n", "v"
|
||||
entry cfi = let c = take 1 cfi in (c, c ++ drop 4 cfi)
|
||||
|
||||
|
||||
{-
|
||||
-- from index.word
|
||||
-- aboriginal a 3 3 & \ + 3 0 02599509 01037148 00813589
|
||||
|
||||
buildIndex :: [String] -> Index
|
||||
buildIndex = M.fromListWith (++) . map mkOne where
|
||||
mkOne s = case words s of
|
||||
w:c:rest -> (w,[ c++i | i <- dropWhile ((<8) . length) rest])
|
||||
_ -> (s,[])
|
||||
-}
|
||||
|
||||
58
lib/src/finnish/kotus/src/Wordnet.hs
Normal file
58
lib/src/finnish/kotus/src/Wordnet.hs
Normal file
@@ -0,0 +1,58 @@
|
||||
-- data structures for bilingual Eng-Fin wordnet
|
||||
-- so far: processing Princeton wordnet
|
||||
|
||||
import Data.List
|
||||
import Data.Char
|
||||
import qualified Data.Map as M
|
||||
|
||||
main = do
|
||||
mkPOS "data.adj" "A"
|
||||
mkPOS "data.adv" "Adv"
|
||||
mkPOS "data.noun" "N"
|
||||
mkPOS "data.verb" "V"
|
||||
|
||||
mkPOS file cat = do
|
||||
s <- readFile file
|
||||
mapM_ (putStrLn . prEntry . mkEngEntry cat) (dataLines s)
|
||||
|
||||
dataLines = filter ((/= " ") . take 1) . lines
|
||||
|
||||
type Wordnet = M.Map Ident Entry
|
||||
|
||||
data Entry = E {
|
||||
ident :: Ident,
|
||||
cat :: Ident,
|
||||
english :: [Lex],
|
||||
finnish :: [Lex]
|
||||
}
|
||||
|
||||
entry :: Entry
|
||||
entry = E {ident = "000", cat = "XXX", english = [], finnish = []}
|
||||
|
||||
prEntry e = unwords $ [ident e, cat e, ";"] ++ intersperse "|" (map prLex (english e)) ++ [";"] ++ intersperse "|" (map prLex (finnish e))
|
||||
|
||||
type Ident = String
|
||||
|
||||
type Lex = ([String],Ident) -- parts of multiword, paradigm name
|
||||
|
||||
prLex (ws,f) = unwords (f:[unwords (map quote ws)])
|
||||
|
||||
quote s = "\"" ++ s ++ "\""
|
||||
|
||||
-- English format, in data.adj
|
||||
-- 00004615 00 s 02 cut 0 shortened 0 001 & 00004413 a 0000 | with parts removed; "the drastically cut film"
|
||||
|
||||
mkEngEntry :: Ident -> String -> Entry
|
||||
mkEngEntry c line = case words line of
|
||||
i:_:_:_:ws -> entry {ident = i, cat = c, english = mkLin (usable ws)}
|
||||
where
|
||||
mkLin ws = [(words (unUnderscore w),"mkW" ++ c) | w <- ws]
|
||||
usable = takeWhile (isAlpha . head) . everyOther
|
||||
|
||||
everyOther :: [a] -> [a]
|
||||
everyOther xs = case xs of
|
||||
x:_:ys -> x : everyOther ys
|
||||
_ -> []
|
||||
|
||||
unUnderscore :: String -> String
|
||||
unUnderscore = map (\c -> if c == '_' then ' ' else c)
|
||||
208645
lib/src/finnish/kotus/src/wn-kotus.txt
Normal file
208645
lib/src/finnish/kotus/src/wn-kotus.txt
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user