1
0
forked from GitHub/gf-core
Files
gf-core/lib/src/finnish/kotus/Kotus.hs

196 lines
6.5 KiB
Haskell

import qualified Data.Map as M
import qualified Data.Set as S
import System
-- build DictFin from KOTUS word list. See Makefile for how to run
-- AR 28/12/2010
main = do
f:xx <- getArgs
ss <- readFile f >>= return . lines
let dict0 = map getEntry ss
let dictList = [(word e,e) | e <- dict0]
let dictMap = M.fromAscList dictList
let adjList = [(adj,adv) | (adv,e) <- dictList,
cat e == catAdverb, -- kultamitali - kultamitalisti, hence need this
Just adj <- [lookAdj adv dictMap]]
--- mapM_ (\ (adj,adv) -> putStrLn (adv ++ " - " ++ adj)) adjList -- to see the adjectives
let compoundList = [(whole,(end,beg)) | (whole,e) <- dictList,
elem (cat e) [catNoun,catAdjective],
Just (end,beg) <- [lookCompound (whole,e) dictMap]]
--- mapM_ (\ (x,(y,_)) -> putStrLn (unwords [x,"-", word y])) compoundList -- to see compounds
let (adjSet,advSet) = let (adjs,advs) = unzip adjList in (S.fromList adjs,S.fromList advs)
let compoundMap = M.fromList compoundList
let dictList1 = map (mkAdjAdv adjSet advSet) dictList
let dictListComp = concatMap (mkCompound compoundMap) dictList1
let dictList2 = filter (flip M.notMember compoundMap . fst) dictList1
let dictList3 = case xx of
"-compounds":_ -> dictListComp
"-all":_ -> dictList2 ++ dictListComp
_ -> dictList2
let dict3 = map snd dictList3
mapM_ mkRules dict3
----------------------------------------------------------------
-- identify the parts of compounds
-- longest match, suffix length 3..10, prefix length at least 3, both parts exist
-- this gives 36557 compounds...
--- no recursion to multi-word compounds
lookCompound :: (String, Entry) -> M.Map String Entry -> Maybe (Entry,String)
lookCompound (w,ent) dict =
looks [splitAt i w | let k = length w, i <- [k-3, k-4 .. max (k-10) 3]]
where
looks ws = case [(e,u) | (u,v) <- ws,
Just e <- [M.lookup v dict], cat e == catNoun,
Just _ <- [M.lookup u dict]] of
eu :_ -> return eu
_ -> Nothing
-- return compounds
mkCompound compMap (w,ent) =
case M.lookup w compMap of
Just (e,b) -> return (w, ent {
word = word e,
tn = tn e,
par = unwords ["compoundNK",par e,quoted b],
isDummy = tn e == 0
}
)
_ -> []
----------------------------------------------------------------
--
-- for words ending "-sti", look for corresponding adjective. If found, mark the adjective
-- as adjective, and eliminate the adverb.
mkAdjAdv adjSet advSet (w,e) =
if S.member w adjSet && (cat e == catNoun)
then (w,e{cat = catAdjective, fun = take (length (fun e) - 2) (fun e) ++ "AK"})
else if S.member w advSet && (cat e == catAdverb) then (w,e{isDerived = True})
else (w,e)
lookAdj adv dict = case splitAt (length adv - 3) adv of
(adj,"sti") -> case [e | a <- adjCandidates adj,
Just e <- [M.lookup a dict], cat e == catNoun] of
e :_ -> return $ word e
_ -> Nothing
_ -> Nothing
adjCandidates adj = case reverse adj of
'e':'s': w -> [reverse w ++ "nen", adj] -- vihainen - vihaisesti
'a':'a': 'k' : 'k' : w ->
[reverse w ++ "kas", adj] -- halukas
'a':'a': w ->
[init adj ++ "s", adj] -- hurskas - hurskaasti, suulas - suulaasti
v : v': 'k' : 'k' : w | v == v' && fromEnum v == 228 ->
[reverse w ++ "käs", adj] -- nenäkäs - nenäkkäästi
v : v': w | v == v' && fromEnum v == 228 ->
[init adj ++ "s", adj] -- ylväs - ylväästi
'e':'e': w -> let rw = reverse w in
[rw ++ "e", rw ++ "ut", rw ++ "yt", adj] -- terveesti, ahdistuneesti
'i':'i': w -> [reverse w ++ "is", adj] -- kaunis - kauniisti
'e':_ -> [init adj ++ "i", adj] -- suuri - suuresti (not: suure - suuresti)
_ -> [adj]
-------------------------------------------------
-- produce rules
mkRules e = do
putRule $ mkFun (fun e) (cat e)
putRule $ mkLin (fun e) (par e) (word e)
where
putRule
| isDummy e = const (return ()) -- putStrLn . ("-- " ++)
| isDerived e = putStrLn . ("--+ " ++)
| isPlurTant e = putStrLn . ("--? " ++)
| otherwise = putStrLn
mkFun fun cat = unwords ["fun",fun,":",cat,";"]
mkLin fun par w = case words par of
f@"compoundNK":p:v:_ -> unwords ["lin",fun,"=","{s","=",f,v,"("++ p,quoted w ++")}",";"]
_ -> unwords ["lin",fun,"=","{s","=",par,quoted w ++"}",";"]
mkId = concatMap trim where
trim c = case fromEnum c of
32 -> "_" -- space
45 -> "_" -- -
224 -> "a''" -- à
228 -> "a'" -- ä
246 -> "o'" -- ö
252 -> "u'" -- ü
x | x < 65 || (x > 90 && x < 97) || x > 122 -> "_"
_ -> [c]
quoted s = "\"" ++ s ++ "\""
-- analyse each line in KOTUS by this
getEntry :: String -> Entry
getEntry s = emptyEntry {
word = w,
tn = p,
av = g,
hn = h,
fun = mkId w ++ "_" ++ (if h=="0" then "" else h ++ "_") ++ c,
cat = c,
par = (if p < 52 then "d" else "c") ++ num p ++ (if g == "0" then "" else "A"),
isSuffix = suff,
isDummy = elem p [0,50,51,101],
isPlurTant = last w == 't' && notElem p [5,43,47,99]
}
where
x = getTags s
(w,suff) = let w0 = tagged "s" x in if (head w0 == '-') then (tail w0, True) else (w0,False)
p = (read (tagged "tn" x) :: Int)
h = tagged "hn" x
g = tagged "av" x
c = if p < 50 then catNoun
else if p < 99 then catVerb
else catAdverb
num p = if p < 10 then "0" ++ show p else show p
tagged :: String -> Tags -> String
tagged s x = maybe "0" id $ lookup s x
-- get values of leave tags
getTags :: String -> Tags
getTags s = case s of
'<':rest -> case break (=='>') rest of
(tag,_:more) -> case break (=='<') more of
([],_) -> getTags more
(v,end) -> (tag,v):getTags end
_ -> []
_ -> []
type Tags = [(String,String)]
data Entry = Entry {
word :: String,
tn :: Int,
av :: String,
hn :: String,
fun :: String,
cat :: String,
par :: String,
comp1 :: String, -- compound word with this as inflected first part
comp2 :: String, -- compound word with this as last part
isSuffix :: Bool, -- used only as suffix, e.g. "-tekoinen"
isDummy :: Bool, -- no inflection information, or compound, or pron, or adverb from adjective
isPlurTant :: Bool, -- plurale tantum, e.g. "sakset"
isDerived :: Bool -- is derived from other words, e.g. adverb from adjective
}
emptyEntry = Entry "" 0 "" "" "" "" "" "" "" False False False False
catNoun = "NK"
catVerb = "VK"
catAdjective = "AK"
catAdverb = "AdvK"