forked from GitHub/gf-core
196 lines
6.5 KiB
Haskell
196 lines
6.5 KiB
Haskell
import qualified Data.Map as M
|
|
import qualified Data.Set as S
|
|
import System
|
|
|
|
-- build DictFin from KOTUS word list. See Makefile for how to run
|
|
-- AR 28/12/2010
|
|
|
|
main = do
|
|
f:xx <- getArgs
|
|
ss <- readFile f >>= return . lines
|
|
let dict0 = map getEntry ss
|
|
let dictList = [(word e,e) | e <- dict0]
|
|
let dictMap = M.fromAscList dictList
|
|
let adjList = [(adj,adv) | (adv,e) <- dictList,
|
|
cat e == catAdverb, -- kultamitali - kultamitalisti, hence need this
|
|
Just adj <- [lookAdj adv dictMap]]
|
|
--- mapM_ (\ (adj,adv) -> putStrLn (adv ++ " - " ++ adj)) adjList -- to see the adjectives
|
|
let compoundList = [(whole,(end,beg)) | (whole,e) <- dictList,
|
|
elem (cat e) [catNoun,catAdjective],
|
|
Just (end,beg) <- [lookCompound (whole,e) dictMap]]
|
|
--- mapM_ (\ (x,(y,_)) -> putStrLn (unwords [x,"-", word y])) compoundList -- to see compounds
|
|
let (adjSet,advSet) = let (adjs,advs) = unzip adjList in (S.fromList adjs,S.fromList advs)
|
|
let compoundMap = M.fromList compoundList
|
|
let dictList1 = map (mkAdjAdv adjSet advSet) dictList
|
|
let dictListComp = concatMap (mkCompound compoundMap) dictList1
|
|
let dictList2 = filter (flip M.notMember compoundMap . fst) dictList1
|
|
let dictList3 = case xx of
|
|
"-compounds":_ -> dictListComp
|
|
"-all":_ -> dictList2 ++ dictListComp
|
|
_ -> dictList2
|
|
let dict3 = map snd dictList3
|
|
mapM_ mkRules dict3
|
|
|
|
|
|
|
|
|
|
----------------------------------------------------------------
|
|
-- identify the parts of compounds
|
|
|
|
-- longest match, suffix length 3..10, prefix length at least 3, both parts exist
|
|
-- this gives 36557 compounds...
|
|
--- no recursion to multi-word compounds
|
|
lookCompound :: (String, Entry) -> M.Map String Entry -> Maybe (Entry,String)
|
|
lookCompound (w,ent) dict =
|
|
looks [splitAt i w | let k = length w, i <- [k-3, k-4 .. max (k-10) 3]]
|
|
where
|
|
looks ws = case [(e,u) | (u,v) <- ws,
|
|
Just e <- [M.lookup v dict], cat e == catNoun,
|
|
Just _ <- [M.lookup u dict]] of
|
|
eu :_ -> return eu
|
|
_ -> Nothing
|
|
|
|
-- return compounds
|
|
mkCompound compMap (w,ent) =
|
|
case M.lookup w compMap of
|
|
Just (e,b) -> return (w, ent {
|
|
word = word e,
|
|
tn = tn e,
|
|
par = unwords ["compoundNK",par e,quoted b],
|
|
isDummy = tn e == 0
|
|
}
|
|
)
|
|
_ -> []
|
|
|
|
|
|
----------------------------------------------------------------
|
|
--
|
|
-- for words ending "-sti", look for corresponding adjective. If found, mark the adjective
|
|
-- as adjective, and eliminate the adverb.
|
|
|
|
mkAdjAdv adjSet advSet (w,e) =
|
|
if S.member w adjSet && (cat e == catNoun)
|
|
then (w,e{cat = catAdjective, fun = take (length (fun e) - 2) (fun e) ++ "AK"})
|
|
else if S.member w advSet && (cat e == catAdverb) then (w,e{isDerived = True})
|
|
else (w,e)
|
|
|
|
lookAdj adv dict = case splitAt (length adv - 3) adv of
|
|
(adj,"sti") -> case [e | a <- adjCandidates adj,
|
|
Just e <- [M.lookup a dict], cat e == catNoun] of
|
|
e :_ -> return $ word e
|
|
_ -> Nothing
|
|
_ -> Nothing
|
|
|
|
adjCandidates adj = case reverse adj of
|
|
'e':'s': w -> [reverse w ++ "nen", adj] -- vihainen - vihaisesti
|
|
'a':'a': 'k' : 'k' : w ->
|
|
[reverse w ++ "kas", adj] -- halukas
|
|
'a':'a': w ->
|
|
[init adj ++ "s", adj] -- hurskas - hurskaasti, suulas - suulaasti
|
|
v : v': 'k' : 'k' : w | v == v' && fromEnum v == 228 ->
|
|
[reverse w ++ "käs", adj] -- nenäkäs - nenäkkäästi
|
|
v : v': w | v == v' && fromEnum v == 228 ->
|
|
[init adj ++ "s", adj] -- ylväs - ylväästi
|
|
'e':'e': w -> let rw = reverse w in
|
|
[rw ++ "e", rw ++ "ut", rw ++ "yt", adj] -- terveesti, ahdistuneesti
|
|
'i':'i': w -> [reverse w ++ "is", adj] -- kaunis - kauniisti
|
|
'e':_ -> [init adj ++ "i", adj] -- suuri - suuresti (not: suure - suuresti)
|
|
_ -> [adj]
|
|
|
|
|
|
-------------------------------------------------
|
|
-- produce rules
|
|
|
|
mkRules e = do
|
|
putRule $ mkFun (fun e) (cat e)
|
|
putRule $ mkLin (fun e) (par e) (word e)
|
|
where
|
|
putRule
|
|
| isDummy e = const (return ()) -- putStrLn . ("-- " ++)
|
|
| isDerived e = putStrLn . ("--+ " ++)
|
|
| isPlurTant e = putStrLn . ("--? " ++)
|
|
| otherwise = putStrLn
|
|
|
|
mkFun fun cat = unwords ["fun",fun,":",cat,";"]
|
|
mkLin fun par w = case words par of
|
|
f@"compoundNK":p:v:_ -> unwords ["lin",fun,"=","{s","=",f,v,"("++ p,quoted w ++")}",";"]
|
|
_ -> unwords ["lin",fun,"=","{s","=",par,quoted w ++"}",";"]
|
|
|
|
mkId = concatMap trim where
|
|
trim c = case fromEnum c of
|
|
32 -> "_" -- space
|
|
45 -> "_" -- -
|
|
224 -> "a''" -- à
|
|
228 -> "a'" -- ä
|
|
246 -> "o'" -- ö
|
|
252 -> "u'" -- ü
|
|
x | x < 65 || (x > 90 && x < 97) || x > 122 -> "_"
|
|
_ -> [c]
|
|
|
|
quoted s = "\"" ++ s ++ "\""
|
|
|
|
-- analyse each line in KOTUS by this
|
|
|
|
getEntry :: String -> Entry
|
|
getEntry s = emptyEntry {
|
|
word = w,
|
|
tn = p,
|
|
av = g,
|
|
hn = h,
|
|
fun = mkId w ++ "_" ++ (if h=="0" then "" else h ++ "_") ++ c,
|
|
cat = c,
|
|
par = (if p < 52 then "d" else "c") ++ num p ++ (if g == "0" then "" else "A"),
|
|
isSuffix = suff,
|
|
isDummy = elem p [0,50,51,101],
|
|
isPlurTant = last w == 't' && notElem p [5,43,47,99]
|
|
}
|
|
where
|
|
x = getTags s
|
|
(w,suff) = let w0 = tagged "s" x in if (head w0 == '-') then (tail w0, True) else (w0,False)
|
|
p = (read (tagged "tn" x) :: Int)
|
|
h = tagged "hn" x
|
|
g = tagged "av" x
|
|
c = if p < 50 then catNoun
|
|
else if p < 99 then catVerb
|
|
else catAdverb
|
|
num p = if p < 10 then "0" ++ show p else show p
|
|
|
|
|
|
tagged :: String -> Tags -> String
|
|
tagged s x = maybe "0" id $ lookup s x
|
|
|
|
-- get values of leave tags
|
|
getTags :: String -> Tags
|
|
getTags s = case s of
|
|
'<':rest -> case break (=='>') rest of
|
|
(tag,_:more) -> case break (=='<') more of
|
|
([],_) -> getTags more
|
|
(v,end) -> (tag,v):getTags end
|
|
_ -> []
|
|
_ -> []
|
|
|
|
type Tags = [(String,String)]
|
|
|
|
data Entry = Entry {
|
|
word :: String,
|
|
tn :: Int,
|
|
av :: String,
|
|
hn :: String,
|
|
fun :: String,
|
|
cat :: String,
|
|
par :: String,
|
|
comp1 :: String, -- compound word with this as inflected first part
|
|
comp2 :: String, -- compound word with this as last part
|
|
isSuffix :: Bool, -- used only as suffix, e.g. "-tekoinen"
|
|
isDummy :: Bool, -- no inflection information, or compound, or pron, or adverb from adjective
|
|
isPlurTant :: Bool, -- plurale tantum, e.g. "sakset"
|
|
isDerived :: Bool -- is derived from other words, e.g. adverb from adjective
|
|
}
|
|
|
|
emptyEntry = Entry "" 0 "" "" "" "" "" "" "" False False False False
|
|
|
|
catNoun = "NK"
|
|
catVerb = "VK"
|
|
catAdjective = "AK"
|
|
catAdverb = "AdvK"
|