Files
gf-core/lib/src/finnish/frequency/Freq.hs

69 lines
1.7 KiB
Haskell

import qualified Data.Set as S
main = do
src <- readFile "src/suomen-sanomalehtikielen-taajuussanasto-utf8.txt"
kotus <- readFile "kotusfuns.txt"
let funSet = S.fromList (map (head . words) (lines kotus))
mapM_ putStrLn $ concatMap (prEntry funSet . mkOne) $ lines src
stoplist = S.fromList
["ei","olla","ei_ihme","eikä","ettei","ellei","vaikkei","muttei","miksei","jollei","jottei"]
prEntry funSet (f,c,w,p@(k,n),i) = if n == 0 then [] else [
unwords ["fun",f, " : ", c,";"],
unwords ["lin",f, "=", para,";"]
]
where
para = case n of
2 -> let kf = i ++ "_" ++ k in
if S.member kf funSet
then "mk" ++ c ++ " " ++ kf
else "mk" ++ c ++ " " ++ quoted w
_ -> "mk" ++ k ++ " " ++ quoted w
mkOne line = case words line of
_:_:_:w:c0:_ | S.member w stoplist -> none
_:_:_:w:c0:_ -> let c = mkCat c0 in (mkFun w c, c, w, mkLin c0, mkId w)
_ -> none
none = ("","","",("",0),"")
quoted s = "\"" ++ s ++ "\""
mkCat = fst . catlin
mkLin = snd . catlin
catlin c = case c of
"(adjektiivi)" -> kotus "A"
"(adverbi)" -> rep "Adv"
"(erisnimi)" -> rep "PN"
"(interjektio)" -> hide "Interj"
"(konjunktio)" -> hide "Conj"
"(lukusana)" -> hide "Numeral"
"(lyhenne)" -> hide "Abbr"
"(prepositio)" -> hide "Prep"
"(pronomini)" -> hide "Pron"
"(substantiivi)" -> kotus "N"
"(verbi)" -> kotus "V"
_ -> hide "Junk"
rep s = (s,(s,1))
kotus s = (s,(s ++ "K",2))
--- for entries not to be included
hide s = (s,(s,0))
mkFun w c = mkId w ++ "_" ++ c
mkId = concatMap trim where
trim c = case fromEnum c of
32 -> "_" -- space
45 -> "_" -- -
224 -> "a''" -- à
228 -> "a'" -- ä
246 -> "o'" -- ö
252 -> "u'" -- ü
x | x < 65 || (x > 90 && x < 97) || x > 122 -> "_"
_ -> [c]