mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-21 02:39:31 -06:00
69 lines
1.7 KiB
Haskell
69 lines
1.7 KiB
Haskell
import qualified Data.Set as S
|
|
|
|
main = do
|
|
src <- readFile "src/suomen-sanomalehtikielen-taajuussanasto-utf8.txt"
|
|
kotus <- readFile "kotusfuns.txt"
|
|
let funSet = S.fromList (map (head . words) (lines kotus))
|
|
mapM_ putStrLn $ concatMap (prEntry funSet . mkOne) $ lines src
|
|
|
|
stoplist = S.fromList
|
|
["ei","olla","ei_ihme","eikä","ettei","ellei","vaikkei","muttei","miksei","jollei","jottei"]
|
|
|
|
prEntry funSet (f,c,w,p@(k,n),i) = if n == 0 then [] else [
|
|
unwords ["fun",f, " : ", c,";"],
|
|
unwords ["lin",f, "=", para,";"]
|
|
]
|
|
where
|
|
para = case n of
|
|
2 -> let kf = i ++ "_" ++ k in
|
|
if S.member kf funSet
|
|
then "mk" ++ c ++ " " ++ kf
|
|
else "mk" ++ c ++ " " ++ quoted w
|
|
_ -> "mk" ++ k ++ " " ++ quoted w
|
|
|
|
|
|
mkOne line = case words line of
|
|
_:_:_:w:c0:_ | S.member w stoplist -> none
|
|
_:_:_:w:c0:_ -> let c = mkCat c0 in (mkFun w c, c, w, mkLin c0, mkId w)
|
|
_ -> none
|
|
|
|
none = ("","","",("",0),"")
|
|
|
|
quoted s = "\"" ++ s ++ "\""
|
|
|
|
mkCat = fst . catlin
|
|
mkLin = snd . catlin
|
|
|
|
catlin c = case c of
|
|
"(adjektiivi)" -> kotus "A"
|
|
"(adverbi)" -> rep "Adv"
|
|
"(erisnimi)" -> rep "PN"
|
|
"(interjektio)" -> hide "Interj"
|
|
"(konjunktio)" -> hide "Conj"
|
|
"(lukusana)" -> hide "Numeral"
|
|
"(lyhenne)" -> hide "Abbr"
|
|
"(prepositio)" -> hide "Prep"
|
|
"(pronomini)" -> hide "Pron"
|
|
"(substantiivi)" -> kotus "N"
|
|
"(verbi)" -> kotus "V"
|
|
_ -> hide "Junk"
|
|
|
|
|
|
rep s = (s,(s,1))
|
|
kotus s = (s,(s ++ "K",2))
|
|
--- for entries not to be included
|
|
hide s = (s,(s,0))
|
|
|
|
mkFun w c = mkId w ++ "_" ++ c
|
|
|
|
mkId = concatMap trim where
|
|
trim c = case fromEnum c of
|
|
32 -> "_" -- space
|
|
45 -> "_" -- -
|
|
224 -> "a''" -- à
|
|
228 -> "a'" -- ä
|
|
246 -> "o'" -- ö
|
|
252 -> "u'" -- ü
|
|
x | x < 65 || (x > 90 && x < 97) || x > 122 -> "_"
|
|
_ -> [c]
|