1
0
forked from GitHub/gf-core

started project with Finnish frequency dictionary

This commit is contained in:
aarne
2010-12-28 20:47:27 +00:00
parent ed4e07d0c2
commit 3f9313cc0f
3 changed files with 10044 additions and 0 deletions

View File

@@ -0,0 +1,31 @@
-- | Filter program: read all of standard input, convert every line with
-- 'mkOne', and write the converted lines to standard output.
main :: IO ()
main = getContents >>= putStr . unlines . map mkOne . lines
-- | Convert one input line into an entry of the form @ident_Cat : Cat@.
--
-- The line is split on whitespace; the first three fields are ignored, the
-- fourth is the word and the fifth is its part-of-speech tag.  Any further
-- fields are ignored.  A line with fewer than five fields yields the empty
-- string.
mkOne :: String -> String
mkOne line = entry (drop 3 (words line))
  where
    -- After dropping three fields we need at least a word and a tag.
    entry (word : tag : _) = unwords [mkId word ++ "_" ++ category, ":", category]
      where
        category = cat tag
    entry _ = []
-- | Translate a parenthesised part-of-speech tag into a short category name.
-- Any tag that is not in the table is mapped to @"Junk"@.
cat :: String -> String
cat c = maybe "Junk" id (lookup c categories)
  where
    -- (tag as it appears in the input, category name to emit)
    categories =
      [ ("(adjektiivi)",   "A")
      , ("(adverbi)",      "Adv")
      , ("(erisnimi)",     "PN")
      , ("(interjektio)",  "Interj")
      , ("(konjunktio)",   "Conj")
      , ("(lukusana)",     "Numeral")
      , ("(lyhenne)",      "Abbr")
      , ("(prepositio)",   "Prep")
      , ("(pronomini)",    "Pron")
      , ("(substantiivi)", "N")
      , ("(verbi)",        "V")
      ]
-- | Rewrite a word character by character so that the result consists only
-- of ASCII letters, underscores and primes.
--
-- * space and hyphen become @_@
-- * the accented letters à, ä, ö, ü become the base letter followed by
--   primes (two for à, one for the others)
-- * ASCII letters A-Z and a-z are kept unchanged
-- * every other character (digits and all remaining non-ASCII characters
--   included) becomes @_@
mkId :: String -> String
mkId = concatMap encode
  where
    encode ch
      | ch == ' ' || ch == '-' = "_"
      | ch == '\224'           = "a''"  -- à
      | ch == '\228'           = "a'"   -- ä
      | ch == '\246'           = "o'"   -- ö
      | ch == '\252'           = "u'"   -- ü
      | isAsciiLetter ch       = [ch]
      | otherwise              = "_"

    -- True exactly for the code points 65-90 and 97-122.
    isAsciiLetter ch = (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')

View File

@@ -0,0 +1,8 @@
The 10,000 most frequent words from a Finnish newspaper corpus
Source: http://www.csc.fi/tutkimus/alat/kielitiede/taajuussanasto-B9996/view
License:
Creative Commons
Nimeä-Epäkaupallinen-Ei muutettuja teoksia 1.0 Suomi-lisenssi
http://creativecommons.org/licenses/by-nd-nc/1.0/fi/

File diff suppressed because it is too large Load Diff