forked from GitHub/gf-core
started project with Finnish frequency dictionary
This commit is contained in:
31
lib/src/finnish/frequency/Freq.hs
Normal file
31
lib/src/finnish/frequency/Freq.hs
Normal file
@@ -0,0 +1,31 @@
|
||||
-- | Filter program: reads all of standard input, converts every input line
-- with 'mkOne', and writes the converted lines to standard output, one per
-- input line (lines that 'mkOne' rejects come out as empty lines).
main :: IO ()
main = interact $ \input -> unlines [mkOne entry | entry <- lines input]
|
||||
|
||||
-- | Convert one whitespace-separated input line into a line of the form
-- @<identifier>_<category> : <category>@.
--
-- The fourth field is taken as the word and the fifth as its category tag;
-- the first three fields and anything after the fifth are ignored.
-- A line with fewer than five fields yields the empty string.
mkOne :: String -> String
mkOne line =
  case drop 3 (words line) of
    (word : tag : _) -> render word (cat tag)
    _                -> []
  where
    -- Identifier is the cleaned-up word suffixed with its category name.
    render word category = unwords [mkId word ++ "_" ++ category, ":", category]
|
||||
|
||||
-- | Translate a parenthesised Finnish part-of-speech tag, as it appears in
-- the input, into a short category name. Any tag that is not listed in the
-- table below maps to @"Junk"@.
cat :: String -> String
cat tag = maybe "Junk" id (lookup tag categories)
  where
    -- Association list from input tag to category name.
    categories =
      [ ("(adjektiivi)",   "A")
      , ("(adverbi)",      "Adv")
      , ("(erisnimi)",     "PN")
      , ("(interjektio)",  "Interj")
      , ("(konjunktio)",   "Conj")
      , ("(lukusana)",     "Numeral")
      , ("(lyhenne)",      "Abbr")
      , ("(prepositio)",   "Prep")
      , ("(pronomini)",    "Pron")
      , ("(substantiivi)", "N")
      , ("(verbi)",        "V")
      ]
|
||||
|
||||
|
||||
-- | Rewrite a word so that it consists only of ASCII letters, underscores
-- and apostrophes.
--
-- Character by character:
--
--   * @à@ becomes @a''@, @ä@ becomes @a'@, @ö@ becomes @o'@, @ü@ becomes @u'@;
--   * ASCII letters (@A@-@Z@, @a@-@z@) are kept unchanged;
--   * every other character (space, hyphen, digits, punctuation, any other
--     non-ASCII character) becomes @_@.
mkId :: String -> String
mkId = concatMap encode
  where
    encode ch
      | code == 224   = "a''"   -- à
      | code == 228   = "a'"    -- ä
      | code == 246   = "o'"    -- ö
      | code == 252   = "u'"    -- ü
      | isAsciiLetter = [ch]
      | otherwise     = "_"     -- includes space (32) and hyphen (45)
      where
        code          = fromEnum ch
        isAsciiLetter = (code >= 65 && code <= 90) || (code >= 97 && code <= 122)
|
||||
8
lib/src/finnish/frequency/src/README
Normal file
8
lib/src/finnish/frequency/src/README
Normal file
@@ -0,0 +1,8 @@
|
||||
The 10,000 most frequent words from a Finnish newspaper corpus
|
||||
|
||||
Source: http://www.csc.fi/tutkimus/alat/kielitiede/taajuussanasto-B9996/view
|
||||
|
||||
License:
|
||||
Creative Commons
|
||||
Nimeä-Epäkaupallinen-Ei muutettuja teoksia 1.0 Suomi-lisenssi
|
||||
http://creativecommons.org/licenses/by-nd-nc/1.0/fi/
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user