forked from GitHub/gf-core
started project with Finnish frequency dictionary
This commit is contained in:
31
lib/src/finnish/frequency/Freq.hs
Normal file
31
lib/src/finnish/frequency/Freq.hs
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
-- | Filter from standard input to standard output: every input line is
-- passed through 'mkOne' and the results are written back one per line.
main :: IO ()
main = interact convert
  where
    -- Split the input into lines, convert each one, and re-join them.
    convert input = unlines [mkOne entry | entry <- lines input]
|
||||||
|
|
||||||
|
-- | Convert one input line into an entry of the form @<ident>_<Cat> : <Cat>@.
--
-- The line is split on whitespace. The first three fields are ignored, the
-- fourth is the word and the fifth is its category tag; any further fields
-- are ignored as well. A line with fewer than five fields yields the empty
-- string.
mkOne :: String -> String
mkOne line =
  case drop 3 (words line) of
    (word : tag : _) ->
      let category = cat tag
          ident    = mkId word ++ "_" ++ category
      in  ident ++ " : " ++ category
    _ -> ""
|
||||||
|
|
||||||
|
-- | Map a parenthesised Finnish part-of-speech tag to a short category name.
-- Any tag that is not listed in the table is mapped to @"Junk"@.
cat :: String -> String
cat c = maybe "Junk" id (lookup c categories)
  where
    -- (tag as it appears in the input, category name emitted for it)
    categories =
      [ ("(adjektiivi)",   "A")
      , ("(adverbi)",      "Adv")
      , ("(erisnimi)",     "PN")
      , ("(interjektio)",  "Interj")
      , ("(konjunktio)",   "Conj")
      , ("(lukusana)",     "Numeral")
      , ("(lyhenne)",      "Abbr")
      , ("(prepositio)",   "Prep")
      , ("(pronomini)",    "Pron")
      , ("(substantiivi)", "N")
      , ("(verbi)",        "V")
      ]
|
||||||
|
|
||||||
|
|
||||||
|
-- | Turn a word into an ASCII-only identifier fragment, character by
-- character:
--
-- * ASCII letters (A-Z, a-z) are kept unchanged;
-- * the accented letters à, ä, ö and ü become the base letter followed by
--   one or two apostrophes;
-- * every other character (space, hyphen, digits, punctuation, any other
--   non-ASCII character) becomes an underscore.
mkId :: String -> String
mkId word = concat [encode ch | ch <- word]
  where
    encode ch
      | ch == '\224'    = "a''"  -- à
      | ch == '\228'    = "a'"   -- ä
      | ch == '\246'    = "o'"   -- ö
      | ch == '\252'    = "u'"   -- ü
      | isAsciiLetter   = [ch]
      | otherwise       = "_"    -- includes space (32) and hyphen (45)
      where
        isAsciiLetter = (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')
|
||||||
8
lib/src/finnish/frequency/src/README
Normal file
8
lib/src/finnish/frequency/src/README
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
The 10,000 most frequent words from a Finnish newspaper corpus
|
||||||
|
|
||||||
|
Source: http://www.csc.fi/tutkimus/alat/kielitiede/taajuussanasto-B9996/view
|
||||||
|
|
||||||
|
License:
|
||||||
|
Creative Commons
|
||||||
|
Nimeä-Epäkaupallinen-Ei muutettuja teoksia 1.0 Suomi-lisenssi
|
||||||
|
http://creativecommons.org/licenses/by-nd-nc/1.0/fi/
|
||||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user