started project with Finnish frequency dictionary

2010-12-28 20:47:27 +00:00
parent ed4e07d0c2
commit 3f9313cc0f
3 changed files with 10044 additions and 0 deletions
@@ -0,0 +1,31 @@
+-- Filter stdin to stdout: each input line is replaced by its mkOne rendering.
+main = interact (\input -> unlines [mkOne row | row <- lines input])
+-- Build "<id>_<Cat> : <Cat>" from the 4th and 5th whitespace-separated fields; shorter lines give "".
+mkOne line = case drop 3 (words line) of
+  word : tag : _ -> unwords [mkId word ++ "_" ++ klass, ":", klass] where klass = cat tag
+  _ -> ""
+-- Short category name for a parenthesised word-class tag; unlisted tags become "Junk".
+cat tag = maybe "Junk" id $ lookup tag
+  [ ("(adjektiivi)", "A")
+  , ("(adverbi)", "Adv")
+  , ("(erisnimi)", "PN")
+  , ("(interjektio)", "Interj")
+  , ("(konjunktio)", "Conj")
+  , ("(lukusana)", "Numeral")
+  , ("(lyhenne)", "Abbr")
+  , ("(prepositio)", "Prep")
+  , ("(pronomini)", "Pron")
+  , ("(substantiivi)", "N")
+  , ("(verbi)", "V") ]
+
+
+-- Rewrite a word as an ASCII identifier: four accented letters get apostrophe suffixes,
+-- A-Z and a-z pass through, and every other character (space and '-' included) becomes "_".
+mkId = concatMap encode where
+  encode ch
+    | ch == '\224' = "a''" -- à
+    | ch == '\228' = "a'"  -- ä
+    | ch == '\246' = "o'"  -- ö
+    | ch == '\252' = "u'"  -- ü
+    | (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') = [ch]
+    | otherwise = "_"
@@ -0,0 +1,8 @@
+The 10,000 most frequent words from a Finnish newspaper corpus
+
+Source: http://www.csc.fi/tutkimus/alat/kielitiede/taajuussanasto-B9996/view
+
+License: 
+   Creative Commons
+   Nimeä-Epäkaupallinen-Ei muutettuja teoksia 1.0 Suomi-lisenssi
+   http://creativecommons.org/licenses/by-nd-nc/1.0/fi/