|
|
|
|
@@ -1,17 +1,113 @@
|
|
|
|
|
main = interact (unlines . concatMap mkOne . lines)
|
|
|
|
|
import qualified Data.Map as M
|
|
|
|
|
import qualified Data.Set as S
|
|
|
|
|
import System
|
|
|
|
|
|
|
|
|
|
mkOne = mkEntry . analyse
|
|
|
|
|
-- build DictFin from KOTUS word list. See Makefile.
|
|
|
|
|
-- AR 28/12/2010
|
|
|
|
|
|
|
|
|
|
mkEntry (w,p,g,h) | elem p [0,50,51] = [] -- no inflection information, or a compound
|
|
|
|
|
mkEntry (w,p,g,h) | head w == '-' = mkEntry (tail w,p,g,h) -- suffix only
|
|
|
|
|
mkEntry (w,p,g,h) | last w == 't' && notElem p [5,43,47 ] = [] -- plurale tantum --- to do
|
|
|
|
|
mkEntry (w,p,g,h) = [mkFun fun cat, mkLin fun par w] where
|
|
|
|
|
cat = if p < 50 then catNoun
|
|
|
|
|
else if p < 99 then catVerb
|
|
|
|
|
else catAdverb
|
|
|
|
|
fun = mkId w ++ "_" ++ (if h=="0" then "" else h ++ "_") ++ cat
|
|
|
|
|
par = (if p < 52 then "d" else "c") ++ num p ++ (if g == "0" then "" else "A")
|
|
|
|
|
num p = if p < 10 then "0" ++ show p else show p
|
|
|
|
|
main = do
|
|
|
|
|
f:xx <- getArgs
|
|
|
|
|
ss <- readFile f >>= return . lines
|
|
|
|
|
let dict0 = map getEntry ss
|
|
|
|
|
let dictList = [(word e,e) | e <- dict0]
|
|
|
|
|
let dictMap = M.fromAscList dictList
|
|
|
|
|
let adjList = [(adj,adv) | (adv,e) <- dictList,
|
|
|
|
|
cat e == catAdverb, -- kultamitali - kultamitalisti, hence need this
|
|
|
|
|
Just adj <- [lookAdj adv dictMap]]
|
|
|
|
|
--- mapM_ (\ (adj,adv) -> putStrLn (adv ++ " - " ++ adj)) adjList -- to see the adjectives
|
|
|
|
|
let compoundList = [(whole,(end,beg)) | (whole,e) <- dictList,
|
|
|
|
|
elem (cat e) [catNoun,catAdjective],
|
|
|
|
|
Just (end,beg) <- [lookCompound (whole,e) dictMap]]
|
|
|
|
|
--- mapM_ (\ (x,(y,_)) -> putStrLn (unwords [x,"-", word y])) compoundList -- to see compounds
|
|
|
|
|
let (adjSet,advSet) = let (adjs,advs) = unzip adjList in (S.fromList adjs,S.fromList advs)
|
|
|
|
|
let dictList1 = map (mkAdjAdv adjSet advSet) dictList
|
|
|
|
|
let dictList2 = case xx of
|
|
|
|
|
"-compounds":_ -> concatMap (mkCompound (M.fromList compoundList)) dictList1
|
|
|
|
|
"-all":_ -> dictList1 ++ concatMap (mkCompound (M.fromList compoundList)) dictList1
|
|
|
|
|
_ -> dictList1
|
|
|
|
|
let dict2 = map snd dictList2
|
|
|
|
|
mapM_ mkRules dict2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
----------------------------------------------------------------
|
|
|
|
|
-- identify the parts of compounds
|
|
|
|
|
|
|
|
|
|
-- longest match, suffix length 3..10, prefix length at least 3, both parts exist
|
|
|
|
|
-- this gives 36557 compounds...
|
|
|
|
|
--- no recursion to multi-word compounds
|
|
|
|
|
lookCompound :: (String, Entry) -> M.Map String Entry -> Maybe (Entry,String)
|
|
|
|
|
lookCompound (w,ent) dict =
|
|
|
|
|
looks [splitAt i w | let k = length w, i <- [k-3, k-4 .. max (k-10) 3]]
|
|
|
|
|
where
|
|
|
|
|
looks ws = case [(e,u) | (u,v) <- ws,
|
|
|
|
|
Just e <- [M.lookup v dict], cat e == catNoun,
|
|
|
|
|
Just _ <- [M.lookup u dict]] of
|
|
|
|
|
eu :_ -> return eu
|
|
|
|
|
_ -> Nothing
|
|
|
|
|
|
|
|
|
|
-- return compounds
|
|
|
|
|
mkCompound compMap (w,ent) =
|
|
|
|
|
case M.lookup w compMap of
|
|
|
|
|
Just (e,b) -> return (w, ent {
|
|
|
|
|
word = word e,
|
|
|
|
|
tn = tn e,
|
|
|
|
|
par = "compoundNK " ++ par e ++ " " ++ quoted b,
|
|
|
|
|
isDummy = tn e == 0
|
|
|
|
|
}
|
|
|
|
|
)
|
|
|
|
|
_ -> []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
----------------------------------------------------------------
|
|
|
|
|
--
|
|
|
|
|
-- for words ending "-sti", look for corresponding adjective. If found, mark the adjective
|
|
|
|
|
-- as adjective, and eliminate the adverb.
|
|
|
|
|
|
|
|
|
|
mkAdjAdv adjSet advSet (w,e) =
|
|
|
|
|
if S.member w adjSet && (cat e == catNoun)
|
|
|
|
|
then (w,e{cat = catAdjective, fun = take (length (fun e) - 2) (fun e) ++ "AK"})
|
|
|
|
|
else if S.member w advSet && (cat e == catAdverb) then (w,e{isDerived = True})
|
|
|
|
|
else (w,e)
|
|
|
|
|
|
|
|
|
|
lookAdj adv dict = case splitAt (length adv - 3) adv of
|
|
|
|
|
(adj,"sti") -> case [e | a <- adjCandidates adj,
|
|
|
|
|
Just e <- [M.lookup a dict], cat e == catNoun] of
|
|
|
|
|
e :_ -> return $ word e
|
|
|
|
|
_ -> Nothing
|
|
|
|
|
_ -> Nothing
|
|
|
|
|
|
|
|
|
|
adjCandidates adj = case reverse adj of
|
|
|
|
|
'e':'s': w -> [reverse w ++ "nen", adj] -- vihainen - vihaisesti
|
|
|
|
|
'a':'a': 'k' : 'k' : w ->
|
|
|
|
|
[reverse w ++ "kas", adj] -- halukas
|
|
|
|
|
'a':'a': w ->
|
|
|
|
|
[init adj ++ "s", adj] -- hurskas - hurskaasti, suulas - suulaasti
|
|
|
|
|
v : v': 'k' : 'k' : w | v == v' && fromEnum v == 228 ->
|
|
|
|
|
[reverse w ++ "käs", adj] -- nenäkäs - nenäkkäästi
|
|
|
|
|
v : v': w | v == v' && fromEnum v == 228 ->
|
|
|
|
|
[init adj ++ "s", adj] -- ylväs - ylväästi
|
|
|
|
|
'e':'e': w -> let rw = reverse w in
|
|
|
|
|
[rw ++ "e", rw ++ "ut", rw ++ "yt", adj] -- terveesti, ahdistuneesti
|
|
|
|
|
'i':'i': w -> [reverse w ++ "is", adj] -- kaunis - kauniisti
|
|
|
|
|
'e':_ -> [init adj ++ "i", adj] -- suuri - suuresti (not: suure - suuresti)
|
|
|
|
|
_ -> [adj]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
-------------------------------------------------
|
|
|
|
|
-- produce rules
|
|
|
|
|
|
|
|
|
|
mkRules e = do
|
|
|
|
|
putRule $ mkFun (fun e) (cat e)
|
|
|
|
|
putRule $ mkLin (fun e) (par e) (word e)
|
|
|
|
|
where
|
|
|
|
|
putRule
|
|
|
|
|
| isDummy e = putStrLn . ("-- " ++)
|
|
|
|
|
| isDerived e = putStrLn . ("--+ " ++)
|
|
|
|
|
| isPlurTant e = putStrLn . ("--? " ++)
|
|
|
|
|
| otherwise = putStrLn
|
|
|
|
|
|
|
|
|
|
mkFun fun cat = unwords ["fun",fun,":",cat,";"]
|
|
|
|
|
mkLin fun par w = unwords ["lin",fun,"=",par,quoted w,";"]
|
|
|
|
|
@@ -29,13 +125,32 @@ mkId = concatMap trim where
|
|
|
|
|
|
|
|
|
|
quoted s = "\"" ++ s ++ "\""
|
|
|
|
|
|
|
|
|
|
analyse :: String -> (String,Int,String,String)
|
|
|
|
|
analyse s = (word,paradigm,gradation,homonym) where
|
|
|
|
|
word = tagged "s" x
|
|
|
|
|
paradigm = (read (tagged "tn" x) :: Int)
|
|
|
|
|
gradation = tagged "av" x
|
|
|
|
|
homonym = tagged "hn" x
|
|
|
|
|
-- analyse each line in KOTUS by this
|
|
|
|
|
|
|
|
|
|
getEntry :: String -> Entry
|
|
|
|
|
getEntry s = emptyEntry {
|
|
|
|
|
word = w,
|
|
|
|
|
tn = p,
|
|
|
|
|
av = g,
|
|
|
|
|
hn = h,
|
|
|
|
|
fun = mkId w ++ "_" ++ (if h=="0" then "" else h ++ "_") ++ c,
|
|
|
|
|
cat = c,
|
|
|
|
|
par = (if p < 52 then "d" else "c") ++ num p ++ (if g == "0" then "" else "A"),
|
|
|
|
|
isSuffix = suff,
|
|
|
|
|
isDummy = elem p [0,50,51,101],
|
|
|
|
|
isPlurTant = last w == 't' && notElem p [5,43,47,99]
|
|
|
|
|
}
|
|
|
|
|
where
|
|
|
|
|
x = getTags s
|
|
|
|
|
(w,suff) = let w0 = tagged "s" x in if (head w0 == '-') then (tail w0, True) else (w0,False)
|
|
|
|
|
p = (read (tagged "tn" x) :: Int)
|
|
|
|
|
h = tagged "hn" x
|
|
|
|
|
g = tagged "av" x
|
|
|
|
|
c = if p < 50 then catNoun
|
|
|
|
|
else if p < 99 then catVerb
|
|
|
|
|
else catAdverb
|
|
|
|
|
num p = if p < 10 then "0" ++ show p else show p
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tagged :: String -> Tags -> String
|
|
|
|
|
tagged s x = maybe "0" id $ lookup s x
|
|
|
|
|
@@ -52,6 +167,25 @@ getTags s = case s of
|
|
|
|
|
|
|
|
|
|
type Tags = [(String,String)]
|
|
|
|
|
|
|
|
|
|
data Entry = Entry {
|
|
|
|
|
word :: String,
|
|
|
|
|
tn :: Int,
|
|
|
|
|
av :: String,
|
|
|
|
|
hn :: String,
|
|
|
|
|
fun :: String,
|
|
|
|
|
cat :: String,
|
|
|
|
|
par :: String,
|
|
|
|
|
comp1 :: String, -- compound word with this as inflected first part
|
|
|
|
|
comp2 :: String, -- compound word with this as last part
|
|
|
|
|
isSuffix :: Bool, -- used only as suffix, e.g. "-tekoinen"
|
|
|
|
|
isDummy :: Bool, -- no inflection information, or compound, or pron, or adverb from adjective
|
|
|
|
|
isPlurTant :: Bool, -- plurale tantum, e.g. "sakset"
|
|
|
|
|
isDerived :: Bool -- is derived from other words, e.g. adverb from adjective
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
emptyEntry = Entry "" 0 "" "" "" "" "" "" "" False False False False
|
|
|
|
|
|
|
|
|
|
catNoun = "NK"
|
|
|
|
|
catVerb = "VK"
|
|
|
|
|
catAdjective = "AK"
|
|
|
|
|
catAdverb = "AdvK"
|
|
|
|
|
|