forked from GitHub/gf-rgl
prepared Kotus to produce adjectives and compounds (by heuristics)
This commit is contained in:
@@ -1,17 +1,113 @@
|
||||
main = interact (unlines . concatMap mkOne . lines)
|
||||
import qualified Data.Map as M
|
||||
import qualified Data.Set as S
|
||||
import System
|
||||
|
||||
mkOne = mkEntry . analyse
|
||||
-- build DictFin from KOTUS word list. See Makefile.
|
||||
-- AR 28/12/2010
|
||||
|
||||
mkEntry (w,p,g,h) | elem p [0,50,51] = [] -- no inflection information, or a compound
|
||||
mkEntry (w,p,g,h) | head w == '-' = mkEntry (tail w,p,g,h) -- suffix only
|
||||
mkEntry (w,p,g,h) | last w == 't' && notElem p [5,43,47 ] = [] -- plurale tantum --- to do
|
||||
mkEntry (w,p,g,h) = [mkFun fun cat, mkLin fun par w] where
|
||||
cat = if p < 50 then catNoun
|
||||
else if p < 99 then catVerb
|
||||
else catAdverb
|
||||
fun = mkId w ++ "_" ++ (if h=="0" then "" else h ++ "_") ++ cat
|
||||
par = (if p < 52 then "d" else "c") ++ num p ++ (if g == "0" then "" else "A")
|
||||
num p = if p < 10 then "0" ++ show p else show p
|
||||
main = do
|
||||
f:xx <- getArgs
|
||||
ss <- readFile f >>= return . lines
|
||||
let dict0 = map getEntry ss
|
||||
let dictList = [(word e,e) | e <- dict0]
|
||||
let dictMap = M.fromAscList dictList
|
||||
let adjList = [(adj,adv) | (adv,e) <- dictList,
|
||||
cat e == catAdverb, -- kultamitali - kultamitalisti, hence need this
|
||||
Just adj <- [lookAdj adv dictMap]]
|
||||
--- mapM_ (\ (adj,adv) -> putStrLn (adv ++ " - " ++ adj)) adjList -- to see the adjectives
|
||||
let compoundList = [(whole,(end,beg)) | (whole,e) <- dictList,
|
||||
elem (cat e) [catNoun,catAdjective],
|
||||
Just (end,beg) <- [lookCompound (whole,e) dictMap]]
|
||||
--- mapM_ (\ (x,(y,_)) -> putStrLn (unwords [x,"-", word y])) compoundList -- to see compounds
|
||||
let (adjSet,advSet) = let (adjs,advs) = unzip adjList in (S.fromList adjs,S.fromList advs)
|
||||
let dictList1 = map (mkAdjAdv adjSet advSet) dictList
|
||||
let dictList2 = case xx of
|
||||
"-compounds":_ -> concatMap (mkCompound (M.fromList compoundList)) dictList1
|
||||
"-all":_ -> dictList1 ++ concatMap (mkCompound (M.fromList compoundList)) dictList1
|
||||
_ -> dictList1
|
||||
let dict2 = map snd dictList2
|
||||
mapM_ mkRules dict2
|
||||
|
||||
|
||||
|
||||
|
||||
----------------------------------------------------------------
|
||||
-- identify the parts of compounds
|
||||
|
||||
-- longest match, suffix length 3..10, prefix length at least 3, both parts exist
|
||||
-- this gives 36557 compounds...
|
||||
--- no recursion to multi-word compounds
|
||||
lookCompound :: (String, Entry) -> M.Map String Entry -> Maybe (Entry,String)
|
||||
lookCompound (w,ent) dict =
|
||||
looks [splitAt i w | let k = length w, i <- [k-3, k-4 .. max (k-10) 3]]
|
||||
where
|
||||
looks ws = case [(e,u) | (u,v) <- ws,
|
||||
Just e <- [M.lookup v dict], cat e == catNoun,
|
||||
Just _ <- [M.lookup u dict]] of
|
||||
eu :_ -> return eu
|
||||
_ -> Nothing
|
||||
|
||||
-- return compounds
|
||||
mkCompound compMap (w,ent) =
|
||||
case M.lookup w compMap of
|
||||
Just (e,b) -> return (w, ent {
|
||||
word = word e,
|
||||
tn = tn e,
|
||||
par = "compoundNK " ++ par e ++ " " ++ quoted b,
|
||||
isDummy = tn e == 0
|
||||
}
|
||||
)
|
||||
_ -> []
|
||||
|
||||
|
||||
----------------------------------------------------------------
|
||||
--
|
||||
-- for words ending "-sti", look for corresponding adjective. If found, mark the adjective
|
||||
-- as adjective, and eliminate the adverb.
|
||||
|
||||
mkAdjAdv adjSet advSet (w,e) =
|
||||
if S.member w adjSet && (cat e == catNoun)
|
||||
then (w,e{cat = catAdjective, fun = take (length (fun e) - 2) (fun e) ++ "AK"})
|
||||
else if S.member w advSet && (cat e == catAdverb) then (w,e{isDerived = True})
|
||||
else (w,e)
|
||||
|
||||
lookAdj adv dict = case splitAt (length adv - 3) adv of
|
||||
(adj,"sti") -> case [e | a <- adjCandidates adj,
|
||||
Just e <- [M.lookup a dict], cat e == catNoun] of
|
||||
e :_ -> return $ word e
|
||||
_ -> Nothing
|
||||
_ -> Nothing
|
||||
|
||||
adjCandidates adj = case reverse adj of
|
||||
'e':'s': w -> [reverse w ++ "nen", adj] -- vihainen - vihaisesti
|
||||
'a':'a': 'k' : 'k' : w ->
|
||||
[reverse w ++ "kas", adj] -- halukas
|
||||
'a':'a': w ->
|
||||
[init adj ++ "s", adj] -- hurskas - hurskaasti, suulas - suulaasti
|
||||
v : v': 'k' : 'k' : w | v == v' && fromEnum v == 228 ->
|
||||
[reverse w ++ "käs", adj] -- nenäkäs - nenäkkäästi
|
||||
v : v': w | v == v' && fromEnum v == 228 ->
|
||||
[init adj ++ "s", adj] -- ylväs - ylväästi
|
||||
'e':'e': w -> let rw = reverse w in
|
||||
[rw ++ "e", rw ++ "ut", rw ++ "yt", adj] -- terveesti, ahdistuneesti
|
||||
'i':'i': w -> [reverse w ++ "is", adj] -- kaunis - kauniisti
|
||||
'e':_ -> [init adj ++ "i", adj] -- suuri - suuresti (not: suure - suuresti)
|
||||
_ -> [adj]
|
||||
|
||||
|
||||
|
||||
-------------------------------------------------
|
||||
-- produce rules
|
||||
|
||||
mkRules e = do
|
||||
putRule $ mkFun (fun e) (cat e)
|
||||
putRule $ mkLin (fun e) (par e) (word e)
|
||||
where
|
||||
putRule
|
||||
| isDummy e = putStrLn . ("-- " ++)
|
||||
| isDerived e = putStrLn . ("--+ " ++)
|
||||
| isPlurTant e = putStrLn . ("--? " ++)
|
||||
| otherwise = putStrLn
|
||||
|
||||
mkFun fun cat = unwords ["fun",fun,":",cat,";"]
|
||||
mkLin fun par w = unwords ["lin",fun,"=",par,quoted w,";"]
|
||||
@@ -29,13 +125,32 @@ mkId = concatMap trim where
|
||||
|
||||
quoted s = "\"" ++ s ++ "\""
|
||||
|
||||
analyse :: String -> (String,Int,String,String)
|
||||
analyse s = (word,paradigm,gradation,homonym) where
|
||||
word = tagged "s" x
|
||||
paradigm = (read (tagged "tn" x) :: Int)
|
||||
gradation = tagged "av" x
|
||||
homonym = tagged "hn" x
|
||||
-- analyse each line in KOTUS by this
|
||||
|
||||
getEntry :: String -> Entry
|
||||
getEntry s = emptyEntry {
|
||||
word = w,
|
||||
tn = p,
|
||||
av = g,
|
||||
hn = h,
|
||||
fun = mkId w ++ "_" ++ (if h=="0" then "" else h ++ "_") ++ c,
|
||||
cat = c,
|
||||
par = (if p < 52 then "d" else "c") ++ num p ++ (if g == "0" then "" else "A"),
|
||||
isSuffix = suff,
|
||||
isDummy = elem p [0,50,51,101],
|
||||
isPlurTant = last w == 't' && notElem p [5,43,47,99]
|
||||
}
|
||||
where
|
||||
x = getTags s
|
||||
(w,suff) = let w0 = tagged "s" x in if (head w0 == '-') then (tail w0, True) else (w0,False)
|
||||
p = (read (tagged "tn" x) :: Int)
|
||||
h = tagged "hn" x
|
||||
g = tagged "av" x
|
||||
c = if p < 50 then catNoun
|
||||
else if p < 99 then catVerb
|
||||
else catAdverb
|
||||
num p = if p < 10 then "0" ++ show p else show p
|
||||
|
||||
|
||||
tagged :: String -> Tags -> String
|
||||
tagged s x = maybe "0" id $ lookup s x
|
||||
@@ -52,6 +167,25 @@ getTags s = case s of
|
||||
|
||||
type Tags = [(String,String)]
|
||||
|
||||
data Entry = Entry {
|
||||
word :: String,
|
||||
tn :: Int,
|
||||
av :: String,
|
||||
hn :: String,
|
||||
fun :: String,
|
||||
cat :: String,
|
||||
par :: String,
|
||||
comp1 :: String, -- compound word with this as inflected first part
|
||||
comp2 :: String, -- compound word with this as last part
|
||||
isSuffix :: Bool, -- used only as suffix, e.g. "-tekoinen"
|
||||
isDummy :: Bool, -- no inflection information, or compound, or pron, or adverb from adjective
|
||||
isPlurTant :: Bool, -- plurale tantum, e.g. "sakset"
|
||||
isDerived :: Bool -- is derived from other words, e.g. adverb from adjective
|
||||
}
|
||||
|
||||
emptyEntry = Entry "" 0 "" "" "" "" "" "" "" False False False False
|
||||
|
||||
catNoun = "NK"
|
||||
catVerb = "VK"
|
||||
catAdjective = "AK"
|
||||
catAdverb = "AdvK"
|
||||
|
||||
Reference in New Issue
Block a user