prepared Kotus to produce adjectives and compounds (by heuristics)

This commit is contained in:
aarne
2010-12-28 15:33:52 +00:00
parent c9ca289a7c
commit 0fd9d1f1a7
5 changed files with 171 additions and 24 deletions

View File

@@ -297,5 +297,10 @@ oper
c101 : Str -> {s : Str} -- pronouns etc
= c99 ; -- dummy
-- compound nouns, latter part inflected
compoundNK : (Str -> NForms) -> Str -> Str -> NForms = \d,x,y ->
let ys = d y in \\v => x + ys ! v ;
}

View File

@@ -1,17 +1,113 @@
main = interact (unlines . concatMap mkOne . lines)
import qualified Data.Map as M
import qualified Data.Set as S
import System
mkOne = mkEntry . analyse
-- build DictFin from KOTUS word list. See Makefile.
-- AR 28/12/2010
mkEntry (w,p,g,h) | elem p [0,50,51] = [] -- no inflection information, or a compound
mkEntry (w,p,g,h) | head w == '-' = mkEntry (tail w,p,g,h) -- suffix only
mkEntry (w,p,g,h) | last w == 't' && notElem p [5,43,47 ] = [] -- plurale tantum --- to do
mkEntry (w,p,g,h) = [mkFun fun cat, mkLin fun par w] where
cat = if p < 50 then catNoun
else if p < 99 then catVerb
else catAdverb
fun = mkId w ++ "_" ++ (if h=="0" then "" else h ++ "_") ++ cat
par = (if p < 52 then "d" else "c") ++ num p ++ (if g == "0" then "" else "A")
num p = if p < 10 then "0" ++ show p else show p
main = do
f:xx <- getArgs
ss <- readFile f >>= return . lines
let dict0 = map getEntry ss
let dictList = [(word e,e) | e <- dict0]
let dictMap = M.fromAscList dictList
let adjList = [(adj,adv) | (adv,e) <- dictList,
cat e == catAdverb, -- kultamitali - kultamitalisti, hence need this
Just adj <- [lookAdj adv dictMap]]
--- mapM_ (\ (adj,adv) -> putStrLn (adv ++ " - " ++ adj)) adjList -- to see the adjectives
let compoundList = [(whole,(end,beg)) | (whole,e) <- dictList,
elem (cat e) [catNoun,catAdjective],
Just (end,beg) <- [lookCompound (whole,e) dictMap]]
--- mapM_ (\ (x,(y,_)) -> putStrLn (unwords [x,"-", word y])) compoundList -- to see compounds
let (adjSet,advSet) = let (adjs,advs) = unzip adjList in (S.fromList adjs,S.fromList advs)
let dictList1 = map (mkAdjAdv adjSet advSet) dictList
let dictList2 = case xx of
"-compounds":_ -> concatMap (mkCompound (M.fromList compoundList)) dictList1
"-all":_ -> dictList1 ++ concatMap (mkCompound (M.fromList compoundList)) dictList1
_ -> dictList1
let dict2 = map snd dictList2
mapM_ mkRules dict2
----------------------------------------------------------------
-- identify the parts of compounds
-- longest match, suffix length 3..10, prefix length at least 3, both parts exist
-- this gives 36557 compounds...
--- no recursion to multi-word compounds
lookCompound :: (String, Entry) -> M.Map String Entry -> Maybe (Entry,String)
lookCompound (w,ent) dict =
looks [splitAt i w | let k = length w, i <- [k-3, k-4 .. max (k-10) 3]]
where
looks ws = case [(e,u) | (u,v) <- ws,
Just e <- [M.lookup v dict], cat e == catNoun,
Just _ <- [M.lookup u dict]] of
eu :_ -> return eu
_ -> Nothing
-- return compounds
mkCompound compMap (w,ent) =
case M.lookup w compMap of
Just (e,b) -> return (w, ent {
word = word e,
tn = tn e,
par = "compoundNK " ++ par e ++ " " ++ quoted b,
isDummy = tn e == 0
}
)
_ -> []
----------------------------------------------------------------
--
-- for words ending "-sti", look for corresponding adjective. If found, mark the adjective
-- as adjective, and eliminate the adverb.
mkAdjAdv adjSet advSet (w,e) =
if S.member w adjSet && (cat e == catNoun)
then (w,e{cat = catAdjective, fun = take (length (fun e) - 2) (fun e) ++ "AK"})
else if S.member w advSet && (cat e == catAdverb) then (w,e{isDerived = True})
else (w,e)
lookAdj adv dict = case splitAt (length adv - 3) adv of
(adj,"sti") -> case [e | a <- adjCandidates adj,
Just e <- [M.lookup a dict], cat e == catNoun] of
e :_ -> return $ word e
_ -> Nothing
_ -> Nothing
adjCandidates adj = case reverse adj of
'e':'s': w -> [reverse w ++ "nen", adj] -- vihainen - vihaisesti
'a':'a': 'k' : 'k' : w ->
[reverse w ++ "kas", adj] -- halukas
'a':'a': w ->
[init adj ++ "s", adj] -- hurskas - hurskaasti, suulas - suulaasti
v : v': 'k' : 'k' : w | v == v' && fromEnum v == 228 ->
[reverse w ++ "käs", adj] -- nenäkäs - nenäkkäästi
v : v': w | v == v' && fromEnum v == 228 ->
[init adj ++ "s", adj] -- ylväs - ylväästi
'e':'e': w -> let rw = reverse w in
[rw ++ "e", rw ++ "ut", rw ++ "yt", adj] -- terveesti, ahdistuneesti
'i':'i': w -> [reverse w ++ "is", adj] -- kaunis - kauniisti
'e':_ -> [init adj ++ "i", adj] -- suuri - suuresti (not: suure - suuresti)
_ -> [adj]
-------------------------------------------------
-- produce rules
mkRules e = do
putRule $ mkFun (fun e) (cat e)
putRule $ mkLin (fun e) (par e) (word e)
where
putRule
| isDummy e = putStrLn . ("-- " ++)
| isDerived e = putStrLn . ("--+ " ++)
| isPlurTant e = putStrLn . ("--? " ++)
| otherwise = putStrLn
mkFun fun cat = unwords ["fun",fun,":",cat,";"]
mkLin fun par w = unwords ["lin",fun,"=",par,quoted w,";"]
@@ -29,13 +125,32 @@ mkId = concatMap trim where
quoted s = "\"" ++ s ++ "\""
analyse :: String -> (String,Int,String,String)
analyse s = (word,paradigm,gradation,homonym) where
word = tagged "s" x
paradigm = (read (tagged "tn" x) :: Int)
gradation = tagged "av" x
homonym = tagged "hn" x
-- analyse each line in KOTUS by this
getEntry :: String -> Entry
getEntry s = emptyEntry {
word = w,
tn = p,
av = g,
hn = h,
fun = mkId w ++ "_" ++ (if h=="0" then "" else h ++ "_") ++ c,
cat = c,
par = (if p < 52 then "d" else "c") ++ num p ++ (if g == "0" then "" else "A"),
isSuffix = suff,
isDummy = elem p [0,50,51,101],
isPlurTant = last w == 't' && notElem p [5,43,47,99]
}
where
x = getTags s
(w,suff) = let w0 = tagged "s" x in if (head w0 == '-') then (tail w0, True) else (w0,False)
p = (read (tagged "tn" x) :: Int)
h = tagged "hn" x
g = tagged "av" x
c = if p < 50 then catNoun
else if p < 99 then catVerb
else catAdverb
num p = if p < 10 then "0" ++ show p else show p
tagged :: String -> Tags -> String
tagged s x = maybe "0" id $ lookup s x
@@ -52,6 +167,25 @@ getTags s = case s of
type Tags = [(String,String)]
data Entry = Entry {
word :: String,
tn :: Int,
av :: String,
hn :: String,
fun :: String,
cat :: String,
par :: String,
comp1 :: String, -- compound word with this as inflected first part
comp2 :: String, -- compound word with this as last part
isSuffix :: Bool, -- used only as suffix, e.g. "-tekoinen"
isDummy :: Bool, -- no inflection information, or compound, or pron, or adverb from adjective
isPlurTant :: Bool, -- plurale tantum, e.g. "sakset"
isDerived :: Bool -- is derived from other words, e.g. adverb from adjective
}
emptyEntry = Entry "" 0 "" "" "" "" "" "" "" False False False False
catNoun = "NK"
catVerb = "VK"
catAdjective = "AK"
catAdverb = "AdvK"

View File

@@ -1,10 +1,17 @@
gf: abs cnc
abs:
src:
runghc Kotus.hs src/kotus-sanalista_v1.xml >src.tmp
compounds:
runghc Kotus.hs src/kotus-sanalista_v1.xml -compounds >src.tmp
abs: src
cp prelDictFinAbs DictFinAbs.gf
runghc Kotus.hs <src/kotus-sanalista_v1.xml | grep ":" >>DictFinAbs.gf
grep ":" src.tmp >>DictFinAbs.gf
echo "}" >>DictFinAbs.gf
cnc:
cnc: src
cp prelDictFin DictFin.gf
runghc Kotus.hs <src/kotus-sanalista_v1.xml | grep "=" >>DictFin.gf
grep "=" src.tmp >>DictFin.gf
echo "}" >>DictFin.gf

View File

@@ -4,4 +4,5 @@ concrete DictFin of DictFinAbs = open MorphoFin, Kotus, Prelude in {
flags coding = utf8 ;
lincat NK = NForms ; VK = VForms ;AdvK = SS ;
lincat NK = NForms ; AK = NForms ; VK = VForms ; AdvK = SS ;

View File

@@ -1,3 +1,3 @@
abstract DictFinAbs = {
cat NK ; VK ; AdvK ;
cat NK ; AK ; VK ; AdvK ;