forked from GitHub/gf-rgl
improved MkMorphoDict.hs, still experimental
This commit is contained in:
@@ -17,86 +17,138 @@ import System.Environment (getArgs)
|
|||||||
-- - functionname = baseform_category, with exceptions
|
-- - functionname = baseform_category, with exceptions
|
||||||
-- - variant inflection tables: lie_1_V, lie_2_V
|
-- - variant inflection tables: lie_1_V, lie_2_V
|
||||||
-- - words that have non-ident characters: 'bird\'s-eye_A'
|
-- - words that have non-ident characters: 'bird\'s-eye_A'
|
||||||
-- - words that start with non-letters: W_'tween_Adv
|
|
||||||
|
|
||||||
-- example:
|
-- example:
|
||||||
-- gf -make ../english/DictEng.gf
|
-- gf -make ../english/DictEng.gf
|
||||||
-- runghc MkMorphodict.hs DictEngAbs.pgf MorphoDictEng
|
-- runghc
|
||||||
|
-- runghc MkMorphodict.hs pgf MorphoDictEng.config DictEngAbs.pgf MorphoDictEng
|
||||||
-- 64923 -> 56599 functions
|
-- 64923 -> 56599 functions
|
||||||
|
|
||||||
usage = "MkMorphodict <pgf> <outfile>"
|
usage = "runghc MkMorphodict (raw|pgf) <configfile> <datafile> <outfile>"
|
||||||
|
|
||||||
main = do
|
main = do
|
||||||
pgfile:outfile:_ <- getArgs
|
xx <- getArgs
|
||||||
pgf <- readPGF pgfile
|
if length xx /= 4
|
||||||
config <- readFile (outfile ++ ".config") >>= return . mkConfig
|
then putStrLn usage
|
||||||
|
else do
|
||||||
|
let mode:configfile:datafile:outfile:_ = xx
|
||||||
|
config <- readFile configfile >>= return . mkConfig
|
||||||
|
|
||||||
let (absrules,cncrules) = mkMorphoDict (MDEnv pgf config (head (languages pgf)))
|
rawdata <- case mode of
|
||||||
|
"pgf" -> pgfFile2rawData config datafile
|
||||||
|
"raw" -> readFile datafile >>= return . map getRawData . filter (not . null) . lines
|
||||||
|
rawdata2gf config rawdata outfile
|
||||||
|
|
||||||
|
|
||||||
|
rawdata2gf config rawdata outfile = do
|
||||||
|
|
||||||
|
let env = MDEnv rawdata config
|
||||||
|
let (absrules,cncrules) = mkMorphoDict env
|
||||||
|
|
||||||
absheader <- readFile (outfile ++ "Abs.header")
|
absheader <- readFile (outfile ++ "Abs.header")
|
||||||
cncheader <- readFile (outfile ++ ".header")
|
cncheader <- readFile (outfile ++ ".header")
|
||||||
|
|
||||||
writeFile (outfile ++ "Abs.gf") absheader
|
writeFile (outfile ++ "Abs.gf") absheader
|
||||||
appendFile (outfile ++ "Abs.gf") $ unlines absrules
|
appendFile (outfile ++ "Abs.gf") $ unlines $ sort absrules
|
||||||
appendFile (outfile ++ "Abs.gf") "}"
|
appendFile (outfile ++ "Abs.gf") "}"
|
||||||
|
|
||||||
writeFile (outfile ++ ".gf") cncheader
|
writeFile (outfile ++ ".gf") cncheader
|
||||||
appendFile (outfile ++ ".gf") $ unlines cncrules
|
appendFile (outfile ++ ".gf") $ unlines $ sort cncrules
|
||||||
appendFile (outfile ++ ".gf") "}"
|
appendFile (outfile ++ ".gf") "}"
|
||||||
|
|
||||||
|
-- one way to get raw data from a dictionary pgf
|
||||||
|
-- another way, more controllod, is to write a wrapper grammar with a function, for each category, to generate a RawData entry
|
||||||
|
pgfFile2rawData config pgffile = do
|
||||||
|
pgf <- readPGF pgffile
|
||||||
|
|
||||||
type Cat = CId
|
let cats = nub [c | (c,(_,_,_)) <- M.assocs config]
|
||||||
|
let lang:_ = languages pgf
|
||||||
|
|
||||||
|
return [
|
||||||
|
(cat, map snd lin) |
|
||||||
|
cat <- cats,
|
||||||
|
f <- functionsByCat pgf (mkCId cat),
|
||||||
|
lin <- tabularLinearizes pgf lang (mkApp f [])
|
||||||
|
]
|
||||||
|
|
||||||
|
type Cat = String
|
||||||
|
type Fun = String
|
||||||
type Oper = String
|
type Oper = String
|
||||||
type Config = M.Map Cat (Cat,Oper,[Int])
|
type Config = M.Map Cat (Cat,Oper,([Int],[Int])) -- lin word_Cat = Oper str_i1 str_i2 ... str_in featj1 ... featjn ;
|
||||||
|
|
||||||
data MDEnv = MDEnv {
|
data MDEnv = MDEnv {
|
||||||
pgf :: PGF,
|
rawdata :: [RawData],
|
||||||
config :: Config,
|
config :: Config
|
||||||
lang :: Language
|
|
||||||
}
|
}
|
||||||
|
|
||||||
mkConfig :: String -> Config
|
mkConfig :: String -> Config -- N : N mkN 0 2 4 6 # 9
|
||||||
mkConfig ls = M.fromList [(c,i) | Left (c,i) <- map mkOne (lines ls)]
|
mkConfig ls = M.fromList [(c,i) | Left (c,i) <- map mkOne (lines ls)]
|
||||||
where
|
where
|
||||||
mkOne s = case words s of
|
mkOne s = case words s of
|
||||||
"--":_ -> Right s
|
"--":_ -> Right s
|
||||||
cat:":":tcat:oper:ints -> Left (mkCId cat,(mkCId tcat,oper,map read ints))
|
cat:":":tcat:oper:ints -> Left (cat,(tcat,oper,mkArgs ints))
|
||||||
_ -> Right s
|
_ -> Right s
|
||||||
|
mkArgs ints = case break (=="#") ints of
|
||||||
|
(ss,[]) -> (map read ss, [])
|
||||||
|
(ss,_:fs) -> (map read ss, map read fs)
|
||||||
|
|
||||||
mkMorphoDict :: MDEnv -> ([String],[String])
|
getRawData s = case words s of
|
||||||
|
c:cs -> (c,cs)
|
||||||
|
|
||||||
|
type RawData = (String,[String]) -- old cat name, forms and features
|
||||||
|
type RawRule = (([String],Cat), (Oper, ([String],[String]))) -- parts of fun name, new category, oper, arguments
|
||||||
|
type RuleData = ((Fun, Cat), (Oper, ([String],[String]))) -- final fun name, cat, oper, args
|
||||||
|
|
||||||
|
mkMorphoDict :: MDEnv -> ([String],[String]) -- fun rules, lin rules
|
||||||
mkMorphoDict env =
|
mkMorphoDict env =
|
||||||
unzip $
|
unzip $
|
||||||
map splitRule $
|
map splitRule $
|
||||||
findCompounds $
|
---- findCompounds $ -- let us not care about compounds for the time being, but include them if they are given
|
||||||
nameFunctions $
|
nameFunctions $
|
||||||
mergeRules $
|
mergeRules $
|
||||||
concatMap findRules cats
|
findRules $
|
||||||
|
rawdata env
|
||||||
where
|
where
|
||||||
splitRule (fun,(cat,lin)) = (unwords ["fun",fun,":",showCId cat,";"], unwords ["lin",fun,"=", unwords lin,";"])
|
splitRule ((fun,cat),(oper,(forms,feats))) =
|
||||||
|
(unwords ["fun",fun,":",cat,";"], unwords ["lin",fun,"=", unwords (oper : map quote forms ++ feats),";"])
|
||||||
|
|
||||||
cats = nub [c | (c,(_,_,_)) <- M.assocs (config env)]
|
cats = nub [c | (c,(_,_,_)) <- M.assocs (config env)]
|
||||||
|
|
||||||
findRules cat = [
|
findRules :: [RawData] -> [RawRule]
|
||||||
([snd (lin !! head ints), showCId c], (c, op : appSig ints (map snd lin))) | --- head ints is the base form in smart paradigms
|
findRules raws = [
|
||||||
f <- functionsByCat (pgf env) cat,
|
(([lemma],newcat),(oper, appSig sig args)) |
|
||||||
lin <- tabularLinearizes (pgf env) (lang env) (mkApp f []), -- [[(String, String)]]
|
(oldcat,args) <- raws,
|
||||||
Just (c,op,ints) <- [M.lookup cat (config env)]
|
Just (newcat, oper, sig) <- [M.lookup oldcat (config env)],
|
||||||
|
let lemma = args !! head (fst sig)
|
||||||
]
|
]
|
||||||
|
|
||||||
appSig ints forms = [forms !! i | i <- ints]
|
appSig (ints,feats) args = ([args !! i | i <- ints], [args !! i | i <- feats])
|
||||||
|
|
||||||
|
mergeRules :: [RawRule] -> [RawRule]
|
||||||
mergeRules = map head . groupBy (\x y -> snd x == snd y) . sortOn snd
|
mergeRules = map head . groupBy (\x y -> snd x == snd y) . sortOn snd
|
||||||
|
|
||||||
|
nameFunctions :: [RawRule] -> [RuleData]
|
||||||
nameFunctions = expandNames . sortOn fst
|
nameFunctions = expandNames . sortOn fst
|
||||||
|
|
||||||
|
expandNames :: [RawRule] -> [RuleData]
|
||||||
expandNames fls = case fls of
|
expandNames fls = case fls of
|
||||||
(f,l):fls2 -> case span ((==f) . fst) fls2 of
|
(f@(w,c),l) : fls2 -> case span ((==f) . fst) fls2 of
|
||||||
([],_) -> (mkFun f,l) : expandNames fls2
|
([],_) -> ((mkFun (w ++ [c]),c),l) : expandNames fls2
|
||||||
(fls1,fls3) -> renames ((f,l):fls1) ++ expandNames fls3
|
(fls1,fls3) -> renames ((f,l):fls1) ++ expandNames fls3
|
||||||
_ -> []
|
_ -> []
|
||||||
|
|
||||||
renames fls = [(mkFun (init f ++ [show i,last f]),l) | (i,(f,l)) <- zip [1..] fls]
|
renames :: [RawRule] -> [RuleData]
|
||||||
|
renames fls = [((mkFun (f ++ [show i,c]),c),l) | (i,((f,c),l)) <- zip [1..] fls] ---- TODO disambiguate with a form, not int
|
||||||
|
|
||||||
|
mkFun = showCId . mkCId . concat . intersperse "_"
|
||||||
|
|
||||||
|
quote s = "\"" ++ s ++ "\""
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
{- ---- let us ignore this
|
||||||
|
findCompounds :: [RuleData] -> [RuleData]
|
||||||
findCompounds = getCompounds . sortOn cat_orthrevforms
|
findCompounds = getCompounds . sortOn cat_orthrevforms
|
||||||
|
|
||||||
cat_orthrevforms (_,(cat,_:forms)) = (cat,[map (!!i) fss | let fss = map reverse forms, i <- [0..minimum (map length fss) - 1]])
|
cat_orthrevforms (_,(cat,_:forms)) = (cat,[map (!!i) fss | let fss = map reverse forms, i <- [0..minimum (map length fss) - 1]])
|
||||||
@@ -105,6 +157,7 @@ mkMorphoDict env =
|
|||||||
revstem = head . snd . cat_revforms
|
revstem = head . snd . cat_revforms
|
||||||
wforms (_,(_,_:forms)) = forms
|
wforms (_,(_,_:forms)) = forms
|
||||||
|
|
||||||
|
getCompounds :: [RuleData] -> [RuleData]
|
||||||
getCompounds fls = case fls of
|
getCompounds fls = case fls of
|
||||||
fl : fls1 | length (revstem fl) < 2 -> markWith fl [] : getCompounds fls1
|
fl : fls1 | length (revstem fl) < 2 -> markWith fl [] : getCompounds fls1
|
||||||
fl : fls2 -> case span (\x -> and [isPrefixOf (reverse w) (reverse w1) | (w,w1) <- zip (wforms fl) (wforms x)]) fls2 of
|
fl : fls2 -> case span (\x -> and [isPrefixOf (reverse w) (reverse w1) | (w,w1) <- zip (wforms fl) (wforms x)]) fls2 of
|
||||||
@@ -117,26 +170,14 @@ mkMorphoDict env =
|
|||||||
True -> markWith fl1 [";","--","compound",(fst fl)]
|
True -> markWith fl1 [";","--","compound",(fst fl)]
|
||||||
False -> markWith fl1 [";","--","notcompound",(fst fl)]
|
False -> markWith fl1 [";","--","notcompound",(fst fl)]
|
||||||
|
|
||||||
markWith (f,(c,op:ws)) xs = (f,(c,op : map quote ws ++ xs))
|
markWith (f,(c,op:ws)) xs = (f,(c,op : map quote ws ++ xs)) ---- TODO only quote string args, not features
|
||||||
|
|
||||||
isPrefixWord x xy =
|
isPrefixWord x xy =
|
||||||
length suff > 1 &&
|
length suff > 1 && ---- compound first part must be at least two letters long
|
||||||
any (\c -> elem c "-0123456789aeiouyåäö") suff &&
|
any (\c -> elem c "-0123456789aeiouyåäö") suff && ---- must contain a vowel or a digit
|
||||||
isPrefixOf x xy
|
isPrefixOf x xy ---- and of course be a prefix
|
||||||
where
|
where
|
||||||
suff = drop (length x) xy
|
suff = drop (length x) xy
|
||||||
|
-}
|
||||||
mkFun = quoteIf . concat . intersperse "_"
|
|
||||||
quoteIf s = case s of
|
|
||||||
_ | any (\c -> not (isAlphaNum c || elem c "_'")) s -> "'" ++ unSgQuote s ++ "'"
|
|
||||||
c:_ | not (isAlpha c) -> "W_" ++ s
|
|
||||||
_ -> s
|
|
||||||
where
|
|
||||||
unSgQuote s = case s of
|
|
||||||
'\'':cs -> "\\\'" ++ unSgQuote cs
|
|
||||||
c:cs -> c : unSgQuote cs
|
|
||||||
_ -> s
|
|
||||||
|
|
||||||
|
|
||||||
quote s = "\"" ++ s ++ "\""
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user