diff --git a/src/morphodict/MkMorphodict.hs b/src/morphodict/MkMorphodict.hs index aa62cadd7..67a952378 100644 --- a/src/morphodict/MkMorphodict.hs +++ b/src/morphodict/MkMorphodict.hs @@ -17,86 +17,138 @@ import System.Environment (getArgs) -- - functionname = baseform_category, with exceptions -- - variant inflection tables: lie_1_V, lie_2_V -- - words that have non-ident characters: 'bird\'s-eye_A' --- - words that start with non-letters: W_'tween_Adv -- example: -- gf -make ../english/DictEng.gf --- runghc MkMorphodict.hs DictEngAbs.pgf MorphoDictEng +-- runghc +-- runghc MkMorphodict.hs pgf MorphoDictEng.config DictEngAbs.pgf MorphoDictEng -- 64923 -> 56599 functions -usage = "MkMorphodict " +usage = "runghc MkMorphodict (raw|pgf) " main = do - pgfile:outfile:_ <- getArgs - pgf <- readPGF pgfile - config <- readFile (outfile ++ ".config") >>= return . mkConfig + xx <- getArgs + if length xx /= 4 + then putStrLn usage + else do + let mode:configfile:datafile:outfile:_ = xx + config <- readFile configfile >>= return . mkConfig - let (absrules,cncrules) = mkMorphoDict (MDEnv pgf config (head (languages pgf))) + rawdata <- case mode of + "pgf" -> pgfFile2rawData config datafile + "raw" -> readFile datafile >>= return . map getRawData . filter (not . null) . lines + rawdata2gf config rawdata outfile + + +rawdata2gf config rawdata outfile = do + + let env = MDEnv rawdata config + let (absrules,cncrules) = mkMorphoDict env absheader <- readFile (outfile ++ "Abs.header") cncheader <- readFile (outfile ++ ".header") writeFile (outfile ++ "Abs.gf") absheader - appendFile (outfile ++ "Abs.gf") $ unlines absrules + appendFile (outfile ++ "Abs.gf") $ unlines $ sort absrules appendFile (outfile ++ "Abs.gf") "}" writeFile (outfile ++ ".gf") cncheader - appendFile (outfile ++ ".gf") $ unlines cncrules + appendFile (outfile ++ ".gf") $ unlines $ sort cncrules appendFile (outfile ++ ".gf") "}" +-- one way to get raw data from a dictionary pgf +-- another way, more controllod, is to write a wrapper grammar with a function, for each category, to generate a RawData entry +pgfFile2rawData config pgffile = do + pgf <- readPGF pgffile -type Cat = CId + let cats = nub [c | (c,(_,_,_)) <- M.assocs config] + let lang:_ = languages pgf + + return [ + (cat, map snd lin) | + cat <- cats, + f <- functionsByCat pgf (mkCId cat), + lin <- tabularLinearizes pgf lang (mkApp f []) + ] + +type Cat = String +type Fun = String type Oper = String -type Config = M.Map Cat (Cat,Oper,[Int]) +type Config = M.Map Cat (Cat,Oper,([Int],[Int])) -- lin word_Cat = Oper str_i1 str_i2 ... str_in featj1 ... featjn ; data MDEnv = MDEnv { - pgf :: PGF, - config :: Config, - lang :: Language + rawdata :: [RawData], + config :: Config } -mkConfig :: String -> Config +mkConfig :: String -> Config -- N : N mkN 0 2 4 6 # 9 mkConfig ls = M.fromList [(c,i) | Left (c,i) <- map mkOne (lines ls)] where mkOne s = case words s of "--":_ -> Right s - cat:":":tcat:oper:ints -> Left (mkCId cat,(mkCId tcat,oper,map read ints)) + cat:":":tcat:oper:ints -> Left (cat,(tcat,oper,mkArgs ints)) _ -> Right s + mkArgs ints = case break (=="#") ints of + (ss,[]) -> (map read ss, []) + (ss,_:fs) -> (map read ss, map read fs) -mkMorphoDict :: MDEnv -> ([String],[String]) +getRawData s = case words s of + c:cs -> (c,cs) + +type RawData = (String,[String]) -- old cat name, forms and features +type RawRule = (([String],Cat), (Oper, ([String],[String]))) -- parts of fun name, new category, oper, arguments +type RuleData = ((Fun, Cat), (Oper, ([String],[String]))) -- final fun name, cat, oper, args + +mkMorphoDict :: MDEnv -> ([String],[String]) -- fun rules, lin rules mkMorphoDict env = unzip $ map splitRule $ - findCompounds $ +---- findCompounds $ -- let us not care about compounds for the time being, but include them if they are given nameFunctions $ mergeRules $ - concatMap findRules cats + findRules $ + rawdata env where - splitRule (fun,(cat,lin)) = (unwords ["fun",fun,":",showCId cat,";"], unwords ["lin",fun,"=", unwords lin,";"]) + splitRule ((fun,cat),(oper,(forms,feats))) = + (unwords ["fun",fun,":",cat,";"], unwords ["lin",fun,"=", unwords (oper : map quote forms ++ feats),";"]) cats = nub [c | (c,(_,_,_)) <- M.assocs (config env)] - findRules cat = [ - ([snd (lin !! head ints), showCId c], (c, op : appSig ints (map snd lin))) | --- head ints is the base form in smart paradigms - f <- functionsByCat (pgf env) cat, - lin <- tabularLinearizes (pgf env) (lang env) (mkApp f []), -- [[(String, String)]] - Just (c,op,ints) <- [M.lookup cat (config env)] + findRules :: [RawData] -> [RawRule] + findRules raws = [ + (([lemma],newcat),(oper, appSig sig args)) | + (oldcat,args) <- raws, + Just (newcat, oper, sig) <- [M.lookup oldcat (config env)], + let lemma = args !! head (fst sig) ] - appSig ints forms = [forms !! i | i <- ints] + appSig (ints,feats) args = ([args !! i | i <- ints], [args !! i | i <- feats]) + mergeRules :: [RawRule] -> [RawRule] mergeRules = map head . groupBy (\x y -> snd x == snd y) . sortOn snd + nameFunctions :: [RawRule] -> [RuleData] nameFunctions = expandNames . sortOn fst + expandNames :: [RawRule] -> [RuleData] expandNames fls = case fls of - (f,l):fls2 -> case span ((==f) . fst) fls2 of - ([],_) -> (mkFun f,l) : expandNames fls2 + (f@(w,c),l) : fls2 -> case span ((==f) . fst) fls2 of + ([],_) -> ((mkFun (w ++ [c]),c),l) : expandNames fls2 (fls1,fls3) -> renames ((f,l):fls1) ++ expandNames fls3 _ -> [] - renames fls = [(mkFun (init f ++ [show i,last f]),l) | (i,(f,l)) <- zip [1..] fls] + renames :: [RawRule] -> [RuleData] + renames fls = [((mkFun (f ++ [show i,c]),c),l) | (i,((f,c),l)) <- zip [1..] fls] ---- TODO disambiguate with a form, not int +mkFun = showCId . mkCId . concat . intersperse "_" + +quote s = "\"" ++ s ++ "\"" + + + + +{- ---- let us ignore this + findCompounds :: [RuleData] -> [RuleData] findCompounds = getCompounds . sortOn cat_orthrevforms cat_orthrevforms (_,(cat,_:forms)) = (cat,[map (!!i) fss | let fss = map reverse forms, i <- [0..minimum (map length fss) - 1]]) @@ -105,6 +157,7 @@ mkMorphoDict env = revstem = head . snd . cat_revforms wforms (_,(_,_:forms)) = forms + getCompounds :: [RuleData] -> [RuleData] getCompounds fls = case fls of fl : fls1 | length (revstem fl) < 2 -> markWith fl [] : getCompounds fls1 fl : fls2 -> case span (\x -> and [isPrefixOf (reverse w) (reverse w1) | (w,w1) <- zip (wforms fl) (wforms x)]) fls2 of @@ -117,26 +170,14 @@ mkMorphoDict env = True -> markWith fl1 [";","--","compound",(fst fl)] False -> markWith fl1 [";","--","notcompound",(fst fl)] - markWith (f,(c,op:ws)) xs = (f,(c,op : map quote ws ++ xs)) + markWith (f,(c,op:ws)) xs = (f,(c,op : map quote ws ++ xs)) ---- TODO only quote string args, not features isPrefixWord x xy = - length suff > 1 && - any (\c -> elem c "-0123456789aeiouyåäö") suff && - isPrefixOf x xy + length suff > 1 && ---- compound first part must be at least two letters long + any (\c -> elem c "-0123456789aeiouyåäö") suff && ---- must contain a vowel or a digit + isPrefixOf x xy ---- and of course be a prefix where suff = drop (length x) xy - -mkFun = quoteIf . concat . intersperse "_" -quoteIf s = case s of - _ | any (\c -> not (isAlphaNum c || elem c "_'")) s -> "'" ++ unSgQuote s ++ "'" - c:_ | not (isAlpha c) -> "W_" ++ s - _ -> s - where - unSgQuote s = case s of - '\'':cs -> "\\\'" ++ unSgQuote cs - c:cs -> c : unSgQuote cs - _ -> s +-} -quote s = "\"" ++ s ++ "\"" -