1
0
forked from GitHub/gf-rgl
Files
gf-rgl/src/morphodict/MkMorphodict.hs
2021-06-04 13:42:14 +08:00

212 lines
7.6 KiB
Haskell
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
module Main where
import PGF
import qualified Data.Map as M
import Data.Char
import Data.List
import Safe
import System.Environment (getArgs)
import Debug.Trace
-- AR 2020-02-28
-- making a word list purely morphological, i.e.
-- - functions are 1-to-1 with lemgrams, i.e.
-- - no sense distinctions
-- - no subcategorizations
-- - no variants
-- - functionname = baseform_category, with exceptions
-- - variant inflection tables: lie_1_V, lie_2_V
-- - words that have non-ident characters: 'bird\'s-eye_A'
-- example:
-- gf -make ../english/DictEng.gf
-- runghc MkMorphodict.hs pgf MorphoDictEng.config DictEngAbs.pgf MorphoDictEng
-- 64923 -> 56599 functions
usage = "runghc MkMorphodict (raw|pgf) <configfile> <datafile> <outfile>"
main = do
xx <- getArgs
if length xx /= 4
then do
putStrLn "Usage:"
putStrLn usage
putStrLn $ "Got instead: " ++ show xx
else do
let mode:configfile:datafile:outfile:_ = xx
config <- readFile configfile >>= return . mkConfig
rawdata <- case mode of
"pgf" -> pgfFile2rawData config datafile
"raw" -> readFile datafile >>= return . map getRawData . filter (not . null) . lines
_ -> error $ "Expected mode (pgf|raw), got " ++ mode
rawdata2gf config rawdata outfile
rawdata2gf config rawdata outfile = do
let env = MDEnv rawdata config
let (absrules,cncrules) = mkMorphoDict env
absheader <- readFile (outfile ++ "Abs.header")
cncheader <- readFile (outfile ++ ".header")
writeFile (outfile ++ "Abs.gf") absheader
appendFile (outfile ++ "Abs.gf") $ unlines $ sort absrules
appendFile (outfile ++ "Abs.gf") "}"
writeFile (outfile ++ ".gf") cncheader
appendFile (outfile ++ ".gf") $ unlines $ sort cncrules
appendFile (outfile ++ ".gf") "}"
-- one way to get raw data from a dictionary pgf
-- another way, more controllod, is to write a wrapper grammar with a function, for each category, to generate a RawData entry
pgfFile2rawData config pgffile = do
pgf <- readPGF pgffile
let cats = nub [c | (c,(_,_,_)) <- M.assocs config]
let lang:_ = languages pgf
return [
(cat, map snd lin) |
cat <- cats,
f <- functionsByCat pgf (mkCId cat),
lin <- tabularLinearizes pgf lang (mkApp f [])
]
type Cat = String
type Fun = String
type Oper = String
type Config = M.Map Cat (Cat,Oper,([Int],[Int])) -- lin word_Cat = Oper str_i1 str_i2 ... str_in featj1 ... featjn ;
data MDEnv = MDEnv {
rawdata :: [RawData],
config :: Config
}
mkConfig :: String -> Config -- N : N mkN 0 2 4 6 # 9
mkConfig ls = M.fromList [(c,i) | Left (c,i) <- map mkOne (lines ls)]
where
mkOne s = case words s of
"--":_ -> Right s
cat:":":tcat:oper:ints -> Left (cat,(tcat,oper,mkArgs ints))
_ -> Right s
mkArgs ints = case break (=="#") ints of
(ss,[]) -> (map read' ss, [])
(ss,_:fs) -> (map read' ss, map read' fs)
read' a = readNote [] a -- Safe.readNote provides better error message
getRawData s = case words s of
c:cs -> (c,cs)
type RawData = (String,[String]) -- old cat name, forms and features
type RawRule = (([String],Cat), (Oper, ([String],[String]))) -- parts of fun name, new category, oper, arguments
type RuleData = ((Fun, Cat), (Oper, ([String],[String]))) -- final fun name, cat, oper, args
mkMorphoDict :: MDEnv -> ([String],[String]) -- fun rules, lin rules
mkMorphoDict env =
unzip $
map splitRule $
---- findCompounds $ -- let us not care about compounds for the time being, but include them if they are given
nameFunctions $
mergeRules $
findRules $
rawdata env
where
splitRule ((fun,cat),(oper,(forms,feats))) =
(unwords ["fun",fun,":",cat,";"], unwords ["lin",fun,"=", unwords (oper : map quote forms ++ feats),";"])
cats = nub [c | (c,(_,_,_)) <- M.assocs (config env)]
findRules :: [RawData] -> [RawRule]
findRules raws = [
(([lemma],newcat),(oper, appSig sig args)) |
(oldcat,args) <- raws,
Just (newcat, oper, sig) <- [M.lookup oldcat (config env)],
let lemma = args `at` head (fst sig)
]
appSig (ints,feats) args =
-- If there's wrong number in config file, uncomment the line below to see which number it should be
-- trace (intercalate "\n" $ map show (zip [0..] args)) $
([args `at` i | i <- ints], [args `at` i | i <- feats])
mergeRules :: [RawRule] -> [RawRule]
mergeRules = map head . groupBy (\x y -> snd x == snd y) . sortOn snd
nameFunctions :: [RawRule] -> [RuleData]
nameFunctions = expandNames . sortOn fst
expandNames :: [RawRule] -> [RuleData]
expandNames fls = case fls of
(f@(w,c),l) : fls2 -> case span ((==f) . fst) fls2 of
([],_) -> ((mkFun (w ++ [c]),c),l) : expandNames fls2
(fls1,fls3) -> renames ((f,l):fls1) ++ expandNames fls3
_ -> []
renames :: [RawRule] -> [RuleData]
--- renames fls = [((mkFun (f ++ [show i,c]),c),l) | (i,((f,c),l)) <- zip [1..] fls] -- disambiguate with int
renames fls = [((mkFun (f ++ fs ++ [c]),c),l) | (i,(((f,c),l),fs)) <- zip [1..] (zip fls (minimize fls))] -- disambiguate with different forms
minimize :: [RawRule] -> [[String]]
minimize fls = shrink [ws ++ fs | (_,(_,(_:ws,fs))) <- fls]
shrink fls = case fls of
fl@(_:_):_ | all ((==head fl) . head) fls -> shrink (map tail fls)
fl@(_:_):_ | all ((==last fl) . last) fls -> shrink (map init fls)
_ -> shrinkMore fls
shrinkMore fls = case fls of
_ | length (nub (map init fls)) == length fls -> shrinkMore (map init fls)
_ | length (nub (map tail fls)) == length fls -> shrinkMore (map tail fls)
_ -> fls
-- >>> mkFun ["hello", "world", "hello friends", "hello-all"]
-- "hello_world_hello_friends_hello_all"
mkFun :: [String] -> String -- if word contains space or hyphen, replace with underscore
mkFun = showCId . mkCId . concat . intersperse "_" . concatMap (words . removeHyphen)
where
removeHyphen [] = []
removeHyphen ['-'] = ['-'] -- If hyphen is the last character, it's usually meaningful, leave it
removeHyphen ('-':cs) = ' ' : removeHyphen cs
removeHyphen (c:cs) = c : removeHyphen cs
quote s = "\"" ++ s ++ "\""
{- ---- let us ignore this
findCompounds :: [RuleData] -> [RuleData]
findCompounds = getCompounds . sortOn cat_orthrevforms
cat_orthrevforms (_,(cat,_:forms)) = (cat,[map (!!i) fss | let fss = map reverse forms, i <- [0..minimum (map length fss) - 1]])
cat_revforms (_,(cat,_:forms)) = (cat,map reverse forms)
revstem = head . snd . cat_revforms
wforms (_,(_,_:forms)) = forms
getCompounds :: [RuleData] -> [RuleData]
getCompounds fls = case fls of
fl : fls1 | length (revstem fl) < 2 -> markWith fl [] : getCompounds fls1
fl : fls2 -> case span (\x -> and [isPrefixOf (reverse w) (reverse w1) | (w,w1) <- zip (wforms fl) (wforms x)]) fls2 of
([],_:_) -> markWith fl [] : getCompounds fls2
(fls1,fls3) -> markWith fl [] : map (markCompound fl) fls1 ++ getCompounds fls3
_ -> []
markCompound fl fl1 =
case and [isPrefixWord (reverse w) (reverse w1) | (w,w1) <- zip (wforms fl) (wforms fl1)] of
True -> markWith fl1 [";","--","compound",(fst fl)]
False -> markWith fl1 [";","--","notcompound",(fst fl)]
markWith (f,(c,op:ws)) xs = (f,(c,op : map quote ws ++ xs)) ---- TODO only quote string args, not features
isPrefixWord x xy =
length suff > 1 && ---- compound first part must be at least two letters long
any (\c -> elem c "-0123456789aeiouyåäö") suff && ---- must contain a vowel or a digit
isPrefixOf x xy ---- and of course be a prefix
where
suff = drop (length x) xy
-}