forked from GitHub/gf-rgl
212 lines
7.6 KiB
Haskell
212 lines
7.6 KiB
Haskell
module Main where
|
||
|
||
import PGF
|
||
|
||
import qualified Data.Map as M
|
||
import Data.Char
|
||
import Data.List
|
||
import Safe
|
||
import System.Environment (getArgs)
|
||
import Debug.Trace
|
||
|
||
-- AR 2020-02-28
|
||
|
||
-- making a word list purely morphological, i.e.
|
||
-- - functions are 1-to-1 with lemgrams, i.e.
|
||
-- - no sense distinctions
|
||
-- - no subcategorizations
|
||
-- - no variants
|
||
-- - functionname = baseform_category, with exceptions
|
||
-- - variant inflection tables: lie_1_V, lie_2_V
|
||
-- - words that have non-ident characters: 'bird\'s-eye_A'
|
||
|
||
-- example:
|
||
-- gf -make ../english/DictEng.gf
|
||
-- runghc MkMorphodict.hs pgf MorphoDictEng.config DictEngAbs.pgf MorphoDictEng
|
||
-- 64923 -> 56599 functions
|
||
|
||
usage = "runghc MkMorphodict (raw|pgf) <configfile> <datafile> <outfile>"
|
||
|
||
main = do
|
||
xx <- getArgs
|
||
if length xx /= 4
|
||
then do
|
||
putStrLn "Usage:"
|
||
putStrLn usage
|
||
putStrLn $ "Got instead: " ++ show xx
|
||
else do
|
||
let mode:configfile:datafile:outfile:_ = xx
|
||
config <- readFile configfile >>= return . mkConfig
|
||
|
||
rawdata <- case mode of
|
||
"pgf" -> pgfFile2rawData config datafile
|
||
"raw" -> readFile datafile >>= return . map getRawData . filter (not . null) . lines
|
||
_ -> error $ "Expected mode (pgf|raw), got " ++ mode
|
||
rawdata2gf config rawdata outfile
|
||
|
||
|
||
rawdata2gf config rawdata outfile = do
|
||
|
||
let env = MDEnv rawdata config
|
||
let (absrules,cncrules) = mkMorphoDict env
|
||
|
||
absheader <- readFile (outfile ++ "Abs.header")
|
||
cncheader <- readFile (outfile ++ ".header")
|
||
|
||
writeFile (outfile ++ "Abs.gf") absheader
|
||
appendFile (outfile ++ "Abs.gf") $ unlines $ sort absrules
|
||
appendFile (outfile ++ "Abs.gf") "}"
|
||
|
||
writeFile (outfile ++ ".gf") cncheader
|
||
appendFile (outfile ++ ".gf") $ unlines $ sort cncrules
|
||
appendFile (outfile ++ ".gf") "}"
|
||
|
||
-- one way to get raw data from a dictionary pgf
|
||
-- another way, more controllod, is to write a wrapper grammar with a function, for each category, to generate a RawData entry
|
||
pgfFile2rawData config pgffile = do
|
||
pgf <- readPGF pgffile
|
||
|
||
let cats = nub [c | (c,(_,_,_)) <- M.assocs config]
|
||
let lang:_ = languages pgf
|
||
|
||
return [
|
||
(cat, map snd lin) |
|
||
cat <- cats,
|
||
f <- functionsByCat pgf (mkCId cat),
|
||
lin <- tabularLinearizes pgf lang (mkApp f [])
|
||
]
|
||
|
||
type Cat = String
|
||
type Fun = String
|
||
type Oper = String
|
||
type Config = M.Map Cat (Cat,Oper,([Int],[Int])) -- lin word_Cat = Oper str_i1 str_i2 ... str_in featj1 ... featjn ;
|
||
|
||
data MDEnv = MDEnv {
|
||
rawdata :: [RawData],
|
||
config :: Config
|
||
}
|
||
|
||
mkConfig :: String -> Config -- N : N mkN 0 2 4 6 # 9
|
||
mkConfig ls = M.fromList [(c,i) | Left (c,i) <- map mkOne (lines ls)]
|
||
where
|
||
mkOne s = case words s of
|
||
"--":_ -> Right s
|
||
cat:":":tcat:oper:ints -> Left (cat,(tcat,oper,mkArgs ints))
|
||
_ -> Right s
|
||
mkArgs ints = case break (=="#") ints of
|
||
(ss,[]) -> (map read' ss, [])
|
||
(ss,_:fs) -> (map read' ss, map read' fs)
|
||
read' a = readNote [] a -- Safe.readNote provides better error message
|
||
|
||
getRawData s = case words s of
|
||
c:cs -> (c,cs)
|
||
|
||
type RawData = (String,[String]) -- old cat name, forms and features
|
||
type RawRule = (([String],Cat), (Oper, ([String],[String]))) -- parts of fun name, new category, oper, arguments
|
||
type RuleData = ((Fun, Cat), (Oper, ([String],[String]))) -- final fun name, cat, oper, args
|
||
|
||
mkMorphoDict :: MDEnv -> ([String],[String]) -- fun rules, lin rules
|
||
mkMorphoDict env =
|
||
unzip $
|
||
map splitRule $
|
||
---- findCompounds $ -- let us not care about compounds for the time being, but include them if they are given
|
||
nameFunctions $
|
||
mergeRules $
|
||
findRules $
|
||
rawdata env
|
||
where
|
||
splitRule ((fun,cat),(oper,(forms,feats))) =
|
||
(unwords ["fun",fun,":",cat,";"], unwords ["lin",fun,"=", unwords (oper : map quote forms ++ feats),";"])
|
||
|
||
cats = nub [c | (c,(_,_,_)) <- M.assocs (config env)]
|
||
|
||
findRules :: [RawData] -> [RawRule]
|
||
findRules raws = [
|
||
(([lemma],newcat),(oper, appSig sig args)) |
|
||
(oldcat,args) <- raws,
|
||
Just (newcat, oper, sig) <- [M.lookup oldcat (config env)],
|
||
let lemma = args `at` head (fst sig)
|
||
]
|
||
|
||
appSig (ints,feats) args =
|
||
-- If there's wrong number in config file, uncomment the line below to see which number it should be
|
||
-- trace (intercalate "\n" $ map show (zip [0..] args)) $
|
||
([args `at` i | i <- ints], [args `at` i | i <- feats])
|
||
|
||
mergeRules :: [RawRule] -> [RawRule]
|
||
mergeRules = map head . groupBy (\x y -> snd x == snd y) . sortOn snd
|
||
|
||
nameFunctions :: [RawRule] -> [RuleData]
|
||
nameFunctions = expandNames . sortOn fst
|
||
|
||
expandNames :: [RawRule] -> [RuleData]
|
||
expandNames fls = case fls of
|
||
(f@(w,c),l) : fls2 -> case span ((==f) . fst) fls2 of
|
||
([],_) -> ((mkFun (w ++ [c]),c),l) : expandNames fls2
|
||
(fls1,fls3) -> renames ((f,l):fls1) ++ expandNames fls3
|
||
_ -> []
|
||
|
||
renames :: [RawRule] -> [RuleData]
|
||
--- renames fls = [((mkFun (f ++ [show i,c]),c),l) | (i,((f,c),l)) <- zip [1..] fls] -- disambiguate with int
|
||
renames fls = [((mkFun (f ++ fs ++ [c]),c),l) | (i,(((f,c),l),fs)) <- zip [1..] (zip fls (minimize fls))] -- disambiguate with different forms
|
||
|
||
minimize :: [RawRule] -> [[String]]
|
||
minimize fls = shrink [ws ++ fs | (_,(_,(_:ws,fs))) <- fls]
|
||
|
||
shrink fls = case fls of
|
||
fl@(_:_):_ | all ((==head fl) . head) fls -> shrink (map tail fls)
|
||
fl@(_:_):_ | all ((==last fl) . last) fls -> shrink (map init fls)
|
||
_ -> shrinkMore fls
|
||
|
||
shrinkMore fls = case fls of
|
||
_ | length (nub (map init fls)) == length fls -> shrinkMore (map init fls)
|
||
_ | length (nub (map tail fls)) == length fls -> shrinkMore (map tail fls)
|
||
_ -> fls
|
||
|
||
-- >>> mkFun ["hello", "world", "hello friends", "hello-all"]
|
||
-- "hello_world_hello_friends_hello_all"
|
||
mkFun :: [String] -> String -- if word contains space or hyphen, replace with underscore
|
||
mkFun = showCId . mkCId . concat . intersperse "_" . concatMap (words . removeHyphen)
|
||
where
|
||
removeHyphen [] = []
|
||
removeHyphen ['-'] = ['-'] -- If hyphen is the last character, it's usually meaningful, leave it
|
||
removeHyphen ('-':cs) = ' ' : removeHyphen cs
|
||
removeHyphen (c:cs) = c : removeHyphen cs
|
||
|
||
quote s = "\"" ++ s ++ "\""
|
||
|
||
{- ---- let us ignore this
|
||
findCompounds :: [RuleData] -> [RuleData]
|
||
findCompounds = getCompounds . sortOn cat_orthrevforms
|
||
|
||
cat_orthrevforms (_,(cat,_:forms)) = (cat,[map (!!i) fss | let fss = map reverse forms, i <- [0..minimum (map length fss) - 1]])
|
||
|
||
cat_revforms (_,(cat,_:forms)) = (cat,map reverse forms)
|
||
revstem = head . snd . cat_revforms
|
||
wforms (_,(_,_:forms)) = forms
|
||
|
||
getCompounds :: [RuleData] -> [RuleData]
|
||
getCompounds fls = case fls of
|
||
fl : fls1 | length (revstem fl) < 2 -> markWith fl [] : getCompounds fls1
|
||
fl : fls2 -> case span (\x -> and [isPrefixOf (reverse w) (reverse w1) | (w,w1) <- zip (wforms fl) (wforms x)]) fls2 of
|
||
([],_:_) -> markWith fl [] : getCompounds fls2
|
||
(fls1,fls3) -> markWith fl [] : map (markCompound fl) fls1 ++ getCompounds fls3
|
||
_ -> []
|
||
|
||
markCompound fl fl1 =
|
||
case and [isPrefixWord (reverse w) (reverse w1) | (w,w1) <- zip (wforms fl) (wforms fl1)] of
|
||
True -> markWith fl1 [";","--","compound",(fst fl)]
|
||
False -> markWith fl1 [";","--","notcompound",(fst fl)]
|
||
|
||
markWith (f,(c,op:ws)) xs = (f,(c,op : map quote ws ++ xs)) ---- TODO only quote string args, not features
|
||
|
||
isPrefixWord x xy =
|
||
length suff > 1 && ---- compound first part must be at least two letters long
|
||
any (\c -> elem c "-0123456789aeiouyåäö") suff && ---- must contain a vowel or a digit
|
||
isPrefixOf x xy ---- and of course be a prefix
|
||
where
|
||
suff = drop (length x) xy
|
||
-}
|
||
|
||
|