diff --git a/src-3.0/GF/Compile/GeneratePMCFG.hs b/src-3.0/GF/Compile/GeneratePMCFG.hs new file mode 100644 index 000000000..435a06eb1 --- /dev/null +++ b/src-3.0/GF/Compile/GeneratePMCFG.hs @@ -0,0 +1,356 @@ +{-# OPTIONS -fbang-patterns #-} +---------------------------------------------------------------------- +-- | +-- Maintainer : Krasimir Angelov +-- Stability : (stable) +-- Portability : (portable) +-- +-- Converting SimpleGFC grammars to fast nonerasing MCFG grammar. +-- +-- the resulting grammars might be /very large/ +-- +-- the conversion is only equivalent if the GFC grammar has a context-free backbone. +----------------------------------------------------------------------------- + + +module GF.Compile.GeneratePMCFG + (convertConcrete) where + +import PGF.CId +import PGF.Data +import PGF.Macros --hiding (prt) +import PGF.Parsing.FCFG.Utilities + +import GF.Data.BacktrackM +import GF.Data.SortedList +import GF.Data.Utilities (updateNthM, sortNub) + +import qualified Data.Map as Map +import qualified Data.Set as Set +import qualified Data.List as List +import qualified Data.ByteString.Char8 as BS +import Data.Array +import Data.Maybe +import Control.Monad +import Debug.Trace + +---------------------------------------------------------------------- +-- main conversion function + +convertConcrete :: Abstr -> Concr -> FGrammar +convertConcrete abs cnc = fixHoasFuns $ convert abs_defs' conc' cats' + where abs_defs = Map.assocs (funs abs) + conc = Map.union (opers cnc) (lins cnc) -- "union big+small most efficient" + cats = lincats cnc + (abs_defs',conc',cats') = expandHOAS abs_defs conc cats + +expandHOAS :: [(CId,(Type,Exp))] -> TermMap -> TermMap -> ([(CId,(Type,Exp))],TermMap,TermMap) +expandHOAS funs lins lincats = (funs' ++ hoFuns ++ varFuns, + Map.unions [lins, hoLins, varLins], + Map.unions [lincats, hoLincats, varLincat]) + where + -- replace higher-order fun argument types with new categories + funs' = [(f,(fixType ty,e)) | (f,(ty,e)) <- funs] + where + fixType :: Type -> Type + fixType ty = let (ats,rt) = typeSkeleton ty in cftype (map catName ats) rt + + hoTypes :: [(Int,CId)] + hoTypes = sortNub [(n,c) | (_,(ty,_)) <- funs, (n,c) <- fst (typeSkeleton ty), n > 0] + hoCats = sortNub (map snd hoTypes) + -- for each Cat with N bindings, we add a new category _NCat + -- each new category contains a single function __NCat : Cat -> _Var -> ... -> _Var -> _NCat + hoFuns = [(funName ty,(cftype (c : replicate n varCat) (catName ty),EEq [])) | ty@(n,c) <- hoTypes] + -- lincats for the new categories + hoLincats = Map.fromList [(catName ty, modifyRec (++ replicate n (S [])) (lincatOf c)) | ty@(n,c) <- hoTypes] + -- linearizations of the new functions, lin __NCat v_0 ... v_n-1 x = { s1 = x.s1; ...; sk = x.sk; $0 = v_0.s ... + hoLins = Map.fromList [ (funName ty, mkLin c n) | ty@(n,c) <- hoTypes] + where mkLin c n = modifyRec (\fs -> [P (V 0) (C j) | j <- [0..length fs-1]] ++ [P (V i) (C 0) | i <- [1..n]]) (lincatOf c) + -- for each Cat, we a add a fun _Var_Cat : _Var -> Cat + varFuns = [(varFunName cat, (cftype [varCat] cat,EEq [])) | cat <- hoCats] + -- linearizations of the _Var_Cat functions + varLins = Map.fromList [(varFunName cat, R [P (V 0) (C 0)]) | cat <- hoCats] + -- lincat for the _Var category + varLincat = Map.singleton varCat (R [S []]) + + lincatOf c = fromMaybe (error $ "No lincat for " ++ prCId c) $ Map.lookup c lincats + + modifyRec :: ([Term] -> [Term]) -> Term -> Term + modifyRec f (R xs) = R (f xs) + modifyRec _ t = error $ "Not a record: " ++ show t + + varCat = mkCId "_Var" + + catName :: (Int,CId) -> CId + catName (0,c) = c + catName (n,c) = mkCId ("_" ++ show n ++ prCId c) + + funName :: (Int,CId) -> CId + funName (n,c) = mkCId ("__" ++ show n ++ prCId c) + + varFunName :: CId -> CId + varFunName c = mkCId ("_Var_" ++ prCId c) + +-- replaces __NCat with _B and _Var_Cat with _. +-- the temporary names are just there to avoid name collisions. +fixHoasFuns :: FGrammar -> FGrammar +fixHoasFuns (!rs, !cs) = ([FRule (fixName n) ps args cat lins | FRule n ps args cat lins <- rs], cs) + where fixName (CId n) | BS.pack "__" `BS.isPrefixOf` n = (mkCId "_B") + | BS.pack "_Var_" `BS.isPrefixOf` n = wildCId + fixName n = n + +convert :: [(CId,(Type,Exp))] -> TermMap -> TermMap -> FGrammar +convert abs_defs cnc_defs cat_defs = getFGrammar (List.foldl' (convertRule cnc_defs) emptyFRulesEnv srules) + where + srules = [ + (XRule id args res (map findLinType args) (findLinType res) term) | + (id, (ty,_)) <- abs_defs, let (args,res) = catSkeleton ty, + term <- Map.lookup id cnc_defs] + + findLinType id = fromMaybe (error $ "No lincat for " ++ show id) (Map.lookup id cat_defs) + + +convertRule :: TermMap -> FRulesEnv -> XRule -> FRulesEnv +convertRule cnc_defs frulesEnv (XRule fun args cat ctypes ctype term) = + foldBM addRule + frulesEnv + (convertTerm cnc_defs [] ctype term [([],[])]) + (protoFCat cnc_defs cat ctype, zipWith (protoFCat cnc_defs) args ctypes) + where + addRule linRec (newCat', newArgs') env0 = + let (env1, newCat) = genFCatHead env0 newCat' + (env2, newArgs) = List.mapAccumL (genFCatArg cnc_defs) env1 newArgs' + + newLinRec = mkArray (map (mkArray . snd) linRec) + mkArray lst = listArray (0,length lst-1) lst + + rule = FRule fun [] newArgs newCat newLinRec + in addFRule env2 rule + +---------------------------------------------------------------------- +-- term conversion + +type CnvMonad a = BacktrackM Env a + +type FPath = [FIndex] +data ProtoFCat = PFCat CId [FPath] [(FPath,FIndex)] Term +type Env = (ProtoFCat, [ProtoFCat]) +type LinRec = [(FPath, [FSymbol])] +data XRule = XRule CId {- function -} + [CId] {- argument types -} + CId {- result type -} + [Term] {- argument lin-types representation -} + Term {- result lin-type representation -} + Term {- body -} + +protoFCat :: TermMap -> CId -> Term -> ProtoFCat +protoFCat cnc_defs cat ctype = PFCat cat (getRCS cnc_defs ctype) [] ctype + +type TermMap = Map.Map CId Term + +convertTerm :: TermMap -> FPath -> Term -> Term -> LinRec -> CnvMonad LinRec +convertTerm cnc_defs sel ctype (V nr) ((lbl_path,lin) : lins) = convertArg ctype nr (reverse sel) lbl_path lin lins +convertTerm cnc_defs sel ctype (C nr) ((lbl_path,lin) : lins) = convertCon ctype nr (reverse sel) lbl_path lin lins +convertTerm cnc_defs sel ctype (R record) ((lbl_path,lin) : lins) = convertRec cnc_defs sel ctype record lbl_path lin lins +convertTerm cnc_defs sel ctype (P term p) lins = do nr <- evalTerm cnc_defs [] p + convertTerm cnc_defs (nr:sel) ctype term lins +convertTerm cnc_defs sel ctype (FV vars) lins = do term <- member vars + convertTerm cnc_defs sel ctype term lins +convertTerm cnc_defs sel ctype (S ts) ((lbl_path,lin) : lins) = foldM (\lins t -> convertTerm cnc_defs sel ctype t lins) ((lbl_path,lin) : lins) (reverse ts) +convertTerm cnc_defs sel ctype (K (KS str)) ((lbl_path,lin) : lins) = return ((lbl_path,FSymTok str : lin) : lins) +convertTerm cnc_defs sel ctype (K (KP strs vars))((lbl_path,lin) : lins) = + do toks <- member (strs:[strs' | Var strs' _ <- vars]) + return ((lbl_path, map FSymTok toks ++ lin) : lins) +convertTerm cnc_defs sel ctype (F id) lins = do term <- Map.lookup id cnc_defs + convertTerm cnc_defs sel ctype term lins +convertTerm cnc_defs sel ctype (W s t) ((lbl_path,lin) : lins) = do + ss <- case t of + R ss -> return ss + F f -> do + t <- Map.lookup f cnc_defs + case t of + R ss -> return ss + convertRec cnc_defs sel ctype [K (KS (s ++ s1)) | K (KS s1) <- ss] lbl_path lin lins +convertTerm cnc_defs sel ctype x lins = error ("convertTerm ("++show x++")") + + +convertArg (R record) nr path lbl_path lin lins = + foldM (\lins (lbl, ctype) -> convertArg ctype nr (lbl:path) (lbl:lbl_path) lin lins) lins (zip [0..] record) +convertArg (C max) nr path lbl_path lin lins = do + index <- member [0..max] + restrictHead lbl_path index + restrictArg nr path index + return lins +convertArg (S _) nr path lbl_path lin lins = do + (_, args) <- readState + let PFCat cat rcs tcs _ = args !! nr + return ((lbl_path, FSymCat (index path rcs 0) nr : lin) : lins) + where + index lbl' (lbl:lbls) idx + | lbl' == lbl = idx + | otherwise = index lbl' lbls $! (idx+1) + + +convertCon (C max) index [] lbl_path lin lins = do + guard (index <= max) + restrictHead lbl_path index + return lins +convertCon x _ _ _ _ _ = error $ "SimpleToFCFG,convertCon: " ++ show x + +convertRec cnc_defs [] (R ctypes) record lbl_path lin lins = + foldM (\lins (index,ctype,val) -> convertTerm cnc_defs [] ctype val ((index:lbl_path,lin) : lins)) + lins + (zip3 [0..] ctypes record) +convertRec cnc_defs (index:sub_sel) ctype record lbl_path lin lins = do + convertTerm cnc_defs sub_sel ctype (record !! index) ((lbl_path,lin) : lins) + + +------------------------------------------------------------ +-- eval a term to ground terms + +evalTerm :: TermMap -> FPath -> Term -> CnvMonad FIndex +evalTerm cnc_defs path (V nr) = do (_, args) <- readState + let PFCat _ _ _ ctype = args !! nr + unifyPType nr (reverse path) (selectTerm path ctype) +evalTerm cnc_defs path (C nr) = return nr +evalTerm cnc_defs path (R record) = case path of + (index:path) -> evalTerm cnc_defs path (record !! index) +evalTerm cnc_defs path (P term sel) = do index <- evalTerm cnc_defs [] sel + evalTerm cnc_defs (index:path) term +evalTerm cnc_defs path (FV terms) = member terms >>= evalTerm cnc_defs path +evalTerm cnc_defs path (F id) = do term <- Map.lookup id cnc_defs + evalTerm cnc_defs path term +evalTerm cnc_defs path x = error ("evalTerm ("++show x++")") + +unifyPType :: FIndex -> FPath -> Term -> CnvMonad FIndex +unifyPType nr path (C max_index) = + do (_, args) <- readState + let PFCat _ _ tcs _ = args !! nr + case lookup path tcs of + Just index -> return index + Nothing -> do index <- member [0..max_index] + restrictArg nr path index + return index +unifyPType nr path t = error $ "unifyPType " ++ show t ---- AR 2/10/2007 + +selectTerm :: FPath -> Term -> Term +selectTerm [] term = term +selectTerm (index:path) (R record) = selectTerm path (record !! index) + + +---------------------------------------------------------------------- +-- FRulesEnv + +data FRulesEnv = FRulesEnv {-# UNPACK #-} !Int FCatSet [FRule] +type FCatSet = Map.Map CId (Map.Map [(FPath,FIndex)] FCat) + +emptyFRulesEnv = FRulesEnv 0 (ins fcatString (mkCId "String") [] $ + ins fcatInt (mkCId "Int") [] $ + ins fcatFloat (mkCId "Float") [] $ + ins fcatVar (mkCId "_Var") [] $ + Map.empty) [] + where + ins fcat cat tcs fcatSet = + Map.insertWith (\_ -> Map.insert tcs fcat) cat tmap_s fcatSet + where + tmap_s = Map.singleton tcs fcat + +addFRule :: FRulesEnv -> FRule -> FRulesEnv +addFRule (FRulesEnv last_id fcatSet rules) rule = FRulesEnv last_id fcatSet (rule:rules) + +getFGrammar :: FRulesEnv -> FGrammar +getFGrammar (FRulesEnv last_id fcatSet rules) = (rules, Map.map Map.elems fcatSet) + +genFCatHead :: FRulesEnv -> ProtoFCat -> (FRulesEnv, FCat) +genFCatHead env@(FRulesEnv last_id fcatSet rules) (PFCat cat rcs tcs _) = + case Map.lookup cat fcatSet >>= Map.lookup tcs of + Just fcat -> (env, fcat) + Nothing -> let fcat = last_id+1 + in (FRulesEnv fcat (ins fcat) rules, fcat) + where + ins fcat = Map.insertWith (\_ -> Map.insert tcs fcat) cat tmap_s fcatSet + where + tmap_s = Map.singleton tcs fcat + +genFCatArg :: TermMap -> FRulesEnv -> ProtoFCat -> (FRulesEnv, FCat) +genFCatArg cnc_defs env@(FRulesEnv last_id fcatSet rules) (PFCat cat rcs tcs ctype) = + case Map.lookup cat fcatSet of + Just tmap -> case Map.lookup tcs tmap of + Just fcat -> (env, fcat) + Nothing -> ins tmap + Nothing -> ins Map.empty + where + ins tmap = + let fcat = last_id+1 + (last_id1,tmap1,rules1) + = foldBM (\tcs st (last_id,tmap,rules) -> + let (last_id1,tmap1,fcat_arg) = addArg tcs last_id tmap + rule = FRule wildCId [[0]] [fcat_arg] fcat + (listArray (0,length rcs-1) [listArray (0,0) [FSymCat lbl 0] | lbl <- [0..length rcs-1]]) + in if st + then (last_id1,tmap1,rule:rules) + else (last_id, tmap, rules)) + (fcat,Map.insert tcs fcat tmap,rules) + (gen_tcs ctype [] []) + False + in (FRulesEnv last_id1 (Map.insert cat tmap1 fcatSet) rules1, fcat) + where + addArg tcs last_id tmap = + case Map.lookup tcs tmap of + Just fcat -> (last_id, tmap, fcat) + Nothing -> let fcat = last_id+1 + in (fcat, Map.insert tcs fcat tmap, fcat) + + gen_tcs :: Term -> FPath -> [(FPath,FIndex)] -> BacktrackM Bool [(FPath,FIndex)] + gen_tcs (R record) path acc = foldM (\acc (label,ctype) -> gen_tcs ctype (label:path) acc) acc (zip [0..] record) + gen_tcs (S _) path acc = return acc + gen_tcs (C max_index) path acc = + case List.lookup path tcs of + Just index -> return $! addConstraint path index acc + Nothing -> do writeState True + index <- member [0..max_index] + return $! addConstraint path index acc + where + addConstraint path0 index0 (c@(path,index) : cs) + | path0 > path = c:addConstraint path0 index0 cs + addConstraint path0 index0 cs = (path0,index0) : cs + gen_tcs (F id) path acc = case Map.lookup id cnc_defs of + Just term -> gen_tcs term path acc + Nothing -> error ("unknown identifier: "++prCId id) + + +getRCS :: TermMap -> Term -> [FPath] +getRCS cnc_defs = loop [] [] + where + loop path rcs (R record) = List.foldl' (\rcs (index,term) -> loop (index:path) rcs term) rcs (zip [0..] record) + loop path rcs (C i) = rcs + loop path rcs (S _) = path:rcs + loop path rcs (F id) = case Map.lookup id cnc_defs of + Just term -> loop path rcs term + Nothing -> error ("unknown identifier: "++show id) + +------------------------------------------------------------ +-- updating the MCF rule + +restrictArg :: FIndex -> FPath -> FIndex -> CnvMonad () +restrictArg nr path index = do + (head, args) <- readState + args' <- updateNthM (restrictProtoFCat path index) nr args + writeState (head, args') + +restrictHead :: FPath -> FIndex -> CnvMonad () +restrictHead path term + = do (head, args) <- readState + head' <- restrictProtoFCat path term head + writeState (head', args) + +restrictProtoFCat :: FPath -> FIndex -> ProtoFCat -> CnvMonad ProtoFCat +restrictProtoFCat path0 index0 (PFCat cat rcs tcs ctype) = do + tcs <- addConstraint tcs + return (PFCat cat rcs tcs ctype) + where + addConstraint (c@(path,index) : cs) + | path0 > path = liftM (c:) (addConstraint cs) + | path0 == path = guard (index0 == index) >> + return (c : cs) + addConstraint cs = return ((path0,index0) : cs) diff --git a/src-3.0/GF/Compile/GrammarToGFCC.hs b/src-3.0/GF/Compile/GrammarToGFCC.hs index 61502663d..a27b4e761 100644 --- a/src-3.0/GF/Compile/GrammarToGFCC.hs +++ b/src-3.0/GF/Compile/GrammarToGFCC.hs @@ -3,7 +3,8 @@ module GF.Compile.GrammarToGFCC (prGrammar2gfcc,mkCanon2gfcc,addParsers) where import GF.Compile.Export import GF.Compile.OptimizeGF (unshareModule) -import GF.Compile.GenerateFCFG (convertConcrete) +import qualified GF.Compile.GenerateFCFG as FCFG +import qualified GF.Compile.GeneratePMCFG as PMCFG import PGF.CId import PGF.BuildParser (buildParserInfo) @@ -54,7 +55,12 @@ mkCanon2gfcc opts cnc gr = addParsers :: D.PGF -> D.PGF addParsers pgf = pgf { D.concretes = Map.map conv (D.concretes pgf) } where - conv cnc = cnc { D.parser = Just (buildParserInfo (convertConcrete (D.abstract pgf) cnc)) } + conv cnc = cnc { D.parser = Just (buildParserInfo fcfg) } + where + fcfg + | Map.lookup (mkCId "erasing") (D.cflags cnc) == Just "on" = PMCFG.convertConcrete (D.abstract pgf) cnc + | otherwise = FCFG.convertConcrete (D.abstract pgf) cnc + -- Generate PGF from GFCM. -- this assumes a grammar translated by canon2canon diff --git a/src-3.0/GF/Infra/Option.hs b/src-3.0/GF/Infra/Option.hs index 5be0d4f7c..ed5c24c81 100644 --- a/src-3.0/GF/Infra/Option.hs +++ b/src-3.0/GF/Infra/Option.hs @@ -122,6 +122,7 @@ data ModuleFlags = ModuleFlags { optSpeechLanguage :: Maybe String, optLexer :: Maybe String, optUnlexer :: Maybe String, + optErasing :: Bool, optBuildParser :: Bool, optWarnings :: [Warning], optDump :: [Dump] @@ -176,6 +177,7 @@ moduleOptionsGFO :: ModuleOptions -> [(String,String)] moduleOptionsGFO (ModuleOptions o) = maybe [] (\x -> [("language",x)]) (optSpeechLanguage mfs) ++ maybe [] (\x -> [("startcat",x)]) (optStartCat mfs) + ++ (if optErasing mfs then [("erasing","on")] else []) where mfs = o defaultModuleFlags @@ -262,6 +264,7 @@ defaultModuleFlags = ModuleFlags { optSpeechLanguage = Nothing, optLexer = Nothing, optUnlexer = Nothing, + optErasing = False, optBuildParser = True, optWarnings = [], optDump = [] @@ -311,6 +314,7 @@ moduleOptDescr = Option [] ["coding"] (ReqArg coding "ENCODING") ("Character encoding of the source grammar, ENCODING = " ++ concat (intersperse " | " (map fst encodings)) ++ "."), + Option [] ["erasing"] (onOff erasing False) "Generate erasing grammar (default off).", Option [] ["parser"] (onOff parser True) "Build parser (default on).", Option [] ["startcat"] (ReqArg startcat "CAT") "Grammar start category.", Option [] ["language"] (ReqArg language "LANG") "Set the speech language flag to LANG in the generated grammar.", @@ -339,6 +343,7 @@ moduleOptDescr = coding x = case lookup x encodings of Just c -> set $ \o -> o { optEncoding = c } Nothing -> fail $ "Unknown character encoding: " ++ x + erasing x = set $ \o -> o { optErasing = x } parser x = set $ \o -> o { optBuildParser = x } startcat x = set $ \o -> o { optStartCat = Just x } language x = set $ \o -> o { optSpeechLanguage = Just x } diff --git a/src-3.0/PGF.hs b/src-3.0/PGF.hs index f60ba852e..aa2fa2edf 100644 --- a/src-3.0/PGF.hs +++ b/src-3.0/PGF.hs @@ -172,11 +172,15 @@ readPGF f = do linearize pgf lang = PGF.Linearize.linearize pgf (mkCId lang) parse pgf lang cat s = - case lookParser pgf (mkCId lang) of - Nothing -> error ("Unknown language: " ++ lang) - Just pinfo -> case parseFCFG "bottomup" pinfo (mkCId cat) (words s) of - Ok x -> x - Bad s -> error s + case Map.lookup (mkCId lang) (concretes pgf) of + Just cnc -> case parser cnc of + Just pinfo -> if Map.lookup (mkCId "erasing") (cflags cnc) == Just "on" + then Incremental.parse pinfo (mkCId cat) (words s) + else case parseFCFG "bottomup" pinfo (mkCId cat) (words s) of + Ok x -> x + Bad s -> error s + Nothing -> error ("No parser built fo language: " ++ lang) + Nothing -> error ("Unknown language: " ++ lang) linearizeAll mgr = map snd . linearizeAllLang mgr linearizeAllLang mgr t =