From 67f64cb233edb899b2b1ad550ae8c832960fb7a2 Mon Sep 17 00:00:00 2001 From: "kr.angelov" Date: Sat, 24 May 2014 07:47:06 +0000 Subject: [PATCH] now we compile context-free grammars directly to PGF without going via GF source code. This makes it quick and lightweight to compile big grammars such as the Berkley grammar --- src/compiler/GF/Command/Importing.hs | 10 +- src/compiler/GF/Compile/CFGtoPGF.hs | 135 +++++++++++++++++++-------- src/compiler/GFC.hs | 12 +-- 3 files changed, 106 insertions(+), 51 deletions(-) diff --git a/src/compiler/GF/Command/Importing.hs b/src/compiler/GF/Command/Importing.hs index 78c019bd4..3cf7674a0 100644 --- a/src/compiler/GF/Command/Importing.hs +++ b/src/compiler/GF/Command/Importing.hs @@ -2,11 +2,12 @@ module GF.Command.Importing (importGrammar, importSource) where import PGF import PGF.Data +import PGF.Optimize import GF.Compile import GF.Compile.Multi (readMulti) import GF.Compile.GetGrammar (getCFRules, getEBNFRules) -import GF.Grammar (identS, SourceGrammar) -- for cc command +import GF.Grammar (SourceGrammar) -- for cc command import GF.Grammar.CFG import GF.Grammar.EBNF import GF.Compile.CFGtoPGF @@ -65,6 +66,7 @@ importCF opts files get convert = do startCat <- case rules of (CFRule cat _ _ : _) -> return cat _ -> fail "empty CFG" - let gf = cf2gf (last files) (uniqueFuns (mkCFG startCat Set.empty rules)) - gr <- compileSourceGrammar opts gf - link opts (identS (justModuleName (last files) ++ "Abs"), (), gr) + let pgf = cf2pgf (last files) (uniqueFuns (mkCFG startCat Set.empty rules)) + probs <- liftIO (maybe (return . defaultProbabilities) readProbabilitiesFromFile (flag optProbsFile opts) pgf) + return $ setProbabilities probs + $ if flag optOptimizePGF opts then optimizePGF pgf else pgf diff --git a/src/compiler/GF/Compile/CFGtoPGF.hs b/src/compiler/GF/Compile/CFGtoPGF.hs index b42c0fbc4..e1eaf53b2 100644 --- a/src/compiler/GF/Compile/CFGtoPGF.hs +++ b/src/compiler/GF/Compile/CFGtoPGF.hs @@ -1,58 +1,111 @@ -module GF.Compile.CFGtoPGF (cf2gf) where +module GF.Compile.CFGtoPGF (cf2pgf) where -import GF.Grammar.Grammar hiding (Cat) -import GF.Grammar.Macros import GF.Grammar.CFG -import GF.Infra.Ident(Ident,identS) -import GF.Infra.Option import GF.Infra.UseIO -import GF.Data.Operations - -import PGF(showCId) +import PGF +import PGF.Data +import PGF.Macros +import PGF.Optimize import qualified Data.Set as Set import qualified Data.Map as Map - +import qualified Data.IntMap as IntMap +import qualified Data.ByteString as BS +import Data.Array.IArray +import Data.List -------------------------- -- the compiler ---------- -------------------------- -cf2gf :: FilePath -> CFG -> SourceGrammar -cf2gf fpath cf = mGrammar [ - (aname, ModInfo MTAbstract MSComplete (modifyFlags (\fs -> fs{optStartCat = Just cat})) [] Nothing [] [] fpath Nothing abs), - (cname, ModInfo (MTConcrete aname) MSComplete noOptions [] Nothing [] [] fpath Nothing cnc) - ] +cf2pgf :: FilePath -> CFG -> PGF +cf2pgf fpath cf = + let pgf = PGF Map.empty aname (cf2abstr cf) (Map.singleton cname (cf2concr cf)) + in updateProductionIndices pgf where name = justModuleName fpath - (abs,cnc,cat) = cf2grammar cf - aname = identS $ name ++ "Abs" - cname = identS name + aname = mkCId (name ++ "Abs") + cname = mkCId name +cf2abstr :: CFG -> Abstr +cf2abstr cfg = Abstr aflags afuns acats BS.empty + where + aflags = Map.singleton (mkCId "startcat") (LStr (cfgStartCat cfg)) + acats = Map.fromList [(mkCId cat, ([], [(0,mkRuleName rule) + | rule <- Set.toList rules], 0, 0)) + | (cat,rules) <- Map.toList (cfgRules cfg)] + afuns = Map.fromList [(mkRuleName rule, (cftype [mkCId c | NonTerminal c <- ruleRhs rule] (mkCId cat), 0, Nothing, 0, 0)) + | (cat,rules) <- Map.toList (cfgRules cfg) + , rule <- Set.toList rules] -cf2grammar :: CFG -> (BinTree Ident Info, BinTree Ident Info, String) -cf2grammar cfg = (buildTree abs, buildTree conc, cfgStartCat cfg) where - abs = cats ++ funs - conc = lincats ++ lins - cats = [(identS cat, AbsCat (Just (L NoLoc []))) | cat <- Map.keys (cfgRules cfg)] - lincats = [(cat, CncCat (Just (L loc defLinType)) Nothing Nothing Nothing Nothing) | (cat,AbsCat (Just (L loc _))) <- cats] - (funs,lins) = unzip (map cf2rule (concatMap Set.toList (Map.elems (cfgRules cfg)))) +cf2concr :: CFG -> Concr +cf2concr cfg = Concr Map.empty Map.empty + cncfuns lindefsrefs lindefsrefs + sequences productions + IntMap.empty Map.empty + cnccats + IntMap.empty + totalCats + where + sequences0 = Set.fromList (listArray (0,0) [SymCat 0 0] : + [mkSequence rule | rules <- Map.elems (cfgRules cfg), rule <- Set.toList rules]) + sequences = listArray (0,Set.size sequences0-1) (Set.toList sequences0) -cf2rule :: CFRule -> ((Ident,Info),(Ident,Info)) -cf2rule (CFRule cat items (CFObj fun _)) = (def,ldef) where - f = identS (showCId fun) - def = (f, AbsFun (Just (L NoLoc (mkProd args' (Cn (identS cat)) []))) Nothing Nothing (Just True)) - args0 = zip (map (identS . ("x" ++) . show) [0..]) items - args = [((Explicit,v), Cn (identS c)) | (v, NonTerminal c) <- args0] - args' = [(Explicit,identS "_", Cn (identS c)) | (_, NonTerminal c) <- args0] - ldef = (f, CncFun - Nothing - (Just (L NoLoc (mkAbs (map fst args) - (mkRecord (const theLinLabel) [foldconcat (map mkIt args0)])))) - Nothing - Nothing) - mkIt (v, NonTerminal _) = P (Vr v) theLinLabel - mkIt (_, Terminal a) = K a - foldconcat [] = K "" - foldconcat tt = foldr1 C tt + idFun = CncFun wildCId (listArray (0,0) [seqid]) + where + seq = listArray (0,0) [SymCat 0 0] + seqid = binSearch seq sequences (bounds sequences) + ((fun_cnt,cncfuns0),productions0) = mapAccumL convertRules (1,[idFun]) (Map.toList (cfgRules cfg)) + productions = IntMap.fromList productions0 + cncfuns = listArray (0,fun_cnt-1) (reverse cncfuns0) + + lbls = listArray (0,0) ["s"] + (totalCats,cnccats0) = mapAccumL mkCncCat 0 (Map.toList (cfgRules cfg)) + cnccats = Map.fromList cnccats0 + + lindefsrefs = + IntMap.fromList (map mkLinDefRef (Map.keys (cfgRules cfg))) + + convertRules st (cat,rules) = + let (st',prods) = mapAccumL convertRule st (Set.toList rules) + in (st',(cat2fid cat,Set.fromList prods)) + + convertRule (funid,funs) rule = + let args = [PArg [] (cat2fid c) | NonTerminal c <- ruleRhs rule] + prod = PApply funid args + seqid = binSearch (mkSequence rule) sequences (bounds sequences) + fun = CncFun (mkRuleName rule) (listArray (0,0) [seqid]) + funid' = funid+1 + in funid' `seq` ((funid',fun:funs),prod) + + mkSequence rule = listArray (0,length syms-1) syms + where + syms = snd $ mapAccumL convertSymbol 0 (ruleRhs rule) + + convertSymbol d (NonTerminal _) = (d+1,SymCat d 0) + convertSymbol d (Terminal t) = (d, SymKS t) + + mkCncCat fid (cat,_) = (fid+1, (mkCId cat,CncCat fid fid lbls)) + + mkLinDefRef cat = + (cat2fid cat,[0]) + + binSearch v arr (i,j) + | i <= j = case compare v (arr ! k) of + LT -> binSearch v arr (i,k-1) + EQ -> k + GT -> binSearch v arr (k+1,j) + | otherwise = error "binSearch" + where + k = (i+j) `div` 2 + + cat2fid cat = + case Map.lookup (mkCId cat) cnccats of + Just (CncCat fid _ _) -> fid + _ -> error "cat2fid" + +mkRuleName rule = + case ruleName rule of + CFObj n _ -> n + _ -> wildCId diff --git a/src/compiler/GFC.hs b/src/compiler/GFC.hs index 8d548e449..b9ad7051a 100644 --- a/src/compiler/GFC.hs +++ b/src/compiler/GFC.hs @@ -12,7 +12,7 @@ import GF.Compile.CFGtoPGF import GF.Compile.GetGrammar import GF.Grammar.CFG -import GF.Infra.Ident(identS,showIdent) +import GF.Infra.Ident(showIdent) import GF.Infra.UseIO import GF.Infra.Option import GF.Data.ErrM @@ -68,13 +68,13 @@ compileCFFiles opts fs = do startCat <- case rules of (CFRule cat _ _ : _) -> return cat _ -> fail "empty CFG" - let gf = cf2gf (last fs) (uniqueFuns (mkCFG startCat Set.empty rules)) - gr <- compileSourceGrammar opts gf + let pgf = cf2pgf (last fs) (uniqueFuns (mkCFG startCat Set.empty rules)) let cnc = justModuleName (last fs) unless (flag optStopAfterPhase opts == Compile) $ - do pgf <- link opts (identS cnc, (), gr) - writePGF opts pgf - writeOutputs opts pgf + do probs <- liftIO (maybe (return . defaultProbabilities) readProbabilitiesFromFile (flag optProbsFile opts) pgf) + let pgf' = setProbabilities probs $ if flag optOptimizePGF opts then optimizePGF pgf else pgf + writePGF opts pgf' + writeOutputs opts pgf' unionPGFFiles :: Options -> [FilePath] -> IOE () unionPGFFiles opts fs =