now we compile context-free grammars directly to PGF without going via GF source code. This makes it quick and lightweight to compile big grammars such as the Berkley grammar

This commit is contained in:
kr.angelov
2014-05-24 07:47:06 +00:00
parent dbb4fdd0f7
commit 67f64cb233
3 changed files with 106 additions and 51 deletions

View File

@@ -2,11 +2,12 @@ module GF.Command.Importing (importGrammar, importSource) where
import PGF import PGF
import PGF.Data import PGF.Data
import PGF.Optimize
import GF.Compile import GF.Compile
import GF.Compile.Multi (readMulti) import GF.Compile.Multi (readMulti)
import GF.Compile.GetGrammar (getCFRules, getEBNFRules) import GF.Compile.GetGrammar (getCFRules, getEBNFRules)
import GF.Grammar (identS, SourceGrammar) -- for cc command import GF.Grammar (SourceGrammar) -- for cc command
import GF.Grammar.CFG import GF.Grammar.CFG
import GF.Grammar.EBNF import GF.Grammar.EBNF
import GF.Compile.CFGtoPGF import GF.Compile.CFGtoPGF
@@ -65,6 +66,7 @@ importCF opts files get convert = do
startCat <- case rules of startCat <- case rules of
(CFRule cat _ _ : _) -> return cat (CFRule cat _ _ : _) -> return cat
_ -> fail "empty CFG" _ -> fail "empty CFG"
let gf = cf2gf (last files) (uniqueFuns (mkCFG startCat Set.empty rules)) let pgf = cf2pgf (last files) (uniqueFuns (mkCFG startCat Set.empty rules))
gr <- compileSourceGrammar opts gf probs <- liftIO (maybe (return . defaultProbabilities) readProbabilitiesFromFile (flag optProbsFile opts) pgf)
link opts (identS (justModuleName (last files) ++ "Abs"), (), gr) return $ setProbabilities probs
$ if flag optOptimizePGF opts then optimizePGF pgf else pgf

View File

@@ -1,58 +1,111 @@
module GF.Compile.CFGtoPGF (cf2gf) where module GF.Compile.CFGtoPGF (cf2pgf) where
import GF.Grammar.Grammar hiding (Cat)
import GF.Grammar.Macros
import GF.Grammar.CFG import GF.Grammar.CFG
import GF.Infra.Ident(Ident,identS)
import GF.Infra.Option
import GF.Infra.UseIO import GF.Infra.UseIO
import GF.Data.Operations import PGF
import PGF.Data
import PGF(showCId) import PGF.Macros
import PGF.Optimize
import qualified Data.Set as Set import qualified Data.Set as Set
import qualified Data.Map as Map import qualified Data.Map as Map
import qualified Data.IntMap as IntMap
import qualified Data.ByteString as BS
import Data.Array.IArray
import Data.List
-------------------------- --------------------------
-- the compiler ---------- -- the compiler ----------
-------------------------- --------------------------
cf2gf :: FilePath -> CFG -> SourceGrammar cf2pgf :: FilePath -> CFG -> PGF
cf2gf fpath cf = mGrammar [ cf2pgf fpath cf =
(aname, ModInfo MTAbstract MSComplete (modifyFlags (\fs -> fs{optStartCat = Just cat})) [] Nothing [] [] fpath Nothing abs), let pgf = PGF Map.empty aname (cf2abstr cf) (Map.singleton cname (cf2concr cf))
(cname, ModInfo (MTConcrete aname) MSComplete noOptions [] Nothing [] [] fpath Nothing cnc) in updateProductionIndices pgf
]
where where
name = justModuleName fpath name = justModuleName fpath
(abs,cnc,cat) = cf2grammar cf aname = mkCId (name ++ "Abs")
aname = identS $ name ++ "Abs" cname = mkCId name
cname = identS name
cf2abstr :: CFG -> Abstr
cf2abstr cfg = Abstr aflags afuns acats BS.empty
where
aflags = Map.singleton (mkCId "startcat") (LStr (cfgStartCat cfg))
acats = Map.fromList [(mkCId cat, ([], [(0,mkRuleName rule)
| rule <- Set.toList rules], 0, 0))
| (cat,rules) <- Map.toList (cfgRules cfg)]
afuns = Map.fromList [(mkRuleName rule, (cftype [mkCId c | NonTerminal c <- ruleRhs rule] (mkCId cat), 0, Nothing, 0, 0))
| (cat,rules) <- Map.toList (cfgRules cfg)
, rule <- Set.toList rules]
cf2grammar :: CFG -> (BinTree Ident Info, BinTree Ident Info, String) cf2concr :: CFG -> Concr
cf2grammar cfg = (buildTree abs, buildTree conc, cfgStartCat cfg) where cf2concr cfg = Concr Map.empty Map.empty
abs = cats ++ funs cncfuns lindefsrefs lindefsrefs
conc = lincats ++ lins sequences productions
cats = [(identS cat, AbsCat (Just (L NoLoc []))) | cat <- Map.keys (cfgRules cfg)] IntMap.empty Map.empty
lincats = [(cat, CncCat (Just (L loc defLinType)) Nothing Nothing Nothing Nothing) | (cat,AbsCat (Just (L loc _))) <- cats] cnccats
(funs,lins) = unzip (map cf2rule (concatMap Set.toList (Map.elems (cfgRules cfg)))) IntMap.empty
totalCats
where
sequences0 = Set.fromList (listArray (0,0) [SymCat 0 0] :
[mkSequence rule | rules <- Map.elems (cfgRules cfg), rule <- Set.toList rules])
sequences = listArray (0,Set.size sequences0-1) (Set.toList sequences0)
cf2rule :: CFRule -> ((Ident,Info),(Ident,Info)) idFun = CncFun wildCId (listArray (0,0) [seqid])
cf2rule (CFRule cat items (CFObj fun _)) = (def,ldef) where where
f = identS (showCId fun) seq = listArray (0,0) [SymCat 0 0]
def = (f, AbsFun (Just (L NoLoc (mkProd args' (Cn (identS cat)) []))) Nothing Nothing (Just True)) seqid = binSearch seq sequences (bounds sequences)
args0 = zip (map (identS . ("x" ++) . show) [0..]) items ((fun_cnt,cncfuns0),productions0) = mapAccumL convertRules (1,[idFun]) (Map.toList (cfgRules cfg))
args = [((Explicit,v), Cn (identS c)) | (v, NonTerminal c) <- args0] productions = IntMap.fromList productions0
args' = [(Explicit,identS "_", Cn (identS c)) | (_, NonTerminal c) <- args0] cncfuns = listArray (0,fun_cnt-1) (reverse cncfuns0)
ldef = (f, CncFun
Nothing lbls = listArray (0,0) ["s"]
(Just (L NoLoc (mkAbs (map fst args) (totalCats,cnccats0) = mapAccumL mkCncCat 0 (Map.toList (cfgRules cfg))
(mkRecord (const theLinLabel) [foldconcat (map mkIt args0)])))) cnccats = Map.fromList cnccats0
Nothing
Nothing) lindefsrefs =
mkIt (v, NonTerminal _) = P (Vr v) theLinLabel IntMap.fromList (map mkLinDefRef (Map.keys (cfgRules cfg)))
mkIt (_, Terminal a) = K a
foldconcat [] = K "" convertRules st (cat,rules) =
foldconcat tt = foldr1 C tt let (st',prods) = mapAccumL convertRule st (Set.toList rules)
in (st',(cat2fid cat,Set.fromList prods))
convertRule (funid,funs) rule =
let args = [PArg [] (cat2fid c) | NonTerminal c <- ruleRhs rule]
prod = PApply funid args
seqid = binSearch (mkSequence rule) sequences (bounds sequences)
fun = CncFun (mkRuleName rule) (listArray (0,0) [seqid])
funid' = funid+1
in funid' `seq` ((funid',fun:funs),prod)
mkSequence rule = listArray (0,length syms-1) syms
where
syms = snd $ mapAccumL convertSymbol 0 (ruleRhs rule)
convertSymbol d (NonTerminal _) = (d+1,SymCat d 0)
convertSymbol d (Terminal t) = (d, SymKS t)
mkCncCat fid (cat,_) = (fid+1, (mkCId cat,CncCat fid fid lbls))
mkLinDefRef cat =
(cat2fid cat,[0])
binSearch v arr (i,j)
| i <= j = case compare v (arr ! k) of
LT -> binSearch v arr (i,k-1)
EQ -> k
GT -> binSearch v arr (k+1,j)
| otherwise = error "binSearch"
where
k = (i+j) `div` 2
cat2fid cat =
case Map.lookup (mkCId cat) cnccats of
Just (CncCat fid _ _) -> fid
_ -> error "cat2fid"
mkRuleName rule =
case ruleName rule of
CFObj n _ -> n
_ -> wildCId

View File

@@ -12,7 +12,7 @@ import GF.Compile.CFGtoPGF
import GF.Compile.GetGrammar import GF.Compile.GetGrammar
import GF.Grammar.CFG import GF.Grammar.CFG
import GF.Infra.Ident(identS,showIdent) import GF.Infra.Ident(showIdent)
import GF.Infra.UseIO import GF.Infra.UseIO
import GF.Infra.Option import GF.Infra.Option
import GF.Data.ErrM import GF.Data.ErrM
@@ -68,13 +68,13 @@ compileCFFiles opts fs = do
startCat <- case rules of startCat <- case rules of
(CFRule cat _ _ : _) -> return cat (CFRule cat _ _ : _) -> return cat
_ -> fail "empty CFG" _ -> fail "empty CFG"
let gf = cf2gf (last fs) (uniqueFuns (mkCFG startCat Set.empty rules)) let pgf = cf2pgf (last fs) (uniqueFuns (mkCFG startCat Set.empty rules))
gr <- compileSourceGrammar opts gf
let cnc = justModuleName (last fs) let cnc = justModuleName (last fs)
unless (flag optStopAfterPhase opts == Compile) $ unless (flag optStopAfterPhase opts == Compile) $
do pgf <- link opts (identS cnc, (), gr) do probs <- liftIO (maybe (return . defaultProbabilities) readProbabilitiesFromFile (flag optProbsFile opts) pgf)
writePGF opts pgf let pgf' = setProbabilities probs $ if flag optOptimizePGF opts then optimizePGF pgf else pgf
writeOutputs opts pgf writePGF opts pgf'
writeOutputs opts pgf'
unionPGFFiles :: Options -> [FilePath] -> IOE () unionPGFFiles :: Options -> [FilePath] -> IOE ()
unionPGFFiles opts fs = unionPGFFiles opts fs =