1
0
forked from GitHub/gf-core

now we compile context-free grammars directly to PGF without going via GF source code. This makes it quick and lightweight to compile big grammars such as the Berkley grammar

This commit is contained in:
kr.angelov
2014-05-24 07:47:06 +00:00
parent dbb4fdd0f7
commit 67f64cb233
3 changed files with 106 additions and 51 deletions

View File

@@ -2,11 +2,12 @@ module GF.Command.Importing (importGrammar, importSource) where
import PGF
import PGF.Data
import PGF.Optimize
import GF.Compile
import GF.Compile.Multi (readMulti)
import GF.Compile.GetGrammar (getCFRules, getEBNFRules)
import GF.Grammar (identS, SourceGrammar) -- for cc command
import GF.Grammar (SourceGrammar) -- for cc command
import GF.Grammar.CFG
import GF.Grammar.EBNF
import GF.Compile.CFGtoPGF
@@ -65,6 +66,7 @@ importCF opts files get convert = do
startCat <- case rules of
(CFRule cat _ _ : _) -> return cat
_ -> fail "empty CFG"
let gf = cf2gf (last files) (uniqueFuns (mkCFG startCat Set.empty rules))
gr <- compileSourceGrammar opts gf
link opts (identS (justModuleName (last files) ++ "Abs"), (), gr)
let pgf = cf2pgf (last files) (uniqueFuns (mkCFG startCat Set.empty rules))
probs <- liftIO (maybe (return . defaultProbabilities) readProbabilitiesFromFile (flag optProbsFile opts) pgf)
return $ setProbabilities probs
$ if flag optOptimizePGF opts then optimizePGF pgf else pgf

View File

@@ -1,58 +1,111 @@
module GF.Compile.CFGtoPGF (cf2gf) where
module GF.Compile.CFGtoPGF (cf2pgf) where
import GF.Grammar.Grammar hiding (Cat)
import GF.Grammar.Macros
import GF.Grammar.CFG
import GF.Infra.Ident(Ident,identS)
import GF.Infra.Option
import GF.Infra.UseIO
import GF.Data.Operations
import PGF(showCId)
import PGF
import PGF.Data
import PGF.Macros
import PGF.Optimize
import qualified Data.Set as Set
import qualified Data.Map as Map
import qualified Data.IntMap as IntMap
import qualified Data.ByteString as BS
import Data.Array.IArray
import Data.List
--------------------------
-- the compiler ----------
--------------------------
cf2gf :: FilePath -> CFG -> SourceGrammar
cf2gf fpath cf = mGrammar [
(aname, ModInfo MTAbstract MSComplete (modifyFlags (\fs -> fs{optStartCat = Just cat})) [] Nothing [] [] fpath Nothing abs),
(cname, ModInfo (MTConcrete aname) MSComplete noOptions [] Nothing [] [] fpath Nothing cnc)
]
cf2pgf :: FilePath -> CFG -> PGF
cf2pgf fpath cf =
let pgf = PGF Map.empty aname (cf2abstr cf) (Map.singleton cname (cf2concr cf))
in updateProductionIndices pgf
where
name = justModuleName fpath
(abs,cnc,cat) = cf2grammar cf
aname = identS $ name ++ "Abs"
cname = identS name
aname = mkCId (name ++ "Abs")
cname = mkCId name
cf2abstr :: CFG -> Abstr
cf2abstr cfg = Abstr aflags afuns acats BS.empty
where
aflags = Map.singleton (mkCId "startcat") (LStr (cfgStartCat cfg))
acats = Map.fromList [(mkCId cat, ([], [(0,mkRuleName rule)
| rule <- Set.toList rules], 0, 0))
| (cat,rules) <- Map.toList (cfgRules cfg)]
afuns = Map.fromList [(mkRuleName rule, (cftype [mkCId c | NonTerminal c <- ruleRhs rule] (mkCId cat), 0, Nothing, 0, 0))
| (cat,rules) <- Map.toList (cfgRules cfg)
, rule <- Set.toList rules]
cf2grammar :: CFG -> (BinTree Ident Info, BinTree Ident Info, String)
cf2grammar cfg = (buildTree abs, buildTree conc, cfgStartCat cfg) where
abs = cats ++ funs
conc = lincats ++ lins
cats = [(identS cat, AbsCat (Just (L NoLoc []))) | cat <- Map.keys (cfgRules cfg)]
lincats = [(cat, CncCat (Just (L loc defLinType)) Nothing Nothing Nothing Nothing) | (cat,AbsCat (Just (L loc _))) <- cats]
(funs,lins) = unzip (map cf2rule (concatMap Set.toList (Map.elems (cfgRules cfg))))
cf2concr :: CFG -> Concr
cf2concr cfg = Concr Map.empty Map.empty
cncfuns lindefsrefs lindefsrefs
sequences productions
IntMap.empty Map.empty
cnccats
IntMap.empty
totalCats
where
sequences0 = Set.fromList (listArray (0,0) [SymCat 0 0] :
[mkSequence rule | rules <- Map.elems (cfgRules cfg), rule <- Set.toList rules])
sequences = listArray (0,Set.size sequences0-1) (Set.toList sequences0)
cf2rule :: CFRule -> ((Ident,Info),(Ident,Info))
cf2rule (CFRule cat items (CFObj fun _)) = (def,ldef) where
f = identS (showCId fun)
def = (f, AbsFun (Just (L NoLoc (mkProd args' (Cn (identS cat)) []))) Nothing Nothing (Just True))
args0 = zip (map (identS . ("x" ++) . show) [0..]) items
args = [((Explicit,v), Cn (identS c)) | (v, NonTerminal c) <- args0]
args' = [(Explicit,identS "_", Cn (identS c)) | (_, NonTerminal c) <- args0]
ldef = (f, CncFun
Nothing
(Just (L NoLoc (mkAbs (map fst args)
(mkRecord (const theLinLabel) [foldconcat (map mkIt args0)]))))
Nothing
Nothing)
mkIt (v, NonTerminal _) = P (Vr v) theLinLabel
mkIt (_, Terminal a) = K a
foldconcat [] = K ""
foldconcat tt = foldr1 C tt
idFun = CncFun wildCId (listArray (0,0) [seqid])
where
seq = listArray (0,0) [SymCat 0 0]
seqid = binSearch seq sequences (bounds sequences)
((fun_cnt,cncfuns0),productions0) = mapAccumL convertRules (1,[idFun]) (Map.toList (cfgRules cfg))
productions = IntMap.fromList productions0
cncfuns = listArray (0,fun_cnt-1) (reverse cncfuns0)
lbls = listArray (0,0) ["s"]
(totalCats,cnccats0) = mapAccumL mkCncCat 0 (Map.toList (cfgRules cfg))
cnccats = Map.fromList cnccats0
lindefsrefs =
IntMap.fromList (map mkLinDefRef (Map.keys (cfgRules cfg)))
convertRules st (cat,rules) =
let (st',prods) = mapAccumL convertRule st (Set.toList rules)
in (st',(cat2fid cat,Set.fromList prods))
convertRule (funid,funs) rule =
let args = [PArg [] (cat2fid c) | NonTerminal c <- ruleRhs rule]
prod = PApply funid args
seqid = binSearch (mkSequence rule) sequences (bounds sequences)
fun = CncFun (mkRuleName rule) (listArray (0,0) [seqid])
funid' = funid+1
in funid' `seq` ((funid',fun:funs),prod)
mkSequence rule = listArray (0,length syms-1) syms
where
syms = snd $ mapAccumL convertSymbol 0 (ruleRhs rule)
convertSymbol d (NonTerminal _) = (d+1,SymCat d 0)
convertSymbol d (Terminal t) = (d, SymKS t)
mkCncCat fid (cat,_) = (fid+1, (mkCId cat,CncCat fid fid lbls))
mkLinDefRef cat =
(cat2fid cat,[0])
binSearch v arr (i,j)
| i <= j = case compare v (arr ! k) of
LT -> binSearch v arr (i,k-1)
EQ -> k
GT -> binSearch v arr (k+1,j)
| otherwise = error "binSearch"
where
k = (i+j) `div` 2
cat2fid cat =
case Map.lookup (mkCId cat) cnccats of
Just (CncCat fid _ _) -> fid
_ -> error "cat2fid"
mkRuleName rule =
case ruleName rule of
CFObj n _ -> n
_ -> wildCId

View File

@@ -12,7 +12,7 @@ import GF.Compile.CFGtoPGF
import GF.Compile.GetGrammar
import GF.Grammar.CFG
import GF.Infra.Ident(identS,showIdent)
import GF.Infra.Ident(showIdent)
import GF.Infra.UseIO
import GF.Infra.Option
import GF.Data.ErrM
@@ -68,13 +68,13 @@ compileCFFiles opts fs = do
startCat <- case rules of
(CFRule cat _ _ : _) -> return cat
_ -> fail "empty CFG"
let gf = cf2gf (last fs) (uniqueFuns (mkCFG startCat Set.empty rules))
gr <- compileSourceGrammar opts gf
let pgf = cf2pgf (last fs) (uniqueFuns (mkCFG startCat Set.empty rules))
let cnc = justModuleName (last fs)
unless (flag optStopAfterPhase opts == Compile) $
do pgf <- link opts (identS cnc, (), gr)
writePGF opts pgf
writeOutputs opts pgf
do probs <- liftIO (maybe (return . defaultProbabilities) readProbabilitiesFromFile (flag optProbsFile opts) pgf)
let pgf' = setProbabilities probs $ if flag optOptimizePGF opts then optimizePGF pgf else pgf
writePGF opts pgf'
writeOutputs opts pgf'
unionPGFFiles :: Options -> [FilePath] -> IOE ()
unionPGFFiles opts fs =