now we compile context-free grammars directly to PGF without going via GF source code. This makes it quick and lightweight to compile big grammars such as the Berkley grammar

This commit is contained in:
kr.angelov
2014-05-24 07:47:06 +00:00
parent dbb4fdd0f7
commit 67f64cb233
3 changed files with 106 additions and 51 deletions

View File

@@ -1,58 +1,111 @@
module GF.Compile.CFGtoPGF (cf2gf) where
module GF.Compile.CFGtoPGF (cf2pgf) where
import GF.Grammar.Grammar hiding (Cat)
import GF.Grammar.Macros
import GF.Grammar.CFG
import GF.Infra.Ident(Ident,identS)
import GF.Infra.Option
import GF.Infra.UseIO
import GF.Data.Operations
import PGF(showCId)
import PGF
import PGF.Data
import PGF.Macros
import PGF.Optimize
import qualified Data.Set as Set
import qualified Data.Map as Map
import qualified Data.IntMap as IntMap
import qualified Data.ByteString as BS
import Data.Array.IArray
import Data.List
--------------------------
-- the compiler ----------
--------------------------
cf2gf :: FilePath -> CFG -> SourceGrammar
cf2gf fpath cf = mGrammar [
(aname, ModInfo MTAbstract MSComplete (modifyFlags (\fs -> fs{optStartCat = Just cat})) [] Nothing [] [] fpath Nothing abs),
(cname, ModInfo (MTConcrete aname) MSComplete noOptions [] Nothing [] [] fpath Nothing cnc)
]
cf2pgf :: FilePath -> CFG -> PGF
cf2pgf fpath cf =
let pgf = PGF Map.empty aname (cf2abstr cf) (Map.singleton cname (cf2concr cf))
in updateProductionIndices pgf
where
name = justModuleName fpath
(abs,cnc,cat) = cf2grammar cf
aname = identS $ name ++ "Abs"
cname = identS name
aname = mkCId (name ++ "Abs")
cname = mkCId name
cf2abstr :: CFG -> Abstr
cf2abstr cfg = Abstr aflags afuns acats BS.empty
where
aflags = Map.singleton (mkCId "startcat") (LStr (cfgStartCat cfg))
acats = Map.fromList [(mkCId cat, ([], [(0,mkRuleName rule)
| rule <- Set.toList rules], 0, 0))
| (cat,rules) <- Map.toList (cfgRules cfg)]
afuns = Map.fromList [(mkRuleName rule, (cftype [mkCId c | NonTerminal c <- ruleRhs rule] (mkCId cat), 0, Nothing, 0, 0))
| (cat,rules) <- Map.toList (cfgRules cfg)
, rule <- Set.toList rules]
cf2grammar :: CFG -> (BinTree Ident Info, BinTree Ident Info, String)
cf2grammar cfg = (buildTree abs, buildTree conc, cfgStartCat cfg) where
abs = cats ++ funs
conc = lincats ++ lins
cats = [(identS cat, AbsCat (Just (L NoLoc []))) | cat <- Map.keys (cfgRules cfg)]
lincats = [(cat, CncCat (Just (L loc defLinType)) Nothing Nothing Nothing Nothing) | (cat,AbsCat (Just (L loc _))) <- cats]
(funs,lins) = unzip (map cf2rule (concatMap Set.toList (Map.elems (cfgRules cfg))))
cf2concr :: CFG -> Concr
cf2concr cfg = Concr Map.empty Map.empty
cncfuns lindefsrefs lindefsrefs
sequences productions
IntMap.empty Map.empty
cnccats
IntMap.empty
totalCats
where
sequences0 = Set.fromList (listArray (0,0) [SymCat 0 0] :
[mkSequence rule | rules <- Map.elems (cfgRules cfg), rule <- Set.toList rules])
sequences = listArray (0,Set.size sequences0-1) (Set.toList sequences0)
cf2rule :: CFRule -> ((Ident,Info),(Ident,Info))
cf2rule (CFRule cat items (CFObj fun _)) = (def,ldef) where
f = identS (showCId fun)
def = (f, AbsFun (Just (L NoLoc (mkProd args' (Cn (identS cat)) []))) Nothing Nothing (Just True))
args0 = zip (map (identS . ("x" ++) . show) [0..]) items
args = [((Explicit,v), Cn (identS c)) | (v, NonTerminal c) <- args0]
args' = [(Explicit,identS "_", Cn (identS c)) | (_, NonTerminal c) <- args0]
ldef = (f, CncFun
Nothing
(Just (L NoLoc (mkAbs (map fst args)
(mkRecord (const theLinLabel) [foldconcat (map mkIt args0)]))))
Nothing
Nothing)
mkIt (v, NonTerminal _) = P (Vr v) theLinLabel
mkIt (_, Terminal a) = K a
foldconcat [] = K ""
foldconcat tt = foldr1 C tt
idFun = CncFun wildCId (listArray (0,0) [seqid])
where
seq = listArray (0,0) [SymCat 0 0]
seqid = binSearch seq sequences (bounds sequences)
((fun_cnt,cncfuns0),productions0) = mapAccumL convertRules (1,[idFun]) (Map.toList (cfgRules cfg))
productions = IntMap.fromList productions0
cncfuns = listArray (0,fun_cnt-1) (reverse cncfuns0)
lbls = listArray (0,0) ["s"]
(totalCats,cnccats0) = mapAccumL mkCncCat 0 (Map.toList (cfgRules cfg))
cnccats = Map.fromList cnccats0
lindefsrefs =
IntMap.fromList (map mkLinDefRef (Map.keys (cfgRules cfg)))
convertRules st (cat,rules) =
let (st',prods) = mapAccumL convertRule st (Set.toList rules)
in (st',(cat2fid cat,Set.fromList prods))
convertRule (funid,funs) rule =
let args = [PArg [] (cat2fid c) | NonTerminal c <- ruleRhs rule]
prod = PApply funid args
seqid = binSearch (mkSequence rule) sequences (bounds sequences)
fun = CncFun (mkRuleName rule) (listArray (0,0) [seqid])
funid' = funid+1
in funid' `seq` ((funid',fun:funs),prod)
mkSequence rule = listArray (0,length syms-1) syms
where
syms = snd $ mapAccumL convertSymbol 0 (ruleRhs rule)
convertSymbol d (NonTerminal _) = (d+1,SymCat d 0)
convertSymbol d (Terminal t) = (d, SymKS t)
mkCncCat fid (cat,_) = (fid+1, (mkCId cat,CncCat fid fid lbls))
mkLinDefRef cat =
(cat2fid cat,[0])
binSearch v arr (i,j)
| i <= j = case compare v (arr ! k) of
LT -> binSearch v arr (i,k-1)
EQ -> k
GT -> binSearch v arr (k+1,j)
| otherwise = error "binSearch"
where
k = (i+j) `div` 2
cat2fid cat =
case Map.lookup (mkCId cat) cnccats of
Just (CncCat fid _ _) -> fid
_ -> error "cat2fid"
mkRuleName rule =
case ruleName rule of
CFObj n _ -> n
_ -> wildCId