the content of ParseEngAbs3.probs is now merged with ParseEngAbs.probs. The later is now retrained. Once the grammar is compiled with the .probs file now it doesn't need anything more to do robust parsing. The robustness itself is controlled by the flags 'heuristic_search_factor', 'meta_prob' and 'meta_token_prob' in ParseEngAbs.gf

This commit is contained in:
kr.angelov
2013-11-06 10:21:46 +00:00
parent 84ef5fa5fa
commit 2483dc7728
27 changed files with 65052 additions and 65106 deletions

View File

@@ -1134,15 +1134,19 @@ allCommands = Map.fromList [
case arg of case arg of
[EFun id] -> case Map.lookup id (funs (abstract pgf)) of [EFun id] -> case Map.lookup id (funs (abstract pgf)) of
Just fd -> do putStrLn $ render (ppFun id fd) Just fd -> do putStrLn $ render (ppFun id fd)
putStrLn ("Probability: "++show (probTree pgf (EFun id))) let (_,_,_,prob,_) = fd
putStrLn ("Probability: "++show prob)
return void return void
Nothing -> case Map.lookup id (cats (abstract pgf)) of Nothing -> case Map.lookup id (cats (abstract pgf)) of
Just hyps -> do putStrLn $ Just cd -> do putStrLn $
render (ppCat id hyps $$ render (ppCat id cd $$
if null (functionsToCat pgf id) if null (functionsToCat pgf id)
then empty then empty
else space $$ else space $$
vcat [ppFun fid (ty,0,Just [],0,0) | (fid,ty) <- functionsToCat pgf id]) vcat [ppFun fid (ty,0,Just [],0,0) | (fid,ty) <- functionsToCat pgf id] $$
space)
let (_,_,prob,_) = cd
putStrLn ("Probability: "++show prob)
return void return void
Nothing -> do putStrLn ("unknown category of function identifier "++show id) Nothing -> do putStrLn ("unknown category of function identifier "++show id)
return void return void

View File

@@ -50,12 +50,12 @@ mkCanon2pgf opts gr am = do
[((cPredefAbs,c), AbsCat (Just (L NoLoc []))) | c <- [cFloat,cInt,cString]] ++ [((cPredefAbs,c), AbsCat (Just (L NoLoc []))) | c <- [cFloat,cInt,cString]] ++
Look.allOrigInfos gr am Look.allOrigInfos gr am
flags = Map.fromList [(mkCId f,if f == "beam_size" then C.LFlt (read x) else C.LStr x) | (f,x) <- optionsPGF aflags] flags = Map.fromList [(mkCId f,x) | (f,x) <- optionsPGF aflags]
funs = Map.fromList [(i2i f, (mkType [] ty, mkArrity ma, mkDef pty, 0, addr)) | funs = Map.fromList [(i2i f, (mkType [] ty, mkArrity ma, mkDef pty, 0, addr)) |
((m,f),AbsFun (Just (L _ ty)) ma pty _,addr) <- adefs] ((m,f),AbsFun (Just (L _ ty)) ma pty _,addr) <- adefs]
cats = Map.fromList [(i2i c, (snd (mkContext [] cont),catfuns c, addr)) | cats = Map.fromList [(i2i c, (snd (mkContext [] cont),catfuns c, 0, addr)) |
((m,c),AbsCat (Just (L _ cont)),addr) <- adefs] ((m,c),AbsCat (Just (L _ cont)),addr) <- adefs]
catfuns cat = catfuns cat =
@@ -69,7 +69,7 @@ mkCanon2pgf opts gr am = do
([((cPredefAbs,c), CncCat (Just (L NoLoc GM.defLinType)) Nothing Nothing Nothing Nothing) | c <- [cInt,cFloat,cString]] ++ ([((cPredefAbs,c), CncCat (Just (L NoLoc GM.defLinType)) Nothing Nothing Nothing Nothing) | c <- [cInt,cFloat,cString]] ++
Look.allOrigInfos gr cm) Look.allOrigInfos gr cm)
let flags = Map.fromList [(mkCId f,if f == "beam_size" then C.LFlt (read x) else C.LStr x) | (f,x) <- optionsPGF cflags] let flags = Map.fromList [(mkCId f,x) | (f,x) <- optionsPGF cflags]
seqs = (mkSetArray . Set.fromList . concat) $ seqs = (mkSetArray . Set.fromList . concat) $
(Map.keys ex_seqs : [maybe [] elems (mseqs mi) | (m,mi) <- allExtends gr cm]) (Map.keys ex_seqs : [maybe [] elems (mseqs mi) | (m,mi) <- allExtends gr cm])

View File

@@ -12,7 +12,7 @@ import qualified Data.Map as Map
grammar2lambdaprolog_mod pgf = render $ grammar2lambdaprolog_mod pgf = render $
text "module" <+> ppCId (absname pgf) <> char '.' $$ text "module" <+> ppCId (absname pgf) <> char '.' $$
space $$ space $$
vcat [ppClauses cat fns | (cat,(_,fs,_)) <- Map.toList (cats (abstract pgf)), vcat [ppClauses cat fns | (cat,(_,fs,_,_)) <- Map.toList (cats (abstract pgf)),
let fns = [(f,fromJust (Map.lookup f (funs (abstract pgf)))) | (_,f) <- fs]] let fns = [(f,fromJust (Map.lookup f (funs (abstract pgf)))) | (_,f) <- fs]]
where where
ppClauses cat fns = ppClauses cat fns =
@@ -25,11 +25,11 @@ grammar2lambdaprolog_mod pgf = render $
grammar2lambdaprolog_sig pgf = render $ grammar2lambdaprolog_sig pgf = render $
text "sig" <+> ppCId (absname pgf) <> char '.' $$ text "sig" <+> ppCId (absname pgf) <> char '.' $$
space $$ space $$
vcat [ppCat c hyps <> dot | (c,(hyps,_,_)) <- Map.toList (cats (abstract pgf))] $$ vcat [ppCat c hyps <> dot | (c,(hyps,_,_,_)) <- Map.toList (cats (abstract pgf))] $$
space $$ space $$
vcat [ppFun f ty <> dot | (f,(ty,_,Nothing,_,_)) <- Map.toList (funs (abstract pgf))] $$ vcat [ppFun f ty <> dot | (f,(ty,_,Nothing,_,_)) <- Map.toList (funs (abstract pgf))] $$
space $$ space $$
vcat [ppExport c hyps <> dot | (c,(hyps,_,_)) <- Map.toList (cats (abstract pgf))] $$ vcat [ppExport c hyps <> dot | (c,(hyps,_,_,_)) <- Map.toList (cats (abstract pgf))] $$
vcat [ppFunPred f (hyps ++ [(Explicit,wildCId,DTyp [] c es)]) <> dot | (f,(DTyp hyps c es,_,Just _,_,_)) <- Map.toList (funs (abstract pgf))] vcat [ppFunPred f (hyps ++ [(Explicit,wildCId,DTyp [] c es)]) <> dot | (f,(DTyp hyps c es,_,Just _,_,_)) <- Map.toList (funs (abstract pgf))]
ppCat :: CId -> [Hypo] -> Doc ppCat :: CId -> [Hypo] -> Doc

View File

@@ -49,7 +49,7 @@ plAbstract name abs
(f, v) <- Map.assocs (aflags abs)] ++++ (f, v) <- Map.assocs (aflags abs)] ++++
plFacts name "cat" 2 "(?Type, ?[X:Type,...])" plFacts name "cat" 2 "(?Type, ?[X:Type,...])"
[[plType cat args, plHypos hypos'] | [[plType cat args, plHypos hypos'] |
(cat, (hypos, _, _)) <- Map.assocs (cats abs), (cat, (hypos,_,_,_)) <- Map.assocs (cats abs),
let ((_, subst), hypos') = mapAccumL alphaConvertHypo emptyEnv hypos, let ((_, subst), hypos') = mapAccumL alphaConvertHypo emptyEnv hypos,
let args = reverse [EFun x | (_,x) <- subst]] ++++ let args = reverse [EFun x | (_,x) <- subst]] ++++
plFacts name "fun" 3 "(?Fun, ?Type, ?[X:Type,...])" plFacts name "fun" 3 "(?Fun, ?Type, ?[X:Type,...])"

View File

@@ -17,6 +17,7 @@ import Data.Binary
--import Control.Monad --import Control.Monad
import qualified Data.Map as Map import qualified Data.Map as Map
import qualified Data.ByteString.Char8 as BS import qualified Data.ByteString.Char8 as BS
import Text.PrettyPrint (render)
import GF.Data.Operations import GF.Data.Operations
import GF.Infra.Ident import GF.Infra.Ident
@@ -24,9 +25,10 @@ import GF.Infra.Option
import GF.Grammar.Grammar import GF.Grammar.Grammar
import PGF() -- Binary instances import PGF() -- Binary instances
import PGF.Data(ppLit)
-- Please change this every time when the GFO format is changed -- Please change this every time when the GFO format is changed
gfoVersion = "GF02" gfoVersion = "GF03"
instance Binary Ident where instance Binary Ident where
put id = put (ident2bs id) put id = put (ident2bs id)
@@ -91,7 +93,7 @@ instance Binary ModuleStatus where
instance Binary Options where instance Binary Options where
put = put . optionsGFO put = put . optionsGFO
get = do opts <- get get = do opts <- get
case parseModuleOptions ["--" ++ flag ++ "=" ++ value | (flag,value) <- opts] of case parseModuleOptions ["--" ++ flag ++ "=" ++ render (ppLit value) | (flag,value) <- opts] of
Ok x -> return x Ok x -> return x
Bad msg -> fail msg Bad msg -> fail msg

View File

@@ -29,7 +29,7 @@ import GF.Infra.Option
import GF.Grammar.Values import GF.Grammar.Values
import GF.Grammar.Grammar import GF.Grammar.Grammar
import PGF.Data (ppMeta) import PGF.Data (ppMeta, ppLit)
import PGF.Printer (ppFId, ppFunId, ppSeqId, ppSeq) import PGF.Printer (ppFId, ppFunId, ppSeqId, ppSeq)
import Text.PrettyPrint import Text.PrettyPrint
@@ -87,7 +87,7 @@ ppModule q (mn, ModInfo mtype mstat opts exts with opens _ _ mseqs jments) =
ppOptions opts = ppOptions opts =
text "flags" $$ text "flags" $$
nest 2 (vcat [text option <+> equals <+> str value <+> semi | (option,value) <- optionsGFO opts]) nest 2 (vcat [text option <+> equals <+> ppLit value <+> semi | (option,value) <- optionsGFO opts])
ppJudgement q (id, AbsCat pcont ) = ppJudgement q (id, AbsCat pcont ) =
text "cat" <+> ppIdent id <+> text "cat" <+> ppIdent id <+>

View File

@@ -43,6 +43,7 @@ import GF.Data.ErrM
import Data.Set (Set) import Data.Set (Set)
import qualified Data.Set as Set import qualified Data.Set as Set
import PGF.Data(Literal(..))
usageHeader :: String usageHeader :: String
usageHeader = unlines usageHeader = unlines
@@ -170,7 +171,9 @@ data Flags = Flags {
optWarnings :: [Warning], optWarnings :: [Warning],
optDump :: [Dump], optDump :: [Dump],
optTagsOnly :: Bool, optTagsOnly :: Bool,
optBeamSize :: Maybe Double, optHeuristicFactor :: Maybe Double,
optMetaProb :: Maybe Double,
optMetaToknProb :: Maybe Double,
optNewComp :: Bool optNewComp :: Bool
} }
deriving (Show) deriving (Show)
@@ -206,16 +209,18 @@ fixRelativeLibPaths curr_dir lib_dir (Options o) = Options (fixPathFlags . o)
-- Showing options -- Showing options
-- | Pretty-print the options that are preserved in .gfo files. -- | Pretty-print the options that are preserved in .gfo files.
optionsGFO :: Options -> [(String,String)] optionsGFO :: Options -> [(String,Literal)]
optionsGFO opts = optionsPGF opts optionsGFO opts = optionsPGF opts
++ [("coding", flag optEncoding opts)] ++ [("coding", LStr (flag optEncoding opts))]
-- | Pretty-print the options that are preserved in .pgf files. -- | Pretty-print the options that are preserved in .pgf files.
optionsPGF :: Options -> [(String,String)] optionsPGF :: Options -> [(String,Literal)]
optionsPGF opts = optionsPGF opts =
maybe [] (\x -> [("language",x)]) (flag optSpeechLanguage opts) maybe [] (\x -> [("language",LStr x)]) (flag optSpeechLanguage opts)
++ maybe [] (\x -> [("startcat",x)]) (flag optStartCat opts) ++ maybe [] (\x -> [("startcat",LStr x)]) (flag optStartCat opts)
++ maybe [] (\x -> [("beam_size",show x)]) (flag optBeamSize opts) ++ maybe [] (\x -> [("heuristic_search_factor",LFlt x)]) (flag optHeuristicFactor opts)
++ maybe [] (\x -> [("meta_prob",LFlt x)]) (flag optMetaProb opts)
++ maybe [] (\x -> [("meta_token_prob",LFlt x)]) (flag optMetaToknProb opts)
-- Option manipulation -- Option manipulation
@@ -272,7 +277,9 @@ defaultFlags = Flags {
optWarnings = [], optWarnings = [],
optDump = [], optDump = [],
optTagsOnly = False, optTagsOnly = False,
optBeamSize = Nothing, optHeuristicFactor = Nothing,
optMetaProb = Nothing,
optMetaToknProb = Nothing,
optNewComp = optNewComp =
#ifdef NEW_COMP #ifdef NEW_COMP
True True
@@ -358,7 +365,9 @@ optDescr =
Option [] ["stem"] (onOff (toggleOptimize OptStem) True) "Perform stem-suffix analysis (default on).", Option [] ["stem"] (onOff (toggleOptimize OptStem) True) "Perform stem-suffix analysis (default on).",
Option [] ["cse"] (onOff (toggleOptimize OptCSE) True) "Perform common sub-expression elimination (default on).", Option [] ["cse"] (onOff (toggleOptimize OptCSE) True) "Perform common sub-expression elimination (default on).",
Option [] ["cfg"] (ReqArg cfgTransform "TRANS") "Enable or disable specific CFG transformations. TRANS = merge, no-merge, bottomup, no-bottomup, ...", Option [] ["cfg"] (ReqArg cfgTransform "TRANS") "Enable or disable specific CFG transformations. TRANS = merge, no-merge, bottomup, no-bottomup, ...",
Option [] ["beam_size"] (ReqArg readDouble "SIZE") "Set the beam size for statistical parsing", Option [] ["heuristic_search_factor"] (ReqArg (readDouble (\d o -> o { optHeuristicFactor = Just d })) "FACTOR") "Set the heuristic search factor for statistical parsing",
Option [] ["meta_prob"] (ReqArg (readDouble (\d o -> o { optMetaProb = Just d })) "PROB") "Set the probability of introducting a meta variable in the parser",
Option [] ["meta_token_prob"] (ReqArg (readDouble (\d o -> o { optMetaToknProb = Just d })) "PROB") "Set the probability for skipping a token in the parser",
Option [] ["new-comp"] (NoArg (set $ \o -> o{optNewComp = True})) "Use the new experimental compiler.", Option [] ["new-comp"] (NoArg (set $ \o -> o{optNewComp = True})) "Use the new experimental compiler.",
Option [] ["old-comp"] (NoArg (set $ \o -> o{optNewComp = False})) "Use old trusty compiler.", Option [] ["old-comp"] (NoArg (set $ \o -> o{optNewComp = False})) "Use old trusty compiler.",
dumpOption "source" Source, dumpOption "source" Source,
@@ -433,9 +442,9 @@ optDescr =
Nothing -> fail $ "Unknown CFG transformation: " ++ x' Nothing -> fail $ "Unknown CFG transformation: " ++ x'
++ " Known: " ++ show (map fst cfgTransformNames) ++ " Known: " ++ show (map fst cfgTransformNames)
readDouble x = case reads x of readDouble f x = case reads x of
[(d,"")] -> set $ \o -> o { optBeamSize = Just d } [(d,"")] -> set $ f d
_ -> fail "A floating point number is expected" _ -> fail "A floating point number is expected"
dumpOption s d = Option [] ["dump-"++s] (NoArg (set $ \o -> o { optDump = Dump d:optDump o})) ("Dump output of the " ++ s ++ " phase.") dumpOption s d = Option [] ["dump-"++s] (NoArg (set $ \o -> o { optDump = Dump d:optDump o})) ("Dump output of the " ++ s ++ " phase.")

View File

@@ -39,7 +39,7 @@ type Skeleton = [(CId, [(CId, [CId])])]
pgfSkeleton :: PGF -> Skeleton pgfSkeleton :: PGF -> Skeleton
pgfSkeleton pgf = [(c,[(f,fst (catSkeleton (lookType (abstract pgf) f))) | (_,f) <- fs]) pgfSkeleton pgf = [(c,[(f,fst (catSkeleton (lookType (abstract pgf) f))) | (_,f) <- fs])
| (c,(_,fs,_)) <- Map.toList (cats (abstract pgf))] | (c,(_,fs,_,_)) <- Map.toList (cats (abstract pgf))]
-- --
-- * Questions to ask -- * Questions to ask

View File

@@ -104,7 +104,7 @@ writeByteCode opts pgf
where where
addrs = addrs =
[(id,addr) | (id,(_,_,_,_,addr)) <- Map.toList (funs (abstract pgf))] ++ [(id,addr) | (id,(_,_,_,_,addr)) <- Map.toList (funs (abstract pgf))] ++
[(id,addr) | (id,(_,_,addr)) <- Map.toList (cats (abstract pgf))] [(id,addr) | (id,(_,_,_,addr)) <- Map.toList (cats (abstract pgf))]
writePGF :: Options -> PGF -> IOE () writePGF :: Options -> PGF -> IOE ()
writePGF opts pgf = do writePGF opts pgf = do

View File

@@ -23,6 +23,7 @@ import GF.Grammar.Printer(ppParams,ppTerm,getAbs,TermPrintQual(..))
import GF.Grammar.Parser(runP,pModDef) import GF.Grammar.Parser(runP,pModDef)
import GF.Grammar.Lexer(Posn(..)) import GF.Grammar.Lexer(Posn(..))
import GF.Data.ErrM import GF.Data.ErrM
import PGF.Data(Literal(LStr))
import SimpleEditor.Syntax as S import SimpleEditor.Syntax as S
import SimpleEditor.JSON import SimpleEditor.JSON
@@ -57,7 +58,10 @@ convAbstract (modid,src) =
let cats = reverse cats0 let cats = reverse cats0
funs = reverse funs0 funs = reverse funs0
flags = optionsGFO (mflags src) flags = optionsGFO (mflags src)
startcat = maybe "-" id $ lookup "startcat" flags startcat =
case lookup "startcat" flags of
Just (LStr cat) -> cat
_ -> "-"
return $ Grammar (convId modid) extends (Abstract startcat cats funs) [] return $ Grammar (convId modid) extends (Abstract startcat cats funs) []
convExtends = mapM convExtend convExtends = mapM convExtend

View File

@@ -87,9 +87,7 @@ typedef struct {
PgfCId name; PgfCId name;
PgfHypos* context; PgfHypos* context;
prob_t meta_prob; prob_t prob;
prob_t meta_token_prob;
PgfMetaChildMap* meta_child_probs;
void* predicate; void* predicate;
} PgfAbsCat; } PgfAbsCat;
@@ -230,6 +228,7 @@ typedef GuSeq PgfCncFuns;
struct PgfConcr { struct PgfConcr {
PgfCId name; PgfCId name;
PgfAbstr* abstr;
PgfFlags* cflags; PgfFlags* cflags;
PgfPrintNames* printnames; PgfPrintNames* printnames;
GuMap* ccats; GuMap* ccats;

View File

@@ -63,7 +63,10 @@ typedef struct {
int prod_full_count; int prod_full_count;
#endif #endif
PgfItem* free_item; PgfItem* free_item;
prob_t beam_size;
prob_t heuristic_factor;
prob_t meta_prob;
prob_t meta_token_prob;
} PgfParsing; } PgfParsing;
typedef enum { BIND_NONE, BIND_HARD, BIND_SOFT } BIND_TYPE; typedef enum { BIND_NONE, BIND_HARD, BIND_SOFT } BIND_TYPE;
@@ -1389,12 +1392,14 @@ pgf_parsing_meta_predict(GuMapItor* fn, const void* key, void* value, GuExn* err
{ {
(void) (err); (void) (err);
PgfAbsCat* abscat = (PgfAbsCat*) key; PgfAbsCat* abscat = *((PgfAbsCat**) value);
prob_t meta_prob = *((prob_t*) value);
PgfMetaPredictFn* clo = (PgfMetaPredictFn*) fn; PgfMetaPredictFn* clo = (PgfMetaPredictFn*) fn;
PgfParsing* ps = clo->ps; PgfParsing* ps = clo->ps;
PgfItem* meta_item = clo->meta_item; PgfItem* meta_item = clo->meta_item;
if (abscat->prob == INFINITY)
return;
PgfCncCat* cnccat = PgfCncCat* cnccat =
gu_map_get(ps->concr->cnccats, abscat->name, PgfCncCat*); gu_map_get(ps->concr->cnccats, abscat->name, PgfCncCat*);
if (cnccat == NULL) if (cnccat == NULL)
@@ -1412,7 +1417,7 @@ pgf_parsing_meta_predict(GuMapItor* fn, const void* key, void* value, GuExn* err
PgfItem* item = PgfItem* item =
pgf_item_copy(meta_item, ps); pgf_item_copy(meta_item, ps);
item->inside_prob += item->inside_prob +=
ccat->viterbi_prob+meta_prob; ccat->viterbi_prob+abscat->prob;
size_t nargs = gu_seq_length(meta_item->args); size_t nargs = gu_seq_length(meta_item->args);
item->args = gu_new_seq(PgfPArg, nargs+1, ps->pool); item->args = gu_new_seq(PgfPArg, nargs+1, ps->pool);
@@ -1698,18 +1703,14 @@ pgf_parsing_item(PgfParsing* ps, PgfItem* item)
} }
pgf_parsing_complete(ps, item, ep); pgf_parsing_complete(ps, item, ep);
} else { } else {
prob_t meta_token_prob = prob_t meta_token_prob =
item->conts->ccat->cnccat->abscat->meta_token_prob; ps->meta_token_prob;
if (meta_token_prob != INFINITY) { if (meta_token_prob != INFINITY) {
pgf_parsing_meta_scan(ps, item, meta_token_prob); pgf_parsing_meta_scan(ps, item, meta_token_prob);
} }
PgfCIdMap* meta_child_probs = PgfMetaPredictFn clo = { { pgf_parsing_meta_predict }, ps, item };
item->conts->ccat->cnccat->abscat->meta_child_probs; gu_map_iter(ps->concr->abstr->cats, &clo.fn, NULL);
if (meta_child_probs != NULL) {
PgfMetaPredictFn clo = { { pgf_parsing_meta_predict }, ps, item };
gu_map_iter(meta_child_probs, &clo.fn, NULL);
}
} }
} else { } else {
pgf_parsing_symbol(ps, item, item->curr_sym); pgf_parsing_symbol(ps, item, item->curr_sym);
@@ -1721,22 +1722,38 @@ pgf_parsing_item(PgfParsing* ps, PgfItem* item)
} }
} }
static prob_t static void
pgf_parsing_default_beam_size(PgfConcr* concr) pgf_parsing_set_default_factors(PgfParsing* ps, PgfAbstr* abstr)
{ {
PgfLiteral lit = gu_map_get(concr->cflags, "beam_size", PgfLiteral); PgfLiteral lit;
if (gu_variant_is_null(lit)) lit =
return 0; gu_map_get(abstr->aflags, "heuristic_search_factor", PgfLiteral);
if (!gu_variant_is_null(lit)) {
GuVariantInfo pi = gu_variant_open(lit);
gu_assert (pi.tag == PGF_LITERAL_FLT);
ps->heuristic_factor = ((PgfLiteralFlt*) pi.data)->val;
}
GuVariantInfo pi = gu_variant_open(lit); lit =
gu_assert (pi.tag == PGF_LITERAL_FLT); gu_map_get(abstr->aflags, "meta_prob", PgfLiteral);
return ((PgfLiteralFlt*) pi.data)->val; if (!gu_variant_is_null(lit)) {
GuVariantInfo pi = gu_variant_open(lit);
gu_assert (pi.tag == PGF_LITERAL_FLT);
ps->meta_prob = - log(((PgfLiteralFlt*) pi.data)->val);
}
lit =
gu_map_get(abstr->aflags, "meta_token_prob", PgfLiteral);
if (!gu_variant_is_null(lit)) {
GuVariantInfo pi = gu_variant_open(lit);
gu_assert (pi.tag == PGF_LITERAL_FLT);
ps->meta_token_prob = - log(((PgfLiteralFlt*) pi.data)->val);
}
} }
static PgfParsing* static PgfParsing*
pgf_new_parsing(PgfConcr* concr, pgf_new_parsing(PgfConcr* concr, GuString sentence,
GuString sentence, double heuristics,
GuPool* pool, GuPool* out_pool) GuPool* pool, GuPool* out_pool)
{ {
PgfParsing* ps = gu_new(PgfParsing, pool); PgfParsing* ps = gu_new(PgfParsing, pool);
@@ -1756,7 +1773,11 @@ pgf_new_parsing(PgfConcr* concr,
ps->prod_full_count = 0; ps->prod_full_count = 0;
#endif #endif
ps->free_item = NULL; ps->free_item = NULL;
ps->beam_size = heuristics; ps->heuristic_factor = 0;
ps->meta_prob = INFINITY;
ps->meta_token_prob = INFINITY;
pgf_parsing_set_default_factors(ps, concr->abstr);
PgfExprMeta *expr_meta = PgfExprMeta *expr_meta =
gu_new_variant(PGF_EXPR_META, gu_new_variant(PGF_EXPR_META,
@@ -2107,7 +2128,7 @@ pgf_parse_result_is_new(PgfExprState* st)
// TODO: s/CId/Cat, add the cid to Cat, make Cat the key to CncCat // TODO: s/CId/Cat, add the cid to Cat, make Cat the key to CncCat
static PgfParsing* static PgfParsing*
pgf_parsing_init(PgfConcr* concr, PgfCId cat, size_t lin_idx, pgf_parsing_init(PgfConcr* concr, PgfCId cat, size_t lin_idx,
GuString sentence, double heuristics, GuString sentence, double heuristic_factor,
GuExn* err, GuExn* err,
GuPool* pool, GuPool* out_pool) GuPool* pool, GuPool* out_pool)
{ {
@@ -2121,12 +2142,13 @@ pgf_parsing_init(PgfConcr* concr, PgfCId cat, size_t lin_idx,
gu_assert(lin_idx < cnccat->n_lins); gu_assert(lin_idx < cnccat->n_lins);
if (heuristics < 0) { PgfParsing* ps =
heuristics = pgf_parsing_default_beam_size(concr); pgf_new_parsing(concr, sentence, pool, out_pool);
if (heuristic_factor >= 0) {
ps->heuristic_factor = heuristic_factor;
} }
PgfParsing* ps =
pgf_new_parsing(concr, sentence, heuristics, pool, out_pool);
PgfParseState* state = PgfParseState* state =
pgf_new_parse_state(ps, 0, BIND_SOFT); pgf_new_parse_state(ps, 0, BIND_SOFT);
@@ -2156,11 +2178,13 @@ pgf_parsing_init(PgfConcr* concr, PgfCId cat, size_t lin_idx,
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item); gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
} }
PgfItem *item = if (ps->meta_prob != INFINITY) {
pgf_new_item(ps, conts, ps->meta_prod); PgfItem *item =
item->inside_prob = pgf_new_item(ps, conts, ps->meta_prod);
ccat->cnccat->abscat->meta_prob; item->inside_prob =
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item); ps->meta_prob;
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
}
} }
} }
@@ -2200,7 +2224,7 @@ pgf_parsing_proceed(PgfParsing* ps)
prob_t state_delta = prob_t state_delta =
(st->viterbi_prob-(st->next ? st->next->viterbi_prob : 0))* (st->viterbi_prob-(st->next ? st->next->viterbi_prob : 0))*
ps->beam_size; ps->heuristic_factor;
delta_prob += state_delta; delta_prob += state_delta;
st = st->next; st = st->next;
} }

View File

@@ -35,63 +35,6 @@ pgf_read(const char* fpath,
return pgf; return pgf;
} }
void
pgf_load_meta_child_probs(PgfPGF* pgf, const char* fpath,
GuPool* pool, GuExn* err)
{
FILE *fp = fopen(fpath, "r");
if (!fp) {
gu_raise_errno(err);
return;
}
GuPool* tmp_pool = gu_new_pool();
for (;;) {
char cat1[21];
char cat2[21];
prob_t prob;
if (fscanf(fp, "%20s\t%20s\t%f", cat1, cat2, &prob) < 3)
break;
prob = - log(prob);
PgfAbsCat* abscat1 =
gu_map_get(pgf->abstract.cats, cat1, PgfAbsCat*);
if (abscat1 == NULL) {
GuExnData* exn = gu_raise(err, PgfExn);
exn->data = "Unknown category name";
goto close;
}
if (strcmp(cat2, "*") == 0) {
abscat1->meta_prob = prob;
} else if (strcmp(cat2, "_") == 0) {
abscat1->meta_token_prob = prob;
} else {
PgfAbsCat* abscat2 = gu_map_get(pgf->abstract.cats, cat2, PgfAbsCat*);
if (abscat2 == NULL) {
gu_raise(err, PgfExn);
GuExnData* exn = gu_raise(err, PgfExn);
exn->data = "Unknown category name";
goto close;
}
if (abscat1->meta_child_probs == NULL) {
abscat1->meta_child_probs =
gu_map_type_new(PgfMetaChildMap, pool);
}
gu_map_put(abscat1->meta_child_probs, abscat2, prob_t, prob);
}
}
close:
gu_pool_free(tmp_pool);
fclose(fp);
}
GuString GuString
pgf_abstract_name(PgfPGF* pgf) pgf_abstract_name(PgfPGF* pgf)
{ {

View File

@@ -80,11 +80,6 @@ pgf_read(const char* fpath,
* *
*/ */
void
pgf_load_meta_child_probs(PgfPGF*, const char* fpath,
GuPool* pool, GuExn* err);
GuString GuString
pgf_abstract_name(PgfPGF*); pgf_abstract_name(PgfPGF*);

View File

@@ -48,7 +48,7 @@ pgf_print_cat(GuMapItor* fn, const void* key, void* value,
ctxt = next; ctxt = next;
} }
gu_printf(out, err, " ; -- %f\n",cat->meta_prob); gu_printf(out, err, " ; -- %f\n", cat->prob);
} }
void void

View File

@@ -516,10 +516,6 @@ pgf_read_abscat(PgfReader* rdr, PgfAbstr* abstr, PgfCIdMap* abscats)
gu_return_on_exn(rdr->err, NULL); gu_return_on_exn(rdr->err, NULL);
} }
abscat->meta_prob = INFINITY;
abscat->meta_token_prob = INFINITY;
abscat->meta_child_probs = NULL;
GuBuf* functions = gu_new_buf(PgfAbsFun*, rdr->tmp_pool); GuBuf* functions = gu_new_buf(PgfAbsFun*, rdr->tmp_pool);
size_t n_functions = pgf_read_len(rdr); size_t n_functions = pgf_read_len(rdr);
@@ -538,6 +534,8 @@ pgf_read_abscat(PgfReader* rdr, PgfAbstr* abstr, PgfCIdMap* abscats)
gu_buf_push(functions, PgfAbsFun*, absfun); gu_buf_push(functions, PgfAbsFun*, absfun);
} }
abscat->prob = - log(gu_in_f64be(rdr->in, rdr->err));
pgf_jit_predicate(rdr->jit_state, abscats, abscat, functions); pgf_jit_predicate(rdr->jit_state, abscats, abscat, functions);
return abscat; return abscat;
@@ -1155,6 +1153,8 @@ pgf_read_concrete(PgfReader* rdr, PgfAbstr* abstr, PgfAbsFun* abs_lin_fun)
pgf_read_cid(rdr, rdr->opool); pgf_read_cid(rdr, rdr->opool);
gu_return_on_exn(rdr->err, NULL); gu_return_on_exn(rdr->err, NULL);
concr->abstr = abstr;
concr->cflags = concr->cflags =
pgf_read_flags(rdr); pgf_read_flags(rdr);
gu_return_on_exn(rdr->err, NULL); gu_return_on_exn(rdr->err, NULL);

View File

@@ -53,18 +53,17 @@ int main(int argc, char* argv[]) {
// Create the pool that is used to allocate everything // Create the pool that is used to allocate everything
GuPool* pool = gu_new_pool(); GuPool* pool = gu_new_pool();
int status = EXIT_SUCCESS; int status = EXIT_SUCCESS;
if (argc < 5 || argc > 6) { if (argc < 5) {
fprintf(stderr, "usage: %s pgf cat from-lang to-lang [probs-file]\n", argv[0]); fprintf(stderr, "usage: %s pgf cat from-lang to-lang\n", argv[0]);
status = EXIT_FAILURE; status = EXIT_FAILURE;
goto fail; goto fail;
} }
char* filename = argv[1];
GuString filename = argv[1];
GuString cat = argv[2]; GuString cat = argv[2];
GuString from_lang = argv[3]; GuString from_lang = argv[3];
GuString to_lang = argv[4]; GuString to_lang = argv[4];
// Create an exception frame that catches all errors. // Create an exception frame that catches all errors.
GuExn* err = gu_new_exn(NULL, gu_kind(type), pool); GuExn* err = gu_new_exn(NULL, gu_kind(type), pool);
@@ -78,16 +77,6 @@ int main(int argc, char* argv[]) {
goto fail; goto fail;
} }
if (argc == 6) {
char* meta_probs_filename = argv[5];
pgf_load_meta_child_probs(pgf, meta_probs_filename, pool, err);
if (!gu_ok(err)) {
fprintf(stderr, "Loading meta child probs failed\n");
status = EXIT_FAILURE;
goto fail;
}
}
// Look up the source and destination concrete categories // Look up the source and destination concrete categories
PgfConcr* from_concr = pgf_get_language(pgf, from_lang); PgfConcr* from_concr = pgf_get_language(pgf, from_lang);
PgfConcr* to_concr = pgf_get_language(pgf, to_lang); PgfConcr* to_concr = pgf_get_language(pgf, to_lang);

View File

@@ -292,8 +292,8 @@ categories pgf = [c | (c,hs) <- Map.toList (cats (abstract pgf))]
categoryContext pgf cat = categoryContext pgf cat =
case Map.lookup cat (cats (abstract pgf)) of case Map.lookup cat (cats (abstract pgf)) of
Just (hypos,_,_) -> Just hypos Just (hypos,_,_,_) -> Just hypos
Nothing -> Nothing Nothing -> Nothing
startCat pgf = DTyp [] (lookStartCat pgf) [] startCat pgf = DTyp [] (lookStartCat pgf) []
@@ -301,8 +301,8 @@ functions pgf = Map.keys (funs (abstract pgf))
functionsByCat pgf cat = functionsByCat pgf cat =
case Map.lookup cat (cats (abstract pgf)) of case Map.lookup cat (cats (abstract pgf)) of
Just (_,fns,_) -> map snd fns Just (_,fns,_,_) -> map snd fns
Nothing -> [] Nothing -> []
functionType pgf fun = functionType pgf fun =
case Map.lookup fun (funs (abstract pgf)) of case Map.lookup fun (funs (abstract pgf)) of
@@ -325,8 +325,8 @@ browse pgf id = fmap (\def -> (def,producers,consumers)) definition
in ppCId id <+> hsep ds <+> char '=' <+> ppExpr 0 scope res | Equ patts res <- eqs]) in ppCId id <+> hsep ds <+> char '=' <+> ppExpr 0 scope res | Equ patts res <- eqs])
Just (ty,_,Nothing, _,_) -> Just $ render (text "data" <+> ppCId id <+> colon <+> ppType 0 [] ty) Just (ty,_,Nothing, _,_) -> Just $ render (text "data" <+> ppCId id <+> colon <+> ppType 0 [] ty)
Nothing -> case Map.lookup id (cats (abstract pgf)) of Nothing -> case Map.lookup id (cats (abstract pgf)) of
Just (hyps,_,_) -> Just $ render (text "cat" <+> ppCId id <+> hsep (snd (mapAccumL (ppHypo 4) [] hyps))) Just (hyps,_,_,_) -> Just $ render (text "cat" <+> ppCId id <+> hsep (snd (mapAccumL (ppHypo 4) [] hyps)))
Nothing -> Nothing Nothing -> Nothing
(producers,consumers) = Map.foldWithKey accum ([],[]) (funs (abstract pgf)) (producers,consumers) = Map.foldWithKey accum ([],[]) (funs (abstract pgf))
where where

View File

@@ -40,13 +40,13 @@ instance Binary CId where
instance Binary Abstr where instance Binary Abstr where
put abs = put (aflags abs, put abs = put (aflags abs,
fmap (\(w,x,y,z,_) -> (w,x,y,z)) (funs abs), fmap (\(w,x,y,z,_) -> (w,x,y,z)) (funs abs),
fmap (\(x,y,_) -> (x,y)) (cats abs)) fmap (\(x,y,z,_) -> (x,y,z)) (cats abs))
get = do aflags <- get get = do aflags <- get
funs <- get funs <- get
cats <- get cats <- get
return (Abstr{ aflags=aflags return (Abstr{ aflags=aflags
, funs=fmap (\(w,x,y,z) -> (w,x,y,z,0)) funs , funs=fmap (\(w,x,y,z) -> (w,x,y,z,0)) funs
, cats=fmap (\(x,y) -> (x,y,0)) cats , cats=fmap (\(x,y,z) -> (x,y,z,0)) cats
, code=BS.empty , code=BS.empty
}) })

View File

@@ -29,10 +29,10 @@ data PGF = PGF {
data Abstr = Abstr { data Abstr = Abstr {
aflags :: Map.Map CId Literal, -- ^ value of a flag aflags :: Map.Map CId Literal, -- ^ value of a flag
funs :: Map.Map CId (Type,Int,Maybe [Equation],Double,BCAddr), -- ^ type, arrity and definition of function + probability funs :: Map.Map CId (Type,Int,Maybe [Equation],Double,BCAddr), -- ^ type, arrity and definition of function + probability
cats :: Map.Map CId ([Hypo],[(Double, CId)],BCAddr), -- ^ 1. context of a category cats :: Map.Map CId ([Hypo],[(Double, CId)],Double,BCAddr), -- ^ 1. context of a category
-- ^ 2. functions of a category. The order in the list is important, -- 2. functions of a category. The functions are stored
-- this is the order in which the type singatures are given in the source. -- in decreasing probability order.
-- The termination of the exhaustive generation might depend on this. -- 3. probability
code :: BS.ByteString code :: BS.ByteString
} }

View File

@@ -67,7 +67,7 @@ functionsToCat :: PGF -> CId -> [(CId,Type)]
functionsToCat pgf cat = functionsToCat pgf cat =
[(f,ty) | (_,f) <- fs, Just (ty,_,_,_,_) <- [Map.lookup f $ funs $ abstract pgf]] [(f,ty) | (_,f) <- fs, Just (ty,_,_,_,_) <- [Map.lookup f $ funs $ abstract pgf]]
where where
(_,fs,_) = lookMap ([],[],0) cat $ cats $ abstract pgf (_,fs,_,_) = lookMap ([],[],0,0) cat $ cats $ abstract pgf
-- | List of functions that lack linearizations in the given language. -- | List of functions that lack linearizations in the given language.
missingLins :: PGF -> Language -> [CId] missingLins :: PGF -> Language -> [CId]
@@ -82,7 +82,7 @@ restrictPGF :: (CId -> Bool) -> PGF -> PGF
restrictPGF cond pgf = pgf { restrictPGF cond pgf = pgf {
abstract = abstr { abstract = abstr {
funs = Map.filterWithKey (\c _ -> cond c) (funs abstr), funs = Map.filterWithKey (\c _ -> cond c) (funs abstr),
cats = Map.map (\(hyps,fs,addr) -> (hyps,filter (cond . snd) fs,addr)) (cats abstr) cats = Map.map (\(hyps,fs,p,addr) -> (hyps,filter (cond . snd) fs,p,addr)) (cats abstr)
} }
} ---- restrict concrs also, might be needed } ---- restrict concrs also, might be needed
where where

View File

@@ -26,8 +26,8 @@ ppAbs name a = text "abstract" <+> ppCId name <+> char '{' $$
ppFlag :: CId -> Literal -> Doc ppFlag :: CId -> Literal -> Doc
ppFlag flag value = text "flag" <+> ppCId flag <+> char '=' <+> ppLit value <+> char ';' ppFlag flag value = text "flag" <+> ppCId flag <+> char '=' <+> ppLit value <+> char ';'
ppCat :: CId -> ([Hypo],[(Double,CId)],BCAddr) -> Doc ppCat :: CId -> ([Hypo],[(Double,CId)],Double,BCAddr) -> Doc
ppCat c (hyps,_,_) = text "cat" <+> ppCId c <+> hsep (snd (mapAccumL (ppHypo 4) [] hyps)) <+> char ';' ppCat c (hyps,_,_,_) = text "cat" <+> ppCId c <+> hsep (snd (mapAccumL (ppHypo 4) [] hyps)) <+> char ';'
ppFun :: CId -> (Type,Int,Maybe [Equation],Double,BCAddr) -> Doc ppFun :: CId -> (Type,Int,Maybe [Equation],Double,BCAddr) -> Doc
ppFun f (t,_,Just eqs,_,_) = text "fun" <+> ppCId f <+> colon <+> ppType 0 [] t <+> char ';' $$ ppFun f (t,_,Just eqs,_,_) = text "fun" <+> ppCId f <+> colon <+> ppType 0 [] t <+> char ';' $$

View File

@@ -24,13 +24,14 @@ import Data.Maybe (fromMaybe) --, fromJust
-- the probabilities for the different functions in a grammar. -- the probabilities for the different functions in a grammar.
data Probabilities = Probs { data Probabilities = Probs {
funProbs :: Map.Map CId Double, funProbs :: Map.Map CId Double,
catProbs :: Map.Map CId [(Double, CId)] catProbs :: Map.Map CId (Double, [(Double, CId)])
} }
-- | Renders the probability structure as string -- | Renders the probability structure as string
showProbabilities :: Probabilities -> String showProbabilities :: Probabilities -> String
showProbabilities = unlines . map pr . Map.toList . funProbs where showProbabilities = unlines . concatMap prProb . Map.toList . catProbs where
pr (f,d) = showCId f ++ "\t" ++ show d prProb (c,(p,fns)) = pr (p,c) : map pr fns
pr (p,f) = showCId f ++ "\t" ++ show p
-- | Reads the probabilities from a file. -- | Reads the probabilities from a file.
-- This should be a text file where on every line -- This should be a text file where on every line
@@ -50,8 +51,12 @@ readProbabilitiesFromFile file pgf = do
-- for the result category. -- for the result category.
mkProbabilities :: PGF -> Map.Map CId Double -> Probabilities mkProbabilities :: PGF -> Map.Map CId Double -> Probabilities
mkProbabilities pgf probs = mkProbabilities pgf probs =
let funs1 = Map.fromList [(f,p) | (_,cf) <- Map.toList cats1, (p,f) <- cf] let funs1 = Map.fromList [(f,p) | (_,(_,fns)) <- Map.toList cats1, (p,f) <- fns]
cats1 = Map.map (\(_,fs,_) -> sortBy cmpProb (fill fs)) (cats (abstract pgf)) cats1 = Map.mapWithKey (\c (_,fns,_,_) ->
let p' = fromMaybe 0 (Map.lookup c probs)
fns' = sortBy cmpProb (fill fns)
in (p', fns'))
(cats (abstract pgf))
in Probs funs1 cats1 in Probs funs1 cats1
where where
cmpProb (p1,_) (p2,_) = compare p2 p1 cmpProb (p1,_) (p2,_) = compare p2 p1
@@ -71,15 +76,15 @@ defaultProbabilities pgf = mkProbabilities pgf Map.empty
getProbabilities :: PGF -> Probabilities getProbabilities :: PGF -> Probabilities
getProbabilities pgf = Probs { getProbabilities pgf = Probs {
funProbs = Map.map (\(_,_,_,p,_) -> p) (funs (abstract pgf)), funProbs = Map.map (\(_,_,_,p,_) -> p ) (funs (abstract pgf)),
catProbs = Map.map (\(_,fns,_) -> fns) (cats (abstract pgf)) catProbs = Map.map (\(_,fns,p,_) -> (p,fns)) (cats (abstract pgf))
} }
setProbabilities :: Probabilities -> PGF -> PGF setProbabilities :: Probabilities -> PGF -> PGF
setProbabilities probs pgf = pgf { setProbabilities probs pgf = pgf {
abstract = (abstract pgf) { abstract = (abstract pgf) {
funs = mapUnionWith (\(ty,a,df,_,addr) p -> (ty,a,df,p,addr)) (funs (abstract pgf)) (funProbs probs), funs = mapUnionWith (\(ty,a,df,_,addr) p -> (ty,a,df, p,addr)) (funs (abstract pgf)) (funProbs probs),
cats = mapUnionWith (\(hypos,_,addr) fns -> (hypos,fns,addr)) (cats (abstract pgf)) (catProbs probs) cats = mapUnionWith (\(hypos,_,_,addr) (p,fns) -> (hypos,fns,p,addr)) (cats (abstract pgf)) (catProbs probs)
}} }}
where where
mapUnionWith f map1 map2 = mapUnionWith f map1 map2 =
@@ -102,7 +107,7 @@ rankTreesByProbs pgf ts = sortBy (\ (_,p) (_,q) -> compare q p)
mkProbDefs :: PGF -> ([[CId]],[(CId,Type,[Equation])]) mkProbDefs :: PGF -> ([[CId]],[(CId,Type,[Equation])])
mkProbDefs pgf = mkProbDefs pgf =
let cs = [(c,hyps,fns) | (c,(hyps0,fs,_)) <- Map.toList (cats (abstract pgf)), let cs = [(c,hyps,fns) | (c,(hyps0,fs,_,_)) <- Map.toList (cats (abstract pgf)),
not (elem c [cidString,cidInt,cidFloat]), not (elem c [cidString,cidInt,cidFloat]),
let hyps = zipWith (\(bt,_,ty) n -> (bt,mkCId ('v':show n),ty)) let hyps = zipWith (\(bt,_,ty) n -> (bt,mkCId ('v':show n),ty))
hyps0 hyps0

View File

@@ -121,8 +121,8 @@ runTcM abstr f ms s = unTcM f abstr (\x ms s cp b -> let (es,xs) = cp b
lookupCatHyps :: CId -> TcM s [Hypo] lookupCatHyps :: CId -> TcM s [Hypo]
lookupCatHyps cat = TcM (\abstr k h ms -> case Map.lookup cat (cats abstr) of lookupCatHyps cat = TcM (\abstr k h ms -> case Map.lookup cat (cats abstr) of
Just (hyps,_,_) -> k hyps ms Just (hyps,_,_,_) -> k hyps ms
Nothing -> h (UnknownCat cat)) Nothing -> h (UnknownCat cat))
lookupFunType :: CId -> TcM s Type lookupFunType :: CId -> TcM s Type
lookupFunType fun = TcM (\abstr k h ms -> case Map.lookup fun (funs abstr) of lookupFunType fun = TcM (\abstr k h ms -> case Map.lookup fun (funs abstr) of
@@ -143,8 +143,8 @@ typeGenerators scope cat = fmap normalize (liftM2 (++) x y)
| cat == cidString = return [(1.0,ELit (LStr "Foo"),TTyp [] (DTyp [] cat []))] | cat == cidString = return [(1.0,ELit (LStr "Foo"),TTyp [] (DTyp [] cat []))]
| otherwise = TcM (\abstr k h ms -> | otherwise = TcM (\abstr k h ms ->
case Map.lookup cat (cats abstr) of case Map.lookup cat (cats abstr) of
Just (_,fns,_) -> unTcM (mapM helper fns) abstr k h ms Just (_,fns,_,_) -> unTcM (mapM helper fns) abstr k h ms
Nothing -> h (UnknownCat cat)) Nothing -> h (UnknownCat cat))
helper (p,fn) = do helper (p,fn) = do
ty <- lookupFunType fn ty <- lookupFunType fn

File diff suppressed because it is too large Load Diff

View File

@@ -1,31 +0,0 @@
Phr * 1e-5
Phr A 4.276438451840496e-3
Phr AP 2.2906320842384386e-3
Phr AdA 2.9594729770522466e-6
Phr AdV 3.5217728426921733e-3
Phr Adv 4.47205961562365e-2
Phr CN 5.918945954104493e-6
Phr Cl 0.23274183333431983
Phr ClSlash 2.0716310839365724e-5
Phr Conj 1.2255177597973352e-2
Phr Det 1.8052785160018703e-4
Phr IAdv 2.9594729770522466e-6
Phr IP 2.9594729770522466e-6
Phr N 8.452254822461217e-3
Phr NP 7.473557108950038e-2
Phr Num 1.1837891908208986e-5
Phr Ord 1.7460890564608255e-4
Phr Predet 2.6635256793470218e-5
Phr Prep 7.56145345636849e-3
Phr Pron 2.3675783816417973e-5
Phr QS 4.143262167873145e-5
Phr Quant 4.03080219474516e-3
Phr RP 6.392461630432853e-4
Phr RS 1.021018177083025e-3
Phr S 5.920425690593019e-2
Phr Subj 6.392461630432853e-4
Phr V 1.071033270395208e-2
Phr V2 1.2784923260865705e-3
Phr VPS 4.751729811955087e-2
Phr VS 1.4797364885261233e-5
Phr _ 1.1965149246222233e-9

View File

@@ -3,32 +3,31 @@ import qualified Data.Map as Map
import Data.Maybe import Data.Maybe
import Data.List import Data.List
grammar_name = "ParseEngAbs.pgf"
treebank_name = "log4.txt"
chunk_cats = map mkCId
["A", "AP", "AdA", "AdV", "Adv", "CN", "Cl", "ClSlash", "Conj", "Det",
"IAdv", "IP", "N", "NP", "Num", "Ord", "Predet", "Prep", "Pron", "QS",
"Quant", "RP", "RS", "S", "Subj", "V", "V2", "VPS", "VS"]
main = do main = do
pgf <- readPGF "ParseEngAbs.pgf" pgf <- readPGF grammar_name
ls <- fmap (filterExprs . zip [1..] . lines) $ readFile "log4.txt" ls <- fmap (filterExprs . zip [1..] . lines) $ readFile treebank_name
putStrLn "" putStrLn ""
putStrLn ("trees: "++show (length ls)) putStrLn ("trees: "++show (length ls))
let stats = foldl' (collectStats pgf) let (_,cat,_) = unType (startCat pgf)
stats = foldl' (collectStats pgf)
(initStats pgf) (initStats pgf)
[(n,fromMaybe (error l) (readExpr (toQ l)),Just (mkCId "Phr"),Nothing) | (n,l) <- ls] [(n,fromMaybe (error l) (readExpr (toQ l)),Just cat) | (n,l) <- ls]
putStrLn ("coverage: "++show (coverage stats)) putStrLn ("coverage: "++show (coverage stats))
putStrLn ("Writing ParseEngAbs.probs...") putStrLn ("Writing ParseEngAbs.probs...")
writeFile "ParseEngAbs.probs" (unlines [show f ++ "\t" ++ show p | (f,p) <- uprobs pgf stats]) writeFile "ParseEngAbs.probs" (unlines [show f ++ "\t" ++ show p | (f,p) <- probs pgf stats])
putStrLn ("Writing ParseEngAbs2.probs...")
writeFile "ParseEngAbs2.probs" (unlines [show cat1 ++ "\t" ++ show cat2 ++ "\t" ++ show p | (cat1,cat2,p) <- mprobs pgf stats])
putStrLn ("Writing global.probs...")
writeFile "global.probs" (unlines [show f ++ "\t" ++ show p | (f,p) <- gprobs pgf stats])
putStrLn ("Writing categories.probs...")
writeFile "categories.probs" (unlines [show f ++ "\t" ++ show p | (f,p) <- cprobs pgf stats])
where where
toQ [] = [] toQ [] = []
toQ ('[':cs) = let (xs,']':ys) = break (==']') cs
in toQ ('?' : ys)
toQ ('?':cs) = 'Q' : toQ cs toQ ('?':cs) = 'Q' : toQ cs
toQ (c:cs) = c : toQ cs toQ (c:cs) = c : toQ cs
@@ -42,31 +41,26 @@ initStats pgf =
(Map.fromListWith (+) (Map.fromListWith (+)
([(f,1) | f <- functions pgf] ++ ([(f,1) | f <- functions pgf] ++
[(cat pgf f,1) | f <- functions pgf]) [(cat pgf f,1) | f <- functions pgf])
,Map.empty
,0 ,0
) )
collectStats pgf (ustats,bstats,count) (n,e,mb_cat1,mb_cat2) = collectStats pgf (ustats,count) (n,e,mb_cat1) =
case unApp e of case unApp e of
Just (f,args) -> let fcat2 = cat2 pgf f n e Just (f,args) -> let fcat2 = cat2 pgf f n e
fcat = fromMaybe (cat2 pgf f n e) mb_cat1 fcat = fromMaybe fcat2 mb_cat1
cf = fromMaybe 0 (Map.lookup f ustats) cf = fromMaybe 0 (Map.lookup f ustats)
cc = fromMaybe 0 (Map.lookup fcat ustats) cc = fromMaybe 0 (Map.lookup fcat ustats)
in if isJust mb_cat1 && f /= mkCId "Q" && fcat /= fcat2 in if isJust mb_cat1 && f /= mkCId "Q" && fcat /= fcat2
then error (show n ++ ": " ++ showExpr [] e) then error (show n ++ ": " ++ showExpr [] e)
else else
cf `seq` cc `seq` bstats `seq` count `seq` cf `seq` cc `seq` count `seq`
foldl' (collectStats pgf) foldl' (collectStats pgf)
(Map.insert f (cf+1) (Map.insert fcat (cc+1) ustats) (Map.insert f (cf+1) (Map.insert fcat (cc+1) ustats)
,(if null args
then Map.insertWith (+) (fcat,wildCId) 1
else id)
(maybe bstats (\cat2 -> Map.insertWith (+) (cat2,fcat) 1 bstats) mb_cat2)
,count+1 ,count+1
) )
(zipWith3 (\e mb_cat1 mb_cat2 -> (n,e,mb_cat1,mb_cat2)) args (argCats f) (repeat (Just fcat))) (zipWith (\e mb_cat1 -> (n,e,mb_cat1)) args (argCats f))
Nothing -> case unStr e of Nothing -> case unStr e of
Just _ -> (ustats,bstats,count+1) Just _ -> (ustats,count+1)
Nothing -> error ("collectStats ("++showExpr [] e++")") Nothing -> error ("collectStats ("++showExpr [] e++")")
where where
argCats f = argCats f =
@@ -75,44 +69,25 @@ collectStats pgf (ustats,bstats,count) (n,e,mb_cat1,mb_cat2) =
in map tyCat arg_tys in map tyCat arg_tys
Nothing -> repeat Nothing Nothing -> repeat Nothing
coverage (ustats,bstats,count) = coverage (stats,gcount) =
let c = fromMaybe 0 (Map.lookup (mkCId "Q") ustats) let c = fromMaybe 0 (Map.lookup (mkCId "Q") stats)
in (fromIntegral (count - c) / fromIntegral count) * 100 in (fromIntegral (gcount - c) / fromIntegral gcount) * 100
uprobs pgf (ustats,bstats,count) = probs pgf (stats,gcount) =
[toProb f (cat pgf f) | f <- functions pgf] [toFProb f (cat pgf f) | f <- functions pgf] ++
[toCProb c | c <- chunk_cats]
where where
toProb f cat = toFProb f cat =
let count = fromMaybe 0 (Map.lookup f ustats) let count = fromMaybe 0 (Map.lookup f stats)
cat_mass = fromMaybe 0 (Map.lookup cat ustats) cat_mass = fromMaybe 0 (Map.lookup cat stats)
in (f, fromIntegral count / fromIntegral cat_mass :: Double) in (f, fromIntegral count / fromIntegral cat_mass :: Double)
mprobs pgf (ustats,bstats,count) = toCProb c =
concat [toProb cat | cat <- categories pgf] let ccount = fromMaybe 0 (Map.lookup c stats)
where in (c, fromIntegral ccount / fromIntegral chunk_mass :: Double)
toProb cat =
let mass = sum [count | ((cat1,cat2),count) <- Map.toList bstats, cat1==cat]
cat_count = fromMaybe 0 (Map.lookup cat ustats)
fun_count = sum [fromMaybe 0 (Map.lookup f ustats) | f <- functionsByCat pgf cat]
in (cat,mkCId "*",if cat_count == 0 then 0 else fromIntegral (cat_count - fun_count) / fromIntegral cat_count) :
[(cat1,cat2,fromIntegral count / fromIntegral mass)
| ((cat1,cat2),count) <- Map.toList bstats, cat1==cat]
gprobs pgf (ustats,bstats,count) = chunk_mass =
sortBy (\x y -> compare (snd y) (snd x)) [toProb f | f <- functions pgf] sum [fromMaybe 0 (Map.lookup c stats) | c <- chunk_cats]
where
toProb f =
let fcount = fromMaybe 0 (Map.lookup f ustats)
in (f, fromIntegral fcount / fromIntegral count :: Double)
cprobs pgf (ustats,bstats,count) =
sortBy (\x y -> compare (snd y) (snd x)) [toProb c | c <- categories pgf]
where
mass = sum [fromMaybe 0 (Map.lookup c ustats) | c <- categories pgf]
toProb c =
let fcount = fromMaybe 0 (Map.lookup c ustats)
in (c, fromIntegral fcount / fromIntegral mass :: Double)
cat pgf f = cat pgf f =
case fmap unType (functionType pgf f) of case fmap unType (functionType pgf f) of