The content of ParseEngAbs3.probs is now merged with ParseEngAbs.probs. The latter is now retrained. Once the grammar is compiled with the .probs file, nothing more is needed to do robust parsing. The robustness itself is controlled by the flags 'heuristic_search_factor', 'meta_prob' and 'meta_token_prob' in ParseEngAbs.gf.

This commit is contained in:
kr.angelov
2013-11-06 10:21:46 +00:00
parent d094d671bd
commit 475f213c99
30 changed files with 65055 additions and 65108 deletions

View File

@@ -87,9 +87,7 @@ typedef struct {
PgfCId name;
PgfHypos* context;
prob_t meta_prob;
prob_t meta_token_prob;
PgfMetaChildMap* meta_child_probs;
prob_t prob;
void* predicate;
} PgfAbsCat;
@@ -230,6 +228,7 @@ typedef GuSeq PgfCncFuns;
struct PgfConcr {
PgfCId name;
PgfAbstr* abstr;
PgfFlags* cflags;
PgfPrintNames* printnames;
GuMap* ccats;

View File

@@ -63,7 +63,10 @@ typedef struct {
int prod_full_count;
#endif
PgfItem* free_item;
prob_t beam_size;
prob_t heuristic_factor;
prob_t meta_prob;
prob_t meta_token_prob;
} PgfParsing;
typedef enum { BIND_NONE, BIND_HARD, BIND_SOFT } BIND_TYPE;
@@ -1389,12 +1392,14 @@ pgf_parsing_meta_predict(GuMapItor* fn, const void* key, void* value, GuExn* err
{
(void) (err);
PgfAbsCat* abscat = (PgfAbsCat*) key;
prob_t meta_prob = *((prob_t*) value);
PgfAbsCat* abscat = *((PgfAbsCat**) value);
PgfMetaPredictFn* clo = (PgfMetaPredictFn*) fn;
PgfParsing* ps = clo->ps;
PgfItem* meta_item = clo->meta_item;
if (abscat->prob == INFINITY)
return;
PgfCncCat* cnccat =
gu_map_get(ps->concr->cnccats, abscat->name, PgfCncCat*);
if (cnccat == NULL)
@@ -1412,7 +1417,7 @@ pgf_parsing_meta_predict(GuMapItor* fn, const void* key, void* value, GuExn* err
PgfItem* item =
pgf_item_copy(meta_item, ps);
item->inside_prob +=
ccat->viterbi_prob+meta_prob;
ccat->viterbi_prob+abscat->prob;
size_t nargs = gu_seq_length(meta_item->args);
item->args = gu_new_seq(PgfPArg, nargs+1, ps->pool);
@@ -1698,18 +1703,14 @@ pgf_parsing_item(PgfParsing* ps, PgfItem* item)
}
pgf_parsing_complete(ps, item, ep);
} else {
prob_t meta_token_prob =
item->conts->ccat->cnccat->abscat->meta_token_prob;
prob_t meta_token_prob =
ps->meta_token_prob;
if (meta_token_prob != INFINITY) {
pgf_parsing_meta_scan(ps, item, meta_token_prob);
}
PgfCIdMap* meta_child_probs =
item->conts->ccat->cnccat->abscat->meta_child_probs;
if (meta_child_probs != NULL) {
PgfMetaPredictFn clo = { { pgf_parsing_meta_predict }, ps, item };
gu_map_iter(meta_child_probs, &clo.fn, NULL);
}
PgfMetaPredictFn clo = { { pgf_parsing_meta_predict }, ps, item };
gu_map_iter(ps->concr->abstr->cats, &clo.fn, NULL);
}
} else {
pgf_parsing_symbol(ps, item, item->curr_sym);
@@ -1721,22 +1722,38 @@ pgf_parsing_item(PgfParsing* ps, PgfItem* item)
}
}
static prob_t
pgf_parsing_default_beam_size(PgfConcr* concr)
static void
pgf_parsing_set_default_factors(PgfParsing* ps, PgfAbstr* abstr)
{
PgfLiteral lit = gu_map_get(concr->cflags, "beam_size", PgfLiteral);
PgfLiteral lit;
if (gu_variant_is_null(lit))
return 0;
lit =
gu_map_get(abstr->aflags, "heuristic_search_factor", PgfLiteral);
if (!gu_variant_is_null(lit)) {
GuVariantInfo pi = gu_variant_open(lit);
gu_assert (pi.tag == PGF_LITERAL_FLT);
ps->heuristic_factor = ((PgfLiteralFlt*) pi.data)->val;
}
GuVariantInfo pi = gu_variant_open(lit);
gu_assert (pi.tag == PGF_LITERAL_FLT);
return ((PgfLiteralFlt*) pi.data)->val;
lit =
gu_map_get(abstr->aflags, "meta_prob", PgfLiteral);
if (!gu_variant_is_null(lit)) {
GuVariantInfo pi = gu_variant_open(lit);
gu_assert (pi.tag == PGF_LITERAL_FLT);
ps->meta_prob = - log(((PgfLiteralFlt*) pi.data)->val);
}
lit =
gu_map_get(abstr->aflags, "meta_token_prob", PgfLiteral);
if (!gu_variant_is_null(lit)) {
GuVariantInfo pi = gu_variant_open(lit);
gu_assert (pi.tag == PGF_LITERAL_FLT);
ps->meta_token_prob = - log(((PgfLiteralFlt*) pi.data)->val);
}
}
static PgfParsing*
pgf_new_parsing(PgfConcr* concr,
GuString sentence, double heuristics,
pgf_new_parsing(PgfConcr* concr, GuString sentence,
GuPool* pool, GuPool* out_pool)
{
PgfParsing* ps = gu_new(PgfParsing, pool);
@@ -1756,7 +1773,11 @@ pgf_new_parsing(PgfConcr* concr,
ps->prod_full_count = 0;
#endif
ps->free_item = NULL;
ps->beam_size = heuristics;
ps->heuristic_factor = 0;
ps->meta_prob = INFINITY;
ps->meta_token_prob = INFINITY;
pgf_parsing_set_default_factors(ps, concr->abstr);
PgfExprMeta *expr_meta =
gu_new_variant(PGF_EXPR_META,
@@ -2107,7 +2128,7 @@ pgf_parse_result_is_new(PgfExprState* st)
// TODO: s/CId/Cat, add the cid to Cat, make Cat the key to CncCat
static PgfParsing*
pgf_parsing_init(PgfConcr* concr, PgfCId cat, size_t lin_idx,
GuString sentence, double heuristics,
GuString sentence, double heuristic_factor,
GuExn* err,
GuPool* pool, GuPool* out_pool)
{
@@ -2121,12 +2142,13 @@ pgf_parsing_init(PgfConcr* concr, PgfCId cat, size_t lin_idx,
gu_assert(lin_idx < cnccat->n_lins);
if (heuristics < 0) {
heuristics = pgf_parsing_default_beam_size(concr);
PgfParsing* ps =
pgf_new_parsing(concr, sentence, pool, out_pool);
if (heuristic_factor >= 0) {
ps->heuristic_factor = heuristic_factor;
}
PgfParsing* ps =
pgf_new_parsing(concr, sentence, heuristics, pool, out_pool);
PgfParseState* state =
pgf_new_parse_state(ps, 0, BIND_SOFT);
@@ -2156,11 +2178,13 @@ pgf_parsing_init(PgfConcr* concr, PgfCId cat, size_t lin_idx,
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
}
PgfItem *item =
pgf_new_item(ps, conts, ps->meta_prod);
item->inside_prob =
ccat->cnccat->abscat->meta_prob;
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
if (ps->meta_prob != INFINITY) {
PgfItem *item =
pgf_new_item(ps, conts, ps->meta_prod);
item->inside_prob =
ps->meta_prob;
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
}
}
}
@@ -2200,7 +2224,7 @@ pgf_parsing_proceed(PgfParsing* ps)
prob_t state_delta =
(st->viterbi_prob-(st->next ? st->next->viterbi_prob : 0))*
ps->beam_size;
ps->heuristic_factor;
delta_prob += state_delta;
st = st->next;
}

View File

@@ -35,63 +35,6 @@ pgf_read(const char* fpath,
return pgf;
}
/**
 * Load meta probabilities for abstract categories from a text file.
 *
 * The file is read as a sequence of whitespace-separated records
 * "cat1 cat2 prob", where each category name is at most 20 characters.
 * Every probability is stored as its negative logarithm. The second
 * field selects what is updated on the category named by cat1:
 *
 *   - "*"   sets cat1's meta_prob;
 *   - "_"   sets cat1's meta_token_prob;
 *   - other names a second category; the value is stored in cat1's
 *           meta_child_probs map under that category. The map is
 *           created from `pool` the first time it is needed.
 *
 * Reading stops at the first record that does not yield all three
 * fields, which includes end of file.
 *
 * @param pgf    grammar whose abstract categories are updated
 * @param fpath  path of the probabilities file
 * @param pool   pool used to allocate meta_child_probs maps
 * @param err    receives an errno-based exception if the file cannot be
 *               opened, or a PgfExn if a category name is not found in
 *               pgf->abstract.cats
 */
void
pgf_load_meta_child_probs(PgfPGF* pgf, const char* fpath,
                          GuPool* pool, GuExn* err)
{
	FILE *fp = fopen(fpath, "r");
	if (!fp) {
		gu_raise_errno(err);
		return;
	}

	for (;;) {
		char cat1[21];
		char cat2[21];
		/* %f stores through a float*, so scan into a real float and
		 * convert afterwards instead of depending on what prob_t is. */
		float raw_prob;

		if (fscanf(fp, "%20s\t%20s\t%f", cat1, cat2, &raw_prob) < 3) {
			break;
		}

		prob_t prob = - log(raw_prob);

		PgfAbsCat* abscat1 =
			gu_map_get(pgf->abstract.cats, cat1, PgfAbsCat*);
		if (abscat1 == NULL) {
			GuExnData* exn = gu_raise(err, PgfExn);
			/* NOTE(review): guarded in case gu_raise can return no data
			 * record -- confirm against the libgu exception API. */
			if (exn != NULL) {
				exn->data = "Unknown category name";
			}
			goto close;
		}

		if (strcmp(cat2, "*") == 0) {
			abscat1->meta_prob = prob;
		} else if (strcmp(cat2, "_") == 0) {
			abscat1->meta_token_prob = prob;
		} else {
			PgfAbsCat* abscat2 =
				gu_map_get(pgf->abstract.cats, cat2, PgfAbsCat*);
			if (abscat2 == NULL) {
				/* Raise once; the original raised twice and discarded
				 * the first result. */
				GuExnData* exn = gu_raise(err, PgfExn);
				if (exn != NULL) {
					exn->data = "Unknown category name";
				}
				goto close;
			}

			if (abscat1->meta_child_probs == NULL) {
				abscat1->meta_child_probs =
					gu_map_type_new(PgfMetaChildMap, pool);
			}
			gu_map_put(abscat1->meta_child_probs, abscat2, prob_t, prob);
		}
	}

close:
	fclose(fp);
}
GuString
pgf_abstract_name(PgfPGF* pgf)
{

View File

@@ -80,11 +80,6 @@ pgf_read(const char* fpath,
*
*/
/*
 * Read "cat1 cat2 prob" records from the text file at fpath and store
 * -log(prob) on the abstract category named by cat1: as its meta_prob
 * when cat2 is "*", as its meta_token_prob when cat2 is "_", and
 * otherwise in its meta_child_probs map (allocated from pool) under the
 * category named by cat2. Failure to open the file or an unknown
 * category name is reported through err.
 */
void
pgf_load_meta_child_probs(PgfPGF*, const char* fpath,
GuPool* pool, GuExn* err);
GuString
pgf_abstract_name(PgfPGF*);

View File

@@ -48,7 +48,7 @@ pgf_print_cat(GuMapItor* fn, const void* key, void* value,
ctxt = next;
}
gu_printf(out, err, " ; -- %f\n",cat->meta_prob);
gu_printf(out, err, " ; -- %f\n", cat->prob);
}
void

View File

@@ -516,10 +516,6 @@ pgf_read_abscat(PgfReader* rdr, PgfAbstr* abstr, PgfCIdMap* abscats)
gu_return_on_exn(rdr->err, NULL);
}
abscat->meta_prob = INFINITY;
abscat->meta_token_prob = INFINITY;
abscat->meta_child_probs = NULL;
GuBuf* functions = gu_new_buf(PgfAbsFun*, rdr->tmp_pool);
size_t n_functions = pgf_read_len(rdr);
@@ -538,6 +534,8 @@ pgf_read_abscat(PgfReader* rdr, PgfAbstr* abstr, PgfCIdMap* abscats)
gu_buf_push(functions, PgfAbsFun*, absfun);
}
abscat->prob = - log(gu_in_f64be(rdr->in, rdr->err));
pgf_jit_predicate(rdr->jit_state, abscats, abscat, functions);
return abscat;
@@ -1155,6 +1153,8 @@ pgf_read_concrete(PgfReader* rdr, PgfAbstr* abstr, PgfAbsFun* abs_lin_fun)
pgf_read_cid(rdr, rdr->opool);
gu_return_on_exn(rdr->err, NULL);
concr->abstr = abstr;
concr->cflags =
pgf_read_flags(rdr);
gu_return_on_exn(rdr->err, NULL);

View File

@@ -53,18 +53,17 @@ int main(int argc, char* argv[]) {
// Create the pool that is used to allocate everything
GuPool* pool = gu_new_pool();
int status = EXIT_SUCCESS;
if (argc < 5 || argc > 6) {
fprintf(stderr, "usage: %s pgf cat from-lang to-lang [probs-file]\n", argv[0]);
if (argc < 5) {
fprintf(stderr, "usage: %s pgf cat from-lang to-lang\n", argv[0]);
status = EXIT_FAILURE;
goto fail;
}
char* filename = argv[1];
GuString filename = argv[1];
GuString cat = argv[2];
GuString from_lang = argv[3];
GuString to_lang = argv[4];
// Create an exception frame that catches all errors.
GuExn* err = gu_new_exn(NULL, gu_kind(type), pool);
@@ -78,16 +77,6 @@ int main(int argc, char* argv[]) {
goto fail;
}
if (argc == 6) {
char* meta_probs_filename = argv[5];
pgf_load_meta_child_probs(pgf, meta_probs_filename, pool, err);
if (!gu_ok(err)) {
fprintf(stderr, "Loading meta child probs failed\n");
status = EXIT_FAILURE;
goto fail;
}
}
// Look up the source and destination concrete categories
PgfConcr* from_concr = pgf_get_language(pgf, from_lang);
PgfConcr* to_concr = pgf_get_language(pgf, to_lang);