1
0
forked from GitHub/gf-core

bugfix for robust parsing with multi-word units

This commit is contained in:
kr.angelov
2012-12-11 12:57:22 +00:00
parent d0d4583bb8
commit 3182e382dc
3 changed files with 87 additions and 156 deletions

View File

@@ -131,7 +131,6 @@ GU_DEFINE_TYPE(
GU_MEMBER(PgfProductionExtern, lins, GuSeq)),
GU_CONSTRUCTOR_S(
PGF_PRODUCTION_META, PgfProductionMeta,
GU_MEMBER(PgfProductionMeta, lins, GuSeq),
GU_MEMBER(PgfProductionMeta, args, PgfPArgs)));
GU_DEFINE_TYPE(PgfProductions, GuList, gu_type(PgfProduction));

View File

@@ -337,7 +337,6 @@ typedef struct {
typedef struct {
PgfExprProb *ep;
GuSeq lins;
PgfPArgs args;
} PgfProductionMeta;

View File

@@ -197,22 +197,14 @@ pgf_item_sequence_length(PgfItem* item)
}
}
case PGF_PRODUCTION_META: {
PgfProductionMeta* pmeta = i.data;
PgfSequence seq;
if (!gu_seq_is_null(pmeta->lins) &&
!gu_seq_is_null(seq = gu_seq_get(pmeta->lins,PgfSequence,item->conts->lin_idx))) {
return gu_seq_length(seq);
} else {
int seq_len = 0;
PgfSymbol sym = item->curr_sym;
while (!gu_variant_is_null(sym)) {
seq_len++;
sym = pgf_prev_extern_sym(sym);
}
return seq_len;
int seq_len = 0;
PgfSymbol sym = item->curr_sym;
while (!gu_variant_is_null(sym)) {
seq_len++;
sym = pgf_prev_extern_sym(sym);
}
return seq_len;
}
default:
gu_impossible();
@@ -268,12 +260,7 @@ pgf_item_sequence(PgfItem* item,
break;
}
case PGF_PRODUCTION_META: {
PgfProductionMeta* pmeta = i.data;
if (gu_seq_is_null(pmeta->lins) ||
gu_seq_is_null(*seq = gu_seq_get(pmeta->lins,PgfSequence,item->conts->lin_idx))) {
*seq = pgf_extern_seq_get(item, pool);
}
*seq = pgf_extern_seq_get(item, pool);
break;
}
default:
@@ -760,62 +747,20 @@ pgf_parsing_combine(PgfParseState* before, PgfParseState* after,
return;
}
bool extend = false;
GuVariantInfo i = gu_variant_open(cont->prod);
if (i.tag == PGF_PRODUCTION_META) {
PgfProductionMeta* pmeta = i.data;
if (gu_seq_is_null(pmeta->lins) ||
gu_seq_is_null(gu_seq_get(pmeta->lins,PgfSequence,cont->conts->lin_idx))) {
extend = true;
}
}
PgfItem* item = NULL;
if (!extend) {
switch (gu_variant_tag(cont->curr_sym)) {
case PGF_SYMBOL_CAT: {
PgfSymbolCat* scat = gu_variant_data(cont->curr_sym);
item = pgf_item_update_arg(cont, scat->d, cat, before->pool, before->ps);
break;
}
case PGF_SYMBOL_LIT: {
PgfSymbolLit* slit = gu_variant_data(cont->curr_sym);
item = pgf_item_update_arg(cont, slit->d, cat, before->pool, before->ps);
break;
}
default:
gu_impossible();
}
} else {
if (before->meta_item != NULL)
return;
item = pgf_item_copy(cont, before->pool, before->ps);
size_t nargs = gu_seq_length(cont->args);
item->args = gu_new_seq(PgfPArg, nargs+1, before->pool);
memcpy(gu_seq_data(item->args), gu_seq_data(cont->args),
nargs * sizeof(PgfPArg));
gu_seq_set(item->args, PgfPArg, nargs,
((PgfPArg) { .hypos = NULL, .ccat = cat }));
PgfCIdMap* meta_child_probs =
item->conts->ccat->cnccat->abscat->meta_child_probs;
item->inside_prob +=
cat->viterbi_prob+
gu_map_get(meta_child_probs, cat->cnccat->abscat, prob_t);
PgfSymbol prev = item->curr_sym;
PgfSymbolCat* scat = (PgfSymbolCat*)
gu_alloc_variant(PGF_SYMBOL_CAT,
sizeof(PgfSymbolCat)+sizeof(PgfSymbol),
gu_alignof(PgfSymbolCat),
&item->curr_sym, before->pool);
*((PgfSymbol*)(scat+1)) = prev;
scat->d = nargs;
scat->r = lin_idx;
before->meta_item = item;
switch (gu_variant_tag(cont->curr_sym)) {
case PGF_SYMBOL_CAT: {
PgfSymbolCat* scat = gu_variant_data(cont->curr_sym);
item = pgf_item_update_arg(cont, scat->d, cat, before->pool, before->ps);
break;
}
case PGF_SYMBOL_LIT: {
PgfSymbolLit* slit = gu_variant_data(cont->curr_sym);
item = pgf_item_update_arg(cont, slit->d, cat, before->pool, before->ps);
break;
}
default:
gu_impossible();
}
pgf_item_advance(item, before->pool);
@@ -893,38 +838,12 @@ pgf_parsing_new_production(PgfItem* item, PgfExprProb *ep, GuPool *pool)
break;
}
case PGF_PRODUCTION_META: {
PgfProductionMeta* pmeta = i.data;
PgfProductionMeta* new_pmeta =
gu_new_variant(PGF_PRODUCTION_META,
PgfProductionMeta,
&prod, pool);
new_pmeta->ep = ep;
new_pmeta->lins = pmeta->lins;
new_pmeta->args = item->args;
if (gu_seq_is_null(pmeta->lins) ||
gu_seq_is_null(gu_seq_get(pmeta->lins,PgfSequence,item->conts->lin_idx))) {
PgfSequence seq =
pgf_extern_seq_get(item, pool);
size_t n_lins = item->conts->ccat->cnccat->n_lins;
new_pmeta->lins = gu_new_seq(PgfSequence, n_lins, pool);
if (gu_seq_is_null(pmeta->lins)) {
for (size_t i = 0; i < n_lins; i++) {
gu_seq_set(new_pmeta->lins,PgfSequence,i,
gu_null_seq);
}
} else {
for (size_t i = 0; i < n_lins; i++) {
gu_seq_set(new_pmeta->lins,PgfSequence,i,
gu_seq_get(pmeta->lins,PgfSequence,i));
}
}
gu_seq_set(new_pmeta->lins,PgfSequence,item->conts->lin_idx,seq);
}
break;
}
default:
@@ -1064,8 +983,7 @@ pgf_parsing_bu_filter(PgfParseState* before, PgfParseState* after,
static void
pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after,
PgfItem* item, PgfCCat* ccat, size_t lin_idx,
prob_t delta_prob)
PgfItem* item, PgfCCat* ccat, size_t lin_idx)
{
PgfItemConts* conts =
pgf_parsing_get_conts(before->conts_map,
@@ -1079,8 +997,7 @@ pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after,
conts->outside_prob =
item->inside_prob-conts->ccat->viterbi_prob+
item->conts->outside_prob +
delta_prob;
item->conts->outside_prob;
// Top-down prediction for syntactic rules
PgfProductionSeq prods = ccat->prods;
@@ -1143,6 +1060,26 @@ pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after,
}
}
/*
 * Let a meta item consume the current token.
 *
 * Works on an item obtained from pgf_item_copy(meta_item, ...):
 *   - adds meta_prob to its inside_prob;
 *   - allocates a PGF_SYMBOL_KS variant from after->pool and installs it
 *     as the item's curr_sym, with the previous curr_sym stored in the
 *     extra PgfSymbol slot allocated right behind the PgfSymbolKS;
 *   - gives the new symbol a one-element token sequence holding
 *     after->ts->tok;
 *   - pushes the item on before->agenda, ordered by pgf_item_prob_order.
 *
 * before     parse state supplying the pool/ps for the copy and the agenda
 * after      parse state supplying the token and the pool for the symbol
 * meta_item  item to extend (only passed to pgf_item_copy here)
 * meta_prob  amount added to the copied item's inside_prob
 */
static void
pgf_parsing_meta_scan(PgfParseState* before, PgfParseState* after,
                      PgfItem* meta_item, prob_t meta_prob)
{
	PgfItem* scanned = pgf_item_copy(meta_item, before->pool, before->ps);
	scanned->inside_prob += meta_prob;

	// Save the current symbol first: &scanned->curr_sym is handed to
	// gu_alloc_variant below as the slot that receives the new variant.
	PgfSymbol older_sym = scanned->curr_sym;

	// Room for the PgfSymbolKS itself plus one trailing PgfSymbol.
	const size_t ks_size = sizeof(PgfSymbolKS) + sizeof(PgfSymbol);
	PgfSymbolKS* tok_sym = (PgfSymbolKS*)
		gu_alloc_variant(PGF_SYMBOL_KS, ks_size,
		                 gu_alignof(PgfSymbolKS),
		                 &scanned->curr_sym, after->pool);

	// The trailing slot links back to the symbol that was current before.
	PgfSymbol* back_link = (PgfSymbol*) (tok_sym + 1);
	*back_link = older_sym;

	tok_sym->tokens = gu_new_seq(PgfToken, 1, after->pool);
	gu_seq_set(tok_sym->tokens, PgfToken, 0, after->ts->tok);

	gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &scanned);
}
typedef struct {
GuMapItor fn;
PgfParseState* before;
@@ -1182,8 +1119,29 @@ pgf_parsing_meta_predict(GuMapItor* fn, const void* key, void* value, GuExn* err
continue;
}
pgf_parsing_td_predict(before, after,
meta_item, ccat, lin_idx, meta_prob);
PgfItem* item =
pgf_item_copy(meta_item, before->pool, before->ps);
item->inside_prob +=
ccat->viterbi_prob+meta_prob;
size_t nargs = gu_seq_length(meta_item->args);
item->args = gu_new_seq(PgfPArg, nargs+1, before->pool);
memcpy(gu_seq_data(item->args), gu_seq_data(meta_item->args),
nargs * sizeof(PgfPArg));
gu_seq_set(item->args, PgfPArg, nargs,
((PgfPArg) { .hypos = NULL, .ccat = ccat }));
PgfSymbol prev = item->curr_sym;
PgfSymbolCat* scat = (PgfSymbolCat*)
gu_alloc_variant(PGF_SYMBOL_CAT,
sizeof(PgfSymbolCat)+sizeof(PgfSymbol),
gu_alignof(PgfSymbolCat),
&item->curr_sym, before->pool);
*((PgfSymbol*)(scat+1)) = prev;
scat->d = nargs;
scat->r = lin_idx;
gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item);
}
}
}
@@ -1210,7 +1168,7 @@ pgf_parsing_symbol(PgfParseState* before, PgfParseState* after,
return;
}
pgf_parsing_td_predict(before, after, item, parg->ccat, scat->r, 0);
pgf_parsing_td_predict(before, after, item, parg->ccat, scat->r);
break;
}
case PGF_SYMBOL_KS: {
@@ -1298,7 +1256,7 @@ pgf_parsing_symbol(PgfParseState* before, PgfParseState* after,
if (parg->ccat->fid > 0 &&
parg->ccat->fid >= before->ps->concr->total_cats) {
pgf_parsing_td_predict(before, after, item, parg->ccat, slit->r, 0);
pgf_parsing_td_predict(before, after, item, parg->ccat, slit->r);
}
else {
PgfItemConts* conts =
@@ -1382,9 +1340,7 @@ pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item)
pgf_parsing_complete(before, after, item, NULL);
pgf_item_free(before, after, item);
} else {
PgfSymbol sym =
gu_seq_get(seq, PgfSymbol, item->seq_idx);
pgf_parsing_symbol(before, after, item, sym);
pgf_parsing_symbol(before, after, item, item->curr_sym);
}
break;
}
@@ -1409,7 +1365,7 @@ pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item)
pgf_parsing_td_predict(before, after, item,
pcoerce->coerce,
item->conts->lin_idx, 0);
item->conts->lin_idx);
break;
case 1:
pgf_parsing_complete(before, after, item, NULL);
@@ -1469,20 +1425,11 @@ pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item)
break;
}
case PGF_PRODUCTION_META: {
PgfProductionMeta* pmeta = i.data;
PgfSequence seq;
if (!gu_seq_is_null(pmeta->lins) &&
!gu_seq_is_null(seq = gu_seq_get(pmeta->lins,PgfSequence,item->conts->lin_idx))) {
if (item->seq_idx == gu_seq_length(seq)) {
pgf_parsing_complete(before, after, item, NULL);
pgf_item_free(before, after, item);
} else {
PgfSymbol sym =
gu_seq_get(seq, PgfSymbol, item->seq_idx);
pgf_parsing_symbol(before, after, item, sym);
}
} else {
if (item->seq_idx == pgf_item_sequence_length(item)) {
if (before->meta_item != NULL)
break;
before->meta_item = item;
if (after == NULL) {
PgfExprProb *ep = gu_new(PgfExprProb, before->pool);
ep->expr = before->ps->meta_var;
@@ -1494,34 +1441,21 @@ pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item)
}
pgf_parsing_complete(before, after, item, ep);
} else {
if (after->ts->lexicon_idx == NULL) {
prob_t meta_token_prob =
item->conts->ccat->cnccat->abscat->meta_token_prob;
if (meta_token_prob == INFINITY)
break;
item->inside_prob += meta_token_prob;
prob_t meta_token_prob =
item->conts->ccat->cnccat->abscat->meta_token_prob;
if (meta_token_prob != INFINITY) {
pgf_parsing_meta_scan(before, after, item, meta_token_prob);
}
PgfSymbol prev = item->curr_sym;
PgfSymbolKS* sks = (PgfSymbolKS*)
gu_alloc_variant(PGF_SYMBOL_KS,
sizeof(PgfSymbolKS)+sizeof(PgfSymbol),
gu_alignof(PgfSymbolKS),
&item->curr_sym, after->pool);
*((PgfSymbol*)(sks+1)) = prev;
sks->tokens = gu_new_seq(PgfToken, 1, after->pool);
gu_seq_set(sks->tokens, PgfToken, 0, after->ts->tok);
item->seq_idx++;
pgf_parsing_add_transition(before, after, after->ts->tok, item);
} else {
PgfCIdMap* meta_child_probs =
item->conts->ccat->cnccat->abscat->meta_child_probs;
if (meta_child_probs != NULL) {
PgfMetaPredictFn clo = { { pgf_parsing_meta_predict }, before, after, item };
gu_map_iter(meta_child_probs, &clo.fn, NULL);
}
PgfCIdMap* meta_child_probs =
item->conts->ccat->cnccat->abscat->meta_child_probs;
if (meta_child_probs != NULL) {
PgfMetaPredictFn clo = { { pgf_parsing_meta_predict }, before, after, item };
gu_map_iter(meta_child_probs, &clo.fn, NULL);
}
}
} else {
pgf_parsing_symbol(before, after, item, item->curr_sym);
}
break;
}
@@ -1609,7 +1543,6 @@ pgf_new_parsing(PgfConcr* concr, GuPool* pool)
PgfProductionMeta,
&ps->meta_prod, pool);
pmeta->ep = NULL;
pmeta->lins = gu_null_seq;
pmeta->args = gu_new_seq(PgfPArg, 0, pool);
return ps;