forked from GitHub/gf-core
bugfix for robust parsing with multi-word units
This commit is contained in:
@@ -131,7 +131,6 @@ GU_DEFINE_TYPE(
|
|||||||
GU_MEMBER(PgfProductionExtern, lins, GuSeq)),
|
GU_MEMBER(PgfProductionExtern, lins, GuSeq)),
|
||||||
GU_CONSTRUCTOR_S(
|
GU_CONSTRUCTOR_S(
|
||||||
PGF_PRODUCTION_META, PgfProductionMeta,
|
PGF_PRODUCTION_META, PgfProductionMeta,
|
||||||
GU_MEMBER(PgfProductionMeta, lins, GuSeq),
|
|
||||||
GU_MEMBER(PgfProductionMeta, args, PgfPArgs)));
|
GU_MEMBER(PgfProductionMeta, args, PgfPArgs)));
|
||||||
|
|
||||||
GU_DEFINE_TYPE(PgfProductions, GuList, gu_type(PgfProduction));
|
GU_DEFINE_TYPE(PgfProductions, GuList, gu_type(PgfProduction));
|
||||||
|
|||||||
@@ -337,7 +337,6 @@ typedef struct {
|
|||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
PgfExprProb *ep;
|
PgfExprProb *ep;
|
||||||
GuSeq lins;
|
|
||||||
PgfPArgs args;
|
PgfPArgs args;
|
||||||
} PgfProductionMeta;
|
} PgfProductionMeta;
|
||||||
|
|
||||||
|
|||||||
@@ -197,22 +197,14 @@ pgf_item_sequence_length(PgfItem* item)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
case PGF_PRODUCTION_META: {
|
case PGF_PRODUCTION_META: {
|
||||||
PgfProductionMeta* pmeta = i.data;
|
int seq_len = 0;
|
||||||
PgfSequence seq;
|
PgfSymbol sym = item->curr_sym;
|
||||||
|
while (!gu_variant_is_null(sym)) {
|
||||||
if (!gu_seq_is_null(pmeta->lins) &&
|
seq_len++;
|
||||||
!gu_seq_is_null(seq = gu_seq_get(pmeta->lins,PgfSequence,item->conts->lin_idx))) {
|
sym = pgf_prev_extern_sym(sym);
|
||||||
return gu_seq_length(seq);
|
|
||||||
} else {
|
|
||||||
int seq_len = 0;
|
|
||||||
PgfSymbol sym = item->curr_sym;
|
|
||||||
while (!gu_variant_is_null(sym)) {
|
|
||||||
seq_len++;
|
|
||||||
sym = pgf_prev_extern_sym(sym);
|
|
||||||
}
|
|
||||||
|
|
||||||
return seq_len;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return seq_len;
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
gu_impossible();
|
gu_impossible();
|
||||||
@@ -268,12 +260,7 @@ pgf_item_sequence(PgfItem* item,
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case PGF_PRODUCTION_META: {
|
case PGF_PRODUCTION_META: {
|
||||||
PgfProductionMeta* pmeta = i.data;
|
*seq = pgf_extern_seq_get(item, pool);
|
||||||
|
|
||||||
if (gu_seq_is_null(pmeta->lins) ||
|
|
||||||
gu_seq_is_null(*seq = gu_seq_get(pmeta->lins,PgfSequence,item->conts->lin_idx))) {
|
|
||||||
*seq = pgf_extern_seq_get(item, pool);
|
|
||||||
}
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
@@ -760,62 +747,20 @@ pgf_parsing_combine(PgfParseState* before, PgfParseState* after,
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool extend = false;
|
|
||||||
GuVariantInfo i = gu_variant_open(cont->prod);
|
|
||||||
if (i.tag == PGF_PRODUCTION_META) {
|
|
||||||
PgfProductionMeta* pmeta = i.data;
|
|
||||||
if (gu_seq_is_null(pmeta->lins) ||
|
|
||||||
gu_seq_is_null(gu_seq_get(pmeta->lins,PgfSequence,cont->conts->lin_idx))) {
|
|
||||||
extend = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
PgfItem* item = NULL;
|
PgfItem* item = NULL;
|
||||||
|
switch (gu_variant_tag(cont->curr_sym)) {
|
||||||
if (!extend) {
|
case PGF_SYMBOL_CAT: {
|
||||||
switch (gu_variant_tag(cont->curr_sym)) {
|
PgfSymbolCat* scat = gu_variant_data(cont->curr_sym);
|
||||||
case PGF_SYMBOL_CAT: {
|
item = pgf_item_update_arg(cont, scat->d, cat, before->pool, before->ps);
|
||||||
PgfSymbolCat* scat = gu_variant_data(cont->curr_sym);
|
break;
|
||||||
item = pgf_item_update_arg(cont, scat->d, cat, before->pool, before->ps);
|
}
|
||||||
break;
|
case PGF_SYMBOL_LIT: {
|
||||||
}
|
PgfSymbolLit* slit = gu_variant_data(cont->curr_sym);
|
||||||
case PGF_SYMBOL_LIT: {
|
item = pgf_item_update_arg(cont, slit->d, cat, before->pool, before->ps);
|
||||||
PgfSymbolLit* slit = gu_variant_data(cont->curr_sym);
|
break;
|
||||||
item = pgf_item_update_arg(cont, slit->d, cat, before->pool, before->ps);
|
}
|
||||||
break;
|
default:
|
||||||
}
|
gu_impossible();
|
||||||
default:
|
|
||||||
gu_impossible();
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (before->meta_item != NULL)
|
|
||||||
return;
|
|
||||||
|
|
||||||
item = pgf_item_copy(cont, before->pool, before->ps);
|
|
||||||
size_t nargs = gu_seq_length(cont->args);
|
|
||||||
item->args = gu_new_seq(PgfPArg, nargs+1, before->pool);
|
|
||||||
memcpy(gu_seq_data(item->args), gu_seq_data(cont->args),
|
|
||||||
nargs * sizeof(PgfPArg));
|
|
||||||
gu_seq_set(item->args, PgfPArg, nargs,
|
|
||||||
((PgfPArg) { .hypos = NULL, .ccat = cat }));
|
|
||||||
|
|
||||||
PgfCIdMap* meta_child_probs =
|
|
||||||
item->conts->ccat->cnccat->abscat->meta_child_probs;
|
|
||||||
item->inside_prob +=
|
|
||||||
cat->viterbi_prob+
|
|
||||||
gu_map_get(meta_child_probs, cat->cnccat->abscat, prob_t);
|
|
||||||
|
|
||||||
PgfSymbol prev = item->curr_sym;
|
|
||||||
PgfSymbolCat* scat = (PgfSymbolCat*)
|
|
||||||
gu_alloc_variant(PGF_SYMBOL_CAT,
|
|
||||||
sizeof(PgfSymbolCat)+sizeof(PgfSymbol),
|
|
||||||
gu_alignof(PgfSymbolCat),
|
|
||||||
&item->curr_sym, before->pool);
|
|
||||||
*((PgfSymbol*)(scat+1)) = prev;
|
|
||||||
scat->d = nargs;
|
|
||||||
scat->r = lin_idx;
|
|
||||||
|
|
||||||
before->meta_item = item;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pgf_item_advance(item, before->pool);
|
pgf_item_advance(item, before->pool);
|
||||||
@@ -893,38 +838,12 @@ pgf_parsing_new_production(PgfItem* item, PgfExprProb *ep, GuPool *pool)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case PGF_PRODUCTION_META: {
|
case PGF_PRODUCTION_META: {
|
||||||
PgfProductionMeta* pmeta = i.data;
|
|
||||||
|
|
||||||
PgfProductionMeta* new_pmeta =
|
PgfProductionMeta* new_pmeta =
|
||||||
gu_new_variant(PGF_PRODUCTION_META,
|
gu_new_variant(PGF_PRODUCTION_META,
|
||||||
PgfProductionMeta,
|
PgfProductionMeta,
|
||||||
&prod, pool);
|
&prod, pool);
|
||||||
new_pmeta->ep = ep;
|
new_pmeta->ep = ep;
|
||||||
new_pmeta->lins = pmeta->lins;
|
|
||||||
new_pmeta->args = item->args;
|
new_pmeta->args = item->args;
|
||||||
|
|
||||||
if (gu_seq_is_null(pmeta->lins) ||
|
|
||||||
gu_seq_is_null(gu_seq_get(pmeta->lins,PgfSequence,item->conts->lin_idx))) {
|
|
||||||
PgfSequence seq =
|
|
||||||
pgf_extern_seq_get(item, pool);
|
|
||||||
|
|
||||||
size_t n_lins = item->conts->ccat->cnccat->n_lins;
|
|
||||||
|
|
||||||
new_pmeta->lins = gu_new_seq(PgfSequence, n_lins, pool);
|
|
||||||
|
|
||||||
if (gu_seq_is_null(pmeta->lins)) {
|
|
||||||
for (size_t i = 0; i < n_lins; i++) {
|
|
||||||
gu_seq_set(new_pmeta->lins,PgfSequence,i,
|
|
||||||
gu_null_seq);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for (size_t i = 0; i < n_lins; i++) {
|
|
||||||
gu_seq_set(new_pmeta->lins,PgfSequence,i,
|
|
||||||
gu_seq_get(pmeta->lins,PgfSequence,i));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
gu_seq_set(new_pmeta->lins,PgfSequence,item->conts->lin_idx,seq);
|
|
||||||
}
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
@@ -1064,8 +983,7 @@ pgf_parsing_bu_filter(PgfParseState* before, PgfParseState* after,
|
|||||||
|
|
||||||
static void
|
static void
|
||||||
pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after,
|
pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after,
|
||||||
PgfItem* item, PgfCCat* ccat, size_t lin_idx,
|
PgfItem* item, PgfCCat* ccat, size_t lin_idx)
|
||||||
prob_t delta_prob)
|
|
||||||
{
|
{
|
||||||
PgfItemConts* conts =
|
PgfItemConts* conts =
|
||||||
pgf_parsing_get_conts(before->conts_map,
|
pgf_parsing_get_conts(before->conts_map,
|
||||||
@@ -1079,8 +997,7 @@ pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after,
|
|||||||
|
|
||||||
conts->outside_prob =
|
conts->outside_prob =
|
||||||
item->inside_prob-conts->ccat->viterbi_prob+
|
item->inside_prob-conts->ccat->viterbi_prob+
|
||||||
item->conts->outside_prob +
|
item->conts->outside_prob;
|
||||||
delta_prob;
|
|
||||||
|
|
||||||
// Top-down prediction for syntactic rules
|
// Top-down prediction for syntactic rules
|
||||||
PgfProductionSeq prods = ccat->prods;
|
PgfProductionSeq prods = ccat->prods;
|
||||||
@@ -1143,6 +1060,26 @@ pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Scan step for a meta item: extends the item with the token of the
// `after` state and puts the result back on the agenda of `before`.
//
// before    - state whose pool/ps are used for the item copy and whose
//             agenda receives the new item.
// after     - state that supplies the token (after->ts->tok) and the pool
//             for the new symbol; it is dereferenced unconditionally, so
//             it must not be NULL.
// meta_item - item to extend; all changes are made on the item returned
//             by pgf_item_copy, not through this pointer.
// meta_prob - value added to the new item's inside_prob.
static void
|
||||||

pgf_parsing_meta_scan(PgfParseState* before, PgfParseState* after,
|
||||||

PgfItem* meta_item, prob_t meta_prob)
|
||||||

{
|
||||||

PgfItem* item = pgf_item_copy(meta_item, before->pool, before->ps);
|
||||||

item->inside_prob += meta_prob;
|
||||||


||||||

// Remember the current symbol; it is stored behind the new symbol below.
PgfSymbol prev = item->curr_sym;
|
||||||

// The allocation is sized for a PgfSymbolKS plus one trailing PgfSymbol.
// NOTE(review): &item->curr_sym is presumably an out-parameter that
// receives the new variant - confirm against gu_alloc_variant.
PgfSymbolKS* sks = (PgfSymbolKS*)
|
||||||

gu_alloc_variant(PGF_SYMBOL_KS,
|
||||||

sizeof(PgfSymbolKS)+sizeof(PgfSymbol),
|
||||||

gu_alignof(PgfSymbolKS),
|
||||||

&item->curr_sym, after->pool);
|
||||||

// Write the previous symbol into the slot directly after the struct.
// NOTE(review): presumably this is the link pgf_prev_extern_sym follows.
*((PgfSymbol*)(sks+1)) = prev;
|
||||||

// The new symbol carries exactly one token: the token of `after`.
sks->tokens = gu_new_seq(PgfToken, 1, after->pool);
|
||||||

gu_seq_set(sks->tokens, PgfToken, 0, after->ts->tok);
|
||||||


||||||

// Queue the extended item on the agenda, ordered by pgf_item_prob_order.
gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item);
|
||||||

}
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
GuMapItor fn;
|
GuMapItor fn;
|
||||||
PgfParseState* before;
|
PgfParseState* before;
|
||||||
@@ -1182,8 +1119,29 @@ pgf_parsing_meta_predict(GuMapItor* fn, const void* key, void* value, GuExn* err
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
pgf_parsing_td_predict(before, after,
|
PgfItem* item =
|
||||||
meta_item, ccat, lin_idx, meta_prob);
|
pgf_item_copy(meta_item, before->pool, before->ps);
|
||||||
|
item->inside_prob +=
|
||||||
|
ccat->viterbi_prob+meta_prob;
|
||||||
|
|
||||||
|
size_t nargs = gu_seq_length(meta_item->args);
|
||||||
|
item->args = gu_new_seq(PgfPArg, nargs+1, before->pool);
|
||||||
|
memcpy(gu_seq_data(item->args), gu_seq_data(meta_item->args),
|
||||||
|
nargs * sizeof(PgfPArg));
|
||||||
|
gu_seq_set(item->args, PgfPArg, nargs,
|
||||||
|
((PgfPArg) { .hypos = NULL, .ccat = ccat }));
|
||||||
|
|
||||||
|
PgfSymbol prev = item->curr_sym;
|
||||||
|
PgfSymbolCat* scat = (PgfSymbolCat*)
|
||||||
|
gu_alloc_variant(PGF_SYMBOL_CAT,
|
||||||
|
sizeof(PgfSymbolCat)+sizeof(PgfSymbol),
|
||||||
|
gu_alignof(PgfSymbolCat),
|
||||||
|
&item->curr_sym, before->pool);
|
||||||
|
*((PgfSymbol*)(scat+1)) = prev;
|
||||||
|
scat->d = nargs;
|
||||||
|
scat->r = lin_idx;
|
||||||
|
|
||||||
|
gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1210,7 +1168,7 @@ pgf_parsing_symbol(PgfParseState* before, PgfParseState* after,
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
pgf_parsing_td_predict(before, after, item, parg->ccat, scat->r, 0);
|
pgf_parsing_td_predict(before, after, item, parg->ccat, scat->r);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case PGF_SYMBOL_KS: {
|
case PGF_SYMBOL_KS: {
|
||||||
@@ -1298,7 +1256,7 @@ pgf_parsing_symbol(PgfParseState* before, PgfParseState* after,
|
|||||||
|
|
||||||
if (parg->ccat->fid > 0 &&
|
if (parg->ccat->fid > 0 &&
|
||||||
parg->ccat->fid >= before->ps->concr->total_cats) {
|
parg->ccat->fid >= before->ps->concr->total_cats) {
|
||||||
pgf_parsing_td_predict(before, after, item, parg->ccat, slit->r, 0);
|
pgf_parsing_td_predict(before, after, item, parg->ccat, slit->r);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
PgfItemConts* conts =
|
PgfItemConts* conts =
|
||||||
@@ -1382,9 +1340,7 @@ pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item)
|
|||||||
pgf_parsing_complete(before, after, item, NULL);
|
pgf_parsing_complete(before, after, item, NULL);
|
||||||
pgf_item_free(before, after, item);
|
pgf_item_free(before, after, item);
|
||||||
} else {
|
} else {
|
||||||
PgfSymbol sym =
|
pgf_parsing_symbol(before, after, item, item->curr_sym);
|
||||||
gu_seq_get(seq, PgfSymbol, item->seq_idx);
|
|
||||||
pgf_parsing_symbol(before, after, item, sym);
|
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -1409,7 +1365,7 @@ pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item)
|
|||||||
|
|
||||||
pgf_parsing_td_predict(before, after, item,
|
pgf_parsing_td_predict(before, after, item,
|
||||||
pcoerce->coerce,
|
pcoerce->coerce,
|
||||||
item->conts->lin_idx, 0);
|
item->conts->lin_idx);
|
||||||
break;
|
break;
|
||||||
case 1:
|
case 1:
|
||||||
pgf_parsing_complete(before, after, item, NULL);
|
pgf_parsing_complete(before, after, item, NULL);
|
||||||
@@ -1469,20 +1425,11 @@ pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case PGF_PRODUCTION_META: {
|
case PGF_PRODUCTION_META: {
|
||||||
PgfProductionMeta* pmeta = i.data;
|
if (item->seq_idx == pgf_item_sequence_length(item)) {
|
||||||
|
if (before->meta_item != NULL)
|
||||||
PgfSequence seq;
|
break;
|
||||||
if (!gu_seq_is_null(pmeta->lins) &&
|
before->meta_item = item;
|
||||||
!gu_seq_is_null(seq = gu_seq_get(pmeta->lins,PgfSequence,item->conts->lin_idx))) {
|
|
||||||
if (item->seq_idx == gu_seq_length(seq)) {
|
|
||||||
pgf_parsing_complete(before, after, item, NULL);
|
|
||||||
pgf_item_free(before, after, item);
|
|
||||||
} else {
|
|
||||||
PgfSymbol sym =
|
|
||||||
gu_seq_get(seq, PgfSymbol, item->seq_idx);
|
|
||||||
pgf_parsing_symbol(before, after, item, sym);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (after == NULL) {
|
if (after == NULL) {
|
||||||
PgfExprProb *ep = gu_new(PgfExprProb, before->pool);
|
PgfExprProb *ep = gu_new(PgfExprProb, before->pool);
|
||||||
ep->expr = before->ps->meta_var;
|
ep->expr = before->ps->meta_var;
|
||||||
@@ -1494,34 +1441,21 @@ pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item)
|
|||||||
}
|
}
|
||||||
pgf_parsing_complete(before, after, item, ep);
|
pgf_parsing_complete(before, after, item, ep);
|
||||||
} else {
|
} else {
|
||||||
if (after->ts->lexicon_idx == NULL) {
|
prob_t meta_token_prob =
|
||||||
prob_t meta_token_prob =
|
item->conts->ccat->cnccat->abscat->meta_token_prob;
|
||||||
item->conts->ccat->cnccat->abscat->meta_token_prob;
|
if (meta_token_prob != INFINITY) {
|
||||||
if (meta_token_prob == INFINITY)
|
pgf_parsing_meta_scan(before, after, item, meta_token_prob);
|
||||||
break;
|
}
|
||||||
item->inside_prob += meta_token_prob;
|
|
||||||
|
|
||||||
PgfSymbol prev = item->curr_sym;
|
PgfCIdMap* meta_child_probs =
|
||||||
PgfSymbolKS* sks = (PgfSymbolKS*)
|
item->conts->ccat->cnccat->abscat->meta_child_probs;
|
||||||
gu_alloc_variant(PGF_SYMBOL_KS,
|
if (meta_child_probs != NULL) {
|
||||||
sizeof(PgfSymbolKS)+sizeof(PgfSymbol),
|
PgfMetaPredictFn clo = { { pgf_parsing_meta_predict }, before, after, item };
|
||||||
gu_alignof(PgfSymbolKS),
|
gu_map_iter(meta_child_probs, &clo.fn, NULL);
|
||||||
&item->curr_sym, after->pool);
|
|
||||||
*((PgfSymbol*)(sks+1)) = prev;
|
|
||||||
sks->tokens = gu_new_seq(PgfToken, 1, after->pool);
|
|
||||||
gu_seq_set(sks->tokens, PgfToken, 0, after->ts->tok);
|
|
||||||
|
|
||||||
item->seq_idx++;
|
|
||||||
pgf_parsing_add_transition(before, after, after->ts->tok, item);
|
|
||||||
} else {
|
|
||||||
PgfCIdMap* meta_child_probs =
|
|
||||||
item->conts->ccat->cnccat->abscat->meta_child_probs;
|
|
||||||
if (meta_child_probs != NULL) {
|
|
||||||
PgfMetaPredictFn clo = { { pgf_parsing_meta_predict }, before, after, item };
|
|
||||||
gu_map_iter(meta_child_probs, &clo.fn, NULL);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
pgf_parsing_symbol(before, after, item, item->curr_sym);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -1609,7 +1543,6 @@ pgf_new_parsing(PgfConcr* concr, GuPool* pool)
|
|||||||
PgfProductionMeta,
|
PgfProductionMeta,
|
||||||
&ps->meta_prod, pool);
|
&ps->meta_prod, pool);
|
||||||
pmeta->ep = NULL;
|
pmeta->ep = NULL;
|
||||||
pmeta->lins = gu_null_seq;
|
|
||||||
pmeta->args = gu_new_seq(PgfPArg, 0, pool);
|
pmeta->args = gu_new_seq(PgfPArg, 0, pool);
|
||||||
|
|
||||||
return ps;
|
return ps;
|
||||||
|
|||||||
Reference in New Issue
Block a user