From 5779887f9677f1e303fce0aee77882cd86d18cff Mon Sep 17 00:00:00 2001 From: "kr.angelov" Date: Tue, 11 Dec 2012 12:57:22 +0000 Subject: [PATCH] bugfix for robust parsing with multi-word units --- src/runtime/c/pgf/data.c | 1 - src/runtime/c/pgf/data.h | 1 - src/runtime/c/pgf/parser.c | 241 +++++++++++++------------------------ 3 files changed, 87 insertions(+), 156 deletions(-) diff --git a/src/runtime/c/pgf/data.c b/src/runtime/c/pgf/data.c index ff6fa53e4..ee7ab5cd7 100644 --- a/src/runtime/c/pgf/data.c +++ b/src/runtime/c/pgf/data.c @@ -131,7 +131,6 @@ GU_DEFINE_TYPE( GU_MEMBER(PgfProductionExtern, lins, GuSeq)), GU_CONSTRUCTOR_S( PGF_PRODUCTION_META, PgfProductionMeta, - GU_MEMBER(PgfProductionMeta, lins, GuSeq), GU_MEMBER(PgfProductionMeta, args, PgfPArgs))); GU_DEFINE_TYPE(PgfProductions, GuList, gu_type(PgfProduction)); diff --git a/src/runtime/c/pgf/data.h b/src/runtime/c/pgf/data.h index 82c363923..f5435cee5 100644 --- a/src/runtime/c/pgf/data.h +++ b/src/runtime/c/pgf/data.h @@ -337,7 +337,6 @@ typedef struct { typedef struct { PgfExprProb *ep; - GuSeq lins; PgfPArgs args; } PgfProductionMeta; diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c index 6159ab859..6ed589e15 100644 --- a/src/runtime/c/pgf/parser.c +++ b/src/runtime/c/pgf/parser.c @@ -197,22 +197,14 @@ pgf_item_sequence_length(PgfItem* item) } } case PGF_PRODUCTION_META: { - PgfProductionMeta* pmeta = i.data; - PgfSequence seq; - - if (!gu_seq_is_null(pmeta->lins) && - !gu_seq_is_null(seq = gu_seq_get(pmeta->lins,PgfSequence,item->conts->lin_idx))) { - return gu_seq_length(seq); - } else { - int seq_len = 0; - PgfSymbol sym = item->curr_sym; - while (!gu_variant_is_null(sym)) { - seq_len++; - sym = pgf_prev_extern_sym(sym); - } - - return seq_len; + int seq_len = 0; + PgfSymbol sym = item->curr_sym; + while (!gu_variant_is_null(sym)) { + seq_len++; + sym = pgf_prev_extern_sym(sym); } + + return seq_len; } default: gu_impossible(); @@ -268,12 +260,7 @@ pgf_item_sequence(PgfItem* item, break; } case PGF_PRODUCTION_META: { - PgfProductionMeta* pmeta = i.data; - - if (gu_seq_is_null(pmeta->lins) || - gu_seq_is_null(*seq = gu_seq_get(pmeta->lins,PgfSequence,item->conts->lin_idx))) { - *seq = pgf_extern_seq_get(item, pool); - } + *seq = pgf_extern_seq_get(item, pool); break; } default: @@ -760,62 +747,20 @@ pgf_parsing_combine(PgfParseState* before, PgfParseState* after, return; } - bool extend = false; - GuVariantInfo i = gu_variant_open(cont->prod); - if (i.tag == PGF_PRODUCTION_META) { - PgfProductionMeta* pmeta = i.data; - if (gu_seq_is_null(pmeta->lins) || - gu_seq_is_null(gu_seq_get(pmeta->lins,PgfSequence,cont->conts->lin_idx))) { - extend = true; - } - } - PgfItem* item = NULL; - - if (!extend) { - switch (gu_variant_tag(cont->curr_sym)) { - case PGF_SYMBOL_CAT: { - PgfSymbolCat* scat = gu_variant_data(cont->curr_sym); - item = pgf_item_update_arg(cont, scat->d, cat, before->pool, before->ps); - break; - } - case PGF_SYMBOL_LIT: { - PgfSymbolLit* slit = gu_variant_data(cont->curr_sym); - item = pgf_item_update_arg(cont, slit->d, cat, before->pool, before->ps); - break; - } - default: - gu_impossible(); - } - } else { - if (before->meta_item != NULL) - return; - - item = pgf_item_copy(cont, before->pool, before->ps); - size_t nargs = gu_seq_length(cont->args); - item->args = gu_new_seq(PgfPArg, nargs+1, before->pool); - memcpy(gu_seq_data(item->args), gu_seq_data(cont->args), - nargs * sizeof(PgfPArg)); - gu_seq_set(item->args, PgfPArg, nargs, - ((PgfPArg) { .hypos = NULL, .ccat = cat })); - - PgfCIdMap* meta_child_probs = - item->conts->ccat->cnccat->abscat->meta_child_probs; - item->inside_prob += - cat->viterbi_prob+ - gu_map_get(meta_child_probs, cat->cnccat->abscat, prob_t); - - PgfSymbol prev = item->curr_sym; - PgfSymbolCat* scat = (PgfSymbolCat*) - gu_alloc_variant(PGF_SYMBOL_CAT, - sizeof(PgfSymbolCat)+sizeof(PgfSymbol), - gu_alignof(PgfSymbolCat), - &item->curr_sym, before->pool); - *((PgfSymbol*)(scat+1)) = prev; - scat->d = nargs; - scat->r = lin_idx; - - before->meta_item = item; + switch (gu_variant_tag(cont->curr_sym)) { + case PGF_SYMBOL_CAT: { + PgfSymbolCat* scat = gu_variant_data(cont->curr_sym); + item = pgf_item_update_arg(cont, scat->d, cat, before->pool, before->ps); + break; + } + case PGF_SYMBOL_LIT: { + PgfSymbolLit* slit = gu_variant_data(cont->curr_sym); + item = pgf_item_update_arg(cont, slit->d, cat, before->pool, before->ps); + break; + } + default: + gu_impossible(); } pgf_item_advance(item, before->pool); @@ -893,38 +838,12 @@ pgf_parsing_new_production(PgfItem* item, PgfExprProb *ep, GuPool *pool) break; } case PGF_PRODUCTION_META: { - PgfProductionMeta* pmeta = i.data; - PgfProductionMeta* new_pmeta = gu_new_variant(PGF_PRODUCTION_META, PgfProductionMeta, &prod, pool); new_pmeta->ep = ep; - new_pmeta->lins = pmeta->lins; new_pmeta->args = item->args; - - if (gu_seq_is_null(pmeta->lins) || - gu_seq_is_null(gu_seq_get(pmeta->lins,PgfSequence,item->conts->lin_idx))) { - PgfSequence seq = - pgf_extern_seq_get(item, pool); - - size_t n_lins = item->conts->ccat->cnccat->n_lins; - - new_pmeta->lins = gu_new_seq(PgfSequence, n_lins, pool); - - if (gu_seq_is_null(pmeta->lins)) { - for (size_t i = 0; i < n_lins; i++) { - gu_seq_set(new_pmeta->lins,PgfSequence,i, - gu_null_seq); - } - } else { - for (size_t i = 0; i < n_lins; i++) { - gu_seq_set(new_pmeta->lins,PgfSequence,i, - gu_seq_get(pmeta->lins,PgfSequence,i)); - } - } - gu_seq_set(new_pmeta->lins,PgfSequence,item->conts->lin_idx,seq); - } break; } default: @@ -1064,8 +983,7 @@ pgf_parsing_bu_filter(PgfParseState* before, PgfParseState* after, static void pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after, - PgfItem* item, PgfCCat* ccat, size_t lin_idx, - prob_t delta_prob) + PgfItem* item, PgfCCat* ccat, size_t lin_idx) { PgfItemConts* conts = pgf_parsing_get_conts(before->conts_map, @@ -1079,8 +997,7 @@ pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after, conts->outside_prob = item->inside_prob-conts->ccat->viterbi_prob+ - item->conts->outside_prob + - delta_prob; + item->conts->outside_prob; // Top-down prediction for syntactic rules PgfProductionSeq prods = ccat->prods; @@ -1143,6 +1060,26 @@ pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after, } } +static void +pgf_parsing_meta_scan(PgfParseState* before, PgfParseState* after, + PgfItem* meta_item, prob_t meta_prob) +{ + PgfItem* item = pgf_item_copy(meta_item, before->pool, before->ps); + item->inside_prob += meta_prob; + + PgfSymbol prev = item->curr_sym; + PgfSymbolKS* sks = (PgfSymbolKS*) + gu_alloc_variant(PGF_SYMBOL_KS, + sizeof(PgfSymbolKS)+sizeof(PgfSymbol), + gu_alignof(PgfSymbolKS), + &item->curr_sym, after->pool); + *((PgfSymbol*)(sks+1)) = prev; + sks->tokens = gu_new_seq(PgfToken, 1, after->pool); + gu_seq_set(sks->tokens, PgfToken, 0, after->ts->tok); + + gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item); +} + typedef struct { GuMapItor fn; PgfParseState* before; @@ -1182,8 +1119,29 @@ pgf_parsing_meta_predict(GuMapItor* fn, const void* key, void* value, GuExn* err continue; } - pgf_parsing_td_predict(before, after, - meta_item, ccat, lin_idx, meta_prob); + PgfItem* item = + pgf_item_copy(meta_item, before->pool, before->ps); + item->inside_prob += + ccat->viterbi_prob+meta_prob; + + size_t nargs = gu_seq_length(meta_item->args); + item->args = gu_new_seq(PgfPArg, nargs+1, before->pool); + memcpy(gu_seq_data(item->args), gu_seq_data(meta_item->args), + nargs * sizeof(PgfPArg)); + gu_seq_set(item->args, PgfPArg, nargs, + ((PgfPArg) { .hypos = NULL, .ccat = ccat })); + + PgfSymbol prev = item->curr_sym; + PgfSymbolCat* scat = (PgfSymbolCat*) + gu_alloc_variant(PGF_SYMBOL_CAT, + sizeof(PgfSymbolCat)+sizeof(PgfSymbol), + gu_alignof(PgfSymbolCat), + &item->curr_sym, before->pool); + *((PgfSymbol*)(scat+1)) = prev; + scat->d = nargs; + scat->r = lin_idx; + + gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item); } } } @@ -1210,7 +1168,7 @@ pgf_parsing_symbol(PgfParseState* before, PgfParseState* after, return; } - pgf_parsing_td_predict(before, after, item, parg->ccat, scat->r, 0); + pgf_parsing_td_predict(before, after, item, parg->ccat, scat->r); break; } case PGF_SYMBOL_KS: { @@ -1298,7 +1256,7 @@ pgf_parsing_symbol(PgfParseState* before, PgfParseState* after, if (parg->ccat->fid > 0 && parg->ccat->fid >= before->ps->concr->total_cats) { - pgf_parsing_td_predict(before, after, item, parg->ccat, slit->r, 0); + pgf_parsing_td_predict(before, after, item, parg->ccat, slit->r); } else { PgfItemConts* conts = @@ -1382,9 +1340,7 @@ pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item) pgf_parsing_complete(before, after, item, NULL); pgf_item_free(before, after, item); } else { - PgfSymbol sym = - gu_seq_get(seq, PgfSymbol, item->seq_idx); - pgf_parsing_symbol(before, after, item, sym); + pgf_parsing_symbol(before, after, item, item->curr_sym); } break; } @@ -1409,7 +1365,7 @@ pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item) pgf_parsing_td_predict(before, after, item, pcoerce->coerce, - item->conts->lin_idx, 0); + item->conts->lin_idx); break; case 1: pgf_parsing_complete(before, after, item, NULL); @@ -1469,20 +1425,11 @@ pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item) break; } case PGF_PRODUCTION_META: { - PgfProductionMeta* pmeta = i.data; - - PgfSequence seq; - if (!gu_seq_is_null(pmeta->lins) && - !gu_seq_is_null(seq = gu_seq_get(pmeta->lins,PgfSequence,item->conts->lin_idx))) { - if (item->seq_idx == gu_seq_length(seq)) { - pgf_parsing_complete(before, after, item, NULL); - pgf_item_free(before, after, item); - } else { - PgfSymbol sym = - gu_seq_get(seq, PgfSymbol, item->seq_idx); - pgf_parsing_symbol(before, after, item, sym); - } - } else { + if (item->seq_idx == pgf_item_sequence_length(item)) { + if (before->meta_item != NULL) + break; + before->meta_item = item; + if (after == NULL) { PgfExprProb *ep = gu_new(PgfExprProb, before->pool); ep->expr = before->ps->meta_var; @@ -1494,34 +1441,21 @@ pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item) } pgf_parsing_complete(before, after, item, ep); } else { - if (after->ts->lexicon_idx == NULL) { - prob_t meta_token_prob = - item->conts->ccat->cnccat->abscat->meta_token_prob; - if (meta_token_prob == INFINITY) - break; - item->inside_prob += meta_token_prob; + prob_t meta_token_prob = + item->conts->ccat->cnccat->abscat->meta_token_prob; + if (meta_token_prob != INFINITY) { + pgf_parsing_meta_scan(before, after, item, meta_token_prob); + } - PgfSymbol prev = item->curr_sym; - PgfSymbolKS* sks = (PgfSymbolKS*) - gu_alloc_variant(PGF_SYMBOL_KS, - sizeof(PgfSymbolKS)+sizeof(PgfSymbol), - gu_alignof(PgfSymbolKS), - &item->curr_sym, after->pool); - *((PgfSymbol*)(sks+1)) = prev; - sks->tokens = gu_new_seq(PgfToken, 1, after->pool); - gu_seq_set(sks->tokens, PgfToken, 0, after->ts->tok); - - item->seq_idx++; - pgf_parsing_add_transition(before, after, after->ts->tok, item); - } else { - PgfCIdMap* meta_child_probs = - item->conts->ccat->cnccat->abscat->meta_child_probs; - if (meta_child_probs != NULL) { - PgfMetaPredictFn clo = { { pgf_parsing_meta_predict }, before, after, item }; - gu_map_iter(meta_child_probs, &clo.fn, NULL); - } + PgfCIdMap* meta_child_probs = + item->conts->ccat->cnccat->abscat->meta_child_probs; + if (meta_child_probs != NULL) { + PgfMetaPredictFn clo = { { pgf_parsing_meta_predict }, before, after, item }; + gu_map_iter(meta_child_probs, &clo.fn, NULL); } } + } else { + pgf_parsing_symbol(before, after, item, item->curr_sym); } break; } @@ -1609,7 +1543,6 @@ pgf_new_parsing(PgfConcr* concr, GuPool* pool) PgfProductionMeta, &ps->meta_prod, pool); pmeta->ep = NULL; - pmeta->lins = gu_null_seq; pmeta->args = gu_new_seq(PgfPArg, 0, pool); return ps;