1
0
forked from GitHub/gf-core

Add two simple heuristics that speed up the statistical parser by more than seven times.

This commit is contained in:
kr.angelov
2012-11-12 22:17:40 +00:00
parent 182e366f5d
commit d1044b202a

View File

@@ -83,12 +83,15 @@ struct PgfParseState {
PgfParseState* next; PgfParseState* next;
PgfItemBuf* agenda; PgfItemBuf* agenda;
PgfItem* meta_item;
PgfContsMap* conts_map; PgfContsMap* conts_map;
PgfGenCatMap* generated_cats; PgfGenCatMap* generated_cats;
#ifdef PGF_PARSER_DEBUG #ifdef PGF_PARSER_DEBUG
unsigned short offset; unsigned short offset;
#endif #endif
prob_t delta_prob;
PgfParsing* ps; PgfParsing* ps;
PgfTokenState* ts; PgfTokenState* ts;
}; };
@@ -493,21 +496,6 @@ pgf_parsing_get_conts(PgfContsMap* conts_map,
return conts; return conts;
} }
/* Report whether `conts` is the very continuation object that `conts_map`
 * holds for the pair (`ccat`, `lin_idx`).
 *
 * The comparison is by pointer identity, not by content.  When `ccat` has
 * no entry in `conts_map` the answer is false.  `lin_idx` is required to be
 * a valid linearization index of the category (checked with gu_require).
 */
static bool
pgf_parsing_has_conts(PgfContsMap* conts_map,
                      PgfCCat* ccat, size_t lin_idx,
                      PgfItemConts* conts)
{
	gu_require(lin_idx < ccat->cnccat->n_lins);

	bool is_same = false;
	PgfItemContss* per_lin = gu_map_get(conts_map, ccat, PgfItemContss*);
	if (per_lin) {
		/* The category is known: fetch the slot for this linearization
		 * index and compare it with the candidate. */
		PgfItemConts* stored = gu_list_index(per_lin, lin_idx);
		is_same = (stored == conts);
	}
	return is_same;
}
static PgfCCat* static PgfCCat*
pgf_parsing_create_completed(PgfParseState* state, PgfItemConts* conts, pgf_parsing_create_completed(PgfParseState* state, PgfItemConts* conts,
prob_t viterbi_prob) prob_t viterbi_prob)
@@ -753,8 +741,12 @@ pgf_parsing_add_transition(PgfParseState* before, PgfParseState* after,
PgfToken tok, PgfItem* item) PgfToken tok, PgfItem* item)
{ {
if (gu_string_eq(tok, after->ts->tok)) { if (gu_string_eq(tok, after->ts->tok)) {
if (after->next == NULL) if (after->next == NULL) {
after->ps->target = item; after->ps->target = item;
after->delta_prob =
item->inside_prob+item->conts->outside_prob -
before->delta_prob;
}
gu_buf_heap_push(after->agenda, &pgf_item_prob_order, &item); gu_buf_heap_push(after->agenda, &pgf_item_prob_order, &item);
} else { } else {
@@ -764,8 +756,7 @@ pgf_parsing_add_transition(PgfParseState* before, PgfParseState* after,
static void static void
pgf_parsing_combine(PgfParseState* before, PgfParseState* after, pgf_parsing_combine(PgfParseState* before, PgfParseState* after,
PgfItem* cont, PgfCCat* cat, int lin_idx, PgfItem* cont, PgfCCat* cat, int lin_idx)
bool is_empty)
{ {
if (cont == NULL) { if (cont == NULL) {
if (after == NULL) if (after == NULL)
@@ -801,7 +792,7 @@ pgf_parsing_combine(PgfParseState* before, PgfParseState* after,
gu_impossible(); gu_impossible();
} }
} else { } else {
if (is_empty) if (before->meta_item != NULL)
return; return;
item = pgf_item_copy(cont, before->pool, before->ps); item = pgf_item_copy(cont, before->pool, before->ps);
@@ -827,6 +818,8 @@ pgf_parsing_combine(PgfParseState* before, PgfParseState* after,
*((PgfSymbol*)(scat+1)) = prev; *((PgfSymbol*)(scat+1)) = prev;
scat->d = nargs; scat->d = nargs;
scat->r = lin_idx; scat->r = lin_idx;
before->meta_item = item;
} }
pgf_item_advance(item, before->pool); pgf_item_advance(item, before->pool);
@@ -1020,15 +1013,10 @@ pgf_parsing_complete(PgfParseState* before, PgfParseState* after,
state = state->next; state = state->next;
} }
} else { } else {
bool is_empty =
pgf_parsing_has_conts(before->conts_map,
item->conts->ccat, item->conts->lin_idx,
item->conts);
size_t n_conts = gu_buf_length(item->conts->items); size_t n_conts = gu_buf_length(item->conts->items);
for (size_t i = 0; i < n_conts; i++) { for (size_t i = 0; i < n_conts; i++) {
PgfItem* cont = gu_buf_get(item->conts->items, PgfItem*, i); PgfItem* cont = gu_buf_get(item->conts->items, PgfItem*, i);
pgf_parsing_combine(before, after, cont, cat, item->conts->lin_idx, is_empty); pgf_parsing_combine(before, after, cont, cat, item->conts->lin_idx);
} }
} }
} }
@@ -1105,16 +1093,6 @@ pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after,
gu_seq_get(prods, PgfProduction, i); gu_seq_get(prods, PgfProduction, i);
pgf_parsing_production(before, conts, prod); pgf_parsing_production(before, conts, prod);
} }
if (ccat->cnccat->abscat->meta_prob != INFINITY &&
ccat->conts == NULL /* grammar defined ccat */) {
// Top-down prediction for meta rules
PgfItem *item =
pgf_new_item(conts, before->ps->meta_prod, before->pool, before->ps);
item->inside_prob =
ccat->cnccat->abscat->meta_prob;
gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item);
}
// Bottom-up prediction for lexical rules // Bottom-up prediction for lexical rules
if (after != NULL && after->ts->lexicon_idx != NULL) { if (after != NULL && after->ts->lexicon_idx != NULL) {
@@ -1153,7 +1131,7 @@ pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after,
PgfCCat* completed = PgfCCat* completed =
pgf_parsing_get_completed(before, conts); pgf_parsing_get_completed(before, conts);
if (completed) { if (completed) {
pgf_parsing_combine(before, after, item, completed, lin_idx, true); pgf_parsing_combine(before, after, item, completed, lin_idx);
} }
PgfParseState* state = after; PgfParseState* state = after;
@@ -1161,7 +1139,7 @@ pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after,
PgfCCat* completed = PgfCCat* completed =
pgf_parsing_get_completed(state, conts); pgf_parsing_get_completed(state, conts);
if (completed) { if (completed) {
pgf_parsing_combine(state, state->next, item, completed, lin_idx, true); pgf_parsing_combine(state, state->next, item, completed, lin_idx);
} }
state = state->next; state = state->next;
@@ -1360,7 +1338,7 @@ pgf_parsing_symbol(PgfParseState* before, PgfParseState* after,
PgfCCat* completed = PgfCCat* completed =
pgf_parsing_get_completed(before, conts); pgf_parsing_get_completed(before, conts);
if (completed) { if (completed) {
pgf_parsing_combine(before, after, item, completed, slit->r, true); pgf_parsing_combine(before, after, item, completed, slit->r);
} }
PgfParseState* state = after; PgfParseState* state = after;
@@ -1368,7 +1346,7 @@ pgf_parsing_symbol(PgfParseState* before, PgfParseState* after,
PgfCCat* completed = PgfCCat* completed =
pgf_parsing_get_completed(state, conts); pgf_parsing_get_completed(state, conts);
if (completed) { if (completed) {
pgf_parsing_combine(state, state->next, item, completed, slit->r, true); pgf_parsing_combine(state, state->next, item, completed, slit->r);
} }
state = state->next; state = state->next;
@@ -1509,7 +1487,7 @@ pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item)
pgf_parsing_symbol(before, after, item, sym); pgf_parsing_symbol(before, after, item, sym);
} }
} else { } else {
if (!gu_variant_is_null(item->curr_sym)) { if (after == NULL) {
PgfExprProb *ep = gu_new(PgfExprProb, before->pool); PgfExprProb *ep = gu_new(PgfExprProb, before->pool);
ep->expr = before->ps->meta_var; ep->expr = before->ps->meta_var;
ep->prob = item->inside_prob; ep->prob = item->inside_prob;
@@ -1519,9 +1497,7 @@ pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item)
ep->prob -= arg->ccat->viterbi_prob; ep->prob -= arg->ccat->viterbi_prob;
} }
pgf_parsing_complete(before, after, item, ep); pgf_parsing_complete(before, after, item, ep);
} } else {
if (after != NULL) {
if (after->ts->lexicon_idx == NULL) { if (after->ts->lexicon_idx == NULL) {
prob_t meta_token_prob = prob_t meta_token_prob =
item->conts->ccat->cnccat->abscat->meta_token_prob; item->conts->ccat->cnccat->abscat->meta_token_prob;
@@ -1563,16 +1539,20 @@ pgf_parsing_proceed(PgfParseState* state) {
prob_t best_prob = INFINITY; prob_t best_prob = INFINITY;
PgfParseState* before = NULL; PgfParseState* before = NULL;
prob_t delta_prob = 0;
PgfParseState* st = state; PgfParseState* st = state;
while (st != NULL) { while (st != NULL) {
if (gu_buf_length(st->agenda) > 0) { if (gu_buf_length(st->agenda) > 0) {
PgfItem* item = gu_buf_get(st->agenda, PgfItem*, 0); PgfItem* item = gu_buf_get(st->agenda, PgfItem*, 0);
prob_t item_prob = item->inside_prob+item->conts->outside_prob; prob_t item_prob =
item->inside_prob+item->conts->outside_prob+delta_prob;
if (item_prob < best_prob) { if (item_prob < best_prob) {
best_prob = item_prob; best_prob = item_prob;
before = st; before = st;
} }
} }
delta_prob += st->delta_prob*0.8;
st = st->next; st = st->next;
} }
@@ -1646,11 +1626,13 @@ pgf_new_parse_state(PgfParsing* ps,
state->pool = pool; state->pool = pool;
state->next = next; state->next = next;
state->agenda = gu_new_buf(PgfItem*, pool); state->agenda = gu_new_buf(PgfItem*, pool);
state->meta_item = NULL;
state->generated_cats = gu_map_type_new(PgfGenCatMap, pool); state->generated_cats = gu_map_type_new(PgfGenCatMap, pool);
state->conts_map = gu_map_type_new(PgfContsMap, pool); state->conts_map = gu_map_type_new(PgfContsMap, pool);
#ifdef PGF_PARSER_DEBUG #ifdef PGF_PARSER_DEBUG
state->offset = next ? next->offset+1 : 0; state->offset = next ? next->offset+1 : 0;
#endif #endif
state->delta_prob = 0;
state->ps = ps; state->ps = ps;
state->ts = ts; state->ts = ts;
return state; return state;
@@ -1944,10 +1926,10 @@ pgf_parser_init_state(PgfConcr* concr, PgfCId cat, size_t lin_idx, GuPool* pool)
pgf_new_item(conts, prod, pool, ps); pgf_new_item(conts, prod, pool, ps);
gu_buf_heap_push(state->agenda, &pgf_item_prob_order, &item); gu_buf_heap_push(state->agenda, &pgf_item_prob_order, &item);
} }
PgfItem *item = PgfItem *item =
pgf_new_item(conts, ps->meta_prod, pool, ps); pgf_new_item(conts, ps->meta_prod, pool, ps);
item->inside_prob = item->inside_prob =
ccat->cnccat->abscat->meta_prob; ccat->cnccat->abscat->meta_prob;
gu_buf_heap_push(state->agenda, &pgf_item_prob_order, &item); gu_buf_heap_push(state->agenda, &pgf_item_prob_order, &item);
} }