forked from GitHub/gf-core
two simple heuristics which speed up the statistical parser more than seven times.
This commit is contained in:
@@ -83,12 +83,15 @@ struct PgfParseState {
|
|||||||
PgfParseState* next;
|
PgfParseState* next;
|
||||||
|
|
||||||
PgfItemBuf* agenda;
|
PgfItemBuf* agenda;
|
||||||
|
PgfItem* meta_item;
|
||||||
PgfContsMap* conts_map;
|
PgfContsMap* conts_map;
|
||||||
PgfGenCatMap* generated_cats;
|
PgfGenCatMap* generated_cats;
|
||||||
#ifdef PGF_PARSER_DEBUG
|
#ifdef PGF_PARSER_DEBUG
|
||||||
unsigned short offset;
|
unsigned short offset;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
prob_t delta_prob;
|
||||||
|
|
||||||
PgfParsing* ps;
|
PgfParsing* ps;
|
||||||
PgfTokenState* ts;
|
PgfTokenState* ts;
|
||||||
};
|
};
|
||||||
@@ -493,21 +496,6 @@ pgf_parsing_get_conts(PgfContsMap* conts_map,
|
|||||||
return conts;
|
return conts;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool
|
|
||||||
pgf_parsing_has_conts(PgfContsMap* conts_map,
|
|
||||||
PgfCCat* ccat, size_t lin_idx,
|
|
||||||
PgfItemConts* conts)
|
|
||||||
{
|
|
||||||
gu_require(lin_idx < ccat->cnccat->n_lins);
|
|
||||||
|
|
||||||
PgfItemContss* contss = gu_map_get(conts_map, ccat, PgfItemContss*);
|
|
||||||
if (!contss)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
PgfItemConts* conts0 = gu_list_index(contss, lin_idx);
|
|
||||||
return (conts == conts0);
|
|
||||||
}
|
|
||||||
|
|
||||||
static PgfCCat*
|
static PgfCCat*
|
||||||
pgf_parsing_create_completed(PgfParseState* state, PgfItemConts* conts,
|
pgf_parsing_create_completed(PgfParseState* state, PgfItemConts* conts,
|
||||||
prob_t viterbi_prob)
|
prob_t viterbi_prob)
|
||||||
@@ -753,8 +741,12 @@ pgf_parsing_add_transition(PgfParseState* before, PgfParseState* after,
|
|||||||
PgfToken tok, PgfItem* item)
|
PgfToken tok, PgfItem* item)
|
||||||
{
|
{
|
||||||
if (gu_string_eq(tok, after->ts->tok)) {
|
if (gu_string_eq(tok, after->ts->tok)) {
|
||||||
if (after->next == NULL)
|
if (after->next == NULL) {
|
||||||
after->ps->target = item;
|
after->ps->target = item;
|
||||||
|
after->delta_prob =
|
||||||
|
item->inside_prob+item->conts->outside_prob -
|
||||||
|
before->delta_prob;
|
||||||
|
}
|
||||||
|
|
||||||
gu_buf_heap_push(after->agenda, &pgf_item_prob_order, &item);
|
gu_buf_heap_push(after->agenda, &pgf_item_prob_order, &item);
|
||||||
} else {
|
} else {
|
||||||
@@ -764,8 +756,7 @@ pgf_parsing_add_transition(PgfParseState* before, PgfParseState* after,
|
|||||||
|
|
||||||
static void
|
static void
|
||||||
pgf_parsing_combine(PgfParseState* before, PgfParseState* after,
|
pgf_parsing_combine(PgfParseState* before, PgfParseState* after,
|
||||||
PgfItem* cont, PgfCCat* cat, int lin_idx,
|
PgfItem* cont, PgfCCat* cat, int lin_idx)
|
||||||
bool is_empty)
|
|
||||||
{
|
{
|
||||||
if (cont == NULL) {
|
if (cont == NULL) {
|
||||||
if (after == NULL)
|
if (after == NULL)
|
||||||
@@ -801,7 +792,7 @@ pgf_parsing_combine(PgfParseState* before, PgfParseState* after,
|
|||||||
gu_impossible();
|
gu_impossible();
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (is_empty)
|
if (before->meta_item != NULL)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
item = pgf_item_copy(cont, before->pool, before->ps);
|
item = pgf_item_copy(cont, before->pool, before->ps);
|
||||||
@@ -827,6 +818,8 @@ pgf_parsing_combine(PgfParseState* before, PgfParseState* after,
|
|||||||
*((PgfSymbol*)(scat+1)) = prev;
|
*((PgfSymbol*)(scat+1)) = prev;
|
||||||
scat->d = nargs;
|
scat->d = nargs;
|
||||||
scat->r = lin_idx;
|
scat->r = lin_idx;
|
||||||
|
|
||||||
|
before->meta_item = item;
|
||||||
}
|
}
|
||||||
|
|
||||||
pgf_item_advance(item, before->pool);
|
pgf_item_advance(item, before->pool);
|
||||||
@@ -1020,15 +1013,10 @@ pgf_parsing_complete(PgfParseState* before, PgfParseState* after,
|
|||||||
state = state->next;
|
state = state->next;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
bool is_empty =
|
|
||||||
pgf_parsing_has_conts(before->conts_map,
|
|
||||||
item->conts->ccat, item->conts->lin_idx,
|
|
||||||
item->conts);
|
|
||||||
|
|
||||||
size_t n_conts = gu_buf_length(item->conts->items);
|
size_t n_conts = gu_buf_length(item->conts->items);
|
||||||
for (size_t i = 0; i < n_conts; i++) {
|
for (size_t i = 0; i < n_conts; i++) {
|
||||||
PgfItem* cont = gu_buf_get(item->conts->items, PgfItem*, i);
|
PgfItem* cont = gu_buf_get(item->conts->items, PgfItem*, i);
|
||||||
pgf_parsing_combine(before, after, cont, cat, item->conts->lin_idx, is_empty);
|
pgf_parsing_combine(before, after, cont, cat, item->conts->lin_idx);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1105,16 +1093,6 @@ pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after,
|
|||||||
gu_seq_get(prods, PgfProduction, i);
|
gu_seq_get(prods, PgfProduction, i);
|
||||||
pgf_parsing_production(before, conts, prod);
|
pgf_parsing_production(before, conts, prod);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ccat->cnccat->abscat->meta_prob != INFINITY &&
|
|
||||||
ccat->conts == NULL /* grammar defined ccat */) {
|
|
||||||
// Top-down prediction for meta rules
|
|
||||||
PgfItem *item =
|
|
||||||
pgf_new_item(conts, before->ps->meta_prod, before->pool, before->ps);
|
|
||||||
item->inside_prob =
|
|
||||||
ccat->cnccat->abscat->meta_prob;
|
|
||||||
gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Bottom-up prediction for lexical rules
|
// Bottom-up prediction for lexical rules
|
||||||
if (after != NULL && after->ts->lexicon_idx != NULL) {
|
if (after != NULL && after->ts->lexicon_idx != NULL) {
|
||||||
@@ -1153,7 +1131,7 @@ pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after,
|
|||||||
PgfCCat* completed =
|
PgfCCat* completed =
|
||||||
pgf_parsing_get_completed(before, conts);
|
pgf_parsing_get_completed(before, conts);
|
||||||
if (completed) {
|
if (completed) {
|
||||||
pgf_parsing_combine(before, after, item, completed, lin_idx, true);
|
pgf_parsing_combine(before, after, item, completed, lin_idx);
|
||||||
}
|
}
|
||||||
|
|
||||||
PgfParseState* state = after;
|
PgfParseState* state = after;
|
||||||
@@ -1161,7 +1139,7 @@ pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after,
|
|||||||
PgfCCat* completed =
|
PgfCCat* completed =
|
||||||
pgf_parsing_get_completed(state, conts);
|
pgf_parsing_get_completed(state, conts);
|
||||||
if (completed) {
|
if (completed) {
|
||||||
pgf_parsing_combine(state, state->next, item, completed, lin_idx, true);
|
pgf_parsing_combine(state, state->next, item, completed, lin_idx);
|
||||||
}
|
}
|
||||||
|
|
||||||
state = state->next;
|
state = state->next;
|
||||||
@@ -1360,7 +1338,7 @@ pgf_parsing_symbol(PgfParseState* before, PgfParseState* after,
|
|||||||
PgfCCat* completed =
|
PgfCCat* completed =
|
||||||
pgf_parsing_get_completed(before, conts);
|
pgf_parsing_get_completed(before, conts);
|
||||||
if (completed) {
|
if (completed) {
|
||||||
pgf_parsing_combine(before, after, item, completed, slit->r, true);
|
pgf_parsing_combine(before, after, item, completed, slit->r);
|
||||||
}
|
}
|
||||||
|
|
||||||
PgfParseState* state = after;
|
PgfParseState* state = after;
|
||||||
@@ -1368,7 +1346,7 @@ pgf_parsing_symbol(PgfParseState* before, PgfParseState* after,
|
|||||||
PgfCCat* completed =
|
PgfCCat* completed =
|
||||||
pgf_parsing_get_completed(state, conts);
|
pgf_parsing_get_completed(state, conts);
|
||||||
if (completed) {
|
if (completed) {
|
||||||
pgf_parsing_combine(state, state->next, item, completed, slit->r, true);
|
pgf_parsing_combine(state, state->next, item, completed, slit->r);
|
||||||
}
|
}
|
||||||
|
|
||||||
state = state->next;
|
state = state->next;
|
||||||
@@ -1509,7 +1487,7 @@ pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item)
|
|||||||
pgf_parsing_symbol(before, after, item, sym);
|
pgf_parsing_symbol(before, after, item, sym);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (!gu_variant_is_null(item->curr_sym)) {
|
if (after == NULL) {
|
||||||
PgfExprProb *ep = gu_new(PgfExprProb, before->pool);
|
PgfExprProb *ep = gu_new(PgfExprProb, before->pool);
|
||||||
ep->expr = before->ps->meta_var;
|
ep->expr = before->ps->meta_var;
|
||||||
ep->prob = item->inside_prob;
|
ep->prob = item->inside_prob;
|
||||||
@@ -1519,9 +1497,7 @@ pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item)
|
|||||||
ep->prob -= arg->ccat->viterbi_prob;
|
ep->prob -= arg->ccat->viterbi_prob;
|
||||||
}
|
}
|
||||||
pgf_parsing_complete(before, after, item, ep);
|
pgf_parsing_complete(before, after, item, ep);
|
||||||
}
|
} else {
|
||||||
|
|
||||||
if (after != NULL) {
|
|
||||||
if (after->ts->lexicon_idx == NULL) {
|
if (after->ts->lexicon_idx == NULL) {
|
||||||
prob_t meta_token_prob =
|
prob_t meta_token_prob =
|
||||||
item->conts->ccat->cnccat->abscat->meta_token_prob;
|
item->conts->ccat->cnccat->abscat->meta_token_prob;
|
||||||
@@ -1563,16 +1539,20 @@ pgf_parsing_proceed(PgfParseState* state) {
|
|||||||
prob_t best_prob = INFINITY;
|
prob_t best_prob = INFINITY;
|
||||||
PgfParseState* before = NULL;
|
PgfParseState* before = NULL;
|
||||||
|
|
||||||
|
prob_t delta_prob = 0;
|
||||||
PgfParseState* st = state;
|
PgfParseState* st = state;
|
||||||
while (st != NULL) {
|
while (st != NULL) {
|
||||||
if (gu_buf_length(st->agenda) > 0) {
|
if (gu_buf_length(st->agenda) > 0) {
|
||||||
PgfItem* item = gu_buf_get(st->agenda, PgfItem*, 0);
|
PgfItem* item = gu_buf_get(st->agenda, PgfItem*, 0);
|
||||||
prob_t item_prob = item->inside_prob+item->conts->outside_prob;
|
prob_t item_prob =
|
||||||
|
item->inside_prob+item->conts->outside_prob+delta_prob;
|
||||||
if (item_prob < best_prob) {
|
if (item_prob < best_prob) {
|
||||||
best_prob = item_prob;
|
best_prob = item_prob;
|
||||||
before = st;
|
before = st;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
delta_prob += st->delta_prob*0.8;
|
||||||
st = st->next;
|
st = st->next;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1646,11 +1626,13 @@ pgf_new_parse_state(PgfParsing* ps,
|
|||||||
state->pool = pool;
|
state->pool = pool;
|
||||||
state->next = next;
|
state->next = next;
|
||||||
state->agenda = gu_new_buf(PgfItem*, pool);
|
state->agenda = gu_new_buf(PgfItem*, pool);
|
||||||
|
state->meta_item = NULL;
|
||||||
state->generated_cats = gu_map_type_new(PgfGenCatMap, pool);
|
state->generated_cats = gu_map_type_new(PgfGenCatMap, pool);
|
||||||
state->conts_map = gu_map_type_new(PgfContsMap, pool);
|
state->conts_map = gu_map_type_new(PgfContsMap, pool);
|
||||||
#ifdef PGF_PARSER_DEBUG
|
#ifdef PGF_PARSER_DEBUG
|
||||||
state->offset = next ? next->offset+1 : 0;
|
state->offset = next ? next->offset+1 : 0;
|
||||||
#endif
|
#endif
|
||||||
|
state->delta_prob = 0;
|
||||||
state->ps = ps;
|
state->ps = ps;
|
||||||
state->ts = ts;
|
state->ts = ts;
|
||||||
return state;
|
return state;
|
||||||
@@ -1944,10 +1926,10 @@ pgf_parser_init_state(PgfConcr* concr, PgfCId cat, size_t lin_idx, GuPool* pool)
|
|||||||
pgf_new_item(conts, prod, pool, ps);
|
pgf_new_item(conts, prod, pool, ps);
|
||||||
gu_buf_heap_push(state->agenda, &pgf_item_prob_order, &item);
|
gu_buf_heap_push(state->agenda, &pgf_item_prob_order, &item);
|
||||||
}
|
}
|
||||||
|
|
||||||
PgfItem *item =
|
PgfItem *item =
|
||||||
pgf_new_item(conts, ps->meta_prod, pool, ps);
|
pgf_new_item(conts, ps->meta_prod, pool, ps);
|
||||||
item->inside_prob =
|
item->inside_prob =
|
||||||
ccat->cnccat->abscat->meta_prob;
|
ccat->cnccat->abscat->meta_prob;
|
||||||
gu_buf_heap_push(state->agenda, &pgf_item_prob_order, &item);
|
gu_buf_heap_push(state->agenda, &pgf_item_prob_order, &item);
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user