From e2ddea6c7d88fab69a98d589288ba5c230496e8e Mon Sep 17 00:00:00 2001 From: krangelov Date: Fri, 30 Aug 2019 13:31:57 +0200 Subject: [PATCH] first version of a parser which returns chunks in case of failure --- src/runtime/c/pgf/data.h | 3 +- src/runtime/c/pgf/parser.c | 454 ++++++++++++++++++++++++++----------- src/runtime/c/pgf/reader.c | 2 + 3 files changed, 326 insertions(+), 133 deletions(-) diff --git a/src/runtime/c/pgf/data.h b/src/runtime/c/pgf/data.h index 45685c82d..680c41a45 100644 --- a/src/runtime/c/pgf/data.h +++ b/src/runtime/c/pgf/data.h @@ -344,8 +344,9 @@ struct PgfCCat { PgfCncFuns* linrefs; size_t n_synprods; PgfProductionSeq* prods; - float viterbi_prob; + prob_t viterbi_prob; int fid; + int chunk_count; PgfItemConts* conts; struct PgfAnswers* answers; GuFinalizer fin[0]; diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c index 428ec9f1e..12fed0d60 100644 --- a/src/runtime/c/pgf/parser.c +++ b/src/runtime/c/pgf/parser.c @@ -29,6 +29,7 @@ struct PgfItemConts { typedef GuSeq PgfItemContss; typedef GuMap PgfContsMap; typedef GuMap PgfGenCatMap; +typedef GuMap PgfChunksMap; typedef GuBuf PgfCCatBuf; @@ -48,7 +49,6 @@ typedef struct { #ifdef PGF_COUNTS_DEBUG int item_full_count; int item_real_count; - int cont_full_count; int ccat_full_count; int prod_full_count; #endif @@ -67,6 +67,7 @@ struct PgfParseState { PgfItemBuf* agenda; PgfContsMap* conts_map; PgfGenCatMap* generated_cats; + PgfChunksMap* chunks_map; bool needs_bind; size_t start_offset; @@ -78,14 +79,21 @@ struct PgfParseState { typedef struct PgfAnswers { GuBuf* conts; GuBuf* exprs; + PgfCCat* ccat; prob_t outside_prob; } PgfAnswers; +#define PGF_EXPR_CHUNK_STATE ((size_t) -1) + typedef struct { PgfAnswers* answers; PgfExprProb ep; - PgfPArgs* args; - size_t arg_idx; + union { + PgfPArgs* args; + PgfParseState* state; + }; + size_t arg_idx; // if the value is PGF_EXPR_CHUNK_STATE, then + // the relevant value above is state, not args. } PgfExprState; typedef struct PgfItemBase PgfItemBase; @@ -371,7 +379,9 @@ static void pgf_print_expr_state(PgfExprState* st, GuOut* out, GuExn* err, GuBuf* stack) { - gu_buf_push(stack, int, (gu_seq_length(st->args) - st->arg_idx - 1)); + gu_buf_push(stack, int, + (st->arg_idx != PGF_EXPR_CHUNK_STATE) ? + (gu_seq_length(st->args) - st->arg_idx - 1) : 0); if (gu_buf_length(st->answers->conts) > 0) { PgfExprState* cont = gu_buf_get(st->answers->conts, PgfExprState*, 0); @@ -380,6 +390,10 @@ pgf_print_expr_state(PgfExprState* st, } gu_puts(" (", out, err); + if (st->answers->ccat != NULL) { + pgf_print_fid(st->answers->ccat->fid,out,err); + gu_puts(":", out, err); + } if (gu_variant_is_null(st->ep.expr)) gu_puts("_", out, err); else @@ -395,7 +409,8 @@ pgf_print_expr_state0(PgfExprState* st, st->answers->outside_prob, st->answers->outside_prob+st->ep.prob); - size_t n_args = gu_seq_length(st->args); + size_t n_args = (st->arg_idx == PGF_EXPR_CHUNK_STATE) ? + 0 : gu_seq_length(st->args); GuBuf* stack = gu_new_buf(int, tmp_pool); if (n_args > 0) @@ -423,7 +438,7 @@ pgf_print_expr_state0(PgfExprState* st, int count = gu_buf_get(stack, int, i); while (count-- > 0) gu_puts(" ?", out, err); - + gu_puts(")", out, err); } gu_puts("\n", out, err); @@ -508,12 +523,6 @@ pgf_parsing_get_conts(PgfParseState* state, conts->outside_prob = 0; conts->ref_count = 0; gu_seq_get(contss, PgfItemConts*, lin_idx) = conts; - -#ifdef PGF_COUNTS_DEBUG - if (state != NULL) { - state->ps->cont_full_count++; - } -#endif } return conts; } @@ -527,7 +536,7 @@ gu_ccat_fini(GuFinalizer* fin) } static PgfCCat* -pgf_parsing_create_completed(PgfParsing* ps, PgfParseState* state, +pgf_parsing_create_completed(PgfParsing* ps, PgfParseState* state, PgfItemConts* conts, prob_t viterbi_prob) { @@ -537,17 +546,20 @@ pgf_parsing_create_completed(PgfParsing* ps, PgfParseState* state, cat->linrefs = conts->ccat->linrefs; cat->viterbi_prob = viterbi_prob; cat->fid = ps->max_fid++; + cat->chunk_count = (conts->ccat->fid == -5 || + conts->state->end_offset == state->end_offset); cat->conts = conts; cat->answers = NULL; cat->prods = NULL; cat->n_synprods = 0; + gu_map_put(state->generated_cats, conts, PgfCCat*, cat); - + cat->fin[0].fn = gu_ccat_fini; gu_pool_finally(ps->pool, cat->fin); #ifdef PGF_COUNTS_DEBUG - state->ps->ccat_full_count++; + ps->ccat_full_count++; #endif return cat; @@ -589,6 +601,19 @@ pgf_item_set_curr_symbol(PgfItem* item, GuPool* pool) break; } case PGF_PRODUCTION_EXTERN: { + PgfProductionExtern* pext = i.data; + + PgfSymbols* syms; + if (pext->lins != NULL && + (syms = gu_seq_get(pext->lins,PgfSymbols*,item->conts->lin_idx)) != NULL) { + if (item->sym_idx == gu_seq_length(syms)) { + item->curr_sym = gu_null_variant; + } else { + item->curr_sym = gu_seq_get(syms, PgfSymbol, item->sym_idx); + } + } else { + item->curr_sym = gu_null_variant; + } break; } default: @@ -662,7 +687,7 @@ static PgfItem* pgf_item_copy(PgfItem* item, PgfParsing* ps) { PgfItem* copy; - if (ps == NULL || ps->free_item == NULL) + if (ps->free_item == NULL) copy = gu_new(PgfItem, ps->pool); else { copy = ps->free_item; @@ -671,10 +696,8 @@ pgf_item_copy(PgfItem* item, PgfParsing* ps) memcpy(copy, item, sizeof(PgfItem)); #ifdef PGF_COUNTS_DEBUG - if (ps != NULL) { - ps->item_full_count++; - ps->item_real_count++; - } + ps->item_full_count++; + ps->item_real_count++; #endif item->conts->ref_count++; @@ -747,13 +770,17 @@ pgf_item_free(PgfParsing* ps, PgfItem* item) } static void -pgf_result_predict(PgfParsing* ps, - PgfExprState* cont, PgfCCat* ccat); +pgf_result_predict(PgfParsing* ps, + PgfExprState* cont, PgfCCat* ccat, + prob_t outside_prob); static void pgf_result_production(PgfParsing* ps, PgfAnswers* answers, PgfProduction prod); +static void +pgf_parsing_complete(PgfParsing* ps, PgfItem* item, PgfExprProb *ep); + static void pgf_parsing_push_item(PgfParseState* state, PgfItem* item) { @@ -859,6 +886,10 @@ pgf_parsing_new_production(PgfItem* item, PgfExprProb *ep, GuPool *pool) default: gu_impossible(); } + +#ifdef PGF_COUNTS_DEBUG + ps->prod_full_count++; +#endif return prod; } @@ -877,9 +908,6 @@ pgf_parsing_complete(PgfParsing* ps, PgfItem* item, PgfExprProb *ep) PgfProduction prod = pgf_parsing_new_production(item, ep, ps->pool); -#ifdef PGF_COUNTS_DEBUG - ps->prod_full_count++; -#endif PgfCCat* tmp_ccat = pgf_parsing_get_completed(ps->before, item->conts); PgfCCat* ccat = tmp_ccat; @@ -904,7 +932,9 @@ pgf_parsing_complete(PgfParsing* ps, PgfItem* item, PgfExprProb *ep) gu_printf(out, err, "; %d; ", item->conts->lin_idx); pgf_print_fid(ccat->fid, out, err); - gu_puts("]\n", out, err); + gu_puts("] ", out, err); + pgf_print_fid(ccat->fid, out, err); + gu_printf(out, err, ".chunk_count=%d\n", ccat->chunk_count); } pgf_print_production(ccat->fid, prod, out, err); gu_pool_free(tmp_pool); @@ -913,9 +943,29 @@ pgf_parsing_complete(PgfParsing* ps, PgfItem* item, PgfExprProb *ep) if (item->conts->ccat->fid == -5) { if (ps->before->end_offset == strlen(ps->sentence)) { PgfPArg* parg = gu_seq_index(item->args, PgfPArg, 0); - pgf_result_predict(ps, NULL, parg->ccat); + pgf_result_predict(ps, NULL, parg->ccat, 0); } return; + } else { + size_t i = gu_seq_length(item->args); + while (i > 0) { + PgfPArg* parg = gu_seq_index(item->args, PgfPArg, i-1); + + if (pgf_parsing_get_completed(ps->before, parg->ccat->conts) != NULL) { + parg->ccat->chunk_count++; + +#ifdef PGF_PARSER_DEBUG + GuPool* tmp_pool = gu_new_pool(); + GuOut* out = gu_file_out(stderr, tmp_pool); + GuExn* err = gu_exn(tmp_pool); + pgf_print_fid(parg->ccat->fid, out, err); + gu_printf(out, err, ".chunk_count=%d\n", parg->ccat->chunk_count); + gu_pool_free(tmp_pool); +#endif + } + + i--; + } } if (tmp_ccat != NULL) { @@ -1022,6 +1072,7 @@ pgf_new_parse_state(PgfParsing* ps, size_t start_offset, state->agenda = gu_new_buf(PgfItem*, ps->pool); state->generated_cats = gu_new_addr_map(PgfItemConts*, PgfCCat*, &gu_null_struct, ps->pool); state->conts_map = gu_new_addr_map(PgfCCat*, PgfItemContss*, &gu_null_struct, ps->pool); + state->chunks_map = NULL; state->needs_bind = (bind_type == BIND_NONE) && (start_offset == end_offset); state->start_offset = start_offset; @@ -1076,15 +1127,17 @@ pgf_parsing_scan_helper(PgfParsing *ps, PgfParseState* state, break; } else { ptrdiff_t len = current.ptr - start.ptr; - found = true; if (min <= len) - pgf_parsing_scan_helper(ps, state, i, k-1, min, len); + if (pgf_parsing_scan_helper(ps, state, i, k-1, min, len)) + found = true; // Here we do bottom-up prediction for all lexical categories. // The epsilon productions will be predicted in top-down // fashion while parsing. if (seq->idx != NULL && len > 0) { + found = true; + // A new state will mark the end of the current match PgfParseState* new_state = pgf_new_parse_state(ps, (size_t) (current.ptr - ps->sentence), BIND_NONE); @@ -1133,7 +1186,9 @@ pgf_parsing_scan_helper(PgfParsing *ps, PgfParseState* state, gu_printf(out, err, "; %d; ", conts->lin_idx); pgf_print_fid(ccat->fid, out, err); - gu_puts("]\n", out, err); + gu_puts("] ", out, err); + pgf_print_fid(ccat->fid, out, err); + gu_printf(out, err, ".chunk_count=%d\n", ccat->chunk_count); } pgf_print_production(ccat->fid, prod, out, err); gu_pool_free(tmp_pool); @@ -1142,7 +1197,8 @@ pgf_parsing_scan_helper(PgfParsing *ps, PgfParseState* state, } if (len <= max) - pgf_parsing_scan_helper(ps, state, k+1, j, len, max); + if (pgf_parsing_scan_helper(ps, state, k+1, j, len, max)) + found = true; break; } @@ -1159,7 +1215,7 @@ pgf_parsing_scan(PgfParsing *ps) PgfParseState* state = pgf_new_parse_state(ps, 0, BIND_SOFT); - while (state->end_offset < len) { + while (state != NULL && state->end_offset < len) { if (state->needs_bind) { // We have encountered two tokens without space in between. // Those can be accepted only if there is a BIND token @@ -1546,79 +1602,6 @@ pgf_parsing_symbol(PgfParsing* ps, PgfItem* item, PgfSymbol sym) } } -static void -pgf_parsing_item(PgfParsing* ps, PgfItem* item) -{ -#ifdef PGF_PARSER_DEBUG - GuPool* tmp_pool = gu_new_pool(); - GuOut* out = gu_file_out(stderr, tmp_pool); - GuExn* err = gu_exn(tmp_pool); - pgf_print_item(item, ps->before, out, err, tmp_pool); - gu_pool_free(tmp_pool); -#endif - - GuVariantInfo i = gu_variant_open(item->prod); - switch (i.tag) { - case PGF_PRODUCTION_APPLY: { - PgfProductionApply* papp = i.data; - PgfCncFun* fun = papp->fun; - PgfSymbols* syms = fun->lins[item->conts->lin_idx]->syms; - if (item->sym_idx == gu_seq_length(syms)) { - pgf_parsing_complete(ps, item, NULL); - pgf_item_free(ps, item); - } else { - pgf_parsing_symbol(ps, item, item->curr_sym); - } - break; - } - case PGF_PRODUCTION_COERCE: { - PgfProductionCoerce* pcoerce = i.data; - switch (item->sym_idx) { - case 0: - if (pcoerce->coerce->prods == NULL) { - // empty category - pgf_item_free(ps, item); - return; - } - - pgf_parsing_td_predict(ps, item, - pcoerce->coerce, - item->conts->lin_idx); - break; - case 1: - pgf_parsing_complete(ps, item, NULL); - pgf_item_free(ps, item); - break; - default: - gu_impossible(); - } - break; - } - case PGF_PRODUCTION_EXTERN: { - PgfProductionExtern* pext = i.data; - - PgfSymbols* syms; - if (pext->lins != NULL && - (syms = gu_seq_get(pext->lins,PgfSymbols*,item->conts->lin_idx)) != NULL) { - if (item->sym_idx == gu_seq_length(syms)) { - pgf_parsing_complete(ps, item, NULL); - pgf_item_free(ps, item); - } else { - PgfSymbol sym = - gu_seq_get(syms, PgfSymbol, item->sym_idx); - pgf_parsing_symbol(ps, item, sym); - } - } else { - pgf_parsing_complete(ps, item, pext->ep); - pgf_item_free(ps, item); - } - break; - } - default: - gu_impossible(); - } -} - static void pgf_parsing_set_default_factors(PgfParsing* ps, PgfAbstr* abstr) { @@ -1654,7 +1637,6 @@ pgf_new_parsing(PgfConcr* concr, GuString sentence, #ifdef PGF_COUNTS_DEBUG ps->item_full_count = 0; ps->item_real_count = 0; - ps->cont_full_count = 0; ps->ccat_full_count = 0; ps->prod_full_count = 0; #endif @@ -1674,10 +1656,9 @@ pgf_new_parsing(PgfConcr* concr, GuString sentence, static void pgf_parsing_print_counts(PgfParsing* ps) { - printf("%d\t%d\t%d\t%d\t%d\n", + printf("%d\t%d\t%d\t%d\n", ps->item_full_count, ps->item_real_count, - ps->cont_full_count, ps->ccat_full_count, ps->prod_full_count); } @@ -1734,7 +1715,7 @@ pgf_result_production(PgfParsing* ps, st->args = gu_empty_seq(); st->arg_idx = 0; - pgf_result_predict(ps, st, ccat); + pgf_result_predict(ps, st, ccat, answers->outside_prob); break; } case PGF_PRODUCTION_EXTERN: { @@ -1756,21 +1737,16 @@ pgf_result_production(PgfParsing* ps, static void pgf_result_predict(PgfParsing* ps, - PgfExprState* cont, PgfCCat* ccat) + PgfExprState* cont, PgfCCat* ccat, + prob_t outside_prob) { - prob_t outside_prob = 0; - if (cont != NULL) { - cont->ep.prob -= ccat->viterbi_prob; - outside_prob = - cont->answers->outside_prob+cont->ep.prob; - } - PgfAnswers* answers = ccat->answers; if (answers == NULL) { answers = gu_new(PgfAnswers, ps->pool); answers->conts = gu_new_buf(PgfExprState*, ps->pool); answers->exprs = gu_new_buf(PgfExprProb*, ps->pool); answers->outside_prob = outside_prob; + answers->ccat = ccat; ccat->answers = answers; } @@ -1802,8 +1778,14 @@ pgf_result_predict(PgfParsing* ps, .fun = cont->ep.expr, .arg = ep->expr); st->ep.prob = cont->ep.prob+ep->prob; - st->args = cont->args; - st->arg_idx = cont->arg_idx+1; + + if (cont->arg_idx == PGF_EXPR_CHUNK_STATE) { + st->state = gu_map_get(cont->state->chunks_map, ccat, PgfParseState*); + st->arg_idx = PGF_EXPR_CHUNK_STATE; + } else { + st->args = cont->args; + st->arg_idx = cont->arg_idx+1; + } gu_buf_heap_push(ps->expr_queue, &pgf_expr_state_order, &st); } @@ -1864,23 +1846,20 @@ pgf_parsing_init(PgfConcr* concr, PgfCId cat, start_ccat->linrefs = NULL; start_ccat->viterbi_prob = 0; start_ccat->fid = -5; + start_ccat->chunk_count = 1; start_ccat->conts = NULL; start_ccat->answers = NULL; start_ccat->prods = NULL; start_ccat->n_synprods = 0; #ifdef PGF_COUNTS_DEBUG - state->ps->ccat_full_count++; + ps->ccat_full_count++; #endif PgfItemConts* conts = pgf_parsing_get_conts(ps->before, start_ccat, 0, ps->pool); gu_buf_push(conts->items, PgfItem*, NULL); -#ifdef PGF_COUNTS_DEBUG - ps->cont_full_count++; -#endif - size_t n_ccats = gu_seq_length(cnccat->cats); for (size_t i = 0; i < n_ccats; i++) { PgfCCat* ccat = gu_seq_get(cnccat->cats, PgfCCat*, i); @@ -1970,7 +1949,21 @@ pgf_parsing_proceed(PgfParsing* ps) if (has_progress) { PgfItem* item; gu_buf_heap_pop(ps->before->agenda, pgf_item_prob_order, &item); - pgf_parsing_item(ps, item); + +#ifdef PGF_PARSER_DEBUG + GuPool* tmp_pool = gu_new_pool(); + GuOut* out = gu_file_out(stderr, tmp_pool); + GuExn* err = gu_exn(tmp_pool); + pgf_print_item(item, ps->before, out, err, tmp_pool); + gu_pool_free(tmp_pool); +#endif + + if (gu_variant_is_null(item->curr_sym)) { + pgf_parsing_complete(ps, item, NULL); + pgf_item_free(ps, item); + } else { + pgf_parsing_symbol(ps, item, item->curr_sym); + } } while (ps->after != NULL) { @@ -1983,6 +1976,28 @@ pgf_parsing_proceed(PgfParsing* ps) return has_progress; } +typedef struct { + GuMapItor fn; + PgfParsing* ps; + PgfExprState* st; +} PgfChunkCatItor; + +static void +pgf_iter_chunk_cat(GuMapItor* fn, + const void* key, void* value, + GuExn *err) +{ + PgfChunkCatItor* clo = (PgfChunkCatItor*) fn; + PgfCCat* ccat = (PgfCCat*) key; + + prob_t outside_prob = + clo->st->answers->outside_prob+ + clo->st->ep.prob+ + ccat->cnccat->abscat->prob; + + pgf_result_predict(clo->ps, clo->st, ccat, outside_prob); +} + static PgfExprProb* pgf_parse_result_next(PgfParsing* ps) { @@ -2005,11 +2020,28 @@ pgf_parse_result_next(PgfParsing* ps) #endif #endif - if (st->arg_idx < gu_seq_length(st->args)) { + if (st->arg_idx == PGF_EXPR_CHUNK_STATE) { + // here we look for chunks + + if (st->state == ps->before) { + if (pgf_parse_result_is_new(st)) { + gu_buf_push(st->answers->exprs, PgfExprProb*, &st->ep); + return &st->ep; + } + } else { + PgfChunkCatItor clo = { { pgf_iter_chunk_cat }, ps, st }; + if (st->state->chunks_map != NULL) + gu_map_iter(st->state->chunks_map, &clo.fn, NULL); + } + } else if (st->arg_idx < gu_seq_length(st->args)) { + // here we handle normal unfinished expression states + PgfCCat* ccat = gu_seq_index(st->args, PgfPArg, st->arg_idx)->ccat; if (ccat->fid < ps->concr->total_cats) { + // when argument was not used by the parser, + // we create a metavariable PgfExpr meta = gu_new_variant_i(ps->out_pool, PGF_EXPR_META, PgfExprMeta, .id = 0); @@ -2024,7 +2056,10 @@ pgf_parse_result_next(PgfParsing* ps) st->arg_idx++; gu_buf_heap_push(ps->expr_queue, &pgf_expr_state_order, &st); } else { - pgf_result_predict(ps, st, ccat); + prob_t outside_prob = + st->answers->outside_prob+ + st->ep.prob-ccat->viterbi_prob; + pgf_result_predict(ps, st, ccat, outside_prob); } } else if (pgf_parse_result_is_new(st)) { gu_buf_push(st->answers->exprs, PgfExprProb*, &st->ep); @@ -2032,7 +2067,7 @@ pgf_parse_result_next(PgfParsing* ps) size_t n_conts = gu_buf_length(st->answers->conts); for (size_t i = 0; i < n_conts; i++) { PgfExprState* st2 = gu_buf_get(st->answers->conts, PgfExprState*, i); - + if (st2 == NULL) { return &st->ep; } @@ -2046,9 +2081,17 @@ pgf_parse_result_next(PgfParsing* ps) PGF_EXPR_APP, PgfExprApp, .fun = st2->ep.expr, .arg = st->ep.expr); - st3->ep.prob = st2->ep.prob + st->ep.prob; - st3->args = st2->args; - st3->arg_idx = st2->arg_idx+1; + if (st2->arg_idx == PGF_EXPR_CHUNK_STATE) { + st3->ep.prob = st2->ep.prob+st->answers->ccat->cnccat->abscat->prob + + st->ep.prob; + st3->state = gu_map_get(st2->state->chunks_map, st->answers->ccat, PgfParseState*); + st3->arg_idx = PGF_EXPR_CHUNK_STATE; + } else { + st3->ep.prob = st2->ep.prob-st->answers->ccat->viterbi_prob + + st->ep.prob; + st3->args = st2->args; + st3->arg_idx = st2->arg_idx+1; + } gu_buf_heap_push(ps->expr_queue, &pgf_expr_state_order, &st3); } @@ -2107,6 +2150,126 @@ pgf_parse(PgfConcr* concr, PgfType* typ, GuString sentence, return pgf_parse_with_heuristics(concr, typ, sentence, -1.0, callbacks, err, pool, out_pool); } +static void +pgf_iter_generated_cats(PgfParsing* ps, PgfParseState* next_state); + +static void +pgf_process_generated_cat(PgfParsing* ps, + PgfParseState* state, PgfParseState* next_state, + PgfCCat* ccat) +{ + bool just_coercions = true; + + PgfCCat* children[ccat->n_synprods]; + for (size_t i = 0; i < ccat->n_synprods; i++) { + PgfProduction prod = + gu_seq_get(ccat->prods, PgfProduction, i); + + children[i] = NULL; + + GuVariantInfo inf = gu_variant_open(prod); + switch (inf.tag) { + case PGF_PRODUCTION_APPLY: { + PgfProductionApply* papp = inf.data; + + size_t j = gu_seq_length(papp->args); + while (j > 0) { + PgfPArg* parg = gu_seq_index(papp->args, PgfPArg, j-1); + + if (pgf_parsing_get_completed(state, parg->ccat->conts) != NULL && + ccat->conts->state->end_offset == parg->ccat->conts->state->end_offset) { + children[i] = parg->ccat; + break; + } + + j--; + } + + if (children[i] == NULL) { + just_coercions = false; + break; + } + break; + } + case PGF_PRODUCTION_COERCE: { + PgfProductionCoerce* pcoerce = inf.data; + children[i] = pcoerce->coerce; + break; + } + } + } + + if (just_coercions) { + ccat->chunk_count++; + + for (size_t i = 0; i < ccat->n_synprods; i++) { + children[i]->chunk_count--; + +#ifdef PGF_PARSER_DEBUG + GuPool* tmp_pool = gu_new_pool(); + GuOut* out = gu_file_out(stderr, tmp_pool); + GuExn* err = gu_exn(tmp_pool); + pgf_print_fid(children[i]->fid, out, err); + gu_printf(out, err, ".chunk_count=%d\n", children[i]->chunk_count); + gu_pool_free(tmp_pool); +#endif + + if (children[i]->chunk_count == 0) { + pgf_process_generated_cat(ps, state, next_state, children[i]); + } + } + } else { + PgfParseState* prev_state = ccat->conts->state; + if (prev_state->chunks_map == NULL) { + pgf_iter_generated_cats(ps, prev_state); + + if (prev_state->chunks_map == NULL) { + prev_state->chunks_map = + gu_new_addr_map(PgfCCat*, PgfParseState*, + &gu_null_struct, ps->pool); + } + } + +#ifdef PGF_PARSER_DEBUG + GuPool* tmp_pool = gu_new_pool(); + GuOut* out = gu_file_out(stderr, tmp_pool); + GuExn* err = gu_exn(tmp_pool); + gu_printf(out, err, "[%d - ", prev_state->end_offset); + pgf_print_fid(ccat->fid, out, err); + gu_printf(out, err, " - %d]\n", next_state->start_offset); + gu_pool_free(tmp_pool); +#endif + + gu_map_put(prev_state->chunks_map, ccat, PgfParseState*, next_state); + } +} + +static void +pgf_iter_generated_cats(PgfParsing* ps, PgfParseState* next_state) +{ + size_t count = 0; + PgfParseState* state = next_state; + + for (;;) { + size_t i = 0; + PgfCCat* ccat; + PgfItemConts* conts; + while (gu_map_next(state->generated_cats, &i, (void**)&conts, &ccat)) { + if (ccat->chunk_count > 0) + continue; + + count++; + + pgf_process_generated_cat(ps, state, next_state, ccat); + } + + if (count > 0 || state->next == NULL) + break; + + state = state->next; + } +} + PGF_API GuEnum* pgf_parse_with_heuristics(PgfConcr* concr, PgfType* typ, GuString sentence, double heuristics, @@ -2138,7 +2301,34 @@ pgf_parse_with_heuristics(PgfConcr* concr, PgfType* typ, GuString sentence, if (!pgf_parsing_proceed(ps)) { GuExnData* exn = gu_raise(err, PgfParseError); exn->data = (void*) pgf_parsing_new_exception(ps, exn->pool); - return NULL; + + PgfExprState* st = gu_new(PgfExprState, ps->pool); + st->answers = gu_new(PgfAnswers, ps->pool); + st->answers->conts = gu_new_buf(PgfExprState*, ps->pool); + st->answers->exprs = gu_new_buf(PgfExprProb*, ps->pool); + st->answers->ccat = NULL; + st->answers->outside_prob = 0; + st->ep.expr = + gu_new_variant_i(ps->out_pool, + PGF_EXPR_META, PgfExprMeta, + .id = 0); + st->ep.prob = 0; + st->state = NULL; + st->arg_idx = PGF_EXPR_CHUNK_STATE; + + pgf_iter_generated_cats(ps, ps->before); + + PgfParseState* state = ps->before; + while (state != NULL) { + if (state->chunks_map != NULL) + st->state = state; + state = state->next; + } + + if (st->state != NULL) { + gu_buf_heap_push(ps->expr_queue, &pgf_expr_state_order, &st); + } + break; } #ifdef PGF_COUNTS_DEBUG diff --git a/src/runtime/c/pgf/reader.c b/src/runtime/c/pgf/reader.c index 82b6f8abf..755f14a24 100644 --- a/src/runtime/c/pgf/reader.c +++ b/src/runtime/c/pgf/reader.c @@ -844,6 +844,7 @@ pgf_read_fid(PgfReader* rdr, PgfConcr* concr) ccat->prods = NULL; ccat->viterbi_prob = 0; ccat->fid = fid; + ccat->chunk_count = 1; ccat->conts = NULL; ccat->answers = NULL; @@ -1081,6 +1082,7 @@ pgf_read_cnccat(PgfReader* rdr, PgfAbstr* abstr, PgfConcr* concr, PgfCId name) ccat->prods = NULL; ccat->viterbi_prob = 0; ccat->fid = fid; + ccat->chunk_count = 1; ccat->conts = NULL; ccat->answers = NULL;