diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c index b1916c218..2650e2618 100644 --- a/src/runtime/c/pgf/parser.c +++ b/src/runtime/c/pgf/parser.c @@ -65,6 +65,7 @@ typedef enum { BIND_NONE, BIND_HARD, BIND_SOFT } BIND_TYPE; typedef struct { PgfProductionIdx* idx; size_t offset; + size_t sym_idx; } PgfLexiconIdxEntry; typedef GuBuf PgfLexiconIdx; @@ -1060,16 +1061,16 @@ pgf_parsing_complete(PgfParsing* ps, PgfItem* item, PgfExprProb *ep) } static int -pgf_symbols_cmp(GuString* psent, PgfSymbols* syms, bool case_sensitive) +pgf_symbols_cmp(GuString* psent, PgfSymbols* syms, size_t* sym_idx, bool case_sensitive) { size_t n_syms = gu_seq_length(syms); - for (size_t i = 0; i < n_syms; i++) { - PgfSymbol sym = gu_seq_get(syms, PgfSymbol, i); + while (*sym_idx < n_syms) { + PgfSymbol sym = gu_seq_get(syms, PgfSymbol, *sym_idx); - if (i > 0) { + if (*sym_idx > 0) { if (!skip_space(psent)) { if (**psent == 0) - return -1; + return 0; return 1; } @@ -1085,13 +1086,13 @@ pgf_symbols_cmp(GuString* psent, PgfSymbols* syms, bool case_sensitive) case PGF_SYMBOL_LIT: case PGF_SYMBOL_VAR: { if (**psent == 0) - return -1; + return 0; return 1; } case PGF_SYMBOL_KS: { PgfSymbolKS* pks = inf.data; if (**psent == 0) - return -1; + return 0; int cmp = cmp_string(psent, pks->token, case_sensitive); if (cmp != 0) @@ -1110,6 +1111,8 @@ pgf_symbols_cmp(GuString* psent, PgfSymbols* syms, bool case_sensitive) default: gu_impossible(); } + + (*sym_idx)++; } return 0; @@ -1130,7 +1133,8 @@ pgf_parsing_lookahead(PgfParsing *ps, PgfParseState* state, GuString start = ps->sentence + state->end_offset; GuString current = start; - int cmp = pgf_symbols_cmp(&current, seq->syms, ps->case_sensitive); + size_t sym_idx = 0; + int cmp = pgf_symbols_cmp(&current, seq->syms, &sym_idx, ps->case_sensitive); if (cmp < 0) { j = k-1; } else if (cmp > 0) { @@ -1151,8 +1155,9 @@ pgf_parsing_lookahead(PgfParsing *ps, PgfParseState* state, if (seq->idx != NULL) { PgfLexiconIdxEntry* entry = 
gu_buf_extend(state->lexicon_idx); - entry->idx = seq->idx; - entry->offset = (size_t) (current - ps->sentence); + entry->idx = seq->idx; + entry->offset = (size_t) (current - ps->sentence); + entry->sym_idx = sym_idx; } if (len+1 <= max) @@ -1278,14 +1283,15 @@ pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item) static void pgf_parsing_predict_lexeme(PgfParsing* ps, PgfItemConts* conts, PgfProductionIdxEntry* entry, - size_t offset) + size_t offset, size_t sym_idx) { GuVariantInfo i = { PGF_PRODUCTION_APPLY, entry->papp }; PgfProduction prod = gu_variant_close(i); PgfItem* item = pgf_new_item(ps, conts, prod); PgfSymbols* syms = entry->papp->fun->lins[conts->lin_idx]->syms; - item->sym_idx = gu_seq_length(syms); + item->sym_idx = sym_idx; + pgf_item_set_curr_symbol(item, ps->pool); prob_t prob = item->inside_prob+item->conts->outside_prob; PgfParseState* state = pgf_new_parse_state(ps, offset, BIND_NONE, prob); @@ -1358,7 +1364,7 @@ pgf_parsing_td_predict(PgfParsing* ps, PgfProductionIdxEntry, &key); if (value != NULL) { - pgf_parsing_predict_lexeme(ps, conts, value, lentry->offset); + pgf_parsing_predict_lexeme(ps, conts, value, lentry->offset, lentry->sym_idx); PgfProductionIdxEntry* start = gu_buf_data(lentry->idx); @@ -1369,7 +1375,7 @@ pgf_parsing_td_predict(PgfParsing* ps, while (left >= start && value->ccat->fid == left->ccat->fid && value->lin_idx == left->lin_idx) { - pgf_parsing_predict_lexeme(ps, conts, left, lentry->offset); + pgf_parsing_predict_lexeme(ps, conts, left, lentry->offset, lentry->sym_idx); left--; } @@ -1377,7 +1383,7 @@ pgf_parsing_td_predict(PgfParsing* ps, while (right <= end && value->ccat->fid == right->ccat->fid && value->lin_idx == right->lin_idx) { - pgf_parsing_predict_lexeme(ps, conts, right, lentry->offset); + pgf_parsing_predict_lexeme(ps, conts, right, lentry->offset, lentry->sym_idx); right++; } } @@ -2372,8 +2378,9 @@ pgf_sequence_cmp_fn(GuOrder* order, const void* p1, const void* p2) GuString sent = 
(GuString) p1; const PgfSequence* sp2 = p2; - int res = pgf_symbols_cmp(&sent, sp2->syms, self->case_sensitive); - if (res == 0 && *sent != 0) { + size_t sym_idx = 0; + int res = pgf_symbols_cmp(&sent, sp2->syms, &sym_idx, self->case_sensitive); + if (res == 0 && (*sent != 0 || sym_idx != gu_seq_length(sp2->syms))) { res = 1; }