From b56591c6b6b3315bdb97164dca574c43292f4d98 Mon Sep 17 00:00:00 2001 From: krangelov Date: Tue, 25 Jun 2019 12:58:28 +0200 Subject: [PATCH] the parser now ensures that all word senses are in the chart --- src/runtime/c/pgf/parser.c | 421 ++++++++++++++++++++---------------- src/runtime/c/pgf/scanner.c | 34 +-- 2 files changed, 254 insertions(+), 201 deletions(-) diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c index 3f67a4988..baf1e3eb3 100644 --- a/src/runtime/c/pgf/parser.c +++ b/src/runtime/c/pgf/parser.c @@ -61,14 +61,6 @@ typedef struct { typedef enum { BIND_NONE, BIND_HARD, BIND_SOFT } BIND_TYPE; -typedef struct { - PgfProductionIdx* idx; - size_t offset; - size_t sym_idx; -} PgfLexiconIdxEntry; - -typedef GuBuf PgfLexiconIdx; - struct PgfParseState { PgfParseState* next; @@ -81,8 +73,6 @@ struct PgfParseState { size_t end_offset; prob_t viterbi_prob; - - PgfLexiconIdx* lexicon_idx; }; typedef struct PgfAnswers { @@ -442,7 +432,7 @@ pgf_print_expr_state0(PgfExprState* st, #endif PGF_INTERNAL_DECL int -cmp_string(GuString* psent, size_t* ppos, GuString tok, +cmp_string(PgfCohortSpot* spot, GuString tok, bool case_sensitive); PGF_INTERNAL_DECL bool @@ -764,6 +754,25 @@ static void pgf_result_production(PgfParsing* ps, PgfAnswers* answers, PgfProduction prod); +static void +pgf_parsing_push_item(PgfParseState* state, PgfItem* item) +{ + if (gu_buf_length(state->agenda) == 0) { + state->viterbi_prob = + item->inside_prob+item->conts->outside_prob; + } + gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item); +} + +static void +pgf_parsing_push_production(PgfParsing* ps, PgfParseState* state, + PgfItemConts* conts, PgfProduction prod) +{ + PgfItem* item = + pgf_new_item(ps, conts, prod); + gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item); +} + static void pgf_parsing_combine(PgfParsing* ps, PgfParseState* before, PgfParseState* after, @@ -786,16 +795,7 @@ pgf_parsing_combine(PgfParsing* ps, } pgf_item_advance(item, ps->pool); - gu_buf_heap_push(before->agenda, pgf_item_prob_order, &item); -} - -static void -pgf_parsing_production(PgfParsing* ps, PgfParseState* state, - PgfItemConts* conts, PgfProduction prod) -{ - PgfItem* item = - pgf_new_item(ps, conts, prod); - gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item); + pgf_parsing_push_item(before, item); } static PgfProduction @@ -931,7 +931,7 @@ pgf_parsing_complete(PgfParsing* ps, PgfItem* item, PgfExprProb *ep) * production immediately to the agenda, * i.e. process it. */ if (conts2) { - pgf_parsing_production(ps, ps->before, conts2, prod); + pgf_parsing_push_production(ps, ps->before, conts2, prod); } } } @@ -952,7 +952,7 @@ pgf_parsing_complete(PgfParsing* ps, PgfItem* item, PgfExprProb *ep) * production immediately to the agenda, * i.e. process it. */ if (conts2) { - pgf_parsing_production(ps, state, conts2, prod); + pgf_parsing_push_production(ps, state, conts2, prod); } } } @@ -972,66 +972,9 @@ pgf_parsing_complete(PgfParsing* ps, PgfItem* item, PgfExprProb *ep) } } -PGF_INTERNAL_DECL int -pgf_symbols_cmp(GuString* psent, size_t* ppos, - PgfSymbols* syms, size_t* sym_idx, - bool case_sensitive); - -static void -pgf_parsing_lookahead(PgfParsing *ps, PgfParseState* state, - int i, int j, ptrdiff_t min, ptrdiff_t max) -{ - // This is a variation of a binary search algorithm which - // can retrieve all prefixes of a string with minimal - // comparisons, i.e. there is no need to lookup every - // prefix separately. - - while (i <= j) { - int k = (i+j) / 2; - PgfSequence* seq = gu_seq_index(ps->concr->sequences, PgfSequence, k); - - GuString start = ps->sentence + state->end_offset; - GuString current = start; - size_t pos = 0; - size_t sym_idx = 0; - int cmp = pgf_symbols_cmp(¤t, &pos, seq->syms, &sym_idx, ps->case_sensitive); - if (cmp < 0) { - j = k-1; - } else if (cmp > 0) { - ptrdiff_t len = current - start; - - if (min <= len) - pgf_parsing_lookahead(ps, state, i, k-1, min, len); - - if (len+1 <= max) - pgf_parsing_lookahead(ps, state, k+1, j, len+1, max); - - break; - } else { - ptrdiff_t len = current - start; - - if (min <= len-1) - pgf_parsing_lookahead(ps, state, i, k-1, min, len-1); - - if (seq->idx != NULL) { - PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx); - entry->idx = seq->idx; - entry->offset = (size_t) (current - ps->sentence); - entry->sym_idx = sym_idx; - } - - if (len+1 <= max) - pgf_parsing_lookahead(ps, state, k+1, j, len+1, max); - - break; - } - } -} - static PgfParseState* pgf_new_parse_state(PgfParsing* ps, size_t start_offset, - BIND_TYPE bind_type, - prob_t viterbi_prob) + BIND_TYPE bind_type) { PgfParseState** pstate; if (ps->before == NULL && start_offset == 0) @@ -1083,44 +1026,173 @@ pgf_new_parse_state(PgfParsing* ps, size_t start_offset, (start_offset == end_offset); state->start_offset = start_offset; state->end_offset = end_offset; - state->viterbi_prob = viterbi_prob; - state->lexicon_idx = - gu_new_buf(PgfLexiconIdxEntry, ps->pool); + state->viterbi_prob = 0; if (ps->before == NULL && start_offset == 0) state->needs_bind = false; - if (gu_seq_length(ps->concr->sequences) > 0) { - // Add epsilon lexical rules to the bottom up index - PgfSequence* seq = gu_seq_index(ps->concr->sequences, PgfSequence, 0); - if (gu_seq_length(seq->syms) == 0 && seq->idx != NULL) { - PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx); - entry->idx = seq->idx; - entry->offset = state->start_offset; - entry->sym_idx= 0; - } - - // Add non-epsilon lexical rules to the bottom up index - if (!state->needs_bind) { - pgf_parsing_lookahead(ps, state, - 0, gu_seq_length(ps->concr->sequences)-1, - 1, strlen(ps->sentence)-state->end_offset); - } - } - - *pstate = state; return state; } +PGF_INTERNAL_DECL int +pgf_symbols_cmp(PgfCohortSpot* spot, + PgfSymbols* syms, size_t* sym_idx, + bool case_sensitive); + +static bool +pgf_parsing_scan_helper(PgfParsing *ps, PgfParseState* state, + int i, int j, ptrdiff_t min, ptrdiff_t max) +{ + // This is a variation of a binary search algorithm which + // can retrieve all prefixes of a string with minimal + // comparisons, i.e. there is no need to lookup every + // prefix separately. + + bool found = false; + while (i <= j) { + int k = (i+j) / 2; + PgfSequence* seq = gu_seq_index(ps->concr->sequences, PgfSequence, k); + + PgfCohortSpot start = {0, ps->sentence+state->end_offset}; + PgfCohortSpot current = start; + + size_t sym_idx = 0; + int cmp = pgf_symbols_cmp(¤t, seq->syms, &sym_idx, ps->case_sensitive); + if (cmp < 0) { + j = k-1; + } else if (cmp > 0) { + ptrdiff_t len = current.ptr - start.ptr; + + if (min <= len) + if (pgf_parsing_scan_helper(ps, state, i, k-1, min, len)) + found = true; + + if (len+1 <= max) + if (pgf_parsing_scan_helper(ps, state, k+1, j, len+1, max)) + found = true; + + break; + } else { + ptrdiff_t len = current.ptr - start.ptr; + found = true; + + if (min <= len-1) + pgf_parsing_scan_helper(ps, state, i, k-1, min, len-1); + + // Here we do bottom-up prediction for all lexical categories. + // The epsilon productions will be predicted in top-down + // fashion while parsing. + if (seq->idx != NULL && len > 0) { + // A new state will mark the end of the current match + PgfParseState* new_state = + pgf_new_parse_state(ps, (size_t) (current.ptr - ps->sentence), BIND_NONE); + + // Bottom-up prediction for lexical rules + size_t n_entries = gu_buf_length(seq->idx); + for (size_t i = 0; i < n_entries; i++) { + PgfProductionIdxEntry* entry = + gu_buf_index(seq->idx, PgfProductionIdxEntry, i); + + PgfItemConts* conts = + pgf_parsing_get_conts(state, + entry->ccat, entry->lin_idx, + ps->pool); + + // Create the new category if it doesn't exist yet + PgfCCat* tmp_ccat = pgf_parsing_get_completed(new_state, conts); + PgfCCat* ccat = tmp_ccat; + if (ccat == NULL) { + ccat = pgf_parsing_create_completed(ps, new_state, conts, INFINITY); + } + + // Add the production + if (ccat->prods == NULL || ccat->n_synprods >= gu_seq_length(ccat->prods)) { + ccat->prods = gu_realloc_seq(ccat->prods, PgfProduction, ccat->n_synprods+1); + } + GuVariantInfo i; + i.tag = PGF_PRODUCTION_APPLY; + i.data = entry->papp; + PgfProduction prod = gu_variant_close(i); + gu_seq_set(ccat->prods, PgfProduction, ccat->n_synprods++, prod); + + // Update the category's probability to be minimum + if (ccat->viterbi_prob > entry->papp->fun->ep->prob) + ccat->viterbi_prob = entry->papp->fun->ep->prob; + +#ifdef PGF_PARSER_DEBUG + GuPool* tmp_pool = gu_new_pool(); + GuOut* out = gu_file_out(stderr, tmp_pool); + GuExn* err = gu_exn(tmp_pool); + if (tmp_ccat == NULL) { + gu_printf(out, err, "["); + pgf_print_range(state, new_state, out, err); + gu_puts("; ", out, err); + pgf_print_fid(conts->ccat->fid, out, err); + gu_printf(out, err, "; %d; ", + conts->lin_idx); + pgf_print_fid(ccat->fid, out, err); + gu_puts("]\n", out, err); + } + pgf_print_production(ccat->fid, prod, out, err); + gu_pool_free(tmp_pool); +#endif + } + } + + if (len+1 <= max) + pgf_parsing_scan_helper(ps, state, k+1, j, len+1, max); + + break; + } + } + + return found; +} + +static void +pgf_parsing_scan(PgfParsing *ps) +{ + size_t len = strlen(ps->sentence); + + PgfParseState* state = + pgf_new_parse_state(ps, 0, BIND_SOFT); + + while (state != NULL) { + if (state->needs_bind) { + // We have encountered two tokens without space in between. + // Those can be accepted only if there is a BIND token + // in between. We encode this by having one more state + // at the same offset. A transition between these two + // states is possible only with the BIND token. + state = + pgf_new_parse_state(ps, state->end_offset, BIND_HARD); + } + + if (!pgf_parsing_scan_helper + (ps, state, + 0, gu_seq_length(ps->concr->sequences)-1, + 1, len-state->end_offset)) { + // skip one character and try again + GuString s = ps->sentence+state->end_offset; + gu_utf8_decode((const uint8_t**) &s); + pgf_new_parse_state(ps, ps->sentence-s, BIND_NONE); + } + + if (state == ps->before) + state = ps->after; + else + state = state->next; + } +} + static void pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item) -{ - GuString current = ps->sentence + ps->before->end_offset; - size_t pos = 0; +{ + PgfCohortSpot current = {0, ps->sentence + ps->before->end_offset}; - if (ps->prefix != NULL && *current == 0) { + if (ps->prefix != NULL && *current.ptr == 0) { if (gu_string_is_prefix(ps->prefix, tok)) { PgfProductionApply* papp = gu_variant_data(item->prod); @@ -1131,39 +1203,17 @@ pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item) ps->tp->prob = item->inside_prob + item->conts->outside_prob; } } else { - if (!ps->before->needs_bind && cmp_string(¤t, &pos, tok, ps->case_sensitive) == 0) { + if (!ps->before->needs_bind && cmp_string(¤t, tok, ps->case_sensitive) == 0) { PgfParseState* state = - pgf_new_parse_state(ps, (current - ps->sentence), - BIND_NONE, - item->inside_prob+item->conts->outside_prob); - gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item); + pgf_new_parse_state(ps, (current.ptr - ps->sentence), + BIND_NONE); + pgf_parsing_push_item(state, item); } else { pgf_item_free(ps, item); } } } -static void -pgf_parsing_predict_lexeme(PgfParsing* ps, PgfItemConts* conts, - PgfProductionIdxEntry* entry, - size_t offset, size_t sym_idx) -{ - GuVariantInfo i = { PGF_PRODUCTION_APPLY, entry->papp }; - PgfProduction prod = gu_variant_close(i); - PgfItem* item = - pgf_new_item(ps, conts, prod); - PgfSymbols* syms = entry->papp->fun->lins[conts->lin_idx]->syms; - item->sym_idx = sym_idx; - pgf_item_set_curr_symbol(item, ps->pool); - prob_t prob = item->inside_prob+item->conts->outside_prob; - PgfParseState* state = - pgf_new_parse_state(ps, offset, BIND_NONE, prob); - if (state->viterbi_prob > prob) { - state->viterbi_prob = prob; - } - gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item); -} - static void pgf_parsing_td_predict(PgfParsing* ps, PgfItem* item, PgfCCat* ccat, size_t lin_idx) @@ -1201,44 +1251,46 @@ pgf_parsing_td_predict(PgfParsing* ps, for (size_t i = 0; i < n_prods; i++) { PgfProduction prod = gu_seq_get(ccat->prods, PgfProduction, i); - pgf_parsing_production(ps, ps->before, conts, prod); + pgf_parsing_push_production(ps, ps->before, conts, prod); } } else { // Top-down prediction for syntactic rules for (size_t i = 0; i < ccat->n_synprods; i++) { PgfProduction prod = gu_seq_get(ccat->prods, PgfProduction, i); - pgf_parsing_production(ps, ps->before, conts, prod); + pgf_parsing_push_production(ps, ps->before, conts, prod); } - // Bottom-up prediction for lexical and epsilon rules - size_t n_idcs = gu_buf_length(ps->before->lexicon_idx); - for (size_t i = 0; i < n_idcs; i++) { - PgfLexiconIdxEntry* lentry = - gu_buf_index(ps->before->lexicon_idx, PgfLexiconIdxEntry, i); + // Top-down prediction for epsilon lexical rules if any + PgfSequence* seq = gu_seq_index(ps->concr->sequences, PgfSequence, 0); + if (gu_seq_length(seq->syms) == 0 && seq->idx != NULL) { PgfProductionIdxEntry key; key.ccat = ccat; key.lin_idx = lin_idx; key.papp = NULL; PgfProductionIdxEntry* value = - gu_seq_binsearch(gu_buf_data_seq(lentry->idx), + gu_seq_binsearch(gu_buf_data_seq(seq->idx), pgf_production_idx_entry_order, PgfProductionIdxEntry, &key); if (value != NULL) { - pgf_parsing_predict_lexeme(ps, conts, value, lentry->offset, lentry->sym_idx); + GuVariantInfo i = { PGF_PRODUCTION_APPLY, value->papp }; + PgfProduction prod = gu_variant_close(i); + pgf_parsing_push_production(ps, ps->before, conts, prod); PgfProductionIdxEntry* start = - gu_buf_data(lentry->idx); + gu_buf_data(seq->idx); PgfProductionIdxEntry* end = - start + gu_buf_length(lentry->idx)-1; + start + gu_buf_length(seq->idx)-1; PgfProductionIdxEntry* left = value-1; while (left >= start && value->ccat->fid == left->ccat->fid && value->lin_idx == left->lin_idx) { - pgf_parsing_predict_lexeme(ps, conts, left, lentry->offset, lentry->sym_idx); + GuVariantInfo i = { PGF_PRODUCTION_APPLY, left->papp }; + PgfProduction prod = gu_variant_close(i); + pgf_parsing_push_production(ps, ps->before, conts, prod); left--; } @@ -1246,31 +1298,32 @@ pgf_parsing_td_predict(PgfParsing* ps, while (right <= end && value->ccat->fid == right->ccat->fid && value->lin_idx == right->lin_idx) { - pgf_parsing_predict_lexeme(ps, conts, right, lentry->offset, lentry->sym_idx); + GuVariantInfo i = { PGF_PRODUCTION_APPLY, right->papp }; + PgfProduction prod = gu_variant_close(i); + pgf_parsing_push_production(ps, ps->before, conts, prod); right++; } } } } - } else { - /* If it has already been completed, combine. */ + } + /* If the category has already been completed, combine. */ + PgfCCat* completed = + pgf_parsing_get_completed(ps->before, conts); + if (completed) { + pgf_parsing_combine(ps, ps->before, ps->after, item, completed, lin_idx); + } + + PgfParseState* state = ps->after; + while (state != NULL) { PgfCCat* completed = - pgf_parsing_get_completed(ps->before, conts); + pgf_parsing_get_completed(state, conts); if (completed) { - pgf_parsing_combine(ps, ps->before, ps->after, item, completed, lin_idx); + pgf_parsing_combine(ps, state, state->next, item, completed, lin_idx); } - PgfParseState* state = ps->after; - while (state != NULL) { - PgfCCat* completed = - pgf_parsing_get_completed(state, conts); - if (completed) { - pgf_parsing_combine(ps, state, state->next, item, completed, lin_idx); - } - - state = state->next; - } + state = state->next; } } @@ -1286,7 +1339,7 @@ pgf_parsing_pre(PgfParsing* ps, PgfItem* item, PgfSymbols* syms) } else { item->alt = 0; pgf_item_advance(item, ps->pool); - gu_buf_heap_push(ps->before->agenda, pgf_item_prob_order, &item); + pgf_parsing_push_item(ps->before, item); } } @@ -1405,9 +1458,8 @@ pgf_parsing_symbol(PgfParsing* ps, PgfItem* item, PgfSymbol sym) item->curr_sym = pgf_collect_extern_tok(ps,start,offset); item->sym_idx = pgf_item_symbols_length(item); PgfParseState* state = - pgf_new_parse_state(ps, offset, BIND_NONE, - item->inside_prob+item->conts->outside_prob); - gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item); + pgf_new_parse_state(ps, offset, BIND_NONE); + pgf_parsing_push_item(state, item); match = true; } } @@ -1450,11 +1502,10 @@ pgf_parsing_symbol(PgfParsing* ps, PgfItem* item, PgfSymbol sym) if (ps->before->start_offset == ps->before->end_offset && ps->before->needs_bind) { PgfParseState* state = - pgf_new_parse_state(ps, ps->before->end_offset, BIND_HARD, - item->inside_prob+item->conts->outside_prob); + pgf_new_parse_state(ps, ps->before->end_offset, BIND_HARD); if (state != NULL) { pgf_item_advance(item, ps->pool); - gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item); + pgf_parsing_push_item(state, item); } else { pgf_item_free(ps, item); } @@ -1468,11 +1519,10 @@ pgf_parsing_symbol(PgfParsing* ps, PgfItem* item, PgfSymbol sym) if (ps->before->start_offset == ps->before->end_offset) { if (ps->before->needs_bind) { PgfParseState* state = - pgf_new_parse_state(ps, ps->before->end_offset, BIND_HARD, - item->inside_prob+item->conts->outside_prob); + pgf_new_parse_state(ps, ps->before->end_offset, BIND_HARD); if (state != NULL) { pgf_item_advance(item, ps->pool); - gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item); + pgf_parsing_push_item(state, item); } else { pgf_item_free(ps, item); } @@ -1481,7 +1531,7 @@ pgf_parsing_symbol(PgfParsing* ps, PgfItem* item, PgfSymbol sym) } } else { pgf_item_advance(item, ps->pool); - gu_buf_heap_push(ps->before->agenda, pgf_item_prob_order, &item); + pgf_parsing_push_item(ps->before, item); } break; } @@ -1573,7 +1623,7 @@ static void pgf_parsing_set_default_factors(PgfParsing* ps, PgfAbstr* abstr) { PgfFlag* flag; - + flag = gu_seq_binsearch(abstr->aflags, pgf_flag_order, PgfFlag, "heuristic_search_factor"); if (flag != NULL) { @@ -1803,8 +1853,7 @@ pgf_parsing_init(PgfConcr* concr, PgfCId cat, ps->heuristic_factor = heuristic_factor; } - PgfParseState* state = - pgf_new_parse_state(ps, 0, BIND_SOFT, 0); + pgf_parsing_scan(ps); int fidString = -1; PgfCCat* start_ccat = gu_new(PgfCCat, ps->pool); @@ -1823,7 +1872,7 @@ pgf_parsing_init(PgfConcr* concr, PgfCId cat, #endif PgfItemConts* conts = - pgf_parsing_get_conts(state, start_ccat, 0, ps->pool); + pgf_parsing_get_conts(ps->before, start_ccat, 0, ps->pool); gu_buf_push(conts->items, PgfItem*, NULL); #ifdef PGF_COUNTS_DEBUG @@ -1849,7 +1898,7 @@ pgf_parsing_init(PgfConcr* concr, PgfCId cat, PgfItem* item = gu_new(PgfItem, ps->pool); item->args = args; - item->inside_prob += ccat->viterbi_prob; + item->inside_prob = 0; item->conts = conts; item->prod = prod; item->curr_sym = gu_null_variant; @@ -1866,7 +1915,7 @@ pgf_parsing_init(PgfConcr* concr, PgfCId cat, ps->item_real_count++; #endif - gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item); + gu_buf_heap_push(ps->before->agenda, pgf_item_prob_order, &item); } } } @@ -1884,10 +1933,18 @@ pgf_parsing_proceed(PgfParsing* ps) best_prob = gu_buf_get(ps->expr_queue, PgfExprState*, 0)->ep.prob; } - prob_t delta_prob = 0; - PgfParseState* st = ps->before; + PgfParseState* st = ps->before; + PgfParseState* last = NULL; + prob_t delta_prob = 0; while (st != NULL) { if (gu_buf_length(st->agenda) > 0) { + if (last != NULL) { + delta_prob += + (last->viterbi_prob-st->viterbi_prob) * + ps->heuristic_factor; + } + last = st; + PgfItem* item = gu_buf_get(st->agenda, PgfItem*, 0); prob_t item_prob = item->inside_prob+item->conts->outside_prob+delta_prob; @@ -1900,15 +1957,11 @@ pgf_parsing_proceed(PgfParsing* ps) ps->after = ps->before; ps->before = tmp; } - + has_progress = true; } } - prob_t state_delta = - (st->viterbi_prob-(st->next ? st->next->viterbi_prob : 0))* - ps->heuristic_factor; - delta_prob += state_delta; st = st->next; } diff --git a/src/runtime/c/pgf/scanner.c b/src/runtime/c/pgf/scanner.c index cdbf56cc8..dae857ff1 100644 --- a/src/runtime/c/pgf/scanner.c +++ b/src/runtime/c/pgf/scanner.c @@ -3,7 +3,7 @@ #include PGF_INTERNAL int -cmp_string(GuString* psent, size_t* ppos, GuString tok, +cmp_string(PgfCohortSpot* spot, GuString tok, bool case_sensitive) { for (;;) { @@ -11,7 +11,7 @@ cmp_string(GuString* psent, size_t* ppos, GuString tok, if (c2 == 0) return 0; - const uint8_t* p = (uint8_t*) *psent; + const uint8_t* p = (uint8_t*) spot->ptr; GuUCS c1 = gu_utf8_decode(&p); if (c1 == 0) return -1; @@ -22,8 +22,8 @@ cmp_string(GuString* psent, size_t* ppos, GuString tok, if (c1 != c2) return (c1-c2); - *psent = (GuString) p; - (*ppos)++; + spot->ptr = (GuString) p; + spot->pos++; } } @@ -40,7 +40,7 @@ skip_space(GuString* psent, size_t* ppos) } PGF_INTERNAL int -pgf_symbols_cmp(GuString* psent, size_t* ppos, +pgf_symbols_cmp(PgfCohortSpot* spot, PgfSymbols* syms, size_t* sym_idx, bool case_sensitive) { @@ -49,14 +49,14 @@ pgf_symbols_cmp(GuString* psent, size_t* ppos, PgfSymbol sym = gu_seq_get(syms, PgfSymbol, *sym_idx); if (*sym_idx > 0) { - if (!skip_space(psent,ppos)) { - if (**psent == 0) + if (!skip_space(&spot->ptr,&spot->pos)) { + if (*spot->ptr == 0) return -1; return 1; } - while (**psent != 0) { - if (!skip_space(psent,ppos)) + while (*spot->ptr != 0) { + if (!skip_space(&spot->ptr,&spot->pos)) break; } } @@ -66,16 +66,16 @@ pgf_symbols_cmp(GuString* psent, size_t* ppos, case PGF_SYMBOL_CAT: case PGF_SYMBOL_LIT: case PGF_SYMBOL_VAR: { - if (**psent == 0) + if (*spot->ptr == 0) return -1; return 1; } case PGF_SYMBOL_KS: { PgfSymbolKS* pks = inf.data; - if (**psent == 0) + if (*spot->ptr == 0) return -1; - int cmp = cmp_string(psent,ppos,pks->token, case_sensitive); + int cmp = cmp_string(spot,pks->token, case_sensitive); if (cmp != 0) return cmp; break; @@ -131,14 +131,13 @@ pgf_sequence_cmp_fn(GuOrder* order, const void* p1, const void* p2) { PgfSequenceOrder* self = gu_container(order, PgfSequenceOrder, order); - size_t pos = 0; - GuString sent = (GuString) p1; + PgfCohortSpot spot = {0, (GuString) p1}; const PgfSequence* sp2 = p2; size_t sym_idx = 0; - int res = pgf_symbols_cmp(&sent, &pos, sp2->syms, &sym_idx, self->case_sensitive); - if (res == 0 && (*sent != 0 || sym_idx != gu_seq_length(sp2->syms))) { + int res = pgf_symbols_cmp(&spot, sp2->syms, &sym_idx, self->case_sensitive); + if (res == 0 && (*spot.ptr != 0 || sym_idx != gu_seq_length(sp2->syms))) { res = 1; } @@ -210,7 +209,7 @@ pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot, PgfCohortSpot current = *spot; size_t sym_idx = 0; - int cmp = pgf_symbols_cmp(¤t.ptr, ¤t.pos, seq->syms, &sym_idx, state->case_sensitive); + int cmp = pgf_symbols_cmp(¤t, seq->syms, &sym_idx, state->case_sensitive); if (cmp < 0) { j = k-1; } else if (cmp > 0) { @@ -273,6 +272,7 @@ pgf_lookup_cohorts_enum_next(GuEnum* self, void* to, GuPool* pool) 1, (state->sentence+state->len)-spot.ptr); if (gu_buf_length(state->found) == 0) { + // skip one character and try again gu_utf8_decode((const uint8_t**) &spot.ptr); spot.pos++; gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &spot);