diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c index f944bc88d..f805ca137 100644 --- a/src/runtime/c/pgf/parser.c +++ b/src/runtime/c/pgf/parser.c @@ -54,7 +54,9 @@ typedef struct { int max_fid; PgfParseState *before; PgfParseState *after; - PgfExprEnum en; // enumeration for the generated trees + PgfToken prefix; + PgfTokenProb* tp; + PgfExprEnum en; // enumeration for the generated trees/tokens #ifdef PGF_COUNTS_DEBUG int item_full_count; int item_real_count; @@ -1278,7 +1280,7 @@ pgf_new_parse_state(PgfParsing* ps, size_t start_offset, BIND_TYPE bind_type) } size_t end_offset = start_offset; - GuString current = ps->sentence + start_offset; + GuString current = ps->sentence + end_offset; size_t len = strlen(current); while (skip_space(¤t, &len)) { end_offset++; @@ -1317,17 +1319,25 @@ pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item) GuString current = ps->sentence + ps->before->end_offset; size_t len = strlen(current); - if (!ps->before->needs_bind && cmp_string(¤t, &len, tok) == 0) { - PgfParseState* state = - pgf_new_parse_state(ps, (current - ps->sentence), BIND_NONE); - if (state->next == NULL) { - state->viterbi_prob = - item->inside_prob+item->conts->outside_prob; + if (ps->prefix != NULL && ps->sentence[ps->before->end_offset] == 0) { + if (gu_string_is_prefix(ps->prefix, tok)) { + ps->tp = gu_new(PgfTokenProb, ps->out_pool); + ps->tp->tok = tok; + ps->tp->prob = item->inside_prob + item->conts->outside_prob; } + } else { + if (!ps->before->needs_bind && cmp_string(¤t, &len, tok) == 0) { + PgfParseState* state = + pgf_new_parse_state(ps, (current - ps->sentence), BIND_NONE); + if (state->next == NULL) { + state->viterbi_prob = + item->inside_prob+item->conts->outside_prob; + } - gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item); - } else { - pgf_item_free(ps, item); + gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item); + } else { + pgf_item_free(ps, item); + } } } @@ -1361,53 +1371,68 @@ pgf_parsing_td_predict(PgfParsing* ps, item->inside_prob-conts->ccat->viterbi_prob+ item->conts->outside_prob; - // Top-down prediction for syntactic rules - for (size_t i = 0; i < ccat->n_synprods; i++) { - PgfProduction prod = - gu_seq_get(ccat->prods, PgfProduction, i); - pgf_parsing_production(ps, ps->before, conts, prod); - } + if (ps->prefix != NULL) { + // We do completion: + // - top-down prediction for both syntactic and lexical rules + size_t n_prods; + if (ccat->fid < ps->concr->total_cats) // in grammar + n_prods = gu_seq_length(ccat->prods); + else + n_prods = ccat->n_synprods; + for (size_t i = 0; i < n_prods; i++) { + PgfProduction prod = + gu_seq_get(ccat->prods, PgfProduction, i); + pgf_parsing_production(ps, ps->before, conts, prod); + } + } else { + // Top-down prediction for syntactic rules + for (size_t i = 0; i < ccat->n_synprods; i++) { + PgfProduction prod = + gu_seq_get(ccat->prods, PgfProduction, i); + pgf_parsing_production(ps, ps->before, conts, prod); + } - // Bottom-up prediction for lexical and epsilon rules - size_t n_idcs = gu_buf_length(ps->before->lexicon_idx); - for (size_t i = 0; i < n_idcs; i++) { - PgfLexiconIdxEntry* lentry = - gu_buf_index(ps->before->lexicon_idx, PgfLexiconIdxEntry, i); - PgfParseState* state = - pgf_new_parse_state(ps, lentry->offset, lentry->bind_type); + // Bottom-up prediction for lexical and epsilon rules + size_t n_idcs = gu_buf_length(ps->before->lexicon_idx); + for (size_t i = 0; i < n_idcs; i++) { + PgfLexiconIdxEntry* lentry = + gu_buf_index(ps->before->lexicon_idx, PgfLexiconIdxEntry, i); + PgfParseState* state = + pgf_new_parse_state(ps, lentry->offset, lentry->bind_type); - if (state != NULL) { - PgfProductionIdxEntry key; - key.ccat = ccat; - key.lin_idx = lin_idx; - key.papp = NULL; - PgfProductionIdxEntry* value = - gu_seq_binsearch(gu_buf_data_seq(lentry->idx), - pgf_production_idx_entry_order, - PgfProductionIdxEntry, &key); + if (state != NULL) { + PgfProductionIdxEntry key; + key.ccat = ccat; + key.lin_idx = lin_idx; + key.papp = NULL; + PgfProductionIdxEntry* value = + gu_seq_binsearch(gu_buf_data_seq(lentry->idx), + pgf_production_idx_entry_order, + PgfProductionIdxEntry, &key); - if (value != NULL) { - pgf_parsing_predict_lexeme(ps, state, conts, value); + if (value != NULL) { + pgf_parsing_predict_lexeme(ps, state, conts, value); - PgfProductionIdxEntry* start = - gu_buf_data(lentry->idx); - PgfProductionIdxEntry* end = - start + gu_buf_length(lentry->idx)-1; + PgfProductionIdxEntry* start = + gu_buf_data(lentry->idx); + PgfProductionIdxEntry* end = + start + gu_buf_length(lentry->idx)-1; - PgfProductionIdxEntry* left = value-1; - while (left >= start && - value->ccat->fid == left->ccat->fid && - value->lin_idx == left->lin_idx) { - pgf_parsing_predict_lexeme(ps, state, conts, left); - left--; - } + PgfProductionIdxEntry* left = value-1; + while (left >= start && + value->ccat->fid == left->ccat->fid && + value->lin_idx == left->lin_idx) { + pgf_parsing_predict_lexeme(ps, state, conts, left); + left--; + } - PgfProductionIdxEntry* right = value+1; - while (right <= end && - value->ccat->fid == right->ccat->fid && - value->lin_idx == right->lin_idx) { - pgf_parsing_predict_lexeme(ps, state, conts, right); - right++; + PgfProductionIdxEntry* right = value+1; + while (right <= end && + value->ccat->fid == right->ccat->fid && + value->lin_idx == right->lin_idx) { + pgf_parsing_predict_lexeme(ps, state, conts, right); + right++; + } } } } @@ -1871,6 +1896,8 @@ pgf_new_parsing(PgfConcr* concr, GuString sentence, ps->ccat_full_count = 0; ps->prod_full_count = 0; #endif + ps->prefix = NULL; + ps->tp = NULL; ps->free_item = NULL; ps->heuristic_factor = 0; ps->meta_prob = INFINITY; @@ -1906,157 +1933,6 @@ void pgf_parsing_print_counts(PgfParsing* ps) } #endif -/*static bool -************* -typedef struct { - PgfTokenState ts; - PgfToken tok; - PgfProductionIdx *lexicon_idx; -} PgfRealTokenState; - -static bool -pgf_real_match_token(PgfTokenState* ts, PgfToken tok, PgfItem* item) -{ - return strcmp(gu_container(ts, PgfRealTokenState, ts)->tok, tok) == 0; -} - -static PgfToken -pgf_real_get_token(PgfTokenState* ts) { - return gu_container(ts, PgfRealTokenState, ts)->tok; -} - -static PgfProductionIdx* -pgf_real_get_lexicon_idx(PgfTokenState* ts) { - return gu_container(ts, PgfRealTokenState, ts)->lexicon_idx; -} - -static PgfTokenFn pgf_tsfn_PgfRealTokenState = { - pgf_real_match_token, - pgf_real_get_token, - pgf_real_get_lexicon_idx -}; - -PgfParseState* -pgf_parser_next_state(PgfParseState* prev, PgfToken tok) -{ -#ifdef PGF_COUNTS_DEBUG - pgf_parsing_print_counts(prev->ps); -#endif - - PgfRealTokenState* ts = - pgf_new_token_state(PgfRealTokenState, prev->ps->pool); - ts->tok = tok; - ts->lexicon_idx = gu_map_get(prev->ps->concr->leftcorner_tok_idx, - tok, PgfProductionIdx*); - if (ts->lexicon_idx != NULL) { - PgfLexiconFn clo = { { pgf_parser_compute_lexicon_prob }, &ts->ts }; - gu_map_iter(ts->lexicon_idx, &clo.fn, NULL); - } - if (ts->ts.lexical_prob == INFINITY) - ts->ts.lexical_prob = 0; - - PgfParseState* state = - pgf_new_parse_state(prev->ps, prev, &ts->ts, prev->ps->pool); - - while (gu_buf_length(state->agenda) == 0) { - if (!pgf_parsing_proceed(state)) - return NULL; - } - - return state; -} - -typedef struct { - PgfTokenState ts; - GuEnum en; - GuString prefix; - PgfTokenProb* tp; - GuPool* pool; - PgfParseState* state; -} PgfPrefixTokenState; - -static bool -^ ^ ^ ^ ^ ^ ^ -pgf_prefix_match_token(PgfTokenState* ts0, PgfToken tok, PgfItem* item) -{ - PgfPrefixTokenState* ts = - gu_container(ts0, PgfPrefixTokenState, ts); - - if (gu_string_is_prefix(ts->prefix, tok)) { - size_t lin_idx; - PgfSequence* seq; - pgf_item_sequence(item, &lin_idx, &seq, ts->pool); - - uint16_t seq_idx = item->seq_idx; - uint8_t tok_idx = item->tok_idx; - - // go one token back - if (tok_idx > 0) - tok_idx--; - else - seq_idx--; - - ts->tp = gu_new(PgfTokenProb, ts->pool); - ts->tp->tok = - pgf_get_tokens(seq, seq_idx, tok_idx, ts->pool); - ts->tp->cat = item->conts->ccat->cnccat->abscat->name; - ts->tp->prob = item->inside_prob+item->conts->outside_prob; - } - - return false; -} - -static PgfToken -pgf_prefix_get_token(PgfTokenState* ts) { - return ""; -} - -static PgfProductionIdx* -pgf_prefix_get_lexicon_idx(PgfTokenState* ts) { - return NULL; -} - -static PgfTokenFn pgf_tsfn_PgfPrefixTokenState = { - pgf_prefix_match_token, - pgf_prefix_get_token, - pgf_prefix_get_lexicon_idx -}; - -static void -pgf_parser_completions_next(GuEnum* self, void* to, GuPool* pool) -{ - PgfPrefixTokenState* ts = - gu_container(self, PgfPrefixTokenState, en); - - ts->tp = NULL; - ts->pool = pool; - while (ts->tp == NULL) { - if (!pgf_parsing_proceed(ts->state)) - break; - } - - *((PgfTokenProb**)to) = ts->tp; -}*/ - -GuEnum* -pgf_parsing_completions(PgfParsing* ps, GuString prefix) -{ -#ifdef PGF_COUNTS_DEBUG - pgf_parsing_print_counts(ps); -#endif - -/* PgfPrefixTokenState* ts = - pgf_new_token_state(PgfPrefixTokenState, prev->ps->pool); - ts->en.next = pgf_parser_completions_next; - ts->prefix = prefix; - ts->tp = NULL; - ts->state = - pgf_new_parse_state(prev->ps, prev, &ts->ts); - - return &ts->en;*/ - return NULL; -} - static int cmp_expr_state(GuOrder* self, const void* a, const void* b) { @@ -2501,35 +2377,68 @@ pgf_parse_with_heuristics(PgfConcr* concr, PgfCId cat, GuString sentence, return &ps->en; } +static void +pgf_parser_completions_next(GuEnum* self, void* to, GuPool* pool) +{ + PgfParsing* ps = + gu_container(self, PgfParsing, en); + + ps->tp = NULL; + while (ps->tp == NULL) { + if (!pgf_parsing_proceed(ps)) + break; + +#ifdef PGF_COUNTS_DEBUG + pgf_parsing_print_counts(ps); +#endif + } + + *((PgfTokenProb**)to) = ps->tp; +} + GuEnum* pgf_complete(PgfConcr* concr, PgfCId cat, GuString sentence, GuString prefix, GuExn *err, GuPool* pool) { - // Begin parsing a sentence of the specified category + if (concr->sequences == NULL || + concr->pre_sequences == NULL || + concr->cnccats == NULL) { + GuExnData* err_data = gu_raise(err, PgfExn); + if (err_data) { + err_data->data = "The concrete syntax is not loaded"; + return NULL; + } + } + + // Begin parsing a sentence with the specified category PgfParsing* ps = - pgf_parsing_init(concr, cat, 0, sentence, -1, err, pool, pool); + pgf_parsing_init(concr, cat, 0, sentence, -1.0, err, pool, pool); if (ps == NULL) { return NULL; } - // Tokenization - GuExn* lex_err = gu_new_exn(NULL, gu_kind(type), pool); -/* PgfToken tok = pgf_lexer_read_token(lexer, lex_err); - while (!gu_exn_is_raised(lex_err)) { - // feed the token to get a new parse state - state = pgf_parser_next_state(state, tok); - if (state == NULL) { +#ifdef PGF_COUNTS_DEBUG + pgf_parsing_print_counts(ps); +#endif + + size_t len = strlen(ps->sentence); + while (ps->before->end_offset < len) { + if (!pgf_parsing_proceed(ps)) { + GuExnData* exn = gu_raise(err, PgfParseError); + exn->data = (void*) pgf_parsing_last_token(ps, exn->pool); return NULL; } - tok = pgf_lexer_read_token(lexer, lex_err); - }*/ +#ifdef PGF_COUNTS_DEBUG + pgf_parsing_print_counts(ps); +#endif + } - if (gu_exn_caught(lex_err) != gu_type(GuEOF)) - return NULL; - - // Now begin enumerating the resulting syntax trees - return pgf_parsing_completions(ps, prefix); + // Now begin enumerating the completions + ps->en.next = pgf_parser_completions_next; + ps->prefix = prefix; + ps->tp = NULL; + return &ps->en; } static void