mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-09 04:59:31 -06:00
the parser now ensures that all word senses are in the chart
This commit is contained in:
@@ -61,14 +61,6 @@ typedef struct {
|
||||
|
||||
typedef enum { BIND_NONE, BIND_HARD, BIND_SOFT } BIND_TYPE;
|
||||
|
||||
typedef struct {
|
||||
PgfProductionIdx* idx;
|
||||
size_t offset;
|
||||
size_t sym_idx;
|
||||
} PgfLexiconIdxEntry;
|
||||
|
||||
typedef GuBuf PgfLexiconIdx;
|
||||
|
||||
struct PgfParseState {
|
||||
PgfParseState* next;
|
||||
|
||||
@@ -81,8 +73,6 @@ struct PgfParseState {
|
||||
size_t end_offset;
|
||||
|
||||
prob_t viterbi_prob;
|
||||
|
||||
PgfLexiconIdx* lexicon_idx;
|
||||
};
|
||||
|
||||
typedef struct PgfAnswers {
|
||||
@@ -442,7 +432,7 @@ pgf_print_expr_state0(PgfExprState* st,
|
||||
#endif
|
||||
|
||||
PGF_INTERNAL_DECL int
|
||||
cmp_string(GuString* psent, size_t* ppos, GuString tok,
|
||||
cmp_string(PgfCohortSpot* spot, GuString tok,
|
||||
bool case_sensitive);
|
||||
|
||||
PGF_INTERNAL_DECL bool
|
||||
@@ -764,6 +754,25 @@ static void
|
||||
pgf_result_production(PgfParsing* ps,
|
||||
PgfAnswers* answers, PgfProduction prod);
|
||||
|
||||
static void
|
||||
pgf_parsing_push_item(PgfParseState* state, PgfItem* item)
|
||||
{
|
||||
if (gu_buf_length(state->agenda) == 0) {
|
||||
state->viterbi_prob =
|
||||
item->inside_prob+item->conts->outside_prob;
|
||||
}
|
||||
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
|
||||
}
|
||||
|
||||
static void
|
||||
pgf_parsing_push_production(PgfParsing* ps, PgfParseState* state,
|
||||
PgfItemConts* conts, PgfProduction prod)
|
||||
{
|
||||
PgfItem* item =
|
||||
pgf_new_item(ps, conts, prod);
|
||||
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
|
||||
}
|
||||
|
||||
static void
|
||||
pgf_parsing_combine(PgfParsing* ps,
|
||||
PgfParseState* before, PgfParseState* after,
|
||||
@@ -786,16 +795,7 @@ pgf_parsing_combine(PgfParsing* ps,
|
||||
}
|
||||
|
||||
pgf_item_advance(item, ps->pool);
|
||||
gu_buf_heap_push(before->agenda, pgf_item_prob_order, &item);
|
||||
}
|
||||
|
||||
static void
|
||||
pgf_parsing_production(PgfParsing* ps, PgfParseState* state,
|
||||
PgfItemConts* conts, PgfProduction prod)
|
||||
{
|
||||
PgfItem* item =
|
||||
pgf_new_item(ps, conts, prod);
|
||||
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
|
||||
pgf_parsing_push_item(before, item);
|
||||
}
|
||||
|
||||
static PgfProduction
|
||||
@@ -931,7 +931,7 @@ pgf_parsing_complete(PgfParsing* ps, PgfItem* item, PgfExprProb *ep)
|
||||
* production immediately to the agenda,
|
||||
* i.e. process it. */
|
||||
if (conts2) {
|
||||
pgf_parsing_production(ps, ps->before, conts2, prod);
|
||||
pgf_parsing_push_production(ps, ps->before, conts2, prod);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -952,7 +952,7 @@ pgf_parsing_complete(PgfParsing* ps, PgfItem* item, PgfExprProb *ep)
|
||||
* production immediately to the agenda,
|
||||
* i.e. process it. */
|
||||
if (conts2) {
|
||||
pgf_parsing_production(ps, state, conts2, prod);
|
||||
pgf_parsing_push_production(ps, state, conts2, prod);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -972,66 +972,9 @@ pgf_parsing_complete(PgfParsing* ps, PgfItem* item, PgfExprProb *ep)
|
||||
}
|
||||
}
|
||||
|
||||
PGF_INTERNAL_DECL int
|
||||
pgf_symbols_cmp(GuString* psent, size_t* ppos,
|
||||
PgfSymbols* syms, size_t* sym_idx,
|
||||
bool case_sensitive);
|
||||
|
||||
static void
|
||||
pgf_parsing_lookahead(PgfParsing *ps, PgfParseState* state,
|
||||
int i, int j, ptrdiff_t min, ptrdiff_t max)
|
||||
{
|
||||
// This is a variation of a binary search algorithm which
|
||||
// can retrieve all prefixes of a string with minimal
|
||||
// comparisons, i.e. there is no need to lookup every
|
||||
// prefix separately.
|
||||
|
||||
while (i <= j) {
|
||||
int k = (i+j) / 2;
|
||||
PgfSequence* seq = gu_seq_index(ps->concr->sequences, PgfSequence, k);
|
||||
|
||||
GuString start = ps->sentence + state->end_offset;
|
||||
GuString current = start;
|
||||
size_t pos = 0;
|
||||
size_t sym_idx = 0;
|
||||
int cmp = pgf_symbols_cmp(¤t, &pos, seq->syms, &sym_idx, ps->case_sensitive);
|
||||
if (cmp < 0) {
|
||||
j = k-1;
|
||||
} else if (cmp > 0) {
|
||||
ptrdiff_t len = current - start;
|
||||
|
||||
if (min <= len)
|
||||
pgf_parsing_lookahead(ps, state, i, k-1, min, len);
|
||||
|
||||
if (len+1 <= max)
|
||||
pgf_parsing_lookahead(ps, state, k+1, j, len+1, max);
|
||||
|
||||
break;
|
||||
} else {
|
||||
ptrdiff_t len = current - start;
|
||||
|
||||
if (min <= len-1)
|
||||
pgf_parsing_lookahead(ps, state, i, k-1, min, len-1);
|
||||
|
||||
if (seq->idx != NULL) {
|
||||
PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx);
|
||||
entry->idx = seq->idx;
|
||||
entry->offset = (size_t) (current - ps->sentence);
|
||||
entry->sym_idx = sym_idx;
|
||||
}
|
||||
|
||||
if (len+1 <= max)
|
||||
pgf_parsing_lookahead(ps, state, k+1, j, len+1, max);
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static PgfParseState*
|
||||
pgf_new_parse_state(PgfParsing* ps, size_t start_offset,
|
||||
BIND_TYPE bind_type,
|
||||
prob_t viterbi_prob)
|
||||
BIND_TYPE bind_type)
|
||||
{
|
||||
PgfParseState** pstate;
|
||||
if (ps->before == NULL && start_offset == 0)
|
||||
@@ -1083,44 +1026,173 @@ pgf_new_parse_state(PgfParsing* ps, size_t start_offset,
|
||||
(start_offset == end_offset);
|
||||
state->start_offset = start_offset;
|
||||
state->end_offset = end_offset;
|
||||
state->viterbi_prob = viterbi_prob;
|
||||
state->lexicon_idx =
|
||||
gu_new_buf(PgfLexiconIdxEntry, ps->pool);
|
||||
state->viterbi_prob = 0;
|
||||
|
||||
if (ps->before == NULL && start_offset == 0)
|
||||
state->needs_bind = false;
|
||||
|
||||
if (gu_seq_length(ps->concr->sequences) > 0) {
|
||||
// Add epsilon lexical rules to the bottom up index
|
||||
PgfSequence* seq = gu_seq_index(ps->concr->sequences, PgfSequence, 0);
|
||||
if (gu_seq_length(seq->syms) == 0 && seq->idx != NULL) {
|
||||
PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx);
|
||||
entry->idx = seq->idx;
|
||||
entry->offset = state->start_offset;
|
||||
entry->sym_idx= 0;
|
||||
}
|
||||
|
||||
// Add non-epsilon lexical rules to the bottom up index
|
||||
if (!state->needs_bind) {
|
||||
pgf_parsing_lookahead(ps, state,
|
||||
0, gu_seq_length(ps->concr->sequences)-1,
|
||||
1, strlen(ps->sentence)-state->end_offset);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
*pstate = state;
|
||||
|
||||
return state;
|
||||
}
|
||||
|
||||
PGF_INTERNAL_DECL int
|
||||
pgf_symbols_cmp(PgfCohortSpot* spot,
|
||||
PgfSymbols* syms, size_t* sym_idx,
|
||||
bool case_sensitive);
|
||||
|
||||
static bool
|
||||
pgf_parsing_scan_helper(PgfParsing *ps, PgfParseState* state,
|
||||
int i, int j, ptrdiff_t min, ptrdiff_t max)
|
||||
{
|
||||
// This is a variation of a binary search algorithm which
|
||||
// can retrieve all prefixes of a string with minimal
|
||||
// comparisons, i.e. there is no need to lookup every
|
||||
// prefix separately.
|
||||
|
||||
bool found = false;
|
||||
while (i <= j) {
|
||||
int k = (i+j) / 2;
|
||||
PgfSequence* seq = gu_seq_index(ps->concr->sequences, PgfSequence, k);
|
||||
|
||||
PgfCohortSpot start = {0, ps->sentence+state->end_offset};
|
||||
PgfCohortSpot current = start;
|
||||
|
||||
size_t sym_idx = 0;
|
||||
int cmp = pgf_symbols_cmp(¤t, seq->syms, &sym_idx, ps->case_sensitive);
|
||||
if (cmp < 0) {
|
||||
j = k-1;
|
||||
} else if (cmp > 0) {
|
||||
ptrdiff_t len = current.ptr - start.ptr;
|
||||
|
||||
if (min <= len)
|
||||
if (pgf_parsing_scan_helper(ps, state, i, k-1, min, len))
|
||||
found = true;
|
||||
|
||||
if (len+1 <= max)
|
||||
if (pgf_parsing_scan_helper(ps, state, k+1, j, len+1, max))
|
||||
found = true;
|
||||
|
||||
break;
|
||||
} else {
|
||||
ptrdiff_t len = current.ptr - start.ptr;
|
||||
found = true;
|
||||
|
||||
if (min <= len-1)
|
||||
pgf_parsing_scan_helper(ps, state, i, k-1, min, len-1);
|
||||
|
||||
// Here we do bottom-up prediction for all lexical categories.
|
||||
// The epsilon productions will be predicted in top-down
|
||||
// fashion while parsing.
|
||||
if (seq->idx != NULL && len > 0) {
|
||||
// A new state will mark the end of the current match
|
||||
PgfParseState* new_state =
|
||||
pgf_new_parse_state(ps, (size_t) (current.ptr - ps->sentence), BIND_NONE);
|
||||
|
||||
// Bottom-up prediction for lexical rules
|
||||
size_t n_entries = gu_buf_length(seq->idx);
|
||||
for (size_t i = 0; i < n_entries; i++) {
|
||||
PgfProductionIdxEntry* entry =
|
||||
gu_buf_index(seq->idx, PgfProductionIdxEntry, i);
|
||||
|
||||
PgfItemConts* conts =
|
||||
pgf_parsing_get_conts(state,
|
||||
entry->ccat, entry->lin_idx,
|
||||
ps->pool);
|
||||
|
||||
// Create the new category if it doesn't exist yet
|
||||
PgfCCat* tmp_ccat = pgf_parsing_get_completed(new_state, conts);
|
||||
PgfCCat* ccat = tmp_ccat;
|
||||
if (ccat == NULL) {
|
||||
ccat = pgf_parsing_create_completed(ps, new_state, conts, INFINITY);
|
||||
}
|
||||
|
||||
// Add the production
|
||||
if (ccat->prods == NULL || ccat->n_synprods >= gu_seq_length(ccat->prods)) {
|
||||
ccat->prods = gu_realloc_seq(ccat->prods, PgfProduction, ccat->n_synprods+1);
|
||||
}
|
||||
GuVariantInfo i;
|
||||
i.tag = PGF_PRODUCTION_APPLY;
|
||||
i.data = entry->papp;
|
||||
PgfProduction prod = gu_variant_close(i);
|
||||
gu_seq_set(ccat->prods, PgfProduction, ccat->n_synprods++, prod);
|
||||
|
||||
// Update the category's probability to be minimum
|
||||
if (ccat->viterbi_prob > entry->papp->fun->ep->prob)
|
||||
ccat->viterbi_prob = entry->papp->fun->ep->prob;
|
||||
|
||||
#ifdef PGF_PARSER_DEBUG
|
||||
GuPool* tmp_pool = gu_new_pool();
|
||||
GuOut* out = gu_file_out(stderr, tmp_pool);
|
||||
GuExn* err = gu_exn(tmp_pool);
|
||||
if (tmp_ccat == NULL) {
|
||||
gu_printf(out, err, "[");
|
||||
pgf_print_range(state, new_state, out, err);
|
||||
gu_puts("; ", out, err);
|
||||
pgf_print_fid(conts->ccat->fid, out, err);
|
||||
gu_printf(out, err, "; %d; ",
|
||||
conts->lin_idx);
|
||||
pgf_print_fid(ccat->fid, out, err);
|
||||
gu_puts("]\n", out, err);
|
||||
}
|
||||
pgf_print_production(ccat->fid, prod, out, err);
|
||||
gu_pool_free(tmp_pool);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
if (len+1 <= max)
|
||||
pgf_parsing_scan_helper(ps, state, k+1, j, len+1, max);
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return found;
|
||||
}
|
||||
|
||||
static void
|
||||
pgf_parsing_scan(PgfParsing *ps)
|
||||
{
|
||||
size_t len = strlen(ps->sentence);
|
||||
|
||||
PgfParseState* state =
|
||||
pgf_new_parse_state(ps, 0, BIND_SOFT);
|
||||
|
||||
while (state != NULL) {
|
||||
if (state->needs_bind) {
|
||||
// We have encountered two tokens without space in between.
|
||||
// Those can be accepted only if there is a BIND token
|
||||
// in between. We encode this by having one more state
|
||||
// at the same offset. A transition between these two
|
||||
// states is possible only with the BIND token.
|
||||
state =
|
||||
pgf_new_parse_state(ps, state->end_offset, BIND_HARD);
|
||||
}
|
||||
|
||||
if (!pgf_parsing_scan_helper
|
||||
(ps, state,
|
||||
0, gu_seq_length(ps->concr->sequences)-1,
|
||||
1, len-state->end_offset)) {
|
||||
// skip one character and try again
|
||||
GuString s = ps->sentence+state->end_offset;
|
||||
gu_utf8_decode((const uint8_t**) &s);
|
||||
pgf_new_parse_state(ps, ps->sentence-s, BIND_NONE);
|
||||
}
|
||||
|
||||
if (state == ps->before)
|
||||
state = ps->after;
|
||||
else
|
||||
state = state->next;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item)
|
||||
{
|
||||
GuString current = ps->sentence + ps->before->end_offset;
|
||||
size_t pos = 0;
|
||||
{
|
||||
PgfCohortSpot current = {0, ps->sentence + ps->before->end_offset};
|
||||
|
||||
if (ps->prefix != NULL && *current == 0) {
|
||||
if (ps->prefix != NULL && *current.ptr == 0) {
|
||||
if (gu_string_is_prefix(ps->prefix, tok)) {
|
||||
PgfProductionApply* papp = gu_variant_data(item->prod);
|
||||
|
||||
@@ -1131,39 +1203,17 @@ pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item)
|
||||
ps->tp->prob = item->inside_prob + item->conts->outside_prob;
|
||||
}
|
||||
} else {
|
||||
if (!ps->before->needs_bind && cmp_string(¤t, &pos, tok, ps->case_sensitive) == 0) {
|
||||
if (!ps->before->needs_bind && cmp_string(¤t, tok, ps->case_sensitive) == 0) {
|
||||
PgfParseState* state =
|
||||
pgf_new_parse_state(ps, (current - ps->sentence),
|
||||
BIND_NONE,
|
||||
item->inside_prob+item->conts->outside_prob);
|
||||
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
|
||||
pgf_new_parse_state(ps, (current.ptr - ps->sentence),
|
||||
BIND_NONE);
|
||||
pgf_parsing_push_item(state, item);
|
||||
} else {
|
||||
pgf_item_free(ps, item);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
pgf_parsing_predict_lexeme(PgfParsing* ps, PgfItemConts* conts,
|
||||
PgfProductionIdxEntry* entry,
|
||||
size_t offset, size_t sym_idx)
|
||||
{
|
||||
GuVariantInfo i = { PGF_PRODUCTION_APPLY, entry->papp };
|
||||
PgfProduction prod = gu_variant_close(i);
|
||||
PgfItem* item =
|
||||
pgf_new_item(ps, conts, prod);
|
||||
PgfSymbols* syms = entry->papp->fun->lins[conts->lin_idx]->syms;
|
||||
item->sym_idx = sym_idx;
|
||||
pgf_item_set_curr_symbol(item, ps->pool);
|
||||
prob_t prob = item->inside_prob+item->conts->outside_prob;
|
||||
PgfParseState* state =
|
||||
pgf_new_parse_state(ps, offset, BIND_NONE, prob);
|
||||
if (state->viterbi_prob > prob) {
|
||||
state->viterbi_prob = prob;
|
||||
}
|
||||
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
|
||||
}
|
||||
|
||||
static void
|
||||
pgf_parsing_td_predict(PgfParsing* ps,
|
||||
PgfItem* item, PgfCCat* ccat, size_t lin_idx)
|
||||
@@ -1201,44 +1251,46 @@ pgf_parsing_td_predict(PgfParsing* ps,
|
||||
for (size_t i = 0; i < n_prods; i++) {
|
||||
PgfProduction prod =
|
||||
gu_seq_get(ccat->prods, PgfProduction, i);
|
||||
pgf_parsing_production(ps, ps->before, conts, prod);
|
||||
pgf_parsing_push_production(ps, ps->before, conts, prod);
|
||||
}
|
||||
} else {
|
||||
// Top-down prediction for syntactic rules
|
||||
for (size_t i = 0; i < ccat->n_synprods; i++) {
|
||||
PgfProduction prod =
|
||||
gu_seq_get(ccat->prods, PgfProduction, i);
|
||||
pgf_parsing_production(ps, ps->before, conts, prod);
|
||||
pgf_parsing_push_production(ps, ps->before, conts, prod);
|
||||
}
|
||||
|
||||
// Bottom-up prediction for lexical and epsilon rules
|
||||
size_t n_idcs = gu_buf_length(ps->before->lexicon_idx);
|
||||
for (size_t i = 0; i < n_idcs; i++) {
|
||||
PgfLexiconIdxEntry* lentry =
|
||||
gu_buf_index(ps->before->lexicon_idx, PgfLexiconIdxEntry, i);
|
||||
// Top-down prediction for epsilon lexical rules if any
|
||||
PgfSequence* seq = gu_seq_index(ps->concr->sequences, PgfSequence, 0);
|
||||
if (gu_seq_length(seq->syms) == 0 && seq->idx != NULL) {
|
||||
|
||||
PgfProductionIdxEntry key;
|
||||
key.ccat = ccat;
|
||||
key.lin_idx = lin_idx;
|
||||
key.papp = NULL;
|
||||
PgfProductionIdxEntry* value =
|
||||
gu_seq_binsearch(gu_buf_data_seq(lentry->idx),
|
||||
gu_seq_binsearch(gu_buf_data_seq(seq->idx),
|
||||
pgf_production_idx_entry_order,
|
||||
PgfProductionIdxEntry, &key);
|
||||
|
||||
if (value != NULL) {
|
||||
pgf_parsing_predict_lexeme(ps, conts, value, lentry->offset, lentry->sym_idx);
|
||||
GuVariantInfo i = { PGF_PRODUCTION_APPLY, value->papp };
|
||||
PgfProduction prod = gu_variant_close(i);
|
||||
pgf_parsing_push_production(ps, ps->before, conts, prod);
|
||||
|
||||
PgfProductionIdxEntry* start =
|
||||
gu_buf_data(lentry->idx);
|
||||
gu_buf_data(seq->idx);
|
||||
PgfProductionIdxEntry* end =
|
||||
start + gu_buf_length(lentry->idx)-1;
|
||||
start + gu_buf_length(seq->idx)-1;
|
||||
|
||||
PgfProductionIdxEntry* left = value-1;
|
||||
while (left >= start &&
|
||||
value->ccat->fid == left->ccat->fid &&
|
||||
value->lin_idx == left->lin_idx) {
|
||||
pgf_parsing_predict_lexeme(ps, conts, left, lentry->offset, lentry->sym_idx);
|
||||
GuVariantInfo i = { PGF_PRODUCTION_APPLY, left->papp };
|
||||
PgfProduction prod = gu_variant_close(i);
|
||||
pgf_parsing_push_production(ps, ps->before, conts, prod);
|
||||
left--;
|
||||
}
|
||||
|
||||
@@ -1246,31 +1298,32 @@ pgf_parsing_td_predict(PgfParsing* ps,
|
||||
while (right <= end &&
|
||||
value->ccat->fid == right->ccat->fid &&
|
||||
value->lin_idx == right->lin_idx) {
|
||||
pgf_parsing_predict_lexeme(ps, conts, right, lentry->offset, lentry->sym_idx);
|
||||
GuVariantInfo i = { PGF_PRODUCTION_APPLY, right->papp };
|
||||
PgfProduction prod = gu_variant_close(i);
|
||||
pgf_parsing_push_production(ps, ps->before, conts, prod);
|
||||
right++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* If it has already been completed, combine. */
|
||||
}
|
||||
|
||||
/* If the category has already been completed, combine. */
|
||||
PgfCCat* completed =
|
||||
pgf_parsing_get_completed(ps->before, conts);
|
||||
if (completed) {
|
||||
pgf_parsing_combine(ps, ps->before, ps->after, item, completed, lin_idx);
|
||||
}
|
||||
|
||||
PgfParseState* state = ps->after;
|
||||
while (state != NULL) {
|
||||
PgfCCat* completed =
|
||||
pgf_parsing_get_completed(ps->before, conts);
|
||||
pgf_parsing_get_completed(state, conts);
|
||||
if (completed) {
|
||||
pgf_parsing_combine(ps, ps->before, ps->after, item, completed, lin_idx);
|
||||
pgf_parsing_combine(ps, state, state->next, item, completed, lin_idx);
|
||||
}
|
||||
|
||||
PgfParseState* state = ps->after;
|
||||
while (state != NULL) {
|
||||
PgfCCat* completed =
|
||||
pgf_parsing_get_completed(state, conts);
|
||||
if (completed) {
|
||||
pgf_parsing_combine(ps, state, state->next, item, completed, lin_idx);
|
||||
}
|
||||
|
||||
state = state->next;
|
||||
}
|
||||
state = state->next;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1286,7 +1339,7 @@ pgf_parsing_pre(PgfParsing* ps, PgfItem* item, PgfSymbols* syms)
|
||||
} else {
|
||||
item->alt = 0;
|
||||
pgf_item_advance(item, ps->pool);
|
||||
gu_buf_heap_push(ps->before->agenda, pgf_item_prob_order, &item);
|
||||
pgf_parsing_push_item(ps->before, item);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1405,9 +1458,8 @@ pgf_parsing_symbol(PgfParsing* ps, PgfItem* item, PgfSymbol sym)
|
||||
item->curr_sym = pgf_collect_extern_tok(ps,start,offset);
|
||||
item->sym_idx = pgf_item_symbols_length(item);
|
||||
PgfParseState* state =
|
||||
pgf_new_parse_state(ps, offset, BIND_NONE,
|
||||
item->inside_prob+item->conts->outside_prob);
|
||||
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
|
||||
pgf_new_parse_state(ps, offset, BIND_NONE);
|
||||
pgf_parsing_push_item(state, item);
|
||||
match = true;
|
||||
}
|
||||
}
|
||||
@@ -1450,11 +1502,10 @@ pgf_parsing_symbol(PgfParsing* ps, PgfItem* item, PgfSymbol sym)
|
||||
if (ps->before->start_offset == ps->before->end_offset &&
|
||||
ps->before->needs_bind) {
|
||||
PgfParseState* state =
|
||||
pgf_new_parse_state(ps, ps->before->end_offset, BIND_HARD,
|
||||
item->inside_prob+item->conts->outside_prob);
|
||||
pgf_new_parse_state(ps, ps->before->end_offset, BIND_HARD);
|
||||
if (state != NULL) {
|
||||
pgf_item_advance(item, ps->pool);
|
||||
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
|
||||
pgf_parsing_push_item(state, item);
|
||||
} else {
|
||||
pgf_item_free(ps, item);
|
||||
}
|
||||
@@ -1468,11 +1519,10 @@ pgf_parsing_symbol(PgfParsing* ps, PgfItem* item, PgfSymbol sym)
|
||||
if (ps->before->start_offset == ps->before->end_offset) {
|
||||
if (ps->before->needs_bind) {
|
||||
PgfParseState* state =
|
||||
pgf_new_parse_state(ps, ps->before->end_offset, BIND_HARD,
|
||||
item->inside_prob+item->conts->outside_prob);
|
||||
pgf_new_parse_state(ps, ps->before->end_offset, BIND_HARD);
|
||||
if (state != NULL) {
|
||||
pgf_item_advance(item, ps->pool);
|
||||
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
|
||||
pgf_parsing_push_item(state, item);
|
||||
} else {
|
||||
pgf_item_free(ps, item);
|
||||
}
|
||||
@@ -1481,7 +1531,7 @@ pgf_parsing_symbol(PgfParsing* ps, PgfItem* item, PgfSymbol sym)
|
||||
}
|
||||
} else {
|
||||
pgf_item_advance(item, ps->pool);
|
||||
gu_buf_heap_push(ps->before->agenda, pgf_item_prob_order, &item);
|
||||
pgf_parsing_push_item(ps->before, item);
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -1573,7 +1623,7 @@ static void
|
||||
pgf_parsing_set_default_factors(PgfParsing* ps, PgfAbstr* abstr)
|
||||
{
|
||||
PgfFlag* flag;
|
||||
|
||||
|
||||
flag =
|
||||
gu_seq_binsearch(abstr->aflags, pgf_flag_order, PgfFlag, "heuristic_search_factor");
|
||||
if (flag != NULL) {
|
||||
@@ -1803,8 +1853,7 @@ pgf_parsing_init(PgfConcr* concr, PgfCId cat,
|
||||
ps->heuristic_factor = heuristic_factor;
|
||||
}
|
||||
|
||||
PgfParseState* state =
|
||||
pgf_new_parse_state(ps, 0, BIND_SOFT, 0);
|
||||
pgf_parsing_scan(ps);
|
||||
|
||||
int fidString = -1;
|
||||
PgfCCat* start_ccat = gu_new(PgfCCat, ps->pool);
|
||||
@@ -1823,7 +1872,7 @@ pgf_parsing_init(PgfConcr* concr, PgfCId cat,
|
||||
#endif
|
||||
|
||||
PgfItemConts* conts =
|
||||
pgf_parsing_get_conts(state, start_ccat, 0, ps->pool);
|
||||
pgf_parsing_get_conts(ps->before, start_ccat, 0, ps->pool);
|
||||
gu_buf_push(conts->items, PgfItem*, NULL);
|
||||
|
||||
#ifdef PGF_COUNTS_DEBUG
|
||||
@@ -1849,7 +1898,7 @@ pgf_parsing_init(PgfConcr* concr, PgfCId cat,
|
||||
|
||||
PgfItem* item = gu_new(PgfItem, ps->pool);
|
||||
item->args = args;
|
||||
item->inside_prob += ccat->viterbi_prob;
|
||||
item->inside_prob = 0;
|
||||
item->conts = conts;
|
||||
item->prod = prod;
|
||||
item->curr_sym = gu_null_variant;
|
||||
@@ -1866,7 +1915,7 @@ pgf_parsing_init(PgfConcr* concr, PgfCId cat,
|
||||
ps->item_real_count++;
|
||||
#endif
|
||||
|
||||
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
|
||||
gu_buf_heap_push(ps->before->agenda, pgf_item_prob_order, &item);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1884,10 +1933,18 @@ pgf_parsing_proceed(PgfParsing* ps)
|
||||
best_prob = gu_buf_get(ps->expr_queue, PgfExprState*, 0)->ep.prob;
|
||||
}
|
||||
|
||||
prob_t delta_prob = 0;
|
||||
PgfParseState* st = ps->before;
|
||||
PgfParseState* st = ps->before;
|
||||
PgfParseState* last = NULL;
|
||||
prob_t delta_prob = 0;
|
||||
while (st != NULL) {
|
||||
if (gu_buf_length(st->agenda) > 0) {
|
||||
if (last != NULL) {
|
||||
delta_prob +=
|
||||
(last->viterbi_prob-st->viterbi_prob) *
|
||||
ps->heuristic_factor;
|
||||
}
|
||||
last = st;
|
||||
|
||||
PgfItem* item = gu_buf_get(st->agenda, PgfItem*, 0);
|
||||
prob_t item_prob =
|
||||
item->inside_prob+item->conts->outside_prob+delta_prob;
|
||||
@@ -1900,15 +1957,11 @@ pgf_parsing_proceed(PgfParsing* ps)
|
||||
ps->after = ps->before;
|
||||
ps->before = tmp;
|
||||
}
|
||||
|
||||
|
||||
has_progress = true;
|
||||
}
|
||||
}
|
||||
|
||||
prob_t state_delta =
|
||||
(st->viterbi_prob-(st->next ? st->next->viterbi_prob : 0))*
|
||||
ps->heuristic_factor;
|
||||
delta_prob += state_delta;
|
||||
st = st->next;
|
||||
}
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
#include <gu/utf8.h>
|
||||
|
||||
PGF_INTERNAL int
|
||||
cmp_string(GuString* psent, size_t* ppos, GuString tok,
|
||||
cmp_string(PgfCohortSpot* spot, GuString tok,
|
||||
bool case_sensitive)
|
||||
{
|
||||
for (;;) {
|
||||
@@ -11,7 +11,7 @@ cmp_string(GuString* psent, size_t* ppos, GuString tok,
|
||||
if (c2 == 0)
|
||||
return 0;
|
||||
|
||||
const uint8_t* p = (uint8_t*) *psent;
|
||||
const uint8_t* p = (uint8_t*) spot->ptr;
|
||||
GuUCS c1 = gu_utf8_decode(&p);
|
||||
if (c1 == 0)
|
||||
return -1;
|
||||
@@ -22,8 +22,8 @@ cmp_string(GuString* psent, size_t* ppos, GuString tok,
|
||||
if (c1 != c2)
|
||||
return (c1-c2);
|
||||
|
||||
*psent = (GuString) p;
|
||||
(*ppos)++;
|
||||
spot->ptr = (GuString) p;
|
||||
spot->pos++;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -40,7 +40,7 @@ skip_space(GuString* psent, size_t* ppos)
|
||||
}
|
||||
|
||||
PGF_INTERNAL int
|
||||
pgf_symbols_cmp(GuString* psent, size_t* ppos,
|
||||
pgf_symbols_cmp(PgfCohortSpot* spot,
|
||||
PgfSymbols* syms, size_t* sym_idx,
|
||||
bool case_sensitive)
|
||||
{
|
||||
@@ -49,14 +49,14 @@ pgf_symbols_cmp(GuString* psent, size_t* ppos,
|
||||
PgfSymbol sym = gu_seq_get(syms, PgfSymbol, *sym_idx);
|
||||
|
||||
if (*sym_idx > 0) {
|
||||
if (!skip_space(psent,ppos)) {
|
||||
if (**psent == 0)
|
||||
if (!skip_space(&spot->ptr,&spot->pos)) {
|
||||
if (*spot->ptr == 0)
|
||||
return -1;
|
||||
return 1;
|
||||
}
|
||||
|
||||
while (**psent != 0) {
|
||||
if (!skip_space(psent,ppos))
|
||||
while (*spot->ptr != 0) {
|
||||
if (!skip_space(&spot->ptr,&spot->pos))
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -66,16 +66,16 @@ pgf_symbols_cmp(GuString* psent, size_t* ppos,
|
||||
case PGF_SYMBOL_CAT:
|
||||
case PGF_SYMBOL_LIT:
|
||||
case PGF_SYMBOL_VAR: {
|
||||
if (**psent == 0)
|
||||
if (*spot->ptr == 0)
|
||||
return -1;
|
||||
return 1;
|
||||
}
|
||||
case PGF_SYMBOL_KS: {
|
||||
PgfSymbolKS* pks = inf.data;
|
||||
if (**psent == 0)
|
||||
if (*spot->ptr == 0)
|
||||
return -1;
|
||||
|
||||
int cmp = cmp_string(psent,ppos,pks->token, case_sensitive);
|
||||
int cmp = cmp_string(spot,pks->token, case_sensitive);
|
||||
if (cmp != 0)
|
||||
return cmp;
|
||||
break;
|
||||
@@ -131,14 +131,13 @@ pgf_sequence_cmp_fn(GuOrder* order, const void* p1, const void* p2)
|
||||
{
|
||||
PgfSequenceOrder* self = gu_container(order, PgfSequenceOrder, order);
|
||||
|
||||
size_t pos = 0;
|
||||
GuString sent = (GuString) p1;
|
||||
PgfCohortSpot spot = {0, (GuString) p1};
|
||||
|
||||
const PgfSequence* sp2 = p2;
|
||||
|
||||
size_t sym_idx = 0;
|
||||
int res = pgf_symbols_cmp(&sent, &pos, sp2->syms, &sym_idx, self->case_sensitive);
|
||||
if (res == 0 && (*sent != 0 || sym_idx != gu_seq_length(sp2->syms))) {
|
||||
int res = pgf_symbols_cmp(&spot, sp2->syms, &sym_idx, self->case_sensitive);
|
||||
if (res == 0 && (*spot.ptr != 0 || sym_idx != gu_seq_length(sp2->syms))) {
|
||||
res = 1;
|
||||
}
|
||||
|
||||
@@ -210,7 +209,7 @@ pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot,
|
||||
PgfCohortSpot current = *spot;
|
||||
|
||||
size_t sym_idx = 0;
|
||||
int cmp = pgf_symbols_cmp(¤t.ptr, ¤t.pos, seq->syms, &sym_idx, state->case_sensitive);
|
||||
int cmp = pgf_symbols_cmp(¤t, seq->syms, &sym_idx, state->case_sensitive);
|
||||
if (cmp < 0) {
|
||||
j = k-1;
|
||||
} else if (cmp > 0) {
|
||||
@@ -273,6 +272,7 @@ pgf_lookup_cohorts_enum_next(GuEnum* self, void* to, GuPool* pool)
|
||||
1, (state->sentence+state->len)-spot.ptr);
|
||||
|
||||
if (gu_buf_length(state->found) == 0) {
|
||||
// skip one character and try again
|
||||
gu_utf8_decode((const uint8_t**) &spot.ptr);
|
||||
spot.pos++;
|
||||
gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &spot);
|
||||
|
||||
Reference in New Issue
Block a user