1
0
forked from GitHub/gf-core

now word completion from Java works. It could be made better

This commit is contained in:
kr.angelov
2014-04-17 11:00:27 +00:00
parent d2f48f8156
commit d8e5206e7f

View File

@@ -54,7 +54,9 @@ typedef struct {
int max_fid;
PgfParseState *before;
PgfParseState *after;
PgfExprEnum en; // enumeration for the generated trees
PgfToken prefix;
PgfTokenProb* tp;
PgfExprEnum en; // enumeration for the generated trees/tokens
#ifdef PGF_COUNTS_DEBUG
int item_full_count;
int item_real_count;
@@ -1278,7 +1280,7 @@ pgf_new_parse_state(PgfParsing* ps, size_t start_offset, BIND_TYPE bind_type)
}
size_t end_offset = start_offset;
GuString current = ps->sentence + start_offset;
GuString current = ps->sentence + end_offset;
size_t len = strlen(current);
while (skip_space(&current, &len)) {
end_offset++;
@@ -1317,17 +1319,25 @@ pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item)
GuString current = ps->sentence + ps->before->end_offset;
size_t len = strlen(current);
if (!ps->before->needs_bind && cmp_string(&current, &len, tok) == 0) {
PgfParseState* state =
pgf_new_parse_state(ps, (current - ps->sentence), BIND_NONE);
if (state->next == NULL) {
state->viterbi_prob =
item->inside_prob+item->conts->outside_prob;
if (ps->prefix != NULL && ps->sentence[ps->before->end_offset] == 0) {
if (gu_string_is_prefix(ps->prefix, tok)) {
ps->tp = gu_new(PgfTokenProb, ps->out_pool);
ps->tp->tok = tok;
ps->tp->prob = item->inside_prob + item->conts->outside_prob;
}
} else {
if (!ps->before->needs_bind && cmp_string(&current, &len, tok) == 0) {
PgfParseState* state =
pgf_new_parse_state(ps, (current - ps->sentence), BIND_NONE);
if (state->next == NULL) {
state->viterbi_prob =
item->inside_prob+item->conts->outside_prob;
}
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
} else {
pgf_item_free(ps, item);
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
} else {
pgf_item_free(ps, item);
}
}
}
@@ -1361,53 +1371,68 @@ pgf_parsing_td_predict(PgfParsing* ps,
item->inside_prob-conts->ccat->viterbi_prob+
item->conts->outside_prob;
// Top-down prediction for syntactic rules
for (size_t i = 0; i < ccat->n_synprods; i++) {
PgfProduction prod =
gu_seq_get(ccat->prods, PgfProduction, i);
pgf_parsing_production(ps, ps->before, conts, prod);
}
if (ps->prefix != NULL) {
// We do completion:
// - top-down prediction for both syntactic and lexical rules
size_t n_prods;
if (ccat->fid < ps->concr->total_cats) // in grammar
n_prods = gu_seq_length(ccat->prods);
else
n_prods = ccat->n_synprods;
for (size_t i = 0; i < n_prods; i++) {
PgfProduction prod =
gu_seq_get(ccat->prods, PgfProduction, i);
pgf_parsing_production(ps, ps->before, conts, prod);
}
} else {
// Top-down prediction for syntactic rules
for (size_t i = 0; i < ccat->n_synprods; i++) {
PgfProduction prod =
gu_seq_get(ccat->prods, PgfProduction, i);
pgf_parsing_production(ps, ps->before, conts, prod);
}
// Bottom-up prediction for lexical and epsilon rules
size_t n_idcs = gu_buf_length(ps->before->lexicon_idx);
for (size_t i = 0; i < n_idcs; i++) {
PgfLexiconIdxEntry* lentry =
gu_buf_index(ps->before->lexicon_idx, PgfLexiconIdxEntry, i);
PgfParseState* state =
pgf_new_parse_state(ps, lentry->offset, lentry->bind_type);
// Bottom-up prediction for lexical and epsilon rules
size_t n_idcs = gu_buf_length(ps->before->lexicon_idx);
for (size_t i = 0; i < n_idcs; i++) {
PgfLexiconIdxEntry* lentry =
gu_buf_index(ps->before->lexicon_idx, PgfLexiconIdxEntry, i);
PgfParseState* state =
pgf_new_parse_state(ps, lentry->offset, lentry->bind_type);
if (state != NULL) {
PgfProductionIdxEntry key;
key.ccat = ccat;
key.lin_idx = lin_idx;
key.papp = NULL;
PgfProductionIdxEntry* value =
gu_seq_binsearch(gu_buf_data_seq(lentry->idx),
pgf_production_idx_entry_order,
PgfProductionIdxEntry, &key);
if (state != NULL) {
PgfProductionIdxEntry key;
key.ccat = ccat;
key.lin_idx = lin_idx;
key.papp = NULL;
PgfProductionIdxEntry* value =
gu_seq_binsearch(gu_buf_data_seq(lentry->idx),
pgf_production_idx_entry_order,
PgfProductionIdxEntry, &key);
if (value != NULL) {
pgf_parsing_predict_lexeme(ps, state, conts, value);
if (value != NULL) {
pgf_parsing_predict_lexeme(ps, state, conts, value);
PgfProductionIdxEntry* start =
gu_buf_data(lentry->idx);
PgfProductionIdxEntry* end =
start + gu_buf_length(lentry->idx)-1;
PgfProductionIdxEntry* start =
gu_buf_data(lentry->idx);
PgfProductionIdxEntry* end =
start + gu_buf_length(lentry->idx)-1;
PgfProductionIdxEntry* left = value-1;
while (left >= start &&
value->ccat->fid == left->ccat->fid &&
value->lin_idx == left->lin_idx) {
pgf_parsing_predict_lexeme(ps, state, conts, left);
left--;
}
PgfProductionIdxEntry* left = value-1;
while (left >= start &&
value->ccat->fid == left->ccat->fid &&
value->lin_idx == left->lin_idx) {
pgf_parsing_predict_lexeme(ps, state, conts, left);
left--;
}
PgfProductionIdxEntry* right = value+1;
while (right <= end &&
value->ccat->fid == right->ccat->fid &&
value->lin_idx == right->lin_idx) {
pgf_parsing_predict_lexeme(ps, state, conts, right);
right++;
PgfProductionIdxEntry* right = value+1;
while (right <= end &&
value->ccat->fid == right->ccat->fid &&
value->lin_idx == right->lin_idx) {
pgf_parsing_predict_lexeme(ps, state, conts, right);
right++;
}
}
}
}
@@ -1871,6 +1896,8 @@ pgf_new_parsing(PgfConcr* concr, GuString sentence,
ps->ccat_full_count = 0;
ps->prod_full_count = 0;
#endif
ps->prefix = NULL;
ps->tp = NULL;
ps->free_item = NULL;
ps->heuristic_factor = 0;
ps->meta_prob = INFINITY;
@@ -1906,157 +1933,6 @@ void pgf_parsing_print_counts(PgfParsing* ps)
}
#endif
/*static bool
*************
typedef struct {
PgfTokenState ts;
PgfToken tok;
PgfProductionIdx *lexicon_idx;
} PgfRealTokenState;
static bool
pgf_real_match_token(PgfTokenState* ts, PgfToken tok, PgfItem* item)
{
return strcmp(gu_container(ts, PgfRealTokenState, ts)->tok, tok) == 0;
}
static PgfToken
pgf_real_get_token(PgfTokenState* ts) {
return gu_container(ts, PgfRealTokenState, ts)->tok;
}
static PgfProductionIdx*
pgf_real_get_lexicon_idx(PgfTokenState* ts) {
return gu_container(ts, PgfRealTokenState, ts)->lexicon_idx;
}
static PgfTokenFn pgf_tsfn_PgfRealTokenState = {
pgf_real_match_token,
pgf_real_get_token,
pgf_real_get_lexicon_idx
};
PgfParseState*
pgf_parser_next_state(PgfParseState* prev, PgfToken tok)
{
#ifdef PGF_COUNTS_DEBUG
pgf_parsing_print_counts(prev->ps);
#endif
PgfRealTokenState* ts =
pgf_new_token_state(PgfRealTokenState, prev->ps->pool);
ts->tok = tok;
ts->lexicon_idx = gu_map_get(prev->ps->concr->leftcorner_tok_idx,
tok, PgfProductionIdx*);
if (ts->lexicon_idx != NULL) {
PgfLexiconFn clo = { { pgf_parser_compute_lexicon_prob }, &ts->ts };
gu_map_iter(ts->lexicon_idx, &clo.fn, NULL);
}
if (ts->ts.lexical_prob == INFINITY)
ts->ts.lexical_prob = 0;
PgfParseState* state =
pgf_new_parse_state(prev->ps, prev, &ts->ts, prev->ps->pool);
while (gu_buf_length(state->agenda) == 0) {
if (!pgf_parsing_proceed(state))
return NULL;
}
return state;
}
typedef struct {
PgfTokenState ts;
GuEnum en;
GuString prefix;
PgfTokenProb* tp;
GuPool* pool;
PgfParseState* state;
} PgfPrefixTokenState;
static bool
^ ^ ^ ^ ^ ^ ^
pgf_prefix_match_token(PgfTokenState* ts0, PgfToken tok, PgfItem* item)
{
PgfPrefixTokenState* ts =
gu_container(ts0, PgfPrefixTokenState, ts);
if (gu_string_is_prefix(ts->prefix, tok)) {
size_t lin_idx;
PgfSequence* seq;
pgf_item_sequence(item, &lin_idx, &seq, ts->pool);
uint16_t seq_idx = item->seq_idx;
uint8_t tok_idx = item->tok_idx;
// go one token back
if (tok_idx > 0)
tok_idx--;
else
seq_idx--;
ts->tp = gu_new(PgfTokenProb, ts->pool);
ts->tp->tok =
pgf_get_tokens(seq, seq_idx, tok_idx, ts->pool);
ts->tp->cat = item->conts->ccat->cnccat->abscat->name;
ts->tp->prob = item->inside_prob+item->conts->outside_prob;
}
return false;
}
static PgfToken
pgf_prefix_get_token(PgfTokenState* ts) {
return "";
}
static PgfProductionIdx*
pgf_prefix_get_lexicon_idx(PgfTokenState* ts) {
return NULL;
}
static PgfTokenFn pgf_tsfn_PgfPrefixTokenState = {
pgf_prefix_match_token,
pgf_prefix_get_token,
pgf_prefix_get_lexicon_idx
};
static void
pgf_parser_completions_next(GuEnum* self, void* to, GuPool* pool)
{
PgfPrefixTokenState* ts =
gu_container(self, PgfPrefixTokenState, en);
ts->tp = NULL;
ts->pool = pool;
while (ts->tp == NULL) {
if (!pgf_parsing_proceed(ts->state))
break;
}
*((PgfTokenProb**)to) = ts->tp;
}*/
GuEnum*
pgf_parsing_completions(PgfParsing* ps, GuString prefix)
{
#ifdef PGF_COUNTS_DEBUG
pgf_parsing_print_counts(ps);
#endif
/* PgfPrefixTokenState* ts =
pgf_new_token_state(PgfPrefixTokenState, prev->ps->pool);
ts->en.next = pgf_parser_completions_next;
ts->prefix = prefix;
ts->tp = NULL;
ts->state =
pgf_new_parse_state(prev->ps, prev, &ts->ts);
return &ts->en;*/
return NULL;
}
static int
cmp_expr_state(GuOrder* self, const void* a, const void* b)
{
@@ -2501,35 +2377,68 @@ pgf_parse_with_heuristics(PgfConcr* concr, PgfCId cat, GuString sentence,
return &ps->en;
}
static void
pgf_parser_completions_next(GuEnum* self, void* to, GuPool* pool)
{
PgfParsing* ps =
gu_container(self, PgfParsing, en);
ps->tp = NULL;
while (ps->tp == NULL) {
if (!pgf_parsing_proceed(ps))
break;
#ifdef PGF_COUNTS_DEBUG
pgf_parsing_print_counts(ps);
#endif
}
*((PgfTokenProb**)to) = ps->tp;
}
GuEnum*
pgf_complete(PgfConcr* concr, PgfCId cat, GuString sentence,
GuString prefix, GuExn *err, GuPool* pool)
{
// Begin parsing a sentence of the specified category
if (concr->sequences == NULL ||
concr->pre_sequences == NULL ||
concr->cnccats == NULL) {
GuExnData* err_data = gu_raise(err, PgfExn);
if (err_data) {
err_data->data = "The concrete syntax is not loaded";
return NULL;
}
}
// Begin parsing a sentence with the specified category
PgfParsing* ps =
pgf_parsing_init(concr, cat, 0, sentence, -1, err, pool, pool);
pgf_parsing_init(concr, cat, 0, sentence, -1.0, err, pool, pool);
if (ps == NULL) {
return NULL;
}
// Tokenization
GuExn* lex_err = gu_new_exn(NULL, gu_kind(type), pool);
/* PgfToken tok = pgf_lexer_read_token(lexer, lex_err);
while (!gu_exn_is_raised(lex_err)) {
// feed the token to get a new parse state
state = pgf_parser_next_state(state, tok);
if (state == NULL) {
#ifdef PGF_COUNTS_DEBUG
pgf_parsing_print_counts(ps);
#endif
size_t len = strlen(ps->sentence);
while (ps->before->end_offset < len) {
if (!pgf_parsing_proceed(ps)) {
GuExnData* exn = gu_raise(err, PgfParseError);
exn->data = (void*) pgf_parsing_last_token(ps, exn->pool);
return NULL;
}
tok = pgf_lexer_read_token(lexer, lex_err);
}*/
#ifdef PGF_COUNTS_DEBUG
pgf_parsing_print_counts(ps);
#endif
}
if (gu_exn_caught(lex_err) != gu_type(GuEOF))
return NULL;
// Now begin enumerating the resulting syntax trees
return pgf_parsing_completions(ps, prefix);
// Now begin enumerating the completions
ps->en.next = pgf_parser_completions_next;
ps->prefix = prefix;
ps->tp = NULL;
return &ps->en;
}
static void