A new, nice and elegant algorithm for dictionary lookup in the parser

This commit is contained in:
krasimir
2015-05-08 14:38:24 +00:00
parent 365c7bb1d8
commit ea0f74d455

View File

@@ -472,36 +472,28 @@ pgf_print_expr_state0(PgfExprState* st,
#endif #endif
static int static int
cmp_string(GuString* psent, size_t* plen, GuString tok) cmp_string(GuString* psent, GuString tok)
{ {
GuString sent = *psent; for (;;) {
size_t len = *plen; uint8_t c2 = *tok;
if (c2 == 0)
return 0;
while (*tok != 0) { uint8_t c1 = **psent;
if (len == 0) if (c1 == 0)
return -1; return -1;
if (((uint8_t) *sent) > ((uint8_t) *tok)) if (c1 != c2)
return 1; return (c1-c2);
else if (((uint8_t) *sent) < ((uint8_t) *tok))
return -2;
tok++;
sent++;
len--;
}
*psent = sent; tok++;
*plen = len; (*psent)++;
return 0; }
} }
static bool static bool
skip_space(GuString* psent, size_t* plen) skip_space(GuString* psent)
{ {
if (*plen == 0)
return false;
const uint8_t* p = (uint8_t*) *psent; const uint8_t* p = (uint8_t*) *psent;
if (!gu_ucs_is_space(gu_utf8_decode(&p))) if (!gu_ucs_is_space(gu_utf8_decode(&p)))
return false; return false;
@@ -1023,10 +1015,8 @@ pgf_parsing_complete(PgfParsing* ps, PgfItem* item, PgfExprProb *ep)
} }
static int static int
pgf_symbols_cmp(GuString* psent, size_t sent_len, BIND_TYPE* pbind, PgfSymbols* syms) pgf_symbols_cmp(GuString* psent, BIND_TYPE* pbind, PgfSymbols* syms)
{ {
GuString sent = *psent;
size_t n_syms = gu_seq_length(syms); size_t n_syms = gu_seq_length(syms);
for (size_t i = 0; i < n_syms; i++) { for (size_t i = 0; i < n_syms; i++) {
PgfSymbol sym = gu_seq_get(syms, PgfSymbol, i); PgfSymbol sym = gu_seq_get(syms, PgfSymbol, i);
@@ -1036,34 +1026,34 @@ pgf_symbols_cmp(GuString* psent, size_t sent_len, BIND_TYPE* pbind, PgfSymbols*
case PGF_SYMBOL_CAT: case PGF_SYMBOL_CAT:
case PGF_SYMBOL_LIT: case PGF_SYMBOL_LIT:
case PGF_SYMBOL_VAR: { case PGF_SYMBOL_VAR: {
if (sent_len == 0) if (**psent == 0)
return -1; return -1;
return 1; return 1;
} }
case PGF_SYMBOL_KS: { case PGF_SYMBOL_KS: {
PgfSymbolKS* pks = inf.data; PgfSymbolKS* pks = inf.data;
if (sent_len == 0) if (**psent == 0)
return -1; return -1;
if (*pbind == BIND_HARD) if (*pbind == BIND_HARD)
*pbind = BIND_NONE; *pbind = BIND_NONE;
else { else {
if (*pbind != BIND_SOFT && !skip_space(&sent, &sent_len)) if (*pbind != BIND_SOFT && !skip_space(psent))
return 1; return 1;
while (*sent != 0) { while (**psent != 0) {
if (!skip_space(&sent, &sent_len)) if (!skip_space(psent))
break; break;
} }
} }
int cmp = cmp_string(&sent, &sent_len, pks->token); int cmp = cmp_string(psent, pks->token);
if (cmp != 0) if (cmp != 0)
return cmp; return cmp;
break; break;
} }
case PGF_SYMBOL_KP: { case PGF_SYMBOL_KP: {
return -2; return -1;
} }
case PGF_SYMBOL_BIND: { case PGF_SYMBOL_BIND: {
*pbind = BIND_HARD; *pbind = BIND_HARD;
@@ -1077,88 +1067,81 @@ pgf_symbols_cmp(GuString* psent, size_t sent_len, BIND_TYPE* pbind, PgfSymbols*
break; break;
} }
case PGF_SYMBOL_NE: { case PGF_SYMBOL_NE: {
return -2; return -1;
} }
default: default:
gu_impossible(); gu_impossible();
} }
} }
*psent = sent;
return 0; return 0;
} }
static void static void
pgf_parsing_lookahead(PgfParsing *ps, PgfParseState* state) pgf_parsing_lookahead(PgfParsing *ps, PgfParseState* state,
size_t i, size_t j, ptrdiff_t min, ptrdiff_t max)
{ {
PgfSequence* epsilon_seq = // This is a variation of a binary search algorithm which
gu_seq_index(ps->concr->sequences, PgfSequence, 0); // can retrieve all prefixes of a string with minimal
if (gu_seq_length(epsilon_seq->syms) == 0 && // comparisons, i.e. there is no need to lookup every
epsilon_seq->idx != NULL) { // prefix separately.
// Since the sequences are sorted, the epsilon sequence will
// always be the first if there is any at all. We should
// always add the epsilon in the index, because we do
// bottom up prediction for epsilons.
PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx);
entry->idx = epsilon_seq->idx;
entry->bind_type = BIND_NONE;
entry->offset = state->start_offset;
}
size_t i = 0; while (i <= j) {
size_t j = gu_seq_length(ps->concr->sequences)-1; size_t k = (i+j) / 2;
size_t s = j; PgfSequence* seq = gu_seq_index(ps->concr->sequences, PgfSequence, k);
size_t n = 1;
size_t sent_len = strlen(ps->sentence);
while (state->end_offset + n <= sent_len) { GuString start = ps->sentence + state->end_offset;
while (i <= j) { GuString current = start;
size_t k = (i+j) / 2; BIND_TYPE bind_type = state->needs_bind ? BIND_NONE : BIND_HARD;
PgfSequence* seq = gu_seq_index(ps->concr->sequences, PgfSequence, k); int cmp = pgf_symbols_cmp(&current, &bind_type, seq->syms);
if (cmp < 0) {
GuString current = ps->sentence + state->end_offset; j = k-1;
BIND_TYPE bind_type = state->needs_bind ? BIND_NONE : BIND_HARD; } else if (cmp > 0) {
switch (pgf_symbols_cmp(&current, n, &bind_type, seq->syms)) { ptrdiff_t len = current - start;
case -2:
j = k-1;
s = j;
break;
case -1:
j = k-1;
break;
case 0: {
if (seq->idx != NULL) {
PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx);
entry->idx = seq->idx;
entry->bind_type = bind_type;
entry->offset = (current - ps->sentence);
}
i = k+1;
goto next;
}
case 1:
i = k+1;
break;
}
}
next:; if (min <= len)
size_t n_pres = gu_buf_length(ps->concr->pre_sequences); pgf_parsing_lookahead(ps, state, i, k-1, min, len);
for (size_t pi = 0; pi < n_pres; pi++) {
PgfSequence* seq = gu_buf_index(ps->concr->pre_sequences, PgfSequence, pi);
GuString current = ps->sentence + state->end_offset; if (len+1 <= max)
BIND_TYPE bind_type = state->needs_bind ? BIND_NONE : BIND_HARD; pgf_parsing_lookahead(ps, state, k+1, j, len+1, max);
if (pgf_symbols_cmp(&current, n, &bind_type, seq->syms) == 0) {
break;
} else {
ptrdiff_t len = current - start;
if (min <= len-1)
pgf_parsing_lookahead(ps, state, i, k-1, min, len-1);
if (seq->idx != NULL) {
PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx); PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx);
entry->idx = seq->idx; entry->idx = seq->idx;
entry->bind_type = bind_type; entry->bind_type = bind_type;
entry->offset = (current - ps->sentence); entry->offset = (current - ps->sentence);
} }
}
j = s; if (len+1 <= max)
n++; pgf_parsing_lookahead(ps, state, k+1, j, len+1, max);
break;
}
}
}
static void
pgf_parsing_lookahead_pre(PgfParsing *ps, PgfParseState* state)
{
size_t n_pres = gu_buf_length(ps->concr->pre_sequences);
for (size_t pi = 0; pi < n_pres; pi++) {
PgfSequence* seq = gu_buf_index(ps->concr->pre_sequences, PgfSequence, pi);
GuString current = ps->sentence + state->end_offset;
BIND_TYPE bind_type = state->needs_bind ? BIND_NONE : BIND_HARD;
if (pgf_symbols_cmp(&current, &bind_type, seq->syms) == 0) {
PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx);
entry->idx = seq->idx;
entry->bind_type = bind_type;
entry->offset = (current - ps->sentence);
}
} }
} }
@@ -1200,8 +1183,7 @@ pgf_new_parse_state(PgfParsing* ps, size_t start_offset,
size_t end_offset = start_offset; size_t end_offset = start_offset;
GuString current = ps->sentence + end_offset; GuString current = ps->sentence + end_offset;
size_t len = strlen(current); while (skip_space(&current)) {
while (skip_space(&current, &len)) {
end_offset++; end_offset++;
} }
@@ -1224,7 +1206,10 @@ pgf_new_parse_state(PgfParsing* ps, size_t start_offset,
if (ps->before == NULL && start_offset == 0) if (ps->before == NULL && start_offset == 0)
state->needs_bind = false; state->needs_bind = false;
pgf_parsing_lookahead(ps, state); pgf_parsing_lookahead(ps, state,
0, gu_seq_length(ps->concr->sequences)-1,
0, strlen(ps->sentence)-state->end_offset);
pgf_parsing_lookahead_pre(ps, state);
*pstate = state; *pstate = state;
@@ -1235,18 +1220,17 @@ static void
pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item) pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item)
{ {
GuString current = ps->sentence + ps->before->end_offset; GuString current = ps->sentence + ps->before->end_offset;
size_t len = strlen(current);
if (ps->prefix != NULL && ps->sentence[ps->before->end_offset] == 0) { if (ps->prefix != NULL && *current == 0) {
if (gu_string_is_prefix(ps->prefix, tok)) { if (gu_string_is_prefix(ps->prefix, tok)) {
ps->tp = gu_new(PgfTokenProb, ps->out_pool); ps->tp = gu_new(PgfTokenProb, ps->out_pool);
ps->tp->tok = tok; ps->tp->tok = tok;
ps->tp->prob = item->inside_prob + item->conts->outside_prob; ps->tp->prob = item->inside_prob + item->conts->outside_prob;
} }
} else { } else {
if (!ps->before->needs_bind && cmp_string(&current, &len, tok) == 0) { if (!ps->before->needs_bind && cmp_string(&current, tok) == 0) {
PgfParseState* state = PgfParseState* state =
pgf_new_parse_state(ps, (current - ps->sentence), pgf_new_parse_state(ps, (current - ps->sentence),
BIND_NONE, BIND_NONE,
item->inside_prob+item->conts->outside_prob); item->inside_prob+item->conts->outside_prob);
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item); gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
@@ -2224,7 +2208,7 @@ pgf_sequence_cmp_fn(GuOrder* self, const void* p1, const void* p2)
const PgfSequence* sp2 = p2; const PgfSequence* sp2 = p2;
BIND_TYPE bind = BIND_HARD; BIND_TYPE bind = BIND_HARD;
int res = pgf_symbols_cmp(&sent, strlen(sent), &bind, sp2->syms); int res = pgf_symbols_cmp(&sent, &bind, sp2->syms);
if (res == 0 && *sent != 0) { if (res == 0 && *sent != 0) {
res = 1; res = 1;
} }
@@ -2275,19 +2259,18 @@ gu_fullform_enum_next(GuEnum* self, void* to, GuPool* pool)
if (st->sequences != NULL) { if (st->sequences != NULL) {
size_t n_seqs = gu_seq_length(st->sequences); size_t n_seqs = gu_seq_length(st->sequences);
while (st->seq_idx < n_seqs) { while (st->seq_idx < n_seqs) {
PgfSymbols* syms = gu_seq_index(st->sequences, PgfSequence, st->seq_idx)->syms; PgfSequence* seq = gu_seq_index(st->sequences, PgfSequence, st->seq_idx);
GuString tokens = pgf_get_tokens(syms, 0, pool); GuString tokens = pgf_get_tokens(seq->syms, 0, pool);
if (!gu_string_is_prefix(st->prefix, tokens)) { if (gu_string_is_prefix(st->prefix, tokens) != 0) {
st->seq_idx = n_seqs; st->seq_idx = n_seqs;
break; break;
} }
if (strlen(tokens) > 0 && if (*tokens != 0 && seq->idx != NULL) {
gu_seq_index(st->sequences, PgfSequence, st->seq_idx)->idx != NULL) {
entry = gu_new(PgfFullFormEntry, pool); entry = gu_new(PgfFullFormEntry, pool);
entry->tokens = tokens; entry->tokens = tokens;
entry->idx = gu_seq_index(st->sequences, PgfSequence, st->seq_idx)->idx; entry->idx = seq->idx;
st->seq_idx++; st->seq_idx++;
break; break;