mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-23 19:42:50 -06:00
a new nice and elegant algorithm for dictionary lookup in the parser
This commit is contained in:
@@ -472,36 +472,28 @@ pgf_print_expr_state0(PgfExprState* st,
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
static int
|
static int
|
||||||
cmp_string(GuString* psent, size_t* plen, GuString tok)
|
cmp_string(GuString* psent, GuString tok)
|
||||||
{
|
{
|
||||||
GuString sent = *psent;
|
for (;;) {
|
||||||
size_t len = *plen;
|
uint8_t c2 = *tok;
|
||||||
|
if (c2 == 0)
|
||||||
|
return 0;
|
||||||
|
|
||||||
while (*tok != 0) {
|
uint8_t c1 = **psent;
|
||||||
if (len == 0)
|
if (c1 == 0)
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
if (((uint8_t) *sent) > ((uint8_t) *tok))
|
if (c1 != c2)
|
||||||
return 1;
|
return (c1-c2);
|
||||||
else if (((uint8_t) *sent) < ((uint8_t) *tok))
|
|
||||||
return -2;
|
|
||||||
|
|
||||||
tok++;
|
|
||||||
sent++;
|
|
||||||
len--;
|
|
||||||
}
|
|
||||||
|
|
||||||
*psent = sent;
|
tok++;
|
||||||
*plen = len;
|
(*psent)++;
|
||||||
return 0;
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool
|
static bool
|
||||||
skip_space(GuString* psent, size_t* plen)
|
skip_space(GuString* psent)
|
||||||
{
|
{
|
||||||
if (*plen == 0)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
const uint8_t* p = (uint8_t*) *psent;
|
const uint8_t* p = (uint8_t*) *psent;
|
||||||
if (!gu_ucs_is_space(gu_utf8_decode(&p)))
|
if (!gu_ucs_is_space(gu_utf8_decode(&p)))
|
||||||
return false;
|
return false;
|
||||||
@@ -1023,10 +1015,8 @@ pgf_parsing_complete(PgfParsing* ps, PgfItem* item, PgfExprProb *ep)
|
|||||||
}
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
pgf_symbols_cmp(GuString* psent, size_t sent_len, BIND_TYPE* pbind, PgfSymbols* syms)
|
pgf_symbols_cmp(GuString* psent, BIND_TYPE* pbind, PgfSymbols* syms)
|
||||||
{
|
{
|
||||||
GuString sent = *psent;
|
|
||||||
|
|
||||||
size_t n_syms = gu_seq_length(syms);
|
size_t n_syms = gu_seq_length(syms);
|
||||||
for (size_t i = 0; i < n_syms; i++) {
|
for (size_t i = 0; i < n_syms; i++) {
|
||||||
PgfSymbol sym = gu_seq_get(syms, PgfSymbol, i);
|
PgfSymbol sym = gu_seq_get(syms, PgfSymbol, i);
|
||||||
@@ -1036,34 +1026,34 @@ pgf_symbols_cmp(GuString* psent, size_t sent_len, BIND_TYPE* pbind, PgfSymbols*
|
|||||||
case PGF_SYMBOL_CAT:
|
case PGF_SYMBOL_CAT:
|
||||||
case PGF_SYMBOL_LIT:
|
case PGF_SYMBOL_LIT:
|
||||||
case PGF_SYMBOL_VAR: {
|
case PGF_SYMBOL_VAR: {
|
||||||
if (sent_len == 0)
|
if (**psent == 0)
|
||||||
return -1;
|
return -1;
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
case PGF_SYMBOL_KS: {
|
case PGF_SYMBOL_KS: {
|
||||||
PgfSymbolKS* pks = inf.data;
|
PgfSymbolKS* pks = inf.data;
|
||||||
if (sent_len == 0)
|
if (**psent == 0)
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
if (*pbind == BIND_HARD)
|
if (*pbind == BIND_HARD)
|
||||||
*pbind = BIND_NONE;
|
*pbind = BIND_NONE;
|
||||||
else {
|
else {
|
||||||
if (*pbind != BIND_SOFT && !skip_space(&sent, &sent_len))
|
if (*pbind != BIND_SOFT && !skip_space(psent))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
while (*sent != 0) {
|
while (**psent != 0) {
|
||||||
if (!skip_space(&sent, &sent_len))
|
if (!skip_space(psent))
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int cmp = cmp_string(&sent, &sent_len, pks->token);
|
int cmp = cmp_string(psent, pks->token);
|
||||||
if (cmp != 0)
|
if (cmp != 0)
|
||||||
return cmp;
|
return cmp;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case PGF_SYMBOL_KP: {
|
case PGF_SYMBOL_KP: {
|
||||||
return -2;
|
return -1;
|
||||||
}
|
}
|
||||||
case PGF_SYMBOL_BIND: {
|
case PGF_SYMBOL_BIND: {
|
||||||
*pbind = BIND_HARD;
|
*pbind = BIND_HARD;
|
||||||
@@ -1077,88 +1067,81 @@ pgf_symbols_cmp(GuString* psent, size_t sent_len, BIND_TYPE* pbind, PgfSymbols*
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case PGF_SYMBOL_NE: {
|
case PGF_SYMBOL_NE: {
|
||||||
return -2;
|
return -1;
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
gu_impossible();
|
gu_impossible();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
*psent = sent;
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
pgf_parsing_lookahead(PgfParsing *ps, PgfParseState* state)
|
pgf_parsing_lookahead(PgfParsing *ps, PgfParseState* state,
|
||||||
|
size_t i, size_t j, ptrdiff_t min, ptrdiff_t max)
|
||||||
{
|
{
|
||||||
PgfSequence* epsilon_seq =
|
// This is a variation of a binary search algorithm which
|
||||||
gu_seq_index(ps->concr->sequences, PgfSequence, 0);
|
// can retrieve all prefixes of a string with minimal
|
||||||
if (gu_seq_length(epsilon_seq->syms) == 0 &&
|
// comparisons, i.e. there is no need to lookup every
|
||||||
epsilon_seq->idx != NULL) {
|
// prefix separately.
|
||||||
// Since the sequences are sorted, the epsilon sequence will
|
|
||||||
// always be the first if there is any at all. We should
|
|
||||||
// always add the epsilon in the index, because we do
|
|
||||||
// bottom up prediction for epsilons.
|
|
||||||
PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx);
|
|
||||||
entry->idx = epsilon_seq->idx;
|
|
||||||
entry->bind_type = BIND_NONE;
|
|
||||||
entry->offset = state->start_offset;
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t i = 0;
|
while (i <= j) {
|
||||||
size_t j = gu_seq_length(ps->concr->sequences)-1;
|
size_t k = (i+j) / 2;
|
||||||
size_t s = j;
|
PgfSequence* seq = gu_seq_index(ps->concr->sequences, PgfSequence, k);
|
||||||
size_t n = 1;
|
|
||||||
size_t sent_len = strlen(ps->sentence);
|
|
||||||
|
|
||||||
while (state->end_offset + n <= sent_len) {
|
GuString start = ps->sentence + state->end_offset;
|
||||||
while (i <= j) {
|
GuString current = start;
|
||||||
size_t k = (i+j) / 2;
|
BIND_TYPE bind_type = state->needs_bind ? BIND_NONE : BIND_HARD;
|
||||||
PgfSequence* seq = gu_seq_index(ps->concr->sequences, PgfSequence, k);
|
int cmp = pgf_symbols_cmp(&current, &bind_type, seq->syms);
|
||||||
|
if (cmp < 0) {
|
||||||
GuString current = ps->sentence + state->end_offset;
|
j = k-1;
|
||||||
BIND_TYPE bind_type = state->needs_bind ? BIND_NONE : BIND_HARD;
|
} else if (cmp > 0) {
|
||||||
switch (pgf_symbols_cmp(&current, n, &bind_type, seq->syms)) {
|
ptrdiff_t len = current - start;
|
||||||
case -2:
|
|
||||||
j = k-1;
|
|
||||||
s = j;
|
|
||||||
break;
|
|
||||||
case -1:
|
|
||||||
j = k-1;
|
|
||||||
break;
|
|
||||||
case 0: {
|
|
||||||
if (seq->idx != NULL) {
|
|
||||||
PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx);
|
|
||||||
entry->idx = seq->idx;
|
|
||||||
entry->bind_type = bind_type;
|
|
||||||
entry->offset = (current - ps->sentence);
|
|
||||||
}
|
|
||||||
i = k+1;
|
|
||||||
goto next;
|
|
||||||
}
|
|
||||||
case 1:
|
|
||||||
i = k+1;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
next:;
|
if (min <= len)
|
||||||
size_t n_pres = gu_buf_length(ps->concr->pre_sequences);
|
pgf_parsing_lookahead(ps, state, i, k-1, min, len);
|
||||||
for (size_t pi = 0; pi < n_pres; pi++) {
|
|
||||||
PgfSequence* seq = gu_buf_index(ps->concr->pre_sequences, PgfSequence, pi);
|
|
||||||
|
|
||||||
GuString current = ps->sentence + state->end_offset;
|
if (len+1 <= max)
|
||||||
BIND_TYPE bind_type = state->needs_bind ? BIND_NONE : BIND_HARD;
|
pgf_parsing_lookahead(ps, state, k+1, j, len+1, max);
|
||||||
if (pgf_symbols_cmp(&current, n, &bind_type, seq->syms) == 0) {
|
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
ptrdiff_t len = current - start;
|
||||||
|
|
||||||
|
if (min <= len-1)
|
||||||
|
pgf_parsing_lookahead(ps, state, i, k-1, min, len-1);
|
||||||
|
|
||||||
|
if (seq->idx != NULL) {
|
||||||
PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx);
|
PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx);
|
||||||
entry->idx = seq->idx;
|
entry->idx = seq->idx;
|
||||||
entry->bind_type = bind_type;
|
entry->bind_type = bind_type;
|
||||||
entry->offset = (current - ps->sentence);
|
entry->offset = (current - ps->sentence);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
j = s;
|
if (len+1 <= max)
|
||||||
n++;
|
pgf_parsing_lookahead(ps, state, k+1, j, len+1, max);
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
pgf_parsing_lookahead_pre(PgfParsing *ps, PgfParseState* state)
|
||||||
|
{
|
||||||
|
size_t n_pres = gu_buf_length(ps->concr->pre_sequences);
|
||||||
|
for (size_t pi = 0; pi < n_pres; pi++) {
|
||||||
|
PgfSequence* seq = gu_buf_index(ps->concr->pre_sequences, PgfSequence, pi);
|
||||||
|
|
||||||
|
GuString current = ps->sentence + state->end_offset;
|
||||||
|
BIND_TYPE bind_type = state->needs_bind ? BIND_NONE : BIND_HARD;
|
||||||
|
if (pgf_symbols_cmp(&current, &bind_type, seq->syms) == 0) {
|
||||||
|
PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx);
|
||||||
|
entry->idx = seq->idx;
|
||||||
|
entry->bind_type = bind_type;
|
||||||
|
entry->offset = (current - ps->sentence);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1200,8 +1183,7 @@ pgf_new_parse_state(PgfParsing* ps, size_t start_offset,
|
|||||||
|
|
||||||
size_t end_offset = start_offset;
|
size_t end_offset = start_offset;
|
||||||
GuString current = ps->sentence + end_offset;
|
GuString current = ps->sentence + end_offset;
|
||||||
size_t len = strlen(current);
|
while (skip_space(&current)) {
|
||||||
while (skip_space(&current, &len)) {
|
|
||||||
end_offset++;
|
end_offset++;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1224,7 +1206,10 @@ pgf_new_parse_state(PgfParsing* ps, size_t start_offset,
|
|||||||
if (ps->before == NULL && start_offset == 0)
|
if (ps->before == NULL && start_offset == 0)
|
||||||
state->needs_bind = false;
|
state->needs_bind = false;
|
||||||
|
|
||||||
pgf_parsing_lookahead(ps, state);
|
pgf_parsing_lookahead(ps, state,
|
||||||
|
0, gu_seq_length(ps->concr->sequences)-1,
|
||||||
|
0, strlen(ps->sentence)-state->end_offset);
|
||||||
|
pgf_parsing_lookahead_pre(ps, state);
|
||||||
|
|
||||||
*pstate = state;
|
*pstate = state;
|
||||||
|
|
||||||
@@ -1235,18 +1220,17 @@ static void
|
|||||||
pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item)
|
pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item)
|
||||||
{
|
{
|
||||||
GuString current = ps->sentence + ps->before->end_offset;
|
GuString current = ps->sentence + ps->before->end_offset;
|
||||||
size_t len = strlen(current);
|
|
||||||
|
|
||||||
if (ps->prefix != NULL && ps->sentence[ps->before->end_offset] == 0) {
|
if (ps->prefix != NULL && *current == 0) {
|
||||||
if (gu_string_is_prefix(ps->prefix, tok)) {
|
if (gu_string_is_prefix(ps->prefix, tok)) {
|
||||||
ps->tp = gu_new(PgfTokenProb, ps->out_pool);
|
ps->tp = gu_new(PgfTokenProb, ps->out_pool);
|
||||||
ps->tp->tok = tok;
|
ps->tp->tok = tok;
|
||||||
ps->tp->prob = item->inside_prob + item->conts->outside_prob;
|
ps->tp->prob = item->inside_prob + item->conts->outside_prob;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (!ps->before->needs_bind && cmp_string(&current, &len, tok) == 0) {
|
if (!ps->before->needs_bind && cmp_string(&current, tok) == 0) {
|
||||||
PgfParseState* state =
|
PgfParseState* state =
|
||||||
pgf_new_parse_state(ps, (current - ps->sentence),
|
pgf_new_parse_state(ps, (current - ps->sentence),
|
||||||
BIND_NONE,
|
BIND_NONE,
|
||||||
item->inside_prob+item->conts->outside_prob);
|
item->inside_prob+item->conts->outside_prob);
|
||||||
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
|
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
|
||||||
@@ -2224,7 +2208,7 @@ pgf_sequence_cmp_fn(GuOrder* self, const void* p1, const void* p2)
|
|||||||
const PgfSequence* sp2 = p2;
|
const PgfSequence* sp2 = p2;
|
||||||
|
|
||||||
BIND_TYPE bind = BIND_HARD;
|
BIND_TYPE bind = BIND_HARD;
|
||||||
int res = pgf_symbols_cmp(&sent, strlen(sent), &bind, sp2->syms);
|
int res = pgf_symbols_cmp(&sent, &bind, sp2->syms);
|
||||||
if (res == 0 && *sent != 0) {
|
if (res == 0 && *sent != 0) {
|
||||||
res = 1;
|
res = 1;
|
||||||
}
|
}
|
||||||
@@ -2275,19 +2259,18 @@ gu_fullform_enum_next(GuEnum* self, void* to, GuPool* pool)
|
|||||||
if (st->sequences != NULL) {
|
if (st->sequences != NULL) {
|
||||||
size_t n_seqs = gu_seq_length(st->sequences);
|
size_t n_seqs = gu_seq_length(st->sequences);
|
||||||
while (st->seq_idx < n_seqs) {
|
while (st->seq_idx < n_seqs) {
|
||||||
PgfSymbols* syms = gu_seq_index(st->sequences, PgfSequence, st->seq_idx)->syms;
|
PgfSequence* seq = gu_seq_index(st->sequences, PgfSequence, st->seq_idx);
|
||||||
GuString tokens = pgf_get_tokens(syms, 0, pool);
|
GuString tokens = pgf_get_tokens(seq->syms, 0, pool);
|
||||||
|
|
||||||
if (!gu_string_is_prefix(st->prefix, tokens)) {
|
if (gu_string_is_prefix(st->prefix, tokens) != 0) {
|
||||||
st->seq_idx = n_seqs;
|
st->seq_idx = n_seqs;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (strlen(tokens) > 0 &&
|
if (*tokens != 0 && seq->idx != NULL) {
|
||||||
gu_seq_index(st->sequences, PgfSequence, st->seq_idx)->idx != NULL) {
|
|
||||||
entry = gu_new(PgfFullFormEntry, pool);
|
entry = gu_new(PgfFullFormEntry, pool);
|
||||||
entry->tokens = tokens;
|
entry->tokens = tokens;
|
||||||
entry->idx = gu_seq_index(st->sequences, PgfSequence, st->seq_idx)->idx;
|
entry->idx = seq->idx;
|
||||||
|
|
||||||
st->seq_idx++;
|
st->seq_idx++;
|
||||||
break;
|
break;
|
||||||
|
|||||||
Reference in New Issue
Block a user