diff --git a/index.html b/index.html
index c9129f63c..274998d5a 100644
--- a/index.html
+++ b/index.html
@@ -59,7 +59,7 @@ function sitesearch() {
QuickStart
QuickRefCard
GF Shell Reference
- GF Summer School
+ GF Summer School
- The GF Book
diff --git a/src/runtime/c/pgf/lookup.c b/src/runtime/c/pgf/lookup.c
index 21c82450f..5918275c1 100644
--- a/src/runtime/c/pgf/lookup.c
+++ b/src/runtime/c/pgf/lookup.c
@@ -119,7 +119,7 @@ typedef struct {
static PgfAbsProduction*
pgf_lookup_new_production(PgfAbsFun* fun, GuPool *pool)
{
- size_t n_hypos = gu_seq_length(fun->type->hypos);
+ size_t n_hypos = fun->type->hypos ? gu_seq_length(fun->type->hypos) : 0; /* a NULL hypos sequence counts as zero hypotheses instead of being handed to gu_seq_length() */
PgfAbsProduction* prod = gu_new_flex(pool, PgfAbsProduction, args, n_hypos);
prod->fun = fun;
prod->count = 0;
@@ -699,8 +699,12 @@ pgf_lookup_tokenize(GuMap* lexicon_idx, GuString sentence, GuPool* pool)
break;
const uint8_t* start = p-1;
- while (c != 0 && !gu_ucs_is_space(c)) {
+ if (c == '.' || c == '!' || c == '?' || c == ',' || c == ':') /* a punctuation mark is a token of its own; compared explicitly because strchr() would truncate the decoded code point to char and would also match c == 0 */
c = gu_utf8_decode(&p);
+ else {
+ while (c != 0 && c != '.' && c != '!' && c != '?' && c != ',' && c != ':' && !gu_ucs_is_space(c)) { /* a word token ends at punctuation, whitespace or end of input */
+ c = gu_utf8_decode(&p);
+ }
}
const uint8_t* end = p-1;
diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c
index b1916c218..2650e2618 100644
--- a/src/runtime/c/pgf/parser.c
+++ b/src/runtime/c/pgf/parser.c
@@ -65,6 +65,7 @@ typedef enum { BIND_NONE, BIND_HARD, BIND_SOFT } BIND_TYPE;
typedef struct {
PgfProductionIdx* idx;
size_t offset;
+ size_t sym_idx; /* symbol index left by pgf_symbols_cmp() for the matched sequence; pgf_parsing_td_predict() forwards it to pgf_parsing_predict_lexeme() */
} PgfLexiconIdxEntry;
typedef GuBuf PgfLexiconIdx;
@@ -1060,16 +1061,16 @@ pgf_parsing_complete(PgfParsing* ps, PgfItem* item, PgfExprProb *ep)
}
static int
-pgf_symbols_cmp(GuString* psent, PgfSymbols* syms, bool case_sensitive)
+pgf_symbols_cmp(GuString* psent, PgfSymbols* syms, size_t* sym_idx, bool case_sensitive) /* *sym_idx: in = first symbol to compare, out = index of the symbol at which comparison stopped */
{
size_t n_syms = gu_seq_length(syms);
- for (size_t i = 0; i < n_syms; i++) {
- PgfSymbol sym = gu_seq_get(syms, PgfSymbol, i);
+ while (*sym_idx < n_syms) { /* the position lives in the caller's variable so it survives an early return */
+ PgfSymbol sym = gu_seq_get(syms, PgfSymbol, *sym_idx);
- if (i > 0) {
+ if (*sym_idx > 0) { /* no separator is required before the symbol at index 0 */
if (!skip_space(psent)) {
if (**psent == 0)
- return -1;
+ return 0; /* sentence exhausted: reported as equal; *sym_idx < n_syms tells the caller symbols are left over */
return 1;
}
@@ -1085,13 +1086,13 @@ pgf_symbols_cmp(GuString* psent, PgfSymbols* syms, bool case_sensitive)
case PGF_SYMBOL_LIT:
case PGF_SYMBOL_VAR: {
if (**psent == 0)
- return -1;
+ return 0; /* sentence exhausted before this symbol: equal so far, *sym_idx still points at it */
return 1;
}
case PGF_SYMBOL_KS: {
PgfSymbolKS* pks = inf.data;
if (**psent == 0)
- return -1;
+ return 0; /* sentence exhausted before this token: equal so far, *sym_idx still points at it */
int cmp = cmp_string(psent, pks->token, case_sensitive);
if (cmp != 0)
@@ -1110,6 +1111,8 @@ pgf_symbols_cmp(GuString* psent, PgfSymbols* syms, bool case_sensitive)
default:
gu_impossible();
}
+
+ (*sym_idx)++; /* advance only once the end of the loop body is reached for this symbol */
}
return 0;
@@ -1130,7 +1133,8 @@ pgf_parsing_lookahead(PgfParsing *ps, PgfParseState* state,
GuString start = ps->sentence + state->end_offset;
GuString current = start;
- int cmp = pgf_symbols_cmp(&current, seq->syms, ps->case_sensitive);
+ size_t sym_idx = 0; /* comparison starts at the first symbol; pgf_symbols_cmp() advances it */
+ int cmp = pgf_symbols_cmp(&current, seq->syms, &sym_idx, ps->case_sensitive);
if (cmp < 0) {
j = k-1;
} else if (cmp > 0) {
@@ -1151,8 +1155,9 @@ pgf_parsing_lookahead(PgfParsing *ps, PgfParseState* state,
if (seq->idx != NULL) {
PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx);
- entry->idx = seq->idx;
- entry->offset = (size_t) (current - ps->sentence);
+ entry->idx = seq->idx;
+ entry->offset = (size_t) (current - ps->sentence);
+ entry->sym_idx = sym_idx; /* remember where the comparison stopped in the sequence */
}
if (len+1 <= max)
@@ -1278,14 +1283,15 @@ pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item)
static void
pgf_parsing_predict_lexeme(PgfParsing* ps, PgfItemConts* conts,
PgfProductionIdxEntry* entry,
- size_t offset)
+ size_t offset, size_t sym_idx) /* sym_idx: symbol position the new item starts at */
{
GuVariantInfo i = { PGF_PRODUCTION_APPLY, entry->papp };
PgfProduction prod = gu_variant_close(i);
PgfItem* item =
pgf_new_item(ps, conts, prod);
PgfSymbols* syms = entry->papp->fun->lins[conts->lin_idx]->syms;
- item->sym_idx = gu_seq_length(syms);
+ item->sym_idx = sym_idx; /* caller-supplied position instead of always the end of the sequence */
+ pgf_item_set_curr_symbol(item, ps->pool); /* NOTE(review): presumably re-derives the item's current symbol for the new sym_idx - confirm against pgf_item_set_curr_symbol() */
prob_t prob = item->inside_prob+item->conts->outside_prob;
PgfParseState* state =
pgf_new_parse_state(ps, offset, BIND_NONE, prob);
@@ -1358,7 +1364,7 @@ pgf_parsing_td_predict(PgfParsing* ps,
PgfProductionIdxEntry, &key);
if (value != NULL) {
- pgf_parsing_predict_lexeme(ps, conts, value, lentry->offset);
+ pgf_parsing_predict_lexeme(ps, conts, value, lentry->offset, lentry->sym_idx); /* the entry found by the search, with the symbol index stored in the lexicon entry */
PgfProductionIdxEntry* start =
gu_buf_data(lentry->idx);
@@ -1369,7 +1375,7 @@ pgf_parsing_td_predict(PgfParsing* ps,
while (left >= start &&
value->ccat->fid == left->ccat->fid &&
value->lin_idx == left->lin_idx) {
- pgf_parsing_predict_lexeme(ps, conts, left, lentry->offset);
+ pgf_parsing_predict_lexeme(ps, conts, left, lentry->offset, lentry->sym_idx); /* preceding entries with the same ccat fid and lin_idx */
left--;
}
@@ -1377,7 +1383,7 @@ pgf_parsing_td_predict(PgfParsing* ps,
while (right <= end &&
value->ccat->fid == right->ccat->fid &&
value->lin_idx == right->lin_idx) {
- pgf_parsing_predict_lexeme(ps, conts, right, lentry->offset);
+ pgf_parsing_predict_lexeme(ps, conts, right, lentry->offset, lentry->sym_idx); /* following entries with the same ccat fid and lin_idx */
right++;
}
}
@@ -2372,8 +2378,9 @@ pgf_sequence_cmp_fn(GuOrder* order, const void* p1, const void* p2)
GuString sent = (GuString) p1;
const PgfSequence* sp2 = p2;
- int res = pgf_symbols_cmp(&sent, sp2->syms, self->case_sensitive);
- if (res == 0 && *sent != 0) {
+ size_t sym_idx = 0;
+ int res = pgf_symbols_cmp(&sent, sp2->syms, &sym_idx, self->case_sensitive);
+ if (res == 0 && (*sent != 0 || sym_idx != gu_seq_length(sp2->syms))) { /* pgf_symbols_cmp() now returns 0 for partial matches too, so only a full match on both sides is "equal" */
- res = 1;
+ res = (*sent != 0) ? 1 : -1; /* text left over: sentence sorts after the sequence; symbols left over: sentence is a proper prefix and sorts before it (the -1 pgf_symbols_cmp() used to return) */
}