From bb4218433fe8afabbc5239b2ca41731bb3e2da4b Mon Sep 17 00:00:00 2001 From: Krasimir Angelov Date: Thu, 22 Feb 2018 11:35:54 +0100 Subject: [PATCH] - tweak the tokenizer in pgf_lookup_sentence to treat .!?,: as separate tokens + fix for a bug which causes crashes --- src/runtime/c/pgf/lookup.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/runtime/c/pgf/lookup.c b/src/runtime/c/pgf/lookup.c index 21c82450f..5918275c1 100644 --- a/src/runtime/c/pgf/lookup.c +++ b/src/runtime/c/pgf/lookup.c @@ -119,7 +119,7 @@ typedef struct { static PgfAbsProduction* pgf_lookup_new_production(PgfAbsFun* fun, GuPool *pool) { - size_t n_hypos = gu_seq_length(fun->type->hypos); + size_t n_hypos = fun->type->hypos ? gu_seq_length(fun->type->hypos) : 0; PgfAbsProduction* prod = gu_new_flex(pool, PgfAbsProduction, args, n_hypos); prod->fun = fun; prod->count = 0; @@ -699,8 +699,12 @@ pgf_lookup_tokenize(GuMap* lexicon_idx, GuString sentence, GuPool* pool) break; const uint8_t* start = p-1; - while (c != 0 && !gu_ucs_is_space(c)) { + if (strchr(".!?,:",c) != NULL) c = gu_utf8_decode(&p); + else { + while (c != 0 && strchr(".!?,:",c) == NULL && !gu_ucs_is_space(c)) { + c = gu_utf8_decode(&p); + } } const uint8_t* end = p-1;