From 855fa7ebf3ef9cb449750719d43db6004427addf Mon Sep 17 00:00:00 2001 From: Krasimir Angelov Date: Fri, 30 Sep 2022 11:58:09 +0200 Subject: [PATCH] use TextSpots for correct extraction of unknown words --- src/runtime/c/pgf/parser.cxx | 30 ++++++++++---------- src/runtime/c/pgf/parser.h | 6 ++-- src/runtime/c/pgf/pgf.cxx | 16 +++++------ src/runtime/c/pgf/phrasetable.cxx | 47 +++++++++++++++---------------- src/runtime/c/pgf/phrasetable.h | 11 ++++++-- 5 files changed, 56 insertions(+), 54 deletions(-) diff --git a/src/runtime/c/pgf/parser.cxx b/src/runtime/c/pgf/parser.cxx index 2a5e1897c..3fce54c1e 100644 --- a/src/runtime/c/pgf/parser.cxx +++ b/src/runtime/c/pgf/parser.cxx @@ -83,7 +83,7 @@ public: } public: - size_t start, end; + PgfTextSpot start, end; State *prev, *next; prob_t viterbi_prob; @@ -495,10 +495,10 @@ public: prev = prev->prev; } - size_t size = state->start-prev->end; + size_t size = state->start.ptr-prev->end.ptr; PgfText *token = (PgfText *) alloca(sizeof(PgfText)+size+1); token->size = size; - memcpy(token->text,parser->sentence->text+prev->end,size); + memcpy(token->text,prev->end.ptr,size); token->text[size] = 0; PgfExpr expr = u->elit(u->lstr(token)); @@ -638,19 +638,19 @@ PgfParser::PgfParser(ref concr, ref start, PgfText *se this->m = m; } -void PgfParser::space(size_t start, size_t end, PgfExn* err) +void PgfParser::space(PgfTextSpot *start, PgfTextSpot *end, PgfExn* err) { State *prev = NULL; State *next = before; - while (next != NULL && next->start < start) { + while (next != NULL && next->start.pos < start->pos) { prev = next; next = next->next; } - if (next == NULL || next->start != start) { + if (next == NULL || next->start.pos != start->pos) { before = new State(); - before->start = start; - before->end = end; + before->start = *start; + before->end = *end; before->prev = prev; before->next = next; before->viterbi_prob = prev ? 
prev->viterbi_prob : 0; @@ -659,23 +659,23 @@ void PgfParser::space(size_t start, size_t end, PgfExn* err) if (next != NULL) next->prev = before; } else { before = next; - before->end = end; + before->end = *end; } } -void PgfParser::start_matches(size_t end, PgfExn* err) +void PgfParser::start_matches(PgfTextSpot *end, PgfExn* err) { State *prev = NULL; State *next = before; - while (next != NULL && next->start < end) { + while (next != NULL && next->start.pos < end->pos) { prev = next; next = next->next; } - if (next == NULL || next->start != end) { + if (next == NULL || next->start.pos != end->pos) { after = new State(); - after->start = end; - after->end = end; + after->start = *end; + after->end = *end; after->prev = prev; after->next = next; after->viterbi_prob = prev ? prev->viterbi_prob : 0; @@ -696,7 +696,7 @@ void PgfParser::match(ref lin, size_t seq_index, PgfExn* err) after->queue.push(new(0) ParseItem(conts, lin, seq_index)); } -void PgfParser::end_matches(size_t end, PgfExn* err) +void PgfParser::end_matches(PgfTextSpot *end, PgfExn* err) { while (!after->queue.empty()) { Item *item = after->queue.top(); diff --git a/src/runtime/c/pgf/parser.h b/src/runtime/c/pgf/parser.h index 66e586d28..a4a7ae002 100644 --- a/src/runtime/c/pgf/parser.h +++ b/src/runtime/c/pgf/parser.h @@ -5,10 +5,10 @@ class PGF_INTERNAL_DECL PgfParser : public PgfPhraseScanner, public PgfExprEnum public: PgfParser(ref concr, ref start, PgfText *sentence, PgfMarshaller *m); - void space(size_t start, size_t end, PgfExn* err); - void start_matches(size_t end, PgfExn* err); + void space(PgfTextSpot *start, PgfTextSpot *end, PgfExn* err); + void start_matches(PgfTextSpot *end, PgfExn* err); void match(ref lin, size_t seq_index, PgfExn* err); - void end_matches(size_t end, PgfExn* err); + void end_matches(PgfTextSpot *end, PgfExn* err); void prepare(); PgfExpr fetch(PgfDB *db, PgfUnmarshaller *u, prob_t *prob); diff --git a/src/runtime/c/pgf/pgf.cxx b/src/runtime/c/pgf/pgf.cxx index 
1dfdc8c7e..4336bbc69 100644 --- a/src/runtime/c/pgf/pgf.cxx +++ b/src/runtime/c/pgf/pgf.cxx @@ -822,11 +822,11 @@ public: this->callback = callback; } - virtual void space(size_t start, size_t end, PgfExn* err) + virtual void space(PgfTextSpot *start, PgfTextSpot *end, PgfExn* err) { } - virtual void start_matches(size_t end, PgfExn* err) + virtual void start_matches(PgfTextSpot *end, PgfExn* err) { } @@ -837,7 +837,7 @@ public: callback->fn(callback, &lin->absfun->name, &(*field->name), lin->lincat->abscat->prob+lin->absfun->prob, err); } - virtual void end_matches(size_t end, PgfExn* err) + virtual void end_matches(PgfTextSpot *end, PgfExn* err) { } @@ -869,12 +869,12 @@ public: this->callback = callback; } - virtual void space(size_t start, size_t end, PgfExn* err) + virtual void space(PgfTextSpot *start, PgfTextSpot *end, PgfExn* err) { - match_start = end; + match_start = end->pos; } - virtual void start_matches(size_t match_end, PgfExn* err) + virtual void start_matches(PgfTextSpot *end, PgfExn* err) { } @@ -885,9 +885,9 @@ public: callback->morpho.fn(&callback->morpho, &lin->absfun->name, &(*field->name), lin->lincat->abscat->prob+lin->absfun->prob, err); } - virtual void end_matches(size_t match_end, PgfExn* err) + virtual void end_matches(PgfTextSpot *end, PgfExn* err) { - callback->fn(callback, match_start, match_end, err); + callback->fn(callback, match_start, end->pos, err); } private: diff --git a/src/runtime/c/pgf/phrasetable.cxx b/src/runtime/c/pgf/phrasetable.cxx index c98e749b4..e1bd6d8bb 100644 --- a/src/runtime/c/pgf/phrasetable.cxx +++ b/src/runtime/c/pgf/phrasetable.cxx @@ -228,11 +228,6 @@ int sequence_cmp(ref seq1, ref seq2) return 0; } -struct PGF_INTERNAL_DECL PgfTextSpot { - size_t pos; // position in Unicode characters - const uint8_t *ptr; // pointer into the spot location -}; - static int text_sequence_cmp(PgfTextSpot *spot, const uint8_t *end, ref seq, @@ -535,7 +530,7 @@ struct PGF_INTERNAL_DECL PgfCohortsState { PgfTextSpot spot; 
std::priority_queue, PgfTextSpotComparator> queue; - size_t last_pos; + PgfTextSpot last; bool skipping; const uint8_t *end; // pointer into the end of the sentence @@ -552,34 +547,35 @@ void finish_skipping(PgfCohortsState *state) { if (spot.pos >= state->spot.pos) break; - if (spot.pos != state->last_pos) { - if (state->last_pos > 0) { - state->scanner->space(spot.pos, spot.pos, + if (spot.pos != state->last.pos) { + if (state->last.pos > 0) { + state->scanner->space(&spot, &spot, state->err); if (state->err->type != PGF_EXN_NONE) return; } - state->scanner->start_matches(state->spot.pos, + state->scanner->start_matches(&state->spot, state->err); if (state->err->type != PGF_EXN_NONE) return; - state->scanner->end_matches(state->spot.pos, + state->scanner->end_matches(&state->spot, state->err); if (state->err->type != PGF_EXN_NONE) return; - state->last_pos = spot.pos; + state->last = spot; } state->queue.pop(); } - state->scanner->space(state->spot.pos, state->spot.pos, + state->scanner->space(&state->spot, &state->spot, state->err); - state->last_pos = 0; + state->last.pos = 0; + state->last.ptr = NULL; state->skipping = false; } } @@ -616,20 +612,20 @@ void phrasetable_lookup_prefixes(PgfCohortsState *state, auto backrefs = table->value.backrefs; if (len > 0 && backrefs != 0) { - if (state->last_pos != current.pos) { - if (state->last_pos > 0) { - state->scanner->end_matches(state->last_pos, + if (state->last.pos != current.pos) { + if (state->last.pos > 0) { + state->scanner->end_matches(&state->last, state->err); if (state->err->type != PGF_EXN_NONE) return; } - state->scanner->start_matches(current.pos, + state->scanner->start_matches(&current, state->err); if (state->err->type != PGF_EXN_NONE) return; - state->last_pos = current.pos; + state->last = current; } state->queue.push(current); @@ -668,13 +664,13 @@ void phrasetable_lookup_cohorts(PgfPhrasetable table, { PgfTextSpot spot; spot.pos = 0; - spot.ptr = (uint8_t *) &sentence->text[0]; + spot.ptr = (uint8_t
*) sentence->text; PgfCohortsState state; state.spot.pos = -1; state.spot.ptr = NULL; state.queue.push(spot); - state.last_pos = 0; + state.last = spot; state.skipping = false; state.end = (uint8_t *) &sentence->text[sentence->size]; state.case_sensitive = case_sensitive; @@ -698,7 +694,7 @@ void phrasetable_lookup_cohorts(PgfPhrasetable table, state.spot.ptr = ptr; } - state.scanner->space(spot.pos,state.spot.pos,state.err); + state.scanner->space(&spot,&state.spot,state.err); if (state.err->type != PGF_EXN_NONE) return; @@ -707,14 +703,15 @@ void phrasetable_lookup_cohorts(PgfPhrasetable table, if (state.err->type != PGF_EXN_NONE) return; - if (state.last_pos > 0) { + if (state.last.pos > 0) { // We found at least one match. // The last range is yet to be reported. - state.scanner->end_matches(state.last_pos, + state.scanner->end_matches(&state.last, state.err); if (state.err->type != PGF_EXN_NONE) return; - state.last_pos = 0; + state.last.pos = 0; + state.last.ptr = (uint8_t*) sentence->text; break; } else { // No matches were found, try the next position diff --git a/src/runtime/c/pgf/phrasetable.h b/src/runtime/c/pgf/phrasetable.h index aadd80227..db1c0a4bb 100644 --- a/src/runtime/c/pgf/phrasetable.h +++ b/src/runtime/c/pgf/phrasetable.h @@ -71,12 +71,17 @@ size_t phrasetable_size(PgfPhrasetable table); class PgfConcrLin; +struct PGF_INTERNAL_DECL PgfTextSpot { + size_t pos; // position in Unicode characters + const uint8_t *ptr; // pointer into the spot location +}; + class PGF_INTERNAL_DECL PgfPhraseScanner { public: - virtual void space(size_t start, size_t end, PgfExn* err)=0; - virtual void start_matches(size_t pos, PgfExn* err)=0; + virtual void space(PgfTextSpot *start, PgfTextSpot *end, PgfExn* err)=0; + virtual void start_matches(PgfTextSpot *spot, PgfExn* err)=0; virtual void match(ref lin, size_t seq_index, PgfExn* err)=0; - virtual void end_matches(size_t pos, PgfExn* err)=0; + virtual void end_matches(PgfTextSpot *spot, PgfExn* err)=0; }; 
PGF_INTERNAL_DECL