From 5b8212020f9752eae7cb3e3027495c723173321c Mon Sep 17 00:00:00 2001
From: Krasimir Angelov
Date: Tue, 12 Jul 2022 12:46:50 +0200
Subject: [PATCH] finished the implementation of lookupCohorts

---
Review notes (free text after the three-dash line; not part of the commit):

  What the diff does
  * heap.h: Heap() now sets values = NULL (the destructor free()s it and
    push() realloc()s it), and push() sizes the buffer with
    get_next_padovan(len+1) instead of get_next_padovan(len).
  * pgf.cxx: pgf_lookup_morpho and pgf_lookup_cohorts pass the PgfText
    sentence straight through instead of building a PgfTextRange;
    pgf_lookup_cohorts now calls phrasetable_lookup_cohorts.
  * phrasetable.{h,cxx}: the public PgfTextRange typedef is removed. A
    PgfTextSpot (character position + byte pointer) plus an explicit end
    pointer replaces it inside phrasetable.cxx. phrasetable_lookup_prefixes
    becomes static and reads its inputs from a PgfCohortsState;
    phrasetable_lookup_cohorts is the new entry point and drives it from a
    queue of PgfTextSpot values.

  Points to confirm before applying
  * NOTE(review): template argument lists look stripped in this copy of the
    patch (e.g. "Heap queue;", "ref seq", "Namespace lincats",
    "ref::untagged"), as does the author e-mail. Presumably lost during
    extraction - confirm against the repository.
  * NOTE(review): leading whitespace and blank context lines were
    reconstructed so that they agree with the hunk headers; they may not
    match the target tree byte-for-byte (git apply --ignore-whitespace).
  * NOTE(review): phrasetable_lookup_prefixes returns early when
    state->err is set, but phrasetable_lookup_cohorts does not test
    state.err between that call and its next state.callback->fn call -
    confirm this is intended.
  * NOTE(review): "state.spot.pos = -1" and "state.skip_pos = -1" are
    written without the (size_t) cast used for the same sentinel elsewhere
    in this function.

 src/runtime/c/pgf/heap.h          |   3 +-
 src/runtime/c/pgf/pgf.cxx         |  19 +--
 src/runtime/c/pgf/phrasetable.cxx | 196 +++++++++++++++++++++++++-----
 src/runtime/c/pgf/phrasetable.h   |  19 +--
 4 files changed, 179 insertions(+), 58 deletions(-)

diff --git a/src/runtime/c/pgf/heap.h b/src/runtime/c/pgf/heap.h
index f89fc8ba6..61f9d7ce8 100644
--- a/src/runtime/c/pgf/heap.h
+++ b/src/runtime/c/pgf/heap.h
@@ -7,13 +7,14 @@ public:
     Heap() {
         len = 0;
         avail = 0;
+        values = NULL;
     }
 
     ~Heap() { free(values); }
 
     void push(A value) {
         if (len >= avail) {
-            avail = get_next_padovan(len);
+            avail = get_next_padovan(len+1);
             A *new_values = (A *) realloc(values, sizeof(A)*avail);
             if (new_values == NULL)
                 throw pgf_systemerror(errno);
diff --git a/src/runtime/c/pgf/pgf.cxx b/src/runtime/c/pgf/pgf.cxx
index 525b045e3..0aaff467b 100644
--- a/src/runtime/c/pgf/pgf.cxx
+++ b/src/runtime/c/pgf/pgf.cxx
@@ -825,12 +825,8 @@ void pgf_lookup_morpho(PgfDB *db, PgfConcrRevision cnc_revision,
 
         bool case_sensitive = pgf_is_case_sensitive(concr);
 
-        PgfTextRange range;
-        range.pos = 0;
-        range.begin = (uint8_t *) &sentence->text[0];
-        range.end = (uint8_t *) &sentence->text[sentence->size];
         phrasetable_lookup(concr->phrasetable,
-                           &range, case_sensitive,
+                           sentence, case_sensitive,
                            concr->lincats,
                            callback, err);
     } PGF_API_END
@@ -847,15 +843,10 @@ void pgf_lookup_cohorts(PgfDB *db, PgfConcrRevision cnc_revision,
 
         bool case_sensitive = pgf_is_case_sensitive(concr);
 
-        PgfTextRange range;
-        range.pos = 0;
-        range.begin = (uint8_t *) &sentence->text[0];
-        range.end = (uint8_t *) &sentence->text[sentence->size];
-        phrasetable_lookup_prefixes(concr->phrasetable,
-                                    &range, case_sensitive,
-                                    concr->lincats,
-                                    1, sentence->size,
-                                    callback, err);
+        phrasetable_lookup_cohorts(concr->phrasetable,
+                                   sentence, case_sensitive,
+                                   concr->lincats,
+                                   callback, err);
     } PGF_API_END
 }
 
diff --git a/src/runtime/c/pgf/phrasetable.cxx b/src/runtime/c/pgf/phrasetable.cxx
index 45e6a4ede..1dc10b1f4 100644
--- a/src/runtime/c/pgf/phrasetable.cxx
+++ b/src/runtime/c/pgf/phrasetable.cxx
@@ -1,4 +1,5 @@
 #include "data.h"
+#include "heap.h"
 
 PgfPhrasetableIds::PgfPhrasetableIds()
 {
@@ -227,9 +228,19 @@ int sequence_cmp(ref seq1, ref seq2)
     return 0;
 }
 
+struct PGF_INTERNAL_DECL PgfTextSpot {
+    size_t pos; // position in Unicode characters
+    const uint8_t *ptr; // pointer into the spot location
+
+    bool operator >= (PgfTextSpot const &obj) {
+        return pos >= obj.pos;
+    }
+};
+
 static
-int text_range_cmp(PgfTextRange *range, ref seq,
-                   bool case_sensitive, bool full_match)
+int text_sequence_cmp(PgfTextSpot *spot, const uint8_t *end,
+                      ref seq,
+                      bool case_sensitive, bool full_match)
 {
     int res1 = 0;
 
@@ -240,7 +251,7 @@ int text_range_cmp(PgfTextRange *range, ref seq,
     size_t count = 0;
 
     for (;;) {
-        if (range->begin >= range->end) {
+        if (spot->ptr >= end) {
             if (s2 < e2 || i < seq->syms.len)
                 return -1;
             return case_sensitive ? res1 : 0;
@@ -249,7 +260,7 @@ int text_range_cmp(PgfTextRange *range, ref seq,
         if (s2 >= e2 && i >= seq->syms.len)
             return full_match ? 1 : 0;
 
-        uint32_t ucs1 = pgf_utf8_decode(&range->begin); range->pos++;
+        uint32_t ucs1 = pgf_utf8_decode(&spot->ptr); spot->pos++;
         uint32_t ucs1i = pgf_utf8_to_upper(ucs1);
 
         if (s2 >= e2) {
@@ -466,7 +477,7 @@ size_t phrasetable_size(PgfPhrasetable table)
 
 PGF_INTERNAL
 void phrasetable_lookup(PgfPhrasetable table,
-                        PgfTextRange *sentence,
+                        PgfText *sentence,
                         bool case_sensitive,
                         Namespace lincats,
                         PgfMorphoCallback* callback, PgfExn* err)
@@ -474,8 +485,11 @@ void phrasetable_lookup(PgfPhrasetable table,
     if (table == 0)
         return;
 
-    PgfTextRange current = *sentence;
-    int cmp = text_range_cmp(&current,table->value.seq,case_sensitive,true);
+    PgfTextSpot current;
+    current.pos = 0;
+    current.ptr = (uint8_t *) sentence->text;
+    const uint8_t *end = current.ptr+sentence->size;
+    int cmp = text_sequence_cmp(&current,end,table->value.seq,case_sensitive,true);
     if (cmp < 0) {
         phrasetable_lookup(table->left,sentence,case_sensitive,lincats,callback,err);
     } else if (cmp > 0) {
@@ -519,50 +533,85 @@ void phrasetable_lookup(PgfPhrasetable table,
     }
 }
 
-PGF_INTERNAL
-void phrasetable_lookup_prefixes(PgfPhrasetable table,
-                                 PgfTextRange *sentence,
-                                 bool case_sensitive,
-                                 Namespace lincats,
-                                 ptrdiff_t min, ptrdiff_t max,
-                                 PgfCohortsCallback* callback, PgfExn* err)
+struct PGF_INTERNAL_DECL PgfCohortsState {
+    PgfTextSpot spot;
+    Heap queue;
+    size_t last_pos;
+    size_t skip_pos;
+    const uint8_t *end; // pointer into the end of the sentence
+
+    bool case_sensitive;
+    Namespace lincats;
+    PgfCohortsCallback* callback;
+    PgfExn* err;
+};
+
+static
+void phrasetable_lookup_prefixes(PgfCohortsState *state,
+                                 PgfPhrasetable table,
+                                 ptrdiff_t min, ptrdiff_t max)
 {
     if (table == 0)
         return;
 
-    PgfTextRange current = *sentence;
-    int cmp = text_range_cmp(&current,table->value.seq,case_sensitive,false);
+    PgfTextSpot current = state->spot;
+    int cmp = text_sequence_cmp(&current,state->end,table->value.seq,state->case_sensitive,false);
     if (cmp < 0) {
-        phrasetable_lookup_prefixes(table->left,sentence,case_sensitive,lincats,min,max,callback,err);
+        phrasetable_lookup_prefixes(state,table->left,min,max);
     } else if (cmp > 0) {
-        ptrdiff_t len = current.begin - sentence->begin;
+        ptrdiff_t len = current.ptr - state->spot.ptr;
 
         if (min <= len)
-            phrasetable_lookup_prefixes(table->left,sentence,case_sensitive,lincats,min,len,callback,err);
+            phrasetable_lookup_prefixes(state,table->left,min,len);
 
         if (len <= max)
-            phrasetable_lookup_prefixes(table->right,sentence,case_sensitive,lincats,len,max,callback,err);
+            phrasetable_lookup_prefixes(state,table->right,len,max);
     } else {
-        ptrdiff_t len = current.begin - sentence->begin;
+        ptrdiff_t len = current.ptr - state->spot.ptr;
 
         if (min <= len)
-            phrasetable_lookup_prefixes(table->left,sentence,case_sensitive,lincats,min,len,callback,err);
+            phrasetable_lookup_prefixes(state,table->left,min,len);
 
         auto backrefs = table->value.backrefs;
         if (len > 0 && backrefs != 0) {
+            if (state->skip_pos != (size_t) -1) {
+                state->callback->fn(state->callback,
+                                    state->skip_pos,
+                                    state->spot.pos,
+                                    state->err);
+                if (state->err->type != PGF_EXN_NONE)
+                    return;
+                state->skip_pos = (size_t) -1;
+            }
+
+            if (state->last_pos > 0 && state->last_pos != current.pos) {
+                state->callback->fn(state->callback,
+                                    state->spot.pos,
+                                    state->last_pos,
+                                    state->err);
+                if (state->err->type != PGF_EXN_NONE)
+                    return;
+            }
+            state->last_pos = current.pos;
+            state->queue.push(current);
+
             for (size_t i = 0; i < backrefs->len; i++) {
                 PgfSequenceBackref backref = *vector_elem(backrefs,i);
                 switch (ref::get_tag(backref.container)) {
                 case PgfConcrLin::tag: {
                     ref lin = ref::untagged(backref.container);
                     ref lincat =
-                        namespace_lookup(lincats, &lin->absfun->type->name);
+                        namespace_lookup(state->lincats, &lin->absfun->type->name);
                     if (lincat != 0) {
                         ref field =
                             *vector_elem(lincat->fields, backref.seq_index % lincat->fields->len);
 
-                        callback->morpho.fn(&callback->morpho, &lin->absfun->name, &(*field), lincat->abscat->prob+lin->absfun->prob, err);
-                        if (err->type != PGF_EXN_NONE)
+                        state->callback->morpho.fn(&state->callback->morpho,
+                                                   &lin->absfun->name,
+                                                   &(*field),
+                                                   lincat->abscat->prob+lin->absfun->prob,
+                                                   state->err);
+                        if (state->err->type != PGF_EXN_NONE)
                             return;
                     }
                     break;
@@ -573,17 +622,104 @@ void phrasetable_lookup_prefixes(PgfPhrasetable table,
                 }
                 }
             }
-
-            callback->fn(callback, sentence->pos, current.pos, err);
-            if (err->type != PGF_EXN_NONE)
-                return;
         }
 
         if (len <= max)
-            phrasetable_lookup_prefixes(table->right,sentence,case_sensitive,lincats,len,max,callback,err);
+            phrasetable_lookup_prefixes(state,table->right,len,max);
     }
 }
 
+PGF_INTERNAL
+void phrasetable_lookup_cohorts(PgfPhrasetable table,
+                                PgfText *sentence,
+                                bool case_sensitive,
+                                Namespace lincats,
+                                PgfCohortsCallback* callback, PgfExn* err)
+{
+    PgfTextSpot spot;
+    spot.pos = 0;
+    spot.ptr = (uint8_t *) &sentence->text[0];
+
+    PgfCohortsState state;
+    state.spot.pos = -1;
+    state.spot.ptr = NULL;
+    state.queue.push(spot);
+    state.last_pos = 0;
+    state.skip_pos = (size_t) -1;
+    state.end = (uint8_t *) &sentence->text[sentence->size];
+    state.case_sensitive = case_sensitive;
+    state.lincats = lincats;
+    state.callback = callback;
+    state.err = err;
+
+    while (!state.queue.is_empty()) {
+        PgfTextSpot spot = state.queue.pop();
+        if (spot.pos != state.spot.pos) {
+            state.spot = spot;
+
+            // skip leading spaces
+            while (state.spot.ptr < state.end) {
+                const uint8_t *ptr = state.spot.ptr;
+                uint32_t ucs = pgf_utf8_decode(&ptr);
+                if (!pgf_utf8_is_space(ucs))
+                    break;
+                state.spot.pos++;
+                state.spot.ptr = ptr;
+            }
+
+            state.skip_pos = (size_t) -1;
+            while (state.spot.ptr < state.end) {
+                phrasetable_lookup_prefixes(&state, table, 1, sentence->size);
+
+                if (state.last_pos > 0) {
+                    // We found at least one match.
+                    // The last range is yet to be reported.
+                    state.callback->fn(state.callback,
+                                       state.spot.pos,
+                                       state.last_pos,
+                                       state.err);
+                    if (state.err->type != PGF_EXN_NONE)
+                        return;
+                    state.last_pos = 0;
+                    break;
+                } else {
+                    // We didn't find any matches at this position,
+                    // therefore we must skip one character and try again.
+                    if (state.skip_pos == (size_t) -1)
+                        state.skip_pos = state.spot.pos;
+                    const uint8_t *ptr = state.spot.ptr;
+                    uint32_t ucs = pgf_utf8_decode(&ptr);
+                    if (pgf_utf8_is_space(ucs)) {
+                        state.callback->fn(state.callback,
+                                           state.skip_pos,
+                                           state.spot.pos,
+                                           state.err);
+                        if (state.err->type != PGF_EXN_NONE)
+                            return;
+                        state.skip_pos = -1;
+                        state.queue.push(state.spot);
+                        break;
+                    }
+                    state.spot.pos++;
+                    state.spot.ptr = ptr;
+                }
+            }
+
+            if (state.skip_pos != (size_t) -1) {
+                state.callback->fn(state.callback,
+                                   state.skip_pos,
+                                   state.spot.pos,
+                                   state.err);
+                if (state.err->type != PGF_EXN_NONE)
+                    return;
+                state.skip_pos = (size_t) -1;
+            }
+
+            state.spot = spot;
+        }
+    }
+}
+
 PGF_INTERNAL
 void phrasetable_iter(PgfConcr *concr,
                       PgfPhrasetable table,
diff --git a/src/runtime/c/pgf/phrasetable.h b/src/runtime/c/pgf/phrasetable.h
index 1ee9b8f07..45c23ead9 100644
--- a/src/runtime/c/pgf/phrasetable.h
+++ b/src/runtime/c/pgf/phrasetable.h
@@ -68,26 +68,19 @@ PgfPhrasetable phrasetable_delete(PgfPhrasetable table,
 PGF_INTERNAL_DECL
 size_t phrasetable_size(PgfPhrasetable table);
 
-typedef struct {
-    size_t pos; // position in Unicode characters
-    const uint8_t *begin; // pointer into the beginning of the range
-    const uint8_t *end; // pointer into the end of the range
-} PgfTextRange;
-
 PGF_INTERNAL_DECL
 void phrasetable_lookup(PgfPhrasetable table,
-                        PgfTextRange *sentence,
+                        PgfText *sentence,
                         bool case_sensitive,
                         Namespace lincats,
                         PgfMorphoCallback* callback, PgfExn* err);
 
 PGF_INTERNAL_DECL
-void phrasetable_lookup_prefixes(PgfPhrasetable table,
-                                 PgfTextRange *sentence,
-                                 bool case_sensitive,
-                                 Namespace lincats,
-                                 ptrdiff_t min, ptrdiff_t max,
-                                 PgfCohortsCallback* callback, PgfExn* err);
+void phrasetable_lookup_cohorts(PgfPhrasetable table,
+                                PgfText *sentence,
+                                bool case_sensitive,
+                                Namespace lincats,
+                                PgfCohortsCallback* callback, PgfExn* err);
 
 PGF_INTERNAL_DECL
 void phrasetable_iter(PgfConcr *concr,