1
0
forked from GitHub/gf-core

use TextSpots for correct extraction of unknown words

This commit is contained in:
Krasimir Angelov
2022-09-30 11:58:09 +02:00
parent 6b63c2f779
commit 855fa7ebf3
5 changed files with 56 additions and 54 deletions

View File

@@ -83,7 +83,7 @@ public:
} }
public: public:
size_t start, end; PgfTextSpot start, end;
State *prev, *next; State *prev, *next;
prob_t viterbi_prob; prob_t viterbi_prob;
@@ -495,10 +495,10 @@ public:
prev = prev->prev; prev = prev->prev;
} }
size_t size = state->start-prev->end; size_t size = state->start.ptr-prev->end.ptr;
PgfText *token = (PgfText *) alloca(sizeof(PgfText)+size+1); PgfText *token = (PgfText *) alloca(sizeof(PgfText)+size+1);
token->size = size; token->size = size;
memcpy(token->text,parser->sentence->text+prev->end,size); memcpy(token->text,prev->end.ptr,size);
token->text[size] = 0; token->text[size] = 0;
PgfExpr expr = u->elit(u->lstr(token)); PgfExpr expr = u->elit(u->lstr(token));
@@ -638,19 +638,19 @@ PgfParser::PgfParser(ref<PgfConcr> concr, ref<PgfConcrLincat> start, PgfText *se
this->m = m; this->m = m;
} }
void PgfParser::space(size_t start, size_t end, PgfExn* err) void PgfParser::space(PgfTextSpot *start, PgfTextSpot *end, PgfExn* err)
{ {
State *prev = NULL; State *prev = NULL;
State *next = before; State *next = before;
while (next != NULL && next->start < start) { while (next != NULL && next->start.pos < start->pos) {
prev = next; prev = next;
next = next->next; next = next->next;
} }
if (next == NULL || next->start != start) { if (next == NULL || next->start.pos != start->pos) {
before = new State(); before = new State();
before->start = start; before->start = *start;
before->end = end; before->end = *end;
before->prev = prev; before->prev = prev;
before->next = next; before->next = next;
before->viterbi_prob = prev ? prev->viterbi_prob : 0; before->viterbi_prob = prev ? prev->viterbi_prob : 0;
@@ -659,23 +659,23 @@ void PgfParser::space(size_t start, size_t end, PgfExn* err)
if (next != NULL) next->prev = before; if (next != NULL) next->prev = before;
} else { } else {
before = next; before = next;
before->end = end; before->end = *end;
} }
} }
void PgfParser::start_matches(size_t end, PgfExn* err) void PgfParser::start_matches(PgfTextSpot *end, PgfExn* err)
{ {
State *prev = NULL; State *prev = NULL;
State *next = before; State *next = before;
while (next != NULL && next->start < end) { while (next != NULL && next->start.pos < end->pos) {
prev = next; prev = next;
next = next->next; next = next->next;
} }
if (next == NULL || next->start != end) { if (next == NULL || next->start.pos != end->pos) {
after = new State(); after = new State();
after->start = end; after->start = *end;
after->end = end; after->end = *end;
after->prev = prev; after->prev = prev;
after->next = next; after->next = next;
after->viterbi_prob = prev ? prev->viterbi_prob : 0; after->viterbi_prob = prev ? prev->viterbi_prob : 0;
@@ -696,7 +696,7 @@ void PgfParser::match(ref<PgfConcrLin> lin, size_t seq_index, PgfExn* err)
after->queue.push(new(0) ParseItem(conts, lin, seq_index)); after->queue.push(new(0) ParseItem(conts, lin, seq_index));
} }
void PgfParser::end_matches(size_t end, PgfExn* err) void PgfParser::end_matches(PgfTextSpot *end, PgfExn* err)
{ {
while (!after->queue.empty()) { while (!after->queue.empty()) {
Item *item = after->queue.top(); Item *item = after->queue.top();

View File

@@ -5,10 +5,10 @@ class PGF_INTERNAL_DECL PgfParser : public PgfPhraseScanner, public PgfExprEnum
public: public:
PgfParser(ref<PgfConcr> concr, ref<PgfConcrLincat> start, PgfText *sentence, PgfMarshaller *m); PgfParser(ref<PgfConcr> concr, ref<PgfConcrLincat> start, PgfText *sentence, PgfMarshaller *m);
void space(size_t start, size_t end, PgfExn* err); void space(PgfTextSpot *start, PgfTextSpot *end, PgfExn* err);
void start_matches(size_t end, PgfExn* err); void start_matches(PgfTextSpot *end, PgfExn* err);
void match(ref<PgfConcrLin> lin, size_t seq_index, PgfExn* err); void match(ref<PgfConcrLin> lin, size_t seq_index, PgfExn* err);
void end_matches(size_t end, PgfExn* err); void end_matches(PgfTextSpot *end, PgfExn* err);
void prepare(); void prepare();
PgfExpr fetch(PgfDB *db, PgfUnmarshaller *u, prob_t *prob); PgfExpr fetch(PgfDB *db, PgfUnmarshaller *u, prob_t *prob);

View File

@@ -822,11 +822,11 @@ public:
this->callback = callback; this->callback = callback;
} }
virtual void space(size_t start, size_t end, PgfExn* err) virtual void space(PgfTextSpot *start, PgfTextSpot *end, PgfExn* err)
{ {
} }
virtual void start_matches(size_t end, PgfExn* err) virtual void start_matches(PgfTextSpot *end, PgfExn* err)
{ {
} }
@@ -837,7 +837,7 @@ public:
callback->fn(callback, &lin->absfun->name, &(*field->name), lin->lincat->abscat->prob+lin->absfun->prob, err); callback->fn(callback, &lin->absfun->name, &(*field->name), lin->lincat->abscat->prob+lin->absfun->prob, err);
} }
virtual void end_matches(size_t end, PgfExn* err) virtual void end_matches(PgfTextSpot *end, PgfExn* err)
{ {
} }
@@ -869,12 +869,12 @@ public:
this->callback = callback; this->callback = callback;
} }
virtual void space(size_t start, size_t end, PgfExn* err) virtual void space(PgfTextSpot *start, PgfTextSpot *end, PgfExn* err)
{ {
match_start = end; match_start = end->pos;
} }
virtual void start_matches(size_t match_end, PgfExn* err) virtual void start_matches(PgfTextSpot *end, PgfExn* err)
{ {
} }
@@ -885,9 +885,9 @@ public:
callback->morpho.fn(&callback->morpho, &lin->absfun->name, &(*field->name), lin->lincat->abscat->prob+lin->absfun->prob, err); callback->morpho.fn(&callback->morpho, &lin->absfun->name, &(*field->name), lin->lincat->abscat->prob+lin->absfun->prob, err);
} }
virtual void end_matches(size_t match_end, PgfExn* err) virtual void end_matches(PgfTextSpot *end, PgfExn* err)
{ {
callback->fn(callback, match_start, match_end, err); callback->fn(callback, match_start, end->pos, err);
} }
private: private:

View File

@@ -228,11 +228,6 @@ int sequence_cmp(ref<PgfSequence> seq1, ref<PgfSequence> seq2)
return 0; return 0;
} }
struct PGF_INTERNAL_DECL PgfTextSpot {
size_t pos; // position in Unicode characters
const uint8_t *ptr; // pointer into the spot location
};
static static
int text_sequence_cmp(PgfTextSpot *spot, const uint8_t *end, int text_sequence_cmp(PgfTextSpot *spot, const uint8_t *end,
ref<PgfSequence> seq, ref<PgfSequence> seq,
@@ -535,7 +530,7 @@ struct PGF_INTERNAL_DECL PgfCohortsState {
PgfTextSpot spot; PgfTextSpot spot;
std::priority_queue<PgfTextSpot, std::vector<PgfTextSpot>, PgfTextSpotComparator> queue; std::priority_queue<PgfTextSpot, std::vector<PgfTextSpot>, PgfTextSpotComparator> queue;
size_t last_pos; PgfTextSpot last;
bool skipping; bool skipping;
const uint8_t *end; // pointer into the end of the sentence const uint8_t *end; // pointer into the end of the sentence
@@ -552,34 +547,35 @@ void finish_skipping(PgfCohortsState *state) {
if (spot.pos >= state->spot.pos) if (spot.pos >= state->spot.pos)
break; break;
if (spot.pos != state->last_pos) { if (spot.pos != state->last.pos) {
if (state->last_pos > 0) { if (state->last.pos > 0) {
state->scanner->space(spot.pos, spot.pos, state->scanner->space(&spot, &spot,
state->err); state->err);
if (state->err->type != PGF_EXN_NONE) if (state->err->type != PGF_EXN_NONE)
return; return;
} }
state->scanner->start_matches(state->spot.pos, state->scanner->start_matches(&state->spot,
state->err); state->err);
if (state->err->type != PGF_EXN_NONE) if (state->err->type != PGF_EXN_NONE)
return; return;
state->scanner->end_matches(state->spot.pos, state->scanner->end_matches(&state->spot,
state->err); state->err);
if (state->err->type != PGF_EXN_NONE) if (state->err->type != PGF_EXN_NONE)
return; return;
state->last_pos = spot.pos; state->last = spot;
} }
state->queue.pop(); state->queue.pop();
} }
state->scanner->space(state->spot.pos, state->spot.pos, state->scanner->space(&state->spot, &state->spot,
state->err); state->err);
state->last_pos = 0; state->last.pos = 0;
state->last.ptr = NULL;
state->skipping = false; state->skipping = false;
} }
} }
@@ -616,20 +612,20 @@ void phrasetable_lookup_prefixes(PgfCohortsState *state,
auto backrefs = table->value.backrefs; auto backrefs = table->value.backrefs;
if (len > 0 && backrefs != 0) { if (len > 0 && backrefs != 0) {
if (state->last_pos != current.pos) { if (state->last.pos != current.pos) {
if (state->last_pos > 0) { if (state->last.pos > 0) {
state->scanner->end_matches(state->last_pos, state->scanner->end_matches(&state->last,
state->err); state->err);
if (state->err->type != PGF_EXN_NONE) if (state->err->type != PGF_EXN_NONE)
return; return;
} }
state->scanner->start_matches(current.pos, state->scanner->start_matches(&current,
state->err); state->err);
if (state->err->type != PGF_EXN_NONE) if (state->err->type != PGF_EXN_NONE)
return; return;
state->last_pos = current.pos; state->last = current;
} }
state->queue.push(current); state->queue.push(current);
@@ -668,13 +664,13 @@ void phrasetable_lookup_cohorts(PgfPhrasetable table,
{ {
PgfTextSpot spot; PgfTextSpot spot;
spot.pos = 0; spot.pos = 0;
spot.ptr = (uint8_t *) &sentence->text[0]; spot.ptr = (uint8_t *) sentence->text;
PgfCohortsState state; PgfCohortsState state;
state.spot.pos = -1; state.spot.pos = -1;
state.spot.ptr = NULL; state.spot.ptr = NULL;
state.queue.push(spot); state.queue.push(spot);
state.last_pos = 0; state.last = spot;
state.skipping = false; state.skipping = false;
state.end = (uint8_t *) &sentence->text[sentence->size]; state.end = (uint8_t *) &sentence->text[sentence->size];
state.case_sensitive = case_sensitive; state.case_sensitive = case_sensitive;
@@ -698,7 +694,7 @@ void phrasetable_lookup_cohorts(PgfPhrasetable table,
state.spot.ptr = ptr; state.spot.ptr = ptr;
} }
state.scanner->space(spot.pos,state.spot.pos,state.err); state.scanner->space(&spot,&state.spot,state.err);
if (state.err->type != PGF_EXN_NONE) if (state.err->type != PGF_EXN_NONE)
return; return;
@@ -707,14 +703,15 @@ void phrasetable_lookup_cohorts(PgfPhrasetable table,
if (state.err->type != PGF_EXN_NONE) if (state.err->type != PGF_EXN_NONE)
return; return;
if (state.last_pos > 0) { if (state.last.pos > 0) {
// We found at least one match. // We found at least one match.
// The last range is yet to be reported. // The last range is yet to be reported.
state.scanner->end_matches(state.last_pos, state.scanner->end_matches(&state.last,
state.err); state.err);
if (state.err->type != PGF_EXN_NONE) if (state.err->type != PGF_EXN_NONE)
return; return;
state.last_pos = 0; state.last.pos = 0;
state.last.ptr = (uint8_t*) sentence->text;
break; break;
} else { } else {
// No matches were found, try the next position // No matches were found, try the next position

View File

@@ -71,12 +71,17 @@ size_t phrasetable_size(PgfPhrasetable table);
class PgfConcrLin; class PgfConcrLin;
// A location inside a text being scanned, tracked in two units at once:
// a count of characters and a raw byte pointer for the same place.
// Keeping both lets callers report positions in characters while still
// slicing the underlying bytes directly (e.g. the byte length between two
// spots is the difference of their `ptr` fields).
// The struct only refers to the text; it does not manage that memory.
struct PGF_INTERNAL_DECL PgfTextSpot {
size_t pos; // position in Unicode characters
const uint8_t *ptr; // pointer to the byte of the text at this spot; NULL when no location is set
};
class PGF_INTERNAL_DECL PgfPhraseScanner { class PGF_INTERNAL_DECL PgfPhraseScanner {
public: public:
virtual void space(size_t start, size_t end, PgfExn* err)=0; virtual void space(PgfTextSpot *start, PgfTextSpot *end, PgfExn* err)=0;
virtual void start_matches(size_t pos, PgfExn* err)=0; virtual void start_matches(PgfTextSpot *spot, PgfExn* err)=0;
virtual void match(ref<PgfConcrLin> lin, size_t seq_index, PgfExn* err)=0; virtual void match(ref<PgfConcrLin> lin, size_t seq_index, PgfExn* err)=0;
virtual void end_matches(size_t pos, PgfExn* err)=0; virtual void end_matches(PgfTextSpot *spot, PgfExn* err)=0;
}; };
PGF_INTERNAL_DECL PGF_INTERNAL_DECL