1
0
forked from GitHub/gf-core

finished the implementation of lookupCohorts

This commit is contained in:
Krasimir Angelov
2022-07-12 12:46:50 +02:00
parent e546c2a0ce
commit 5b8212020f
4 changed files with 179 additions and 58 deletions

View File

@@ -7,13 +7,14 @@ public:
Heap() { Heap() {
len = 0; len = 0;
avail = 0; avail = 0;
values = NULL;
} }
~Heap() { free(values); } ~Heap() { free(values); }
void push(A value) { void push(A value) {
if (len >= avail) { if (len >= avail) {
avail = get_next_padovan(len); avail = get_next_padovan(len+1);
A *new_values = (A *) realloc(values, sizeof(A)*avail); A *new_values = (A *) realloc(values, sizeof(A)*avail);
if (new_values == NULL) if (new_values == NULL)
throw pgf_systemerror(errno); throw pgf_systemerror(errno);

View File

@@ -825,12 +825,8 @@ void pgf_lookup_morpho(PgfDB *db, PgfConcrRevision cnc_revision,
bool case_sensitive = pgf_is_case_sensitive(concr); bool case_sensitive = pgf_is_case_sensitive(concr);
PgfTextRange range;
range.pos = 0;
range.begin = (uint8_t *) &sentence->text[0];
range.end = (uint8_t *) &sentence->text[sentence->size];
phrasetable_lookup(concr->phrasetable, phrasetable_lookup(concr->phrasetable,
&range, case_sensitive, sentence, case_sensitive,
concr->lincats, concr->lincats,
callback, err); callback, err);
} PGF_API_END } PGF_API_END
@@ -847,15 +843,10 @@ void pgf_lookup_cohorts(PgfDB *db, PgfConcrRevision cnc_revision,
bool case_sensitive = pgf_is_case_sensitive(concr); bool case_sensitive = pgf_is_case_sensitive(concr);
PgfTextRange range; phrasetable_lookup_cohorts(concr->phrasetable,
range.pos = 0; sentence, case_sensitive,
range.begin = (uint8_t *) &sentence->text[0]; concr->lincats,
range.end = (uint8_t *) &sentence->text[sentence->size]; callback, err);
phrasetable_lookup_prefixes(concr->phrasetable,
&range, case_sensitive,
concr->lincats,
1, sentence->size,
callback, err);
} PGF_API_END } PGF_API_END
} }

View File

@@ -1,4 +1,5 @@
#include "data.h" #include "data.h"
#include "heap.h"
PgfPhrasetableIds::PgfPhrasetableIds() PgfPhrasetableIds::PgfPhrasetableIds()
{ {
@@ -227,9 +228,19 @@ int sequence_cmp(ref<PgfSequence> seq1, ref<PgfSequence> seq2)
return 0; return 0;
} }
// A location inside the text being analysed, tracked both as a
// character index and as a pointer to the bytes at that index.
struct PGF_INTERNAL_DECL PgfTextSpot {
    size_t pos;          // position in Unicode characters
    const uint8_t *ptr;  // pointer to the bytes at that position

    // Spots are ordered by character position only; ptr is not compared.
    // The operator is const-qualified so that it can also be applied
    // to const spots (it never modifies either operand).
    bool operator >= (PgfTextSpot const &obj) const {
        return pos >= obj.pos;
    }
};
static static
int text_range_cmp(PgfTextRange *range, ref<PgfSequence> seq, int text_sequence_cmp(PgfTextSpot *spot, const uint8_t *end,
bool case_sensitive, bool full_match) ref<PgfSequence> seq,
bool case_sensitive, bool full_match)
{ {
int res1 = 0; int res1 = 0;
@@ -240,7 +251,7 @@ int text_range_cmp(PgfTextRange *range, ref<PgfSequence> seq,
size_t count = 0; size_t count = 0;
for (;;) { for (;;) {
if (range->begin >= range->end) { if (spot->ptr >= end) {
if (s2 < e2 || i < seq->syms.len) if (s2 < e2 || i < seq->syms.len)
return -1; return -1;
return case_sensitive ? res1 : 0; return case_sensitive ? res1 : 0;
@@ -249,7 +260,7 @@ int text_range_cmp(PgfTextRange *range, ref<PgfSequence> seq,
if (s2 >= e2 && i >= seq->syms.len) if (s2 >= e2 && i >= seq->syms.len)
return full_match ? 1 : 0; return full_match ? 1 : 0;
uint32_t ucs1 = pgf_utf8_decode(&range->begin); range->pos++; uint32_t ucs1 = pgf_utf8_decode(&spot->ptr); spot->pos++;
uint32_t ucs1i = pgf_utf8_to_upper(ucs1); uint32_t ucs1i = pgf_utf8_to_upper(ucs1);
if (s2 >= e2) { if (s2 >= e2) {
@@ -466,7 +477,7 @@ size_t phrasetable_size(PgfPhrasetable table)
PGF_INTERNAL PGF_INTERNAL
void phrasetable_lookup(PgfPhrasetable table, void phrasetable_lookup(PgfPhrasetable table,
PgfTextRange *sentence, PgfText *sentence,
bool case_sensitive, bool case_sensitive,
Namespace<PgfConcrLincat> lincats, Namespace<PgfConcrLincat> lincats,
PgfMorphoCallback* callback, PgfExn* err) PgfMorphoCallback* callback, PgfExn* err)
@@ -474,8 +485,11 @@ void phrasetable_lookup(PgfPhrasetable table,
if (table == 0) if (table == 0)
return; return;
PgfTextRange current = *sentence; PgfTextSpot current;
int cmp = text_range_cmp(&current,table->value.seq,case_sensitive,true); current.pos = 0;
current.ptr = (uint8_t *) sentence->text;
const uint8_t *end = current.ptr+sentence->size;
int cmp = text_sequence_cmp(&current,end,table->value.seq,case_sensitive,true);
if (cmp < 0) { if (cmp < 0) {
phrasetable_lookup(table->left,sentence,case_sensitive,lincats,callback,err); phrasetable_lookup(table->left,sentence,case_sensitive,lincats,callback,err);
} else if (cmp > 0) { } else if (cmp > 0) {
@@ -519,50 +533,85 @@ void phrasetable_lookup(PgfPhrasetable table,
} }
} }
PGF_INTERNAL struct PGF_INTERNAL_DECL PgfCohortsState {
void phrasetable_lookup_prefixes(PgfPhrasetable table, PgfTextSpot spot;
PgfTextRange *sentence, Heap<PgfTextSpot> queue;
bool case_sensitive, size_t last_pos;
Namespace<PgfConcrLincat> lincats, size_t skip_pos;
ptrdiff_t min, ptrdiff_t max, const uint8_t *end; // pointer into the end of the sentence
PgfCohortsCallback* callback, PgfExn* err)
bool case_sensitive;
Namespace<PgfConcrLincat> lincats;
PgfCohortsCallback* callback;
PgfExn* err;
};
static
void phrasetable_lookup_prefixes(PgfCohortsState *state,
PgfPhrasetable table,
ptrdiff_t min, ptrdiff_t max)
{ {
if (table == 0) if (table == 0)
return; return;
PgfTextRange current = *sentence; PgfTextSpot current = state->spot;
int cmp = text_range_cmp(&current,table->value.seq,case_sensitive,false); int cmp = text_sequence_cmp(&current,state->end,table->value.seq,state->case_sensitive,false);
if (cmp < 0) { if (cmp < 0) {
phrasetable_lookup_prefixes(table->left,sentence,case_sensitive,lincats,min,max,callback,err); phrasetable_lookup_prefixes(state,table->left,min,max);
} else if (cmp > 0) { } else if (cmp > 0) {
ptrdiff_t len = current.begin - sentence->begin; ptrdiff_t len = current.ptr - state->spot.ptr;
if (min <= len) if (min <= len)
phrasetable_lookup_prefixes(table->left,sentence,case_sensitive,lincats,min,len,callback,err); phrasetable_lookup_prefixes(state,table->left,min,len);
if (len <= max) if (len <= max)
phrasetable_lookup_prefixes(table->right,sentence,case_sensitive,lincats,len,max,callback,err); phrasetable_lookup_prefixes(state,table->right,len,max);
} else { } else {
ptrdiff_t len = current.begin - sentence->begin; ptrdiff_t len = current.ptr - state->spot.ptr;
if (min <= len) if (min <= len)
phrasetable_lookup_prefixes(table->left,sentence,case_sensitive,lincats,min,len,callback,err); phrasetable_lookup_prefixes(state,table->left,min,len);
auto backrefs = table->value.backrefs; auto backrefs = table->value.backrefs;
if (len > 0 && backrefs != 0) { if (len > 0 && backrefs != 0) {
if (state->skip_pos != (size_t) -1) {
state->callback->fn(state->callback,
state->skip_pos,
state->spot.pos,
state->err);
if (state->err->type != PGF_EXN_NONE)
return;
state->skip_pos = (size_t) -1;
}
if (state->last_pos > 0 && state->last_pos != current.pos) {
state->callback->fn(state->callback,
state->spot.pos,
state->last_pos,
state->err);
if (state->err->type != PGF_EXN_NONE)
return;
}
state->last_pos = current.pos;
state->queue.push(current);
for (size_t i = 0; i < backrefs->len; i++) { for (size_t i = 0; i < backrefs->len; i++) {
PgfSequenceBackref backref = *vector_elem<PgfSequenceBackref>(backrefs,i); PgfSequenceBackref backref = *vector_elem<PgfSequenceBackref>(backrefs,i);
switch (ref<PgfConcrLin>::get_tag(backref.container)) { switch (ref<PgfConcrLin>::get_tag(backref.container)) {
case PgfConcrLin::tag: { case PgfConcrLin::tag: {
ref<PgfConcrLin> lin = ref<PgfConcrLin>::untagged(backref.container); ref<PgfConcrLin> lin = ref<PgfConcrLin>::untagged(backref.container);
ref<PgfConcrLincat> lincat = ref<PgfConcrLincat> lincat =
namespace_lookup(lincats, &lin->absfun->type->name); namespace_lookup(state->lincats, &lin->absfun->type->name);
if (lincat != 0) { if (lincat != 0) {
ref<PgfText> field = ref<PgfText> field =
*vector_elem(lincat->fields, backref.seq_index % lincat->fields->len); *vector_elem(lincat->fields, backref.seq_index % lincat->fields->len);
callback->morpho.fn(&callback->morpho, &lin->absfun->name, &(*field), lincat->abscat->prob+lin->absfun->prob, err); state->callback->morpho.fn(&state->callback->morpho,
if (err->type != PGF_EXN_NONE) &lin->absfun->name,
&(*field),
lincat->abscat->prob+lin->absfun->prob,
state->err);
if (state->err->type != PGF_EXN_NONE)
return; return;
} }
break; break;
@@ -573,17 +622,104 @@ void phrasetable_lookup_prefixes(PgfPhrasetable table,
} }
} }
} }
callback->fn(callback, sentence->pos, current.pos, err);
if (err->type != PGF_EXN_NONE)
return;
} }
if (len <= max) if (len <= max)
phrasetable_lookup_prefixes(table->right,sentence,case_sensitive,lincats,len,max,callback,err); phrasetable_lookup_prefixes(state,table->right,len,max);
} }
} }
// Scans `sentence` for phrases stored in `table` and reports what it
// finds through `callback`.
//
// The scan starts at character position 0 and is then restarted at every
// spot taken from state.queue; phrasetable_lookup_prefixes pushes the end
// spot of each match it finds onto that queue. A spot whose position
// equals the one processed just before it is ignored.
// NOTE(review): this presumably relies on the queue returning spots in
// order of position -- Heap's ordering is not visible here, confirm.
//
// For every start spot, leading whitespace is skipped and the table is
// searched for entries that are a prefix of the remaining text.
// Ranges are reported with callback->fn(callback, start, end, err):
//   - a range from the start spot to the end of a match;
//   - a range of text where no entry matched, running from the first
//     unmatched character up to the next whitespace, the next match,
//     or the end of the sentence.
//
// The function returns as soon as err->type differs from PGF_EXN_NONE.
PGF_INTERNAL
void phrasetable_lookup_cohorts(PgfPhrasetable table,
                                PgfText *sentence,
                                bool case_sensitive,
                                Namespace<PgfConcrLincat> lincats,
                                PgfCohortsCallback* callback, PgfExn* err)
{
    PgfTextSpot start;
    start.pos = 0;
    start.ptr = (uint8_t *) &sentence->text[0];

    PgfCohortsState state;
    state.spot.pos = (size_t) -1;   // no spot has been processed yet
    state.spot.ptr = NULL;
    state.queue.push(start);
    state.last_pos = 0;             // 0 means "no match pending"
    state.skip_pos = (size_t) -1;   // (size_t) -1 means "nothing skipped"
    state.end = (uint8_t *) &sentence->text[sentence->size];
    state.case_sensitive = case_sensitive;
    state.lincats = lincats;
    state.callback = callback;
    state.err = err;

    while (!state.queue.is_empty()) {
        PgfTextSpot next = state.queue.pop();

        // The same position as the spot handled last: nothing new to do.
        if (next.pos == state.spot.pos)
            continue;

        state.spot = next;

        // skip leading spaces
        while (state.spot.ptr < state.end) {
            const uint8_t *ptr = state.spot.ptr;
            uint32_t ucs = pgf_utf8_decode(&ptr);
            if (!pgf_utf8_is_space(ucs))
                break;
            state.spot.pos++;
            state.spot.ptr = ptr;
        }

        state.skip_pos = (size_t) -1;
        while (state.spot.ptr < state.end) {
            phrasetable_lookup_prefixes(&state, table, 1, sentence->size);

            // The lookup invokes the callbacks itself and bails out when
            // one of them fails. Stop here as well, instead of calling
            // the callback again with the error already set.
            if (state.err->type != PGF_EXN_NONE)
                return;

            if (state.last_pos > 0) {
                // We found at least one match.
                // The last range is yet to be reported.
                state.callback->fn(state.callback,
                                   state.spot.pos,
                                   state.last_pos,
                                   state.err);
                if (state.err->type != PGF_EXN_NONE)
                    return;
                state.last_pos = 0;
                break;
            }

            // We didn't find any matches at this position,
            // therefore we must skip one character and try again.
            if (state.skip_pos == (size_t) -1)
                state.skip_pos = state.spot.pos;

            const uint8_t *ptr = state.spot.ptr;
            uint32_t ucs = pgf_utf8_decode(&ptr);
            if (pgf_utf8_is_space(ucs)) {
                // The unmatched stretch ends at this whitespace: report
                // it and queue the current spot as a new start spot.
                state.callback->fn(state.callback,
                                   state.skip_pos,
                                   state.spot.pos,
                                   state.err);
                if (state.err->type != PGF_EXN_NONE)
                    return;
                state.skip_pos = (size_t) -1;
                state.queue.push(state.spot);
                break;
            }
            state.spot.pos++;
            state.spot.ptr = ptr;
        }

        // The end of the sentence was reached while skipping:
        // report the unmatched tail.
        if (state.skip_pos != (size_t) -1) {
            state.callback->fn(state.callback,
                               state.skip_pos,
                               state.spot.pos,
                               state.err);
            if (state.err->type != PGF_EXN_NONE)
                return;
            state.skip_pos = (size_t) -1;
        }

        // Restore the popped spot so that the duplicate check above
        // compares against the original position, not the advanced one.
        state.spot = next;
    }
}
PGF_INTERNAL PGF_INTERNAL
void phrasetable_iter(PgfConcr *concr, void phrasetable_iter(PgfConcr *concr,
PgfPhrasetable table, PgfPhrasetable table,

View File

@@ -68,26 +68,19 @@ PgfPhrasetable phrasetable_delete(PgfPhrasetable table,
PGF_INTERNAL_DECL PGF_INTERNAL_DECL
size_t phrasetable_size(PgfPhrasetable table); size_t phrasetable_size(PgfPhrasetable table);
typedef struct {
size_t pos; // position in Unicode characters
const uint8_t *begin; // pointer into the beginning of the range
const uint8_t *end; // pointer into the end of the range
} PgfTextRange;
PGF_INTERNAL_DECL PGF_INTERNAL_DECL
void phrasetable_lookup(PgfPhrasetable table, void phrasetable_lookup(PgfPhrasetable table,
PgfTextRange *sentence, PgfText *sentence,
bool case_sensitive, bool case_sensitive,
Namespace<struct PgfConcrLincat> lincats, Namespace<struct PgfConcrLincat> lincats,
PgfMorphoCallback* callback, PgfExn* err); PgfMorphoCallback* callback, PgfExn* err);
PGF_INTERNAL_DECL PGF_INTERNAL_DECL
void phrasetable_lookup_prefixes(PgfPhrasetable table, void phrasetable_lookup_cohorts(PgfPhrasetable table,
PgfTextRange *sentence, PgfText *sentence,
bool case_sensitive, bool case_sensitive,
Namespace<PgfConcrLincat> lincats, Namespace<PgfConcrLincat> lincats,
ptrdiff_t min, ptrdiff_t max, PgfCohortsCallback* callback, PgfExn* err);
PgfCohortsCallback* callback, PgfExn* err);
PGF_INTERNAL_DECL PGF_INTERNAL_DECL
void phrasetable_iter(PgfConcr *concr, void phrasetable_iter(PgfConcr *concr,