forked from GitHub/gf-core
finished the implementation of lookupCohorts
This commit is contained in:
@@ -7,13 +7,14 @@ public:
|
||||
// Creates an empty heap: no elements stored, no capacity reserved and
// no backing array allocated yet. The first push() sees len >= avail
// and allocates the array through realloc().
Heap() {
|
||||
len = 0;
|
||||
avail = 0;
|
||||
values = NULL;
|
||||
}
|
||||
|
||||
// Releases the backing array with free(), matching the realloc() used
// in push(). free(NULL) is a no-op, so a heap that never grew is fine.
// NOTE(review): element destructors are not run here, so A is presumably
// expected to be trivially destructible - confirm.
~Heap() { free(values); }
|
||||
|
||||
void push(A value) {
|
||||
if (len >= avail) {
|
||||
avail = get_next_padovan(len);
|
||||
avail = get_next_padovan(len+1);
|
||||
A *new_values = (A *) realloc(values, sizeof(A)*avail);
|
||||
if (new_values == NULL)
|
||||
throw pgf_systemerror(errno);
|
||||
|
||||
@@ -825,12 +825,8 @@ void pgf_lookup_morpho(PgfDB *db, PgfConcrRevision cnc_revision,
|
||||
|
||||
bool case_sensitive = pgf_is_case_sensitive(concr);
|
||||
|
||||
PgfTextRange range;
|
||||
range.pos = 0;
|
||||
range.begin = (uint8_t *) &sentence->text[0];
|
||||
range.end = (uint8_t *) &sentence->text[sentence->size];
|
||||
phrasetable_lookup(concr->phrasetable,
|
||||
&range, case_sensitive,
|
||||
sentence, case_sensitive,
|
||||
concr->lincats,
|
||||
callback, err);
|
||||
} PGF_API_END
|
||||
@@ -847,15 +843,10 @@ void pgf_lookup_cohorts(PgfDB *db, PgfConcrRevision cnc_revision,
|
||||
|
||||
bool case_sensitive = pgf_is_case_sensitive(concr);
|
||||
|
||||
PgfTextRange range;
|
||||
range.pos = 0;
|
||||
range.begin = (uint8_t *) &sentence->text[0];
|
||||
range.end = (uint8_t *) &sentence->text[sentence->size];
|
||||
phrasetable_lookup_prefixes(concr->phrasetable,
|
||||
&range, case_sensitive,
|
||||
concr->lincats,
|
||||
1, sentence->size,
|
||||
callback, err);
|
||||
phrasetable_lookup_cohorts(concr->phrasetable,
|
||||
sentence, case_sensitive,
|
||||
concr->lincats,
|
||||
callback, err);
|
||||
} PGF_API_END
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#include "data.h"
|
||||
#include "heap.h"
|
||||
|
||||
PgfPhrasetableIds::PgfPhrasetableIds()
|
||||
{
|
||||
@@ -227,9 +228,19 @@ int sequence_cmp(ref<PgfSequence> seq1, ref<PgfSequence> seq2)
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct PGF_INTERNAL_DECL PgfTextSpot {
|
||||
size_t pos; // position in Unicode characters
|
||||
const uint8_t *ptr; // pointer into the spot location
|
||||
|
||||
bool operator >= (PgfTextSpot const &obj) {
|
||||
return pos >= obj.pos;
|
||||
}
|
||||
};
|
||||
|
||||
static
|
||||
int text_range_cmp(PgfTextRange *range, ref<PgfSequence> seq,
|
||||
bool case_sensitive, bool full_match)
|
||||
int text_sequence_cmp(PgfTextSpot *spot, const uint8_t *end,
|
||||
ref<PgfSequence> seq,
|
||||
bool case_sensitive, bool full_match)
|
||||
{
|
||||
int res1 = 0;
|
||||
|
||||
@@ -240,7 +251,7 @@ int text_range_cmp(PgfTextRange *range, ref<PgfSequence> seq,
|
||||
size_t count = 0;
|
||||
|
||||
for (;;) {
|
||||
if (range->begin >= range->end) {
|
||||
if (spot->ptr >= end) {
|
||||
if (s2 < e2 || i < seq->syms.len)
|
||||
return -1;
|
||||
return case_sensitive ? res1 : 0;
|
||||
@@ -249,7 +260,7 @@ int text_range_cmp(PgfTextRange *range, ref<PgfSequence> seq,
|
||||
if (s2 >= e2 && i >= seq->syms.len)
|
||||
return full_match ? 1 : 0;
|
||||
|
||||
uint32_t ucs1 = pgf_utf8_decode(&range->begin); range->pos++;
|
||||
uint32_t ucs1 = pgf_utf8_decode(&spot->ptr); spot->pos++;
|
||||
uint32_t ucs1i = pgf_utf8_to_upper(ucs1);
|
||||
|
||||
if (s2 >= e2) {
|
||||
@@ -466,7 +477,7 @@ size_t phrasetable_size(PgfPhrasetable table)
|
||||
|
||||
PGF_INTERNAL
|
||||
void phrasetable_lookup(PgfPhrasetable table,
|
||||
PgfTextRange *sentence,
|
||||
PgfText *sentence,
|
||||
bool case_sensitive,
|
||||
Namespace<PgfConcrLincat> lincats,
|
||||
PgfMorphoCallback* callback, PgfExn* err)
|
||||
@@ -474,8 +485,11 @@ void phrasetable_lookup(PgfPhrasetable table,
|
||||
if (table == 0)
|
||||
return;
|
||||
|
||||
PgfTextRange current = *sentence;
|
||||
int cmp = text_range_cmp(&current,table->value.seq,case_sensitive,true);
|
||||
PgfTextSpot current;
|
||||
current.pos = 0;
|
||||
current.ptr = (uint8_t *) sentence->text;
|
||||
const uint8_t *end = current.ptr+sentence->size;
|
||||
int cmp = text_sequence_cmp(&current,end,table->value.seq,case_sensitive,true);
|
||||
if (cmp < 0) {
|
||||
phrasetable_lookup(table->left,sentence,case_sensitive,lincats,callback,err);
|
||||
} else if (cmp > 0) {
|
||||
@@ -519,50 +533,85 @@ void phrasetable_lookup(PgfPhrasetable table,
|
||||
}
|
||||
}
|
||||
|
||||
PGF_INTERNAL
|
||||
void phrasetable_lookup_prefixes(PgfPhrasetable table,
|
||||
PgfTextRange *sentence,
|
||||
bool case_sensitive,
|
||||
Namespace<PgfConcrLincat> lincats,
|
||||
ptrdiff_t min, ptrdiff_t max,
|
||||
PgfCohortsCallback* callback, PgfExn* err)
|
||||
// State shared between phrasetable_lookup_cohorts and the recursive
// phrasetable_lookup_prefixes while one sentence is being scanned.
struct PGF_INTERNAL_DECL PgfCohortsState {
|
||||
// position in the sentence from which prefixes are currently matched
PgfTextSpot spot;
|
||||
// pending start positions: the end of every match is pushed here and
// later popped as a new start.
// NOTE(review): presumably popped in increasing position order via
// PgfTextSpot::operator>= - confirm against heap.h
Heap<PgfTextSpot> queue;
|
||||
// end position (in characters) of the latest match whose range has not
// been reported to the callback yet; 0 means there is none
size_t last_pos;
|
||||
// start position of the current run of unmatched characters,
// or (size_t) -1 when no such run is open
size_t skip_pos;
|
||||
const uint8_t *end; // pointer into the end of the sentence
|
||||
|
||||
// forwarded unchanged to text_sequence_cmp
bool case_sensitive;
|
||||
// used to look up the lincat of each matched linearization
Namespace<PgfConcrLincat> lincats;
|
||||
// receives the morphological analyses and the matched/skipped ranges
PgfCohortsCallback* callback;
|
||||
// checked after every callback invocation; a type other than
// PGF_EXN_NONE aborts the scan
PgfExn* err;
|
||||
};
|
||||
|
||||
static
|
||||
void phrasetable_lookup_prefixes(PgfCohortsState *state,
|
||||
PgfPhrasetable table,
|
||||
ptrdiff_t min, ptrdiff_t max)
|
||||
{
|
||||
if (table == 0)
|
||||
return;
|
||||
|
||||
PgfTextRange current = *sentence;
|
||||
int cmp = text_range_cmp(&current,table->value.seq,case_sensitive,false);
|
||||
PgfTextSpot current = state->spot;
|
||||
int cmp = text_sequence_cmp(&current,state->end,table->value.seq,state->case_sensitive,false);
|
||||
if (cmp < 0) {
|
||||
phrasetable_lookup_prefixes(table->left,sentence,case_sensitive,lincats,min,max,callback,err);
|
||||
phrasetable_lookup_prefixes(state,table->left,min,max);
|
||||
} else if (cmp > 0) {
|
||||
ptrdiff_t len = current.begin - sentence->begin;
|
||||
ptrdiff_t len = current.ptr - state->spot.ptr;
|
||||
|
||||
if (min <= len)
|
||||
phrasetable_lookup_prefixes(table->left,sentence,case_sensitive,lincats,min,len,callback,err);
|
||||
phrasetable_lookup_prefixes(state,table->left,min,len);
|
||||
|
||||
if (len <= max)
|
||||
phrasetable_lookup_prefixes(table->right,sentence,case_sensitive,lincats,len,max,callback,err);
|
||||
phrasetable_lookup_prefixes(state,table->right,len,max);
|
||||
} else {
|
||||
ptrdiff_t len = current.begin - sentence->begin;
|
||||
ptrdiff_t len = current.ptr - state->spot.ptr;
|
||||
|
||||
if (min <= len)
|
||||
phrasetable_lookup_prefixes(table->left,sentence,case_sensitive,lincats,min,len,callback,err);
|
||||
phrasetable_lookup_prefixes(state,table->left,min,len);
|
||||
|
||||
auto backrefs = table->value.backrefs;
|
||||
if (len > 0 && backrefs != 0) {
|
||||
if (state->skip_pos != (size_t) -1) {
|
||||
state->callback->fn(state->callback,
|
||||
state->skip_pos,
|
||||
state->spot.pos,
|
||||
state->err);
|
||||
if (state->err->type != PGF_EXN_NONE)
|
||||
return;
|
||||
state->skip_pos = (size_t) -1;
|
||||
}
|
||||
|
||||
if (state->last_pos > 0 && state->last_pos != current.pos) {
|
||||
state->callback->fn(state->callback,
|
||||
state->spot.pos,
|
||||
state->last_pos,
|
||||
state->err);
|
||||
if (state->err->type != PGF_EXN_NONE)
|
||||
return;
|
||||
}
|
||||
state->last_pos = current.pos;
|
||||
state->queue.push(current);
|
||||
|
||||
for (size_t i = 0; i < backrefs->len; i++) {
|
||||
PgfSequenceBackref backref = *vector_elem<PgfSequenceBackref>(backrefs,i);
|
||||
switch (ref<PgfConcrLin>::get_tag(backref.container)) {
|
||||
case PgfConcrLin::tag: {
|
||||
ref<PgfConcrLin> lin = ref<PgfConcrLin>::untagged(backref.container);
|
||||
ref<PgfConcrLincat> lincat =
|
||||
namespace_lookup(lincats, &lin->absfun->type->name);
|
||||
namespace_lookup(state->lincats, &lin->absfun->type->name);
|
||||
if (lincat != 0) {
|
||||
ref<PgfText> field =
|
||||
*vector_elem(lincat->fields, backref.seq_index % lincat->fields->len);
|
||||
|
||||
callback->morpho.fn(&callback->morpho, &lin->absfun->name, &(*field), lincat->abscat->prob+lin->absfun->prob, err);
|
||||
if (err->type != PGF_EXN_NONE)
|
||||
state->callback->morpho.fn(&state->callback->morpho,
|
||||
&lin->absfun->name,
|
||||
&(*field),
|
||||
lincat->abscat->prob+lin->absfun->prob,
|
||||
state->err);
|
||||
if (state->err->type != PGF_EXN_NONE)
|
||||
return;
|
||||
}
|
||||
break;
|
||||
@@ -573,17 +622,104 @@ void phrasetable_lookup_prefixes(PgfPhrasetable table,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
callback->fn(callback, sentence->pos, current.pos, err);
|
||||
if (err->type != PGF_EXN_NONE)
|
||||
return;
|
||||
}
|
||||
|
||||
if (len <= max)
|
||||
phrasetable_lookup_prefixes(table->right,sentence,case_sensitive,lincats,len,max,callback,err);
|
||||
phrasetable_lookup_prefixes(state,table->right,len,max);
|
||||
}
|
||||
}
|
||||
|
||||
// Scans `sentence` and reports to `callback` every range that matches a
// phrase in `table`, as well as every run of characters that matches
// nothing.
//
// Start positions are kept in a queue that initially holds position 0.
// For each distinct start position popped from the queue:
//   1. leading spaces are skipped;
//   2. phrasetable_lookup_prefixes reports the analyses of all phrases
//      that are prefixes of the remaining text, and pushes the end of
//      every match on the queue as a future start position;
//   3. if nothing matched, one character is skipped and step 2 is retried.
//      The skipped run is reported as a range of its own, either when a
//      space is reached, when a match is finally found (done inside
//      phrasetable_lookup_prefixes), or when the sentence ends.
//
// Ranges are passed to callback->fn as (start, end) positions counted in
// Unicode characters. The function returns as soon as `err` carries an
// exception, whether it was raised by a callback invoked here or by one
// invoked inside phrasetable_lookup_prefixes.
//
// table          - root of the phrase table to search
// sentence       - the text to analyse (sentence->size bytes of UTF-8)
// case_sensitive - forwarded to the text/sequence comparison
// lincats        - used to resolve the lincat of each matched function
// callback       - receives analyses and ranges
// err            - exception state shared with the callbacks
PGF_INTERNAL
void phrasetable_lookup_cohorts(PgfPhrasetable table,
                                PgfText *sentence,
                                bool case_sensitive,
                                Namespace<PgfConcrLincat> lincats,
                                PgfCohortsCallback* callback, PgfExn* err)
{
    PgfTextSpot start;
    start.pos = 0;
    start.ptr = (uint8_t *) &sentence->text[0];

    PgfCohortsState state;
    state.spot.pos = (size_t) -1;   // differs from every real position
    state.spot.ptr = NULL;
    state.queue.push(start);
    state.last_pos = 0;
    state.skip_pos = (size_t) -1;
    state.end = (uint8_t *) &sentence->text[sentence->size];
    state.case_sensitive = case_sensitive;
    state.lincats = lincats;
    state.callback = callback;
    state.err = err;

    while (!state.queue.is_empty()) {
        PgfTextSpot next = state.queue.pop();
        if (next.pos != state.spot.pos) {
            state.spot = next;

            // skip leading spaces
            while (state.spot.ptr < state.end) {
                const uint8_t *ptr = state.spot.ptr;
                uint32_t ucs = pgf_utf8_decode(&ptr);
                if (!pgf_utf8_is_space(ucs))
                    break;
                state.spot.pos++;
                state.spot.ptr = ptr;
            }

            state.skip_pos = (size_t) -1;
            while (state.spot.ptr < state.end) {
                phrasetable_lookup_prefixes(&state, table, 1, sentence->size);

                // A callback invoked during the lookup may have failed.
                // Stop here instead of reporting further ranges on top
                // of a pending exception.
                if (state.err->type != PGF_EXN_NONE)
                    return;

                if (state.last_pos > 0) {
                    // We found at least one match.
                    // The last range is yet to be reported.
                    state.callback->fn(state.callback,
                                       state.spot.pos,
                                       state.last_pos,
                                       state.err);
                    if (state.err->type != PGF_EXN_NONE)
                        return;
                    state.last_pos = 0;
                    break;
                } else {
                    // We didn't find any matches at this position,
                    // therefore we must skip one character and try again.
                    if (state.skip_pos == (size_t) -1)
                        state.skip_pos = state.spot.pos;

                    const uint8_t *ptr = state.spot.ptr;
                    uint32_t ucs = pgf_utf8_decode(&ptr);
                    if (pgf_utf8_is_space(ucs)) {
                        // The unmatched run ends at this space: report
                        // it and restart the scan from here.
                        state.callback->fn(state.callback,
                                           state.skip_pos,
                                           state.spot.pos,
                                           state.err);
                        if (state.err->type != PGF_EXN_NONE)
                            return;
                        state.skip_pos = (size_t) -1;
                        state.queue.push(state.spot);
                        break;
                    }
                    state.spot.pos++;
                    state.spot.ptr = ptr;
                }
            }

            // The sentence ended inside an unmatched run.
            if (state.skip_pos != (size_t) -1) {
                state.callback->fn(state.callback,
                                   state.skip_pos,
                                   state.spot.pos,
                                   state.err);
                if (state.err->type != PGF_EXN_NONE)
                    return;
                state.skip_pos = (size_t) -1;
            }

            // Remember the position as it was popped (before any space
            // or character skipping), so that further copies of it in
            // the queue are ignored by the test at the top of the loop.
            state.spot = next;
        }
    }
}
|
||||
|
||||
PGF_INTERNAL
|
||||
void phrasetable_iter(PgfConcr *concr,
|
||||
PgfPhrasetable table,
|
||||
|
||||
@@ -68,26 +68,19 @@ PgfPhrasetable phrasetable_delete(PgfPhrasetable table,
|
||||
PGF_INTERNAL_DECL
|
||||
size_t phrasetable_size(PgfPhrasetable table);
|
||||
|
||||
// A span of UTF-8 text between `begin` and `end`, together with the
// character index of `begin`. The old text_range_cmp advanced `begin`
// and incremented `pos` once per decoded code point.
// NOTE(review): the lookup functions in this change take PgfText /
// PgfTextSpot instead; this type looks superseded - confirm before use.
typedef struct {
|
||||
size_t pos; // position in Unicode characters
|
||||
const uint8_t *begin; // pointer into the beginning of the range
|
||||
const uint8_t *end; // pointer into the end of the range
|
||||
} PgfTextRange;
|
||||
|
||||
PGF_INTERNAL_DECL
|
||||
void phrasetable_lookup(PgfPhrasetable table,
|
||||
PgfTextRange *sentence,
|
||||
PgfText *sentence,
|
||||
bool case_sensitive,
|
||||
Namespace<struct PgfConcrLincat> lincats,
|
||||
PgfMorphoCallback* callback, PgfExn* err);
|
||||
|
||||
PGF_INTERNAL_DECL
|
||||
void phrasetable_lookup_prefixes(PgfPhrasetable table,
|
||||
PgfTextRange *sentence,
|
||||
bool case_sensitive,
|
||||
Namespace<PgfConcrLincat> lincats,
|
||||
ptrdiff_t min, ptrdiff_t max,
|
||||
PgfCohortsCallback* callback, PgfExn* err);
|
||||
void phrasetable_lookup_cohorts(PgfPhrasetable table,
|
||||
PgfText *sentence,
|
||||
bool case_sensitive,
|
||||
Namespace<PgfConcrLincat> lincats,
|
||||
PgfCohortsCallback* callback, PgfExn* err);
|
||||
|
||||
PGF_INTERNAL_DECL
|
||||
void phrasetable_iter(PgfConcr *concr,
|
||||
|
||||
Reference in New Issue
Block a user