forked from GitHub/gf-core
finished the implementation of lookupCohorts
This commit is contained in:
@@ -7,13 +7,14 @@ public:
|
|||||||
Heap() {
|
Heap() {
|
||||||
len = 0;
|
len = 0;
|
||||||
avail = 0;
|
avail = 0;
|
||||||
|
values = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
~Heap() { free(values); }
|
~Heap() { free(values); }
|
||||||
|
|
||||||
void push(A value) {
|
void push(A value) {
|
||||||
if (len >= avail) {
|
if (len >= avail) {
|
||||||
avail = get_next_padovan(len);
|
avail = get_next_padovan(len+1);
|
||||||
A *new_values = (A *) realloc(values, sizeof(A)*avail);
|
A *new_values = (A *) realloc(values, sizeof(A)*avail);
|
||||||
if (new_values == NULL)
|
if (new_values == NULL)
|
||||||
throw pgf_systemerror(errno);
|
throw pgf_systemerror(errno);
|
||||||
|
|||||||
@@ -825,12 +825,8 @@ void pgf_lookup_morpho(PgfDB *db, PgfConcrRevision cnc_revision,
|
|||||||
|
|
||||||
bool case_sensitive = pgf_is_case_sensitive(concr);
|
bool case_sensitive = pgf_is_case_sensitive(concr);
|
||||||
|
|
||||||
PgfTextRange range;
|
|
||||||
range.pos = 0;
|
|
||||||
range.begin = (uint8_t *) &sentence->text[0];
|
|
||||||
range.end = (uint8_t *) &sentence->text[sentence->size];
|
|
||||||
phrasetable_lookup(concr->phrasetable,
|
phrasetable_lookup(concr->phrasetable,
|
||||||
&range, case_sensitive,
|
sentence, case_sensitive,
|
||||||
concr->lincats,
|
concr->lincats,
|
||||||
callback, err);
|
callback, err);
|
||||||
} PGF_API_END
|
} PGF_API_END
|
||||||
@@ -847,15 +843,10 @@ void pgf_lookup_cohorts(PgfDB *db, PgfConcrRevision cnc_revision,
|
|||||||
|
|
||||||
bool case_sensitive = pgf_is_case_sensitive(concr);
|
bool case_sensitive = pgf_is_case_sensitive(concr);
|
||||||
|
|
||||||
PgfTextRange range;
|
phrasetable_lookup_cohorts(concr->phrasetable,
|
||||||
range.pos = 0;
|
sentence, case_sensitive,
|
||||||
range.begin = (uint8_t *) &sentence->text[0];
|
concr->lincats,
|
||||||
range.end = (uint8_t *) &sentence->text[sentence->size];
|
callback, err);
|
||||||
phrasetable_lookup_prefixes(concr->phrasetable,
|
|
||||||
&range, case_sensitive,
|
|
||||||
concr->lincats,
|
|
||||||
1, sentence->size,
|
|
||||||
callback, err);
|
|
||||||
} PGF_API_END
|
} PGF_API_END
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
#include "data.h"
|
#include "data.h"
|
||||||
|
#include "heap.h"
|
||||||
|
|
||||||
PgfPhrasetableIds::PgfPhrasetableIds()
|
PgfPhrasetableIds::PgfPhrasetableIds()
|
||||||
{
|
{
|
||||||
@@ -227,9 +228,19 @@ int sequence_cmp(ref<PgfSequence> seq1, ref<PgfSequence> seq2)
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct PGF_INTERNAL_DECL PgfTextSpot {
|
||||||
|
size_t pos; // position in Unicode characters
|
||||||
|
const uint8_t *ptr; // pointer into the spot location
|
||||||
|
|
||||||
|
bool operator >= (PgfTextSpot const &obj) {
|
||||||
|
return pos >= obj.pos;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
static
|
static
|
||||||
int text_range_cmp(PgfTextRange *range, ref<PgfSequence> seq,
|
int text_sequence_cmp(PgfTextSpot *spot, const uint8_t *end,
|
||||||
bool case_sensitive, bool full_match)
|
ref<PgfSequence> seq,
|
||||||
|
bool case_sensitive, bool full_match)
|
||||||
{
|
{
|
||||||
int res1 = 0;
|
int res1 = 0;
|
||||||
|
|
||||||
@@ -240,7 +251,7 @@ int text_range_cmp(PgfTextRange *range, ref<PgfSequence> seq,
|
|||||||
size_t count = 0;
|
size_t count = 0;
|
||||||
|
|
||||||
for (;;) {
|
for (;;) {
|
||||||
if (range->begin >= range->end) {
|
if (spot->ptr >= end) {
|
||||||
if (s2 < e2 || i < seq->syms.len)
|
if (s2 < e2 || i < seq->syms.len)
|
||||||
return -1;
|
return -1;
|
||||||
return case_sensitive ? res1 : 0;
|
return case_sensitive ? res1 : 0;
|
||||||
@@ -249,7 +260,7 @@ int text_range_cmp(PgfTextRange *range, ref<PgfSequence> seq,
|
|||||||
if (s2 >= e2 && i >= seq->syms.len)
|
if (s2 >= e2 && i >= seq->syms.len)
|
||||||
return full_match ? 1 : 0;
|
return full_match ? 1 : 0;
|
||||||
|
|
||||||
uint32_t ucs1 = pgf_utf8_decode(&range->begin); range->pos++;
|
uint32_t ucs1 = pgf_utf8_decode(&spot->ptr); spot->pos++;
|
||||||
uint32_t ucs1i = pgf_utf8_to_upper(ucs1);
|
uint32_t ucs1i = pgf_utf8_to_upper(ucs1);
|
||||||
|
|
||||||
if (s2 >= e2) {
|
if (s2 >= e2) {
|
||||||
@@ -466,7 +477,7 @@ size_t phrasetable_size(PgfPhrasetable table)
|
|||||||
|
|
||||||
PGF_INTERNAL
|
PGF_INTERNAL
|
||||||
void phrasetable_lookup(PgfPhrasetable table,
|
void phrasetable_lookup(PgfPhrasetable table,
|
||||||
PgfTextRange *sentence,
|
PgfText *sentence,
|
||||||
bool case_sensitive,
|
bool case_sensitive,
|
||||||
Namespace<PgfConcrLincat> lincats,
|
Namespace<PgfConcrLincat> lincats,
|
||||||
PgfMorphoCallback* callback, PgfExn* err)
|
PgfMorphoCallback* callback, PgfExn* err)
|
||||||
@@ -474,8 +485,11 @@ void phrasetable_lookup(PgfPhrasetable table,
|
|||||||
if (table == 0)
|
if (table == 0)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
PgfTextRange current = *sentence;
|
PgfTextSpot current;
|
||||||
int cmp = text_range_cmp(&current,table->value.seq,case_sensitive,true);
|
current.pos = 0;
|
||||||
|
current.ptr = (uint8_t *) sentence->text;
|
||||||
|
const uint8_t *end = current.ptr+sentence->size;
|
||||||
|
int cmp = text_sequence_cmp(&current,end,table->value.seq,case_sensitive,true);
|
||||||
if (cmp < 0) {
|
if (cmp < 0) {
|
||||||
phrasetable_lookup(table->left,sentence,case_sensitive,lincats,callback,err);
|
phrasetable_lookup(table->left,sentence,case_sensitive,lincats,callback,err);
|
||||||
} else if (cmp > 0) {
|
} else if (cmp > 0) {
|
||||||
@@ -519,50 +533,85 @@ void phrasetable_lookup(PgfPhrasetable table,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
PGF_INTERNAL
|
struct PGF_INTERNAL_DECL PgfCohortsState {
|
||||||
void phrasetable_lookup_prefixes(PgfPhrasetable table,
|
PgfTextSpot spot;
|
||||||
PgfTextRange *sentence,
|
Heap<PgfTextSpot> queue;
|
||||||
bool case_sensitive,
|
size_t last_pos;
|
||||||
Namespace<PgfConcrLincat> lincats,
|
size_t skip_pos;
|
||||||
ptrdiff_t min, ptrdiff_t max,
|
const uint8_t *end; // pointer into the end of the sentence
|
||||||
PgfCohortsCallback* callback, PgfExn* err)
|
|
||||||
|
bool case_sensitive;
|
||||||
|
Namespace<PgfConcrLincat> lincats;
|
||||||
|
PgfCohortsCallback* callback;
|
||||||
|
PgfExn* err;
|
||||||
|
};
|
||||||
|
|
||||||
|
static
|
||||||
|
void phrasetable_lookup_prefixes(PgfCohortsState *state,
|
||||||
|
PgfPhrasetable table,
|
||||||
|
ptrdiff_t min, ptrdiff_t max)
|
||||||
{
|
{
|
||||||
if (table == 0)
|
if (table == 0)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
PgfTextRange current = *sentence;
|
PgfTextSpot current = state->spot;
|
||||||
int cmp = text_range_cmp(&current,table->value.seq,case_sensitive,false);
|
int cmp = text_sequence_cmp(&current,state->end,table->value.seq,state->case_sensitive,false);
|
||||||
if (cmp < 0) {
|
if (cmp < 0) {
|
||||||
phrasetable_lookup_prefixes(table->left,sentence,case_sensitive,lincats,min,max,callback,err);
|
phrasetable_lookup_prefixes(state,table->left,min,max);
|
||||||
} else if (cmp > 0) {
|
} else if (cmp > 0) {
|
||||||
ptrdiff_t len = current.begin - sentence->begin;
|
ptrdiff_t len = current.ptr - state->spot.ptr;
|
||||||
|
|
||||||
if (min <= len)
|
if (min <= len)
|
||||||
phrasetable_lookup_prefixes(table->left,sentence,case_sensitive,lincats,min,len,callback,err);
|
phrasetable_lookup_prefixes(state,table->left,min,len);
|
||||||
|
|
||||||
if (len <= max)
|
if (len <= max)
|
||||||
phrasetable_lookup_prefixes(table->right,sentence,case_sensitive,lincats,len,max,callback,err);
|
phrasetable_lookup_prefixes(state,table->right,len,max);
|
||||||
} else {
|
} else {
|
||||||
ptrdiff_t len = current.begin - sentence->begin;
|
ptrdiff_t len = current.ptr - state->spot.ptr;
|
||||||
|
|
||||||
if (min <= len)
|
if (min <= len)
|
||||||
phrasetable_lookup_prefixes(table->left,sentence,case_sensitive,lincats,min,len,callback,err);
|
phrasetable_lookup_prefixes(state,table->left,min,len);
|
||||||
|
|
||||||
auto backrefs = table->value.backrefs;
|
auto backrefs = table->value.backrefs;
|
||||||
if (len > 0 && backrefs != 0) {
|
if (len > 0 && backrefs != 0) {
|
||||||
|
if (state->skip_pos != (size_t) -1) {
|
||||||
|
state->callback->fn(state->callback,
|
||||||
|
state->skip_pos,
|
||||||
|
state->spot.pos,
|
||||||
|
state->err);
|
||||||
|
if (state->err->type != PGF_EXN_NONE)
|
||||||
|
return;
|
||||||
|
state->skip_pos = (size_t) -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (state->last_pos > 0 && state->last_pos != current.pos) {
|
||||||
|
state->callback->fn(state->callback,
|
||||||
|
state->spot.pos,
|
||||||
|
state->last_pos,
|
||||||
|
state->err);
|
||||||
|
if (state->err->type != PGF_EXN_NONE)
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
state->last_pos = current.pos;
|
||||||
|
state->queue.push(current);
|
||||||
|
|
||||||
for (size_t i = 0; i < backrefs->len; i++) {
|
for (size_t i = 0; i < backrefs->len; i++) {
|
||||||
PgfSequenceBackref backref = *vector_elem<PgfSequenceBackref>(backrefs,i);
|
PgfSequenceBackref backref = *vector_elem<PgfSequenceBackref>(backrefs,i);
|
||||||
switch (ref<PgfConcrLin>::get_tag(backref.container)) {
|
switch (ref<PgfConcrLin>::get_tag(backref.container)) {
|
||||||
case PgfConcrLin::tag: {
|
case PgfConcrLin::tag: {
|
||||||
ref<PgfConcrLin> lin = ref<PgfConcrLin>::untagged(backref.container);
|
ref<PgfConcrLin> lin = ref<PgfConcrLin>::untagged(backref.container);
|
||||||
ref<PgfConcrLincat> lincat =
|
ref<PgfConcrLincat> lincat =
|
||||||
namespace_lookup(lincats, &lin->absfun->type->name);
|
namespace_lookup(state->lincats, &lin->absfun->type->name);
|
||||||
if (lincat != 0) {
|
if (lincat != 0) {
|
||||||
ref<PgfText> field =
|
ref<PgfText> field =
|
||||||
*vector_elem(lincat->fields, backref.seq_index % lincat->fields->len);
|
*vector_elem(lincat->fields, backref.seq_index % lincat->fields->len);
|
||||||
|
|
||||||
callback->morpho.fn(&callback->morpho, &lin->absfun->name, &(*field), lincat->abscat->prob+lin->absfun->prob, err);
|
state->callback->morpho.fn(&state->callback->morpho,
|
||||||
if (err->type != PGF_EXN_NONE)
|
&lin->absfun->name,
|
||||||
|
&(*field),
|
||||||
|
lincat->abscat->prob+lin->absfun->prob,
|
||||||
|
state->err);
|
||||||
|
if (state->err->type != PGF_EXN_NONE)
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@@ -573,17 +622,104 @@ void phrasetable_lookup_prefixes(PgfPhrasetable table,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
callback->fn(callback, sentence->pos, current.pos, err);
|
|
||||||
if (err->type != PGF_EXN_NONE)
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (len <= max)
|
if (len <= max)
|
||||||
phrasetable_lookup_prefixes(table->right,sentence,case_sensitive,lincats,len,max,callback,err);
|
phrasetable_lookup_prefixes(state,table->right,len,max);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PGF_INTERNAL
|
||||||
|
void phrasetable_lookup_cohorts(PgfPhrasetable table,
|
||||||
|
PgfText *sentence,
|
||||||
|
bool case_sensitive,
|
||||||
|
Namespace<PgfConcrLincat> lincats,
|
||||||
|
PgfCohortsCallback* callback, PgfExn* err)
|
||||||
|
{
|
||||||
|
PgfTextSpot spot;
|
||||||
|
spot.pos = 0;
|
||||||
|
spot.ptr = (uint8_t *) &sentence->text[0];
|
||||||
|
|
||||||
|
PgfCohortsState state;
|
||||||
|
state.spot.pos = -1;
|
||||||
|
state.spot.ptr = NULL;
|
||||||
|
state.queue.push(spot);
|
||||||
|
state.last_pos = 0;
|
||||||
|
state.skip_pos = (size_t) -1;
|
||||||
|
state.end = (uint8_t *) &sentence->text[sentence->size];
|
||||||
|
state.case_sensitive = case_sensitive;
|
||||||
|
state.lincats = lincats;
|
||||||
|
state.callback = callback;
|
||||||
|
state.err = err;
|
||||||
|
|
||||||
|
while (!state.queue.is_empty()) {
|
||||||
|
PgfTextSpot spot = state.queue.pop();
|
||||||
|
if (spot.pos != state.spot.pos) {
|
||||||
|
state.spot = spot;
|
||||||
|
|
||||||
|
// skip leading spaces
|
||||||
|
while (state.spot.ptr < state.end) {
|
||||||
|
const uint8_t *ptr = state.spot.ptr;
|
||||||
|
uint32_t ucs = pgf_utf8_decode(&ptr);
|
||||||
|
if (!pgf_utf8_is_space(ucs))
|
||||||
|
break;
|
||||||
|
state.spot.pos++;
|
||||||
|
state.spot.ptr = ptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
state.skip_pos = (size_t) -1;
|
||||||
|
while (state.spot.ptr < state.end) {
|
||||||
|
phrasetable_lookup_prefixes(&state, table, 1, sentence->size);
|
||||||
|
|
||||||
|
if (state.last_pos > 0) {
|
||||||
|
// We found at least one match.
|
||||||
|
// The last range is yet to be reported.
|
||||||
|
state.callback->fn(state.callback,
|
||||||
|
state.spot.pos,
|
||||||
|
state.last_pos,
|
||||||
|
state.err);
|
||||||
|
if (state.err->type != PGF_EXN_NONE)
|
||||||
|
return;
|
||||||
|
state.last_pos = 0;
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
// We didn't find any matches at this position,
|
||||||
|
// therefore we must skip one character and try again.
|
||||||
|
if (state.skip_pos == (size_t) -1)
|
||||||
|
state.skip_pos = state.spot.pos;
|
||||||
|
const uint8_t *ptr = state.spot.ptr;
|
||||||
|
uint32_t ucs = pgf_utf8_decode(&ptr);
|
||||||
|
if (pgf_utf8_is_space(ucs)) {
|
||||||
|
state.callback->fn(state.callback,
|
||||||
|
state.skip_pos,
|
||||||
|
state.spot.pos,
|
||||||
|
state.err);
|
||||||
|
if (state.err->type != PGF_EXN_NONE)
|
||||||
|
return;
|
||||||
|
state.skip_pos = -1;
|
||||||
|
state.queue.push(state.spot);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
state.spot.pos++;
|
||||||
|
state.spot.ptr = ptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (state.skip_pos != (size_t) -1) {
|
||||||
|
state.callback->fn(state.callback,
|
||||||
|
state.skip_pos,
|
||||||
|
state.spot.pos,
|
||||||
|
state.err);
|
||||||
|
if (state.err->type != PGF_EXN_NONE)
|
||||||
|
return;
|
||||||
|
state.skip_pos = (size_t) -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
state.spot = spot;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
PGF_INTERNAL
|
PGF_INTERNAL
|
||||||
void phrasetable_iter(PgfConcr *concr,
|
void phrasetable_iter(PgfConcr *concr,
|
||||||
PgfPhrasetable table,
|
PgfPhrasetable table,
|
||||||
|
|||||||
@@ -68,26 +68,19 @@ PgfPhrasetable phrasetable_delete(PgfPhrasetable table,
|
|||||||
PGF_INTERNAL_DECL
|
PGF_INTERNAL_DECL
|
||||||
size_t phrasetable_size(PgfPhrasetable table);
|
size_t phrasetable_size(PgfPhrasetable table);
|
||||||
|
|
||||||
typedef struct {
|
|
||||||
size_t pos; // position in Unicode characters
|
|
||||||
const uint8_t *begin; // pointer into the beginning of the range
|
|
||||||
const uint8_t *end; // pointer into the end of the range
|
|
||||||
} PgfTextRange;
|
|
||||||
|
|
||||||
PGF_INTERNAL_DECL
|
PGF_INTERNAL_DECL
|
||||||
void phrasetable_lookup(PgfPhrasetable table,
|
void phrasetable_lookup(PgfPhrasetable table,
|
||||||
PgfTextRange *sentence,
|
PgfText *sentence,
|
||||||
bool case_sensitive,
|
bool case_sensitive,
|
||||||
Namespace<struct PgfConcrLincat> lincats,
|
Namespace<struct PgfConcrLincat> lincats,
|
||||||
PgfMorphoCallback* callback, PgfExn* err);
|
PgfMorphoCallback* callback, PgfExn* err);
|
||||||
|
|
||||||
PGF_INTERNAL_DECL
|
PGF_INTERNAL_DECL
|
||||||
void phrasetable_lookup_prefixes(PgfPhrasetable table,
|
void phrasetable_lookup_cohorts(PgfPhrasetable table,
|
||||||
PgfTextRange *sentence,
|
PgfText *sentence,
|
||||||
bool case_sensitive,
|
bool case_sensitive,
|
||||||
Namespace<PgfConcrLincat> lincats,
|
Namespace<PgfConcrLincat> lincats,
|
||||||
ptrdiff_t min, ptrdiff_t max,
|
PgfCohortsCallback* callback, PgfExn* err);
|
||||||
PgfCohortsCallback* callback, PgfExn* err);
|
|
||||||
|
|
||||||
PGF_INTERNAL_DECL
|
PGF_INTERNAL_DECL
|
||||||
void phrasetable_iter(PgfConcr *concr,
|
void phrasetable_iter(PgfConcr *concr,
|
||||||
|
|||||||
Reference in New Issue
Block a user