started with lookupCohorts

This commit is contained in:
Krasimir Angelov
2022-07-07 14:03:07 +02:00
parent c783da51a4
commit a66693770c
7 changed files with 189 additions and 37 deletions

View File

@@ -825,7 +825,37 @@ void pgf_lookup_morpho(PgfDB *db, PgfConcrRevision cnc_revision,
bool case_sensitive = pgf_is_case_sensitive(concr);
phrasetable_lookup(concr->phrasetable, sentence, case_sensitive, concr->lincats, callback, err);
PgfTextRange range;
range.pos = 0;
range.begin = (uint8_t *) &sentence->text[0];
range.end = (uint8_t *) &sentence->text[sentence->size];
phrasetable_lookup(concr->phrasetable,
&range, case_sensitive,
concr->lincats,
callback, err);
} PGF_API_END
}
/* Public entry point for cohort lookup.
 *
 * Opens a reader scope on the database, resolves the concrete syntax
 * revision and hands the whole sentence, wrapped in a PgfTextRange that
 * starts at character position 0, to phrasetable_lookup_prefixes.
 * The length bounds passed to the phrase table search are 1 and the
 * size of the sentence. Results and errors are delivered through
 * `callback` and `err`.
 */
PGF_API
void pgf_lookup_cohorts(PgfDB *db, PgfConcrRevision cnc_revision,
                        PgfText *sentence,
                        PgfCohortsCallback* callback, PgfExn* err)
{
    PGF_API_BEGIN {
        DB_scope scope(db, READER_SCOPE);

        ref<PgfConcr> concr = db->revision2concr(cnc_revision);
        bool match_case = pgf_is_case_sensitive(concr);

        // Field order of PgfTextRange is: pos, begin, end.
        PgfTextRange whole = {
            0,
            (uint8_t *) &sentence->text[0],
            (uint8_t *) &sentence->text[sentence->size]
        };

        phrasetable_lookup_prefixes(concr->phrasetable,
                                    &whole, match_case,
                                    concr->lincats,
                                    1, sentence->size,
                                    callback, err);
    } PGF_API_END
}

View File

@@ -418,6 +418,18 @@ void pgf_lookup_morpho(PgfDB *db, PgfConcrRevision cnc_revision,
PgfText *sentence,
PgfMorphoCallback* callback, PgfExn* err);
/* Callback interface used by pgf_lookup_cohorts.
 *
 * The phrase table search reports results in two steps for every matched
 * entry: first `morpho.fn` is invoked once per analysis found for the
 * entry, then `fn` is invoked once with the positions delimiting the
 * matched part of the sentence. */
typedef struct PgfCohortsCallback PgfCohortsCallback;
struct PgfCohortsCallback {
/* Receives the individual analyses of a match; the search passes
 * &morpho as the `self` argument of morpho.fn. */
PgfMorphoCallback morpho;
/* Called after the analyses of a matched entry have been reported.
 * `start` and `end` are the values of PgfTextRange.pos (positions in
 * Unicode characters) before and after the matched text. */
void (*fn)(PgfCohortsCallback* self, size_t start, size_t end,
PgfExn* err);
};
/* Searches the phrase table of the given concrete syntax revision for
 * entries matching prefixes of `sentence` and reports them through
 * `callback` (see PgfCohortsCallback). Errors are reported via `err`. */
PGF_API_DECL
void pgf_lookup_cohorts(PgfDB *db, PgfConcrRevision cnc_revision,
PgfText *sentence,
PgfCohortsCallback* callback, PgfExn* err);
PGF_API_DECL
PgfPhrasetableIds *pgf_iter_sequences(PgfDB *db, PgfConcrRevision cnc_revision,
PgfSequenceItor *itor,

View File

@@ -228,14 +228,11 @@ int sequence_cmp(ref<PgfSequence> seq1, ref<PgfSequence> seq2)
}
static
int text_cmp(PgfText *sentence, ref<PgfSequence> seq,
bool case_sensitive)
int text_range_cmp(PgfTextRange *range, ref<PgfSequence> seq,
bool case_sensitive)
{
int res1 = 0;
const uint8_t *s1 = (uint8_t *) &sentence->text;
const uint8_t *e1 = s1+sentence->size;
size_t i = 0;
const uint8_t *s2 = NULL;
const uint8_t *e2 = NULL;
@@ -243,13 +240,13 @@ int text_cmp(PgfText *sentence, ref<PgfSequence> seq,
size_t count = 0;
for (;;) {
if (s1 >= e1) {
if (range->begin >= range->end) {
if (s2 < e2 || i < seq->syms.len)
return -1;
return case_sensitive ? res1 : 0;
}
uint32_t ucs1 = pgf_utf8_decode(&s1);
uint32_t ucs1 = pgf_utf8_decode(&range->begin); range->pos++;
uint32_t ucs1i = pgf_utf8_to_upper(ucs1);
if (s2 >= e2) {
@@ -469,7 +466,7 @@ size_t phrasetable_size(PgfPhrasetable table)
PGF_INTERNAL
void phrasetable_lookup(PgfPhrasetable table,
PgfText *sentence,
PgfTextRange *sentence,
bool case_sensitive,
Namespace<PgfConcrLincat> lincats,
PgfMorphoCallback* callback, PgfExn* err)
@@ -477,7 +474,8 @@ void phrasetable_lookup(PgfPhrasetable table,
if (table == 0)
return;
int cmp = text_cmp(sentence,table->value.seq,case_sensitive);
PgfTextRange current = *sentence;
int cmp = text_range_cmp(&current,table->value.seq,case_sensitive);
if (cmp < 0) {
phrasetable_lookup(table->left,sentence,case_sensitive,lincats,callback,err);
} else if (cmp > 0) {
@@ -521,6 +519,71 @@ void phrasetable_lookup(PgfPhrasetable table,
}
}
/* Recursively searches the phrase table (a binary tree with `left` and
 * `right` children) for entries whose sequence compares equal to the
 * text in `sentence` according to text_range_cmp.
 *
 * table          - root of the (sub)tree to search; 0 means empty.
 * sentence       - the text being matched. It is never modified here:
 *                  every comparison runs on a local copy.
 * case_sensitive - forwarded to text_range_cmp.
 * lincats        - used to find the lincat of each matching
 *                  linearization, from which the field name is taken.
 * min, max       - bounds used for pruning. With `len` being the number
 *                  of bytes the comparison consumed at this node, the
 *                  left subtree is visited only if min <= len (with
 *                  bounds min..len) and the right subtree only if
 *                  len <= max (with bounds len..max).
 * callback       - for a matching entry, morpho.fn is called for every
 *                  back reference to a PgfConcrLin whose lincat is
 *                  found, then fn is called with the character positions
 *                  before and after the match.
 * err            - the search stops as soon as err->type is no longer
 *                  PGF_EXN_NONE.
 */
PGF_INTERNAL
void phrasetable_lookup_prefixes(PgfPhrasetable table,
                                 PgfTextRange *sentence,
                                 bool case_sensitive,
                                 Namespace<PgfConcrLincat> lincats,
                                 ptrdiff_t min, ptrdiff_t max,
                                 PgfCohortsCallback* callback, PgfExn* err)
{
    if (table == 0)
        return;

    // text_range_cmp advances the range it is given, so compare on a copy
    // and keep `sentence` pointing at the start of the text.
    PgfTextRange current = *sentence;
    int cmp = text_range_cmp(&current,table->value.seq,case_sensitive);

    if (cmp < 0) {
        phrasetable_lookup_prefixes(table->left,sentence,case_sensitive,lincats,min,max,callback,err);
        return;
    }

    // Number of bytes consumed by the comparison at this node.
    ptrdiff_t len = current.begin - sentence->begin;

    if (min <= len) {
        phrasetable_lookup_prefixes(table->left,sentence,case_sensitive,lincats,min,len,callback,err);
        // A callback invoked in the left subtree may have failed; do not
        // report anything further once an error has been raised.
        if (err->type != PGF_EXN_NONE)
            return;
    }

    if (cmp == 0) {
        auto backrefs = table->value.backrefs;
        if (backrefs != 0) {
            for (size_t i = 0; i < backrefs->len; i++) {
                PgfSequenceBackref backref = *vector_elem<PgfSequenceBackref>(backrefs,i);
                switch (ref<PgfConcrLin>::get_tag(backref.container)) {
                case PgfConcrLin::tag: {
                    ref<PgfConcrLin> lin = ref<PgfConcrLin>::untagged(backref.container);
                    ref<PgfConcrLincat> lincat =
                        namespace_lookup(lincats, &lin->absfun->type->name);
                    if (lincat != 0) {
                        // The field is selected by the sequence index
                        // modulo the number of fields of the lincat.
                        ref<PgfText> field =
                            *vector_elem(lincat->fields, backref.seq_index % lincat->fields->len);

                        callback->morpho.fn(&callback->morpho, &lin->absfun->name, &(*field), lincat->abscat->prob+lin->absfun->prob, err);
                        if (err->type != PGF_EXN_NONE)
                            return;
                    }
                    break;
                }
                case PgfConcrLincat::tag: {
                    //ignore
                    break;
                }
                default:
                    break;
                }
            }

            // Report the character span covered by this entry.
            callback->fn(callback, sentence->pos, current.pos, err);
            if (err->type != PGF_EXN_NONE)
                return;
        }
    }

    if (len <= max)
        phrasetable_lookup_prefixes(table->right,sentence,case_sensitive,lincats,len,max,callback,err);
}
PGF_INTERNAL
void phrasetable_iter(PgfConcr *concr,
PgfPhrasetable table,

View File

@@ -68,13 +68,27 @@ PgfPhrasetable phrasetable_delete(PgfPhrasetable table,
PGF_INTERNAL_DECL
size_t phrasetable_size(PgfPhrasetable table);
// A cursor over a stretch of UTF-8 encoded text. text_range_cmp consumes
// the range from the front: for every character it decodes it moves
// `begin` forward and increments `pos`.
typedef struct {
size_t pos; // position of `begin` counted in Unicode characters, not bytes
const uint8_t *begin; // pointer to the first byte of the range that is not yet consumed
const uint8_t *end; // pointer one past the last byte of the range
} PgfTextRange;
PGF_INTERNAL_DECL
void phrasetable_lookup(PgfPhrasetable table,
PgfText *sentence,
PgfTextRange *sentence,
bool case_sensitive,
Namespace<struct PgfConcrLincat> lincats,
PgfMorphoCallback* callback, PgfExn* err);
// Searches the phrase table for entries matching the text in `sentence`
// and reports them through `callback`: morpho.fn for every analysis and
// fn for the character span of each matched entry. `min` and `max` bound
// the byte lengths used to prune the search; `sentence` itself is not
// modified. The search stops when an error is recorded in `err`.
PGF_INTERNAL_DECL
void phrasetable_lookup_prefixes(PgfPhrasetable table,
PgfTextRange *sentence,
bool case_sensitive,
Namespace<PgfConcrLincat> lincats,
ptrdiff_t min, ptrdiff_t max,
PgfCohortsCallback* callback, PgfExn* err);
PGF_INTERNAL_DECL
void phrasetable_iter(PgfConcr *concr,
PgfPhrasetable table,