mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-09 04:59:31 -06:00
API for scanning for cohorts in an arbitrary text
This commit is contained in:
@@ -68,6 +68,7 @@ libpgf_la_SOURCES = \
|
||||
pgf/data.h \
|
||||
pgf/expr.c \
|
||||
pgf/expr.h \
|
||||
pgf/scanner.c \
|
||||
pgf/parser.c \
|
||||
pgf/lookup.c \
|
||||
pgf/jit.c \
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
#include <pgf/data.h>
|
||||
#include <pgf/expr.h>
|
||||
#include <pgf/linearizer.h>
|
||||
#include <gu/enum.h>
|
||||
#include <gu/seq.h>
|
||||
#include <gu/assert.h>
|
||||
@@ -502,39 +501,12 @@ pgf_print_expr_state0(PgfExprState* st,
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static int
|
||||
cmp_string(GuString* psent, GuString tok, bool case_sensitive)
|
||||
{
|
||||
for (;;) {
|
||||
GuUCS c2 = gu_utf8_decode((const uint8_t**) &tok);
|
||||
if (c2 == 0)
|
||||
return 0;
|
||||
PGF_INTERNAL_DECL int
|
||||
cmp_string(GuString* psent, size_t* ppos, GuString tok,
|
||||
bool case_sensitive);
|
||||
|
||||
const uint8_t* p = (uint8_t*) *psent;
|
||||
GuUCS c1 = gu_utf8_decode(&p);
|
||||
if (c1 == 0)
|
||||
return -1;
|
||||
|
||||
if (!case_sensitive)
|
||||
c1 = gu_ucs_to_lower(c1);
|
||||
|
||||
if (c1 != c2)
|
||||
return (c1-c2);
|
||||
|
||||
*psent = (GuString) p;
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
skip_space(GuString* psent)
|
||||
{
|
||||
const uint8_t* p = (uint8_t*) *psent;
|
||||
if (!gu_ucs_is_space(gu_utf8_decode(&p)))
|
||||
return false;
|
||||
|
||||
*psent = (GuString) p;
|
||||
return true;
|
||||
}
|
||||
PGF_INTERNAL_DECL bool
|
||||
skip_space(GuString* psent, size_t* ppos);
|
||||
|
||||
static int
|
||||
cmp_item_prob(GuOrder* self, const void* a, const void* b)
|
||||
@@ -1060,63 +1032,10 @@ pgf_parsing_complete(PgfParsing* ps, PgfItem* item, PgfExprProb *ep)
|
||||
}
|
||||
}
|
||||
|
||||
static int
|
||||
pgf_symbols_cmp(GuString* psent, PgfSymbols* syms, size_t* sym_idx, bool case_sensitive)
|
||||
{
|
||||
size_t n_syms = gu_seq_length(syms);
|
||||
while (*sym_idx < n_syms) {
|
||||
PgfSymbol sym = gu_seq_get(syms, PgfSymbol, *sym_idx);
|
||||
|
||||
if (*sym_idx > 0) {
|
||||
if (!skip_space(psent)) {
|
||||
if (**psent == 0)
|
||||
return -1;
|
||||
return 1;
|
||||
}
|
||||
|
||||
while (**psent != 0) {
|
||||
if (!skip_space(psent))
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
GuVariantInfo inf = gu_variant_open(sym);
|
||||
switch (inf.tag) {
|
||||
case PGF_SYMBOL_CAT:
|
||||
case PGF_SYMBOL_LIT:
|
||||
case PGF_SYMBOL_VAR: {
|
||||
if (**psent == 0)
|
||||
return -1;
|
||||
return 1;
|
||||
}
|
||||
case PGF_SYMBOL_KS: {
|
||||
PgfSymbolKS* pks = inf.data;
|
||||
if (**psent == 0)
|
||||
return -1;
|
||||
|
||||
int cmp = cmp_string(psent, pks->token, case_sensitive);
|
||||
if (cmp != 0)
|
||||
return cmp;
|
||||
break;
|
||||
}
|
||||
case PGF_SYMBOL_KP:
|
||||
case PGF_SYMBOL_BIND:
|
||||
case PGF_SYMBOL_NE:
|
||||
case PGF_SYMBOL_SOFT_BIND:
|
||||
case PGF_SYMBOL_SOFT_SPACE:
|
||||
case PGF_SYMBOL_CAPIT:
|
||||
case PGF_SYMBOL_ALL_CAPIT: {
|
||||
return -1;
|
||||
}
|
||||
default:
|
||||
gu_impossible();
|
||||
}
|
||||
|
||||
(*sym_idx)++;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
PGF_INTERNAL_DECL int
|
||||
pgf_symbols_cmp(GuString* psent, size_t* ppos,
|
||||
PgfSymbols* syms, size_t* sym_idx,
|
||||
bool case_sensitive);
|
||||
|
||||
static void
|
||||
pgf_parsing_lookahead(PgfParsing *ps, PgfParseState* state,
|
||||
@@ -1133,8 +1052,9 @@ pgf_parsing_lookahead(PgfParsing *ps, PgfParseState* state,
|
||||
|
||||
GuString start = ps->sentence + state->end_offset;
|
||||
GuString current = start;
|
||||
size_t pos = 0;
|
||||
size_t sym_idx = 0;
|
||||
int cmp = pgf_symbols_cmp(¤t, seq->syms, &sym_idx, ps->case_sensitive);
|
||||
int cmp = pgf_symbols_cmp(¤t, &pos, seq->syms, &sym_idx, ps->case_sensitive);
|
||||
if (cmp < 0) {
|
||||
j = k-1;
|
||||
} else if (cmp > 0) {
|
||||
@@ -1206,7 +1126,8 @@ pgf_new_parse_state(PgfParsing* ps, size_t start_offset,
|
||||
|
||||
size_t end_offset = start_offset;
|
||||
GuString current = ps->sentence + end_offset;
|
||||
while (skip_space(¤t)) {
|
||||
size_t pos = 0;
|
||||
while (skip_space(¤t, &pos)) {
|
||||
end_offset++;
|
||||
}
|
||||
|
||||
@@ -1257,6 +1178,7 @@ static void
|
||||
pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item)
|
||||
{
|
||||
GuString current = ps->sentence + ps->before->end_offset;
|
||||
size_t pos = 0;
|
||||
|
||||
if (ps->prefix != NULL && *current == 0) {
|
||||
if (gu_string_is_prefix(ps->prefix, tok)) {
|
||||
@@ -1269,7 +1191,7 @@ pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item)
|
||||
ps->tp->prob = item->inside_prob + item->conts->outside_prob;
|
||||
}
|
||||
} else {
|
||||
if (!ps->before->needs_bind && cmp_string(¤t, tok, ps->case_sensitive) == 0) {
|
||||
if (!ps->before->needs_bind && cmp_string(¤t, &pos, tok, ps->case_sensitive) == 0) {
|
||||
PgfParseState* state =
|
||||
pgf_new_parse_state(ps, (current - ps->sentence),
|
||||
BIND_NONE,
|
||||
@@ -1454,7 +1376,6 @@ pgf_parsing_symbol(PgfParsing* ps, PgfItem* item, PgfSymbol sym)
|
||||
case PGF_SYMBOL_KP: {
|
||||
PgfSymbolKP* skp = gu_variant_data(sym);
|
||||
|
||||
PgfSymbol sym;
|
||||
if (item->alt == 0) {
|
||||
PgfItem* new_item;
|
||||
|
||||
@@ -2345,173 +2266,6 @@ pgf_complete(PgfConcr* concr, PgfType* type, GuString sentence,
|
||||
return &ps->en;
|
||||
}
|
||||
|
||||
static void
|
||||
pgf_morpho_iter(PgfProductionIdx* idx,
|
||||
PgfMorphoCallback* callback,
|
||||
GuExn* err)
|
||||
{
|
||||
size_t n_entries = gu_buf_length(idx);
|
||||
for (size_t i = 0; i < n_entries; i++) {
|
||||
PgfProductionIdxEntry* entry =
|
||||
gu_buf_index(idx, PgfProductionIdxEntry, i);
|
||||
|
||||
PgfCId lemma = entry->papp->fun->absfun->name;
|
||||
GuString analysis = entry->ccat->cnccat->labels[entry->lin_idx];
|
||||
|
||||
prob_t prob = entry->ccat->cnccat->abscat->prob +
|
||||
entry->papp->fun->absfun->ep.prob;
|
||||
callback->callback(callback,
|
||||
lemma, analysis, prob, err);
|
||||
if (!gu_ok(err))
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
GuOrder order;
|
||||
bool case_sensitive;
|
||||
} PgfSequenceOrder;
|
||||
|
||||
static int
|
||||
pgf_sequence_cmp_fn(GuOrder* order, const void* p1, const void* p2)
|
||||
{
|
||||
PgfSequenceOrder* self = gu_container(order, PgfSequenceOrder, order);
|
||||
GuString sent = (GuString) p1;
|
||||
const PgfSequence* sp2 = p2;
|
||||
|
||||
size_t sym_idx = 0;
|
||||
int res = pgf_symbols_cmp(&sent, sp2->syms, &sym_idx, self->case_sensitive);
|
||||
if (res == 0 && (*sent != 0 || sym_idx != gu_seq_length(sp2->syms))) {
|
||||
res = 1;
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
PGF_API void
|
||||
pgf_lookup_morpho(PgfConcr *concr, GuString sentence,
|
||||
PgfMorphoCallback* callback, GuExn* err)
|
||||
{
|
||||
if (concr->sequences == NULL) {
|
||||
GuExnData* err_data = gu_raise(err, PgfExn);
|
||||
if (err_data) {
|
||||
err_data->data = "The concrete syntax is not loaded";
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
bool case_sensitive =
|
||||
(gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL);
|
||||
|
||||
PgfSequenceOrder order = { { pgf_sequence_cmp_fn }, case_sensitive };
|
||||
PgfSequence* seq = (PgfSequence*)
|
||||
gu_seq_binsearch(concr->sequences, &order.order,
|
||||
PgfSequence, (void*) sentence);
|
||||
|
||||
if (seq != NULL && seq->idx != NULL)
|
||||
pgf_morpho_iter(seq->idx, callback, err);
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
GuEnum en;
|
||||
PgfSequences* sequences;
|
||||
GuString prefix;
|
||||
size_t seq_idx;
|
||||
} PgfFullFormState;
|
||||
|
||||
struct PgfFullFormEntry {
|
||||
GuString tokens;
|
||||
PgfProductionIdx* idx;
|
||||
};
|
||||
|
||||
static void
|
||||
gu_fullform_enum_next(GuEnum* self, void* to, GuPool* pool)
|
||||
{
|
||||
PgfFullFormState* st = gu_container(self, PgfFullFormState, en);
|
||||
PgfFullFormEntry* entry = NULL;
|
||||
|
||||
if (st->sequences != NULL) {
|
||||
size_t n_seqs = gu_seq_length(st->sequences);
|
||||
while (st->seq_idx < n_seqs) {
|
||||
PgfSequence* seq = gu_seq_index(st->sequences, PgfSequence, st->seq_idx);
|
||||
GuString tokens = pgf_get_tokens(seq->syms, 0, pool);
|
||||
|
||||
if (!gu_string_is_prefix(st->prefix, tokens)) {
|
||||
st->seq_idx = n_seqs;
|
||||
break;
|
||||
}
|
||||
|
||||
if (*tokens != 0 && seq->idx != NULL) {
|
||||
entry = gu_new(PgfFullFormEntry, pool);
|
||||
entry->tokens = tokens;
|
||||
entry->idx = seq->idx;
|
||||
|
||||
st->seq_idx++;
|
||||
break;
|
||||
}
|
||||
|
||||
st->seq_idx++;
|
||||
}
|
||||
}
|
||||
|
||||
*((PgfFullFormEntry**) to) = entry;
|
||||
}
|
||||
|
||||
PGF_API GuEnum*
|
||||
pgf_fullform_lexicon(PgfConcr *concr, GuPool* pool)
|
||||
{
|
||||
PgfFullFormState* st = gu_new(PgfFullFormState, pool);
|
||||
st->en.next = gu_fullform_enum_next;
|
||||
st->sequences = concr->sequences;
|
||||
st->prefix = "";
|
||||
st->seq_idx = 0;
|
||||
return &st->en;
|
||||
}
|
||||
|
||||
PGF_API GuString
|
||||
pgf_fullform_get_string(PgfFullFormEntry* entry)
|
||||
{
|
||||
return entry->tokens;
|
||||
}
|
||||
|
||||
PGF_API void
|
||||
pgf_fullform_get_analyses(PgfFullFormEntry* entry,
|
||||
PgfMorphoCallback* callback, GuExn* err)
|
||||
{
|
||||
pgf_morpho_iter(entry->idx, callback, err);
|
||||
}
|
||||
|
||||
PGF_API GuEnum*
|
||||
pgf_lookup_word_prefix(PgfConcr *concr, GuString prefix,
|
||||
GuPool* pool, GuExn* err)
|
||||
{
|
||||
if (concr->sequences == NULL) {
|
||||
GuExnData* err_data = gu_raise(err, PgfExn);
|
||||
if (err_data) {
|
||||
err_data->data = "The concrete syntax is not loaded";
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
PgfFullFormState* state = gu_new(PgfFullFormState, pool);
|
||||
state->en.next = gu_fullform_enum_next;
|
||||
state->sequences = concr->sequences;
|
||||
state->prefix = prefix;
|
||||
state->seq_idx = 0;
|
||||
|
||||
bool case_sensitive =
|
||||
(gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL);
|
||||
|
||||
PgfSequenceOrder order = { { pgf_sequence_cmp_fn }, case_sensitive };
|
||||
if (!gu_seq_binsearch_index(concr->sequences, &order.order,
|
||||
PgfSequence, (void*) prefix,
|
||||
&state->seq_idx)) {
|
||||
state->seq_idx++;
|
||||
}
|
||||
|
||||
return &state->en;
|
||||
}
|
||||
|
||||
PGF_API void
|
||||
pgf_parser_index(PgfConcr* concr,
|
||||
PgfCCat* ccat, PgfProduction prod,
|
||||
|
||||
@@ -167,6 +167,22 @@ PGF_API_DECL void
|
||||
pgf_lookup_morpho(PgfConcr *concr, GuString sentence,
|
||||
PgfMorphoCallback* callback, GuExn* err);
|
||||
|
||||
typedef struct {
|
||||
size_t pos;
|
||||
GuString ptr;
|
||||
} PgfCohortSpot;
|
||||
|
||||
typedef struct {
|
||||
PgfCohortSpot start;
|
||||
PgfCohortSpot end;
|
||||
GuBuf* buf;
|
||||
} PgfCohortRange;
|
||||
|
||||
PGF_API_DECL GuEnum*
|
||||
pgf_lookup_cohorts(PgfConcr *concr, GuString sentence,
|
||||
PgfMorphoCallback* callback,
|
||||
GuPool* pool, GuExn* err);
|
||||
|
||||
typedef struct PgfFullFormEntry PgfFullFormEntry;
|
||||
|
||||
PGF_API_DECL GuEnum*
|
||||
|
||||
435
src/runtime/c/pgf/scanner.c
Normal file
435
src/runtime/c/pgf/scanner.c
Normal file
@@ -0,0 +1,435 @@
|
||||
#include <pgf/data.h>
|
||||
#include <pgf/linearizer.h>
|
||||
#include <gu/utf8.h>
|
||||
|
||||
PGF_INTERNAL int
|
||||
cmp_string(GuString* psent, size_t* ppos, GuString tok,
|
||||
bool case_sensitive)
|
||||
{
|
||||
for (;;) {
|
||||
GuUCS c2 = gu_utf8_decode((const uint8_t**) &tok);
|
||||
if (c2 == 0)
|
||||
return 0;
|
||||
|
||||
const uint8_t* p = (uint8_t*) *psent;
|
||||
GuUCS c1 = gu_utf8_decode(&p);
|
||||
if (c1 == 0)
|
||||
return -1;
|
||||
|
||||
if (!case_sensitive)
|
||||
c1 = gu_ucs_to_lower(c1);
|
||||
|
||||
if (c1 != c2)
|
||||
return (c1-c2);
|
||||
|
||||
*psent = (GuString) p;
|
||||
(*ppos)++;
|
||||
}
|
||||
}
|
||||
|
||||
PGF_INTERNAL bool
|
||||
skip_space(GuString* psent, size_t* ppos)
|
||||
{
|
||||
const uint8_t* p = (uint8_t*) *psent;
|
||||
if (!gu_ucs_is_space(gu_utf8_decode(&p)))
|
||||
return false;
|
||||
|
||||
*psent = (GuString) p;
|
||||
(*ppos)++;
|
||||
return true;
|
||||
}
|
||||
|
||||
PGF_INTERNAL int
|
||||
pgf_symbols_cmp(GuString* psent, size_t* ppos,
|
||||
PgfSymbols* syms, size_t* sym_idx,
|
||||
bool case_sensitive)
|
||||
{
|
||||
size_t n_syms = gu_seq_length(syms);
|
||||
while (*sym_idx < n_syms) {
|
||||
PgfSymbol sym = gu_seq_get(syms, PgfSymbol, *sym_idx);
|
||||
|
||||
if (*sym_idx > 0) {
|
||||
if (!skip_space(psent,ppos)) {
|
||||
if (**psent == 0)
|
||||
return -1;
|
||||
return 1;
|
||||
}
|
||||
|
||||
while (**psent != 0) {
|
||||
if (!skip_space(psent,ppos))
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
GuVariantInfo inf = gu_variant_open(sym);
|
||||
switch (inf.tag) {
|
||||
case PGF_SYMBOL_CAT:
|
||||
case PGF_SYMBOL_LIT:
|
||||
case PGF_SYMBOL_VAR: {
|
||||
if (**psent == 0)
|
||||
return -1;
|
||||
return 1;
|
||||
}
|
||||
case PGF_SYMBOL_KS: {
|
||||
PgfSymbolKS* pks = inf.data;
|
||||
if (**psent == 0)
|
||||
return -1;
|
||||
|
||||
int cmp = cmp_string(psent,ppos,pks->token, case_sensitive);
|
||||
if (cmp != 0)
|
||||
return cmp;
|
||||
break;
|
||||
}
|
||||
case PGF_SYMBOL_KP:
|
||||
case PGF_SYMBOL_BIND:
|
||||
case PGF_SYMBOL_NE:
|
||||
case PGF_SYMBOL_SOFT_BIND:
|
||||
case PGF_SYMBOL_SOFT_SPACE:
|
||||
case PGF_SYMBOL_CAPIT:
|
||||
case PGF_SYMBOL_ALL_CAPIT: {
|
||||
return -1;
|
||||
}
|
||||
default:
|
||||
gu_impossible();
|
||||
}
|
||||
|
||||
(*sym_idx)++;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void
|
||||
pgf_morpho_iter(PgfProductionIdx* idx,
|
||||
PgfMorphoCallback* callback,
|
||||
GuExn* err)
|
||||
{
|
||||
size_t n_entries = gu_buf_length(idx);
|
||||
for (size_t i = 0; i < n_entries; i++) {
|
||||
PgfProductionIdxEntry* entry =
|
||||
gu_buf_index(idx, PgfProductionIdxEntry, i);
|
||||
|
||||
PgfCId lemma = entry->papp->fun->absfun->name;
|
||||
GuString analysis = entry->ccat->cnccat->labels[entry->lin_idx];
|
||||
|
||||
prob_t prob = entry->ccat->cnccat->abscat->prob +
|
||||
entry->papp->fun->absfun->ep.prob;
|
||||
callback->callback(callback,
|
||||
lemma, analysis, prob, err);
|
||||
if (!gu_ok(err))
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
GuOrder order;
|
||||
bool case_sensitive;
|
||||
} PgfSequenceOrder;
|
||||
|
||||
static int
|
||||
pgf_sequence_cmp_fn(GuOrder* order, const void* p1, const void* p2)
|
||||
{
|
||||
PgfSequenceOrder* self = gu_container(order, PgfSequenceOrder, order);
|
||||
|
||||
size_t pos = 0;
|
||||
GuString sent = (GuString) p1;
|
||||
|
||||
const PgfSequence* sp2 = p2;
|
||||
|
||||
size_t sym_idx = 0;
|
||||
int res = pgf_symbols_cmp(&sent, &pos, sp2->syms, &sym_idx, self->case_sensitive);
|
||||
if (res == 0 && (*sent != 0 || sym_idx != gu_seq_length(sp2->syms))) {
|
||||
res = 1;
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
PGF_API void
|
||||
pgf_lookup_morpho(PgfConcr *concr, GuString sentence,
|
||||
PgfMorphoCallback* callback, GuExn* err)
|
||||
{
|
||||
if (concr->sequences == NULL) {
|
||||
GuExnData* err_data = gu_raise(err, PgfExn);
|
||||
if (err_data) {
|
||||
err_data->data = "The concrete syntax is not loaded";
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
bool case_sensitive =
|
||||
(gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL);
|
||||
|
||||
PgfSequenceOrder order = { { pgf_sequence_cmp_fn }, case_sensitive };
|
||||
PgfSequence* seq = (PgfSequence*)
|
||||
gu_seq_binsearch(concr->sequences, &order.order,
|
||||
PgfSequence, (void*) sentence);
|
||||
|
||||
if (seq != NULL && seq->idx != NULL)
|
||||
pgf_morpho_iter(seq->idx, callback, err);
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
GuEnum en;
|
||||
PgfConcr* concr;
|
||||
GuString sentence;
|
||||
GuString current;
|
||||
size_t len;
|
||||
PgfMorphoCallback* callback;
|
||||
GuExn* err;
|
||||
bool case_sensitive;
|
||||
GuBuf* spots;
|
||||
GuBuf* found;
|
||||
} PgfCohortsState;
|
||||
|
||||
static int
|
||||
cmp_cohort_spot(GuOrder* self, const void* a, const void* b)
|
||||
{
|
||||
PgfCohortSpot *s1 = (PgfCohortSpot *) a;
|
||||
PgfCohortSpot *s2 = (PgfCohortSpot *) b;
|
||||
|
||||
return (s1->ptr-s2->ptr);
|
||||
}
|
||||
|
||||
static GuOrder
|
||||
pgf_cohort_spot_order[1] = {{ cmp_cohort_spot }};
|
||||
|
||||
static void
|
||||
pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot,
|
||||
int i, int j, ptrdiff_t min, ptrdiff_t max)
|
||||
{
|
||||
// This is a variation of a binary search algorithm which
|
||||
// can retrieve all prefixes of a string with minimal
|
||||
// comparisons, i.e. there is no need to lookup every
|
||||
// prefix separately.
|
||||
|
||||
while (i <= j) {
|
||||
int k = (i+j) / 2;
|
||||
PgfSequence* seq = gu_seq_index(state->concr->sequences, PgfSequence, k);
|
||||
|
||||
PgfCohortSpot current = *spot;
|
||||
|
||||
size_t sym_idx = 0;
|
||||
int cmp = pgf_symbols_cmp(¤t.ptr, ¤t.pos, seq->syms, &sym_idx, state->case_sensitive);
|
||||
if (cmp < 0) {
|
||||
j = k-1;
|
||||
} else if (cmp > 0) {
|
||||
ptrdiff_t len = current.ptr - spot->ptr;
|
||||
|
||||
if (min <= len)
|
||||
pgf_lookup_cohorts_helper(state, spot, i, k-1, min, len);
|
||||
|
||||
if (len+1 <= max)
|
||||
pgf_lookup_cohorts_helper(state, spot, k+1, j, len+1, max);
|
||||
|
||||
break;
|
||||
} else {
|
||||
ptrdiff_t len = current.ptr - spot->ptr;
|
||||
|
||||
if (min <= len-1)
|
||||
pgf_lookup_cohorts_helper(state, spot, i, k-1, min, len-1);
|
||||
|
||||
if (seq->idx != NULL) {
|
||||
PgfCohortRange* range = gu_buf_insert(state->found, 0);
|
||||
range->start = *spot;
|
||||
range->end = current;
|
||||
range->buf = seq->idx;
|
||||
|
||||
while (*current.ptr != 0) {
|
||||
if (!skip_space(¤t.ptr, ¤t.pos))
|
||||
break;
|
||||
}
|
||||
|
||||
gu_buf_heap_push(state->spots, pgf_cohort_spot_order, ¤t);
|
||||
}
|
||||
|
||||
if (len+1 <= max)
|
||||
pgf_lookup_cohorts_helper(state, spot, k+1, j, len+1, max);
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
pgf_lookup_cohorts_enum_next(GuEnum* self, void* to, GuPool* pool)
|
||||
{
|
||||
PgfCohortsState* state = gu_container(self, PgfCohortsState, en);
|
||||
|
||||
while (gu_buf_length(state->found) == 0 &&
|
||||
gu_buf_length(state->spots) > 0) {
|
||||
PgfCohortSpot spot;
|
||||
gu_buf_heap_pop(state->spots, pgf_cohort_spot_order, &spot);
|
||||
|
||||
if (spot.ptr == state->current)
|
||||
continue;
|
||||
|
||||
if (*spot.ptr == 0)
|
||||
break;
|
||||
|
||||
pgf_lookup_cohorts_helper
|
||||
(state, &spot,
|
||||
0, gu_seq_length(state->concr->sequences)-1,
|
||||
1, (state->sentence+state->len)-spot.ptr);
|
||||
|
||||
if (gu_buf_length(state->found) == 0) {
|
||||
gu_utf8_decode((const uint8_t**) &spot.ptr);
|
||||
spot.pos++;
|
||||
gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &spot);
|
||||
}
|
||||
}
|
||||
|
||||
PgfCohortRange* pRes = (PgfCohortRange*)to;
|
||||
|
||||
if (gu_buf_length(state->found) == 0) {
|
||||
pRes->start.pos = 0;
|
||||
pRes->start.ptr = NULL;
|
||||
pRes->end.pos = 0;
|
||||
pRes->end.ptr = NULL;
|
||||
pRes->buf = NULL;
|
||||
state->current = NULL;
|
||||
return;
|
||||
} else {
|
||||
*pRes = gu_buf_pop(state->found, PgfCohortRange);
|
||||
state->current = pRes->start.ptr;
|
||||
pgf_morpho_iter(pRes->buf, state->callback, state->err);
|
||||
}
|
||||
}
|
||||
|
||||
PGF_API GuEnum*
|
||||
pgf_lookup_cohorts(PgfConcr *concr, GuString sentence,
|
||||
PgfMorphoCallback* callback,
|
||||
GuPool* pool, GuExn* err)
|
||||
{
|
||||
if (concr->sequences == NULL) {
|
||||
GuExnData* err_data = gu_raise(err, PgfExn);
|
||||
if (err_data) {
|
||||
err_data->data = "The concrete syntax is not loaded";
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
bool case_sensitive =
|
||||
(gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL);
|
||||
|
||||
PgfCohortsState* state = gu_new(PgfCohortsState, pool);
|
||||
state->en.next = pgf_lookup_cohorts_enum_next;
|
||||
state->concr = concr;
|
||||
state->sentence= sentence;
|
||||
state->len = strlen(sentence);
|
||||
state->callback= callback;
|
||||
state->err = err;
|
||||
state->case_sensitive = case_sensitive;
|
||||
state->spots = gu_new_buf(PgfCohortSpot, pool);
|
||||
state->found = gu_new_buf(PgfCohortRange, pool);
|
||||
|
||||
PgfCohortSpot spot = {0,sentence};
|
||||
while (*spot.ptr != 0) {
|
||||
if (!skip_space(&spot.ptr, &spot.pos))
|
||||
break;
|
||||
}
|
||||
|
||||
gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &spot);
|
||||
|
||||
return &state->en;
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
GuEnum en;
|
||||
PgfSequences* sequences;
|
||||
GuString prefix;
|
||||
size_t seq_idx;
|
||||
} PgfFullFormState;
|
||||
|
||||
struct PgfFullFormEntry {
|
||||
GuString tokens;
|
||||
PgfProductionIdx* idx;
|
||||
};
|
||||
|
||||
static void
|
||||
gu_fullform_enum_next(GuEnum* self, void* to, GuPool* pool)
|
||||
{
|
||||
PgfFullFormState* st = gu_container(self, PgfFullFormState, en);
|
||||
PgfFullFormEntry* entry = NULL;
|
||||
|
||||
if (st->sequences != NULL) {
|
||||
size_t n_seqs = gu_seq_length(st->sequences);
|
||||
while (st->seq_idx < n_seqs) {
|
||||
PgfSequence* seq = gu_seq_index(st->sequences, PgfSequence, st->seq_idx);
|
||||
GuString tokens = pgf_get_tokens(seq->syms, 0, pool);
|
||||
|
||||
if (!gu_string_is_prefix(st->prefix, tokens)) {
|
||||
st->seq_idx = n_seqs;
|
||||
break;
|
||||
}
|
||||
|
||||
if (*tokens != 0 && seq->idx != NULL) {
|
||||
entry = gu_new(PgfFullFormEntry, pool);
|
||||
entry->tokens = tokens;
|
||||
entry->idx = seq->idx;
|
||||
|
||||
st->seq_idx++;
|
||||
break;
|
||||
}
|
||||
|
||||
st->seq_idx++;
|
||||
}
|
||||
}
|
||||
|
||||
*((PgfFullFormEntry**) to) = entry;
|
||||
}
|
||||
|
||||
PGF_API GuEnum*
|
||||
pgf_fullform_lexicon(PgfConcr *concr, GuPool* pool)
|
||||
{
|
||||
PgfFullFormState* st = gu_new(PgfFullFormState, pool);
|
||||
st->en.next = gu_fullform_enum_next;
|
||||
st->sequences = concr->sequences;
|
||||
st->prefix = "";
|
||||
st->seq_idx = 0;
|
||||
return &st->en;
|
||||
}
|
||||
|
||||
PGF_API GuString
|
||||
pgf_fullform_get_string(PgfFullFormEntry* entry)
|
||||
{
|
||||
return entry->tokens;
|
||||
}
|
||||
|
||||
PGF_API void
|
||||
pgf_fullform_get_analyses(PgfFullFormEntry* entry,
|
||||
PgfMorphoCallback* callback, GuExn* err)
|
||||
{
|
||||
pgf_morpho_iter(entry->idx, callback, err);
|
||||
}
|
||||
|
||||
PGF_API GuEnum*
|
||||
pgf_lookup_word_prefix(PgfConcr *concr, GuString prefix,
|
||||
GuPool* pool, GuExn* err)
|
||||
{
|
||||
if (concr->sequences == NULL) {
|
||||
GuExnData* err_data = gu_raise(err, PgfExn);
|
||||
if (err_data) {
|
||||
err_data->data = "The concrete syntax is not loaded";
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
PgfFullFormState* state = gu_new(PgfFullFormState, pool);
|
||||
state->en.next = gu_fullform_enum_next;
|
||||
state->sequences = concr->sequences;
|
||||
state->prefix = prefix;
|
||||
state->seq_idx = 0;
|
||||
|
||||
bool case_sensitive =
|
||||
(gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL);
|
||||
|
||||
PgfSequenceOrder order = { { pgf_sequence_cmp_fn }, case_sensitive };
|
||||
if (!gu_seq_binsearch_index(concr->sequences, &order.order,
|
||||
PgfSequence, (void*) prefix,
|
||||
&state->seq_idx)) {
|
||||
state->seq_idx++;
|
||||
}
|
||||
|
||||
return &state->en;
|
||||
}
|
||||
@@ -70,7 +70,7 @@ module PGF2 (-- * PGF
|
||||
-- ** Generation
|
||||
generateAll,
|
||||
-- ** Morphological Analysis
|
||||
MorphoAnalysis, lookupMorpho, fullFormLexicon,
|
||||
MorphoAnalysis, lookupMorpho, lookupCohorts, fullFormLexicon,
|
||||
-- ** Visualizations
|
||||
GraphvizOptions(..), graphvizDefaults,
|
||||
graphvizAbstractTree, graphvizParseTree, graphvizWordAlignment,
|
||||
@@ -481,6 +481,36 @@ lookupMorpho (Concr concr master) sent =
|
||||
freeHaskellFunPtr fptr
|
||||
readIORef ref
|
||||
|
||||
lookupCohorts :: Concr -> String -> [(Int,[MorphoAnalysis],Int)]
|
||||
lookupCohorts lang@(Concr concr master) sent =
|
||||
unsafePerformIO $
|
||||
do pl <- gu_new_pool
|
||||
ref <- newIORef []
|
||||
cback <- gu_malloc pl (#size PgfMorphoCallback)
|
||||
fptr <- wrapLookupMorphoCallback (getAnalysis ref)
|
||||
(#poke PgfMorphoCallback, callback) cback fptr
|
||||
c_sent <- newUtf8CString sent pl
|
||||
enum <- pgf_lookup_cohorts concr c_sent cback pl nullPtr
|
||||
fpl <- newForeignPtr gu_pool_finalizer pl
|
||||
fromCohortRange enum fpl fptr ref
|
||||
where
|
||||
fromCohortRange enum fpl fptr ref =
|
||||
allocaBytes (#size PgfCohortRange) $ \ptr ->
|
||||
withForeignPtr fpl $ \pl ->
|
||||
do gu_enum_next enum ptr pl
|
||||
buf <- (#peek PgfCohortRange, buf) ptr
|
||||
if buf == nullPtr
|
||||
then do finalizeForeignPtr fpl
|
||||
freeHaskellFunPtr fptr
|
||||
touchConcr lang
|
||||
return []
|
||||
else do start <- (#peek PgfCohortRange, start.pos) ptr
|
||||
end <- (#peek PgfCohortRange, end.pos) ptr
|
||||
ans <- readIORef ref
|
||||
writeIORef ref []
|
||||
cohs <- unsafeInterleaveIO (fromCohortRange enum fpl fptr ref)
|
||||
return ((start,ans,end):cohs)
|
||||
|
||||
fullFormLexicon :: Concr -> [(String, [MorphoAnalysis])]
|
||||
fullFormLexicon lang =
|
||||
unsafePerformIO $
|
||||
|
||||
@@ -100,7 +100,7 @@ foreign import ccall unsafe "gu/string.h gu_string_buf_out"
|
||||
foreign import ccall unsafe "gu/file.h gu_file_in"
|
||||
gu_file_in :: Ptr () -> Ptr GuPool -> IO (Ptr GuIn)
|
||||
|
||||
foreign import ccall unsafe "gu/enum.h gu_enum_next"
|
||||
foreign import ccall safe "gu/enum.h gu_enum_next"
|
||||
gu_enum_next :: Ptr a -> Ptr (Ptr b) -> Ptr GuPool -> IO ()
|
||||
|
||||
foreign import ccall unsafe "gu/string.h gu_string_buf_freeze"
|
||||
@@ -401,6 +401,9 @@ foreign import ccall "pgf/pgf.h pgf_parse_with_oracle"
|
||||
foreign import ccall "pgf/pgf.h pgf_lookup_morpho"
|
||||
pgf_lookup_morpho :: Ptr PgfConcr -> CString -> Ptr PgfMorphoCallback -> Ptr GuExn -> IO ()
|
||||
|
||||
foreign import ccall "pgf/pgf.h pgf_lookup_cohorts"
|
||||
pgf_lookup_cohorts :: Ptr PgfConcr -> CString -> Ptr PgfMorphoCallback -> Ptr GuPool -> Ptr GuExn -> IO (Ptr GuEnum)
|
||||
|
||||
type LookupMorphoCallback = Ptr PgfMorphoCallback -> CString -> CString -> Float -> Ptr GuExn -> IO ()
|
||||
|
||||
foreign import ccall "wrapper"
|
||||
|
||||
Reference in New Issue
Block a user