From 9f0ea19a1ca7eba160746bdc91c55f7ae4c0b4fc Mon Sep 17 00:00:00 2001 From: krangelov Date: Tue, 28 May 2019 12:26:00 +0200 Subject: [PATCH] API for scanning for cohorts in an arbitrary text --- src/runtime/c/Makefile.am | 1 + src/runtime/c/pgf/parser.c | 276 +--------------- src/runtime/c/pgf/pgf.h | 16 + src/runtime/c/pgf/scanner.c | 435 ++++++++++++++++++++++++++ src/runtime/haskell-bind/PGF2.hsc | 32 +- src/runtime/haskell-bind/PGF2/FFI.hsc | 5 +- 6 files changed, 502 insertions(+), 263 deletions(-) create mode 100644 src/runtime/c/pgf/scanner.c diff --git a/src/runtime/c/Makefile.am b/src/runtime/c/Makefile.am index 8f9c8bf56..adec93e6d 100644 --- a/src/runtime/c/Makefile.am +++ b/src/runtime/c/Makefile.am @@ -68,6 +68,7 @@ libpgf_la_SOURCES = \ pgf/data.h \ pgf/expr.c \ pgf/expr.h \ + pgf/scanner.c \ pgf/parser.c \ pgf/lookup.c \ pgf/jit.c \ diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c index cb59b2a55..ec623253d 100644 --- a/src/runtime/c/pgf/parser.c +++ b/src/runtime/c/pgf/parser.c @@ -1,6 +1,5 @@ #include #include -#include #include #include #include @@ -502,39 +501,12 @@ pgf_print_expr_state0(PgfExprState* st, #endif #endif -static int -cmp_string(GuString* psent, GuString tok, bool case_sensitive) -{ - for (;;) { - GuUCS c2 = gu_utf8_decode((const uint8_t**) &tok); - if (c2 == 0) - return 0; +PGF_INTERNAL_DECL int +cmp_string(GuString* psent, size_t* ppos, GuString tok, + bool case_sensitive); - const uint8_t* p = (uint8_t*) *psent; - GuUCS c1 = gu_utf8_decode(&p); - if (c1 == 0) - return -1; - - if (!case_sensitive) - c1 = gu_ucs_to_lower(c1); - - if (c1 != c2) - return (c1-c2); - - *psent = (GuString) p; - } -} - -static bool -skip_space(GuString* psent) -{ - const uint8_t* p = (uint8_t*) *psent; - if (!gu_ucs_is_space(gu_utf8_decode(&p))) - return false; - - *psent = (GuString) p; - return true; -} +PGF_INTERNAL_DECL bool +skip_space(GuString* psent, size_t* ppos); static int cmp_item_prob(GuOrder* self, const void* a, const void* b) @@ -1060,63 +1032,10 @@ pgf_parsing_complete(PgfParsing* ps, PgfItem* item, PgfExprProb *ep) } } -static int -pgf_symbols_cmp(GuString* psent, PgfSymbols* syms, size_t* sym_idx, bool case_sensitive) -{ - size_t n_syms = gu_seq_length(syms); - while (*sym_idx < n_syms) { - PgfSymbol sym = gu_seq_get(syms, PgfSymbol, *sym_idx); - - if (*sym_idx > 0) { - if (!skip_space(psent)) { - if (**psent == 0) - return -1; - return 1; - } - - while (**psent != 0) { - if (!skip_space(psent)) - break; - } - } - - GuVariantInfo inf = gu_variant_open(sym); - switch (inf.tag) { - case PGF_SYMBOL_CAT: - case PGF_SYMBOL_LIT: - case PGF_SYMBOL_VAR: { - if (**psent == 0) - return -1; - return 1; - } - case PGF_SYMBOL_KS: { - PgfSymbolKS* pks = inf.data; - if (**psent == 0) - return -1; - - int cmp = cmp_string(psent, pks->token, case_sensitive); - if (cmp != 0) - return cmp; - break; - } - case PGF_SYMBOL_KP: - case PGF_SYMBOL_BIND: - case PGF_SYMBOL_NE: - case PGF_SYMBOL_SOFT_BIND: - case PGF_SYMBOL_SOFT_SPACE: - case PGF_SYMBOL_CAPIT: - case PGF_SYMBOL_ALL_CAPIT: { - return -1; - } - default: - gu_impossible(); - } - - (*sym_idx)++; - } - - return 0; -} +PGF_INTERNAL_DECL int +pgf_symbols_cmp(GuString* psent, size_t* ppos, + PgfSymbols* syms, size_t* sym_idx, + bool case_sensitive); static void pgf_parsing_lookahead(PgfParsing *ps, PgfParseState* state, @@ -1133,8 +1052,9 @@ pgf_parsing_lookahead(PgfParsing *ps, PgfParseState* state, GuString start = ps->sentence + state->end_offset; GuString current = start; + size_t pos = 0; size_t sym_idx = 0; - int cmp = pgf_symbols_cmp(¤t, seq->syms, &sym_idx, ps->case_sensitive); + int cmp = pgf_symbols_cmp(¤t, &pos, seq->syms, &sym_idx, ps->case_sensitive); if (cmp < 0) { j = k-1; } else if (cmp > 0) { @@ -1206,7 +1126,8 @@ pgf_new_parse_state(PgfParsing* ps, size_t start_offset, size_t end_offset = start_offset; GuString current = ps->sentence + end_offset; - while (skip_space(¤t)) { + size_t pos = 0; + while (skip_space(¤t, &pos)) { end_offset++; } @@ -1257,6 +1178,7 @@ static void pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item) { GuString current = ps->sentence + ps->before->end_offset; + size_t pos = 0; if (ps->prefix != NULL && *current == 0) { if (gu_string_is_prefix(ps->prefix, tok)) { @@ -1269,7 +1191,7 @@ pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item) ps->tp->prob = item->inside_prob + item->conts->outside_prob; } } else { - if (!ps->before->needs_bind && cmp_string(¤t, tok, ps->case_sensitive) == 0) { + if (!ps->before->needs_bind && cmp_string(¤t, &pos, tok, ps->case_sensitive) == 0) { PgfParseState* state = pgf_new_parse_state(ps, (current - ps->sentence), BIND_NONE, @@ -1454,7 +1376,6 @@ pgf_parsing_symbol(PgfParsing* ps, PgfItem* item, PgfSymbol sym) case PGF_SYMBOL_KP: { PgfSymbolKP* skp = gu_variant_data(sym); - PgfSymbol sym; if (item->alt == 0) { PgfItem* new_item; @@ -2345,173 +2266,6 @@ pgf_complete(PgfConcr* concr, PgfType* type, GuString sentence, return &ps->en; } -static void -pgf_morpho_iter(PgfProductionIdx* idx, - PgfMorphoCallback* callback, - GuExn* err) -{ - size_t n_entries = gu_buf_length(idx); - for (size_t i = 0; i < n_entries; i++) { - PgfProductionIdxEntry* entry = - gu_buf_index(idx, PgfProductionIdxEntry, i); - - PgfCId lemma = entry->papp->fun->absfun->name; - GuString analysis = entry->ccat->cnccat->labels[entry->lin_idx]; - - prob_t prob = entry->ccat->cnccat->abscat->prob + - entry->papp->fun->absfun->ep.prob; - callback->callback(callback, - lemma, analysis, prob, err); - if (!gu_ok(err)) - return; - } -} - -typedef struct { - GuOrder order; - bool case_sensitive; -} PgfSequenceOrder; - -static int -pgf_sequence_cmp_fn(GuOrder* order, const void* p1, const void* p2) -{ - PgfSequenceOrder* self = gu_container(order, PgfSequenceOrder, order); - GuString sent = (GuString) p1; - const PgfSequence* sp2 = p2; - - size_t sym_idx = 0; - int res = pgf_symbols_cmp(&sent, sp2->syms, &sym_idx, self->case_sensitive); - if (res == 0 && (*sent != 0 || sym_idx != gu_seq_length(sp2->syms))) { - res = 1; - } - - return res; -} - -PGF_API void -pgf_lookup_morpho(PgfConcr *concr, GuString sentence, - PgfMorphoCallback* callback, GuExn* err) -{ - if (concr->sequences == NULL) { - GuExnData* err_data = gu_raise(err, PgfExn); - if (err_data) { - err_data->data = "The concrete syntax is not loaded"; - return; - } - } - - bool case_sensitive = - (gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL); - - PgfSequenceOrder order = { { pgf_sequence_cmp_fn }, case_sensitive }; - PgfSequence* seq = (PgfSequence*) - gu_seq_binsearch(concr->sequences, &order.order, - PgfSequence, (void*) sentence); - - if (seq != NULL && seq->idx != NULL) - pgf_morpho_iter(seq->idx, callback, err); -} - -typedef struct { - GuEnum en; - PgfSequences* sequences; - GuString prefix; - size_t seq_idx; -} PgfFullFormState; - -struct PgfFullFormEntry { - GuString tokens; - PgfProductionIdx* idx; -}; - -static void -gu_fullform_enum_next(GuEnum* self, void* to, GuPool* pool) -{ - PgfFullFormState* st = gu_container(self, PgfFullFormState, en); - PgfFullFormEntry* entry = NULL; - - if (st->sequences != NULL) { - size_t n_seqs = gu_seq_length(st->sequences); - while (st->seq_idx < n_seqs) { - PgfSequence* seq = gu_seq_index(st->sequences, PgfSequence, st->seq_idx); - GuString tokens = pgf_get_tokens(seq->syms, 0, pool); - - if (!gu_string_is_prefix(st->prefix, tokens)) { - st->seq_idx = n_seqs; - break; - } - - if (*tokens != 0 && seq->idx != NULL) { - entry = gu_new(PgfFullFormEntry, pool); - entry->tokens = tokens; - entry->idx = seq->idx; - - st->seq_idx++; - break; - } - - st->seq_idx++; - } - } - - *((PgfFullFormEntry**) to) = entry; -} - -PGF_API GuEnum* -pgf_fullform_lexicon(PgfConcr *concr, GuPool* pool) -{ - PgfFullFormState* st = gu_new(PgfFullFormState, pool); - st->en.next = gu_fullform_enum_next; - st->sequences = concr->sequences; - st->prefix = ""; - st->seq_idx = 0; - return &st->en; -} - -PGF_API GuString -pgf_fullform_get_string(PgfFullFormEntry* entry) -{ - return entry->tokens; -} - -PGF_API void -pgf_fullform_get_analyses(PgfFullFormEntry* entry, - PgfMorphoCallback* callback, GuExn* err) -{ - pgf_morpho_iter(entry->idx, callback, err); -} - -PGF_API GuEnum* -pgf_lookup_word_prefix(PgfConcr *concr, GuString prefix, - GuPool* pool, GuExn* err) -{ - if (concr->sequences == NULL) { - GuExnData* err_data = gu_raise(err, PgfExn); - if (err_data) { - err_data->data = "The concrete syntax is not loaded"; - return NULL; - } - } - - PgfFullFormState* state = gu_new(PgfFullFormState, pool); - state->en.next = gu_fullform_enum_next; - state->sequences = concr->sequences; - state->prefix = prefix; - state->seq_idx = 0; - - bool case_sensitive = - (gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL); - - PgfSequenceOrder order = { { pgf_sequence_cmp_fn }, case_sensitive }; - if (!gu_seq_binsearch_index(concr->sequences, &order.order, - PgfSequence, (void*) prefix, - &state->seq_idx)) { - state->seq_idx++; - } - - return &state->en; -} - PGF_API void pgf_parser_index(PgfConcr* concr, PgfCCat* ccat, PgfProduction prod, diff --git a/src/runtime/c/pgf/pgf.h b/src/runtime/c/pgf/pgf.h index 6dd040b49..8fdc52b62 100644 --- a/src/runtime/c/pgf/pgf.h +++ b/src/runtime/c/pgf/pgf.h @@ -167,6 +167,22 @@ PGF_API_DECL void pgf_lookup_morpho(PgfConcr *concr, GuString sentence, PgfMorphoCallback* callback, GuExn* err); +typedef struct { + size_t pos; + GuString ptr; +} PgfCohortSpot; + +typedef struct { + PgfCohortSpot start; + PgfCohortSpot end; + GuBuf* buf; +} PgfCohortRange; + +PGF_API_DECL GuEnum* +pgf_lookup_cohorts(PgfConcr *concr, GuString sentence, + PgfMorphoCallback* callback, + GuPool* pool, GuExn* err); + typedef struct PgfFullFormEntry PgfFullFormEntry; PGF_API_DECL GuEnum* diff --git a/src/runtime/c/pgf/scanner.c b/src/runtime/c/pgf/scanner.c new file mode 100644 index 000000000..71444ca25 --- /dev/null +++ b/src/runtime/c/pgf/scanner.c @@ -0,0 +1,435 @@ +#include +#include +#include + +PGF_INTERNAL int +cmp_string(GuString* psent, size_t* ppos, GuString tok, + bool case_sensitive) +{ + for (;;) { + GuUCS c2 = gu_utf8_decode((const uint8_t**) &tok); + if (c2 == 0) + return 0; + + const uint8_t* p = (uint8_t*) *psent; + GuUCS c1 = gu_utf8_decode(&p); + if (c1 == 0) + return -1; + + if (!case_sensitive) + c1 = gu_ucs_to_lower(c1); + + if (c1 != c2) + return (c1-c2); + + *psent = (GuString) p; + (*ppos)++; + } +} + +PGF_INTERNAL bool +skip_space(GuString* psent, size_t* ppos) +{ + const uint8_t* p = (uint8_t*) *psent; + if (!gu_ucs_is_space(gu_utf8_decode(&p))) + return false; + + *psent = (GuString) p; + (*ppos)++; + return true; +} + +PGF_INTERNAL int +pgf_symbols_cmp(GuString* psent, size_t* ppos, + PgfSymbols* syms, size_t* sym_idx, + bool case_sensitive) +{ + size_t n_syms = gu_seq_length(syms); + while (*sym_idx < n_syms) { + PgfSymbol sym = gu_seq_get(syms, PgfSymbol, *sym_idx); + + if (*sym_idx > 0) { + if (!skip_space(psent,ppos)) { + if (**psent == 0) + return -1; + return 1; + } + + while (**psent != 0) { + if (!skip_space(psent,ppos)) + break; + } + } + + GuVariantInfo inf = gu_variant_open(sym); + switch (inf.tag) { + case PGF_SYMBOL_CAT: + case PGF_SYMBOL_LIT: + case PGF_SYMBOL_VAR: { + if (**psent == 0) + return -1; + return 1; + } + case PGF_SYMBOL_KS: { + PgfSymbolKS* pks = inf.data; + if (**psent == 0) + return -1; + + int cmp = cmp_string(psent,ppos,pks->token, case_sensitive); + if (cmp != 0) + return cmp; + break; + } + case PGF_SYMBOL_KP: + case PGF_SYMBOL_BIND: + case PGF_SYMBOL_NE: + case PGF_SYMBOL_SOFT_BIND: + case PGF_SYMBOL_SOFT_SPACE: + case PGF_SYMBOL_CAPIT: + case PGF_SYMBOL_ALL_CAPIT: { + return -1; + } + default: + gu_impossible(); + } + + (*sym_idx)++; + } + + return 0; +} + +static void +pgf_morpho_iter(PgfProductionIdx* idx, + PgfMorphoCallback* callback, + GuExn* err) +{ + size_t n_entries = gu_buf_length(idx); + for (size_t i = 0; i < n_entries; i++) { + PgfProductionIdxEntry* entry = + gu_buf_index(idx, PgfProductionIdxEntry, i); + + PgfCId lemma = entry->papp->fun->absfun->name; + GuString analysis = entry->ccat->cnccat->labels[entry->lin_idx]; + + prob_t prob = entry->ccat->cnccat->abscat->prob + + entry->papp->fun->absfun->ep.prob; + callback->callback(callback, + lemma, analysis, prob, err); + if (!gu_ok(err)) + return; + } +} + +typedef struct { + GuOrder order; + bool case_sensitive; +} PgfSequenceOrder; + +static int +pgf_sequence_cmp_fn(GuOrder* order, const void* p1, const void* p2) +{ + PgfSequenceOrder* self = gu_container(order, PgfSequenceOrder, order); + + size_t pos = 0; + GuString sent = (GuString) p1; + + const PgfSequence* sp2 = p2; + + size_t sym_idx = 0; + int res = pgf_symbols_cmp(&sent, &pos, sp2->syms, &sym_idx, self->case_sensitive); + if (res == 0 && (*sent != 0 || sym_idx != gu_seq_length(sp2->syms))) { + res = 1; + } + + return res; +} + +PGF_API void +pgf_lookup_morpho(PgfConcr *concr, GuString sentence, + PgfMorphoCallback* callback, GuExn* err) +{ + if (concr->sequences == NULL) { + GuExnData* err_data = gu_raise(err, PgfExn); + if (err_data) { + err_data->data = "The concrete syntax is not loaded"; + return; + } + } + + bool case_sensitive = + (gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL); + + PgfSequenceOrder order = { { pgf_sequence_cmp_fn }, case_sensitive }; + PgfSequence* seq = (PgfSequence*) + gu_seq_binsearch(concr->sequences, &order.order, + PgfSequence, (void*) sentence); + + if (seq != NULL && seq->idx != NULL) + pgf_morpho_iter(seq->idx, callback, err); +} + +typedef struct { + GuEnum en; + PgfConcr* concr; + GuString sentence; + GuString current; + size_t len; + PgfMorphoCallback* callback; + GuExn* err; + bool case_sensitive; + GuBuf* spots; + GuBuf* found; +} PgfCohortsState; + +static int +cmp_cohort_spot(GuOrder* self, const void* a, const void* b) +{ + PgfCohortSpot *s1 = (PgfCohortSpot *) a; + PgfCohortSpot *s2 = (PgfCohortSpot *) b; + + return (s1->ptr-s2->ptr); +} + +static GuOrder +pgf_cohort_spot_order[1] = {{ cmp_cohort_spot }}; + +static void +pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot, + int i, int j, ptrdiff_t min, ptrdiff_t max) +{ + // This is a variation of a binary search algorithm which + // can retrieve all prefixes of a string with minimal + // comparisons, i.e. there is no need to lookup every + // prefix separately. + + while (i <= j) { + int k = (i+j) / 2; + PgfSequence* seq = gu_seq_index(state->concr->sequences, PgfSequence, k); + + PgfCohortSpot current = *spot; + + size_t sym_idx = 0; + int cmp = pgf_symbols_cmp(¤t.ptr, ¤t.pos, seq->syms, &sym_idx, state->case_sensitive); + if (cmp < 0) { + j = k-1; + } else if (cmp > 0) { + ptrdiff_t len = current.ptr - spot->ptr; + + if (min <= len) + pgf_lookup_cohorts_helper(state, spot, i, k-1, min, len); + + if (len+1 <= max) + pgf_lookup_cohorts_helper(state, spot, k+1, j, len+1, max); + + break; + } else { + ptrdiff_t len = current.ptr - spot->ptr; + + if (min <= len-1) + pgf_lookup_cohorts_helper(state, spot, i, k-1, min, len-1); + + if (seq->idx != NULL) { + PgfCohortRange* range = gu_buf_insert(state->found, 0); + range->start = *spot; + range->end = current; + range->buf = seq->idx; + + while (*current.ptr != 0) { + if (!skip_space(¤t.ptr, ¤t.pos)) + break; + } + + gu_buf_heap_push(state->spots, pgf_cohort_spot_order, ¤t); + } + + if (len+1 <= max) + pgf_lookup_cohorts_helper(state, spot, k+1, j, len+1, max); + + break; + } + } +} + +static void +pgf_lookup_cohorts_enum_next(GuEnum* self, void* to, GuPool* pool) +{ + PgfCohortsState* state = gu_container(self, PgfCohortsState, en); + + while (gu_buf_length(state->found) == 0 && + gu_buf_length(state->spots) > 0) { + PgfCohortSpot spot; + gu_buf_heap_pop(state->spots, pgf_cohort_spot_order, &spot); + + if (spot.ptr == state->current) + continue; + + if (*spot.ptr == 0) + break; + + pgf_lookup_cohorts_helper + (state, &spot, + 0, gu_seq_length(state->concr->sequences)-1, + 1, (state->sentence+state->len)-spot.ptr); + + if (gu_buf_length(state->found) == 0) { + gu_utf8_decode((const uint8_t**) &spot.ptr); + spot.pos++; + gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &spot); + } + } + + PgfCohortRange* pRes = (PgfCohortRange*)to; + + if (gu_buf_length(state->found) == 0) { + pRes->start.pos = 0; + pRes->start.ptr = NULL; + pRes->end.pos = 0; + pRes->end.ptr = NULL; + pRes->buf = NULL; + state->current = NULL; + return; + } else { + *pRes = gu_buf_pop(state->found, PgfCohortRange); + state->current = pRes->start.ptr; + pgf_morpho_iter(pRes->buf, state->callback, state->err); + } +} + +PGF_API GuEnum* +pgf_lookup_cohorts(PgfConcr *concr, GuString sentence, + PgfMorphoCallback* callback, + GuPool* pool, GuExn* err) +{ + if (concr->sequences == NULL) { + GuExnData* err_data = gu_raise(err, PgfExn); + if (err_data) { + err_data->data = "The concrete syntax is not loaded"; + return NULL; + } + } + + bool case_sensitive = + (gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL); + + PgfCohortsState* state = gu_new(PgfCohortsState, pool); + state->en.next = pgf_lookup_cohorts_enum_next; + state->concr = concr; + state->sentence= sentence; + state->len = strlen(sentence); + state->callback= callback; + state->err = err; + state->case_sensitive = case_sensitive; + state->spots = gu_new_buf(PgfCohortSpot, pool); + state->found = gu_new_buf(PgfCohortRange, pool); + + PgfCohortSpot spot = {0,sentence}; + while (*spot.ptr != 0) { + if (!skip_space(&spot.ptr, &spot.pos)) + break; + } + + gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &spot); + + return &state->en; +} + +typedef struct { + GuEnum en; + PgfSequences* sequences; + GuString prefix; + size_t seq_idx; +} PgfFullFormState; + +struct PgfFullFormEntry { + GuString tokens; + PgfProductionIdx* idx; +}; + +static void +gu_fullform_enum_next(GuEnum* self, void* to, GuPool* pool) +{ + PgfFullFormState* st = gu_container(self, PgfFullFormState, en); + PgfFullFormEntry* entry = NULL; + + if (st->sequences != NULL) { + size_t n_seqs = gu_seq_length(st->sequences); + while (st->seq_idx < n_seqs) { + PgfSequence* seq = gu_seq_index(st->sequences, PgfSequence, st->seq_idx); + GuString tokens = pgf_get_tokens(seq->syms, 0, pool); + + if (!gu_string_is_prefix(st->prefix, tokens)) { + st->seq_idx = n_seqs; + break; + } + + if (*tokens != 0 && seq->idx != NULL) { + entry = gu_new(PgfFullFormEntry, pool); + entry->tokens = tokens; + entry->idx = seq->idx; + + st->seq_idx++; + break; + } + + st->seq_idx++; + } + } + + *((PgfFullFormEntry**) to) = entry; +} + +PGF_API GuEnum* +pgf_fullform_lexicon(PgfConcr *concr, GuPool* pool) +{ + PgfFullFormState* st = gu_new(PgfFullFormState, pool); + st->en.next = gu_fullform_enum_next; + st->sequences = concr->sequences; + st->prefix = ""; + st->seq_idx = 0; + return &st->en; +} + +PGF_API GuString +pgf_fullform_get_string(PgfFullFormEntry* entry) +{ + return entry->tokens; +} + +PGF_API void +pgf_fullform_get_analyses(PgfFullFormEntry* entry, + PgfMorphoCallback* callback, GuExn* err) +{ + pgf_morpho_iter(entry->idx, callback, err); +} + +PGF_API GuEnum* +pgf_lookup_word_prefix(PgfConcr *concr, GuString prefix, + GuPool* pool, GuExn* err) +{ + if (concr->sequences == NULL) { + GuExnData* err_data = gu_raise(err, PgfExn); + if (err_data) { + err_data->data = "The concrete syntax is not loaded"; + return NULL; + } + } + + PgfFullFormState* state = gu_new(PgfFullFormState, pool); + state->en.next = gu_fullform_enum_next; + state->sequences = concr->sequences; + state->prefix = prefix; + state->seq_idx = 0; + + bool case_sensitive = + (gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL); + + PgfSequenceOrder order = { { pgf_sequence_cmp_fn }, case_sensitive }; + if (!gu_seq_binsearch_index(concr->sequences, &order.order, + PgfSequence, (void*) prefix, + &state->seq_idx)) { + state->seq_idx++; + } + + return &state->en; +} diff --git a/src/runtime/haskell-bind/PGF2.hsc b/src/runtime/haskell-bind/PGF2.hsc index 75afabb3d..5644b6ce8 100644 --- a/src/runtime/haskell-bind/PGF2.hsc +++ b/src/runtime/haskell-bind/PGF2.hsc @@ -70,7 +70,7 @@ module PGF2 (-- * PGF -- ** Generation generateAll, -- ** Morphological Analysis - MorphoAnalysis, lookupMorpho, fullFormLexicon, + MorphoAnalysis, lookupMorpho, lookupCohorts, fullFormLexicon, -- ** Visualizations GraphvizOptions(..), graphvizDefaults, graphvizAbstractTree, graphvizParseTree, graphvizWordAlignment, @@ -481,6 +481,36 @@ lookupMorpho (Concr concr master) sent = freeHaskellFunPtr fptr readIORef ref +lookupCohorts :: Concr -> String -> [(Int,[MorphoAnalysis],Int)] +lookupCohorts lang@(Concr concr master) sent = + unsafePerformIO $ + do pl <- gu_new_pool + ref <- newIORef [] + cback <- gu_malloc pl (#size PgfMorphoCallback) + fptr <- wrapLookupMorphoCallback (getAnalysis ref) + (#poke PgfMorphoCallback, callback) cback fptr + c_sent <- newUtf8CString sent pl + enum <- pgf_lookup_cohorts concr c_sent cback pl nullPtr + fpl <- newForeignPtr gu_pool_finalizer pl + fromCohortRange enum fpl fptr ref + where + fromCohortRange enum fpl fptr ref = + allocaBytes (#size PgfCohortRange) $ \ptr -> + withForeignPtr fpl $ \pl -> + do gu_enum_next enum ptr pl + buf <- (#peek PgfCohortRange, buf) ptr + if buf == nullPtr + then do finalizeForeignPtr fpl + freeHaskellFunPtr fptr + touchConcr lang + return [] + else do start <- (#peek PgfCohortRange, start.pos) ptr + end <- (#peek PgfCohortRange, end.pos) ptr + ans <- readIORef ref + writeIORef ref [] + cohs <- unsafeInterleaveIO (fromCohortRange enum fpl fptr ref) + return ((start,ans,end):cohs) + fullFormLexicon :: Concr -> [(String, [MorphoAnalysis])] fullFormLexicon lang = unsafePerformIO $ diff --git a/src/runtime/haskell-bind/PGF2/FFI.hsc b/src/runtime/haskell-bind/PGF2/FFI.hsc index 39b18fcf3..713adcecc 100644 --- a/src/runtime/haskell-bind/PGF2/FFI.hsc +++ b/src/runtime/haskell-bind/PGF2/FFI.hsc @@ -100,7 +100,7 @@ foreign import ccall unsafe "gu/string.h gu_string_buf_out" foreign import ccall unsafe "gu/file.h gu_file_in" gu_file_in :: Ptr () -> Ptr GuPool -> IO (Ptr GuIn) -foreign import ccall unsafe "gu/enum.h gu_enum_next" +foreign import ccall safe "gu/enum.h gu_enum_next" gu_enum_next :: Ptr a -> Ptr (Ptr b) -> Ptr GuPool -> IO () foreign import ccall unsafe "gu/string.h gu_string_buf_freeze" @@ -401,6 +401,9 @@ foreign import ccall "pgf/pgf.h pgf_parse_with_oracle" foreign import ccall "pgf/pgf.h pgf_lookup_morpho" pgf_lookup_morpho :: Ptr PgfConcr -> CString -> Ptr PgfMorphoCallback -> Ptr GuExn -> IO () +foreign import ccall "pgf/pgf.h pgf_lookup_cohorts" + pgf_lookup_cohorts :: Ptr PgfConcr -> CString -> Ptr PgfMorphoCallback -> Ptr GuPool -> Ptr GuExn -> IO (Ptr GuEnum) + type LookupMorphoCallback = Ptr PgfMorphoCallback -> CString -> CString -> Float -> Ptr GuExn -> IO () foreign import ccall "wrapper"