Merge with master and drop the Haskell runtime completely

This commit is contained in:
krangelov
2019-09-19 22:01:57 +02:00
488 changed files with 8762 additions and 39251 deletions

View File

@@ -344,8 +344,9 @@ struct PgfCCat {
PgfCncFuns* linrefs;
size_t n_synprods;
PgfProductionSeq* prods;
float viterbi_prob;
prob_t viterbi_prob;
int fid;
int chunk_count;
PgfItemConts* conts;
struct PgfAnswers* answers;
GuFinalizer fin[0];

View File

@@ -198,16 +198,16 @@ pgf_literal_hash(GuHash h, PgfLiteral lit);
PGF_API_DECL GuHash
pgf_expr_hash(GuHash h, PgfExpr e);
PGF_API size_t
PGF_API_DECL size_t
pgf_expr_size(PgfExpr expr);
PGF_API GuSeq*
PGF_API_DECL GuSeq*
pgf_expr_functions(PgfExpr expr, GuPool* pool);
PGF_API PgfExpr
PGF_API_DECL PgfExpr
pgf_expr_substitute(PgfExpr expr, GuSeq* meta_values, GuPool* pool);
PGF_API PgfType*
PGF_API_DECL PgfType*
pgf_type_substitute(PgfType* type, GuSeq* meta_values, GuPool* pool);
typedef struct PgfPrintContext PgfPrintContext;

View File

@@ -5,9 +5,6 @@
#include <pgf/reasoner.h>
#include <pgf/reader.h>
#include "lightning.h"
#if defined(__MINGW32__) || defined(_MSC_VER)
#include <malloc.h>
#endif
//#define PGF_JIT_DEBUG
@@ -43,18 +40,6 @@ typedef struct {
#define JIT_VSTATE JIT_V1
#define JIT_VCLOS JIT_V2
#if defined(__MINGW32__) || defined(_MSC_VER)
#include <windows.h>
static int
getpagesize()
{
SYSTEM_INFO system_info;
GetSystemInfo(&system_info);
return system_info.dwPageSize;
}
#endif
static void
pgf_jit_finalize_page(GuFinalizer* self)
@@ -65,19 +50,8 @@ pgf_jit_finalize_page(GuFinalizer* self)
static void
pgf_jit_alloc_page(PgfReader* rdr)
{
void *page;
size_t page_size = getpagesize();
#if defined(ANDROID)
if ((page = memalign(page_size, page_size)) == NULL) {
#elif defined(__MINGW32__) || defined(_MSC_VER)
if ((page = malloc(page_size)) == NULL) {
#else
if (posix_memalign(&page, page_size, page_size) != 0) {
#endif
gu_fatal("Memory allocation failed");
}
size_t page_size;
void *page = gu_mem_page_alloc(sizeof(GuFinalizer), &page_size);
GuFinalizer* fin = page;
fin->fn = pgf_jit_finalize_page;

File diff suppressed because it is too large Load Diff

View File

@@ -162,6 +162,22 @@ PGF_API_DECL void
pgf_lookup_morpho(PgfConcr *concr, GuString sentence,
PgfMorphoCallback* callback, GuExn* err);
/* A position inside a sentence that is being scanned for cohorts. */
typedef struct {
/* Counter that the scanner increments once for every code point
 * it consumes while advancing ptr. */
size_t pos;
/* Pointer to the not yet consumed remainder of the sentence. */
GuString ptr;
} PgfCohortSpot;
/* One match produced by the cohort lookup: the span of the sentence
 * that was matched together with the index entries of the matching
 * sequence. */
typedef struct {
/* Spot at which the match begins. */
PgfCohortSpot start;
/* Spot reached once the matching sequence has been consumed. */
PgfCohortSpot end;
/* Index entries of the matched sequence; the enumeration sets this
 * (and both spot pointers) to NULL when it has nothing more to report. */
GuBuf* buf;
} PgfCohortRange;
/* Start a cohort lookup over sentence using the sequences of concr.
 * The returned enumeration is allocated from pool and writes one
 * PgfCohortRange per step; the analyses belonging to each range are
 * delivered through callback while the enumeration advances.
 * Raises PgfExn through err when the concrete syntax is not loaded. */
PGF_API_DECL GuEnum*
pgf_lookup_cohorts(PgfConcr *concr, GuString sentence,
PgfMorphoCallback* callback,
GuPool* pool, GuExn* err);
typedef struct PgfFullFormEntry PgfFullFormEntry;
PGF_API_DECL GuEnum*

View File

@@ -98,6 +98,74 @@ pgf_print_fid(int fid, GuOut* out, GuExn* err)
gu_printf(out, err, "C%d", fid);
}
/* Print the argument list of a production as a comma separated list.
 * Every argument is printed as the fid of its category; when the
 * argument carries a non-empty list of hypotheses, those fids are
 * printed first, each followed by a space, and then "-> ". */
PGF_INTERNAL void
pgf_print_production_args(PgfPArgs* args,
                          GuOut* out, GuExn* err)
{
	const size_t arg_count = gu_seq_length(args);

	for (size_t ai = 0; ai < arg_count; ai++) {
		if (ai != 0)
			gu_putc(',',out,err);

		PgfPArg parg = gu_seq_get(args, PgfPArg, ai);

		/* A missing hypothesis list is treated like an empty one. */
		size_t hypo_count =
			(parg.hypos != NULL) ? gu_seq_length(parg.hypos) : 0;
		if (hypo_count > 0) {
			for (size_t hi = 0; hi < hypo_count; hi++) {
				PgfCCat *hypo_cat = gu_seq_get(parg.hypos, PgfCCat*, hi);
				pgf_print_fid(hypo_cat->fid, out, err);
				gu_putc(' ',out,err);
			}
			gu_puts("-> ",out,err);
		}

		pgf_print_fid(parg.ccat->fid, out, err);
	}
}
/* Print a single production of the category fid on one line, in the
 * form "<fid> -> <right hand side>\n". The right hand side depends on
 * the kind of production:
 *   - application: "F<funid>(<expr or linref name>)[<args>]"
 *   - coercion:    "_[<fid>]"
 *   - extern:      "<extern>(<expr>)[]"
 */
PGF_INTERNAL void
pgf_print_production(int fid, PgfProduction prod,
                     GuOut *out, GuExn* err)
{
	pgf_print_fid(fid, out, err);
	gu_puts(" -> ", out, err);

	GuVariantInfo info = gu_variant_open(prod);

	if (info.tag == PGF_PRODUCTION_APPLY) {
		PgfProductionApply* apply = info.data;
		gu_printf(out,err,"F%d(",apply->fun->funid);
		if (apply->fun->ep == NULL) {
			/* No abstract expression attached: name the category of
			 * the first argument instead. */
			PgfPArg* first = gu_seq_index(apply->args, PgfPArg, 0);
			gu_printf(out,err,"linref %s", first->ccat->cnccat->abscat->name);
		} else {
			pgf_print_expr(apply->fun->ep->expr, NULL, 0, out, err);
		}
		gu_printf(out,err,")[");
		pgf_print_production_args(apply->args,out,err);
		gu_printf(out,err,"]\n");
	} else if (info.tag == PGF_PRODUCTION_COERCE) {
		PgfProductionCoerce* coercion = info.data;
		gu_puts("_[",out,err);
		pgf_print_fid(coercion->coerce->fid, out, err);
		gu_puts("]\n",out,err);
	} else if (info.tag == PGF_PRODUCTION_EXTERN) {
		PgfProductionExtern* external = info.data;
		gu_printf(out,err,"<extern>(");
		pgf_print_expr(external->ep->expr, NULL, 0, out, err);
		gu_printf(out,err,")[]\n");
	} else {
		gu_impossible();
	}
}
static void
pgf_print_productions(GuMapItor* fn, const void* key, void* value,
GuExn* err)
@@ -111,48 +179,7 @@ pgf_print_productions(GuMapItor* fn, const void* key, void* value,
size_t n_prods = gu_seq_length(ccat->prods);
for (size_t i = 0; i < n_prods; i++) {
PgfProduction prod = gu_seq_get(ccat->prods, PgfProduction, i);
gu_puts(" ", out, err);
pgf_print_fid(fid, out, err);
gu_puts(" -> ", out, err);
GuVariantInfo i = gu_variant_open(prod);
switch (i.tag) {
case PGF_PRODUCTION_APPLY: {
PgfProductionApply* papp = i.data;
gu_printf(out,err,"F%d[",papp->fun->funid);
size_t n_args = gu_seq_length(papp->args);
for (size_t j = 0; j < n_args; j++) {
if (j > 0)
gu_putc(',',out,err);
PgfPArg arg = gu_seq_get(papp->args, PgfPArg, j);
if (arg.hypos != NULL) {
size_t n_hypos = gu_seq_length(arg.hypos);
for (size_t k = 0; k < n_hypos; k++) {
if (k > 0)
gu_putc(' ',out,err);
PgfCCat *hypo = gu_seq_get(arg.hypos, PgfCCat*, k);
pgf_print_fid(hypo->fid, out, err);
}
}
pgf_print_fid(arg.ccat->fid, out, err);
}
gu_printf(out,err,"]\n");
break;
}
case PGF_PRODUCTION_COERCE: {
PgfProductionCoerce* pcoerce = i.data;
gu_puts("_[", out, err);
pgf_print_fid(pcoerce->coerce->fid, out, err);
gu_puts("]\n", out, err);
break;
}
default:
gu_impossible();
}
pgf_print_production(fid, prod, out, err);
}
}
}

View File

@@ -328,16 +328,20 @@ pgf_read_patt(PgfReader* rdr)
uint8_t tag = pgf_read_tag(rdr);
switch (tag) {
case PGF_PATT_APP: {
PgfCId ctor = pgf_read_cid(rdr, rdr->opool);
gu_return_on_exn(rdr->err, gu_null_variant);
size_t n_args = pgf_read_len(rdr);
gu_return_on_exn(rdr->err, gu_null_variant);
PgfPattApp *papp =
gu_new_variant(PGF_PATT_APP,
PgfPattApp,
&patt, rdr->opool);
papp->ctor = pgf_read_cid(rdr, rdr->opool);
gu_return_on_exn(rdr->err, gu_null_variant);
papp->n_args = pgf_read_len(rdr);
gu_return_on_exn(rdr->err, gu_null_variant);
gu_new_flex_variant(PGF_PATT_APP,
PgfPattApp,
args, n_args,
&patt, rdr->opool);
papp->ctor = ctor;
papp->n_args = n_args;
for (size_t i = 0; i < papp->n_args; i++) {
papp->args[i] = pgf_read_patt(rdr);
gu_return_on_exn(rdr->err, gu_null_variant);
@@ -840,6 +844,7 @@ pgf_read_fid(PgfReader* rdr, PgfConcr* concr)
ccat->prods = NULL;
ccat->viterbi_prob = 0;
ccat->fid = fid;
ccat->chunk_count = 1;
ccat->conts = NULL;
ccat->answers = NULL;
@@ -1077,6 +1082,7 @@ pgf_read_cnccat(PgfReader* rdr, PgfAbstr* abstr, PgfConcr* concr, PgfCId name)
ccat->prods = NULL;
ccat->viterbi_prob = 0;
ccat->fid = fid;
ccat->chunk_count = 1;
ccat->conts = NULL;
ccat->answers = NULL;

516
src/runtime/c/pgf/scanner.c Normal file
View File

@@ -0,0 +1,516 @@
#include <pgf/data.h>
#include <pgf/expr.h>
#include <pgf/linearizer.h>
#include <gu/utf8.h>
/* Compare the text at spot against the token tok, one code point at
 * a time.
 *
 * Returns 0 as soon as the whole token has been matched, -1 when the
 * text at spot ends before the token does, and otherwise the
 * difference between the first pair of differing code points
 * (lower-cased first unless case_sensitive is set).
 *
 * spot is advanced past every code point that matched; the code
 * point that caused a non-zero result is not consumed. */
PGF_INTERNAL int
cmp_string(PgfCohortSpot* spot, GuString tok,
           bool case_sensitive)
{
	const uint8_t* tok_cursor = (const uint8_t*) tok;

	while (true) {
		GuUCS tok_ch = gu_utf8_decode(&tok_cursor);
		if (tok_ch == 0)
			return 0;

		/* Decode from a scratch cursor so that spot only moves once
		 * the code point is known to match. */
		const uint8_t* text_cursor = (const uint8_t*) spot->ptr;
		GuUCS text_ch = gu_utf8_decode(&text_cursor);
		if (text_ch == 0)
			return -1;

		if (!case_sensitive) {
			text_ch = gu_ucs_to_lower(text_ch);
			tok_ch  = gu_ucs_to_lower(tok_ch);
		}

		if (text_ch != tok_ch)
			return (text_ch - tok_ch);

		spot->ptr = (GuString) text_cursor;
		spot->pos++;
	}
}
/* Consume one code point from *psent if it is a space.
 * On success *psent is moved past the space, *ppos is incremented
 * and true is returned; otherwise nothing is modified and the
 * result is false. */
PGF_INTERNAL bool
skip_space(GuString* psent, size_t* ppos)
{
	const uint8_t* cursor = (const uint8_t*) *psent;
	GuUCS ch = gu_utf8_decode(&cursor);

	if (gu_ucs_is_space(ch)) {
		*psent = (GuString) cursor;
		(*ppos)++;
		return true;
	}
	return false;
}
/* Compare the text at spot with the symbols of a sequence, starting
 * at symbol *sym_idx.
 *
 * Consecutive symbols must be separated by at least one space in the
 * text; any further spaces are skipped. Only token symbols
 * (PGF_SYMBOL_KS) can match; they are compared with cmp_string.
 *
 * Returns 0 when every remaining symbol matched, a negative value
 * when the text sorts before the sequence and a positive value when
 * it sorts after it. Both spot and *sym_idx are left at the point
 * where the comparison stopped. */
PGF_INTERNAL int
pgf_symbols_cmp(PgfCohortSpot* spot,
                PgfSymbols* syms, size_t* sym_idx,
                bool case_sensitive)
{
	const size_t total = gu_seq_length(syms);

	for (; *sym_idx < total; (*sym_idx)++) {
		PgfSymbol symbol = gu_seq_get(syms, PgfSymbol, *sym_idx);

		if (*sym_idx > 0) {
			/* At least one separating space is mandatory ... */
			if (!skip_space(&spot->ptr,&spot->pos))
				return (*spot->ptr == 0) ? -1 : 1;

			/* ... and any additional ones are ignored. */
			while (*spot->ptr != 0 &&
			       skip_space(&spot->ptr,&spot->pos)) {
			}
		}

		GuVariantInfo info = gu_variant_open(symbol);
		switch (info.tag) {
		case PGF_SYMBOL_KS: {
			if (*spot->ptr == 0)
				return -1;

			PgfSymbolKS* ks = info.data;
			int diff = cmp_string(spot,ks->token, case_sensitive);
			if (diff != 0)
				return diff;
			break;
		}
		case PGF_SYMBOL_CAT:
		case PGF_SYMBOL_LIT:
		case PGF_SYMBOL_VAR:
			return (*spot->ptr == 0) ? -1 : 1;
		case PGF_SYMBOL_KP:
		case PGF_SYMBOL_BIND:
		case PGF_SYMBOL_NE:
		case PGF_SYMBOL_SOFT_BIND:
		case PGF_SYMBOL_SOFT_SPACE:
		case PGF_SYMBOL_CAPIT:
		case PGF_SYMBOL_ALL_CAPIT:
			return -1;
		default:
			gu_impossible();
		}
	}

	return 0;
}
/* Report every entry of a production index to the callback.
 * For each entry the callback receives the name of the abstract
 * function (the lemma), the label of the linearization field (the
 * analysis) and the sum of the category and function probabilities.
 * Iteration stops after the first callback that leaves err raised. */
static void
pgf_morpho_iter(PgfProductionIdx* idx,
                PgfMorphoCallback* callback,
                GuExn* err)
{
	const size_t total = gu_buf_length(idx);

	for (size_t pos = 0; pos < total; pos++) {
		PgfProductionIdxEntry* item =
			gu_buf_index(idx, PgfProductionIdxEntry, pos);

		PgfCId lemma = item->papp->fun->absfun->name;
		GuString analysis = item->ccat->cnccat->labels[item->lin_idx];
		prob_t prob = item->ccat->cnccat->abscat->prob +
		              item->papp->fun->absfun->ep.prob;

		callback->callback(callback,
		                   lemma, analysis, prob, err);
		if (!gu_ok(err))
			break;
	}
}
/* Ordering used to binary-search the sequences of a concrete syntax
 * with a plain string as the key. */
typedef struct {
/* Embedded order object; pgf_sequence_cmp_fn recovers the enclosing
 * PgfSequenceOrder from a pointer to this member. */
GuOrder order;
/* When false, code points are lower-cased before being compared. */
bool case_sensitive;
} PgfSequenceOrder;
/* Tell whether lookups in concr are case sensitive.
 * They are, unless the concrete syntax has a "case_sensitive" flag
 * whose value is the string literal "off". */
PGF_INTERNAL bool
pgf_is_case_sensitive(PgfConcr* concr)
{
	PgfFlag* flag =
		gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive");
	if (flag == NULL)
		return true;

	GuVariantInfo info = gu_variant_open(flag->value);
	if (info.tag != PGF_LITERAL_STR)
		return true;

	PgfLiteralStr* str_lit = info.data;
	return strcmp(str_lit->val, "off") != 0;
}
/* Comparison callback of PgfSequenceOrder: p1 is the string being
 * searched for and p2 the PgfSequence it is compared with.
 * A result of 0 requires an exact match, i.e. the whole string and
 * all symbols of the sequence must have been consumed; a match that
 * leaves either side unfinished is reported as 1. */
static int
pgf_sequence_cmp_fn(GuOrder* order, const void* p1, const void* p2)
{
	PgfSequenceOrder* self = gu_container(order, PgfSequenceOrder, order);
	const PgfSequence* seq = p2;

	PgfCohortSpot spot = {0, (GuString) p1};
	size_t sym_idx = 0;
	int res = pgf_symbols_cmp(&spot, seq->syms, &sym_idx, self->case_sensitive);
	if (res != 0)
		return res;

	bool exact = (*spot.ptr == 0) &&
	             (sym_idx == gu_seq_length(seq->syms));
	return exact ? 0 : 1;
}
/* Look up the analyses of a complete word or phrase.
 *
 * The sequences of concr are binary-searched for an entry matching
 * sentence, and every index entry of the match is reported through
 * callback. With case-insensitive matching the neighbours of the
 * found entry are examined too, since several entries may differ
 * only in case.
 *
 * Raises PgfExn through err when the concrete syntax is not loaded.
 * Nothing further is reported once the callback has left err raised. */
PGF_API void
pgf_lookup_morpho(PgfConcr *concr, GuString sentence,
                  PgfMorphoCallback* callback, GuExn* err)
{
	if (concr->sequences == NULL) {
		GuExnData* err_data = gu_raise(err, PgfExn);
		if (err_data) {
			err_data->data = "The concrete syntax is not loaded";
		}
		/* Always bail out: without a data slot the exception is still
		 * raised, and continuing would use the NULL sequences. */
		return;
	}

	size_t index = 0;
	PgfSequenceOrder order = { { pgf_sequence_cmp_fn },
	                           pgf_is_case_sensitive(concr) };
	if (!gu_seq_binsearch_index(concr->sequences, &order.order,
	                            PgfSequence, (void*) sentence,
	                            &index)) {
		return;
	}

	PgfSequence* seq = NULL;

	/* If the match is case-insensitive then there might be more
	 * matches around the current index. We must check the neighbour
	 * sequences for matching as well.
	 *
	 * NOTE(review): the neighbour test only uses pgf_symbols_cmp, so
	 * unlike pgf_sequence_cmp_fn it does not require the whole
	 * sentence to be consumed - confirm that this is intended.
	 */
	if (!order.case_sensitive) {
		size_t i = index;
		while (i > 0) {
			seq = gu_seq_index(concr->sequences, PgfSequence, i-1);

			size_t sym_idx = 0;
			PgfCohortSpot spot = {0, sentence};
			if (pgf_symbols_cmp(&spot, seq->syms, &sym_idx, order.case_sensitive) != 0) {
				break;
			}

			if (seq->idx != NULL) {
				pgf_morpho_iter(seq->idx, callback, err);
				if (!gu_ok(err))
					return;
			}

			i--;
		}
	}

	seq = gu_seq_index(concr->sequences, PgfSequence, index);
	if (seq->idx != NULL) {
		pgf_morpho_iter(seq->idx, callback, err);
		if (!gu_ok(err))
			return;
	}

	if (!order.case_sensitive) {
		size_t n_seqs = gu_seq_length(concr->sequences);
		for (size_t i = index+1; i < n_seqs; i++) {
			seq = gu_seq_index(concr->sequences, PgfSequence, i);

			size_t sym_idx = 0;
			PgfCohortSpot spot = {0, sentence};
			if (pgf_symbols_cmp(&spot, seq->syms, &sym_idx, order.case_sensitive) != 0) {
				break;
			}

			if (seq->idx != NULL) {
				pgf_morpho_iter(seq->idx, callback, err);
				if (!gu_ok(err))
					return;
			}
		}
	}
}
/* State of one cohort enumeration created by pgf_lookup_cohorts. */
typedef struct {
/* Enumeration header; the state is recovered from a pointer to it. */
GuEnum en;
/* Concrete syntax whose sequences are searched. */
PgfConcr* concr;
/* The sentence being analysed. */
GuString sentence;
/* Start pointer of the range returned last; set to NULL once the
 * enumeration has reported that it is exhausted. */
GuString current;
/* strlen(sentence). */
size_t len;
/* Receives the analyses of every range that is returned. */
PgfMorphoCallback* callback;
/* Exception frame handed to the callback. */
GuExn* err;
/* Whether tokens are compared case-sensitively. */
bool case_sensitive;
/* Heap of PgfCohortSpot values (ordered with pgf_cohort_spot_order)
 * at which a lookup still has to be attempted. */
GuBuf* spots;
/* PgfCohortRange values found but not yet returned. */
GuBuf* found;
} PgfCohortsState;
/* Order two PgfCohortSpot values by their position in the sentence.
 * Returns -1, 0 or 1. The pointers are compared directly rather than
 * subtracted, so the result can never be distorted by narrowing a
 * ptrdiff_t difference to int. */
static int
cmp_cohort_spot(GuOrder* self, const void* a, const void* b)
{
	const PgfCohortSpot *s1 = a;
	const PgfCohortSpot *s2 = b;

	(void) self;
	return (s1->ptr > s2->ptr) - (s1->ptr < s2->ptr);
}
/* Order object wrapping cmp_cohort_spot; used for the heap of
 * pending spots kept in PgfCohortsState. */
static GuOrder
pgf_cohort_spot_order[1] = {{ cmp_cohort_spot }};
/* Collect every sequence with index in [i, j] that matches the text
 * starting at spot. Each match with a non-empty index is inserted
 * into state->found, and the spot following the match (after
 * skipping spaces) is pushed onto state->spots.
 *
 * min and max are compared against the number of bytes matched at
 * the probed sequence to decide which halves are still worth
 * searching. NOTE(review): they look like bounds on the match length
 * possible within the sub-range - confirm. */
static void
pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot,
int i, int j, ptrdiff_t min, ptrdiff_t max)
{
// This is a variation of a binary search algorithm which
// can retrieve all prefixes of a string with minimal
// comparisons, i.e. there is no need to lookup every
// prefix separately.
while (i <= j) {
int k = (i+j) / 2;
PgfSequence* seq = gu_seq_index(state->concr->sequences, PgfSequence, k);
// Compare on a copy so that *spot itself stays at the start.
PgfCohortSpot current = *spot;
size_t sym_idx = 0;
int cmp = pgf_symbols_cmp(&current, seq->syms, &sym_idx, state->case_sensitive);
if (cmp < 0) {
// The text sorts before sequence k: continue in the lower half.
j = k-1;
} else if (cmp > 0) {
// The text sorts after sequence k; len is the number of bytes
// that matched before the comparison failed. Both halves are
// searched recursively with adjusted bounds.
ptrdiff_t len = current.ptr - spot->ptr;
if (min <= len)
pgf_lookup_cohorts_helper(state, spot, i, k-1, min, len);
if (len+1 <= max)
pgf_lookup_cohorts_helper(state, spot, k+1, j, len+1, max);
break;
} else {
// All symbols of sequence k were matched at spot.
ptrdiff_t len = current.ptr - spot->ptr;
if (min <= len)
pgf_lookup_cohorts_helper(state, spot, i, k-1, min, len);
if (seq->idx != NULL && gu_buf_length(seq->idx) > 0) {
PgfCohortRange* range = gu_buf_insert(state->found, 0);
range->start = *spot;
range->end = current;
range->buf = seq->idx;
}
// Skip the spaces after the match and schedule a lookup
// from the spot that follows.
while (*current.ptr != 0) {
if (!skip_space(&current.ptr, &current.pos))
break;
}
gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &current);
if (len <= max)
pgf_lookup_cohorts_helper(state, spot, k+1, j, len, max);
break;
}
}
}
/* Step function of the cohort enumeration.
 * Writes the next PgfCohortRange into *to and passes its analyses
 * to the callback stored in the state. When nothing is left, a range
 * whose pointers are all NULL is written instead. The pool argument
 * is not used. */
static void
pgf_lookup_cohorts_enum_next(GuEnum* self, void* to, GuPool* pool)
{
PgfCohortsState* state = gu_container(self, PgfCohortsState, en);
// Run lookups from the pending spots until one yields a match
// or no spots remain.
while (gu_buf_length(state->found) == 0 &&
gu_buf_length(state->spots) > 0) {
PgfCohortSpot spot;
gu_buf_heap_pop(state->spots, pgf_cohort_spot_order, &spot);
// Ranges starting at this position were returned last time.
if (spot.ptr == state->current)
continue;
// End of the sentence.
if (*spot.ptr == 0)
break;
pgf_lookup_cohorts_helper
(state, &spot,
0, gu_seq_length(state->concr->sequences)-1,
1, (state->sentence+state->len)-spot.ptr);
if (gu_buf_length(state->found) == 0) {
// skip one character and try again
gu_utf8_decode((const uint8_t**) &spot.ptr);
spot.pos++;
gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &spot);
}
}
PgfCohortRange* pRes = (PgfCohortRange*)to;
if (gu_buf_length(state->found) == 0) {
// Nothing left: report an all-NULL range.
pRes->start.pos = 0;
pRes->start.ptr = NULL;
pRes->end.pos = 0;
pRes->end.ptr = NULL;
pRes->buf = NULL;
state->current = NULL;
return;
} else do {
// Pop ranges for as long as the next one ends at the same
// position; the analyses of each are sent to the callback and
// *pRes is left holding the range popped last.
*pRes = gu_buf_pop(state->found, PgfCohortRange);
state->current = pRes->start.ptr;
pgf_morpho_iter(pRes->buf, state->callback, state->err);
} while (gu_buf_length(state->found) > 0 &&
gu_buf_index_last(state->found, PgfCohortRange)->end.ptr == pRes->end.ptr);
}
/* Start a cohort lookup over sentence.
 *
 * Returns an enumeration, allocated from pool, that writes one
 * PgfCohortRange per step and reports the analyses of each range
 * through callback. The sentence, callback and err pointers are
 * stored in the enumeration state and used again on every step.
 *
 * Raises PgfExn through err and returns NULL when the concrete
 * syntax is not loaded. */
PGF_API GuEnum*
pgf_lookup_cohorts(PgfConcr *concr, GuString sentence,
                   PgfMorphoCallback* callback,
                   GuPool* pool, GuExn* err)
{
	if (concr->sequences == NULL) {
		GuExnData* err_data = gu_raise(err, PgfExn);
		if (err_data) {
			err_data->data = "The concrete syntax is not loaded";
		}
		/* Always bail out: without a data slot the exception is still
		 * raised, and the enumeration could not work without sequences. */
		return NULL;
	}

	PgfCohortsState* state = gu_new(PgfCohortsState, pool);
	state->en.next = pgf_lookup_cohorts_enum_next;
	state->concr   = concr;
	state->sentence= sentence;
	/* The step function compares against this field before it ever
	 * assigns it, so it must start out with a defined value. */
	state->current = NULL;
	state->len     = strlen(sentence);
	state->callback= callback;
	state->err     = err;
	state->case_sensitive = pgf_is_case_sensitive(concr);
	state->spots = gu_new_buf(PgfCohortSpot, pool);
	state->found = gu_new_buf(PgfCohortRange, pool);

	/* The first lookup starts after any leading spaces. */
	PgfCohortSpot spot = {0,sentence};
	while (*spot.ptr != 0) {
		if (!skip_space(&spot.ptr, &spot.pos))
			break;
	}

	gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &spot);

	return &state->en;
}
/* State of an enumeration over the full-form lexicon. */
typedef struct {
/* Enumeration header; the state is recovered from a pointer to it. */
GuEnum en;
/* Sequences being enumerated; may be NULL, in which case the
 * enumeration yields nothing. */
PgfSequences* sequences;
/* Only sequences whose tokens start with this string are returned;
 * the empty string selects everything. */
GuString prefix;
/* Index of the next sequence to examine. */
size_t seq_idx;
/* Whether the prefix is compared case-sensitively. */
bool case_sensitive;
} PgfFullFormState;
/* One entry of the full-form lexicon. */
struct PgfFullFormEntry {
/* The tokens of the sequence as a single string. */
GuString tokens;
/* Index entries of the sequence, from which the analyses are read. */
PgfProductionIdx* idx;
};
/* Step function of the full-form enumeration.
 * Stores in *to a pointer to a freshly allocated PgfFullFormEntry
 * for the next sequence that starts with the prefix, has non-empty
 * tokens and carries an index; stores NULL once no such sequence
 * remains. The first sequence that does not start with the prefix
 * ends the enumeration for good. */
static void
gu_fullform_enum_next(GuEnum* self, void* to, GuPool* pool)
{
	PgfFullFormState* st = gu_container(self, PgfFullFormState, en);
	PgfFullFormEntry* result = NULL;

	if (st->sequences != NULL) {
		const size_t total = gu_seq_length(st->sequences);

		while (result == NULL && st->seq_idx < total) {
			PgfSequence* seq = gu_seq_index(st->sequences, PgfSequence, st->seq_idx);
			GuString tokens = pgf_get_tokens(seq->syms, 0, pool);

			PgfCohortSpot spot = {0, st->prefix};
			int cmp = cmp_string(&spot, tokens, st->case_sensitive);
			if (cmp > 0 || *spot.ptr != 0) {
				/* Past the last sequence with this prefix. */
				st->seq_idx = total;
				break;
			}

			st->seq_idx++;

			if (*tokens != 0 && seq->idx != NULL) {
				result = gu_new(PgfFullFormEntry, pool);
				result->tokens = tokens;
				result->idx    = seq->idx;
			}
		}
	}

	*((PgfFullFormEntry**) to) = result;
}
/* Create an enumeration over the whole full-form lexicon of concr.
 * The enumeration is allocated from pool, starts at the first
 * sequence and uses an empty prefix, so no sequence is filtered out
 * by the prefix test. */
PGF_API GuEnum*
pgf_fullform_lexicon(PgfConcr *concr, GuPool* pool)
{
	PgfFullFormState* state = gu_new(PgfFullFormState, pool);

	state->sequences      = concr->sequences;
	state->seq_idx        = 0;
	state->prefix         = "";
	state->case_sensitive = true;
	state->en.next        = gu_fullform_enum_next;

	return &state->en;
}
/* Return the token string stored in a full-form lexicon entry. */
PGF_API GuString
pgf_fullform_get_string(PgfFullFormEntry* entry)
{
	GuString tokens = entry->tokens;
	return tokens;
}
/* Report every analysis of a full-form lexicon entry through
 * callback, using the index stored in the entry. */
PGF_API void
pgf_fullform_get_analyses(PgfFullFormEntry* entry,
                          PgfMorphoCallback* callback, GuExn* err)
{
	PgfProductionIdx* analyses = entry->idx;
	pgf_morpho_iter(analyses, callback, err);
}
/* Create an enumeration over the full-form lexicon entries whose
 * tokens start with prefix.
 *
 * The enumeration is allocated from pool and keeps the prefix
 * pointer. Its starting index is found by binary search over the
 * sequences of concr.
 *
 * Raises PgfExn through err and returns NULL when the concrete
 * syntax is not loaded. */
PGF_API GuEnum*
pgf_lookup_word_prefix(PgfConcr *concr, GuString prefix,
                       GuPool* pool, GuExn* err)
{
	if (concr->sequences == NULL) {
		GuExnData* err_data = gu_raise(err, PgfExn);
		if (err_data) {
			err_data->data = "The concrete syntax is not loaded";
		}
		/* Always bail out: without a data slot the exception is still
		 * raised, and the search below would use the NULL sequences. */
		return NULL;
	}

	PgfFullFormState* state = gu_new(PgfFullFormState, pool);
	state->en.next   = gu_fullform_enum_next;
	state->sequences = concr->sequences;
	state->prefix    = prefix;
	state->seq_idx   = 0;
	state->case_sensitive = pgf_is_case_sensitive(concr);

	PgfSequenceOrder order = { { pgf_sequence_cmp_fn },
	                           state->case_sensitive };
	if (!gu_seq_binsearch_index(concr->sequences, &order.order,
	                            PgfSequence, (void*) prefix,
	                            &state->seq_idx)) {
		/* No exact entry for the prefix itself.
		 * NOTE(review): this assumes the failed search leaves the
		 * index just below the first candidate - confirm against
		 * gu_seq_binsearch_index. */
		state->seq_idx++;
	} else if (!state->case_sensitive) {
		/* If the match is case-insensitive then there might be more
		 * matches around the current index. Since we scroll down
		 * anyway, it is enough to search upwards now.
		 */
		while (state->seq_idx > 0) {
			PgfSequence* seq =
				gu_seq_index(concr->sequences, PgfSequence, state->seq_idx-1);

			size_t sym_idx = 0;
			PgfCohortSpot spot = {0, state->prefix};
			if (pgf_symbols_cmp(&spot, seq->syms, &sym_idx, state->case_sensitive) > 0 || *spot.ptr != 0) {
				break;
			}

			state->seq_idx--;
		}
	}

	return &state->en;
}