1
0
forked from GitHub/gf-core
Files
gf-core/src/runtime/c/pgf/parser.c

2711 lines
68 KiB
C

#include <pgf/parser.h>
#include <gu/seq.h>
#include <gu/assert.h>
#include <gu/file.h>
#include <math.h>
#include <stdlib.h>
//#define PGF_PARSER_DEBUG
//#define PGF_COUNTS_DEBUG
//#define PGF_RESULT_DEBUG
// A GuBuf whose elements are PgfItem* pointers. In this file it is used both
// for the list of items waiting on a continuation (PgfItemConts.items) and
// for the per-state agenda, which is maintained as a heap via
// gu_buf_heap_push.
typedef GuBuf PgfItemBuf;
static GU_DEFINE_TYPE(PgfItemBuf, abstract, _);
// A continuation record: one per (category, linearization index) pair that
// has been predicted in a given parse state. It remembers which items are
// waiting for that constituent to be completed.
struct PgfItemConts {
// the category that was predicted
PgfCCat* ccat;
// index of the linearization (constituent) of ccat being parsed
size_t lin_idx;
// the parse state that was passed in when this record was created
PgfParseState* state;
// outside probability; set when the category is first predicted
prob_t outside_prob;
// the items that predicted this category and wait for its completion
PgfItemBuf* items;
int ref_count; // how many items point to this cont?
};
static GU_DEFINE_TYPE(PgfItemConts, abstract, _);
// Sequence of PgfItemConts* with one slot per linearization index of a
// category; empty slots hold NULL.
typedef GuSeq PgfItemContss;
static GU_DEFINE_TYPE(PgfItemContss, abstract);
// Map from a category (PgfCCat) to its PgfItemContss*.
typedef GuMap PgfContsMap;
static GU_DEFINE_TYPE(PgfContsMap, GuMap,
gu_type(PgfCCat), NULL,
gu_ptr_type(PgfItemContss), &gu_null_struct);
// Map from a continuation record to the category (PgfCCat*) that was
// generated when that continuation was completed.
typedef GuMap PgfGenCatMap;
static GU_DEFINE_TYPE(PgfGenCatMap, GuMap,
gu_type(PgfItemConts), NULL,
gu_ptr_type(PgfCCat), &gu_null_struct);
// NOTE(review): presumably a buffer of PgfCCat*; its use is not visible in
// this part of the file.
typedef GuBuf PgfCCatBuf;
// Data shared by all parse states of a single parsing session.
typedef struct {
// the concrete syntax being parsed with
PgfConcr* concr;
GuPool* pool; // this pool is used for structures internal to the parser
GuPool* out_pool; // this pool is used for allocating the final abstract trees
GuBuf* expr_queue;
PgfExpr meta_var;
PgfProduction meta_prod;
// the fid that will be given to the next category generated during parsing
int max_fid;
#ifdef PGF_COUNTS_DEBUG
// statistics counters, only maintained in PGF_COUNTS_DEBUG builds
int item_full_count;
int item_real_count;
int cont_full_count;
int ccat_full_count;
int prod_full_count;
#endif
// head of the singly linked list (through PgfItem.next) of released items
// that can be reused instead of allocating new ones
PgfItem* free_item;
// NOTE(review): presumably the pruning threshold for beam search; its use
// is not visible in this part of the file.
prob_t beam_size;
} PgfParsing;
// Key identifying one constituent of a category: the category together with
// a linearization index. Used as the key type of PgfProductionIdx.
typedef struct {
PgfCCat* ccat;
size_t lin_idx;
} PgfCFCat;
static GU_DEFINE_TYPE(PgfCFCat, struct,
GU_MEMBER(PgfCFCat, ccat, PgfCCat),
GU_MEMBER(PgfCFCat, lin_idx, size_t));
// Hasher for PgfCFCat keys; defined in another translation unit.
extern GuHasher pgf_cfcat_hasher;
// Index from a PgfCFCat key to a buffer of productions (PgfProductionBuf*).
GU_DEFINE_TYPE(PgfProductionIdx, GuMap,
gu_type(PgfCFCat), &pgf_cfcat_hasher,
gu_ptr_type(PgfProductionBuf), &gu_null_struct);
typedef struct PgfTokenState PgfTokenState;
// Table of callbacks through which the parser talks to a token state.
typedef struct {
// decides whether `item` may consume the token `tok`
bool (*match_token)(PgfTokenState* ts, PgfToken tok, PgfItem* item);
// returns the token held by the token state; callers in this file compare
// the result against gu_empty_string
PgfToken (*get_token)(PgfTokenState* ts);
// returns the lexicon index for the token state, or NULL when there is none
PgfProductionIdx* (*get_lexicon_idx)(PgfTokenState* ts);
} PgfTokenFn;
// Base part of every token state: the callback table plus a probability.
struct PgfTokenState {
PgfTokenFn* fn;
// NOTE(review): presumably the probability contributed by lexical lookup;
// not read in the visible part of the file.
prob_t lexical_prob;
};
// One parse state. States are chained through `next`; the parsing functions
// in this file operate on a pair of neighbouring states called `before` and
// `after`.
struct PgfParseState {
// the following state in the chain, or NULL
PgfParseState* next;
// heap of items still to be processed in this state
PgfItemBuf* agenda;
PgfItem* meta_item;
// continuation records created in this state, keyed by category
PgfContsMap* conts_map;
// categories generated by completing continuations in this state
PgfGenCatMap* generated_cats;
// position of this state; printed as the item span in debug traces
unsigned short offset;
// updated when an item is moved into this state while it is the last one
// in the chain (see pgf_parsing_add_transition)
prob_t viterbi_prob;
// data shared by the whole parsing session
PgfParsing* ps;
// the token state attached to this parse state
PgfTokenState* ts;
};
// Result-extraction bookkeeping attached to a category (PgfCCat.answers).
// NOTE(review): the code that fills these buffers lies outside the visible
// part of the file; the debug printers read `conts` as a buffer of
// PgfExprState*.
typedef struct PgfAnswers {
GuBuf* conts;
GuBuf* exprs;
prob_t outside_prob;
} PgfAnswers;
// A partially built expression during result extraction: the expression and
// probability so far, the argument list and the index of the current
// argument.
typedef struct {
PgfAnswers* answers;
PgfExprProb ep;
PgfPArgs* args;
size_t arg_idx;
} PgfExprState;
typedef struct PgfParseResult PgfParseResult;
// Pairs a parse state with an expression enumeration.
struct PgfParseResult {
PgfParseState* state;
PgfExprEnum en;
};
typedef struct PgfItemBase PgfItemBase;
// An active item: a production together with the position reached inside
// the symbol sequence of that production.
struct PgfItem {
union {
// while the item is live: the continuation record it belongs to
PgfItemConts* conts;
PgfItem *next; // used to collect released items
};
// the production being recognized
PgfProduction prod;
// current argument list; a fresh copy is made whenever an argument is
// replaced (see pgf_item_update_arg)
PgfPArgs* args;
// the symbol at seq_idx, or a null variant at the end of the sequence; for
// extern and meta productions it is instead the most recently added symbol
// of a backwards linked chain (see pgf_prev_extern_sym)
PgfSymbol curr_sym;
// position in the symbol sequence
uint16_t seq_idx;
// position inside the token list of the current KS/KP symbol
uint8_t tok_idx;
// selected form of the current KP symbol; 0 is read as the default form
// and k > 0 as forms[k - 1] (see pgf_parsing_symbol)
uint8_t alt;
// inside probability of the item
prob_t inside_prob;
};
// String-keyed map whose values are PgfProductionIdx*.
GU_DEFINE_TYPE(PgfLeftcornerTokIdx, GuStringMap,
gu_ptr_type(PgfProductionIdx), &gu_null_struct);
/* Returns the back link of a symbol from a backwards linked chain.
 *
 * Symbols of such a chain are allocated with room for one extra PgfSymbol
 * directly behind the symbol payload (see pgf_parsing_meta_scan and
 * pgf_parsing_meta_predict); that trailing slot holds the previously added
 * symbol. The size of the payload depends on the symbol kind, hence the
 * dispatch on the tag. */
static PgfSymbol
pgf_prev_extern_sym(PgfSymbol sym)
{
    GuVariantInfo info = gu_variant_open(sym);
    PgfSymbol* link = NULL;

    switch (info.tag) {
    case PGF_SYMBOL_CAT:
        link = (PgfSymbol*) (((PgfSymbolCat*) info.data) + 1);
        break;
    case PGF_SYMBOL_KP:
        link = (PgfSymbol*) (((PgfSymbolKP*) info.data) + 1);
        break;
    case PGF_SYMBOL_KS:
        link = (PgfSymbol*) (((PgfSymbolKS*) info.data) + 1);
        break;
    case PGF_SYMBOL_LIT:
        link = (PgfSymbol*) (((PgfSymbolLit*) info.data) + 1);
        break;
    case PGF_SYMBOL_VAR:
        link = (PgfSymbol*) (((PgfSymbolVar*) info.data) + 1);
        break;
    case PGF_SYMBOL_NE:
        link = (PgfSymbol*) (((PgfSymbolNE*) info.data) + 1);
        break;
    default:
        gu_impossible();
        return gu_null_variant;
    }

    return *link;
}
/* Returns the linearization index of the constituent that `item` is
 * recognizing, as recorded in the item's continuation record. */
size_t
pgf_item_lin_idx(PgfItem* item)
{
    PgfItemConts* conts = item->conts;
    return conts->lin_idx;
}
int
pgf_item_sequence_length(PgfItem* item)
{
GuVariantInfo i = gu_variant_open(item->prod);
switch (i.tag) {
case PGF_PRODUCTION_APPLY: {
PgfProductionApply* papp = i.data;
return gu_seq_length(papp->fun->lins[item->conts->lin_idx]);
}
case PGF_PRODUCTION_COERCE: {
return 1;
}
case PGF_PRODUCTION_EXTERN: {
PgfProductionExtern* pext = i.data;
PgfSequence* seq;
if (pext->lins != NULL &&
(seq = gu_seq_get(pext->lins,PgfSequence*,item->conts->lin_idx)) != NULL) {
return gu_seq_length(seq);
} else {
int seq_len = 0;
PgfSymbol sym = item->curr_sym;
while (!gu_variant_is_null(sym)) {
seq_len++;
sym = pgf_prev_extern_sym(sym);
}
return seq_len;
}
}
case PGF_PRODUCTION_META: {
int seq_len = 0;
PgfSymbol sym = item->curr_sym;
while (!gu_variant_is_null(sym)) {
seq_len++;
sym = pgf_prev_extern_sym(sym);
}
return seq_len;
}
default:
gu_impossible();
return 0;
}
}
/* Materializes the backwards linked symbol chain of `item` as a sequence
 * allocated from `pool`. The chain is walked from the newest symbol to the
 * oldest, so the sequence is filled from its last slot towards the first. */
static PgfSequence*
pgf_extern_seq_get(PgfItem* item, GuPool* pool)
{
    int slot = pgf_item_sequence_length(item);
    PgfSequence* result = gu_new_seq(PgfSymbol, slot, pool);

    for (PgfSymbol cur = item->curr_sym;
         !gu_variant_is_null(cur);
         cur = pgf_prev_extern_sym(cur)) {
        slot--;
        gu_seq_set(result, PgfSymbol, slot, cur);
    }

    return result;
}
/* Retrieves the linearization index and the symbol sequence that `item` is
 * traversing.
 *
 * item     the item to inspect
 * lin_idx  out: the item's linearization index
 * seq      out: the symbol sequence; for apply productions and for extern
 *          productions with a stored sequence this is the existing
 *          sequence, otherwise a new one allocated from `pool`
 * pool     pool used for any sequence or symbol that has to be created */
void
pgf_item_sequence(PgfItem* item,
                  size_t* lin_idx, PgfSequence** seq,
                  GuPool* pool)
{
    *lin_idx = item->conts->lin_idx;

    GuVariantInfo i = gu_variant_open(item->prod);
    switch (i.tag) {
    case PGF_PRODUCTION_APPLY: {
        PgfProductionApply* papp = i.data;
        *seq = papp->fun->lins[item->conts->lin_idx];
        break;
    }
    case PGF_PRODUCTION_COERCE: {
        // A coercion behaves like a sequence with a single category symbol
        // that refers to argument 0 and the same linearization index.
        PgfSymbol sym =
            gu_new_variant_i(pool, PGF_SYMBOL_CAT,
                             PgfSymbolCat,
                             .d = 0, .r = item->conts->lin_idx);
        // The elements are PgfSymbol values (see gu_seq_set below and
        // pgf_extern_seq_get), so the sequence must be allocated with that
        // element type rather than PgfSequence*.
        *seq = gu_new_seq(PgfSymbol, 1, pool);
        gu_seq_set(*seq, PgfSymbol, 0, sym);
        break;
    }
    case PGF_PRODUCTION_EXTERN: {
        PgfProductionExtern* pext = i.data;
        // Use the stored sequence when there is one; otherwise rebuild it
        // from the symbol chain of the item.
        if (pext->lins == NULL ||
            (*seq = gu_seq_get(pext->lins, PgfSequence*, item->conts->lin_idx)) == NULL) {
            *seq = pgf_extern_seq_get(item, pool);
        }
        break;
    }
    case PGF_PRODUCTION_META: {
        *seq = pgf_extern_seq_get(item, pool);
        break;
    }
    default:
        gu_impossible();
    }
}
#ifdef PGF_PARSER_DEBUG
/* Debug helper: prints a comma separated list of production arguments.
 * Every argument is printed as "C<fid>"; when it has hypotheses they are
 * printed first as "C<fid> " each, followed by "-> ".
 *
 * The argument list is taken by pointer: all callers pass a PgfPArgs*
 * (item->args, papp->args, pmeta->args) and the sequence accessors used
 * here operate on pointers. */
static void
pgf_print_production_args(PgfPArgs* args,
                          GuOut* out, GuExn* err)
{
    size_t n_args = gu_seq_length(args);
    for (size_t j = 0; j < n_args; j++) {
        if (j > 0)
            gu_putc(',',out,err);

        PgfPArg arg = gu_seq_get(args, PgfPArg, j);

        if (arg.hypos != NULL &&
            gu_seq_length(arg.hypos) > 0) {
            size_t n_hypos = gu_seq_length(arg.hypos);
            for (size_t k = 0; k < n_hypos; k++) {
                PgfCCat *hypo = gu_seq_get(arg.hypos, PgfCCat*, k);
                gu_printf(out,err,"C%d ",hypo->fid);
            }
            gu_printf(out,err,"-> ");
        }

        gu_printf(out,err,"C%d",arg.ccat->fid);
    }
}
/* Debug helper: prints one production of category `fid` on a line of its
 * own, in the form "C<fid> -> <right-hand side>".
 *
 * fid   id of the category that owns the production
 * prod  the production to print
 * out   output stream
 * err   exception frame passed on to the output functions
 * pool  unused */
static void
pgf_print_production(int fid, PgfProduction prod,
                     GuOut *out, GuExn* err, GuPool* pool)
{
    gu_printf(out,err,"C%d -> ",fid);

    GuVariantInfo i = gu_variant_open(prod);
    switch (i.tag) {
    case PGF_PRODUCTION_APPLY: {
        PgfProductionApply* papp = i.data;
        gu_printf(out,err,"F%d(",papp->fun->funid);
        pgf_print_expr(papp->fun->ep->expr, NULL, 0, out, err);
        gu_printf(out,err,")[");
        pgf_print_production_args(papp->args,out,err);
        gu_printf(out,err,"]\n");
        break;
    }
    case PGF_PRODUCTION_COERCE: {
        PgfProductionCoerce* pcoerce = i.data;
        gu_printf(out,err,"_[C%d]\n",pcoerce->coerce->fid);
        break;
    }
    case PGF_PRODUCTION_EXTERN: {
        PgfProductionExtern* pext = i.data;
        gu_printf(out,err,"<extern>(");
        // Extern productions are created with ep == NULL before their
        // callback has produced a result, so guard the dereference the
        // same way pgf_print_item does.
        if (pext->ep != NULL) {
            pgf_print_expr(pext->ep->expr, NULL, 0, out, err);
        }
        gu_printf(out,err,")[]\n");
        break;
    }
    case PGF_PRODUCTION_META: {
        PgfProductionMeta* pmeta = i.data;
        gu_printf(out,err,"<meta>[");
        pgf_print_production_args(pmeta->args,out,err);
        gu_printf(out,err,"]\n");
        break;
    }
    default:
        gu_impossible();
    }
}
// Defined elsewhere; prints a single symbol.
void
pgf_print_symbol(PgfSymbol sym, GuOut *out, GuExn *err);

/* Debug helper: prints "<lin_idx> : " followed by the symbols of the
 * sequence that `item` is traversing, with a dot marking the current
 * position (item->seq_idx). */
static void
pgf_print_item_seq(PgfItem *item,
                   GuOut *out, GuExn* err, GuPool* pool)
{
    size_t lin_idx;
    // pgf_item_sequence hands the sequence back through a PgfSequence**,
    // so the local has to be a pointer to the sequence.
    PgfSequence* seq;
    pgf_item_sequence(item, &lin_idx, &seq, pool);

    // lin_idx is a size_t while the format asks for an int; convert
    // explicitly so that the variadic argument matches "%d".
    gu_printf(out, err, "%d : ", (int) lin_idx);

    size_t index;
    for (index = 0; index < gu_seq_length(seq); index++) {
        if (item->seq_idx == index)
            gu_printf(out, err, " . ");

        PgfSymbol *sym = gu_seq_index(seq, PgfSymbol, index);
        pgf_print_symbol(*sym, out, err);
    }

    // the dot goes after the last symbol when the item is complete
    if (item->seq_idx == index)
        gu_printf(out, err, " .");
}
/* Debug helper: prints one item on a line of its own as
 *   [<start>-<end>; C<fid> -> <production>; <dotted sequence>; <in>+<out>=<sum>]
 * where <start> is the offset of the state stored in the item's
 * continuation record and <end> the offset of `state` (0 when the
 * respective state is NULL). */
static void
pgf_print_item(PgfItem* item, PgfParseState* state, GuOut* out, GuExn* err, GuPool* pool)
{
    PgfItemConts* conts = item->conts;

    gu_printf(out, err, "[%d-%d; C%d -> ",
              conts->state ? conts->state->offset : 0,
              state ? state->offset : 0,
              conts->ccat->fid);

    GuVariantInfo info = gu_variant_open(item->prod);
    if (info.tag == PGF_PRODUCTION_APPLY) {
        PgfProductionApply* papp = info.data;
        PgfCncFun* fun = papp->fun;
        gu_printf(out, err, "F%d(", fun->funid);
        pgf_print_expr(fun->ep->expr, NULL, 0, out, err);
        gu_printf(out, err, ")[");
        pgf_print_production_args(item->args, out, err);
        gu_printf(out, err, "]; ");
    } else if (info.tag == PGF_PRODUCTION_COERCE) {
        PgfPArg* first = gu_seq_index(item->args, PgfPArg, 0);
        gu_printf(out, err, "_[C%d]; ", first->ccat->fid);
    } else if (info.tag == PGF_PRODUCTION_EXTERN) {
        PgfProductionExtern* pext = info.data;
        gu_printf(out, err, "<extern>");
        // the expression is only there once the callback has produced one
        if (pext->ep != NULL) {
            gu_printf(out, err, "(");
            pgf_print_expr(pext->ep->expr, NULL, 0, out, err);
            gu_printf(out, err, ")");
        }
        gu_printf(out, err, "[");
        pgf_print_production_args(item->args, out, err);
        gu_printf(out, err, "]; ");
    } else if (info.tag == PGF_PRODUCTION_META) {
        gu_printf(out, err, "<meta>[");
        pgf_print_production_args(item->args, out, err);
        gu_printf(out, err, "]; ");
    } else {
        gu_impossible();
    }

    pgf_print_item_seq(item, out, err, pool);

    gu_printf(out, err, "; %f+%f=%f]\n",
              item->inside_prob,
              item->conts->outside_prob,
              item->inside_prob+item->conts->outside_prob);
}
#ifdef PGF_RESULT_DEBUG
/* Debug helper for pgf_print_expr_state0: pushes the number of arguments
 * that are still missing after the current one onto `stack`, prints the
 * first continuation of the state (if there is one) recursively, and then
 * prints " (" followed by the expression of this state. */
static void
pgf_print_expr_state(PgfExprState* st,
                     GuWriter* wtr, GuExn* err, GuBuf* stack)
{
    gu_buf_push(stack, int, (gu_seq_length(st->args) - st->arg_idx - 1));

    GuBuf* conts = st->answers->conts;
    if (gu_buf_length(conts) > 0) {
        PgfExprState* parent = gu_buf_get(conts, PgfExprState*, 0);
        if (parent != NULL) {
            pgf_print_expr_state(parent, wtr, err, stack);
        }
    }

    gu_puts(" (", wtr, err);
    pgf_print_expr(st->ep.expr, NULL, 0, wtr, err);
}
/* Debug helper: prints an expression state on one line. The line starts
 * with "[<prob>+<outside>=<sum>]", continues with the chain of enclosing
 * states (outermost first) and the expression of `st`, and ends with one
 * " ?" per missing argument and a closing parenthesis for every level
 * recorded on the temporary stack. */
static void
pgf_print_expr_state0(PgfExprState* st,
                      GuWriter* wtr, GuExn* err, GuPool* tmp_pool)
{
    gu_printf(wtr, err, "[%f+%f=%f]",
              st->ep.prob,
              st->answers->outside_prob,
              st->answers->outside_prob+st->ep.prob);

    size_t n_args = gu_seq_length(st->args);
    bool has_args = (n_args > 0);

    // number of missing arguments per nesting level, innermost first
    GuBuf* stack = gu_new_buf(int, tmp_pool);
    if (has_args) {
        gu_buf_push(stack, int, n_args - st->arg_idx);
    }

    GuBuf* conts = st->answers->conts;
    if (gu_buf_length(conts) > 0) {
        PgfExprState* parent = gu_buf_get(conts, PgfExprState*, 0);
        if (parent != NULL) {
            pgf_print_expr_state(parent, wtr, err, stack);
        }
    }

    if (has_args) {
        gu_puts(" (", wtr, err);
    } else {
        gu_puts(" ", wtr, err);
    }
    pgf_print_expr(st->ep.expr, NULL, 0, wtr, err);

    size_t n_levels = gu_buf_length(stack);
    for (size_t level = 0; level < n_levels; level++) {
        int missing = gu_buf_get(stack, int, level);
        for (int k = 0; k < missing; k++) {
            gu_puts(" ?", wtr, err);
        }
        gu_puts(")", wtr, err);
    }
    gu_puts("\n", wtr, err);
}
#endif
#endif
/* Ordering callback for the agenda heap. `a` and `b` point at PgfItem*
 * elements; the items are compared by the sum of their inside probability
 * and the outside probability of their continuation record. Returns -1, 0
 * or 1 when the first sum is smaller than, equal to or greater than the
 * second. */
static int
cmp_item_prob(GuOrder* self, const void* a, const void* b)
{
    PgfItem* lhs = *((PgfItem* const*) a);
    PgfItem* rhs = *((PgfItem* const*) b);

    prob_t lhs_total = lhs->inside_prob + lhs->conts->outside_prob;
    prob_t rhs_total = rhs->inside_prob + rhs->conts->outside_prob;

    return (lhs_total > rhs_total) - (lhs_total < rhs_total);
}

// The order object handed to gu_buf_heap_push for every agenda.
static GuOrder
pgf_item_prob_order = { cmp_item_prob };
/* Looks up the continuation sequence of `cat` in `conts_map`. When the
 * category has no entry yet, a sequence with one NULL slot per
 * linearization of the category is allocated from `pool`, stored in the
 * map and returned. */
static PgfItemContss*
pgf_parsing_get_contss(PgfContsMap* conts_map, PgfCCat* cat, GuPool *pool)
{
    PgfItemContss* known = gu_map_get(conts_map, cat, PgfItemContss*);
    if (known != NULL) {
        return known;
    }

    size_t n_lins = cat->cnccat->n_lins;
    PgfItemContss* fresh = gu_new_seq(PgfItemConts*, n_lins, pool);
    for (size_t lin = 0; lin < n_lins; lin++) {
        gu_seq_set(fresh, PgfItemConts*, lin, NULL);
    }
    gu_map_put(conts_map, cat, PgfItemContss*, fresh);

    return fresh;
}
/* Returns the continuation record for (`ccat`, `lin_idx`) in `conts_map`,
 * creating it when it does not exist yet. A new record starts with an
 * empty item buffer, an outside probability of 0, a reference count of 0
 * and remembers `state`. `lin_idx` must be a valid linearization index of
 * the category. */
static PgfItemConts*
pgf_parsing_get_conts(PgfContsMap* conts_map,
                      PgfCCat* ccat, size_t lin_idx,
                      PgfParseState* state,
                      GuPool *pool)
{
    gu_require(lin_idx < ccat->cnccat->n_lins);

    PgfItemContss* contss =
        pgf_parsing_get_contss(conts_map, ccat, pool);

    PgfItemConts* existing = gu_seq_get(contss, PgfItemConts*, lin_idx);
    if (existing != NULL) {
        return existing;
    }

    PgfItemConts* fresh = gu_new(PgfItemConts, pool);
    fresh->ccat         = ccat;
    fresh->lin_idx      = lin_idx;
    fresh->state        = state;
    fresh->items        = gu_new_buf(PgfItem*, pool);
    fresh->outside_prob = 0;
    fresh->ref_count    = 0;
    gu_seq_set(contss, PgfItemConts*, lin_idx, fresh);

#ifdef PGF_COUNTS_DEBUG
    if (state != NULL) {
        state->ps->cont_full_count++;
    }
#endif

    return fresh;
}
/* Pool finalizer of a category created during parsing: releases the
 * production sequence of the category, if it has one. */
static void
gu_ccat_fini(GuFinalizer* fin)
{
    PgfCCat* owner = gu_container(fin, PgfCCat, fin);
    if (owner->prods == NULL) {
        return;
    }
    gu_seq_free(owner->prods);
}
/* Creates the category that stands for the completed continuation `conts`
 * in `state`. The new category takes the next free fid, starts without
 * productions and answers, is registered in state->generated_cats and gets
 * a finalizer on the parser pool that releases its production sequence. */
static PgfCCat*
pgf_parsing_create_completed(PgfParseState* state, PgfItemConts* conts,
                             prob_t viterbi_prob)
{
    PgfParsing* ps = state->ps;

    PgfCCat* fresh = gu_new_flex(ps->pool, PgfCCat, fin, 1);
    fresh->cnccat       = conts->ccat->cnccat;
    fresh->viterbi_prob = viterbi_prob;
    fresh->fid          = ps->max_fid++;
    fresh->conts        = conts;
    fresh->answers      = NULL;
    fresh->prods        = NULL;
    fresh->n_synprods   = 0;

    gu_map_put(state->generated_cats, conts, PgfCCat*, fresh);

    fresh->fin[0].fn = gu_ccat_fini;
    gu_pool_finally(ps->pool, fresh->fin);

#ifdef PGF_COUNTS_DEBUG
    ps->ccat_full_count++;
#endif

    return fresh;
}
/* Appends `prod` to the productions of `ccat`. The production sequence is
 * (re)allocated to hold exactly one more element whenever it is missing or
 * already full. */
static void
pgf_parsing_add_production(PgfCCat* ccat, PgfProduction prod)
{
    bool has_room =
        ccat->prods != NULL &&
        ccat->n_synprods < gu_seq_length(ccat->prods);

    if (!has_room) {
        ccat->prods = gu_realloc_seq(ccat->prods, PgfProduction, ccat->n_synprods+1);
    }

    gu_seq_set(ccat->prods, PgfProduction, ccat->n_synprods, prod);
    ccat->n_synprods++;
}
/* Returns the category that was generated in `state` for the continuation
 * `conts`, or NULL when the continuation has not been completed there. */
static PgfCCat*
pgf_parsing_get_completed(PgfParseState* state, PgfItemConts* conts)
{
    PgfCCat* completed =
        gu_map_get(state->generated_cats, conts, PgfCCat*);
    return completed;
}
/* Refreshes item->curr_sym from item->seq_idx.
 *
 *  - apply:  the symbol at seq_idx of the function's linearization, or the
 *            null variant when seq_idx is at the end of the sequence;
 *  - coerce: a fresh category symbol for argument 0 at position 0, the
 *            null variant at position 1;
 *  - extern and meta: curr_sym is left untouched. */
static void
pgf_item_set_curr_symbol(PgfItem* item, GuPool* pool)
{
    GuVariantInfo info = gu_variant_open(item->prod);

    switch (info.tag) {
    case PGF_PRODUCTION_APPLY: {
        PgfProductionApply* papp = info.data;
        PgfCncFun* fun = papp->fun;
        gu_assert(item->conts->lin_idx < fun->n_lins);

        PgfSequence* seq = fun->lins[item->conts->lin_idx];
        gu_assert(item->seq_idx <= gu_seq_length(seq));

        if (item->seq_idx != gu_seq_length(seq)) {
            item->curr_sym = gu_seq_get(seq, PgfSymbol, item->seq_idx);
        } else {
            item->curr_sym = gu_null_variant;
        }
        break;
    }
    case PGF_PRODUCTION_COERCE: {
        gu_assert(item->seq_idx <= 1);

        if (item->seq_idx != 1) {
            item->curr_sym = gu_new_variant_i(pool, PGF_SYMBOL_CAT,
                                              PgfSymbolCat,
                                              .d = 0, .r = item->conts->lin_idx);
        } else {
            item->curr_sym = gu_null_variant;
        }
        break;
    }
    case PGF_PRODUCTION_EXTERN:
    case PGF_PRODUCTION_META:
        // nothing to do for these kinds of production
        break;
    default:
        gu_impossible();
    }
}
/* Adds the Viterbi probability of the category of every argument in
 * item->args to item->inside_prob, in argument order. */
static void
pgf_item_add_args_viterbi(PgfItem* item)
{
    int n_args = gu_seq_length(item->args);
    for (int k = 0; k < n_args; k++) {
        PgfPArg* arg = gu_seq_index(item->args, PgfPArg, k);
        item->inside_prob += arg->ccat->viterbi_prob;
    }
}

/* Creates an item for `prod` at the start of its symbol sequence.
 *
 * conts  continuation record the item belongs to; its reference count is
 *        incremented
 * prod   the production to recognize
 * pool   pool for new allocations
 * ps     parsing session; when it is not NULL and has released items, one
 *        of them is reused instead of allocating a new item
 *
 * The inside probability starts from the probability of the production
 * (0 when an extern or meta production has no expression yet; the Viterbi
 * probability of the target category for a coercion), plus the Viterbi
 * probabilities of the argument categories. */
static PgfItem*
pgf_new_item(PgfItemConts* conts, PgfProduction prod,
             GuPool* pool, PgfParsing* ps)
{
    PgfItem* item;
    if (ps != NULL && ps->free_item != NULL) {
        // pop a released item from the free list
        item = ps->free_item;
        ps->free_item = ps->free_item->next;
    } else {
        item = gu_new(PgfItem, pool);
    }

    GuVariantInfo info = gu_variant_open(prod);
    switch (info.tag) {
    case PGF_PRODUCTION_APPLY: {
        PgfProductionApply* papp = info.data;
        item->args = papp->args;
        item->inside_prob = papp->fun->ep->prob;
        pgf_item_add_args_viterbi(item);
        break;
    }
    case PGF_PRODUCTION_COERCE: {
        PgfProductionCoerce* pcoerce = info.data;
        // a coercion has exactly one argument: the category coerced from
        item->args = gu_new_seq(PgfPArg, 1, pool);
        PgfPArg* only_arg = gu_seq_index(item->args, PgfPArg, 0);
        only_arg->hypos = NULL;
        only_arg->ccat = pcoerce->coerce;
        item->inside_prob = pcoerce->coerce->viterbi_prob;
        break;
    }
    case PGF_PRODUCTION_EXTERN: {
        PgfProductionExtern* pext = info.data;
        item->args = gu_empty_seq();
        item->inside_prob = pext->ep ? pext->ep->prob : 0;
        pgf_item_add_args_viterbi(item);
        break;
    }
    case PGF_PRODUCTION_META: {
        PgfProductionMeta* pmeta = info.data;
        item->args = pmeta->args;
        item->inside_prob = pmeta->ep ? pmeta->ep->prob : 0;
        pgf_item_add_args_viterbi(item);
        break;
    }
    default:
        gu_impossible();
    }

    item->conts    = conts;
    item->prod     = prod;
    item->curr_sym = gu_null_variant;
    item->seq_idx  = 0;
    item->tok_idx  = 0;
    item->alt      = 0;

    conts->ref_count++;

    pgf_item_set_curr_symbol(item, pool);

#ifdef PGF_COUNTS_DEBUG
    if (ps != NULL) {
        ps->item_full_count++;
        ps->item_real_count++;
    }
#endif

    return item;
}
/* Returns a field-by-field duplicate of `item`. The duplicate comes from
 * the free list of `ps` when one is available and from `pool` otherwise.
 * Because the duplicate refers to the same continuation record, the
 * reference count of that record is incremented. */
static PgfItem*
pgf_item_copy(PgfItem* item, GuPool* pool, PgfParsing* ps)
{
    PgfItem* copy;
    if (ps != NULL && ps->free_item != NULL) {
        // pop a released item from the free list
        copy = ps->free_item;
        ps->free_item = ps->free_item->next;
    } else {
        copy = gu_new(PgfItem, pool);
    }

    *copy = *item;

#ifdef PGF_COUNTS_DEBUG
    if (ps != NULL) {
        ps->item_full_count++;
        ps->item_real_count++;
    }
#endif

    item->conts->ref_count++;

    return copy;
}
/* Returns a copy of `item` in which argument `d` has been replaced by
 * `new_ccat` (without hypotheses). The copy gets its own argument sequence,
 * so the argument list of the original item is not modified. The inside
 * probability of the copy is adjusted by the difference between the
 * Viterbi probabilities of the new and the old argument category. */
static PgfItem*
pgf_item_update_arg(PgfItem* item, size_t d, PgfCCat *new_ccat,
                    GuPool* pool, PgfParsing *ps)
{
    PgfPArg* replaced = gu_seq_index(item->args, PgfPArg, d);
    PgfCCat* old_ccat = replaced->ccat;

    PgfItem* updated = pgf_item_copy(item, pool, ps);

    // duplicate the argument list, then overwrite slot d
    size_t n_args = gu_seq_length(item->args);
    updated->args = gu_new_seq(PgfPArg, n_args, pool);
    memcpy(gu_seq_data(updated->args), gu_seq_data(item->args),
           n_args * sizeof(PgfPArg));
    gu_seq_set(updated->args, PgfPArg, d,
               ((PgfPArg) { .hypos = NULL, .ccat = new_ccat }));

    updated->inside_prob +=
        new_ccat->viterbi_prob - old_ccat->viterbi_prob;

    return updated;
}
/* Moves `item` one position forward in its symbol sequence and refreshes
 * its current symbol. */
static void
pgf_item_advance(PgfItem* item, GuPool* pool)
{
    item->seq_idx += 1;
    pgf_item_set_curr_symbol(item, pool);
}
/* Releases `item` and puts it on the free list of the parsing session so
 * that it can be reused. Items of meta productions are never released.
 *
 * Releasing drops one reference from the item's continuation record. When
 * that record, and every record reached from it by following
 * conts->ccat->conts, has a reference count of zero, the items waiting in
 * the item's own continuation record are released recursively as well. */
static void
pgf_item_free(PgfParseState* before, PgfParseState* after,
              PgfItem* item)
{
    GuVariantInfo info = gu_variant_open(item->prod);
    if (info.tag == PGF_PRODUCTION_META) {
        return; // for now we don't release meta items
    }

    PgfItemConts* walker = item->conts;
    walker->ref_count--;

    // climb while the records are unreferenced; walker ends up NULL only
    // when the whole chain is unreferenced
    while (walker != NULL && walker->ref_count == 0) {
        walker = walker->ccat->conts;
    }

    if (walker == NULL) {
        size_t n_items = gu_buf_length(item->conts->items);
        for (size_t k = 0; k < n_items; k++) {
            PgfItem* waiting = gu_buf_get(item->conts->items, PgfItem*, k);
            if (waiting != NULL) {
                pgf_item_free(before, after, waiting);
            }
        }
    }

#ifdef PGF_PARSER_DEBUG
    memset(item, 0, sizeof(*item));
#endif

    // push the item onto the free list
    item->next = before->ps->free_item;
    before->ps->free_item = item;

#ifdef PGF_COUNTS_DEBUG
    before->ps->item_real_count--;
#endif
}
/* Tries to move `item` across the token `tok` into the state `after`.
 * When the token state of `after` accepts the token, the item is put on
 * the agenda of `after`; if `after` is the last state of the chain its
 * Viterbi probability is first set to the total probability of the item.
 * When the token is rejected the item is released. */
static void
pgf_parsing_add_transition(PgfParseState* before, PgfParseState* after,
                           PgfToken tok, PgfItem* item)
{
    bool accepted = after->ts->fn->match_token(after->ts, tok, item);
    if (!accepted) {
        pgf_item_free(before, after, item);
        return;
    }

    if (after->next == NULL) {
        after->viterbi_prob =
            item->inside_prob+item->conts->outside_prob;
    }

    gu_buf_heap_push(after->agenda, &pgf_item_prob_order, &item);
}
// Forward declarations of the result-extraction entry points; their
// definitions lie outside the visible part of this file.
// Called from pgf_parsing_combine when a category is completed with no
// waiting item and no following state.
static void
pgf_result_predict(PgfParsing* ps,
PgfExprState* cont, PgfCCat* ccat);
// Called from pgf_parsing_complete when a new production is added to a
// category that already has answers attached.
static void
pgf_result_production(PgfParsing* ps,
PgfAnswers* answers, PgfProduction prod);
/* Combines the completed category `cat` with the waiting item `cont`.
 *
 * When `cont` is NULL there is no waiting item: if there is also no
 * following state, `cat` is handed over to result extraction; otherwise
 * nothing happens. For a real waiting item, a copy whose current argument
 * is replaced by `cat` is advanced past the current symbol and put on the
 * agenda of `before`. The current symbol of `cont` must be a category or a
 * literal symbol. `lin_idx` is not used. */
static void
pgf_parsing_combine(PgfParseState* before, PgfParseState* after,
                    PgfItem* cont, PgfCCat* cat, int lin_idx)
{
    if (cont == NULL) {
        if (after == NULL) {
            pgf_result_predict(before->ps, NULL, cat);
        }
        return;
    }

    PgfParsing* ps = before->ps;
    PgfItem* advanced = NULL;

    switch (gu_variant_tag(cont->curr_sym)) {
    case PGF_SYMBOL_CAT: {
        PgfSymbolCat* scat = gu_variant_data(cont->curr_sym);
        advanced = pgf_item_update_arg(cont, scat->d, cat, ps->pool, ps);
        break;
    }
    case PGF_SYMBOL_LIT: {
        PgfSymbolLit* slit = gu_variant_data(cont->curr_sym);
        advanced = pgf_item_update_arg(cont, slit->d, cat, ps->pool, ps);
        break;
    }
    default:
        gu_impossible();
    }

    pgf_item_advance(advanced, ps->pool);
    gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &advanced);
}
/* Starts recognizing `prod` for the continuation `conts`: creates an item
 * at the beginning of the production and puts it on the agenda of
 * `state`. */
static void
pgf_parsing_production(PgfParseState* state,
                       PgfItemConts* conts, PgfProduction prod)
{
    PgfParsing* ps = state->ps;
    PgfItem* fresh = pgf_new_item(conts, prod, ps->pool, ps);
    gu_buf_heap_push(state->agenda, &pgf_item_prob_order, &fresh);
}
/* Builds the production that records how the completed `item` was
 * recognized.
 *
 *  - apply:  a new apply production with the same function and the item's
 *            current arguments;
 *  - coerce: a new coercion to the category of the item's first argument;
 *  - extern: when the production already has a sequence for the item's
 *            linearization index it is reused as is; otherwise a new
 *            extern production is built that carries `ep`, a copy of the
 *            existing sequences (NULL where there are none) and, at the
 *            item's linearization index, the sequence rebuilt from the
 *            item's symbol chain;
 *  - meta:   a new meta production with `ep` and the item's arguments. */
static PgfProduction
pgf_parsing_new_production(PgfItem* item, PgfExprProb *ep, GuPool *pool)
{
    GuVariantInfo info = gu_variant_open(item->prod);
    PgfProduction result = gu_null_variant;

    switch (info.tag) {
    case PGF_PRODUCTION_APPLY: {
        PgfProductionApply* src = info.data;
        PgfProductionApply* dst =
            gu_new_variant(PGF_PRODUCTION_APPLY,
                           PgfProductionApply,
                           &result, pool);
        dst->fun = src->fun;
        dst->args = item->args;
        break;
    }
    case PGF_PRODUCTION_COERCE: {
        PgfProductionCoerce* dst =
            gu_new_variant(PGF_PRODUCTION_COERCE,
                           PgfProductionCoerce,
                           &result, pool);
        PgfPArg* first = gu_seq_index(item->args, PgfPArg, 0);
        dst->coerce = first->ccat;
        break;
    }
    case PGF_PRODUCTION_EXTERN: {
        PgfProductionExtern* src = info.data;
        size_t own_lin = item->conts->lin_idx;

        bool has_seq =
            src->lins != NULL &&
            gu_seq_get(src->lins,PgfSequence*,own_lin) != NULL;
        if (has_seq) {
            result = item->prod;
            break;
        }

        PgfSequence* rebuilt = pgf_extern_seq_get(item, pool);
        size_t n_lins = item->conts->ccat->cnccat->n_lins;

        PgfProductionExtern* dst = (PgfProductionExtern*)
            gu_new_variant(PGF_PRODUCTION_EXTERN,
                           PgfProductionExtern,
                           &result, pool);
        dst->callback = src->callback;
        dst->ep = ep;
        dst->lins = gu_new_seq(PgfSequence*, n_lins, pool);

        // carry over whatever sequences already exist
        for (size_t k = 0; k < n_lins; k++) {
            PgfSequence* inherited = NULL;
            if (src->lins != NULL) {
                inherited = gu_seq_get(src->lins,PgfSequence*,k);
            }
            gu_seq_set(dst->lins,PgfSequence*,k,inherited);
        }
        gu_seq_set(dst->lins,PgfSequence*,own_lin,rebuilt);
        break;
    }
    case PGF_PRODUCTION_META: {
        PgfProductionMeta* dst =
            gu_new_variant(PGF_PRODUCTION_META,
                           PgfProductionMeta,
                           &result, pool);
        dst->ep = ep;
        dst->args = item->args;
        break;
    }
    default:
        gu_impossible();
    }

    return result;
}
static void
pgf_parsing_complete(PgfParseState* before, PgfParseState* after,
PgfItem* item, PgfExprProb *ep)
{
PgfProduction prod =
pgf_parsing_new_production(item, ep, before->ps->pool);
#ifdef PGF_COUNTS_DEBUG
before->ps->prod_full_count++;
#endif
PgfCCat* tmp_cat = pgf_parsing_get_completed(before, item->conts);
PgfCCat* cat = tmp_cat;
if (cat == NULL) {
cat = pgf_parsing_create_completed(before, item->conts,
item->inside_prob);
}
pgf_parsing_add_production(cat, prod);
#ifdef PGF_PARSER_DEBUG
GuPool* tmp_pool = gu_new_pool();
GuOut* out = gu_file_out(stderr, tmp_pool);
GuExn* err = gu_exn(NULL, type, tmp_pool);
if (tmp_cat == NULL)
gu_printf(out, err, "[%d-%d; C%d; %d; C%d]\n",
item->conts->state ? item->conts->state->offset : 0,
before->offset,
item->conts->ccat->fid,
item->conts->lin_idx,
cat->fid);
pgf_print_production(cat->fid, prod, out, err, tmp_pool);
gu_pool_free(tmp_pool);
#endif
if (tmp_cat != NULL) {
PgfItemContss* contss =
pgf_parsing_get_contss(before->conts_map, cat, before->ps->pool);
size_t n_contss = gu_seq_length(contss);
for (size_t i = 0; i < n_contss; i++) {
PgfItemConts* conts2 = gu_seq_get(contss, PgfItemConts*, i);
/* If there are continuations for
* linearization index i, then (cat, i) has
* already been predicted. Add the new
* production immediately to the agenda,
* i.e. process it. */
if (conts2) {
pgf_parsing_production(before, conts2, prod);
}
}
// The category has already been created. If it has also been
// predicted already, then process a new item for this production.
PgfParseState* state = after;
while (state != NULL) {
PgfItemContss* contss =
pgf_parsing_get_contss(state->conts_map, cat, state->ps->pool);
size_t n_contss = gu_seq_length(contss);
for (size_t i = 0; i < n_contss; i++) {
PgfItemConts* conts2 = gu_seq_get(contss, PgfItemConts*, i);
/* If there are continuations for
* linearization index i, then (cat, i) has
* already been predicted. Add the new
* production immediately to the agenda,
* i.e. process it. */
if (conts2) {
pgf_parsing_production(state, conts2, prod);
}
}
state = state->next;
}
if (cat->answers != NULL) {
pgf_result_production(before->ps, cat->answers, prod);
}
} else {
size_t n_conts = gu_buf_length(item->conts->items);
for (size_t i = 0; i < n_conts; i++) {
PgfItem* cont = gu_buf_get(item->conts->items, PgfItem*, i);
pgf_parsing_combine(before, after, cont, cat, item->conts->lin_idx);
}
}
}
/* Registers `item` as waiting for linearization `lin_idx` of `ccat` in the
 * state `before`.
 *
 * The first item that asks for this (category, linearization) pair
 * triggers the prediction: the outside probability of the continuation is
 * computed from that item and items are started for the syntactic
 * productions of the category, for the productions listed in the lexicon
 * index of `after` (when there is one) and for the productions listed in
 * the epsilon index of the concrete syntax.
 *
 * Every later item is combined right away with the categories that were
 * already generated for the continuation in `before` and in each state
 * from `after` on. */
static void
pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after,
                       PgfItem* item, PgfCCat* ccat, size_t lin_idx)
{
    PgfItemConts* conts =
        pgf_parsing_get_conts(before->conts_map,
                              ccat, lin_idx, before,
                              before->ps->pool);
    gu_buf_push(conts->items, PgfItem*, item);

    if (gu_buf_length(conts->items) != 1) {
        /* Predicted earlier: combine with whatever has been completed. */
        PgfCCat* done_here = pgf_parsing_get_completed(before, conts);
        if (done_here) {
            pgf_parsing_combine(before, after, item, done_here, lin_idx);
        }

        for (PgfParseState* st = after; st != NULL; st = st->next) {
            PgfCCat* done_later = pgf_parsing_get_completed(st, conts);
            if (done_later) {
                pgf_parsing_combine(st, st->next, item, done_later, lin_idx);
            }
        }
        return;
    }

    /* First request for this linearization of this category at the
     * current position, so predict it. */
    conts->outside_prob =
        item->inside_prob-conts->ccat->viterbi_prob+
        item->conts->outside_prob;

    size_t n_syn = ccat->n_synprods;
    PgfProductionIdx* lexicon_idx = NULL;
    if (after != NULL) {
        lexicon_idx = after->ts->fn->get_lexicon_idx(after->ts);

        // we don't know the current token.
        // probably we just compute the list of completions
        if (lexicon_idx == NULL && ccat->fid < after->ps->concr->total_cats) {
            n_syn = gu_seq_length(ccat->prods);
        }
    }

    // Top-down prediction for syntactic rules
    for (size_t k = 0; k < n_syn; k++) {
        PgfProduction prod =
            gu_seq_get(ccat->prods, PgfProduction, k);
        pgf_parsing_production(before, conts, prod);
    }

    // the key shared by both index lookups below
    PgfCFCat cfc = {ccat, lin_idx};

    // Bottom-up prediction for lexical rules
    if (lexicon_idx != NULL) {
        PgfProductionBuf* tok_prods =
            gu_map_get(lexicon_idx, &cfc, PgfProductionBuf*);
        if (tok_prods != NULL) {
            size_t n_tok = gu_buf_length(tok_prods);
            for (size_t k = 0; k < n_tok; k++) {
                PgfProduction prod =
                    gu_buf_get(tok_prods, PgfProduction, k);
                pgf_parsing_production(before, conts, prod);
            }
        }
    }

    // Bottom-up prediction for epsilon rules
    PgfProductionBuf* eps_prods =
        gu_map_get(before->ps->concr->epsilon_idx, &cfc, PgfProductionBuf*);
    if (eps_prods != NULL) {
        size_t n_eps = gu_buf_length(eps_prods);
        for (size_t k = 0; k < n_eps; k++) {
            PgfProduction prod =
                gu_buf_get(eps_prods, PgfProduction, k);
            pgf_parsing_production(before, conts, prod);
        }
    }
}
/* Lets the meta item consume the token of `after`, unless that token is
 * the empty string. A copy of `meta_item` is made, `meta_prob` is added to
 * its inside probability, and a one-token KS symbol is put in front of its
 * symbol chain (the previous current symbol is stored in the slot behind
 * the new symbol). The copy is put on the agenda of `before`. */
static void
pgf_parsing_meta_scan(PgfParseState* before, PgfParseState* after,
                      PgfItem* meta_item, prob_t meta_prob)
{
    PgfToken tok = after->ts->fn->get_token(after->ts);
    if (gu_string_eq(tok, gu_empty_string)) {
        return;
    }

    PgfItem* scanned = pgf_item_copy(meta_item, before->ps->pool, before->ps);
    scanned->inside_prob += meta_prob;

    // allocate the symbol with room for the back link behind it
    PgfSymbol older = scanned->curr_sym;
    PgfSymbolKS* sks = (PgfSymbolKS*)
        gu_alloc_variant(PGF_SYMBOL_KS,
                         sizeof(PgfSymbolKS)+sizeof(PgfSymbol),
                         gu_alignof(PgfSymbolKS),
                         &scanned->curr_sym, after->ps->pool);
    *((PgfSymbol*)(sks+1)) = older;

    sks->tokens = gu_new_seq(PgfToken, 1, after->ps->pool);
    gu_seq_set(sks->tokens, PgfToken, 0, tok);

    gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &scanned);
}
// Closure for pgf_parsing_meta_predict. The map iterator must stay the
// first member because the callback casts the GuMapItor* it receives back
// to this type.
typedef struct {
GuMapItor fn;
// the state whose agenda receives the new items
PgfParseState* state;
// the meta item that is extended
PgfItem* meta_item;
} PgfMetaPredictFn;
/* Map iterator callback: `key` is an abstract category and `value` points
 * at the probability associated with it. For every non-empty concrete
 * category of that abstract category and every linearization index, a copy
 * of the meta item is put on the agenda of the state. Each copy gets the
 * concrete category appended as an extra argument and a category symbol
 * for that argument and linearization index put in front of its symbol
 * chain; its inside probability grows by the Viterbi probability of the
 * category plus the given probability. Abstract categories without a
 * concrete counterpart are skipped. */
static void
pgf_parsing_meta_predict(GuMapItor* fn, const void* key, void* value, GuExn* err)
{
    (void) (err);

    PgfMetaPredictFn* clo = (PgfMetaPredictFn*) fn;
    PgfParseState* state  = clo->state;
    PgfItem* meta_item    = clo->meta_item;

    PgfAbsCat* abscat = (PgfAbsCat*) key;
    prob_t meta_prob  = *((prob_t*) value);

    PgfCncCat* cnccat =
        gu_map_get(state->ps->concr->cnccats, &abscat->name, PgfCncCat*);
    if (cnccat == NULL) {
        return;
    }

    size_t n_cats = gu_seq_length(cnccat->cats);
    for (size_t c = 0; c < n_cats; c++) {
        PgfCCat* ccat = gu_seq_get(cnccat->cats, PgfCCat*, c);
        if (ccat->prods == NULL) {
            // empty category
            continue;
        }

        for (size_t lin_idx = 0; lin_idx < cnccat->n_lins; lin_idx++) {
            PgfItem* extended =
                pgf_item_copy(meta_item, state->ps->pool, state->ps);
            extended->inside_prob +=
                ccat->viterbi_prob+meta_prob;

            // argument list of the meta item plus the new category
            size_t nargs = gu_seq_length(meta_item->args);
            extended->args = gu_new_seq(PgfPArg, nargs+1, state->ps->pool);
            memcpy(gu_seq_data(extended->args), gu_seq_data(meta_item->args),
                   nargs * sizeof(PgfPArg));
            gu_seq_set(extended->args, PgfPArg, nargs,
                       ((PgfPArg) { .hypos = NULL, .ccat = ccat }));

            // allocate the symbol with room for the back link behind it
            PgfSymbol older = extended->curr_sym;
            PgfSymbolCat* scat = (PgfSymbolCat*)
                gu_alloc_variant(PGF_SYMBOL_CAT,
                                 sizeof(PgfSymbolCat)+sizeof(PgfSymbol),
                                 gu_alignof(PgfSymbolCat),
                                 &extended->curr_sym, state->ps->pool);
            *((PgfSymbol*)(scat+1)) = older;
            scat->d = nargs;
            scat->r = lin_idx;

            gu_buf_heap_push(state->agenda, &pgf_item_prob_order, &extended);
        }
    }
}
static void
pgf_parsing_symbol(PgfParseState* before, PgfParseState* after,
PgfItem* item, PgfSymbol sym) {
switch (gu_variant_tag(sym)) {
case PGF_SYMBOL_CAT: {
PgfSymbolCat* scat = gu_variant_data(sym);
PgfPArg* parg = gu_seq_index(item->args, PgfPArg, scat->d);
if (parg->ccat->prods == NULL) {
// empty category
pgf_item_free(before, after, item);
return;
}
pgf_parsing_td_predict(before, after, item, parg->ccat, scat->r);
break;
}
case PGF_SYMBOL_KS: {
if (after != NULL) {
PgfSymbolKS* sks = gu_variant_data(sym);
gu_assert(item->tok_idx < gu_seq_length(sks->tokens));
PgfToken tok =
gu_seq_get(sks->tokens, PgfToken, item->tok_idx++);
if (item->tok_idx == gu_seq_length(sks->tokens)) {
item->tok_idx = 0;
pgf_item_advance(item, after->ps->pool);
}
pgf_parsing_add_transition(before, after, tok, item);
}
break;
}
case PGF_SYMBOL_KP: {
if (after != NULL) {
PgfSymbolKP* skp = gu_variant_data(sym);
size_t idx = item->tok_idx;
uint8_t alt = item->alt;
gu_assert(idx < gu_seq_length(skp->default_form));
if (idx == 0) {
PgfToken tok;
PgfItem* new_item;
tok = gu_seq_get(skp->default_form, PgfToken, 0);
new_item = pgf_item_copy(item, after->ps->pool, after->ps);
new_item->tok_idx++;
if (new_item->tok_idx == gu_seq_length(skp->default_form)) {
new_item->tok_idx = 0;
pgf_item_advance(new_item, after->ps->pool);
}
pgf_parsing_add_transition(before, after, tok, new_item);
for (size_t i = 0; i < skp->n_forms; i++) {
// XXX: do nubbing properly
PgfTokens* toks = skp->forms[i].form;
PgfTokens* toks2 = skp->default_form;
bool skip = pgf_tokens_equal(toks, toks2);
for (size_t j = 0; j < i; j++) {
PgfTokens* toks2 = skp->forms[j].form;
skip |= pgf_tokens_equal(toks, toks2);
}
if (!skip) {
tok = gu_seq_get(toks, PgfToken, 0);
new_item = pgf_item_copy(item, after->ps->pool, after->ps);
new_item->tok_idx++;
new_item->alt = i;
if (new_item->tok_idx == gu_seq_length(toks)) {
new_item->tok_idx = 0;
pgf_item_advance(new_item, after->ps->pool);
}
pgf_parsing_add_transition(before, after, tok, new_item);
}
}
} else if (alt == 0) {
PgfToken tok =
gu_seq_get(skp->default_form, PgfToken, idx);
item->tok_idx++;
if (item->tok_idx == gu_seq_length(skp->default_form)) {
item->tok_idx = 0;
pgf_item_advance(item, after->ps->pool);
}
pgf_parsing_add_transition(before, after, tok, item);
} else {
gu_assert(alt <= skp->n_forms);
PgfTokens* toks = skp->forms[alt - 1].form;
PgfToken tok = gu_seq_get(toks, PgfToken, idx);
item->tok_idx++;
if (item->tok_idx == gu_seq_length(toks)) {
item->tok_idx = 0;
pgf_item_advance(item, after->ps->pool);
}
pgf_parsing_add_transition(before, after, tok, item);
}
}
break;
}
case PGF_SYMBOL_LIT: {
if (after != NULL) {
PgfSymbolLit* slit = gu_variant_data(sym);
PgfPArg* parg = gu_seq_index(item->args, PgfPArg, slit->d);
if (parg->ccat->fid > 0 &&
parg->ccat->fid >= before->ps->concr->total_cats) {
pgf_parsing_td_predict(before, after, item, parg->ccat, slit->r);
}
else {
PgfItemConts* conts =
pgf_parsing_get_conts(before->conts_map,
parg->ccat, slit->r, before,
before->ps->pool);
gu_buf_push(conts->items, PgfItem*, item);
if (gu_buf_length(conts->items) == 1) {
/* This is the first time when we encounter this
* literal category so we must call the callback */
PgfLiteralCallback* callback =
gu_map_get(before->ps->concr->callbacks,
parg->ccat->cnccat,
PgfLiteralCallback*);
if (callback != NULL) {
PgfProduction prod;
PgfProductionExtern* pext =
gu_new_variant(PGF_PRODUCTION_EXTERN,
PgfProductionExtern,
&prod, before->ps->pool);
pext->callback = callback;
pext->ep = NULL;
pext->lins = NULL;
pgf_parsing_production(before, conts, prod);
}
} else {
/* If it has already been completed, combine. */
PgfCCat* completed =
pgf_parsing_get_completed(before, conts);
if (completed) {
pgf_parsing_combine(before, after, item, completed, slit->r);
}
PgfParseState* state = after;
while (state != NULL) {
PgfCCat* completed =
pgf_parsing_get_completed(state, conts);
if (completed) {
pgf_parsing_combine(state, state->next, item, completed, slit->r);
}
state = state->next;
}
}
}
}
break;
}
case PGF_SYMBOL_VAR:
// XXX TODO proper support
break;
case PGF_SYMBOL_NE: {
// Nothing to be done here
break;
}
default:
gu_impossible();
}
}
/*
 * Process a single parse item (pgf_parsing_proceed passes the item it
 * popped from the agenda of `before`).
 *
 * The production stored in the item is opened and handled according to
 * its tag (APPLY, COERCE, EXTERN or META). `after` may be NULL; the
 * EXTERN and META branches below test for that explicitly.
 */
static void
pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item)
{
#ifdef PGF_PARSER_DEBUG
// Debug builds only: dump the item to stderr using a short-lived pool.
GuPool* tmp_pool = gu_new_pool();
GuOut* out = gu_file_out(stderr, tmp_pool);
GuExn* err = gu_exn(NULL, type, tmp_pool);
pgf_print_item(item, before, out, err, tmp_pool);
gu_pool_free(tmp_pool);
#endif
GuVariantInfo i = gu_variant_open(item->prod);
switch (i.tag) {
case PGF_PRODUCTION_APPLY: {
PgfProductionApply* papp = i.data;
PgfCncFun* fun = papp->fun;
PgfSequence* seq = fun->lins[item->conts->lin_idx];
// seq_idx at the end of the sequence: the item is complete and can
// be released; otherwise continue with the current symbol.
if (item->seq_idx == gu_seq_length(seq)) {
pgf_parsing_complete(before, after, item, NULL);
pgf_item_free(before, after, item);
} else {
pgf_parsing_symbol(before, after, item, item->curr_sym);
}
break;
}
case PGF_PRODUCTION_COERCE: {
PgfProductionCoerce* pcoerce = i.data;
// A coercion item has only two positions: 0 (predict the coerced
// category) and 1 (complete).
switch (item->seq_idx) {
case 0:
if (pcoerce->coerce->prods == NULL) {
// empty category
pgf_item_free(before, after, item);
return;
}
pgf_parsing_td_predict(before, after, item,
pcoerce->coerce,
item->conts->lin_idx);
break;
case 1:
pgf_parsing_complete(before, after, item, NULL);
pgf_item_free(before, after, item);
break;
default:
gu_impossible();
}
break;
}
case PGF_PRODUCTION_EXTERN: {
PgfProductionExtern* pext = i.data;
PgfSequence* seq;
// If the production carries a sequence for this lin_idx it is walked
// like an APPLY production (but the symbol is read from `seq`).
if (pext->lins != NULL &&
(seq = gu_seq_get(pext->lins,PgfSequence*,item->conts->lin_idx)) != NULL) {
if (item->seq_idx == gu_seq_length(seq)) {
pgf_parsing_complete(before, after, item, NULL);
pgf_item_free(before, after, item);
} else {
PgfSymbol sym =
gu_seq_get(seq, PgfSymbol, item->seq_idx);
pgf_parsing_symbol(before, after, item, sym);
}
} else {
// No sequence: ask the callback to match the current token
// (the empty string when there is no `after` state).
PgfToken tok = (after != NULL)
? after->ts->fn->get_token(after->ts)
: gu_empty_string;
PgfExprProb *ep = NULL;
bool accepted =
pext->callback->match(before->ps->concr, item,
tok,
&ep, before->ps->out_pool);
// The callback may hand back an expression through `ep`; if it
// did, the item is completed with it.
if (ep != NULL)
pgf_parsing_complete(before, after, item, ep);
if (accepted) {
if (after != NULL) {
// Replace item->curr_sym with a new PGF_SYMBOL_KS holding the
// token. The allocation has room for one extra PgfSymbol right
// after the PgfSymbolKS, where the previous curr_sym is saved.
PgfSymbol prev = item->curr_sym;
PgfSymbolKS* sks = (PgfSymbolKS*)
gu_alloc_variant(PGF_SYMBOL_KS,
sizeof(PgfSymbolKS)+sizeof(PgfSymbol),
gu_alignof(PgfSymbolKS),
&item->curr_sym, after->ps->pool);
*((PgfSymbol*)(sks+1)) = prev;
sks->tokens = gu_new_seq(PgfToken, 1, after->ps->pool);
gu_seq_set(sks->tokens, PgfToken, 0, tok);
item->seq_idx++;
pgf_parsing_add_transition(before, after, tok, item);
}
} else {
pgf_item_free(before, after, item);
}
}
break;
}
case PGF_PRODUCTION_META: {
if (item->seq_idx == pgf_item_sequence_length(item)) {
// Only the first meta item that reaches this point for `before`
// is processed; later ones are dropped here.
if (before->meta_item != NULL)
break;
before->meta_item = item;
if (after == NULL) {
// No following state: complete the meta item with the meta
// variable. The probability is the item's inside probability
// minus the viterbi probability of every argument category.
PgfExprProb *ep = gu_new(PgfExprProb, before->ps->pool);
ep->expr = before->ps->meta_var;
ep->prob = item->inside_prob;
size_t n_args = gu_seq_length(item->args);
for (size_t i = 0; i < n_args; i++) {
PgfPArg* arg = gu_seq_index(item->args, PgfPArg, i);
ep->prob -= arg->ccat->viterbi_prob;
}
pgf_parsing_complete(before, after, item, ep);
} else {
// An INFINITY meta_token_prob disables the meta scan step.
prob_t meta_token_prob =
item->conts->ccat->cnccat->abscat->meta_token_prob;
if (meta_token_prob != INFINITY) {
pgf_parsing_meta_scan(before, after, item, meta_token_prob);
}
// Run pgf_parsing_meta_predict over every entry of the
// meta_child_probs map, when the abstract category has one.
PgfCIdMap* meta_child_probs =
item->conts->ccat->cnccat->abscat->meta_child_probs;
if (meta_child_probs != NULL) {
PgfMetaPredictFn clo = { { pgf_parsing_meta_predict }, before, item };
gu_map_iter(meta_child_probs, &clo.fn, NULL);
}
}
} else {
pgf_parsing_symbol(before, after, item, item->curr_sym);
}
break;
}
default:
gu_impossible();
}
}
/*
 * Perform one parsing step.
 *
 * Walks the chain of states starting at `state` and following `next`,
 * selects the state whose first agenda item has the lowest estimated
 * value, pops that item and processes it with pgf_parsing_item.
 *
 * Returns false when no agenda item has an estimate strictly below the
 * probability of the first entry of the expression queue (or when all
 * agendas are empty); returns true after processing one item.
 */
static bool
pgf_parsing_proceed(PgfParseState* state)
{
// Threshold to beat: the probability of the first queued expression,
// or INFINITY when the queue is empty.
prob_t best_prob = INFINITY;
if (gu_buf_length(state->ps->expr_queue) > 0) {
best_prob = gu_buf_get(state->ps->expr_queue, PgfExprState*, 0)->ep.prob;
}
PgfParseState* before = NULL;
prob_t delta_prob = 0;
PgfParseState* st = state;
while (st != NULL) {
if (gu_buf_length(st->agenda) > 0) {
PgfItem* item = gu_buf_get(st->agenda, PgfItem*, 0);
prob_t item_prob =
item->inside_prob+item->conts->outside_prob+delta_prob;
if (item_prob < best_prob) {
best_prob = item_prob;
before = st;
}
}
// delta_prob accumulates, per state walked, the difference between the
// state's viterbi_prob and that of its `next` state, scaled by the
// beam size; it is added to the estimate of items further down the chain.
prob_t state_delta =
(st->viterbi_prob-(st->next ? st->next->viterbi_prob : 0))*
state->ps->beam_size;
//prob_t lexical_prob =
// st->ts ? st->ts->lexical_prob : 0;
delta_prob += state_delta; /*fmax(state_delta, lexical_prob)*/; // the calculation of lexical_prob doesn't work properly.
st = st->next;
}
if (before == NULL)
return false;
// Temporarily reverse the `next` links of the states between `state`
// and `before` (exclusive). Afterwards `after` is the state whose
// original `next` was `before` (NULL when before == state) and its
// `next` chain leads back towards `state`.
PgfParseState* after = NULL;
st = state;
while (st != before) {
PgfParseState* tmp = st->next;
st->next = after;
after = st;
st = tmp;
}
PgfItem* item;
gu_buf_heap_pop(before->agenda, &pgf_item_prob_order, &item);
pgf_parsing_item(before, after, item);
// Undo the reversal so that the chain has its original links again.
while (after != NULL) {
PgfParseState* tmp = after->next;
after->next = before;
before = after;
after = tmp;
}
// Only the local parameter is assigned; the caller's pointer is unaffected.
state = before;
return true;
}
/*
 * Look up the "beam_size" flag among the flags of the concrete syntax.
 *
 * Returns 0 when the flag is not present. Otherwise the flag is asserted
 * to be a floating point literal and its value is returned.
 */
static prob_t
pgf_parsing_default_beam_size(PgfConcr* concr)
{
    // The temporary pool only backs the key string used for the lookup.
    // The literal is read from concr->cflags, so the pool can be released
    // as soon as the lookup is done (it used to be leaked on every call).
    GuPool* tmp_pool = gu_new_pool();
    PgfCId flag_name = gu_str_string("beam_size", tmp_pool);
    PgfLiteral lit = gu_map_get(concr->cflags, &flag_name, PgfLiteral);
    gu_pool_free(tmp_pool);

    if (gu_variant_is_null(lit))
        return 0;

    GuVariantInfo pi = gu_variant_open(lit);
    gu_assert (pi.tag == PGF_LITERAL_FLT);
    return ((PgfLiteralFlt*) pi.data)->val;
}
/*
 * Allocate and initialise the PgfParsing record shared by all states of
 * one parse.
 *
 * `pool` holds the record and everything created here; `out_pool` is only
 * remembered for later use. `heuristics` is stored as the beam size.
 * The meta variable expression (id 0) and the meta production (no
 * expression, empty argument sequence) are created up front.
 */
static PgfParsing*
pgf_new_parsing(PgfConcr* concr, double heuristics,
                GuPool* pool, GuPool* out_pool)
{
    PgfParsing* parsing = gu_new(PgfParsing, pool);

    parsing->concr      = concr;
    parsing->pool       = pool;
    parsing->out_pool   = out_pool;
    parsing->expr_queue = gu_new_buf(PgfExprState*, pool);
    parsing->max_fid    = concr->total_cats;
    parsing->free_item  = NULL;
    parsing->beam_size  = heuristics;

#ifdef PGF_COUNTS_DEBUG
    parsing->item_full_count = 0;
    parsing->item_real_count = 0;
    parsing->cont_full_count = 0;
    parsing->ccat_full_count = 0;
    parsing->prod_full_count = 0;
#endif

    // The meta variable expression, stored in parsing->meta_var.
    PgfExprMeta* meta_expr =
        gu_new_variant(PGF_EXPR_META,
                       PgfExprMeta,
                       &parsing->meta_var, pool);
    meta_expr->id = 0;

    // The meta production, stored in parsing->meta_prod.
    PgfProductionMeta* meta_production =
        gu_new_variant(PGF_PRODUCTION_META,
                       PgfProductionMeta,
                       &parsing->meta_prod, pool);
    meta_production->ep   = NULL;
    meta_production->args = gu_new_seq(PgfPArg, 0, pool);

    return parsing;
}
/*
 * Create a parse state linked to `next` and tied to the token state `ts`.
 *
 * The new state starts with an empty agenda, no meta item, fresh
 * generated-category and continuation maps and a viterbi probability of
 * 0. Its offset is one more than that of `next`, or 0 when `next` is NULL.
 */
static PgfParseState*
pgf_new_parse_state(PgfParsing* ps,
                    PgfParseState* next,
                    PgfTokenState* ts,
                    GuPool* pool)
{
    PgfParseState* fresh = gu_new(PgfParseState, pool);

    fresh->next           = next;
    fresh->agenda         = gu_new_buf(PgfItem*, pool);
    fresh->meta_item      = NULL;
    fresh->generated_cats = gu_map_type_new(PgfGenCatMap, pool);
    fresh->conts_map      = gu_map_type_new(PgfContsMap, pool);

    if (next) {
        fresh->offset = next->offset+1;
    } else {
        fresh->offset = 0;
    }

    fresh->viterbi_prob = 0;
    fresh->ps           = ps;
    fresh->ts           = ts;
    return fresh;
}
// Closure passed to gu_map_iter when computing the lexical probability
// of a token. `fn` must stay the first member: the callback receives a
// GuMapItor* and casts it back to PgfLexiconFn*.
typedef struct {
GuMapItor fn;
// token state whose lexical_prob is updated by the callback
PgfTokenState* ts;
} PgfLexiconFn;
/*
 * Map iterator callback: lower ts->lexical_prob to the smallest
 * probability found among the APPLY productions of one index entry.
 *
 * `value` points to a PgfProductionBuf* (which may be NULL); `key` and
 * `err` are not used. Productions of other kinds are ignored.
 */
static void
pgf_parser_compute_lexicon_prob(GuMapItor* fn, const void* key, void* value, GuExn* err)
{
    PgfTokenState* ts = ((PgfLexiconFn*) fn)->ts;
    PgfProductionBuf* bucket = *((PgfProductionBuf**) value);
    if (bucket == NULL)
        return;

    size_t count = gu_buf_length(bucket);
    for (size_t k = 0; k < count; k++) {
        PgfProduction candidate =
            gu_buf_get(bucket, PgfProduction, k);
        GuVariantInfo info = gu_variant_open(candidate);
        if (info.tag != PGF_PRODUCTION_APPLY)
            continue;

        PgfProductionApply* papp = info.data;
        if (papp->fun->ep->prob < ts->lexical_prob) {
            ts->lexical_prob = papp->fun->ep->prob;
        }
    }
}
// Allocate a token state of type `ty` from `pool` and initialise its
// embedded PgfTokenState through pgf_new_token_state_, using the function
// table named pgf_tsfn_<ty>. The cast to PgfTokenState* means the
// PgfTokenState member is expected to be at the start of `ty`.
#define pgf_new_token_state(ty, pool) \
(ty*) pgf_new_token_state_(&pgf_tsfn_##ty, (PgfTokenState*) gu_new(ty, pool))
/*
 * Initialise the common part of a token state: attach the function table
 * and start with an infinite lexical probability. Returns `ts` itself.
 */
static PgfTokenState*
pgf_new_token_state_(PgfTokenFn* fn, PgfTokenState* ts)
{
    ts->lexical_prob = INFINITY;
    ts->fn           = fn;
    return ts;
}
#ifdef PGF_COUNTS_DEBUG
/*
 * Print the five debug counters of `ps` on one tab-separated line on
 * standard output, in the order: item_full, item_real, cont_full,
 * ccat_full, prod_full.
 */
void pgf_parsing_print_counts(PgfParsing* ps)
{
    int items_full = ps->item_full_count;
    int items_real = ps->item_real_count;
    int conts_full = ps->cont_full_count;
    int ccats_full = ps->ccat_full_count;
    int prods_full = ps->prod_full_count;

    printf("%d\t%d\t%d\t%d\t%d\n",
           items_full, items_real, conts_full, ccats_full, prods_full);
}
#endif
// Token state for an actual input token (created by pgf_parser_next_state).
typedef struct {
// common token state; the accessors recover the container from it
PgfTokenState ts;
// the input token this state stands for
PgfToken tok;
// result of looking `tok` up in the concrete syntax's
// leftcorner_tok_idx; may be NULL
PgfProductionIdx *lexicon_idx;
} PgfRealTokenState;
/*
 * match_token implementation for real tokens: true when `tok` equals the
 * token stored in the state. `item` is not used.
 */
static bool
pgf_real_match_token(PgfTokenState* ts, PgfToken tok, PgfItem* item)
{
    PgfRealTokenState* real = gu_container(ts, PgfRealTokenState, ts);
    return gu_string_eq(real->tok, tok);
}
/* get_token implementation for real tokens: the token stored in the state. */
static PgfToken
pgf_real_get_token(PgfTokenState* ts)
{
    PgfRealTokenState* real = gu_container(ts, PgfRealTokenState, ts);
    return real->tok;
}
/* get_lexicon_idx implementation for real tokens: the index stored in the state. */
static PgfProductionIdx*
pgf_real_get_lexicon_idx(PgfTokenState* ts)
{
    PgfRealTokenState* real = gu_container(ts, PgfRealTokenState, ts);
    return real->lexicon_idx;
}
static PgfTokenFn pgf_tsfn_PgfRealTokenState = {
pgf_real_match_token,
pgf_real_get_token,
pgf_real_get_lexicon_idx
};
/*
 * Advance the parser over the token `tok`.
 *
 * A real token state is created for `tok`, its lexical probability is
 * computed from the left-corner token index (0 when nothing was found),
 * and a new parse state following `prev` is created. The parser is then
 * stepped until the new state's agenda is non-empty.
 *
 * Returns the new state, or NULL when pgf_parsing_proceed reports that
 * no further step is possible while the agenda is still empty.
 */
PgfParseState*
pgf_parser_next_state(PgfParseState* prev, PgfToken tok)
{
#ifdef PGF_COUNTS_DEBUG
    pgf_parsing_print_counts(prev->ps);
#endif

    PgfRealTokenState* real =
        pgf_new_token_state(PgfRealTokenState, prev->ps->pool);
    real->tok = tok;
    real->lexicon_idx = gu_map_get(prev->ps->concr->leftcorner_tok_idx,
                                   &tok, PgfProductionIdx*);

    if (real->lexicon_idx != NULL) {
        PgfLexiconFn clo = { { pgf_parser_compute_lexicon_prob }, &real->ts };
        gu_map_iter(real->lexicon_idx, &clo.fn, NULL);
    }

    // Still INFINITY means no APPLY production was seen for this token.
    if (real->ts.lexical_prob == INFINITY) {
        real->ts.lexical_prob = 0;
    }

    PgfParseState* fresh =
        pgf_new_parse_state(prev->ps, prev, &real->ts, prev->ps->pool);

    for (;;) {
        if (gu_buf_length(fresh->agenda) != 0)
            return fresh;
        if (!pgf_parsing_proceed(fresh))
            return NULL;
    }
}
// Token state used for word completion (see pgf_parser_completions).
typedef struct {
// common token state
PgfTokenState ts;
// enumeration handle returned to the caller of pgf_parser_completions
GuEnum en;
// prefix that candidate tokens have to start with
GuString prefix;
// completion found by the latest match; reset to NULL at the start of
// every enumeration step
PgfTokenProb* tp;
// pool passed to the current enumeration step; completions are
// allocated from it
GuPool* pool;
// parse state that is stepped to search for completions
PgfParseState* state;
} PgfPrefixTokenState;
/*
 * Build a space-separated string from the tokens of the PGF_SYMBOL_KS
 * symbol at position `seq_idx` of `seq`, starting with token `tok_idx`.
 *
 * The result is allocated from `pool`; scratch data lives in a temporary
 * pool that is released before returning. If the symbol at `seq_idx` is
 * not a PGF_SYMBOL_KS, or `seq_idx` is past the end of the sequence, the
 * result is the empty string.
 *
 * Fix: the separator is now written only between tokens. It used to be
 * written before every token with index > 0, so a call with tok_idx > 0
 * produced a string with a leading space.
 */
static GuString
pgf_get_tokens(PgfSequence* seq,
               uint16_t seq_idx, uint8_t tok_idx,
               GuPool* pool)
{
    GuPool* tmp_pool = gu_new_pool();
    GuExn* err = gu_new_exn(NULL, gu_kind(type), tmp_pool);
    GuStringBuf* sbuf = gu_string_buf(tmp_pool);
    GuOut* out = gu_string_buf_out(sbuf);

    // collect the tokens in the production
    size_t n_syms = gu_seq_length(seq);
    for (size_t sym_idx = seq_idx; sym_idx < n_syms; sym_idx++) {
        PgfSymbol sym = gu_seq_get(seq, PgfSymbol, sym_idx);
        GuVariantInfo info = gu_variant_open(sym);
        switch (info.tag) {
        case PGF_SYMBOL_KS: {
            PgfSymbolKS* symks = info.data;
            size_t n_toks = gu_seq_length(symks->tokens);
            for (size_t k = tok_idx; k < n_toks; k++) {
                // separator only between tokens, never in front of the first
                if (k > tok_idx) {
                    gu_putc(' ', out, err);
                }

                PgfToken tok = gu_seq_get(symks->tokens, PgfToken, k);
                gu_string_write(tok, out, err);
            }
            tok_idx = 0;
        }
        /* fall through */
        // NOTE(review): as in the original code, control falls into the
        // default branch, so collection stops after the first symbol.
        // The `tok_idx = 0` reset suggests a `break` may have been
        // intended here - confirm before changing, since callers use
        // the resulting strings as index keys.
        default:
            goto end;
        }
    }
end:;

    GuString tokens = gu_string_buf_freeze(sbuf, pool);
    gu_pool_free(tmp_pool);
    return tokens;
}
/*
 * match_token implementation for completion states.
 *
 * When the stored prefix is a prefix of `tok`, a PgfTokenProb describing
 * the candidate is allocated from the pool of the current enumeration
 * step and stored in the state: the token text collected from the item's
 * sequence one token before its current position, the name of the item's
 * abstract category and the sum of the item's inside and outside
 * probabilities.
 *
 * Always returns false, whether or not a candidate was recorded.
 */
static bool
pgf_prefix_match_token(PgfTokenState* ts0, PgfToken tok, PgfItem* item)
{
    PgfPrefixTokenState* pts =
        gu_container(ts0, PgfPrefixTokenState, ts);

    if (!gu_string_is_prefix(pts->prefix, tok))
        return false;

    size_t lin_idx;
    PgfSequence* seq;
    pgf_item_sequence(item, &lin_idx, &seq, pts->pool);

    // step one token back from the item's current position
    uint16_t sym_pos = item->seq_idx;
    uint8_t tok_pos = item->tok_idx;
    if (tok_pos == 0) {
        sym_pos--;
    } else {
        tok_pos--;
    }

    pts->tp = gu_new(PgfTokenProb, pts->pool);
    pts->tp->tok =
        pgf_get_tokens(seq, sym_pos, tok_pos, pts->pool);
    pts->tp->cat = item->conts->ccat->cnccat->abscat->name;
    pts->tp->prob = item->inside_prob+item->conts->outside_prob;

    return false;
}
/* get_token implementation for completion states: always the empty string. */
static PgfToken
pgf_prefix_get_token(PgfTokenState* ts)
{
    (void) ts;
    return gu_empty_string;
}
/* get_lexicon_idx implementation for completion states: there is no index. */
static PgfProductionIdx*
pgf_prefix_get_lexicon_idx(PgfTokenState* ts)
{
    (void) ts;
    return NULL;
}
static PgfTokenFn pgf_tsfn_PgfPrefixTokenState = {
pgf_prefix_match_token,
pgf_prefix_get_token,
pgf_prefix_get_lexicon_idx
};
/*
 * Enumeration step for completions.
 *
 * Steps the parser until a candidate is recorded in the state or no
 * further step is possible, then stores the candidate (or NULL) in the
 * PgfTokenProb* that `to` points to. Candidates are allocated from `pool`.
 */
static void
pgf_parser_completions_next(GuEnum* self, void* to, GuPool* pool)
{
    PgfPrefixTokenState* pts =
        gu_container(self, PgfPrefixTokenState, en);

    pts->pool = pool;
    pts->tp = NULL;

    while (pts->tp == NULL && pgf_parsing_proceed(pts->state)) {
        // keep stepping until a candidate shows up
    }

    *((PgfTokenProb**)to) = pts->tp;
}
/*
 * Create an enumeration of the tokens that can follow `prev` and start
 * with `prefix`.
 *
 * A completion token state and a new parse state following `prev` are
 * allocated from the parser's internal pool; the search itself happens
 * lazily in pgf_parser_completions_next.
 */
GuEnum*
pgf_parser_completions(PgfParseState* prev, GuString prefix)
{
#ifdef PGF_COUNTS_DEBUG
    pgf_parsing_print_counts(prev->ps);
#endif

    PgfPrefixTokenState* pts =
        pgf_new_token_state(PgfPrefixTokenState, prev->ps->pool);

    pts->prefix  = prefix;
    pts->tp      = NULL;
    pts->en.next = pgf_parser_completions_next;
    pts->state   =
        pgf_new_parse_state(prev->ps, prev, &pts->ts, prev->ps->pool);

    return &pts->en;
}
/*
 * Ordering callback for the expression queue.
 *
 * `a` and `b` point to PgfExprState* values. States are compared by the
 * sum of the outside probability of their answers and their own
 * probability; the result is -1, 0 or 1.
 */
static int
cmp_expr_state(GuOrder* self, const void* a, const void* b)
{
    PgfExprState* lhs = *((PgfExprState **) a);
    PgfExprState* rhs = *((PgfExprState **) b);

    prob_t lhs_prob = lhs->answers->outside_prob+lhs->ep.prob;
    prob_t rhs_prob = rhs->answers->outside_prob+rhs->ep.prob;

    return (lhs_prob > rhs_prob) - (lhs_prob < rhs_prob);
}
// Ordering used for all heap operations on the expression queue.
static GuOrder
pgf_expr_state_order = { cmp_expr_state };
/*
 * Queue the expression states that arise from one production.
 *
 * - APPLY / META: a state is created from the production's expression
 *   and arguments; the viterbi probability of every argument category
 *   is added to its probability.
 * - EXTERN: a state with the production's expression and an empty
 *   argument sequence.
 * - COERCE: the first n_synprods productions of the coerced category
 *   are processed recursively.
 *
 * New states are allocated from ps->pool and pushed on ps->expr_queue.
 */
static void
pgf_result_production(PgfParsing* ps,
                      PgfAnswers* answers, PgfProduction prod)
{
    GuVariantInfo info = gu_variant_open(prod);
    PgfExprState* fresh;

    switch (info.tag) {
    case PGF_PRODUCTION_APPLY: {
        PgfProductionApply* papp = info.data;
        fresh = gu_new(PgfExprState, ps->pool);
        fresh->answers = answers;
        fresh->ep = *papp->fun->ep;
        fresh->args = papp->args;
        fresh->arg_idx = 0;
        break;
    }
    case PGF_PRODUCTION_COERCE: {
        PgfProductionCoerce* pcoerce = info.data;
        PgfCCat* target = pcoerce->coerce;
        for (size_t k = 0; k < target->n_synprods; k++) {
            PgfProduction inner =
                gu_seq_get(target->prods, PgfProduction, k);
            pgf_result_production(ps, answers, inner);
        }
        return;
    }
    case PGF_PRODUCTION_EXTERN: {
        PgfProductionExtern* pext = info.data;
        fresh = gu_new(PgfExprState, ps->pool);
        fresh->answers = answers;
        fresh->ep = *pext->ep;
        fresh->args = gu_empty_seq();
        fresh->arg_idx = 0;
        gu_buf_heap_push(ps->expr_queue, &pgf_expr_state_order, &fresh);
        return;
    }
    case PGF_PRODUCTION_META: {
        PgfProductionMeta* pmeta = info.data;
        fresh = gu_new(PgfExprState, ps->pool);
        fresh->answers = answers;
        fresh->ep = *pmeta->ep;
        fresh->args = pmeta->args;
        fresh->arg_idx = 0;
        break;
    }
    default:
        gu_impossible();
        return;
    }

    // Shared tail of APPLY and META: account for the arguments, then queue.
    size_t arity = gu_seq_length(fresh->args);
    for (size_t k = 0; k < arity; k++) {
        PgfPArg* parg = gu_seq_index(fresh->args, PgfPArg, k);
        fresh->ep.prob += parg->ccat->viterbi_prob;
    }
    gu_buf_heap_push(ps->expr_queue, &pgf_expr_state_order, &fresh);
}
/*
 * Register `cont` as a continuation waiting for results of `ccat`.
 *
 * When `cont` is not NULL the viterbi probability of `ccat` is first
 * subtracted from its probability. The answers record of `ccat` is
 * created on first use, with an outside probability derived from `cont`
 * (0 when `cont` is NULL).
 *
 * For the first continuation of a category the first n_synprods
 * productions of the category are expanded. For later continuations the
 * expressions already found are applied to `cont` and queued.
 */
static void
pgf_result_predict(PgfParsing* ps,
                   PgfExprState* cont, PgfCCat* ccat)
{
    prob_t outer = 0;
    if (cont != NULL) {
        cont->ep.prob -= ccat->viterbi_prob;
        outer = cont->answers->outside_prob+cont->ep.prob;
    }

    PgfAnswers* ans = ccat->answers;
    if (ans == NULL) {
        ans = gu_new(PgfAnswers, ps->pool);
        ans->conts = gu_new_buf(PgfExprState*, ps->pool);
        ans->exprs = gu_new_buf(PgfExprProb*, ps->pool);
        ans->outside_prob = outer;
        ccat->answers = ans;
    }

    gu_buf_push(ans->conts, PgfExprState*, cont);

    if (gu_buf_length(ans->conts) != 1) {
        // Not the first continuation: replay the known expressions.
        size_t n_known = gu_buf_length(ans->exprs);
        for (size_t k = 0; k < n_known; k++) {
            PgfExprProb* known = gu_buf_get(ans->exprs, PgfExprProb*, k);
            PgfExprState* fresh = gu_new(PgfExprState, ps->pool);
            fresh->answers = cont->answers;
            fresh->ep.expr =
                gu_new_variant_i(ps->out_pool,
                                 PGF_EXPR_APP, PgfExprApp,
                                 .fun = cont->ep.expr,
                                 .arg = known->expr);
            fresh->ep.prob = cont->ep.prob+known->prob;
            fresh->args = cont->args;
            fresh->arg_idx = cont->arg_idx+1;
            gu_buf_heap_push(ps->expr_queue, &pgf_expr_state_order, &fresh);
        }
        return;
    }

    if (ccat->prods == NULL)
        return;

    // Generation
    for (size_t k = 0; k < ccat->n_synprods; k++) {
        PgfProduction prod =
            gu_seq_get(ccat->prods, PgfProduction, k);
        pgf_result_production(ps, ans, prod);
    }
}
/*
 * Check that the expression of `st` has not been recorded before among
 * the expressions of its answers.
 *
 * The recorded expressions are scanned from the newest to the oldest and
 * the scan stops at the first one with a smaller probability: trees are
 * produced in probability order, so only trees of equal probability can
 * be duplicates.
 */
static bool
pgf_parse_result_is_new(PgfExprState* st)
{
    for (size_t k = gu_buf_length(st->answers->exprs); k > 0; k--) {
        PgfExprProb* seen =
            gu_buf_get(st->answers->exprs, PgfExprProb*, k-1);
        if (seen->prob < st->ep.prob)
            return true;
        if (pgf_expr_eq(seen->expr, st->ep.expr))
            return false;
    }
    return true;
}
/*
 * Produce the next result expression of a parse.
 *
 * Returns a pointer to the expression/probability pair of the next
 * result, or NULL when the expression queue runs empty.
 */
static PgfExprProb*
pgf_parse_result_next(PgfParseResult* pr)
{
for (;;) {
// Step the parser until pgf_parsing_proceed reports that nothing
// more is to be done for now.
while (pgf_parsing_proceed(pr->state));
if (gu_buf_length(pr->state->ps->expr_queue) == 0)
break;
PgfExprState* st;
gu_buf_heap_pop(pr->state->ps->expr_queue, &pgf_expr_state_order, &st);
#ifdef PGF_PARSER_DEBUG
#ifdef PGF_RESULT_DEBUG
GuPool* tmp_pool = gu_new_pool();
GuOut* out = gu_file_out(stderr, tmp_pool);
GuWriter* wtr = gu_new_utf8_writer(out, tmp_pool);
GuExn* err = gu_exn(NULL, type, tmp_pool);
pgf_print_expr_state0(st, wtr, err, tmp_pool);
gu_pool_free(tmp_pool);
#endif
#endif
if (st->arg_idx < gu_seq_length(st->args)) {
// The state still lacks arguments: look at the category of the
// next one.
PgfCCat* ccat =
gu_seq_index(st->args, PgfPArg, st->arg_idx)->ccat;
if (ccat->fid < pr->state->ps->concr->total_cats) {
// Category id below total_cats: the argument is filled with the
// meta variable and the state is queued again.
st->ep.expr =
gu_new_variant_i(pr->state->ps->out_pool,
PGF_EXPR_APP, PgfExprApp,
.fun = st->ep.expr,
.arg = pr->state->ps->meta_var);
st->arg_idx++;
gu_buf_heap_push(pr->state->ps->expr_queue, &pgf_expr_state_order, &st);
} else {
pgf_result_predict(pr->state->ps, st, ccat);
}
} else if (pgf_parse_result_is_new(st)) {
// All arguments are in place and the tree is not a duplicate:
// record it and pass it on to every waiting continuation.
gu_buf_push(st->answers->exprs, PgfExprProb*, &st->ep);
size_t n_conts = gu_buf_length(st->answers->conts);
for (size_t i = 0; i < n_conts; i++) {
PgfExprState* st2 = gu_buf_get(st->answers->conts, PgfExprState*, i);
// A NULL continuation entry means the expression is returned to
// the caller as a result.
if (st2 == NULL) {
return &st->ep;
}
// Otherwise apply the continuation's expression to the new one
// and queue the combined state.
PgfExprState* st3 = gu_new(PgfExprState, pr->state->ps->pool);
st3->answers = st2->answers;
st3->ep.expr =
gu_new_variant_i(pr->state->ps->out_pool,
PGF_EXPR_APP, PgfExprApp,
.fun = st2->ep.expr,
.arg = st->ep.expr);
st3->ep.prob = st2->ep.prob + st->ep.prob;
st3->args = st2->args;
st3->arg_idx = st2->arg_idx+1;
gu_buf_heap_push(pr->state->ps->expr_queue, &pgf_expr_state_order, &st3);
}
}
}
return NULL;
}
/*
 * GuEnum adapter around pgf_parse_result_next: stores the next result
 * (or NULL) in the PgfExprProb* that `to` points to. `pool` is not used.
 */
static void
pgf_parse_result_enum_next(GuEnum* self, void* to, GuPool* pool)
{
    PgfParseResult* result = gu_container(self, PgfParseResult, en);
    PgfExprProb** slot = (PgfExprProb**) to;
    *slot = pgf_parse_result_next(result);
}
/*
 * Create the enumeration of result expressions for `state`.
 *
 * The enumeration record is allocated from the parser's internal pool;
 * results are computed lazily by pgf_parse_result_enum_next.
 */
PgfExprEnum*
pgf_parse_result(PgfParseState* state)
{
#ifdef PGF_COUNTS_DEBUG
    pgf_parsing_print_counts(state->ps);
#endif

    PgfParseResult* result = gu_new(PgfParseResult, state->ps->pool);
    result->en.next = pgf_parse_result_enum_next;
    result->state   = state;
    return &result->en;
}
// Currently a no-op: the whole implementation below is commented out.
// The disabled code refers to a `completed` field of the parsing
// structure that the PgfParsing definition in this file does not have.
void
pgf_parse_print_chunks(PgfParseState* state)
{
/* if (state->ps->completed == NULL) {
while (state->ps->completed == NULL) {
if (!pgf_parsing_proceed(state))
break;
}
if (state->ps->completed == NULL)
return;
}
GuPool* tmp_pool = gu_new_pool();
GuOut* out = gu_file_out(stdout, tmp_pool);
GuWriter* wtr = gu_new_utf8_writer(out, tmp_pool);
GuExn* err = gu_exn(NULL, type, tmp_pool);
PgfCCat* completed = state->ps->completed;
if (gu_seq_length(completed->prods) == 0)
return;
size_t n_args = 0;
size_t arg_idx = 0;
PgfCCat* ccat = NULL;
PgfProductionMeta* pmeta = NULL;
PgfProduction prod = gu_seq_get(completed->prods, PgfProduction, 0);
GuVariantInfo pi = gu_variant_open(prod);
switch (pi.tag) {
case PGF_PRODUCTION_APPLY:
n_args = 1;
arg_idx = 0;
ccat = completed;
break;
case PGF_PRODUCTION_META:
pmeta = pi.data;
n_args = gu_seq_length(pmeta->args);
arg_idx = 0;
ccat = gu_seq_index(pmeta->args, PgfPArg, arg_idx)->ccat;
break;
}
PgfParseState* next = NULL;
while (state != NULL) {
PgfParseState* tmp = state->next;
state->next = next;
next = state;
state = tmp;
}
int offset = 0;
state = next;
next = NULL;
while (state != NULL) {
if (state->ts != NULL)
{
if (ccat != NULL &&
offset == ((ccat->conts->state != NULL) ? ccat->conts->state->offset : 0)) {
PgfCCat *ccat2 = ccat;
while (ccat2->conts != NULL) {
ccat2 = ccat2->conts->ccat;
}
gu_putc('(', wtr, err);
gu_string_write(ccat2->cnccat->abscat->name, wtr, err);
gu_putc(' ', wtr, err);
}
gu_string_write(state->ts->tok, wtr, err);
offset++;
if (ccat != NULL &&
ccat ==
gu_map_get(state->generated_cats, ccat->conts, PgfCCat*)) {
gu_putc(')', wtr, err);
arg_idx++;
ccat =
(arg_idx >= n_args) ?
NULL :
gu_seq_index(pmeta->args, PgfPArg, arg_idx)->ccat;
}
gu_putc(' ', wtr, err);
}
PgfParseState* tmp = state->next;
state->next = next;
next = state;
state = tmp;
}
gu_putc('\n', wtr, err);
gu_pool_free(tmp_pool);*/
}
// TODO: s/CId/Cat, add the cid to Cat, make Cat the key to CncCat
/*
 * Create the initial parse state for the start category `cat` and the
 * linearization index `lin_idx`.
 *
 * Returns NULL when the concrete syntax has no such category. A negative
 * `heuristics` selects the default beam size taken from the grammar
 * flags. Parser structures are allocated from `pool`; `out_pool` is
 * handed to the parsing record for later use.
 *
 * For every concrete category of `cat` that has productions, a root
 * continuation is created (its item buffer holds a single NULL entry)
 * and one item per production, plus one meta item whose inside
 * probability is the abstract category's meta_prob, is put on the agenda.
 */
PgfParseState*
pgf_parser_init_state(PgfConcr* concr, PgfCId cat, size_t lin_idx,
                      double heuristics,
                      GuPool* pool, GuPool* out_pool)
{
    PgfCncCat* cnccat =
        gu_map_get(concr->cnccats, &cat, PgfCncCat*);
    if (cnccat == NULL)
        return NULL;

    gu_assert(lin_idx < cnccat->n_lins);

    if (heuristics < 0) {
        heuristics = pgf_parsing_default_beam_size(concr);
    }

    PgfParsing* ps =
        pgf_new_parsing(concr, heuristics, pool, out_pool);
    PgfParseState* initial =
        pgf_new_parse_state(ps, NULL, NULL, pool);

    size_t n_ccats = gu_seq_length(cnccat->cats);
    for (size_t c = 0; c < n_ccats; c++) {
        PgfCCat* ccat = gu_seq_get(cnccat->cats, PgfCCat*, c);

        // Skip missing entries and empty categories.
        if (ccat == NULL || ccat->prods == NULL)
            continue;

        PgfItemConts* root = gu_new(PgfItemConts, pool);
        root->ccat = ccat;
        root->lin_idx = lin_idx;
        root->state = NULL;
        root->items = gu_new_buf(PgfItem*, pool);
        root->outside_prob = 0;
        root->ref_count = 0;
        gu_buf_push(root->items, PgfItem*, NULL);

#ifdef PGF_COUNTS_DEBUG
        ps->cont_full_count++;
#endif

        size_t n_prods = gu_seq_length(ccat->prods);
        for (size_t p = 0; p < n_prods; p++) {
            PgfProduction prod =
                gu_seq_get(ccat->prods, PgfProduction, p);
            PgfItem* item =
                pgf_new_item(root, prod, pool, ps);
            gu_buf_heap_push(initial->agenda, &pgf_item_prob_order, &item);
        }

        PgfItem* meta_item =
            pgf_new_item(root, ps->meta_prod, pool, ps);
        meta_item->inside_prob =
            ccat->cnccat->abscat->meta_prob;
        gu_buf_heap_push(initial->agenda, &pgf_item_prob_order, &meta_item);
    }

    return initial;
}
/*
 * Register `callback` as the literal callback of the category `cat`.
 * Nothing happens when the concrete syntax has no such category.
 */
void
pgf_parser_add_literal(PgfConcr *concr, PgfCId cat,
                       PgfLiteralCallback* callback)
{
    PgfCncCat* target =
        gu_map_get(concr->cnccats, &cat, PgfCncCat*);

    if (target != NULL) {
        gu_map_put(concr->callbacks, target,
                   PgfLiteralCallback*, callback);
    }
}
// Closure passed to gu_map_iter for morphological lookup. `fn` must stay
// the first member: pgf_morpho_iter casts the GuMapItor* it receives
// back to PgfMorphoFn*.
typedef struct {
GuMapItor fn;
// tokens that a production has to spell out exactly; NULL disables
// the check so that every production is reported
PgfTokens* tokens;
// receiver of the analyses
PgfMorphoCallback* callback;
} PgfMorphoFn;
/*
 * Map iterator callback for morphological analysis.
 *
 * `key` points to a PgfCFCat and `value` to a PgfProductionBuf* (possibly
 * NULL). Each APPLY production in the buffer is reported to the callback
 * with the name and probability of its abstract function and the label of
 * the linearization index taken from the key. When the closure carries
 * tokens, only productions whose PGF_SYMBOL_KS tokens match those tokens
 * one by one, with none left over, are reported.
 */
static void
pgf_morpho_iter(GuMapItor* fn, const void* key, void* value, GuExn* err)
{
PgfMorphoFn* clo = (PgfMorphoFn*) fn;
PgfCFCat cfc = *((PgfCFCat*) key);
PgfProductionBuf* prods = *((PgfProductionBuf**) value);
if (prods == NULL)
return;
GuString analysis = cfc.ccat->cnccat->labels[cfc.lin_idx];
size_t n_prods = gu_buf_length(prods);
for (size_t i = 0; i < n_prods; i++) {
PgfProduction prod =
gu_buf_get(prods, PgfProduction, i);
// Note: from here on `i` names the variant info and shadows the
// loop index.
GuVariantInfo i = gu_variant_open(prod);
switch (i.tag) {
case PGF_PRODUCTION_APPLY: {
PgfProductionApply* papp = i.data;
if (clo->tokens != NULL) {
// match the tokens with the production
size_t pos = 0;
PgfSequence* seq = papp->fun->lins[cfc.lin_idx];
size_t len = gu_seq_length(seq);
for (size_t i = 0; i < len; i++) {
PgfSymbol sym = gu_seq_get(seq, PgfSymbol, i);
GuVariantInfo i = gu_variant_open(sym);
switch (i.tag) {
case PGF_SYMBOL_KS: {
PgfSymbolKS* symks = i.data;
size_t len = gu_seq_length(symks->tokens);
for (size_t i = 0; i < len; i++) {
// Running out of input tokens or hitting a different token
// rejects this production (`cont` is the end of the
// production loop body).
if (pos >= gu_seq_length(clo->tokens))
goto cont;
PgfToken tok1 = gu_seq_get(symks->tokens, PgfToken, i);
PgfToken tok2 = gu_seq_get(clo->tokens, PgfToken, pos++);
if (!gu_string_eq(tok1, tok2))
goto cont;
}
}
// The KS case falls through to `default`; the `continue` advances
// the enclosing loop over the symbols. Symbols of other kinds are
// therefore skipped without consuming any input token.
default:
continue;
}
}
// Input tokens left over: the production does not cover them all.
if (pos != gu_seq_length(clo->tokens))
goto cont;
}
PgfCId lemma = papp->fun->absfun->name;
prob_t prob = papp->fun->absfun->ep.prob;
clo->callback->callback(clo->callback,
lemma, analysis, prob, err);
}
}
cont:;
}
}
/*
 * Morphological lookup of the token sequence delivered by `lexer`.
 *
 * If reading the first token raises an exception, PgfExn is raised on
 * `err`. If the first token is not in the left-corner token index
 * nothing is reported. Otherwise tokens are read until the lexer raises
 * an exception, and every production of the index entry that matches the
 * collected tokens is reported through `callback`.
 */
void
pgf_lookup_morpho(PgfConcr *concr, PgfLexer *lexer,
                  PgfMorphoCallback* callback, GuExn* err)
{
    GuPool* tmp_pool = gu_local_pool();

    GuBuf* collected = gu_new_buf(PgfToken, tmp_pool);
    GuExn* lex_err = gu_new_exn(NULL, gu_kind(type), tmp_pool);

    PgfToken tok = pgf_lexer_read_token(lexer, lex_err);
    if (gu_exn_is_raised(lex_err)) {
        gu_raise(err, PgfExn);
        gu_pool_free(tmp_pool);
        return;
    }

    PgfProductionIdx* lexicon_idx =
        gu_map_get(concr->leftcorner_tok_idx, &tok, PgfProductionIdx*);

    if (lexicon_idx != NULL) {
        // Gather the first token and everything the lexer still delivers.
        for (;;) {
            gu_buf_push(collected, PgfToken, tok);
            tok = pgf_lexer_read_token(lexer, lex_err);
            if (gu_exn_is_raised(lex_err))
                break;
        }

        PgfMorphoFn clo = { { pgf_morpho_iter }, gu_buf_data_seq(collected), callback };
        gu_map_iter(lexicon_idx, &clo.fn, err);
    }

    gu_pool_free(tmp_pool);
}
// State of the full-form lexicon enumeration (see pgf_fullform_lexicon).
typedef struct {
// enumeration handle returned to the caller
GuEnum en;
// enumerates the entries of the concrete syntax's leftcorner_tok_idx
GuEnum* map_en1;
// enumerates the entries of `new_idx`
GuEnum* map_en2;
// iterator closure; pgf_fullform_iter recovers this struct from it
GuMapItor fn;
// index of the current first-token entry, re-keyed by the token strings
// of its productions; NULL when the next first-token entry has to be
// fetched
PgfLeftcornerTokIdx* new_idx;
// pool of the current enumeration step
GuPool* pool;
} PgfFullFormState;
/*
 * Map iterator callback that re-indexes productions by their token
 * string.
 *
 * `key` points to a PgfCFCat and `value` to a PgfProductionBuf* (possibly
 * NULL). For every APPLY production the token string of its sequence is
 * computed with pgf_get_tokens and the production is appended to the
 * buffer stored under that string and the same key in st->new_idx. Maps
 * and buffers are created on demand from st->pool. `err` is not used.
 */
static void
pgf_fullform_iter(GuMapItor* fn, const void* key, void* value, GuExn* err)
{
    PgfFullFormState* st = gu_container(fn, PgfFullFormState, fn);
    PgfCFCat cfc = *((PgfCFCat*) key);
    PgfProductionBuf* source = *((PgfProductionBuf**) value);
    if (source == NULL)
        return;

    size_t count = gu_buf_length(source);
    for (size_t k = 0; k < count; k++) {
        PgfProduction prod =
            gu_buf_get(source, PgfProduction, k);
        GuVariantInfo info = gu_variant_open(prod);
        if (info.tag != PGF_PRODUCTION_APPLY)
            continue;

        PgfProductionApply* papp = info.data;
        PgfSequence* seq = papp->fun->lins[cfc.lin_idx];
        GuString tokens = pgf_get_tokens(seq, 0, 0, st->pool);

        // create a new production index with keys that
        // are multiword units
        PgfProductionIdx* by_cat =
            gu_map_get(st->new_idx, &tokens, PgfProductionIdx*);
        if (by_cat == NULL) {
            by_cat = gu_map_type_new(PgfProductionIdx, st->pool);
            gu_map_put(st->new_idx, &tokens, PgfProductionIdx*, by_cat);
        }

        PgfProductionBuf* bucket =
            gu_map_get(by_cat, &cfc, PgfProductionBuf*);
        if (bucket == NULL) {
            bucket = gu_new_buf(PgfProduction, st->pool);
            gu_map_put(by_cat, &cfc, PgfProductionBuf*, bucket);
        }

        gu_buf_push(bucket, PgfProduction, prod);
    }
}
/*
 * Enumeration step of the full-form lexicon.
 *
 * Stores the next entry in the PgfFullFormEntry* that `to` points to, or
 * NULL when the left-corner token index is exhausted. Whenever the
 * current re-keyed index has been used up, the next entry of the
 * left-corner index is fetched and re-indexed into a fresh map allocated
 * from `pool`.
 */
static void
gu_fullform_enum_next(GuEnum* self, void* to, GuPool* pool)
{
    PgfFullFormState* st = gu_container(self, PgfFullFormState, en);
    PgfFullFormEntry** result = (PgfFullFormEntry**) to;

    for (;;) {
        if (st->new_idx == NULL) {
            GuMapKeyValue* kv = gu_next(st->map_en1, GuMapKeyValue*, pool);
            if (kv == NULL) {
                *result = NULL;
                return;
            }

            PgfProductionIdx* lexicon_idx = *((PgfProductionIdx**) kv->value);

            // we have an index by the first token but we must re-index
            // by taking into account the multiword units
            st->pool = pool;
            st->new_idx = gu_map_type_new(PgfLeftcornerTokIdx, pool);
            st->fn.fn = pgf_fullform_iter;
            gu_map_iter(lexicon_idx, &st->fn, NULL);
            st->map_en2 = gu_map_enum(st->new_idx, pool);
        }

        PgfFullFormEntry* entry =
            gu_next(st->map_en2, PgfFullFormEntry*, pool);
        if (entry != NULL) {
            *result = entry;
            return;
        }

        // The current re-keyed index is used up; fetch the next one.
        st->new_idx = NULL;
    }
}
/*
 * Create an enumeration over the full-form lexicon of `concr`.
 *
 * The enumeration state and the enumerator over the left-corner token
 * index are allocated from `pool`; the rest of the state is filled in
 * lazily by gu_fullform_enum_next.
 */
GuEnum*
pgf_fullform_lexicon(PgfConcr *concr, GuPool* pool)
{
    PgfFullFormState* st = gu_new(PgfFullFormState, pool);

    st->map_en1 = gu_map_enum(concr->leftcorner_tok_idx, pool);
    st->map_en2 = NULL;
    st->new_idx = NULL;
    st->pool    = NULL;
    st->en.next = gu_fullform_enum_next;

    return &st->en;
}
/* Return the string stored as the key of a full-form lexicon entry. */
GuString
pgf_fullform_get_string(PgfFullFormEntry* entry)
{
    GuString* key = (GuString*) entry->key;
    return *key;
}
/*
 * Report all analyses of a full-form lexicon entry through `callback`.
 *
 * The production index stored as the value of the entry is iterated with
 * pgf_morpho_iter without a token filter.
 */
void
pgf_fullform_get_analyses(PgfFullFormEntry* entry,
                          PgfMorphoCallback* callback, GuExn* err)
{
    PgfProductionIdx** slot = (PgfProductionIdx**) entry->value;
    PgfProductionIdx* lexicon_idx = *slot;

    PgfMorphoFn clo = { { pgf_morpho_iter }, NULL, callback };
    gu_map_iter(lexicon_idx, &clo.fn, err);
}
/*
 * Add `prod` to the left-corner token index under the first token of
 * `tokens` and the pair (ccat, lin_idx).
 *
 * The per-token map and the production buffer are created from `pool`
 * when they do not exist yet. The first element of `tokens` is read
 * without checking the length of the sequence.
 */
static void
pgf_parser_index_token(PgfConcr* concr,
                       PgfTokens* tokens,
                       PgfCCat* ccat, size_t lin_idx, PgfProduction prod,
                       GuPool *pool)
{
    PgfToken first = gu_seq_get(tokens, PgfToken, 0);

    PgfProductionIdx* by_cat =
        gu_map_get(concr->leftcorner_tok_idx, &first, PgfProductionIdx*);
    if (by_cat == NULL) {
        by_cat = gu_map_type_new(PgfProductionIdx, pool);
        gu_map_put(concr->leftcorner_tok_idx, &first, PgfProductionIdx*, by_cat);
    }

    PgfCFCat cfc = {ccat, lin_idx};
    PgfProductionBuf* bucket = gu_map_get(by_cat, &cfc, PgfProductionBuf*);
    if (bucket == NULL) {
        bucket = gu_new_buf(PgfProduction, pool);
        gu_map_put(by_cat, &cfc, PgfProductionBuf*, bucket);
    }

    gu_buf_push(bucket, PgfProduction, prod);
}
/*
 * Add `prod` to the epsilon index under the pair (ccat, lin_idx). The
 * production buffer is created from `pool` when it does not exist yet.
 */
static void
pgf_parser_index_epsilon(PgfConcr* concr,
                         PgfCCat* ccat, size_t lin_idx, PgfProduction prod,
                         GuPool *pool)
{
    PgfCFCat cfc = {ccat, lin_idx};

    PgfProductionBuf* bucket =
        gu_map_get(concr->epsilon_idx, &cfc, PgfProductionBuf*);
    if (bucket == NULL) {
        bucket = gu_new_buf(PgfProduction, pool);
        gu_map_put(concr->epsilon_idx, &cfc, PgfProductionBuf*, bucket);
    }

    gu_buf_push(bucket, PgfProduction, prod);
}
/*
 * Enter the production `prod` of `ccat` into the parser indices, once
 * per linearization index of the category.
 *
 * Only APPLY productions without arguments are indexed:
 *  - an empty sequence goes into the epsilon index;
 *  - a sequence starting with a KS symbol is indexed by the first token
 *    of that symbol;
 *  - a sequence starting with a KP symbol is indexed by the first token
 *    of its default form and of each of its alternative forms;
 *  - sequences starting with CAT, LIT, NE or VAR symbols are not indexed.
 * COERCE productions are ignored.
 */
void
pgf_parser_index(PgfConcr* concr,
                 PgfCCat* ccat, PgfProduction prod,
                 GuPool *pool)
{
    for (size_t lin_idx = 0; lin_idx < ccat->cnccat->n_lins; lin_idx++) {
        GuVariantInfo prod_info = gu_variant_open(prod);
        switch (prod_info.tag) {
        case PGF_PRODUCTION_APPLY: {
            PgfProductionApply* papp = prod_info.data;
            if (gu_seq_length(papp->args) > 0)
                break;

            PgfSequence* seq = papp->fun->lins[lin_idx];
            if (gu_seq_length(seq) == 0) {
                pgf_parser_index_epsilon(concr,
                                         ccat, lin_idx, prod,
                                         pool);
                break;
            }

            GuVariantInfo sym_info =
                gu_variant_open(gu_seq_get(seq, PgfSymbol, 0));
            switch (sym_info.tag) {
            case PGF_SYMBOL_KS: {
                PgfSymbolKS* sks = sym_info.data;
                pgf_parser_index_token(concr,
                                       sks->tokens,
                                       ccat, lin_idx, prod,
                                       pool);
                break;
            }
            case PGF_SYMBOL_KP: {
                PgfSymbolKP* skp = sym_info.data;
                pgf_parser_index_token(concr,
                                       skp->default_form,
                                       ccat, lin_idx, prod,
                                       pool);
                for (size_t f = 0; f < skp->n_forms; f++) {
                    pgf_parser_index_token(concr,
                                           skp->forms[f].form,
                                           ccat, lin_idx, prod,
                                           pool);
                }
                break;
            }
            case PGF_SYMBOL_CAT:
            case PGF_SYMBOL_LIT:
            case PGF_SYMBOL_NE:
            case PGF_SYMBOL_VAR:
                // Nothing to be done here
                break;
            default:
                gu_impossible();
            }
            break;
        }
        case PGF_PRODUCTION_COERCE:
            // Nothing to be done here
            break;
        default:
            gu_impossible();
        }
    }
}
/*
 * Compute, cache and return the viterbi probability of `ccat`.
 *
 * Categories with a negative id yield 0. A stored value of 0 means "not
 * computed yet"; any other stored value is returned as is. The value is
 * the minimum over all productions of the category: for APPLY the
 * probability of the function plus the values of all argument
 * categories, for COERCE the value of the coerced category. A category
 * without productions yields INFINITY.
 */
prob_t
pgf_ccat_set_viterbi_prob(PgfCCat* ccat) {
    if (ccat->fid < 0)
        return 0;

    if (ccat->viterbi_prob != 0)   // already computed (or in progress)
        return ccat->viterbi_prob;

    ccat->viterbi_prob = INFINITY; // set to infinity to avoid loops

    if (ccat->prods == NULL)
        return INFINITY;

    prob_t best = INFINITY;

    size_t n_prods = gu_seq_length(ccat->prods);
    for (size_t p = 0; p < n_prods; p++) {
        PgfProduction prod =
            gu_seq_get(ccat->prods, PgfProduction, p);

        prob_t cost = 0;

        GuVariantInfo info = gu_variant_open(prod);
        switch (info.tag) {
        case PGF_PRODUCTION_APPLY: {
            PgfProductionApply* papp = info.data;
            cost = papp->fun->ep->prob;

            size_t arity = gu_seq_length(papp->args);
            for (size_t a = 0; a < arity; a++) {
                PgfPArg* arg = gu_seq_index(papp->args, PgfPArg, a);
                cost += pgf_ccat_set_viterbi_prob(arg->ccat);
            }
            break;
        }
        case PGF_PRODUCTION_COERCE: {
            PgfProductionCoerce* pcoerce = info.data;
            cost = pgf_ccat_set_viterbi_prob(pcoerce->coerce);
            break;
        }
        default:
            gu_impossible();
            return 0;
        }

        if (cost < best)
            best = cost;
    }

    ccat->viterbi_prob = best;
    return ccat->viterbi_prob;
}
/*
 * Equality on PgfCFCat keys: equal when the category ids and the
 * linearization indices are the same.
 */
static bool
pgf_cfcat_eq_fn(GuEquality* self, const void* a, const void* b)
{
    PgfCFCat* lhs = (PgfCFCat *) a;
    PgfCFCat* rhs = (PgfCFCat *) b;

    if (lhs->ccat->fid != rhs->ccat->fid)
        return false;
    return lhs->lin_idx == rhs->lin_idx;
}
/*
 * Hash of a PgfCFCat key: the category id shifted left by 16 bits,
 * combined by XOR with the linearization index.
 *
 * The id is converted to the hash type before shifting. Shifting the
 * signed id directly is undefined behaviour when the id is negative or
 * when the shifted value does not fit; other code in this file treats
 * negative ids as valid. This assumes GuHash is an unsigned integer
 * type - confirm against its definition.
 */
static GuHash
pgf_cfcat_hash_fn(GuHasher* self, const void* a)
{
    PgfCFCat *x = (PgfCFCat *) a;
    GuHash fid_bits = (GuHash) x->ccat->fid;
    return ((fid_bits << 16) ^ (GuHash) x->lin_idx);
}
// Hasher for maps keyed by PgfCFCat (used by the PgfProductionIdx map
// type): equality by pgf_cfcat_eq_fn, hashing by pgf_cfcat_hash_fn.
GuHasher pgf_cfcat_hasher = {
{ pgf_cfcat_eq_fn },
pgf_cfcat_hash_fn
};