some fixes in the robust parser and a new API for literals

This commit is contained in:
kr.angelov
2012-04-12 06:55:25 +00:00
parent 13f4eef70c
commit c6c54f8815
5 changed files with 310 additions and 205 deletions

View File

@@ -212,8 +212,10 @@ extern GU_DECLARE_TYPE(PgfEpsilonIdx, GuMap);
typedef struct PgfLiteralCallback PgfLiteralCallback; typedef struct PgfLiteralCallback PgfLiteralCallback;
extern GU_DECLARE_TYPE(PgfLiteralCallback, struct); extern GU_DECLARE_TYPE(PgfLiteralCallback, struct);
typedef struct PgfItem PgfItem;
struct PgfLiteralCallback { struct PgfLiteralCallback {
bool (*match)(PgfLiteralCallback* self, int lin_idx, PgfTokens toks, bool (*match)(PgfConcr* concr, PgfItem* item, PgfToken tok,
PgfExprProb** out_ep, GuPool *pool); PgfExprProb** out_ep, GuPool *pool);
}; };

View File

@@ -36,13 +36,26 @@ pgf_lexer_next_token(PgfLexer *lexer, GuExn* err, GuPool *pool)
if (iswalpha(lexer->ucs) || if (iswalpha(lexer->ucs) ||
lexer->ucs == '\'' || lexer->ucs == '\'' ||
lexer->ucs == '_') { lexer->ucs == '_') {
int counter = 0;
do { do {
gu_ucs_write(lexer->ucs, wtr, err); gu_ucs_write(lexer->ucs, wtr, err);
if (gu_exn_is_raised(err)) if (gu_exn_is_raised(err))
goto stop; goto stop;
counter++;
lexer->ucs = gu_read_ucs(lexer->rdr, err); lexer->ucs = gu_read_ucs(lexer->rdr, err);
if (gu_exn_is_raised(err)) if (gu_exn_is_raised(err))
goto stop; goto stop;
if (lexer->ucs == '.' && counter < 3) {
// perhaps an abbreviation
gu_ucs_write(lexer->ucs, wtr, err);
if (gu_exn_is_raised(err))
goto stop;
counter = 0;
lexer->ucs = gu_read_ucs(lexer->rdr, err);
if (gu_exn_is_raised(err))
goto stop;
}
} while (iswalnum(lexer->ucs) || } while (iswalnum(lexer->ucs) ||
lexer->ucs == '\'' || lexer->ucs == '\'' ||
lexer->ucs == '_'); lexer->ucs == '_');

View File

@@ -1,4 +1,5 @@
#include <gu/read.h> #include <gu/read.h>
#include <pgf/parser.h>
#include <pgf/literals.h> #include <pgf/literals.h>
#include <wctype.h> #include <wctype.h>
@@ -10,15 +11,16 @@ GU_DEFINE_TYPE(PgfCallbacksMap, GuMap,
static bool static bool
pgf_match_string_lit(PgfLiteralCallback* self, int lin_idx, PgfTokens toks, pgf_match_string_lit(PgfConcr* concr, PgfItem* item, PgfToken tok,
PgfExprProb** out_ep, GuPool *pool) PgfExprProb** out_ep, GuPool *pool)
{ {
gu_assert(lin_idx == 0); gu_assert(pgf_item_lin_idx(item) == 0);
if (gu_seq_length(toks) == 1) { int n_syms = pgf_item_sequence_length(item);
if (n_syms == 0) {
*out_ep = NULL; *out_ep = NULL;
return true; return true;
} else if (gu_seq_length(toks) == 2) { } else if (n_syms == 1) {
PgfExprProb* ep = gu_new(PgfExprProb, pool); PgfExprProb* ep = gu_new(PgfExprProb, pool);
ep->prob = 0; ep->prob = 0;
@@ -30,7 +32,7 @@ pgf_match_string_lit(PgfLiteralCallback* self, int lin_idx, PgfTokens toks,
gu_new_variant(PGF_LITERAL_STR, gu_new_variant(PGF_LITERAL_STR,
PgfLiteralStr, PgfLiteralStr,
&expr_lit->lit, pool); &expr_lit->lit, pool);
lit_str->val = gu_seq_get(toks, PgfToken, 0); lit_str->val = tok;
*out_ep = ep; *out_ep = ep;
return false; return false;
@@ -46,22 +48,18 @@ static PgfLiteralCallback pgf_string_literal_callback =
static bool static bool
pgf_match_int_lit(PgfLiteralCallback* self, int lin_idx, PgfTokens toks, pgf_match_int_lit(PgfConcr* concr, PgfItem* item, PgfToken tok,
PgfExprProb** out_ep, GuPool *pool) PgfExprProb** out_ep, GuPool *pool)
{ {
gu_assert(lin_idx == 0); gu_assert(pgf_item_lin_idx(item) == 0);
size_t n_toks = gu_seq_length(toks);
if (n_toks == 1) {
PgfToken tok = gu_seq_get(toks, PgfToken, 0);
size_t n_syms = pgf_item_sequence_length(item);
if (n_syms == 0) {
int val; int val;
*out_ep = NULL; *out_ep = NULL;
return gu_string_to_int(tok, &val); return gu_string_to_int(tok, &val);
} else if (n_toks == 2) { } else if (n_syms == 1) {
PgfToken tok = gu_seq_get(toks, PgfToken, 0);
int val; int val;
if (!gu_string_to_int(tok, &val)) { if (!gu_string_to_int(tok, &val)) {
*out_ep = NULL; *out_ep = NULL;
@@ -95,22 +93,18 @@ static PgfLiteralCallback pgf_int_literal_callback =
static bool static bool
pgf_match_float_lit(PgfLiteralCallback* self, int lin_idx, PgfTokens toks, pgf_match_float_lit(PgfConcr* concr, PgfItem* item, PgfToken tok,
PgfExprProb** out_ep, GuPool *pool) PgfExprProb** out_ep, GuPool *pool)
{ {
gu_assert(lin_idx == 0); gu_assert(pgf_item_lin_idx(item) == 0);
size_t n_toks = gu_seq_length(toks);
if (n_toks == 1) {
PgfToken tok = gu_seq_get(toks, PgfToken, 0);
size_t n_syms = pgf_item_sequence_length(item);
if (n_syms == 0) {
double val; double val;
*out_ep = NULL; *out_ep = NULL;
return gu_string_to_double(tok, &val); return gu_string_to_double(tok, &val);
} else if (n_toks == 2) { } else if (n_syms == 1) {
PgfToken tok = gu_seq_get(toks, PgfToken, 0);
double val; double val;
if (!gu_string_to_double(tok, &val)) { if (!gu_string_to_double(tok, &val)) {
*out_ep = NULL; *out_ep = NULL;
@@ -144,34 +138,42 @@ static PgfLiteralCallback pgf_float_literal_callback =
static bool static bool
pgf_match_name_lit(PgfLiteralCallback* self, int lin_idx, PgfTokens toks, pgf_match_name_lit(PgfConcr* concr, PgfItem* item, PgfToken tok,
PgfExprProb** out_ep, GuPool *pool) PgfExprProb** out_ep, GuPool *pool)
{ {
int lin_idx;
PgfSequence seq;
pgf_item_sequence(item, &lin_idx, &seq, pool);
gu_assert(lin_idx == 0); gu_assert(lin_idx == 0);
size_t n_toks = gu_seq_length(toks);
if (n_toks == 0) {
*out_ep = NULL;
return false;
}
PgfToken tok = gu_seq_get(toks, PgfToken, n_toks-1);
GuPool* tmp_pool = gu_new_pool(); GuPool* tmp_pool = gu_new_pool();
GuReader* rdr = gu_string_reader(tok, tmp_pool);
GuExn* err = gu_new_exn(NULL, gu_kind(type), tmp_pool); GuExn* err = gu_new_exn(NULL, gu_kind(type), tmp_pool);
bool iscap = iswupper(gu_read_ucs(rdr, err)); GuString hyp = gu_str_string("-", tmp_pool);
if (!iscap && n_toks > 1) {
bool iscap = false;
if (gu_string_eq(tok, hyp)) {
iscap = true;
} else if (!gu_string_eq(tok, gu_empty_string)) {
GuReader* rdr = gu_string_reader(tok, tmp_pool);
iscap = iswupper(gu_read_ucs(rdr, err));
}
size_t n_syms = gu_seq_length(seq);
if (!iscap && n_syms > 0) {
GuStringBuf *sbuf = gu_string_buf(tmp_pool); GuStringBuf *sbuf = gu_string_buf(tmp_pool);
GuWriter* wtr = gu_string_buf_writer(sbuf); GuWriter* wtr = gu_string_buf_writer(sbuf);
for (size_t i = 0; i < n_toks-1; i++) { for (size_t i = 0; i < n_syms; i++) {
if (i > 0) if (i > 0)
gu_putc(' ', wtr, err); gu_putc(' ', wtr, err);
tok = gu_seq_get(toks, PgfToken, i); PgfSymbol sym = gu_seq_get(seq, PgfSymbol, i);
gu_assert(gu_variant_tag(sym) == PGF_SYMBOL_KS);
PgfSymbolKS* sks = gu_variant_data(sym);
PgfToken tok = gu_seq_get(sks->tokens, PgfToken, 0);
gu_string_write(tok, wtr, err); gu_string_write(tok, wtr, err);
} }

View File

@@ -7,8 +7,6 @@
#include <math.h> #include <math.h>
#include <stdlib.h> #include <stdlib.h>
typedef struct PgfItem PgfItem;
typedef GuBuf PgfItemBuf; typedef GuBuf PgfItemBuf;
typedef GuList(PgfItemBuf*) PgfItemBufs; typedef GuList(PgfItemBuf*) PgfItemBufs;
@@ -92,26 +90,26 @@ typedef GuStringMap PgfTransitions;
GU_DEFINE_TYPE(PgfTransitions, GuStringMap, GU_DEFINE_TYPE(PgfTransitions, GuStringMap,
gu_ptr_type(PgfItemBuf), &gu_null_struct); gu_ptr_type(PgfItemBuf), &gu_null_struct);
typedef struct PgfParsing PgfParsing;
typedef const struct PgfLexCallback PgfLexCallback; typedef const struct PgfLexCallback PgfLexCallback;
struct PgfLexCallback { struct PgfLexCallback {
void (*lex)(PgfLexCallback* self, PgfToken tok, PgfItem* item); void (*lex)(PgfLexCallback* self, PgfToken tok, PgfItem* item);
}; };
struct PgfParsing { typedef struct {
GuPool* pool; GuPool* pool;
GuPool* tmp_pool; GuPool* tmp_pool;
PgfConcr* concr; PgfConcr* concr;
PgfContsMap* conts_map; PgfContsMap* conts_map;
PgfGenCatMap* generated_cats; PgfGenCatMap* generated_cats;
PgfCCatBuf* completed; PgfCCatBuf* completed;
PgfLexCallback* callback; PgfLexCallback* lex_callback;
PgfItemBuf *lexicon_idx; PgfItemBuf* lexicon_idx;
PgfExprProb* meta_ep;
PgfItemBuf *metas;
PgfToken tok; PgfToken tok;
int max_fid; int max_fid;
}; } PgfParsing;
static PgfSymbol static PgfSymbol
pgf_prev_extern_sym(PgfSymbol sym) pgf_prev_extern_sym(PgfSymbol sym)
@@ -134,6 +132,99 @@ pgf_prev_extern_sym(PgfSymbol sym)
} }
} }
/* Return the lin_idx recorded in the base of `item`. */
int
pgf_item_lin_idx(PgfItem* item)
{
	int lin_idx = item->base->lin_idx;
	return lin_idx;
}
/* Number of symbols in the sequence that `item` ranges over.
 *
 *  - APPLY production:  length of fun->lins[lin_idx].
 *  - COERCE production: always 1.
 *  - EXTERN production: length of fun->lins[lin_idx] when the production
 *    has a function whose sequence at lin_idx is non-null; otherwise the
 *    number of symbols reached from item->curr_sym by repeatedly calling
 *    pgf_prev_extern_sym() until a null variant is found.
 *
 * Any other production tag triggers gu_impossible(); 0 is returned
 * after it.
 */
int
pgf_item_sequence_length(PgfItem* item)
{
	GuVariantInfo info = gu_variant_open(item->base->prod);

	if (info.tag == PGF_PRODUCTION_APPLY) {
		PgfProductionApply* papp = info.data;
		return gu_seq_length(papp->fun->lins[item->base->lin_idx]);
	}

	if (info.tag == PGF_PRODUCTION_COERCE) {
		return 1;
	}

	if (info.tag == PGF_PRODUCTION_EXTERN) {
		PgfProductionExtern* pext = info.data;

		if (pext->fun != NULL) {
			PgfSequence lin = pext->fun->lins[item->base->lin_idx];
			if (!gu_seq_is_null(lin)) {
				return gu_seq_length(lin);
			}
		}

		// No stored sequence: count the symbols chained from curr_sym.
		int n_chained = 0;
		for (PgfSymbol cur = item->curr_sym;
		     !gu_variant_is_null(cur);
		     cur = pgf_prev_extern_sym(cur)) {
			n_chained++;
		}
		return n_chained;
	}

	gu_impossible();
	return 0;
}
/* Build a sequence, allocated from `pool`, out of the symbols chained
 * from item->curr_sym.  The chain is walked from curr_sym through
 * pgf_prev_extern_sym(), so the slots are filled from the last index
 * down; the sequence size comes from pgf_item_sequence_length(item).
 */
static PgfSequence
pgf_extern_seq_get(PgfItem* item, GuPool* pool)
{
	int slot = pgf_item_sequence_length(item);
	PgfSequence result = gu_new_seq(PgfSymbol, slot, pool);

	for (PgfSymbol cur = item->curr_sym;
	     !gu_variant_is_null(cur);
	     cur = pgf_prev_extern_sym(cur)) {
		slot--;
		gu_seq_set(result, PgfSymbol, slot, cur);
	}

	return result;
}
/* Retrieve the lin_idx and the symbol sequence of `item`.
 *
 * @param item     the item to inspect
 * @param lin_idx  out: receives item->base->lin_idx
 * @param seq      out: receives the sequence
 * @param pool     pool used when a new sequence has to be built
 *
 *  - APPLY production:  *seq is fun->lins[lin_idx] (nothing is allocated).
 *  - COERCE production: *seq is a fresh one-element sequence holding a
 *    PGF_SYMBOL_CAT symbol with d = 0 and r = lin_idx.
 *  - EXTERN production: *seq is fun->lins[lin_idx] when the production
 *    has a function whose sequence there is non-null; otherwise it is
 *    built by pgf_extern_seq_get().
 *
 * Any other production tag triggers gu_impossible() and *seq is left
 * unassigned.
 */
void
pgf_item_sequence(PgfItem* item,
                  int* lin_idx, PgfSequence* seq,
                  GuPool* pool)
{
	*lin_idx = item->base->lin_idx;

	GuVariantInfo i = gu_variant_open(item->base->prod);
	switch (i.tag) {
	case PGF_PRODUCTION_APPLY: {
		PgfProductionApply* papp = i.data;
		*seq = papp->fun->lins[item->base->lin_idx];
		break;
	}
	case PGF_PRODUCTION_COERCE: {
		PgfSymbol sym =
			gu_new_variant_i(pool, PGF_SYMBOL_CAT,
						PgfSymbolCat,
						.d = 0, .r = item->base->lin_idx);
		// The elements stored below are PgfSymbol values, so the
		// sequence must be sized for PgfSymbol (not PgfSequence).
		*seq = gu_new_seq(PgfSymbol, 1, pool);
		gu_seq_set(*seq, PgfSymbol, 0, sym);
		break;
	}
	case PGF_PRODUCTION_EXTERN: {
		PgfProductionExtern* pext = i.data;
		if (pext->fun != NULL) {
			*seq = pext->fun->lins[item->base->lin_idx];
		}
		// Fall back to the symbols chained from the item when there
		// is no function or its sequence at lin_idx is null.
		if (pext->fun == NULL || gu_seq_is_null(*seq)) {
			*seq = pgf_extern_seq_get(item, pool);
		}
		break;
	}
	default:
		gu_impossible();
	}
}
#ifdef PGF_PARSER_DEBUG #ifdef PGF_PARSER_DEBUG
static void static void
pgf_print_production_args(PgfPArgs args, pgf_print_production_args(PgfPArgs args,
@@ -161,7 +252,8 @@ pgf_print_production_args(PgfPArgs args,
} }
static void static void
pgf_print_production(int fid, PgfProduction prod, GuWriter *wtr, GuExn* err) pgf_print_production(int fid, PgfProduction prod,
GuWriter *wtr, GuExn* err, GuPool* pool)
{ {
gu_printf(wtr,err,"C%d -> ",fid); gu_printf(wtr,err,"C%d -> ",fid);
@@ -198,35 +290,17 @@ pgf_print_production(int fid, PgfProduction prod, GuWriter *wtr, GuExn* err)
void void
pgf_print_symbol(PgfSymbol sym, GuWriter *wtr, GuExn *err); pgf_print_symbol(PgfSymbol sym, GuWriter *wtr, GuExn *err);
static int
pgf_print_extern_seq(PgfSymbol sym, int seq_idx,
GuWriter* wtr, GuExn* err)
{
if (gu_variant_is_null(sym))
return 0;
PgfSymbol prev = pgf_prev_extern_sym(sym);
int index = pgf_print_extern_seq(prev, seq_idx, wtr, err);
if (index == seq_idx)
gu_printf(wtr, err, " . ");
pgf_print_symbol(sym, wtr, err);
return index+1;
}
static void static void
pgf_print_item_seq(PgfCncFun *fun, PgfItem *item, pgf_print_item_seq(PgfItem *item,
GuWriter* wtr, GuExn* err) GuWriter* wtr, GuExn* err, GuPool* pool)
{ {
size_t index; int lin_idx;
PgfSequence seq; PgfSequence seq;
pgf_item_sequence(item, &lin_idx, &seq, pool);
gu_printf(wtr, err, "%d : ",item->base->lin_idx); gu_printf(wtr, err, "%d : ",lin_idx);
if (fun != NULL && size_t index;
!gu_seq_is_null(seq = fun->lins[item->base->lin_idx])) {
for (index = 0; index < gu_seq_length(seq); index++) { for (index = 0; index < gu_seq_length(seq); index++) {
if (item->seq_idx == index) if (item->seq_idx == index)
gu_printf(wtr, err, " . "); gu_printf(wtr, err, " . ");
@@ -234,17 +308,13 @@ pgf_print_item_seq(PgfCncFun *fun, PgfItem *item,
PgfSymbol *sym = gu_seq_index(seq, PgfSymbol, index); PgfSymbol *sym = gu_seq_index(seq, PgfSymbol, index);
pgf_print_symbol(*sym, wtr, err); pgf_print_symbol(*sym, wtr, err);
} }
} else {
index = pgf_print_extern_seq(item->curr_sym, item->seq_idx,
wtr, err);
}
if (item->seq_idx == index) if (item->seq_idx == index)
gu_printf(wtr, err, " ."); gu_printf(wtr, err, " .");
} }
static void static void
pgf_print_item(PgfItem* item, GuWriter* wtr, GuExn* err) pgf_print_item(PgfItem* item, GuWriter* wtr, GuExn* err, GuPool* pool)
{ {
gu_printf(wtr, err, "[C%d -> ",item->base->ccat->fid); gu_printf(wtr, err, "[C%d -> ",item->base->ccat->fid);
@@ -258,18 +328,11 @@ pgf_print_item(PgfItem* item, GuWriter* wtr, GuExn* err)
gu_printf(wtr, err, ")["); gu_printf(wtr, err, ")[");
pgf_print_production_args(item->args, wtr, err); pgf_print_production_args(item->args, wtr, err);
gu_printf(wtr, err, "]; "); gu_printf(wtr, err, "]; ");
pgf_print_item_seq(fun, item, wtr, err);
break; break;
} }
case PGF_PRODUCTION_COERCE: { case PGF_PRODUCTION_COERCE: {
gu_printf(wtr, err, "_[C%d]; %d : ", gu_printf(wtr, err, "_[C%d]; ",
gu_seq_index(item->args, PgfPArg, 0)->ccat->fid, gu_seq_index(item->args, PgfPArg, 0)->ccat->fid);
item->base->lin_idx);
if (item->seq_idx == 0)
gu_printf(wtr, err, ". ");
gu_printf(wtr, err, "<0,%d>", item->base->lin_idx);
if (item->seq_idx == 1)
gu_printf(wtr, err, " .");
break; break;
} }
case PGF_PRODUCTION_EXTERN: { case PGF_PRODUCTION_EXTERN: {
@@ -283,13 +346,13 @@ pgf_print_item(PgfItem* item, GuWriter* wtr, GuExn* err)
gu_printf(wtr, err, "["); gu_printf(wtr, err, "[");
pgf_print_production_args(item->args, wtr, err); pgf_print_production_args(item->args, wtr, err);
gu_printf(wtr, err, "]; "); gu_printf(wtr, err, "]; ");
pgf_print_item_seq(pext->fun, item, wtr, err);
break; break;
} }
default: default:
gu_impossible(); gu_impossible();
} }
pgf_print_item_seq(item, wtr, err, pool);
gu_printf(wtr, err, "]\n"); gu_printf(wtr, err, "]\n");
} }
#endif #endif
@@ -297,7 +360,7 @@ pgf_print_item(PgfItem* item, GuWriter* wtr, GuExn* err)
static void static void
pgf_parsing_add_transition(PgfParsing* parsing, PgfToken tok, PgfItem* item) pgf_parsing_add_transition(PgfParsing* parsing, PgfToken tok, PgfItem* item)
{ {
parsing->callback->lex(parsing->callback, tok, item); parsing->lex_callback->lex(parsing->lex_callback, tok, item);
} }
static PgfItemBufs* static PgfItemBufs*
@@ -492,9 +555,19 @@ pgf_parsing_combine(PgfParsing* parsing, PgfItem* cont,
return; return;
} }
bool extend = false;
GuVariantInfo i = gu_variant_open(cont->base->prod);
if (i.tag == PGF_PRODUCTION_EXTERN) {
PgfProductionExtern* pext = i.data;
if (pext->fun == NULL ||
gu_seq_is_null(pext->fun->lins[cont->base->lin_idx])) {
extend = true;
}
}
PgfItem* item = NULL; PgfItem* item = NULL;
if (!gu_variant_is_null(cont->curr_sym)) { if (!extend) {
switch (gu_variant_tag(cont->curr_sym)) { switch (gu_variant_tag(cont->curr_sym)) {
case PGF_SYMBOL_CAT: { case PGF_SYMBOL_CAT: {
PgfSymbolCat* scat = gu_variant_data(cont->curr_sym); PgfSymbolCat* scat = gu_variant_data(cont->curr_sym);
@@ -575,20 +648,8 @@ pgf_parsing_new_production(PgfItem* item, PgfExprProb *ep, GuPool *pool)
if (fun == NULL || if (fun == NULL ||
gu_seq_is_null(fun->lins[item->base->lin_idx])) { gu_seq_is_null(fun->lins[item->base->lin_idx])) {
int seq_len = 0;
PgfSymbol sym = item->curr_sym;
while (!gu_variant_is_null(sym)) {
seq_len++;
sym = pgf_prev_extern_sym(sym);
}
PgfSequence seq = PgfSequence seq =
gu_new_seq(PgfSymbol, seq_len, pool); pgf_extern_seq_get(item, pool);
sym = item->curr_sym;
while (!gu_variant_is_null(sym)) {
gu_seq_set(seq, PgfSymbol, --seq_len, sym);
sym = pgf_prev_extern_sym(sym);
}
PgfCncCat *cnccat = item->base->ccat->cnccat; PgfCncCat *cnccat = item->base->ccat->cnccat;
size_t size = GU_FLEX_SIZE(PgfCncFun, lins, cnccat->n_lins); size_t size = GU_FLEX_SIZE(PgfCncFun, lins, cnccat->n_lins);
@@ -652,7 +713,7 @@ pgf_parsing_complete(PgfParsing* parsing, PgfItem* item, PgfExprProb *ep)
item->base->ccat->fid, item->base->ccat->fid,
item->base->lin_idx, item->base->lin_idx,
cat->fid); cat->fid);
pgf_print_production(cat->fid, prod, wtr, err); pgf_print_production(cat->fid, prod, wtr, err, tmp_pool);
gu_pool_free(tmp_pool); gu_pool_free(tmp_pool);
#endif #endif
@@ -780,6 +841,16 @@ pgf_parsing_bu_predict(PgfParsing* parsing, PgfItem* item,
n_items = gu_buf_length(item->base->conts); n_items = gu_buf_length(item->base->conts);
for (size_t i = 0; i < n_items; i++) { for (size_t i = 0; i < n_items; i++) {
PgfItem *item_ = gu_buf_get(item->base->conts, PgfItem*, i); PgfItem *item_ = gu_buf_get(item->base->conts, PgfItem*, i);
#ifdef PGF_PARSER_DEBUG
GuPool* tmp_pool = gu_new_pool();
GuOut* out = gu_file_out(stderr, tmp_pool);
GuWriter* wtr = gu_new_utf8_writer(out, tmp_pool);
GuExn* err = gu_exn(NULL, type, tmp_pool);
pgf_print_item(item_, wtr, err, tmp_pool);
gu_pool_free(tmp_pool);
#endif
pgf_parsing_bu_predict(parsing, item_, metas, conts); pgf_parsing_bu_predict(parsing, item_, metas, conts);
} }
@@ -789,15 +860,6 @@ pgf_parsing_bu_predict(PgfParsing* parsing, PgfItem* item,
gu_buf_push(conts, PgfItem*, item_); gu_buf_push(conts, PgfItem*, item_);
} }
} }
#ifdef PGF_PARSER_DEBUG
GuPool* tmp_pool = gu_new_pool();
GuOut* out = gu_file_out(stderr, tmp_pool);
GuWriter* wtr = gu_new_utf8_writer(out, tmp_pool);
GuExn* err = gu_exn(NULL, type, tmp_pool);
pgf_print_item(copy, wtr, err);
gu_pool_free(tmp_pool);
#endif
} }
static void static void
@@ -942,7 +1004,7 @@ pgf_parsing_item(PgfParsing* parsing, PgfItem* item)
GuOut* out = gu_file_out(stderr, tmp_pool); GuOut* out = gu_file_out(stderr, tmp_pool);
GuWriter* wtr = gu_new_utf8_writer(out, tmp_pool); GuWriter* wtr = gu_new_utf8_writer(out, tmp_pool);
GuExn* err = gu_exn(NULL, type, tmp_pool); GuExn* err = gu_exn(NULL, type, tmp_pool);
pgf_print_item(item, wtr, err); pgf_print_item(item, wtr, err, tmp_pool);
gu_pool_free(tmp_pool); gu_pool_free(tmp_pool);
#endif #endif
@@ -992,50 +1054,25 @@ pgf_parsing_item(PgfParsing* parsing, PgfItem* item)
pgf_parsing_symbol(parsing, item, sym); pgf_parsing_symbol(parsing, item, sym);
} }
} else { } else {
PgfSymbol prev = gu_null_variant;
PgfTokens toks;
if (gu_variant_is_null(item->curr_sym) ||
gu_variant_tag(item->curr_sym) != PGF_SYMBOL_KS) {
toks = gu_new_seq(PgfToken, 1, parsing->pool);
gu_seq_set(toks, PgfToken, 0, parsing->tok);
prev = item->curr_sym;
} else {
GuVariantInfo i = gu_variant_open(item->curr_sym);
gu_assert(i.tag == PGF_SYMBOL_KS);
PgfTokens old_toks = ((PgfSymbolKS*) i.data)->tokens;
prev = pgf_prev_extern_sym(item->curr_sym);
size_t n_toks = gu_seq_length(old_toks);
toks = gu_new_seq(PgfToken, n_toks+1, parsing->pool);
for (size_t i = 0; i < n_toks; i++) {
gu_seq_set(toks, PgfToken, i,
gu_seq_get(old_toks, PgfToken, i));
}
gu_seq_set(toks, PgfToken, n_toks, parsing->tok);
}
PgfExprProb *ep = NULL;
bool accepted = bool accepted =
pext->callback->match(pext->callback, pext->callback->match(parsing->concr, item, parsing->tok,
item->base->lin_idx, toks, &ep, &parsing->meta_ep, parsing->pool);
parsing->pool);
if (ep != NULL) if (parsing->meta_ep != NULL)
pgf_parsing_complete(parsing, item, ep); pgf_parsing_complete(parsing, item, parsing->meta_ep);
if (accepted) { if (accepted) {
if (gu_variant_is_null(item->curr_sym)) PgfSymbol prev = item->curr_sym;
item->seq_idx = 1;
PgfSymbolKS* sks = (PgfSymbolKS*) PgfSymbolKS* sks = (PgfSymbolKS*)
gu_alloc_variant(PGF_SYMBOL_KS, gu_alloc_variant(PGF_SYMBOL_KS,
sizeof(PgfSymbolKS)+sizeof(PgfSymbol), sizeof(PgfSymbolKS)+sizeof(PgfSymbol),
gu_alignof(PgfSymbolKS), gu_alignof(PgfSymbolKS),
&item->curr_sym, parsing->pool); &item->curr_sym, parsing->pool);
*((PgfSymbol*)(sks+1)) = prev; *((PgfSymbol*)(sks+1)) = prev;
sks->tokens = toks; sks->tokens = gu_new_seq(PgfToken, 1, parsing->pool);
gu_seq_set(sks->tokens, PgfToken, 0, parsing->tok);
item->seq_idx++;
pgf_parsing_add_transition(parsing, parsing->tok, item); pgf_parsing_add_transition(parsing, parsing->tok, item);
} }
} }
@@ -1055,8 +1092,10 @@ pgf_new_parsing(PgfConcr* concr, PgfLexCallback* callback, int max_fid,
parsing->generated_cats = gu_map_type_new(PgfGenCatMap, out_pool); parsing->generated_cats = gu_map_type_new(PgfGenCatMap, out_pool);
parsing->conts_map = gu_map_type_new(PgfContsMap, out_pool); parsing->conts_map = gu_map_type_new(PgfContsMap, out_pool);
parsing->completed = gu_new_buf(PgfCCat*, parse_pool); parsing->completed = gu_new_buf(PgfCCat*, parse_pool);
parsing->callback = callback; parsing->lex_callback = callback;
parsing->lexicon_idx = NULL; parsing->lexicon_idx = NULL;
parsing->meta_ep = NULL;
parsing->metas = gu_new_buf(PgfItem*, out_pool);
parsing->pool = parse_pool; parsing->pool = parse_pool;
parsing->tmp_pool = out_pool; parsing->tmp_pool = out_pool;
parsing->tok = gu_empty_string; parsing->tok = gu_empty_string;
@@ -1093,11 +1132,10 @@ pgf_match_token(PgfLexCallback* self, PgfToken tok, PgfItem* item)
typedef struct { typedef struct {
GuMapItor fn; GuMapItor fn;
PgfProduction prod; PgfProduction prod;
PgfItemBuf *metas; PgfParsing *parsing;
GuPool *pool;
} PgfGetMetaFn; } PgfGetMetaFn;
void static void
pgf_parsing_get_metas(GuMapItor* fn, const void* key, void* value, pgf_parsing_get_metas(GuMapItor* fn, const void* key, void* value,
GuExn* err) GuExn* err)
{ {
@@ -1105,8 +1143,7 @@ pgf_parsing_get_metas(GuMapItor* fn, const void* key, void* value,
PgfCCat *ccat = (PgfCCat *) key; PgfCCat *ccat = (PgfCCat *) key;
PgfItemBufs* contss = *((PgfItemBufs **) value); PgfItemBufs* contss = *((PgfItemBufs **) value);
PgfProduction prod = clo->prod; PgfProduction prod = clo->prod;
PgfItemBuf *metas = clo->metas; PgfParsing *parsing = clo->parsing;
GuPool *pool = clo->pool;
size_t n_lins = gu_list_length(contss); size_t n_lins = gu_list_length(contss);
for (size_t lin_idx = 0; lin_idx < n_lins; lin_idx++) { for (size_t lin_idx = 0; lin_idx < n_lins; lin_idx++) {
@@ -1114,25 +1151,19 @@ pgf_parsing_get_metas(GuMapItor* fn, const void* key, void* value,
if (conts != NULL) { if (conts != NULL) {
PgfItem *item = PgfItem *item =
pgf_new_item(ccat, lin_idx, prod, conts, pool); pgf_new_item(ccat, lin_idx, prod, conts, parsing->pool);
gu_buf_push(metas, PgfItem*, item); gu_buf_push(parsing->metas, PgfItem*, item);
#ifdef PGF_PARSER_DEBUG
GuPool* tmp_pool = gu_new_pool();
GuOut* out = gu_file_out(stderr, tmp_pool);
GuWriter* wtr = gu_new_utf8_writer(out, tmp_pool);
GuExn* err = gu_exn(NULL, type, tmp_pool);
pgf_print_item(item, wtr, err);
gu_pool_free(tmp_pool);
#endif
} }
} }
} }
static bool static bool
pgf_match_meta(PgfLiteralCallback* self, int lin_idx, PgfTokens toks, pgf_match_meta(PgfConcr* concr, PgfItem *item, PgfToken tok,
PgfExprProb** out_ep, GuPool *pool) PgfExprProb** out_ep, GuPool *pool)
{ {
size_t n_syms = pgf_item_sequence_length(item);
if (n_syms > 0) {
PgfExprProb *ep = gu_new(PgfExprProb, pool); PgfExprProb *ep = gu_new(PgfExprProb, pool);
ep->prob = 100000000000 + rand(); ep->prob = 100000000000 + rand();
PgfExprMeta *expr_meta = PgfExprMeta *expr_meta =
@@ -1142,12 +1173,62 @@ pgf_match_meta(PgfLiteralCallback* self, int lin_idx, PgfTokens toks,
expr_meta->id = 0; expr_meta->id = 0;
*out_ep = ep; *out_ep = ep;
} else {
*out_ep = NULL;
}
if (gu_map_get(concr->lexicon_idx, &tok, GuBuf*) == NULL)
return true; return true;
else {
PgfParsing* parsing =
gu_container(out_ep, PgfParsing, meta_ep);
gu_buf_push(parsing->metas, PgfItem*, item);
return false;
}
return false;
} }
static PgfLiteralCallback pgf_meta_callback = static PgfLiteralCallback pgf_meta_callback =
{ pgf_match_meta } ; { pgf_match_meta } ;
/* Create an EXTERN production (no function, no arguments, callback
 * pgf_meta_callback) and iterate parsing->conts_map with
 * pgf_parsing_get_metas, passing that production and `parsing` in the
 * closure.
 *
 * When PGF_PARSER_DEBUG is defined and `print` is true, the items of
 * parsing->metas from the buffer length recorded before the iteration
 * onwards are printed to stderr.
 */
static void
pgf_parsing_collect_metas(PgfParsing* parsing, bool print)
{
	PgfProduction meta_prod;
	PgfProductionExtern* meta_ext =
		gu_new_variant(PGF_PRODUCTION_EXTERN,
		               PgfProductionExtern,
		               &meta_prod, parsing->pool);
	meta_ext->fun = NULL;
	meta_ext->args = gu_new_seq(PgfPArg, 0, parsing->pool);
	meta_ext->callback = &pgf_meta_callback;

#ifdef PGF_PARSER_DEBUG
	// Remember how many items were already there before collecting.
	int first_new = gu_buf_length(parsing->metas);
#endif

	PgfGetMetaFn collector = { { pgf_parsing_get_metas }, meta_prod, parsing };
	gu_map_iter(parsing->conts_map, &collector.fn, NULL);

#ifdef PGF_PARSER_DEBUG
	if (print) {
		GuPool* tmp_pool = gu_new_pool();
		GuOut* out = gu_file_out(stderr, tmp_pool);
		GuWriter* wtr = gu_new_utf8_writer(out, tmp_pool);
		GuExn* err = gu_exn(NULL, type, tmp_pool);

		size_t total = gu_buf_length(parsing->metas);
		size_t idx = first_new;
		while (idx < total) {
			PgfItem* meta_item = gu_buf_get(parsing->metas, PgfItem*, idx);
			pgf_print_item(meta_item, wtr, err, tmp_pool);
			idx++;
		}

		gu_pool_free(tmp_pool);
	}
#endif
}
PgfParse* PgfParse*
pgf_parse_token(PgfParse* parse, PgfToken tok, bool robust, GuPool* pool) pgf_parse_token(PgfParse* parse, PgfToken tok, bool robust, GuPool* pool)
{ {
@@ -1166,35 +1247,30 @@ pgf_parse_token(PgfParse* parse, PgfToken tok, bool robust, GuPool* pool)
pgf_parsing_item(parsing, item); pgf_parsing_item(parsing, item);
} }
if (robust && gu_buf_length(agenda) == 0) { if (robust) {
PgfProduction prod;
PgfProductionExtern* pext =
gu_new_variant(PGF_PRODUCTION_EXTERN,
PgfProductionExtern,
&prod, parsing->pool);
pext->fun = NULL;
pext->args = gu_new_seq(PgfPArg, 0, parsing->pool);
pext->callback = &pgf_meta_callback;
PgfItemBuf *metas = gu_new_buf(PgfItem*, parsing->pool);
PgfGetMetaFn clo2 = { { pgf_parsing_get_metas }, prod, metas, pool };
gu_map_iter(parsing->conts_map, &clo2.fn, NULL);
if (parsing->lexicon_idx != NULL) { if (parsing->lexicon_idx != NULL) {
bool flag = false;
size_t n_items = gu_buf_length(parsing->lexicon_idx); size_t n_items = gu_buf_length(parsing->lexicon_idx);
for (size_t i = 0; i < n_items; i++) { for (size_t i = 0; i < n_items; i++) {
PgfItem* item = gu_buf_get(parsing->lexicon_idx, PgfItem*, i); PgfItem* item = gu_buf_get(parsing->lexicon_idx, PgfItem*, i);
if (!pgf_parsing_has_conts(parsing->conts_map, if (!pgf_parsing_has_conts(parsing->conts_map,
item->base->ccat, item->base->lin_idx)) { item->base->ccat, item->base->lin_idx)) {
pgf_parsing_bu_predict(parsing, item, metas, agenda); if (!flag) {
pgf_parsing_collect_metas(parsing, true);
flag = true;
}
pgf_parsing_bu_predict(parsing, item, parsing->metas, agenda);
} }
} }
} else { } else {
// We have unknown word // We have unknown word
size_t n_items = gu_buf_length(metas); pgf_parsing_collect_metas(parsing, false);
size_t n_items = gu_buf_length(parsing->metas);
for (size_t i = 0; i < n_items; i++) { for (size_t i = 0; i < n_items; i++) {
PgfItem* item = gu_buf_get(metas, PgfItem*, i); PgfItem* item = gu_buf_get(parsing->metas, PgfItem*, i);
pgf_parsing_item(parsing, item); pgf_parsing_item(parsing, item);
} }
} }

View File

@@ -103,6 +103,18 @@ pgf_parse_result(PgfParse* parse, GuPool* pool);
PgfExpr PgfExpr
pgf_parse_best_result(PgfParse* parse, GuPool* pool); pgf_parse_best_result(PgfParse* parse, GuPool* pool);
/** Return the lin_idx stored in the base of @p item. */
int
pgf_item_lin_idx(PgfItem* item);
/**
 * Retrieve the lin_idx and the symbol sequence of @p item.
 *
 * @param item     the item to inspect
 * @param lin_idx  out: receives the item's lin_idx
 * @param seq      out: receives the sequence; for coerce productions, and
 *                 for extern productions without a stored sequence, a new
 *                 sequence is allocated from @p pool
 * @param pool     pool used for any sequence that has to be built
 */
void
pgf_item_sequence(PgfItem* item,
int* lin_idx, PgfSequence* seq,
GuPool* pool);
/**
 * Return the number of symbols in the sequence of @p item: the length
 * of the stored sequence for apply productions, 1 for coerce
 * productions, and for extern productions either the stored sequence
 * length or the number of symbols chained from the item's current symbol.
 */
int
pgf_item_sequence_length(PgfItem* item);
/** @} */ /** @} */
#endif // PGF_PARSER_H_ #endif // PGF_PARSER_H_