diff --git a/src/runtime/c/Makefile.am b/src/runtime/c/Makefile.am index a8d507d9d..6716ec042 100644 --- a/src/runtime/c/Makefile.am +++ b/src/runtime/c/Makefile.am @@ -39,7 +39,6 @@ pgfinclude_HEADERS = \ pgf/reader.h \ pgf/linearizer.h \ pgf/parser.h \ - pgf/lexer.h \ pgf/literals.h \ pgf/graphviz.h \ pgf/pgf.h @@ -93,8 +92,6 @@ libpgf_la_SOURCES = \ pgf/parser.h \ pgf/jit.c \ pgf/parseval.c \ - pgf/lexer.c \ - pgf/lexer.h \ pgf/literals.c \ pgf/literals.h \ pgf/reader.h \ diff --git a/src/runtime/c/gu/exn.h b/src/runtime/c/gu/exn.h index 9ee70438a..c043c4b8a 100644 --- a/src/runtime/c/gu/exn.h +++ b/src/runtime/c/gu/exn.h @@ -91,7 +91,7 @@ gu_exn_clear(GuExn* err) { GuType* gu_exn_caught(GuExn* err); -inline const void* +static inline const void* gu_exn_caught_data(GuExn* err) { return err->data.data; diff --git a/src/runtime/c/gu/utf8.h b/src/runtime/c/gu/utf8.h index 3118c4071..db7dccaf9 100644 --- a/src/runtime/c/gu/utf8.h +++ b/src/runtime/c/gu/utf8.h @@ -37,7 +37,7 @@ gu_utf8_decode(const uint8_t** utf8); void gu_in_utf8_buf(uint8_t** buf, GuIn* in, GuExn* err); -bool +static inline bool gu_is_space(uint8_t c) { return (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r' || c == ' '); diff --git a/src/runtime/c/pgf/data.h b/src/runtime/c/pgf/data.h index 2562af651..0dae42491 100644 --- a/src/runtime/c/pgf/data.h +++ b/src/runtime/c/pgf/data.h @@ -128,14 +128,16 @@ typedef struct { extern GU_DECLARE_TYPE(PgfCncCat, abstract); +typedef GuString PgfToken; +typedef GuSeq PgfTokens; + bool pgf_tokens_equal(PgfTokens* t1, PgfTokens* t2); -typedef GuSeq PgfSequence; // -> PgfSymbol -typedef GuSeq PgfSequences; +typedef GuSeq PgfSymbols; typedef struct { - PgfSequence* form; + PgfSymbols* form; /**< The form of this variant as a list of tokens. */ GuStrings* prefixes; @@ -154,19 +156,8 @@ extern GU_DECLARE_TYPE(PgfCncFunOverloadMap, GuStringMap); typedef GuMap PgfCncOverloadMap; extern GU_DECLARE_TYPE(PgfCncOverloadMap, GuMap); -typedef GuMap PgfProductionIdx; -extern GU_DECLARE_TYPE(PgfProductionIdx, GuMap); - -typedef GuMap PgfLeftcornerTokIdx; -extern GU_DECLARE_TYPE(PgfLeftcornerTokIdx, GuMap); - typedef struct PgfItem PgfItem; -typedef struct { - bool (*match)(PgfConcr* concr, PgfItem* item, PgfToken tok, - PgfExprProb** out_ep, GuPool *pool); -} PgfLiteralCallback; - typedef GuMap PgfCallbacksMap; extern GU_DECLARE_TYPE(PgfCallbacksMap, GuMap); @@ -197,7 +188,7 @@ typedef struct PgfSymbolKP /** A prefix-dependent symbol. The form that this symbol takes * depends on the form of a prefix of the following symbol. */ { - PgfSequence* default_form; + PgfSymbols* default_form; /**< Default form that this symbol takes if none of of the * variant forms is triggered. */ @@ -213,6 +204,21 @@ typedef struct { typedef struct { } PgfSymbolBIND; +typedef struct { + PgfExprProb* (*match)(PgfConcr* concr, PgfSymbol* psym, size_t lin_idx, + GuString sentence, size_t* poffset, + GuPool *pool, GuPool *out_pool); +} PgfLiteralCallback; + +typedef GuBuf PgfProductionIdx; + +typedef struct { + PgfSymbols* syms; // -> PgfSymbol + PgfProductionIdx* idx; +} PgfSequence; + +typedef GuSeq PgfSequences; + typedef struct { PgfAbsFun* absfun; PgfExprProb *ep; @@ -230,8 +236,6 @@ struct PgfConcr { GuMap* ccats; PgfCncFunOverloadMap* fun_indices; PgfCncOverloadMap* coerce_idx; - PgfProductionIdx* epsilon_idx; - PgfLeftcornerTokIdx* leftcorner_tok_idx; PgfCncFuns* cncfuns; PgfSequences* sequences; PgfCIdMap* cnccats; @@ -274,7 +278,6 @@ typedef struct PgfProductionCoerce } PgfProductionCoerce; typedef struct { - PgfLiteralCallback *callback; PgfExprProb *ep; GuSeq* lins; } PgfProductionExtern; @@ -287,8 +290,11 @@ typedef struct { typedef GuSeq PgfProductionSeq; extern GU_DECLARE_TYPE(PgfProductionSeq, abstract); -typedef GuBuf PgfProductionBuf; -extern GU_DECLARE_TYPE(PgfProductionBuf, abstract); +typedef struct { + PgfCCat* ccat; + size_t lin_idx; + PgfProductionApply* papp; +} PgfProductionIdxEntry; struct PgfCCat { PgfCncCat* cnccat; diff --git a/src/runtime/c/pgf/lexer.c b/src/runtime/c/pgf/lexer.c deleted file mode 100644 index dda0bf6e0..000000000 --- a/src/runtime/c/pgf/lexer.c +++ /dev/null @@ -1,128 +0,0 @@ -#include -#include -#include -#include - -typedef struct { - PgfLexer base; - GuIn* in; - GuPool* pool; - GuUCS ucs; -} PgfSimpleLexer; - -static void -pgf_lexer_read_ucs(PgfSimpleLexer *lexer, GuExn* err) -{ - lexer->ucs = gu_in_utf8(lexer->in, err); - if (gu_exn_is_raised(err)) { - gu_exn_clear(err); - lexer->ucs = ' '; - } -} - -static PgfToken -pgf_simple_lexer_read_token(PgfLexer *base, GuExn* err) -{ - PgfSimpleLexer* lexer = (PgfSimpleLexer*) base; - GuPool* tmp_pool = gu_new_pool(); - - GuStringBuf* buf = gu_string_buf(tmp_pool); - GuOut* out = gu_string_buf_out(buf); - - while (iswspace(lexer->ucs)) { - lexer->ucs = gu_in_utf8(lexer->in, err); - if (gu_exn_is_raised(err)) - goto stop; - } - - if (iswalpha(lexer->ucs) || - lexer->ucs == '\'' || - lexer->ucs == '_') { - int counter = 0; - do { - gu_out_utf8(lexer->ucs, out, err); - if (gu_exn_is_raised(err)) - goto stop; - counter++; - pgf_lexer_read_ucs(lexer, err); - - if (lexer->ucs == '.' && counter < 4) { - // perhaps an abreviation - gu_out_utf8(lexer->ucs, out, err); - if (gu_exn_is_raised(err)) - goto stop; - counter = 0; - pgf_lexer_read_ucs(lexer, err); - } - } while (iswalnum(lexer->ucs) || - lexer->ucs == '\'' || - lexer->ucs == '_'); - } else if (iswdigit(lexer->ucs) || lexer->ucs == '-') { - if (lexer->ucs == '-') { - gu_out_utf8(lexer->ucs, out, err); - if (gu_exn_is_raised(err)) - goto stop; - - pgf_lexer_read_ucs(lexer, err); - if (!iswdigit(lexer->ucs)) - goto stop; - } - - do { - gu_out_utf8(lexer->ucs, out, err); - if (gu_exn_is_raised(err)) - goto stop; - - pgf_lexer_read_ucs(lexer, err); - } while (iswdigit(lexer->ucs)); - - if (lexer->ucs == '.') { - gu_out_utf8(lexer->ucs, out, err); - if (gu_exn_is_raised(err)) - goto stop; - - pgf_lexer_read_ucs(lexer, err); - while (iswdigit(lexer->ucs)) { - gu_out_utf8(lexer->ucs, out, err); - if (gu_exn_is_raised(err)) - goto stop; - pgf_lexer_read_ucs(lexer, err); - } - } - } else { - gu_out_utf8(lexer->ucs, out, err); - if (gu_exn_is_raised(err)) - goto stop; - pgf_lexer_read_ucs(lexer, err); - } - -stop: - lexer->base.tok = gu_string_buf_freeze(buf, lexer->pool); - - gu_pool_free(tmp_pool); - return lexer->base.tok; -} - -PgfLexer* -pgf_new_simple_lexer(GuIn *in, GuPool *pool) -{ - PgfSimpleLexer* lexer = gu_new(PgfSimpleLexer, pool); - lexer->base.read_token = pgf_simple_lexer_read_token; - lexer->base.tok = ""; - lexer->in = in; - lexer->pool = pool; - lexer->ucs = ' '; - return ((PgfLexer*) lexer); -} - -PgfToken -pgf_lexer_read_token(PgfLexer *lexer, GuExn* err) -{ - return lexer->read_token(lexer, err); -} - -PgfToken -pgf_lexer_current_token(PgfLexer *lexer) -{ - return lexer->tok; -} diff --git a/src/runtime/c/pgf/lexer.h b/src/runtime/c/pgf/lexer.h deleted file mode 100644 index b55c07930..000000000 --- a/src/runtime/c/pgf/lexer.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef PGF_LEXER_H_ -#define PGF_LEXER_H_ - -#include -#include - -/// A single lexical token -typedef GuString PgfToken; -typedef GuSeq PgfTokens; // -> PgfToken - -typedef struct { - prob_t prob; - PgfCId cat; - PgfToken tok; -} PgfTokenProb; - -typedef struct { - PgfToken (*read_token)(); - PgfToken tok; -} PgfLexer; - -PgfLexer* -pgf_new_simple_lexer(GuIn *in, GuPool *pool); - -PgfToken -pgf_lexer_read_token(PgfLexer *lexer, GuExn* err); - -PgfToken -pgf_lexer_current_token(PgfLexer *lexer); - -#endif // PGF_LEXER_H_ diff --git a/src/runtime/c/pgf/linearizer.c b/src/runtime/c/pgf/linearizer.c index aa47bd825..0465b6e1c 100644 --- a/src/runtime/c/pgf/linearizer.c +++ b/src/runtime/c/pgf/linearizer.c @@ -455,14 +455,13 @@ pgf_lzr_concretize(PgfConcr* concr, PgfExpr expr, GuPool* pool) } void -pgf_lzr_linearize_sequence(PgfConcr* concr, PgfCncTreeApp* fapp, - PgfSequence* seq, uint16_t seq_idx, - PgfLinFuncs** fnsp) +pgf_lzr_linearize_symbols(PgfConcr* concr, PgfCncTreeApp* fapp, + PgfSymbols* syms, uint16_t sym_idx, + PgfLinFuncs** fnsp) { - size_t nsyms = gu_seq_length(seq); - PgfSymbol* syms = gu_seq_data(seq); - for (size_t i = seq_idx; i < nsyms; i++) { - PgfSymbol sym = syms[i]; + size_t nsyms = gu_seq_length(syms); + for (size_t i = sym_idx; i < nsyms; i++) { + PgfSymbol sym = gu_seq_get(syms, PgfSymbol, i); GuVariantInfo sym_i = gu_variant_open(sym); switch (sym_i.tag) { case PGF_SYMBOL_CAT: @@ -488,7 +487,7 @@ pgf_lzr_linearize_sequence(PgfConcr* concr, PgfCncTreeApp* fapp, case PGF_SYMBOL_KP: { // TODO: correct prefix-dependencies PgfSymbolKP* kp = sym_i.data; - pgf_lzr_linearize_sequence(concr, fapp, kp->default_form, 0, fnsp); + pgf_lzr_linearize_symbols(concr, fapp, kp->default_form, 0, fnsp); break; } case PGF_SYMBOL_NE: { @@ -528,9 +527,7 @@ pgf_lzr_linearize(PgfConcr* concr, PgfCncTree ctree, size_t lin_idx, PgfLinFuncs } gu_require(lin_idx < fun->n_lins); - - PgfSequence* seq = fun->lins[lin_idx]; - pgf_lzr_linearize_sequence(concr, fapp, seq, 0, fnsp); + pgf_lzr_linearize_symbols(concr, fapp, fun->lins[lin_idx]->syms, 0, fnsp); if (fns->end_phrase) { fns->end_phrase(fnsp, @@ -681,7 +678,7 @@ pgf_lzr_linearize_simple(PgfConcr* concr, PgfCncTree ctree, } GuString -pgf_get_tokens(PgfSequence* seq, uint16_t seq_idx, GuPool* pool) +pgf_get_tokens(PgfSymbols* syms, uint16_t sym_idx, GuPool* pool) { GuPool* tmp_pool = gu_new_pool(); GuExn* err = gu_new_exn(NULL, gu_kind(type), tmp_pool); @@ -695,7 +692,7 @@ pgf_get_tokens(PgfSequence* seq, uint16_t seq_idx, GuPool* pool) .err = err }; - pgf_lzr_linearize_sequence(NULL, NULL, seq, seq_idx, &flin.funcs); + pgf_lzr_linearize_symbols(NULL, NULL, syms, sym_idx, &flin.funcs); GuString tokens = gu_ok(err) ? gu_string_buf_freeze(sbuf, pool) : ""; diff --git a/src/runtime/c/pgf/linearizer.h b/src/runtime/c/pgf/linearizer.h index 63ab2705a..1cc244575 100644 --- a/src/runtime/c/pgf/linearizer.h +++ b/src/runtime/c/pgf/linearizer.h @@ -76,5 +76,5 @@ pgf_lzr_linearize_simple(PgfConcr* concr, PgfCncTree ctree, #ifdef PGF_PARSER_H_ // Used internally in the parser GuString -pgf_get_tokens(PgfSequence* seq, uint16_t seq_idx, GuPool* pool); +pgf_get_tokens(PgfSymbols* sym, uint16_t sym_idx, GuPool* pool); #endif diff --git a/src/runtime/c/pgf/literals.c b/src/runtime/c/pgf/literals.c index ad8d900cc..2043dd24c 100644 --- a/src/runtime/c/pgf/literals.c +++ b/src/runtime/c/pgf/literals.c @@ -11,48 +11,41 @@ GU_DEFINE_TYPE(PgfCallbacksMap, GuMap, gu_ptr_type(PgfLiteralCallback), &gu_null_struct); -static bool -pgf_match_string_lit(PgfConcr* concr, PgfItem* item, PgfToken tok, - PgfExprProb** out_ep, GuPool *pool) +static PgfExprProb* +pgf_match_string_lit(PgfConcr* concr, PgfSymbol* psym, size_t lin_idx, + GuString sentence, size_t* poffset, + GuPool *pool, GuPool *out_pool) { - GuPool* tmp_pool = gu_new_pool(); - - size_t lin_idx; - PgfSequence* seq; - pgf_item_sequence(item, &lin_idx, &seq, tmp_pool); gu_assert(lin_idx == 0); - bool accepted = false; - int n_syms = gu_seq_length(seq); - if (n_syms == 0) { - *out_ep = NULL; - accepted = true; - } else if (n_syms == 1) { - PgfExprProb* ep = gu_new(PgfExprProb, pool); + size_t offset = *poffset; + while (!gu_is_space(sentence[offset])) + offset++; + + size_t len = offset - *poffset; + if (len > 0) { + PgfExprProb* ep = gu_new(PgfExprProb, out_pool); ep->prob = 0; - PgfSymbolKS* sks = - gu_variant_data(gu_seq_get(seq, PgfSymbol, 0)); - PgfExprLit *expr_lit = gu_new_variant(PGF_EXPR_LIT, PgfExprLit, - &ep->expr, pool); + &ep->expr, out_pool); + PgfLiteralStr *lit_str = gu_new_flex_variant(PGF_LITERAL_STR, PgfLiteralStr, - val, strlen(sks->token)+1, - &expr_lit->lit, pool); - strcpy(lit_str->val, sks->token); + val, len+1, + &expr_lit->lit, out_pool); + memcpy(lit_str->val, sentence+*poffset, len); + lit_str->val[len] = 0; - *out_ep = ep; - accepted = false; + pgf_add_extern_tok(psym, lit_str->val, pool); + *poffset = offset; + return ep; } else { - *out_ep = NULL; + return NULL; } - - gu_pool_free(tmp_pool); - return accepted; } static PgfLiteralCallback pgf_string_literal_callback = @@ -60,55 +53,46 @@ static PgfLiteralCallback pgf_string_literal_callback = -static bool -pgf_match_int_lit(PgfConcr* concr, PgfItem* item, PgfToken tok, - PgfExprProb** out_ep, GuPool *pool) +static PgfExprProb* +pgf_match_int_lit(PgfConcr* concr, PgfSymbol* psym, size_t lin_idx, + GuString sentence, size_t* poffset, + GuPool *pool, GuPool *out_pool) { - GuPool* tmp_pool = gu_new_pool(); - - size_t lin_idx; - PgfSequence* seq; - pgf_item_sequence(item, &lin_idx, &seq, tmp_pool); gu_assert(lin_idx == 0); - bool accepted = false; - int n_syms = gu_seq_length(seq); - if (n_syms == 0) { - int val; + size_t offset = *poffset; + while (!gu_is_space(sentence[offset])) + offset++; - *out_ep = NULL; - accepted = gu_string_to_int(tok, &val); - } else if (n_syms == 1) { - PgfSymbolKS* sks = - gu_variant_data(gu_seq_get(seq, PgfSymbol, 0)); + size_t len = offset - *poffset; + if (len > 0) { + PgfToken tok = gu_malloc(pool, len+1); + memcpy((char*) tok, sentence+*poffset, len); + ((char*) tok)[len] = 0; int val; - if (!gu_string_to_int(sks->token, &val)) { - *out_ep = NULL; - } else { - PgfExprProb* ep = gu_new(PgfExprProb, pool); - ep->prob = 0; + if (!gu_string_to_int(tok, &val)) + return NULL; - PgfExprLit *expr_lit = - gu_new_variant(PGF_EXPR_LIT, - PgfExprLit, - &ep->expr, pool); - PgfLiteralInt *lit_int = - gu_new_variant(PGF_LITERAL_INT, - PgfLiteralInt, - &expr_lit->lit, pool); - lit_int->val = val; + PgfExprProb* ep = gu_new(PgfExprProb, pool); + ep->prob = 0; - *out_ep = ep; - } - - accepted = false; + PgfExprLit *expr_lit = + gu_new_variant(PGF_EXPR_LIT, + PgfExprLit, + &ep->expr, pool); + PgfLiteralInt *lit_int = + gu_new_variant(PGF_LITERAL_INT, + PgfLiteralInt, + &expr_lit->lit, pool); + lit_int->val = val; + + pgf_add_extern_tok(psym, tok, pool); + *poffset = offset; + return ep; } else { - *out_ep = NULL; + return NULL; } - - gu_pool_free(tmp_pool); - return accepted; } static PgfLiteralCallback pgf_int_literal_callback = @@ -116,55 +100,46 @@ static PgfLiteralCallback pgf_int_literal_callback = -static bool -pgf_match_float_lit(PgfConcr* concr, PgfItem* item, PgfToken tok, - PgfExprProb** out_ep, GuPool *pool) +static PgfExprProb* +pgf_match_float_lit(PgfConcr* concr, PgfSymbol* psym, size_t lin_idx, + GuString sentence, size_t* poffset, + GuPool *pool, GuPool *out_pool) { - GuPool* tmp_pool = gu_new_pool(); - - size_t lin_idx; - PgfSequence* seq; - pgf_item_sequence(item, &lin_idx, &seq, tmp_pool); gu_assert(lin_idx == 0); - bool accepted = false; - int n_syms = gu_seq_length(seq); - if (n_syms == 0) { - double val; + size_t offset = *poffset; + while (!gu_is_space(sentence[offset])) + offset++; - *out_ep = NULL; - accepted = gu_string_to_double(tok, &val); - } else if (n_syms == 1) { - PgfSymbolKS* sks = - gu_variant_data(gu_seq_get(seq, PgfSymbol, 0)); + size_t len = offset - *poffset; + if (len > 0) { + PgfToken tok = gu_malloc(pool, len+1); + memcpy((char*) tok, sentence+*poffset, len); + ((char*) tok)[len] = 0; double val; - if (!gu_string_to_double(sks->token, &val)) { - *out_ep = NULL; - } else { - PgfExprProb* ep = gu_new(PgfExprProb, pool); - ep->prob = 0; + if (!gu_string_to_double(tok, &val)) + return NULL; - PgfExprLit *expr_lit = - gu_new_variant(PGF_EXPR_LIT, - PgfExprLit, - &ep->expr, pool); - PgfLiteralFlt *lit_flt = - gu_new_variant(PGF_LITERAL_FLT, - PgfLiteralFlt, - &expr_lit->lit, pool); - lit_flt->val = val; + PgfExprProb* ep = gu_new(PgfExprProb, pool); + ep->prob = 0; - *out_ep = ep; - } - - accepted = false; + PgfExprLit *expr_lit = + gu_new_variant(PGF_EXPR_LIT, + PgfExprLit, + &ep->expr, pool); + PgfLiteralFlt *lit_flt = + gu_new_variant(PGF_LITERAL_FLT, + PgfLiteralFlt, + &expr_lit->lit, pool); + lit_flt->val = val; + + pgf_add_extern_tok(psym, tok, pool); + *poffset = offset; + return ep; } else { - *out_ep = NULL; + return NULL; } - - gu_pool_free(tmp_pool); - return accepted; } static PgfLiteralCallback pgf_float_literal_callback = @@ -172,45 +147,49 @@ static PgfLiteralCallback pgf_float_literal_callback = -static bool -pgf_match_name_lit(PgfConcr* concr, PgfItem* item, PgfToken tok, - PgfExprProb** out_ep, GuPool *pool) +static PgfExprProb* +pgf_match_name_lit(PgfConcr* concr, PgfSymbol* psym, size_t lin_idx, + GuString sentence, size_t* poffset, + GuPool *pool, GuPool *out_pool) { - GuPool* tmp_pool = gu_new_pool(); - - size_t lin_idx; - PgfSequence* seq; - pgf_item_sequence(item, &lin_idx, &seq, tmp_pool); - gu_assert(lin_idx == 0); + GuPool* tmp_pool = gu_new_pool(); + GuStringBuf *sbuf = gu_string_buf(tmp_pool); + GuOut* out = gu_string_buf_out(sbuf); GuExn* err = gu_new_exn(NULL, gu_kind(type), tmp_pool); - - bool iscap = false; - if (strcmp(tok, "-") == 0) { - iscap = true; - } else if (*tok) { - GuIn* in = gu_string_in(tok, tmp_pool); - iscap = iswupper(gu_in_utf8(in, err)); - } - size_t n_syms = gu_seq_length(seq); - if (!iscap && n_syms > 0) { - GuStringBuf *sbuf = gu_string_buf(tmp_pool); - GuOut* out = gu_string_buf_out(sbuf); + size_t offset = *poffset; - for (size_t i = 0; i < n_syms; i++) { - if (i > 0) - gu_putc(' ', out, err); - - PgfSymbol sym = gu_seq_get(seq, PgfSymbol, i); - gu_assert(gu_variant_tag(sym) == PGF_SYMBOL_KS); - PgfSymbolKS* sks = gu_variant_data(sym); - - gu_string_write(sks->token, out, err); + int i = 0; + while (iswupper(sentence[offset])) { + size_t len = 0; + while (!gu_is_space(sentence[offset+len])) { + len++; } - PgfExprProb* ep = gu_new(PgfExprProb, pool); + PgfToken tok = gu_malloc(pool, len+1); + memcpy((char*) tok, sentence+offset, len); + ((char*) tok)[len] = 0; + + pgf_add_extern_tok(psym, tok, pool); + + if (i > 0) + gu_putc(' ', out, err); + gu_string_write(tok, out, err); + + i++; + + offset += len; + *poffset = offset; + + while (gu_is_space(sentence[offset])) + offset++; + } + + PgfExprProb* ep = NULL; + if (i > 0) { + ep = gu_new(PgfExprProb, pool); ep->prob = 0; PgfExprApp *expr_app = @@ -235,14 +214,11 @@ pgf_match_name_lit(PgfConcr* concr, PgfItem* item, PgfToken tok, val, strlen(val)+1, &expr_lit->lit, pool); strcpy(lit_str->val, val); - *out_ep = ep; - } else { - *out_ep = NULL; } gu_pool_free(tmp_pool); - - return iscap; + + return ep; } PgfLiteralCallback pgf_nerc_literal_callback = diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c index 0ab757a62..476dda827 100644 --- a/src/runtime/c/pgf/parser.c +++ b/src/runtime/c/pgf/parser.c @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -14,6 +15,8 @@ typedef GuBuf PgfItemBuf; static GU_DEFINE_TYPE(PgfItemBuf, abstract, _); +typedef struct PgfParseState PgfParseState; + struct PgfItemConts { PgfCCat* ccat; size_t lin_idx; @@ -44,10 +47,14 @@ typedef struct { PgfConcr* concr; GuPool* pool; // this pool is used for structures internal to the parser GuPool* out_pool; // this pool is used for the allocating the final abstract trees - GuBuf* expr_queue; + GuString sentence; // the sentence to be parsed + GuBuf* expr_queue; // during the extraction of abstract trees we push them in this queue PgfExpr meta_var; PgfProduction meta_prod; int max_fid; + PgfParseState *before; + PgfParseState *after; + PgfExprEnum en; // enumeration for the generated trees #ifdef PGF_COUNTS_DEBUG int item_full_count; int item_real_count; @@ -59,33 +66,15 @@ typedef struct { prob_t beam_size; } PgfParsing; -typedef struct { - PgfCCat* ccat; - size_t lin_idx; -} PgfCFCat; - -static GU_DEFINE_TYPE(PgfCFCat, struct, - GU_MEMBER(PgfCFCat, ccat, PgfCCat), - GU_MEMBER(PgfCFCat, lin_idx, size_t)); - -extern GuHasher pgf_cfcat_hasher; - -GU_DEFINE_TYPE(PgfProductionIdx, GuMap, - gu_type(PgfCFCat), &pgf_cfcat_hasher, - gu_ptr_type(PgfProductionBuf), &gu_null_struct); - -typedef struct PgfTokenState PgfTokenState; +typedef enum { BIND_NONE, BIND_HARD, BIND_SOFT } BIND_TYPE; typedef struct { - bool (*match_token)(PgfTokenState* ts, PgfToken tok, PgfItem* item); - PgfToken (*get_token)(PgfTokenState* ts); - PgfProductionIdx* (*get_lexicon_idx)(PgfTokenState* ts); -} PgfTokenFn; + PgfProductionIdx* idx; + BIND_TYPE bind_type; + size_t offset; +} PgfLexiconIdxEntry; -struct PgfTokenState { - PgfTokenFn* fn; - prob_t lexical_prob; -}; +typedef GuBuf PgfLexiconIdx; struct PgfParseState { PgfParseState* next; @@ -94,12 +83,14 @@ struct PgfParseState { PgfItem* meta_item; PgfContsMap* conts_map; PgfGenCatMap* generated_cats; - unsigned short offset; + + bool needs_bind; + size_t start_offset; + size_t end_offset; prob_t viterbi_prob; - PgfParsing* ps; - PgfTokenState* ts; + PgfLexiconIdx* lexicon_idx; }; typedef struct PgfAnswers { @@ -115,13 +106,6 @@ typedef struct { size_t arg_idx; } PgfExprState; -typedef struct PgfParseResult PgfParseResult; - -struct PgfParseResult { - PgfParseState* state; - PgfExprEnum en; -}; - typedef struct PgfItemBase PgfItemBase; struct PgfItem { @@ -133,15 +117,12 @@ struct PgfItem { PgfProduction prod; PgfPArgs* args; PgfSymbol curr_sym; - uint16_t seq_idx; + uint16_t sym_idx; uint8_t alt_idx; // position in the pre alternative uint8_t alt; // the number of the alternative prob_t inside_prob; }; -GU_DEFINE_TYPE(PgfLeftcornerTokIdx, GuStringMap, - gu_ptr_type(PgfProductionIdx), &gu_null_struct); - static PgfSymbol pgf_prev_extern_sym(PgfSymbol sym) { @@ -151,10 +132,11 @@ pgf_prev_extern_sym(PgfSymbol sym) return *((PgfSymbol*) (((PgfSymbolCat*) i.data)+1)); case PGF_SYMBOL_KP: return *((PgfSymbol*) (((PgfSymbolKP*) i.data)+1)); - case PGF_SYMBOL_KS: + case PGF_SYMBOL_KS: { PgfSymbolKS* sks = (PgfSymbolKS*) i.data; size_t tok_len = strlen(sks->token); return *((PgfSymbol*) (((uint8_t*) sks)+sizeof(PgfSymbolKS)+tok_len+1)); + } case PGF_SYMBOL_LIT: return *((PgfSymbol*) (((PgfSymbolLit*) i.data)+1)); case PGF_SYMBOL_VAR: @@ -169,30 +151,53 @@ pgf_prev_extern_sym(PgfSymbol sym) } } -size_t -pgf_item_lin_idx(PgfItem* item) { - return item->conts->lin_idx; +void +pgf_add_extern_tok(PgfSymbol* psym, PgfToken tok, GuPool* pool) { + PgfSymbol new_sym; + size_t tok_len = strlen(tok); + PgfSymbolKS* sks = (PgfSymbolKS*) + gu_alloc_variant(PGF_SYMBOL_KS, + sizeof(PgfSymbol)+sizeof(PgfSymbolKS)+tok_len+1, + gu_alignof(PgfSymbolKS), + &new_sym, pool); + strcpy(sks->token, tok); + *((PgfSymbol*) (((uint8_t*) sks)+sizeof(PgfSymbolKS)+tok_len+1)) = *psym; + *psym = new_sym; } -int -pgf_item_sequence_length(PgfItem* item) +void +pgf_add_extern_cat(PgfSymbol* psym, int d, int r, GuPool* pool) { + PgfSymbol new_sym; + PgfSymbolCat* scat = (PgfSymbolCat*) + gu_alloc_variant(PGF_SYMBOL_CAT, + sizeof(PgfSymbolCat)+sizeof(PgfSymbol), + gu_alignof(PgfSymbolCat), + &new_sym, pool); + *((PgfSymbol*) (scat+1)) = *psym; + scat->d = d; + scat->r = r; + *psym = new_sym; +} + +static size_t +pgf_item_symbols_length(PgfItem* item) { GuVariantInfo i = gu_variant_open(item->prod); switch (i.tag) { case PGF_PRODUCTION_APPLY: { PgfProductionApply* papp = i.data; - return gu_seq_length(papp->fun->lins[item->conts->lin_idx]); + return gu_seq_length(papp->fun->lins[item->conts->lin_idx]->syms); } case PGF_PRODUCTION_COERCE: { return 1; } case PGF_PRODUCTION_EXTERN: { PgfProductionExtern* pext = i.data; - PgfSequence* seq; - + PgfSymbols* syms; + if (pext->lins != NULL && - (seq = gu_seq_get(pext->lins,PgfSequence*,item->conts->lin_idx)) != NULL) { - return gu_seq_length(seq); + (syms = gu_seq_get(pext->lins,PgfSymbols*,item->conts->lin_idx)) != NULL) { + return gu_seq_length(syms); } else { int seq_len = 0; PgfSymbol sym = item->curr_sym; @@ -220,33 +225,33 @@ pgf_item_sequence_length(PgfItem* item) } } -static PgfSequence* -pgf_extern_seq_get(PgfItem* item, GuPool* pool) +static PgfSymbols* +pgf_extern_syms_get(PgfItem* item, GuPool* pool) { - int seq_len = pgf_item_sequence_length(item); + int syms_len = pgf_item_symbols_length(item); - PgfSequence* seq = - gu_new_seq(PgfSymbol, seq_len, pool); + PgfSymbols* syms = + gu_new_seq(PgfSymbol, syms_len, pool); PgfSymbol sym = item->curr_sym; while (!gu_variant_is_null(sym)) { - gu_seq_set(seq, PgfSymbol, --seq_len, sym); + gu_seq_set(syms, PgfSymbol, --syms_len, sym); sym = pgf_prev_extern_sym(sym); } - return seq; + return syms; } -void -pgf_item_sequence(PgfItem* item, - size_t* lin_idx, PgfSequence** seq, - GuPool* pool) { +static void +pgf_item_symbols(PgfItem* item, + size_t* lin_idx, PgfSymbols** syms, + GuPool* pool) { *lin_idx = item->conts->lin_idx; GuVariantInfo i = gu_variant_open(item->prod); switch (i.tag) { case PGF_PRODUCTION_APPLY: { PgfProductionApply* papp = i.data; - *seq = papp->fun->lins[item->conts->lin_idx]; + *syms = papp->fun->lins[item->conts->lin_idx]->syms; break; } case PGF_PRODUCTION_COERCE: { @@ -254,21 +259,21 @@ pgf_item_sequence(PgfItem* item, gu_new_variant_i(pool, PGF_SYMBOL_CAT, PgfSymbolCat, .d = 0, .r = item->conts->lin_idx); - *seq = gu_new_seq(PgfSequence*, 1, pool); - gu_seq_set(*seq, PgfSymbol, 0, sym); + *syms = gu_new_seq(PgfSymbol, 1, pool); + gu_seq_set(*syms, PgfSymbol, 0, sym); break; } case PGF_PRODUCTION_EXTERN: { PgfProductionExtern* pext = i.data; if (pext->lins == NULL || - (*seq = gu_seq_get(pext->lins, PgfSequence*, item->conts->lin_idx)) == NULL) { - *seq = pgf_extern_seq_get(item, pool); + (*syms = gu_seq_get(pext->lins, PgfSymbols*, item->conts->lin_idx)) == NULL) { + *syms = pgf_extern_syms_get(item, pool); } break; } case PGF_PRODUCTION_META: { - *seq = pgf_extern_seq_get(item, pool); + *syms = pgf_extern_syms_get(item, pool); break; } default: @@ -285,7 +290,7 @@ pgf_print_production_args(PgfPArgs* args, for (size_t j = 0; j < n_args; j++) { if (j > 0) gu_putc(',',out,err); - + PgfPArg arg = gu_seq_get(args, PgfPArg, j); if (arg.hypos != NULL && @@ -351,31 +356,38 @@ pgf_print_item_seq(PgfItem *item, GuOut *out, GuExn* err, GuPool* pool) { size_t lin_idx; - PgfSequence seq; - pgf_item_sequence(item, &lin_idx, &seq, pool); + PgfSymbols* syms = NULL; + pgf_item_symbols(item, &lin_idx, &syms, pool); gu_printf(out, err, "%d : ",lin_idx); size_t index; - for (index = 0; index < gu_seq_length(seq); index++) { - if (item->seq_idx == index) + for (index = 0; index < gu_seq_length(syms); index++) { + if (item->sym_idx == index) gu_printf(out, err, " . "); - PgfSymbol *sym = gu_seq_index(seq, PgfSymbol, index); - pgf_print_symbol(*sym, out, err); + PgfSymbol sym = gu_seq_get(syms, PgfSymbol, index); + pgf_print_symbol(sym, out, err); } - if (item->seq_idx == index) + if (item->sym_idx == index) gu_printf(out, err, " ."); } +static void +pgf_print_range(PgfParseState* start, PgfParseState* end, GuOut* out, GuExn* err) +{ + gu_printf(out, err, "%d-%d", + (start != NULL) ? start->end_offset : 0, + (start == end) ? end->end_offset : end->start_offset); +} + static void pgf_print_item(PgfItem* item, PgfParseState* state, GuOut* out, GuExn* err, GuPool* pool) { - gu_printf(out, err, "[%d-%d; C%d -> ", - item->conts->state ? item->conts->state->offset : 0, - state ? state->offset : 0, - item->conts->ccat->fid); + gu_printf(out, err, "["); + pgf_print_range(item->conts->state, state, out, err); + gu_printf(out, err, "; C%d -> ", item->conts->ccat->fid); GuVariantInfo i = gu_variant_open(item->prod); switch (i.tag) { @@ -482,6 +494,45 @@ pgf_print_expr_state0(PgfExprState* st, #endif #endif +static int +cmp_string(GuString* psent, size_t* plen, GuString tok) +{ + GuString sent = *psent; + size_t len = *plen; + + while (*tok != 0) { + if (len == 0) + return -1; + + if (((uint8_t) *sent) > ((uint8_t) *tok)) + return 1; + else if (((uint8_t) *sent) < ((uint8_t) *tok)) + return -2; + + tok++; + sent++; + len--; + } + + *psent = sent; + *plen = len; + return 0; +} + +static bool +skip_space(GuString* psent, size_t* plen) +{ + if (*plen == 0) + return false; + + char c = **psent; + if (!gu_is_space(c)) + return false; + + (*psent)++; + return true; +} + static int cmp_item_prob(GuOrder* self, const void* a, const void* b) { @@ -500,32 +551,31 @@ cmp_item_prob(GuOrder* self, const void* a, const void* b) } static GuOrder -pgf_item_prob_order = { cmp_item_prob }; +pgf_item_prob_order[1] = { { cmp_item_prob } }; static PgfItemContss* -pgf_parsing_get_contss(PgfContsMap* conts_map, PgfCCat* cat, GuPool *pool) +pgf_parsing_get_contss(PgfParseState* state, PgfCCat* cat, GuPool *pool) { - PgfItemContss* contss = gu_map_get(conts_map, cat, PgfItemContss*); + PgfItemContss* contss = gu_map_get(state->conts_map, cat, PgfItemContss*); if (contss == NULL) { size_t n_lins = cat->cnccat->n_lins; contss = gu_new_seq(PgfItemConts*, n_lins, pool); for (size_t i = 0; i < n_lins; i++) { gu_seq_set(contss, PgfItemConts*, i, NULL); } - gu_map_put(conts_map, cat, PgfItemContss*, contss); + gu_map_put(state->conts_map, cat, PgfItemContss*, contss); } return contss; } static PgfItemConts* -pgf_parsing_get_conts(PgfContsMap* conts_map, - PgfCCat* ccat, size_t lin_idx, - PgfParseState* state, +pgf_parsing_get_conts(PgfParseState* state, + PgfCCat* ccat, size_t lin_idx, GuPool *pool) { gu_require(lin_idx < ccat->cnccat->n_lins); PgfItemContss* contss = - pgf_parsing_get_contss(conts_map, ccat, pool); + pgf_parsing_get_contss(state, ccat, pool); PgfItemConts* conts = gu_seq_get(contss, PgfItemConts*, lin_idx); if (!conts) { conts = gu_new(PgfItemConts, pool); @@ -555,13 +605,14 @@ gu_ccat_fini(GuFinalizer* fin) } static PgfCCat* -pgf_parsing_create_completed(PgfParseState* state, PgfItemConts* conts, +pgf_parsing_create_completed(PgfParsing* ps, PgfParseState* state, + PgfItemConts* conts, prob_t viterbi_prob) { - PgfCCat* cat = gu_new_flex(state->ps->pool, PgfCCat, fin, 1); + PgfCCat* cat = gu_new_flex(ps->pool, PgfCCat, fin, 1); cat->cnccat = conts->ccat->cnccat; cat->viterbi_prob = viterbi_prob; - cat->fid = state->ps->max_fid++; + cat->fid = ps->max_fid++; cat->conts = conts; cat->answers = NULL; cat->prods = NULL; @@ -569,7 +620,7 @@ pgf_parsing_create_completed(PgfParseState* state, PgfItemConts* conts, gu_map_put(state->generated_cats, conts, PgfCCat*, cat); cat->fin[0].fn = gu_ccat_fini; - gu_pool_finally(state->ps->pool, cat->fin); + gu_pool_finally(ps->pool, cat->fin); #ifdef PGF_COUNTS_DEBUG state->ps->ccat_full_count++; @@ -578,15 +629,6 @@ pgf_parsing_create_completed(PgfParseState* state, PgfItemConts* conts, return cat; } -static void -pgf_parsing_add_production(PgfCCat* ccat, PgfProduction prod) -{ - if (ccat->prods == NULL || ccat->n_synprods >= gu_seq_length(ccat->prods)) { - ccat->prods = gu_realloc_seq(ccat->prods, PgfProduction, ccat->n_synprods+1); - } - gu_seq_set(ccat->prods, PgfProduction, ccat->n_synprods++, prod); -} - static PgfCCat* pgf_parsing_get_completed(PgfParseState* state, PgfItemConts* conts) { @@ -602,18 +644,18 @@ pgf_item_set_curr_symbol(PgfItem* item, GuPool* pool) PgfProductionApply* papp = i.data; PgfCncFun* fun = papp->fun; gu_assert(item->conts->lin_idx < fun->n_lins); - PgfSequence* seq = fun->lins[item->conts->lin_idx]; - gu_assert(item->seq_idx <= gu_seq_length(seq)); - if (item->seq_idx == gu_seq_length(seq)) { + PgfSymbols* syms = fun->lins[item->conts->lin_idx]->syms; + gu_assert(item->sym_idx <= gu_seq_length(syms)); + if (item->sym_idx == gu_seq_length(syms)) { item->curr_sym = gu_null_variant; } else { - item->curr_sym = gu_seq_get(seq, PgfSymbol, item->seq_idx); + item->curr_sym = gu_seq_get(syms, PgfSymbol, item->sym_idx); } break; } case PGF_PRODUCTION_COERCE: { - gu_assert(item->seq_idx <= 1); - if (item->seq_idx == 1) { + gu_assert(item->sym_idx <= 1); + if (item->sym_idx == 1) { item->curr_sym = gu_null_variant; } else { item->curr_sym = gu_new_variant_i(pool, PGF_SYMBOL_CAT, @@ -634,12 +676,11 @@ pgf_item_set_curr_symbol(PgfItem* item, GuPool* pool) } static PgfItem* -pgf_new_item(PgfItemConts* conts, PgfProduction prod, - GuPool* pool, PgfParsing* ps) +pgf_new_item(PgfParsing* ps, PgfItemConts* conts, PgfProduction prod) { PgfItem* item; if (ps == NULL || ps->free_item == NULL) - item = gu_new(PgfItem, pool); + item = gu_new(PgfItem, ps->pool); else { item = ps->free_item; ps->free_item = ps->free_item->next; @@ -661,7 +702,7 @@ pgf_new_item(PgfItemConts* conts, PgfProduction prod, } case PGF_PRODUCTION_COERCE: { PgfProductionCoerce* pcoerce = pi.data; - item->args = gu_new_seq(PgfPArg, 1, pool); + item->args = gu_new_seq(PgfPArg, 1, ps->pool); PgfPArg* parg = gu_seq_index(item->args, PgfPArg, 0); parg->hypos = NULL; parg->ccat = pcoerce->coerce; @@ -671,13 +712,7 @@ pgf_new_item(PgfItemConts* conts, PgfProduction prod, case PGF_PRODUCTION_EXTERN: { PgfProductionExtern* pext = pi.data; item->args = gu_empty_seq(); - item->inside_prob = pext->ep ? pext->ep->prob : 0; - - int n_args = gu_seq_length(item->args); - for (int i = 0; i < n_args; i++) { - PgfPArg *arg = gu_seq_index(item->args, PgfPArg, i); - item->inside_prob += arg->ccat->viterbi_prob; - } + item->inside_prob = pext->ep->prob; break; } case PGF_PRODUCTION_META: { @@ -698,13 +733,13 @@ pgf_new_item(PgfItemConts* conts, PgfProduction prod, item->conts = conts; item->prod = prod; item->curr_sym = gu_null_variant; - item->seq_idx = 0; + item->sym_idx = 0; item->alt_idx = 0; item->alt = 0; conts->ref_count++; - pgf_item_set_curr_symbol(item, pool); + pgf_item_set_curr_symbol(item, ps->pool); #ifdef PGF_COUNTS_DEBUG if (ps != NULL) { @@ -717,11 +752,11 @@ pgf_new_item(PgfItemConts* conts, PgfProduction prod, } static PgfItem* -pgf_item_copy(PgfItem* item, GuPool* pool, PgfParsing* ps) +pgf_item_copy(PgfItem* item, PgfParsing* ps) { PgfItem* copy; if (ps == NULL || ps->free_item == NULL) - copy = gu_new(PgfItem, pool); + copy = gu_new(PgfItem, ps->pool); else { copy = ps->free_item; ps->free_item = ps->free_item->next; @@ -742,14 +777,14 @@ pgf_item_copy(PgfItem* item, GuPool* pool, PgfParsing* ps) static PgfItem* pgf_item_update_arg(PgfItem* item, size_t d, PgfCCat *new_ccat, - GuPool* pool, PgfParsing *ps) + PgfParsing *ps) { PgfCCat *old_ccat = gu_seq_index(item->args, PgfPArg, d)->ccat; - PgfItem* new_item = pgf_item_copy(item, pool, ps); + PgfItem* new_item = pgf_item_copy(item, ps); size_t nargs = gu_seq_length(item->args); - new_item->args = gu_new_seq(PgfPArg, nargs, pool); + new_item->args = gu_new_seq(PgfPArg, nargs, ps->pool); memcpy(gu_seq_data(new_item->args), gu_seq_data(item->args), nargs * sizeof(PgfPArg)); gu_seq_set(new_item->args, PgfPArg, d, @@ -763,8 +798,9 @@ pgf_item_update_arg(PgfItem* item, size_t d, PgfCCat *new_ccat, static void pgf_item_advance(PgfItem* item, GuPool* pool) { + if (GU_LIKELY(item->alt == 0)) { - item->seq_idx++; + item->sym_idx++; pgf_item_set_curr_symbol(item, pool); } else @@ -772,8 +808,7 @@ pgf_item_advance(PgfItem* item, GuPool* pool) } static void -pgf_item_free(PgfParseState* before, PgfParseState* after, - PgfItem* item) +pgf_item_free(PgfParsing* ps, PgfItem* item) { GuVariantInfo i = gu_variant_open(item->prod); switch (i.tag) { @@ -797,36 +832,20 @@ pgf_item_free(PgfParseState* before, PgfParseState* after, if (cont == NULL) continue; - pgf_item_free(before, after, cont); + pgf_item_free(ps, cont); } } #ifdef PGF_PARSER_DEBUG memset(item, 0, sizeof(*item)); #endif - item->next = before->ps->free_item; - before->ps->free_item = item; + item->next = ps->free_item; + ps->free_item = item; #ifdef PGF_COUNTS_DEBUG - before->ps->item_real_count--; + ps->item_real_count--; #endif } -static void -pgf_parsing_add_transition(PgfParseState* before, PgfParseState* after, - PgfToken tok, PgfItem* item) -{ - if (after->ts->fn->match_token(after->ts, tok, item)) { - if (after->next == NULL) { - after->viterbi_prob = - item->inside_prob+item->conts->outside_prob; - } - - gu_buf_heap_push(after->agenda, &pgf_item_prob_order, &item); - } else { - pgf_item_free(before, after, item); - } -} - static void pgf_result_predict(PgfParsing* ps, PgfExprState* cont, PgfCCat* ccat); @@ -836,12 +855,13 @@ pgf_result_production(PgfParsing* ps, PgfAnswers* answers, PgfProduction prod); static void -pgf_parsing_combine(PgfParseState* before, PgfParseState* after, +pgf_parsing_combine(PgfParsing* ps, + PgfParseState* before, PgfParseState* after, PgfItem* cont, PgfCCat* cat, int lin_idx) { if (cont == NULL) { - if (after == NULL) { - pgf_result_predict(before->ps, NULL, cat); + if (before->end_offset == strlen(ps->sentence)) { + pgf_result_predict(ps, NULL, cat); } return; } @@ -850,29 +870,29 @@ pgf_parsing_combine(PgfParseState* before, PgfParseState* after, switch (gu_variant_tag(cont->curr_sym)) { case PGF_SYMBOL_CAT: { PgfSymbolCat* scat = gu_variant_data(cont->curr_sym); - item = pgf_item_update_arg(cont, scat->d, cat, before->ps->pool, before->ps); + item = pgf_item_update_arg(cont, scat->d, cat, ps); break; } case PGF_SYMBOL_LIT: { PgfSymbolLit* slit = gu_variant_data(cont->curr_sym); - item = pgf_item_update_arg(cont, slit->d, cat, before->ps->pool, before->ps); + item = pgf_item_update_arg(cont, slit->d, cat, ps); break; } default: gu_impossible(); } - pgf_item_advance(item, before->ps->pool); - gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item); + pgf_item_advance(item, ps->pool); + gu_buf_heap_push(before->agenda, pgf_item_prob_order, &item); } static void -pgf_parsing_production(PgfParseState* state, +pgf_parsing_production(PgfParsing* ps, PgfParseState* state, PgfItemConts* conts, PgfProduction prod) { PgfItem* item = - pgf_new_item(conts, prod, state->ps->pool, state->ps); - gu_buf_heap_push(state->agenda, &pgf_item_prob_order, &item); + pgf_new_item(ps, conts, prod); + gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item); } static PgfProduction @@ -904,9 +924,9 @@ pgf_parsing_new_production(PgfItem* item, PgfExprProb *ep, GuPool *pool) PgfProductionExtern* pext = i.data; if (pext->lins == NULL || - gu_seq_get(pext->lins,PgfSequence*,item->conts->lin_idx) == NULL) { - PgfSequence* seq = - pgf_extern_seq_get(item, pool); + gu_seq_get(pext->lins,PgfSymbols*,item->conts->lin_idx) == NULL) { + PgfSymbols* syms = + pgf_extern_syms_get(item, pool); size_t n_lins = item->conts->ccat->cnccat->n_lins; @@ -914,22 +934,20 @@ pgf_parsing_new_production(PgfItem* item, PgfExprProb *ep, GuPool *pool) gu_new_variant(PGF_PRODUCTION_EXTERN, PgfProductionExtern, &prod, pool); - new_pext->callback = pext->callback; new_pext->ep = ep; - new_pext->lins = gu_new_seq(PgfSequence*, n_lins, pool); + new_pext->lins = gu_new_seq(PgfSymbols*, n_lins, pool); if (pext->lins == NULL) { for (size_t i = 0; i < n_lins; i++) { - gu_seq_set(new_pext->lins,PgfSequence*,i, - NULL); + gu_seq_set(new_pext->lins,PgfSymbols*,i,NULL); } } else { for (size_t i = 0; i < n_lins; i++) { - gu_seq_set(new_pext->lins,PgfSequence*,i, - gu_seq_get(pext->lins,PgfSequence*,i)); + gu_seq_set(new_pext->lins,PgfSymbols*,i, + gu_seq_get(pext->lins,PgfSymbols*,i)); } } - gu_seq_set(new_pext->lins,PgfSequence*,item->conts->lin_idx,seq); + gu_seq_set(new_pext->lins,PgfSymbols*,item->conts->lin_idx,syms); } else { prod = item->prod; } @@ -952,42 +970,41 @@ pgf_parsing_new_production(PgfItem* item, PgfExprProb *ep, GuPool *pool) } static void -pgf_parsing_complete(PgfParseState* before, PgfParseState* after, - PgfItem* item, PgfExprProb *ep) +pgf_parsing_add_production(PgfParsing* ps, + PgfParseState* before, PgfParseState* after, + PgfItemConts* conts, PgfProduction prod, + prob_t inside_prob) { - PgfProduction prod = - pgf_parsing_new_production(item, ep, before->ps->pool); -#ifdef PGF_COUNTS_DEBUG - before->ps->prod_full_count++; -#endif - - PgfCCat* tmp_cat = pgf_parsing_get_completed(before, item->conts); - PgfCCat* cat = tmp_cat; - if (cat == NULL) { - cat = pgf_parsing_create_completed(before, item->conts, - item->inside_prob); + PgfCCat* tmp_ccat = pgf_parsing_get_completed(before, conts); + PgfCCat* ccat = tmp_ccat; + if (ccat == NULL) { + ccat = pgf_parsing_create_completed(ps, before, conts, inside_prob); } - pgf_parsing_add_production(cat, prod); + if (ccat->prods == NULL || ccat->n_synprods >= gu_seq_length(ccat->prods)) { + ccat->prods = gu_realloc_seq(ccat->prods, PgfProduction, ccat->n_synprods+1); + } + gu_seq_set(ccat->prods, PgfProduction, ccat->n_synprods++, prod); #ifdef PGF_PARSER_DEBUG GuPool* tmp_pool = gu_new_pool(); GuOut* out = gu_file_out(stderr, tmp_pool); GuExn* err = gu_exn(NULL, type, tmp_pool); - if (tmp_cat == NULL) - gu_printf(out, err, "[%d-%d; C%d; %d; C%d]\n", - item->conts->state ? item->conts->state->offset : 0, - before->offset, - item->conts->ccat->fid, - item->conts->lin_idx, - cat->fid); - pgf_print_production(cat->fid, prod, out, err, tmp_pool); + if (tmp_ccat == NULL) { + gu_printf(out, err, "["); + pgf_print_range(conts->state, before, out, err); + gu_printf(out, err, "; C%d; %d; C%d]\n", + conts->ccat->fid, + conts->lin_idx, + ccat->fid); + } + pgf_print_production(ccat->fid, prod, out, err, tmp_pool); gu_pool_free(tmp_pool); #endif - if (tmp_cat != NULL) { + if (tmp_ccat != NULL) { PgfItemContss* contss = - pgf_parsing_get_contss(before->conts_map, cat, before->ps->pool); + pgf_parsing_get_contss(before, ccat, ps->pool); size_t n_contss = gu_seq_length(contss); for (size_t i = 0; i < n_contss; i++) { PgfItemConts* conts2 = gu_seq_get(contss, PgfItemConts*, i); @@ -997,7 +1014,7 @@ pgf_parsing_complete(PgfParseState* before, PgfParseState* after, * production immediately to the agenda, * i.e. process it. */ if (conts2) { - pgf_parsing_production(before, conts2, prod); + pgf_parsing_production(ps, before, conts2, prod); } } @@ -1006,7 +1023,7 @@ pgf_parsing_complete(PgfParseState* before, PgfParseState* after, PgfParseState* state = after; while (state != NULL) { PgfItemContss* contss = - pgf_parsing_get_contss(state->conts_map, cat, state->ps->pool); + pgf_parsing_get_contss(state, ccat, ps->pool); size_t n_contss = gu_seq_length(contss); for (size_t i = 0; i < n_contss; i++) { PgfItemConts* conts2 = gu_seq_get(contss, PgfItemConts*, i); @@ -1016,33 +1033,249 @@ pgf_parsing_complete(PgfParseState* before, PgfParseState* after, * production immediately to the agenda, * i.e. process it. */ if (conts2) { - pgf_parsing_production(state, conts2, prod); + pgf_parsing_production(ps, state, conts2, prod); } } state = state->next; } - if (cat->answers != NULL) { - pgf_result_production(before->ps, cat->answers, prod); + if (ccat->answers != NULL) { + pgf_result_production(ps, ccat->answers, prod); } } else { - size_t n_conts = gu_buf_length(item->conts->items); + size_t n_conts = gu_buf_length(conts->items); for (size_t i = 0; i < n_conts; i++) { - PgfItem* cont = gu_buf_get(item->conts->items, PgfItem*, i); - pgf_parsing_combine(before, after, cont, cat, item->conts->lin_idx); + PgfItem* cont = gu_buf_get(conts->items, PgfItem*, i); + pgf_parsing_combine(ps, before, after, cont, ccat, conts->lin_idx); } } } static void -pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after, +pgf_parsing_complete(PgfParsing* ps, PgfItem* item, PgfExprProb *ep) +{ + PgfProduction prod = + pgf_parsing_new_production(item, ep, ps->pool); +#ifdef PGF_COUNTS_DEBUG + before->ps->prod_full_count++; +#endif + + pgf_parsing_add_production(ps, ps->before, ps->after, item->conts, prod, item->inside_prob); +} + +static int +pgf_symbols_cmp(GuString* psent, size_t sent_len, BIND_TYPE* pbind, PgfSymbols* syms) +{ + GuString sent = *psent; + + size_t n_syms = gu_seq_length(syms); + for (size_t i = 0; i < n_syms; i++) { + PgfSymbol sym = gu_seq_get(syms, PgfSymbol, i); + + GuVariantInfo inf = gu_variant_open(sym); + switch (inf.tag) { + case PGF_SYMBOL_CAT: + case PGF_SYMBOL_LIT: + case PGF_SYMBOL_VAR: { + if (sent_len == 0) + return -1; + return 1; + } + case PGF_SYMBOL_KS: { + PgfSymbolKS* pks = inf.data; + if (sent_len == 0) + return -1; + + if (*pbind == BIND_HARD) + *pbind = BIND_NONE; + else { + if (*pbind != BIND_SOFT && !skip_space(&sent, &sent_len)) + return 1; + + while (*sent != 0) { + if (!skip_space(&sent, &sent_len)) + break; + } + } + + int cmp = cmp_string(&sent, &sent_len, pks->token); + if (cmp != 0) + return cmp; + break; + } + case PGF_SYMBOL_KP: { + return -2; + } + case PGF_SYMBOL_BIND: { + *pbind = BIND_HARD; + break; + } + case PGF_SYMBOL_NE: { + return -2; + } + default: + gu_impossible(); + } + } + + *psent = sent; + return 0; +} + +static void +pgf_parsing_lookahead(PgfParsing *ps, PgfParseState* state) +{ + PgfSequence* epsilon_seq = + gu_seq_index(ps->concr->sequences, PgfSequence, 0); + if (gu_seq_length(epsilon_seq->syms) == 0) { + // Since the sequences are sorted, the epsilon sequence will + // always be the first if there is any at all. We should + // always add the epsilon in the index, because we do + // bottom up prediction for epsilons. + PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx); + entry->idx = epsilon_seq->idx; + entry->bind_type = BIND_NONE; + entry->offset = state->start_offset; + } + + size_t i = 0; + size_t j = gu_seq_length(ps->concr->sequences)-1; + size_t s = j; + size_t n = 1; + size_t sent_len = strlen(ps->sentence); + + while (state->end_offset + n <= sent_len) { + while (i <= j) { + size_t k = (i+j) / 2; + PgfSequence* seq = gu_seq_index(ps->concr->sequences, PgfSequence, k); + + GuString current = ps->sentence + state->end_offset; + BIND_TYPE bind_type = state->needs_bind ? BIND_NONE : BIND_HARD; + switch (pgf_symbols_cmp(¤t, n, &bind_type, seq->syms)) { + case -2: + j = k-1; + s = j; + break; + case -1: + j = k-1; + break; + case 0: { + if (seq->idx != NULL) { + PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx); + entry->idx = seq->idx; + entry->bind_type = bind_type; + entry->offset = (current - ps->sentence); + } + i = k+1; + goto next; + } + case 1: + i = k+1; + break; + } + } + +next: + j = s; + n++; + } +} + +static PgfParseState* +pgf_new_parse_state(PgfParsing* ps, size_t start_offset, BIND_TYPE bind_type) +{ + PgfParseState** pstate; + if (ps->before == NULL && start_offset == 0) + pstate = &ps->before; + else { + if (bind_type != BIND_NONE) { + if (ps->before->start_offset == start_offset && + ps->before->end_offset == start_offset && + !ps->before->needs_bind) + return ps->before; + } else { + if (ps->before->start_offset == start_offset) + return ps->before; + } + + pstate = &ps->after; + while (*pstate != NULL) { + if (bind_type != BIND_NONE) { + if (ps->before->start_offset == start_offset && + ps->before->end_offset == start_offset && + !ps->before->needs_bind) + return ps->before; + } else { + if ((*pstate)->start_offset == start_offset) + return *pstate; + } + if ((*pstate)->start_offset > start_offset) + break; + pstate = &(*pstate)->next; + } + } + + size_t end_offset = start_offset; + GuString current = ps->sentence + start_offset; + size_t len = strlen(current); + while (skip_space(¤t, &len)) { + end_offset++; + } + + if (bind_type == BIND_HARD && start_offset != end_offset) + return NULL; + + PgfParseState* state = gu_new(PgfParseState, ps->pool); + state->next = *pstate; + state->agenda = gu_new_buf(PgfItem*, ps->pool); + state->meta_item = NULL; + state->generated_cats = gu_map_type_new(PgfGenCatMap, ps->pool); + state->conts_map = gu_map_type_new(PgfContsMap, ps->pool); + state->needs_bind = (bind_type == BIND_NONE) && + (start_offset == end_offset); + state->start_offset = start_offset; + state->end_offset = end_offset; + state->viterbi_prob = 0; + state->lexicon_idx = + gu_new_buf(PgfLexiconIdxEntry, ps->pool); + + if (ps->before == NULL && start_offset == 0) + state->needs_bind = false; + + pgf_parsing_lookahead(ps, state); + + *pstate = state; + + return state; +} + +static void +pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item) +{ + GuString current = ps->sentence + ps->before->end_offset; + size_t len = strlen(current); + + if (!ps->before->needs_bind && cmp_string(¤t, &len, tok) == 0) { + PgfParseState* state = + pgf_new_parse_state(ps, (current - ps->sentence), BIND_NONE); + if (state->next == NULL) { + state->viterbi_prob = + item->inside_prob+item->conts->outside_prob; + } + + gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item); + } else { + pgf_item_free(ps, item); + } +} + +static void +pgf_parsing_td_predict(PgfParsing* ps, PgfItem* item, PgfCCat* ccat, size_t lin_idx) { PgfItemConts* conts = - pgf_parsing_get_conts(before->conts_map, - ccat, lin_idx, before, - before->ps->pool); + pgf_parsing_get_conts(ps->before, ccat, lin_idx, ps->pool); gu_buf_push(conts->items, PgfItem*, item); if (gu_buf_length(conts->items) == 1) { /* First time we encounter this linearization @@ -1053,71 +1286,52 @@ pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after, item->inside_prob-conts->ccat->viterbi_prob+ item->conts->outside_prob; - size_t n_prods = ccat->n_synprods; - PgfProductionIdx* lexicon_idx = NULL; - if (after != NULL) { - lexicon_idx = after->ts->fn->get_lexicon_idx(after->ts); - - // we don't know the current token. - // probably we just compute the list of completions - if (lexicon_idx == NULL && ccat->fid < after->ps->concr->total_cats) - n_prods = gu_seq_length(ccat->prods); - } - // Top-down prediction for syntactic rules - for (size_t i = 0; i < n_prods; i++) { + for (size_t i = 0; i < ccat->n_synprods; i++) { PgfProduction prod = gu_seq_get(ccat->prods, PgfProduction, i); - pgf_parsing_production(before, conts, prod); + pgf_parsing_production(ps, ps->before, conts, prod); } - // Bottom-up prediction for lexical rules - - if (lexicon_idx != NULL) { - PgfCFCat cfc = {ccat, lin_idx}; - PgfProductionBuf* tok_prods = - gu_map_get(lexicon_idx, &cfc, PgfProductionBuf*); + // Bottom-up prediction for lexical and epsilon rules + size_t n_idcs = gu_buf_length(ps->before->lexicon_idx); + for (size_t i = 0; i < n_idcs; i++) { + PgfLexiconIdxEntry* lentry = + gu_buf_index(ps->before->lexicon_idx, PgfLexiconIdxEntry, i); + PgfParseState* state = + pgf_new_parse_state(ps, lentry->offset, lentry->bind_type); - if (tok_prods != NULL) { - size_t n_prods = gu_buf_length(tok_prods); - for (size_t i = 0; i < n_prods; i++) { - PgfProduction prod = - gu_buf_get(tok_prods, PgfProduction, i); - - pgf_parsing_production(before, conts, prod); + if (state != NULL) { + size_t n_entries = gu_buf_length(lentry->idx); + for (size_t j = 0; j < n_entries; j++) { + PgfProductionIdxEntry* pentry = + gu_buf_index(lentry->idx, PgfProductionIdxEntry, j); + + if (pentry->ccat == ccat && pentry->lin_idx == lin_idx) { + GuVariantInfo i = { PGF_PRODUCTION_APPLY, pentry->papp }; + PgfProduction prod = gu_variant_close(i); + pgf_parsing_add_production(ps, state, state->next, + conts, prod, + pentry->papp->fun->absfun->ep.prob); + } } } } - - // Bottom-up prediction for epsilon rules - PgfCFCat cfc = {ccat, lin_idx}; - PgfProductionBuf* eps_prods = - gu_map_get(before->ps->concr->epsilon_idx, &cfc, PgfProductionBuf*); - - if (eps_prods != NULL) { - size_t n_prods = gu_buf_length(eps_prods); - for (size_t i = 0; i < n_prods; i++) { - PgfProduction prod = - gu_buf_get(eps_prods, PgfProduction, i); - - pgf_parsing_production(before, conts, prod); - } - } } else { /* If it has already been completed, combine. */ PgfCCat* completed = - pgf_parsing_get_completed(before, conts); + pgf_parsing_get_completed(ps->before, conts); if (completed) { - pgf_parsing_combine(before, after, item, completed, lin_idx); + pgf_parsing_combine(ps, ps->before, ps->after, item, completed, lin_idx); } - PgfParseState* state = after; + PgfParseState* state = ps->after; while (state != NULL) { PgfCCat* completed = pgf_parsing_get_completed(state, conts); if (completed) { - pgf_parsing_combine(state, state->next, item, completed, lin_idx); + pgf_parsing_combine(ps, state, state->next, item, completed, lin_idx); } state = state->next; @@ -1126,32 +1340,24 @@ pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after, } static void -pgf_parsing_meta_scan(PgfParseState* before, PgfParseState* after, +pgf_parsing_meta_scan(PgfParsing* ps, PgfItem* meta_item, prob_t meta_prob) { - PgfToken tok = after->ts->fn->get_token(after->ts); +/* PgfToken tok = after->ts->fn->get_token(after->ts); if (*tok == 0) { - PgfItem* item = pgf_item_copy(meta_item, before->ps->pool, before->ps); + PgfItem* item = pgf_item_copy(meta_item, before->ps); item->inside_prob += meta_prob; - PgfSymbol prev = item->curr_sym; - size_t tok_len = strlen(tok); - PgfSymbolKS* sks = (PgfSymbolKS*) - gu_alloc_variant(PGF_SYMBOL_KS, - sizeof(PgfSymbolKS)+tok_len+1+sizeof(PgfSymbol), - gu_alignof(PgfSymbolKS), - &item->curr_sym, after->ps->pool); - strcpy((char*) sks->token, (char*) tok); - *((PgfSymbol*) (((uint8_t*) sks)+sizeof(PgfSymbolKS)+tok_len+1)) = prev; + pgf_add_extern_tok(item, tok, ps->pool); - gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item); - } + gu_buf_heap_push(before->agenda, pgf_item_prob_order, &item); + }*/ } typedef struct { GuMapItor fn; - PgfParseState* state; + PgfParsing* ps; PgfItem* meta_item; } PgfMetaPredictFn; @@ -1163,11 +1369,11 @@ pgf_parsing_meta_predict(GuMapItor* fn, const void* key, void* value, GuExn* err PgfAbsCat* abscat = (PgfAbsCat*) key; prob_t meta_prob = *((prob_t*) value); PgfMetaPredictFn* clo = (PgfMetaPredictFn*) fn; - PgfParseState* state = clo->state; + PgfParsing* ps = clo->ps; PgfItem* meta_item = clo->meta_item; PgfCncCat* cnccat = - gu_map_get(state->ps->concr->cnccats, abscat->name, PgfCncCat*); + gu_map_get(ps->concr->cnccats, abscat->name, PgfCncCat*); if (cnccat == NULL) return; @@ -1181,35 +1387,27 @@ pgf_parsing_meta_predict(GuMapItor* fn, const void* key, void* value, GuExn* err for (size_t lin_idx = 0; lin_idx < cnccat->n_lins; lin_idx++) { PgfItem* item = - pgf_item_copy(meta_item, state->ps->pool, state->ps); + pgf_item_copy(meta_item, ps); item->inside_prob += ccat->viterbi_prob+meta_prob; size_t nargs = gu_seq_length(meta_item->args); - item->args = gu_new_seq(PgfPArg, nargs+1, state->ps->pool); + item->args = gu_new_seq(PgfPArg, nargs+1, ps->pool); memcpy(gu_seq_data(item->args), gu_seq_data(meta_item->args), nargs * sizeof(PgfPArg)); gu_seq_set(item->args, PgfPArg, nargs, ((PgfPArg) { .hypos = NULL, .ccat = ccat })); - PgfSymbol prev = item->curr_sym; - PgfSymbolCat* scat = (PgfSymbolCat*) - gu_alloc_variant(PGF_SYMBOL_CAT, - sizeof(PgfSymbolCat)+sizeof(PgfSymbol), - gu_alignof(PgfSymbolCat), - &item->curr_sym, state->ps->pool); - *((PgfSymbol*) (scat+1)) = prev; - scat->d = nargs; - scat->r = lin_idx; + pgf_add_extern_cat(&item->curr_sym, nargs, lin_idx, ps->pool); - gu_buf_heap_push(state->agenda, &pgf_item_prob_order, &item); + gu_buf_heap_push(ps->before->agenda, pgf_item_prob_order, &item); } } } static void -pgf_parsing_symbol(PgfParseState* before, PgfParseState* after, - PgfItem* item, PgfSymbol sym) { +pgf_parsing_symbol(PgfParsing* ps, PgfItem* item, PgfSymbol sym) +{ switch (gu_variant_tag(sym)) { case PGF_SYMBOL_CAT: { PgfSymbolCat* scat = gu_variant_data(sym); @@ -1217,124 +1415,136 @@ pgf_parsing_symbol(PgfParseState* before, PgfParseState* after, if (parg->ccat->prods == NULL) { // empty category - pgf_item_free(before, after, item); + pgf_item_free(ps, item); return; } - pgf_parsing_td_predict(before, after, item, parg->ccat, scat->r); + pgf_parsing_td_predict(ps, item, parg->ccat, scat->r); break; } case PGF_SYMBOL_KS: { - if (after != NULL) { - PgfSymbolKS* sks = gu_variant_data(sym); - pgf_item_advance(item, after->ps->pool); - pgf_parsing_add_transition(before, after, sks->token, item); - } + PgfSymbolKS* sks = gu_variant_data(sym); + pgf_item_advance(item, ps->pool); + pgf_parsing_add_transition(ps, sks->token, item); break; } case PGF_SYMBOL_KP: { - if (after != NULL) { + if (ps->after != NULL) { PgfSymbolKP* skp = gu_variant_data(sym); PgfSymbol sym; if (item->alt == 0) { PgfItem* new_item; - new_item = pgf_item_copy(item, after->ps->pool, after->ps); + new_item = pgf_item_copy(item, ps); new_item->alt = 1; new_item->alt_idx = 0; sym = gu_seq_get(skp->default_form, PgfSymbol, new_item->alt_idx); - pgf_parsing_symbol(before, after, new_item, sym); + pgf_parsing_symbol(ps, new_item, sym); for (size_t i = 0; i < skp->n_forms; i++) { - PgfSequence* syms = skp->forms[i].form; - PgfSequence* syms2 = skp->default_form; + PgfSymbols* syms = skp->forms[i].form; + PgfSymbols* syms2 = skp->default_form; bool skip = false; /*pgf_tokens_equal(toks, toks2); for (size_t j = 0; j < i; j++) { PgfTokens* toks2 = skp->forms[j].form; skip |= pgf_tokens_equal(toks, toks2); }*/ if (!skip) { - new_item = pgf_item_copy(item, after->ps->pool, after->ps); + new_item = pgf_item_copy(item, ps); new_item->alt = i+2; new_item->alt_idx = 0; sym = gu_seq_get(syms, PgfSymbol, new_item->alt_idx); - pgf_parsing_symbol(before, after, new_item, sym); + pgf_parsing_symbol(ps, new_item, sym); } } } else { - PgfSequence* syms = + PgfSymbols* syms = (item->alt == 1) ? skp->default_form : skp->forms[item->alt-2].form; if (item->alt_idx < gu_seq_length(syms)) { sym = gu_seq_get(syms, PgfSymbol, item->alt_idx); - pgf_parsing_symbol(before, after, item, sym); + pgf_parsing_symbol(ps, item, sym); } else { item->alt = 0; - pgf_item_advance(item, after->ps->pool); - gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item); + pgf_item_advance(item, ps->pool); + gu_buf_heap_push(ps->before->agenda, pgf_item_prob_order, &item); } } } break; } case PGF_SYMBOL_LIT: { - if (after != NULL) { - PgfSymbolLit* slit = gu_variant_data(sym); - PgfPArg* parg = gu_seq_index(item->args, PgfPArg, slit->d); + PgfSymbolLit* slit = gu_variant_data(sym); + PgfPArg* parg = gu_seq_index(item->args, PgfPArg, slit->d); - if (parg->ccat->fid > 0 && - parg->ccat->fid >= before->ps->concr->total_cats) { - pgf_parsing_td_predict(before, after, item, parg->ccat, slit->r); - } - else { - PgfItemConts* conts = - pgf_parsing_get_conts(before->conts_map, - parg->ccat, slit->r, before, - before->ps->pool); - gu_buf_push(conts->items, PgfItem*, item); + if (parg->ccat->fid >= ps->concr->total_cats) { + pgf_parsing_td_predict(ps, item, parg->ccat, slit->r); + } + else { + PgfItemConts* conts = + pgf_parsing_get_conts(ps->before, + parg->ccat, slit->r, + ps->pool); + gu_buf_push(conts->items, PgfItem*, item); - if (gu_buf_length(conts->items) == 1) { - /* This is the first time when we encounter this - * literal category so we must call the callback */ + if (gu_buf_length(conts->items) == 1) { + /* This is the first time when we encounter this + * literal category so we must call the callback */ - PgfLiteralCallback* callback = - gu_map_get(before->ps->concr->callbacks, - parg->ccat->cnccat, - PgfLiteralCallback*); + PgfLiteralCallback* callback = + gu_map_get(ps->concr->callbacks, + parg->ccat->cnccat, + PgfLiteralCallback*); - if (callback != NULL) { + if (callback != NULL) { + size_t offset = ps->before->end_offset; + PgfSymbol curr_sym = gu_null_variant; + PgfExprProb *ep = + callback->match(ps->concr, &curr_sym, + item->conts->lin_idx, + ps->sentence, &offset, + ps->pool, ps->out_pool); + + if (ep != NULL) { PgfProduction prod; PgfProductionExtern* pext = gu_new_variant(PGF_PRODUCTION_EXTERN, - PgfProductionExtern, - &prod, before->ps->pool); - pext->callback = callback; - pext->ep = NULL; + PgfProductionExtern, + &prod, ps->pool); + pext->ep = ep; pext->lins = NULL; - pgf_parsing_production(before, conts, prod); + PgfParseState* state = + pgf_new_parse_state(ps, offset, BIND_NONE); + PgfItem* item = + pgf_new_item(ps, conts, prod); + item->curr_sym = curr_sym; + item->sym_idx = pgf_item_symbols_length(item); + gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item); } } else { - /* If it has already been completed, combine. */ + pgf_item_free(ps, item); + } + } else { + /* If it has already been completed, combine. */ - PgfCCat* completed = - pgf_parsing_get_completed(before, conts); - if (completed) { - pgf_parsing_combine(before, after, item, completed, slit->r); - } + PgfCCat* completed = + pgf_parsing_get_completed(ps->before, conts); + if (completed) { + pgf_parsing_combine(ps, ps->before, ps->after, item, completed, slit->r); + } - PgfParseState* state = after; - while (state != NULL) { - PgfCCat* completed = - pgf_parsing_get_completed(state, conts); - if (completed) { - pgf_parsing_combine(state, state->next, item, completed, slit->r); - } - - state = state->next; + PgfParseState* state = ps->after; + while (state != NULL) { + PgfCCat* completed = + pgf_parsing_get_completed(state, conts); + if (completed) { + pgf_parsing_combine(ps, state, state->next, item, completed, slit->r); } + + state = state->next; } } } @@ -1344,11 +1554,29 @@ pgf_parsing_symbol(PgfParseState* before, PgfParseState* after, // XXX TODO proper support break; case PGF_SYMBOL_NE: { - pgf_item_free(before, after, item); + // Nothing can match with a non-existant token + pgf_item_free(ps, item); break; } case PGF_SYMBOL_BIND: { - pgf_item_free(before, after, item); + if (ps->before->start_offset == ps->before->end_offset && + ps->before->needs_bind) { + PgfParseState* state = + pgf_new_parse_state(ps, ps->before->end_offset, BIND_HARD); + if (state != NULL) { + if (state->next == NULL) { + state->viterbi_prob = + item->inside_prob+item->conts->outside_prob; + } + + pgf_item_advance(item, ps->pool); + gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item); + } else { + pgf_item_free(ps, item); + } + } else { + pgf_item_free(ps, item); + } break; } default: @@ -1357,47 +1585,47 @@ pgf_parsing_symbol(PgfParseState* before, PgfParseState* after, } static void -pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item) +pgf_parsing_item(PgfParsing* ps, PgfItem* item) { #ifdef PGF_PARSER_DEBUG GuPool* tmp_pool = gu_new_pool(); GuOut* out = gu_file_out(stderr, tmp_pool); GuExn* err = gu_exn(NULL, type, tmp_pool); - pgf_print_item(item, before, out, err, tmp_pool); + pgf_print_item(item, ps->before, out, err, tmp_pool); gu_pool_free(tmp_pool); #endif - + GuVariantInfo i = gu_variant_open(item->prod); switch (i.tag) { case PGF_PRODUCTION_APPLY: { PgfProductionApply* papp = i.data; PgfCncFun* fun = papp->fun; - PgfSequence* seq = fun->lins[item->conts->lin_idx]; - if (item->seq_idx == gu_seq_length(seq)) { - pgf_parsing_complete(before, after, item, NULL); - pgf_item_free(before, after, item); + PgfSymbols* syms = fun->lins[item->conts->lin_idx]->syms; + if (item->sym_idx == gu_seq_length(syms)) { + pgf_parsing_complete(ps, item, NULL); + pgf_item_free(ps, item); } else { - pgf_parsing_symbol(before, after, item, item->curr_sym); + pgf_parsing_symbol(ps, item, item->curr_sym); } break; } case PGF_PRODUCTION_COERCE: { PgfProductionCoerce* pcoerce = i.data; - switch (item->seq_idx) { + switch (item->sym_idx) { case 0: if (pcoerce->coerce->prods == NULL) { // empty category - pgf_item_free(before, after, item); + pgf_item_free(ps, item); return; } - pgf_parsing_td_predict(before, after, item, + pgf_parsing_td_predict(ps, item, pcoerce->coerce, item->conts->lin_idx); break; case 1: - pgf_parsing_complete(before, after, item, NULL); - pgf_item_free(before, after, item); + pgf_parsing_complete(ps, item, NULL); + pgf_item_free(ps, item); break; default: gu_impossible(); @@ -1406,85 +1634,56 @@ pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item) } case PGF_PRODUCTION_EXTERN: { PgfProductionExtern* pext = i.data; - - PgfSequence* seq; + + PgfSymbols* syms; if (pext->lins != NULL && - (seq = gu_seq_get(pext->lins,PgfSequence*,item->conts->lin_idx)) != NULL) { - if (item->seq_idx == gu_seq_length(seq)) { - pgf_parsing_complete(before, after, item, NULL); - pgf_item_free(before, after, item); - } else { + (syms = gu_seq_get(pext->lins,PgfSymbols*,item->conts->lin_idx)) != NULL) { + if (item->sym_idx == gu_seq_length(syms)) { + pgf_parsing_complete(ps, item, NULL); + pgf_item_free(ps, item); + } else { PgfSymbol sym = - gu_seq_get(seq, PgfSymbol, item->seq_idx); - pgf_parsing_symbol(before, after, item, sym); + gu_seq_get(syms, PgfSymbol, item->sym_idx); + pgf_parsing_symbol(ps, item, sym); } } else { - PgfToken tok = (after != NULL) - ? after->ts->fn->get_token(after->ts) - : ""; - - PgfExprProb *ep = NULL; - bool accepted = - pext->callback->match(before->ps->concr, item, - tok, - &ep, before->ps->out_pool); - - if (ep != NULL) - pgf_parsing_complete(before, after, item, ep); - - if (accepted) { - if (after != NULL) { - PgfSymbol prev = item->curr_sym; - size_t tok_len = strlen(tok); - PgfSymbolKS* sks = (PgfSymbolKS*) - gu_alloc_variant(PGF_SYMBOL_KS, - sizeof(PgfSymbol)+sizeof(PgfSymbolKS)+tok_len+1, - gu_alignof(PgfSymbolKS), - &item->curr_sym, after->ps->pool); - strcpy((char*) sks->token, (char*) tok); - *((PgfSymbol*) (((uint8_t*) sks)+sizeof(PgfSymbolKS)+tok_len+1)) = prev; - - item->seq_idx++; - pgf_parsing_add_transition(before, after, tok, item); - } - } else { - pgf_item_free(before, after, item); - } + pgf_parsing_complete(ps, item, pext->ep); + pgf_item_free(ps, item); } break; } case PGF_PRODUCTION_META: { - if (item->seq_idx == pgf_item_sequence_length(item)) { - if (before->meta_item != NULL) + if (item->sym_idx == pgf_item_symbols_length(item)) { + if (ps->before->meta_item != NULL) break; - before->meta_item = item; + ps->before->meta_item = item; - if (after == NULL) { - PgfExprProb *ep = gu_new(PgfExprProb, before->ps->pool); - ep->expr = before->ps->meta_var; + if (ps->after == NULL) { + PgfExprProb *ep = gu_new(PgfExprProb, ps->pool); + ep->expr = ps->meta_var; ep->prob = item->inside_prob; size_t n_args = gu_seq_length(item->args); for (size_t i = 0; i < n_args; i++) { PgfPArg* arg = gu_seq_index(item->args, PgfPArg, i); ep->prob -= arg->ccat->viterbi_prob; } - pgf_parsing_complete(before, after, item, ep); + pgf_parsing_complete(ps, item, ep); } else { prob_t meta_token_prob = item->conts->ccat->cnccat->abscat->meta_token_prob; if (meta_token_prob != INFINITY) { - pgf_parsing_meta_scan(before, after, item, meta_token_prob); + pgf_parsing_meta_scan(ps, item, meta_token_prob); } PgfCIdMap* meta_child_probs = item->conts->ccat->cnccat->abscat->meta_child_probs; if (meta_child_probs != NULL) { - PgfMetaPredictFn clo = { { pgf_parsing_meta_predict }, before, item }; + PgfMetaPredictFn clo = { { pgf_parsing_meta_predict }, ps, item }; gu_map_iter(meta_child_probs, &clo.fn, NULL); } } } else { - pgf_parsing_symbol(before, after, item, item->curr_sym); + pgf_parsing_symbol(ps, item, item->curr_sym); } break; } @@ -1493,66 +1692,6 @@ pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item) } } -static bool -pgf_parsing_proceed(PgfParseState* state) -{ - prob_t best_prob = INFINITY; - if (gu_buf_length(state->ps->expr_queue) > 0) { - best_prob = gu_buf_get(state->ps->expr_queue, PgfExprState*, 0)->ep.prob; - } - - PgfParseState* before = NULL; - - prob_t delta_prob = 0; - PgfParseState* st = state; - while (st != NULL) { - if (gu_buf_length(st->agenda) > 0) { - PgfItem* item = gu_buf_get(st->agenda, PgfItem*, 0); - prob_t item_prob = - item->inside_prob+item->conts->outside_prob+delta_prob; - if (item_prob < best_prob) { - best_prob = item_prob; - before = st; - } - } - - prob_t state_delta = - (st->viterbi_prob-(st->next ? st->next->viterbi_prob : 0))* - state->ps->beam_size; - //prob_t lexical_prob = - // st->ts ? st->ts->lexical_prob : 0; - delta_prob += state_delta; /*fmax(state_delta, lexical_prob)*/; // the calculation of lexical_prob doesn't work properly. - st = st->next; - } - - if (before == NULL) - return false; - - PgfParseState* after = NULL; - - st = state; - while (st != before) { - PgfParseState* tmp = st->next; - st->next = after; - after = st; - st = tmp; - } - - PgfItem* item; - gu_buf_heap_pop(before->agenda, &pgf_item_prob_order, &item); - pgf_parsing_item(before, after, item); - - while (after != NULL) { - PgfParseState* tmp = after->next; - after->next = before; - before = after; - after = tmp; - } - state = before; - - return true; -} - static prob_t pgf_parsing_default_beam_size(PgfConcr* concr) { @@ -1567,15 +1706,19 @@ pgf_parsing_default_beam_size(PgfConcr* concr) } static PgfParsing* -pgf_new_parsing(PgfConcr* concr, double heuristics, +pgf_new_parsing(PgfConcr* concr, + GuString sentence, double heuristics, GuPool* pool, GuPool* out_pool) { PgfParsing* ps = gu_new(PgfParsing, pool); ps->concr = concr; ps->pool = pool; ps->out_pool = out_pool; + ps->sentence = sentence; ps->expr_queue = gu_new_buf(PgfExprState*, pool); ps->max_fid = concr->total_cats; + ps->before = NULL; + ps->after = NULL; #ifdef PGF_COUNTS_DEBUG ps->item_full_count = 0; ps->item_real_count = 0; @@ -1602,68 +1745,6 @@ pgf_new_parsing(PgfConcr* concr, double heuristics, return ps; } -static PgfParseState* -pgf_new_parse_state(PgfParsing* ps, - PgfParseState* next, - PgfTokenState* ts, - GuPool* pool) -{ - PgfParseState* state = gu_new(PgfParseState, pool); - state->next = next; - state->agenda = gu_new_buf(PgfItem*, pool); - state->meta_item = NULL; - state->generated_cats = gu_map_type_new(PgfGenCatMap, pool); - state->conts_map = gu_map_type_new(PgfContsMap, pool); - state->offset = next ? next->offset+1 : 0; - state->viterbi_prob = 0; - state->ps = ps; - state->ts = ts; - return state; -} - -typedef struct { - GuMapItor fn; - PgfTokenState* ts; -} PgfLexiconFn; - -static void -pgf_parser_compute_lexicon_prob(GuMapItor* fn, const void* key, void* value, GuExn* err) -{ - PgfTokenState* ts = ((PgfLexiconFn*) fn)->ts; - PgfProductionBuf* prods = *((PgfProductionBuf**) value); - - if (prods == NULL) - return; - - size_t n_prods = gu_buf_length(prods); - for (size_t i = 0; i < n_prods; i++) { - PgfProduction prod = - gu_buf_get(prods, PgfProduction, i); - - GuVariantInfo pi = gu_variant_open(prod); - switch (pi.tag) { - case PGF_PRODUCTION_APPLY: { - PgfProductionApply* papp = pi.data; - if (ts->lexical_prob > papp->fun->ep->prob) { - ts->lexical_prob = papp->fun->ep->prob; - } - break; - } - } - } -} - -#define pgf_new_token_state(ty, pool) \ - (ty*) pgf_new_token_state_(&pgf_tsfn_##ty, (PgfTokenState*) gu_new(ty, pool)) - -static PgfTokenState* -pgf_new_token_state_(PgfTokenFn* fn, PgfTokenState* ts) -{ - ts->fn = fn; - ts->lexical_prob = INFINITY; - return ts; -} - #ifdef PGF_COUNTS_DEBUG void pgf_parsing_print_counts(PgfParsing* ps) { @@ -1676,6 +1757,8 @@ void pgf_parsing_print_counts(PgfParsing* ps) } #endif +/*static bool +************* typedef struct { PgfTokenState ts; PgfToken tok; @@ -1744,6 +1827,7 @@ typedef struct { } PgfPrefixTokenState; static bool +^ ^ ^ ^ ^ ^ ^ pgf_prefix_match_token(PgfTokenState* ts0, PgfToken tok, PgfItem* item) { PgfPrefixTokenState* ts = @@ -1754,9 +1838,18 @@ pgf_prefix_match_token(PgfTokenState* ts0, PgfToken tok, PgfItem* item) PgfSequence* seq; pgf_item_sequence(item, &lin_idx, &seq, ts->pool); + uint16_t seq_idx = item->seq_idx; + uint8_t tok_idx = item->tok_idx; + + // go one token back + if (tok_idx > 0) + tok_idx--; + else + seq_idx--; + ts->tp = gu_new(PgfTokenProb, ts->pool); ts->tp->tok = - pgf_get_tokens(seq, item->seq_idx-1, ts->pool); + pgf_get_tokens(seq, seq_idx, tok_idx, ts->pool); ts->tp->cat = item->conts->ccat->cnccat->abscat->name; ts->tp->prob = item->inside_prob+item->conts->outside_prob; } @@ -1794,24 +1887,25 @@ pgf_parser_completions_next(GuEnum* self, void* to, GuPool* pool) } *((PgfTokenProb**)to) = ts->tp; -} +}*/ GuEnum* -pgf_parser_completions(PgfParseState* prev, GuString prefix) +pgf_parsing_completions(PgfParsing* ps, GuString prefix) { #ifdef PGF_COUNTS_DEBUG - pgf_parsing_print_counts(prev->ps); + pgf_parsing_print_counts(ps); #endif - PgfPrefixTokenState* ts = +/* PgfPrefixTokenState* ts = pgf_new_token_state(PgfPrefixTokenState, prev->ps->pool); ts->en.next = pgf_parser_completions_next; ts->prefix = prefix; ts->tp = NULL; ts->state = - pgf_new_parse_state(prev->ps, prev, &ts->ts, prev->ps->pool); + pgf_new_parse_state(prev->ps, prev, &ts->ts); - return &ts->en; + return &ts->en;*/ + return NULL; } static int @@ -1981,17 +2075,134 @@ pgf_parse_result_is_new(PgfExprState* st) return true; } +// TODO: s/CId/Cat, add the cid to Cat, make Cat the key to CncCat +static PgfParsing* +pgf_parsing_init(PgfConcr* concr, PgfCId cat, size_t lin_idx, + GuString sentence, double heuristics, + GuExn* err, + GuPool* pool, GuPool* out_pool) +{ + PgfCncCat* cnccat = + gu_map_get(concr->cnccats, cat, PgfCncCat*); + if (!cnccat) { + GuExnData* exn = gu_raise(err, PgfExn); + exn->data = "Unknown start category"; + return NULL; + } + + gu_assert(lin_idx < cnccat->n_lins); + + if (heuristics < 0) { + heuristics = pgf_parsing_default_beam_size(concr); + } + + PgfParsing* ps = + pgf_new_parsing(concr, sentence, heuristics, pool, out_pool); + PgfParseState* state = + pgf_new_parse_state(ps, 0, BIND_SOFT); + + size_t n_ccats = gu_seq_length(cnccat->cats); + for (size_t i = 0; i < n_ccats; i++) { + PgfCCat* ccat = gu_seq_get(cnccat->cats, PgfCCat*, i); + if (ccat != NULL) { + if (ccat->prods == NULL) { + // Empty category + continue; + } + + PgfItemConts* conts = + pgf_parsing_get_conts(state, ccat, lin_idx, ps->pool); + gu_buf_push(conts->items, PgfItem*, NULL); + +#ifdef PGF_COUNTS_DEBUG + ps->cont_full_count++; +#endif + + size_t n_prods = gu_seq_length(ccat->prods); + for (size_t i = 0; i < n_prods; i++) { + PgfProduction prod = + gu_seq_get(ccat->prods, PgfProduction, i); + PgfItem* item = + pgf_new_item(ps, conts, prod); + gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item); + } + + PgfItem *item = + pgf_new_item(ps, conts, ps->meta_prod); + item->inside_prob = + ccat->cnccat->abscat->meta_prob; + gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item); + } + } + + return ps; +} + +static bool +pgf_parsing_proceed(PgfParsing* ps) +{ + bool has_progress = false; + + prob_t best_prob = INFINITY; + if (gu_buf_length(ps->expr_queue) > 0) { + best_prob = gu_buf_get(ps->expr_queue, PgfExprState*, 0)->ep.prob; + } + + prob_t delta_prob = 0; + PgfParseState* st = ps->before; + while (st != NULL) { + if (gu_buf_length(st->agenda) > 0) { + PgfItem* item = gu_buf_get(st->agenda, PgfItem*, 0); + prob_t item_prob = + item->inside_prob+item->conts->outside_prob+delta_prob; + if (item_prob < best_prob) { + best_prob = item_prob; + + while (st != ps->before) { + PgfParseState* tmp = ps->before->next; + ps->before->next = ps->after; + ps->after = ps->before; + ps->before = tmp; + } + + has_progress = true; + } + } + + prob_t state_delta = + (st->viterbi_prob-(st->next ? st->next->viterbi_prob : 0))* + ps->beam_size; + delta_prob += state_delta; + st = st->next; + } + + if (has_progress) { + PgfItem* item; + gu_buf_heap_pop(ps->before->agenda, pgf_item_prob_order, &item); + pgf_parsing_item(ps, item); + } + + while (ps->after != NULL) { + PgfParseState* tmp = ps->after->next; + ps->after->next = ps->before; + ps->before = ps->after; + ps->after = tmp; + } + + return has_progress; +} + static PgfExprProb* -pgf_parse_result_next(PgfParseResult* pr) +pgf_parse_result_next(PgfParsing* ps) { for (;;) { - while (pgf_parsing_proceed(pr->state)); + while (pgf_parsing_proceed(ps)); - if (gu_buf_length(pr->state->ps->expr_queue) == 0) + if (gu_buf_length(ps->expr_queue) == 0) break; PgfExprState* st; - gu_buf_heap_pop(pr->state->ps->expr_queue, &pgf_expr_state_order, &st); + gu_buf_heap_pop(ps->expr_queue, &pgf_expr_state_order, &st); #ifdef PGF_PARSER_DEBUG #ifdef PGF_RESULT_DEBUG @@ -2008,16 +2219,16 @@ pgf_parse_result_next(PgfParseResult* pr) PgfCCat* ccat = gu_seq_index(st->args, PgfPArg, st->arg_idx)->ccat; - if (ccat->fid < pr->state->ps->concr->total_cats) { + if (ccat->fid < ps->concr->total_cats) { st->ep.expr = - gu_new_variant_i(pr->state->ps->out_pool, + gu_new_variant_i(ps->out_pool, PGF_EXPR_APP, PgfExprApp, .fun = st->ep.expr, - .arg = pr->state->ps->meta_var); + .arg = ps->meta_var); st->arg_idx++; - gu_buf_heap_push(pr->state->ps->expr_queue, &pgf_expr_state_order, &st); + gu_buf_heap_push(ps->expr_queue, &pgf_expr_state_order, &st); } else { - pgf_result_predict(pr->state->ps, st, ccat); + pgf_result_predict(ps, st, ccat); } } else if (pgf_parse_result_is_new(st)) { gu_buf_push(st->answers->exprs, PgfExprProb*, &st->ep); @@ -2030,10 +2241,10 @@ pgf_parse_result_next(PgfParseResult* pr) return &st->ep; } - PgfExprState* st3 = gu_new(PgfExprState, pr->state->ps->pool); + PgfExprState* st3 = gu_new(PgfExprState, ps->pool); st3->answers = st2->answers; st3->ep.expr = - gu_new_variant_i(pr->state->ps->out_pool, + gu_new_variant_i(ps->out_pool, PGF_EXPR_APP, PgfExprApp, .fun = st2->ep.expr, .arg = st->ep.expr); @@ -2041,7 +2252,7 @@ pgf_parse_result_next(PgfParseResult* pr) st3->args = st2->args; st3->arg_idx = st2->arg_idx+1; - gu_buf_heap_push(pr->state->ps->expr_queue, &pgf_expr_state_order, &st3); + gu_buf_heap_push(ps->expr_queue, &pgf_expr_state_order, &st3); } } } @@ -2052,181 +2263,111 @@ pgf_parse_result_next(PgfParseResult* pr) static void pgf_parse_result_enum_next(GuEnum* self, void* to, GuPool* pool) { - PgfParseResult* pr = gu_container(self, PgfParseResult, en); - *(PgfExprProb**)to = pgf_parse_result_next(pr); + PgfParsing* ps = gu_container(self, PgfParsing, en); + *(PgfExprProb**)to = pgf_parse_result_next(ps); } -PgfExprEnum* -pgf_parse_result(PgfParseState* state) +static GuString +pgf_parsing_last_token(PgfParsing* ps, GuPool* pool) +{ + if (ps->before == NULL) + return ""; + + size_t start = ps->before->end_offset; + while (start > 0) { + char c = ps->sentence[start-1]; + if (gu_is_space(c)) + break; + start--; + } + + size_t end = ps->before->end_offset; + while (ps->sentence[end] != 0) { + char c = ps->sentence[end]; + if (gu_is_space(c)) + break; + end++; + } + + char* tok = gu_malloc(pool, end-start+1); + memcpy(tok, ps->sentence+start, (end-start)); + tok[end-start] = 0; + return tok; +} + +GU_DEFINE_TYPE(PgfParseError, abstract, _); + +GuEnum* +pgf_parse(PgfConcr* concr, PgfCId cat, GuString sentence, + GuExn* err, + GuPool* pool, GuPool* out_pool) { + return pgf_parse_with_heuristics(concr, cat, sentence, -1.0, err, pool, out_pool); +} + +GuEnum* +pgf_parse_with_heuristics(PgfConcr* concr, PgfCId cat, GuString sentence, + double heuristics, + GuExn* err, + GuPool* pool, GuPool* out_pool) +{ + // Begin parsing a sentence with the specified category + PgfParsing* ps = + pgf_parsing_init(concr, cat, 0, sentence, heuristics, err, pool, out_pool); + if (ps == NULL) { + return NULL; + } + #ifdef PGF_COUNTS_DEBUG - pgf_parsing_print_counts(state->ps); + pgf_parsing_print_counts(ps); #endif - PgfParseResult* res = gu_new(PgfParseResult, state->ps->pool); - res->state = state; - res->en.next = pgf_parse_result_enum_next; - return &res->en; -} - -void -pgf_parse_print_chunks(PgfParseState* state) -{ -/* if (state->ps->completed == NULL) { - while (state->ps->completed == NULL) { - if (!pgf_parsing_proceed(state)) - break; - } - if (state->ps->completed == NULL) - return; - } - - GuPool* tmp_pool = gu_new_pool(); - GuOut* out = gu_file_out(stdout, tmp_pool); - GuWriter* wtr = gu_new_utf8_writer(out, tmp_pool); - GuExn* err = gu_exn(NULL, type, tmp_pool); - - PgfCCat* completed = state->ps->completed; - if (gu_seq_length(completed->prods) == 0) - return; - - size_t n_args = 0; - size_t arg_idx = 0; - PgfCCat* ccat = NULL; - PgfProductionMeta* pmeta = NULL; - - PgfProduction prod = gu_seq_get(completed->prods, PgfProduction, 0); - GuVariantInfo pi = gu_variant_open(prod); - switch (pi.tag) { - case PGF_PRODUCTION_APPLY: - n_args = 1; - arg_idx = 0; - ccat = completed; - break; - case PGF_PRODUCTION_META: - pmeta = pi.data; - n_args = gu_seq_length(pmeta->args); - arg_idx = 0; - ccat = gu_seq_index(pmeta->args, PgfPArg, arg_idx)->ccat; - break; - } - - PgfParseState* next = NULL; - while (state != NULL) { - PgfParseState* tmp = state->next; - state->next = next; - next = state; - state = tmp; - } - - int offset = 0; - - state = next; - next = NULL; - while (state != NULL) { - if (state->ts != NULL) - { - if (ccat != NULL && - offset == ((ccat->conts->state != NULL) ? ccat->conts->state->offset : 0)) { - PgfCCat *ccat2 = ccat; - while (ccat2->conts != NULL) { - ccat2 = ccat2->conts->ccat; - } - - gu_putc('(', wtr, err); - gu_string_write(ccat2->cnccat->abscat->name, wtr, err); - gu_putc(' ', wtr, err); - } - - gu_string_write(state->ts->tok, wtr, err); - offset++; - - if (ccat != NULL && - ccat == - gu_map_get(state->generated_cats, ccat->conts, PgfCCat*)) { - gu_putc(')', wtr, err); - - arg_idx++; - ccat = - (arg_idx >= n_args) ? - NULL : - gu_seq_index(pmeta->args, PgfPArg, arg_idx)->ccat; - } - - gu_putc(' ', wtr, err); + while (gu_buf_length(ps->expr_queue) == 0) { + if (!pgf_parsing_proceed(ps)) { + GuExnData* exn = gu_raise(err, PgfParseError); + exn->data = (void*) pgf_parsing_last_token(ps, exn->pool); + return NULL; } - PgfParseState* tmp = state->next; - state->next = next; - next = state; - state = tmp; +#ifdef PGF_COUNTS_DEBUG + pgf_parsing_print_counts(ps); +#endif } - gu_putc('\n', wtr, err); - gu_pool_free(tmp_pool);*/ + // Now begin enumerating the resulting syntax trees + ps->en.next = pgf_parse_result_enum_next; + return &ps->en; } -// TODO: s/CId/Cat, add the cid to Cat, make Cat the key to CncCat -PgfParseState* -pgf_parser_init_state(PgfConcr* concr, PgfCId cat, size_t lin_idx, - double heuristics, - GuPool* pool, GuPool* out_pool) +GuEnum* +pgf_complete(PgfConcr* concr, PgfCId cat, GuString sentence, + GuString prefix, GuExn *err, GuPool* pool) { - PgfCncCat* cnccat = - gu_map_get(concr->cnccats, cat, PgfCncCat*); - if (!cnccat) + // Begin parsing a sentence of the specified category + PgfParsing* ps = + pgf_parsing_init(concr, cat, 0, sentence, -1, err, pool, pool); + if (ps == NULL) { + return NULL; + } + + // Tokenization + GuExn* lex_err = gu_new_exn(NULL, gu_kind(type), pool); +/* PgfToken tok = pgf_lexer_read_token(lexer, lex_err); + while (!gu_exn_is_raised(lex_err)) { + // feed the token to get a new parse state + state = pgf_parser_next_state(state, tok); + if (state == NULL) { + return NULL; + } + + tok = pgf_lexer_read_token(lexer, lex_err); + }*/ + + if (gu_exn_caught(lex_err) != gu_type(GuEOF)) return NULL; - gu_assert(lin_idx < cnccat->n_lins); - - if (heuristics < 0) { - heuristics = pgf_parsing_default_beam_size(concr); - } - - PgfParsing* ps = - pgf_new_parsing(concr, heuristics, pool, out_pool); - PgfParseState* state = - pgf_new_parse_state(ps, NULL, NULL, pool); - - size_t n_ccats = gu_seq_length(cnccat->cats); - for (size_t i = 0; i < n_ccats; i++) { - PgfCCat* ccat = gu_seq_get(cnccat->cats, PgfCCat*, i); - if (ccat != NULL) { - if (ccat->prods == NULL) { - // Empty category - continue; - } - - PgfItemConts* conts = gu_new(PgfItemConts, pool); - conts->ccat = ccat; - conts->lin_idx = lin_idx; - conts->state = NULL; - conts->items = gu_new_buf(PgfItem*, pool); - conts->outside_prob = 0; - conts->ref_count = 0; - gu_buf_push(conts->items, PgfItem*, NULL); - -#ifdef PGF_COUNTS_DEBUG - ps->cont_full_count++; -#endif - - size_t n_prods = gu_seq_length(ccat->prods); - for (size_t i = 0; i < n_prods; i++) { - PgfProduction prod = - gu_seq_get(ccat->prods, PgfProduction, i); - PgfItem* item = - pgf_new_item(conts, prod, pool, ps); - gu_buf_heap_push(state->agenda, &pgf_item_prob_order, &item); - } - - PgfItem *item = - pgf_new_item(conts, ps->meta_prod, pool, ps); - item->inside_prob = - ccat->cnccat->abscat->meta_prob; - gu_buf_heap_push(state->agenda, &pgf_item_prob_order, &item); - } - } - return state; + // Now begin enumerating the resulting syntax trees + return pgf_parsing_completions(ps, prefix); } void @@ -2242,306 +2383,114 @@ pgf_parser_add_literal(PgfConcr *concr, PgfCId cat, PgfLiteralCallback*, callback); } -typedef struct { - GuMapItor fn; - PgfTokens* tokens; - PgfMorphoCallback* callback; -} PgfMorphoFn; - static void -pgf_morpho_iter(GuMapItor* fn, const void* key, void* value, GuExn* err) +pgf_morpho_iter(PgfProductionIdx* idx, + PgfMorphoCallback* callback, + GuExn* err) { - PgfMorphoFn* clo = (PgfMorphoFn*) fn; - PgfCFCat cfc = *((PgfCFCat*) key); - PgfProductionBuf* prods = *((PgfProductionBuf**) value); + size_t n_entries = gu_buf_length(idx); + for (size_t i = 0; i < n_entries; i++) { + PgfProductionIdxEntry* entry = + gu_buf_index(idx, PgfProductionIdxEntry, i); - if (prods == NULL) - return; - - GuString analysis = cfc.ccat->cnccat->labels[cfc.lin_idx]; - - size_t n_prods = gu_buf_length(prods); - for (size_t i = 0; i < n_prods; i++) { - PgfProduction prod = - gu_buf_get(prods, PgfProduction, i); - - GuVariantInfo i = gu_variant_open(prod); - switch (i.tag) { - case PGF_PRODUCTION_APPLY: { - PgfProductionApply* papp = i.data; - - if (clo->tokens != NULL) { - // match the tokens with the production - size_t pos = 0; - PgfSequence* seq = papp->fun->lins[cfc.lin_idx]; - size_t len = gu_seq_length(seq); - for (size_t i = 0; i < len; i++) { - PgfSymbol sym = gu_seq_get(seq, PgfSymbol, i); - - GuVariantInfo i = gu_variant_open(sym); - switch (i.tag) { - case PGF_SYMBOL_KS: { - PgfSymbolKS* symks = i.data; - - if (pos >= gu_seq_length(clo->tokens)) - goto cont; - - PgfToken tok1 = symks->token; - PgfToken tok2 = gu_seq_get(clo->tokens, PgfToken, pos++); - - if (strcmp(tok1, tok2) != 0) - goto cont; - } - default: - continue; - } - } - - if (pos != gu_seq_length(clo->tokens)) - goto cont; - } - - PgfCId lemma = papp->fun->absfun->name; - prob_t prob = papp->fun->absfun->ep.prob; - clo->callback->callback(clo->callback, - lemma, analysis, prob, err); - } - } - cont:; + PgfCId lemma = entry->papp->fun->absfun->name; + GuString analysis = entry->ccat->cnccat->labels[entry->lin_idx]; + prob_t prob = entry->papp->fun->absfun->ep.prob; + callback->callback(callback, + lemma, analysis, prob, err); + if (!gu_ok(err)) + return; } } +static int +pgf_sequence_cmp_fn(GuOrder* self, const void* p1, const void* p2) +{ + (void) self; + GuString sent = (GuString) p1; + const PgfSequence* sp2 = p2; + + BIND_TYPE bind = BIND_HARD; + int res = pgf_symbols_cmp(&sent, strlen(sent), &bind, sp2->syms); + if (res == 0 && *sent != 0) { + res = 1; + } + + return res; +} + +static GuOrder pgf_sequence_order[1] = { { pgf_sequence_cmp_fn } }; + void -pgf_lookup_morpho(PgfConcr *concr, PgfLexer *lexer, +pgf_lookup_morpho(PgfConcr *concr, GuString sentence, PgfMorphoCallback* callback, GuExn* err) { - GuPool* tmp_pool = gu_local_pool(); - - GuBuf* tokens = gu_new_buf(PgfToken, tmp_pool); - GuExn* lex_err = gu_new_exn(NULL, gu_kind(type), tmp_pool); + PgfSequence* seq = (PgfSequence*) + gu_seq_binsearch_(concr->sequences, pgf_sequence_order, + sizeof(PgfSequence), 0, (void*) sentence); - PgfToken tok = pgf_lexer_read_token(lexer, lex_err); - if (gu_exn_is_raised(lex_err)) { - gu_raise(err, PgfExn); - gu_pool_free(tmp_pool); - return; - } - - PgfProductionIdx* lexicon_idx = - gu_map_get(concr->leftcorner_tok_idx, tok, PgfProductionIdx*); - if (lexicon_idx == NULL) { - gu_pool_free(tmp_pool); - return; - } - - do { - gu_buf_push(tokens, PgfToken, tok); - tok = pgf_lexer_read_token(lexer, lex_err); - } while (!gu_exn_is_raised(lex_err)); - - PgfMorphoFn clo = { { pgf_morpho_iter }, gu_buf_data_seq(tokens), callback }; - gu_map_iter(lexicon_idx, &clo.fn, err); - - gu_pool_free(tmp_pool); + if (seq != NULL && seq->idx != NULL) + pgf_morpho_iter(seq->idx, callback, err); } typedef struct { GuEnum en; - GuEnum* map_en1; - GuEnum* map_en2; - - GuMapItor fn; - PgfLeftcornerTokIdx* new_idx; - - GuPool* pool; + PgfSequences* sequences; + size_t seq_idx; } PgfFullFormState; -static void -pgf_fullform_iter(GuMapItor* fn, const void* key, void* value, GuExn* err) -{ - PgfFullFormState* st = gu_container(fn, PgfFullFormState, fn); - PgfCFCat cfc = *((PgfCFCat*) key); - PgfProductionBuf* prods = *((PgfProductionBuf**) value); - - if (prods == NULL) - return; - - size_t n_prods = gu_buf_length(prods); - for (size_t i = 0; i < n_prods; i++) { - PgfProduction prod = - gu_buf_get(prods, PgfProduction, i); - - GuVariantInfo i = gu_variant_open(prod); - switch (i.tag) { - case PGF_PRODUCTION_APPLY: { - PgfProductionApply* papp = i.data; - - PgfSequence* seq = papp->fun->lins[cfc.lin_idx]; - GuString tokens = pgf_get_tokens(seq, 0, st->pool); - - // create a new production index with keys that - // are multiword units - PgfProductionIdx* lexicon_idx = - gu_map_get(st->new_idx, tokens, PgfProductionIdx*); - if (lexicon_idx == NULL) { - lexicon_idx = gu_map_type_new(PgfProductionIdx, st->pool); - gu_map_put(st->new_idx, tokens, PgfProductionIdx*, lexicon_idx); - } - - PgfProductionBuf* prods = - gu_map_get(lexicon_idx, &cfc, PgfProductionBuf*); - if (prods == NULL) { - prods = gu_new_buf(PgfProduction, st->pool); - gu_map_put(lexicon_idx, &cfc, PgfProductionBuf*, prods); - } - - gu_buf_push(prods, PgfProduction, prod); - } - } - } -} +struct PgfFullFormEntry { + GuString tokens; + PgfProductionIdx* idx; +}; static void gu_fullform_enum_next(GuEnum* self, void* to, GuPool* pool) { PgfFullFormState* st = gu_container(self, PgfFullFormState, en); + PgfFullFormEntry* entry = NULL; - for (;;) { - if (st->new_idx == NULL) { - GuMapKeyValue* kv = gu_next(st->map_en1, GuMapKeyValue*, pool); - if (kv == NULL) { - *((PgfFullFormEntry**)to) = NULL; - return; - } + size_t n_seqs = gu_seq_length(st->sequences); + while (st->seq_idx < n_seqs) { + PgfSymbols* syms = gu_seq_index(st->sequences, PgfSequence, st->seq_idx)->syms; + GuString tokens = pgf_get_tokens(syms, 0, pool); + if (strlen(tokens) > 0 && + gu_seq_index(st->sequences, PgfSequence, st->seq_idx)->idx != NULL) { + entry = gu_new(PgfFullFormEntry, pool); + entry->tokens = tokens; + entry->idx = gu_seq_index(st->sequences, PgfSequence, st->seq_idx)->idx; - PgfProductionIdx* lexicon_idx = *((PgfProductionIdx**) kv->value); - - // we have an index by the first token but we must re-index - // by taking into account the multiword units - st->pool = pool; - st->new_idx = gu_map_type_new(PgfLeftcornerTokIdx, pool); - st->fn.fn = pgf_fullform_iter; - gu_map_iter(lexicon_idx, &st->fn, NULL); - - st->map_en2 = gu_map_enum(st->new_idx, pool); - } - PgfFullFormEntry* entry = - gu_next(st->map_en2, PgfFullFormEntry*, pool); - if (entry != NULL) { - *((PgfFullFormEntry**)to) = entry; + st->seq_idx++; break; } - - st->new_idx = NULL; + + st->seq_idx++; } + + *((PgfFullFormEntry**) to) = entry; } GuEnum* pgf_fullform_lexicon(PgfConcr *concr, GuPool* pool) { PgfFullFormState* st = gu_new(PgfFullFormState, pool); - st->en.next = gu_fullform_enum_next; - st->map_en1 = gu_map_enum(concr->leftcorner_tok_idx, pool); - st->map_en2 = NULL; - st->new_idx = NULL; - st->pool = NULL; + st->en.next = gu_fullform_enum_next; + st->sequences = concr->sequences; + st->seq_idx = 0; return &st->en; } GuString pgf_fullform_get_string(PgfFullFormEntry* entry) { - return (GuString) entry->key; + return entry->tokens; } void pgf_fullform_get_analyses(PgfFullFormEntry* entry, PgfMorphoCallback* callback, GuExn* err) { - PgfProductionIdx* lexicon_idx = *((PgfProductionIdx**) entry->value); - PgfMorphoFn clo = { { pgf_morpho_iter }, NULL, callback }; - gu_map_iter(lexicon_idx, &clo.fn, err); -} - -static void -pgf_parser_index_token(PgfConcr* concr, - PgfToken tok, - PgfCCat* ccat, size_t lin_idx, PgfProduction prod, - GuPool *pool) -{ - PgfProductionIdx* set = - gu_map_get(concr->leftcorner_tok_idx, tok, PgfProductionIdx*); - if (set == NULL) { - set = gu_map_type_new(PgfProductionIdx, pool); - gu_map_put(concr->leftcorner_tok_idx, tok, PgfProductionIdx*, set); - } - - PgfCFCat cfc = {ccat, lin_idx}; - PgfProductionBuf* prods = gu_map_get(set, &cfc, PgfProductionBuf*); - if (prods == NULL) { - prods = gu_new_buf(PgfProduction, pool); - gu_map_put(set, &cfc, PgfProductionBuf*, prods); - } - - gu_buf_push(prods, PgfProduction, prod); -} - -static void -pgf_parser_index_epsilon(PgfConcr* concr, - PgfCCat* ccat, size_t lin_idx, PgfProduction prod, - GuPool *pool) -{ - PgfCFCat cfc = {ccat, lin_idx}; - PgfProductionBuf* prods = - gu_map_get(concr->epsilon_idx, &cfc, PgfProductionBuf*); - if (prods == NULL) { - prods = gu_new_buf(PgfProduction, pool); - gu_map_put(concr->epsilon_idx, &cfc, PgfProductionBuf*, prods); - } - - gu_buf_push(prods, PgfProduction, prod); -} - -static void -pgf_parser_index_symbol(PgfConcr* concr, PgfSymbol sym, - PgfCCat* ccat, size_t lin_idx, PgfProduction prod, - GuPool *pool) -{ - GuVariantInfo i = gu_variant_open(sym); - switch (i.tag) { - case PGF_SYMBOL_KS: { - PgfSymbolKS* sks = i.data; - pgf_parser_index_token(concr, - sks->token, - ccat, lin_idx, prod, - pool); - break; - } - case PGF_SYMBOL_KP: { - PgfSymbolKP* skp = i.data; - PgfSymbol sym = - gu_seq_get(skp->default_form, PgfSymbol, 0); - pgf_parser_index_symbol(concr, sym, - ccat, lin_idx, prod, - pool); - for (size_t i = 0; i < skp->n_forms; i++) { - sym = gu_seq_get(skp->forms[i].form, PgfSymbol, 0); - pgf_parser_index_symbol(concr, sym, - ccat, lin_idx, prod, - pool); - } - break; - } - case PGF_SYMBOL_CAT: - case PGF_SYMBOL_LIT: - case PGF_SYMBOL_NE: - case PGF_SYMBOL_BIND: - case PGF_SYMBOL_VAR: - // Nothing to be done here - break; - default: - gu_impossible(); - } + pgf_morpho_iter(entry->idx, callback, err); } void @@ -2559,15 +2508,14 @@ pgf_parser_index(PgfConcr* concr, break; PgfSequence* seq = papp->fun->lins[lin_idx]; - if (gu_seq_length(seq) > 0) { - pgf_parser_index_symbol(concr, gu_seq_get(seq, PgfSymbol, 0), - ccat, lin_idx, prod, - pool); - } else { - pgf_parser_index_epsilon(concr, - ccat, lin_idx, prod, - pool); + if (seq->idx == NULL) { + seq->idx = gu_new_buf(PgfProductionIdxEntry, pool); } + + PgfProductionIdxEntry* entry = gu_buf_extend(seq->idx); + entry->ccat = ccat; + entry->lin_idx = lin_idx; + entry->papp = papp; } break; case PGF_PRODUCTION_COERCE: @@ -2631,24 +2579,3 @@ pgf_ccat_set_viterbi_prob(PgfCCat* ccat) { return ccat->viterbi_prob; } - -static bool -pgf_cfcat_eq_fn(GuEquality* self, const void* a, const void* b) -{ - PgfCFCat *x = (PgfCFCat *) a; - PgfCFCat *y = (PgfCFCat *) b; - - return (x->ccat->fid == y->ccat->fid && x->lin_idx == y->lin_idx); -} - -static GuHash -pgf_cfcat_hash_fn(GuHasher* self, const void* a) -{ - PgfCFCat *x = (PgfCFCat *) a; - return ((x->ccat->fid << 16) ^ x->lin_idx); -} - -GuHasher pgf_cfcat_hasher = { - { pgf_cfcat_eq_fn }, - pgf_cfcat_hash_fn -}; diff --git a/src/runtime/c/pgf/parser.h b/src/runtime/c/pgf/parser.h index 5e88b0742..93addbf7f 100644 --- a/src/runtime/c/pgf/parser.h +++ b/src/runtime/c/pgf/parser.h @@ -5,119 +5,14 @@ #include #include -/// Parsing -/** @file - * - * @todo Querying the parser for expected continuations - * - * @todo Literals and custom categories - * - * @todo HOAS, dependent types... - */ +void +pgf_add_extern_tok(PgfSymbol* psym, PgfToken tok, GuPool* pool); -typedef struct PgfParseState PgfParseState; - -/** @} - * - * @name Parsing a sentence - * - * The progress of parsing is controlled by the client code. Firstly, the - * parsing of a sentence is initiated with #pgf_parser_parse. This returns an - * initial #PgfParse object, which represents the state of the parsing. A new - * parse state is obtained by feeding a token with #pgf_parse_token. The old - * parse state is unaffected by this, so backtracking - and even branching - - * can be accomplished by retaining the earlier #PgfParse objects. - * - * @{ - */ - -/// Begin parsing -PgfParseState* -pgf_parser_init_state(PgfConcr* concr, PgfCId cat, size_t lin_idx, - double heuristics, - GuPool* pool, GuPool* out_pool); -/**< - * @param parser The parser to use - * - * @param cat The identifier of the abstract category to parse - * - * @param lin_idx The index of the field of the concrete category to parse - * - * @pool - * - * @return An initial parsing state. -*/ - - -/// Feed a token to the parser -PgfParseState* -pgf_parser_next_state(PgfParseState* prev, PgfToken tok); -/**< - * @param parse The current parse state - * - * @param tok The token to feed - * - * @pool - * - * @return A new parse state obtained by feeding \p tok as an input to \p - * parse, or \c NULL if the token was unexpected. - * - * @note The new parse state partially depends on the old one, so it doesn't - * make sense to use a \p pool argument with a longer lifetime than that of - * the pool used to create \parse. - */ - -GuEnum* -pgf_parser_completions(PgfParseState* prev, GuString prefix); +void +pgf_add_extern_cat(PgfSymbol* psym, int d, int r, GuPool* pool); void pgf_parser_add_literal(PgfConcr *concr, PgfCId cat, PgfLiteralCallback* callback); -/** @} - * @name Retrieving abstract syntax trees - * - * After the desired tokens have been fed to the parser, the resulting parse - * state can be queried for completed results. The #pgf_parse_result function - * returns an enumeration (#GuEnum) of possible abstract syntax trees whose - * linearization is the sequence of tokens fed so far. - * - * @{ - */ - -/// Retrieve the current parses from the parse state. -PgfExprEnum* -pgf_parse_result(PgfParseState* state); -/**< - * @param parse A parse state - * - * @pool - * - * @return An enumeration of #PgfExpr elements representing the abstract - * syntax trees that would linearize to the sequence of tokens fed to produce - * \p parse. The enumeration may yield zero, one or more abstract syntax - * trees, depending on whether the parse was unsuccesful, unambiguously - * succesful, or ambiguously successful. - */ - -// Use this procedure only on your own risk. -// It is dirty and it will probably be removed or replaced -// with something else. Currently it is here only for experimental -// purposes. -void -pgf_parse_print_chunks(PgfParseState* state); - -size_t -pgf_item_lin_idx(PgfItem* item); - -void -pgf_item_sequence(PgfItem* item, - size_t* lin_idx, PgfSequence** seq, - GuPool* pool); - -int -pgf_item_sequence_length(PgfItem* item); - -/** @} */ - #endif // PGF_PARSER_H_ diff --git a/src/runtime/c/pgf/parseval.c b/src/runtime/c/pgf/parseval.c index ebd52d4ce..4feadbf07 100644 --- a/src/runtime/c/pgf/parseval.c +++ b/src/runtime/c/pgf/parseval.c @@ -1,8 +1,9 @@ #include +#include #include #include -typedef struct { +/*typedef struct { int start, end; PgfCId cat; int lin_idx; @@ -123,24 +124,50 @@ pgf_metrics_lzn_end_phrase2(PgfLinFuncs** funcs, PgfCId cat, int fid, int lin_id } static PgfLinFuncs pgf_metrics_lin_funcs1 = { +v v v v v v v + .symbol_tokens = pgf_metrics_lzn_symbol_tokens, + .expr_literal = pgf_metrics_lzn_expr_literal, + .begin_phrase = pgf_metrics_lzn_begin_phrase, + .end_phrase = pgf_metrics_lzn_end_phrase1 +============= .symbol_token = pgf_metrics_lzn_symbol_token, .expr_literal = pgf_metrics_lzn_expr_literal, .begin_phrase = pgf_metrics_lzn_begin_phrase, .end_phrase = pgf_metrics_lzn_end_phrase1 +************* + .symbol_tokens = pgf_metrics_lzn_symbol_tokens, + .expr_literal = pgf_metrics_lzn_expr_literal, + .begin_phrase = pgf_metrics_lzn_begin_phrase, + .end_phrase = pgf_metrics_lzn_end_phrase1, + .symbol_glue = NULL +^ ^ ^ ^ ^ ^ ^ }; static PgfLinFuncs pgf_metrics_lin_funcs2 = { +v v v v v v v + .symbol_tokens = pgf_metrics_lzn_symbol_tokens, + .expr_literal = pgf_metrics_lzn_expr_literal, + .begin_phrase = pgf_metrics_lzn_begin_phrase, + .end_phrase = pgf_metrics_lzn_end_phrase2 +============= .symbol_token = pgf_metrics_lzn_symbol_token, .expr_literal = pgf_metrics_lzn_expr_literal, .begin_phrase = pgf_metrics_lzn_begin_phrase, .end_phrase = pgf_metrics_lzn_end_phrase2 +************* + .symbol_tokens = pgf_metrics_lzn_symbol_tokens, + .expr_literal = pgf_metrics_lzn_expr_literal, + .begin_phrase = pgf_metrics_lzn_begin_phrase, + .end_phrase = pgf_metrics_lzn_end_phrase2, + .symbol_glue = NULL +^ ^ ^ ^ ^ ^ ^ }; - +*/ bool pgf_parseval(PgfConcr* concr, PgfExpr expr, PgfCId cat, double *precision, double *recall, double *exact) { - GuPool* pool = gu_new_pool(); +/* GuPool* pool = gu_new_pool(); GuEnum* en_lins1 = pgf_lzr_concretize(concr, expr, pool); @@ -190,6 +217,6 @@ pgf_parseval(PgfConcr* concr, PgfExpr expr, PgfCId cat, *precision = ((double) state.matches)/((double) state.found); *recall = ((double) state.matches)/((double) gu_buf_length(state.phrases)); *exact = pgf_expr_eq(expr, ep->expr) ? 1 : 0; - +*/ return true; } diff --git a/src/runtime/c/pgf/pgf.c b/src/runtime/c/pgf/pgf.c index a1eb62c95..39b032adb 100644 --- a/src/runtime/c/pgf/pgf.c +++ b/src/runtime/c/pgf/pgf.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include @@ -61,7 +60,8 @@ pgf_load_meta_child_probs(PgfPGF* pgf, const char* fpath, PgfAbsCat* abscat1 = gu_map_get(pgf->abstract.cats, cat1, PgfAbsCat*); if (abscat1 == NULL) { - gu_raise(err, PgfExn); + GuExnData* exn = gu_raise(err, PgfExn); + exn->data = "Unknown category name"; goto close; } @@ -73,6 +73,8 @@ pgf_load_meta_child_probs(PgfPGF* pgf, const char* fpath, PgfAbsCat* abscat2 = gu_map_get(pgf->abstract.cats, cat2, PgfAbsCat*); if (abscat2 == NULL) { gu_raise(err, PgfExn); + GuExnData* exn = gu_raise(err, PgfExn); + exn->data = "Unknown category name"; goto close; } @@ -228,73 +230,3 @@ pgf_linearize(PgfConcr* concr, PgfExpr expr, GuOut* out, GuExn* err) gu_pool_free(tmp_pool); } - -GuEnum* -pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, - GuPool* pool, GuPool* out_pool) -{ - return pgf_parse_with_heuristics(concr, cat, lexer, -1.0, pool, out_pool); -} - -GuEnum* -pgf_parse_with_heuristics(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, - double heuristics, - GuPool* pool, GuPool* out_pool) -{ - // Begin parsing a sentence of the specified category - PgfParseState* state = - pgf_parser_init_state(concr, cat, 0, heuristics, pool, out_pool); - if (state == NULL) { - return NULL; - } - - // Tokenization - GuExn* lex_err = gu_new_exn(NULL, gu_kind(type), pool); - PgfToken tok = pgf_lexer_read_token(lexer, lex_err); - while (!gu_exn_is_raised(lex_err)) { - // feed the token to get a new parse state - state = pgf_parser_next_state(state, tok); - if (state == NULL) { - return NULL; - } - - tok = pgf_lexer_read_token(lexer, lex_err); - } - - if (gu_exn_caught(lex_err) != gu_type(GuEOF)) - return NULL; - - // Now begin enumerating the resulting syntax trees - return pgf_parse_result(state); -} - -GuEnum* -pgf_complete(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, - GuString prefix, GuPool* pool) -{ - // Begin parsing a sentence of the specified category - PgfParseState* state = - pgf_parser_init_state(concr, cat, 0, -1, pool, pool); - if (state == NULL) { - return NULL; - } - - // Tokenization - GuExn* lex_err = gu_new_exn(NULL, gu_kind(type), pool); - PgfToken tok = pgf_lexer_read_token(lexer, lex_err); - while (!gu_exn_is_raised(lex_err)) { - // feed the token to get a new parse state - state = pgf_parser_next_state(state, tok); - if (state == NULL) { - return NULL; - } - - tok = pgf_lexer_read_token(lexer, lex_err); - } - - if (gu_exn_caught(lex_err) != gu_type(GuEOF)) - return NULL; - - // Now begin enumerating the resulting syntax trees - return pgf_parser_completions(state, prefix); -} diff --git a/src/runtime/c/pgf/pgf.h b/src/runtime/c/pgf/pgf.h index 9c95f6f90..b586eb7b0 100644 --- a/src/runtime/c/pgf/pgf.h +++ b/src/runtime/c/pgf/pgf.h @@ -37,6 +37,7 @@ extern GU_DECLARE_TYPE(PgfCId, typedef); extern GU_DECLARE_TYPE(PgfExn, abstract); +extern GU_DECLARE_TYPE(PgfParseError, abstract); /// @name PGF Grammar objects /// @{ @@ -50,7 +51,6 @@ typedef struct PgfConcr PgfConcr; */ #include -#include #include /// An enumeration of #PgfExpr elements. @@ -120,8 +120,16 @@ pgf_print_name(PgfConcr*, PgfCId id); void pgf_linearize(PgfConcr* concr, PgfExpr expr, GuOut* out, GuExn* err); +bool +pgf_parseval(PgfConcr* concr, PgfExpr expr, PgfCId cat, + double *precision, double *recall, double *exact); + PgfExprEnum* -pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, +pgf_generate_all(PgfPGF* pgf, PgfCId cat, GuPool* pool); + +PgfExprEnum* +pgf_parse(PgfConcr* concr, PgfCId cat, GuString sentence, + GuExn* err, GuPool* pool, GuPool* out_pool); typedef struct PgfMorphoCallback PgfMorphoCallback; @@ -132,10 +140,10 @@ struct PgfMorphoCallback { }; void -pgf_lookup_morpho(PgfConcr *concr, PgfLexer *lexer, +pgf_lookup_morpho(PgfConcr *concr, GuString sentence, PgfMorphoCallback* callback, GuExn* err); -typedef GuMapKeyValue PgfFullFormEntry; +typedef struct PgfFullFormEntry PgfFullFormEntry; GuEnum* pgf_fullform_lexicon(PgfConcr *concr, GuPool* pool); @@ -148,20 +156,14 @@ pgf_fullform_get_analyses(PgfFullFormEntry* entry, PgfMorphoCallback* callback, GuExn* err); PgfExprEnum* -pgf_parse_with_heuristics(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, - double heuristics, +pgf_parse_with_heuristics(PgfConcr* concr, PgfCId cat, + GuString sentence, double heuristics, + GuExn* err, GuPool* pool, GuPool* out_pool); GuEnum* -pgf_complete(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, - GuString prefix, GuPool* pool); - -bool -pgf_parseval(PgfConcr* concr, PgfExpr expr, PgfCId cat, - double *precision, double *recall, double *exact); - -PgfExprEnum* -pgf_generate_all(PgfPGF* pgf, PgfCId cat, GuPool* pool); +pgf_complete(PgfConcr* concr, PgfCId cat, GuString string, + GuString prefix, GuExn* err, GuPool* pool); /// @} diff --git a/src/runtime/c/pgf/printer.c b/src/runtime/c/pgf/printer.c index 1b2775b94..9ce74d495 100644 --- a/src/runtime/c/pgf/printer.c +++ b/src/runtime/c/pgf/printer.c @@ -169,21 +169,14 @@ pgf_print_cncfun(PgfCncFun *cncfun, PgfSequences* sequences, GuOut *out, GuExn *err) { gu_printf(out,err," F%d := (", cncfun->funid); - - size_t n_seqs = gu_seq_length(sequences); - + for (size_t i = 0; i < cncfun->n_lins; i++) { if (i > 0) gu_putc(',', out, err); - PgfSequence* seq = cncfun->lins[i]; - for (size_t seqid = 0; seqid < n_seqs; seqid++) { - if (gu_seq_data(gu_seq_get(sequences, PgfSequence*, seqid)) == gu_seq_data(seq)) { - gu_printf(out,err,"S%d", seqid); - break; - } - } + PgfSequence* seq = cncfun->lins[i]; + gu_printf(out,err,"S%d", (seq - ((PgfSequence*) gu_seq_data(sequences)))); } - + gu_puts(")", out, err); if (cncfun->absfun != NULL) { @@ -204,7 +197,7 @@ pgf_print_token(PgfToken tok, GuOut *out, GuExn *err) } static void -pgf_print_sequence(PgfSequence* seq, GuOut *out, GuExn *err); +pgf_print_symbols(PgfSymbols* syms, GuOut *out, GuExn *err); void pgf_print_symbol(PgfSymbol sym, GuOut *out, GuExn *err) @@ -224,11 +217,11 @@ pgf_print_symbol(PgfSymbol sym, GuOut *out, GuExn *err) PgfSymbolKP* skp = gu_variant_data(sym); gu_puts("pre {", out, err); - pgf_print_sequence(skp->default_form, out, err); + pgf_print_symbols(skp->default_form, out, err); for (size_t i = 0; i < skp->n_forms; i++) { gu_puts("; ", out, err); - pgf_print_sequence(skp->forms[i].form, out, err); + pgf_print_symbols(skp->forms[i].form, out, err); gu_puts(" / ", out, err); size_t n_prefixes = gu_seq_length(skp->forms[i].prefixes); @@ -269,13 +262,13 @@ pgf_print_symbol(PgfSymbol sym, GuOut *out, GuExn *err) } static void -pgf_print_sequence(PgfSequence* seq, GuOut *out, GuExn *err) +pgf_print_symbols(PgfSymbols* syms, GuOut *out, GuExn *err) { - int n_syms = gu_seq_length(seq); + int n_syms = gu_seq_length(syms); for (int i = 0; i < n_syms; i++) { if (i > 0) gu_putc(' ', out, err); - PgfSymbol sym = gu_seq_get(seq, PgfSymbol, i); + PgfSymbol sym = gu_seq_get(syms, PgfSymbol, i); pgf_print_symbol(sym, out, err); } } @@ -338,10 +331,9 @@ pgf_print_concrete(PgfCId cncname, PgfConcr* concr, gu_puts(" sequences\n", out, err); size_t n_seqs = gu_seq_length(concr->sequences); for (size_t i = 0; i < n_seqs; i++) { - PgfSequence* seq = gu_seq_get(concr->sequences, PgfSequence*, i); - gu_printf(out,err," S%d := ", i); - pgf_print_sequence(seq, out, err); + PgfSymbols* syms = gu_seq_index(concr->sequences, PgfSequence, i)->syms; + pgf_print_symbols(syms, out, err); gu_putc('\n', out, err); } diff --git a/src/runtime/c/pgf/reader.c b/src/runtime/c/pgf/reader.c index 9f08662e3..1e29c119c 100644 --- a/src/runtime/c/pgf/reader.c +++ b/src/runtime/c/pgf/reader.c @@ -606,13 +606,13 @@ pgf_read_printnames(PgfReader* rdr) return printnames; } -static PgfSequence* -pgf_read_sequence(PgfReader* rdr); +static PgfSymbols* +pgf_read_symbols(PgfReader* rdr); static void pgf_read_alternative(PgfReader* rdr, PgfAlternative* alt) { - alt->form = pgf_read_sequence(rdr); + alt->form = pgf_read_symbols(rdr); gu_return_on_exn(rdr->err,); size_t n_prefixes = pgf_read_len(rdr); @@ -692,7 +692,7 @@ pgf_read_symbol(PgfReader* rdr) break; } case PGF_SYMBOL_KP: { - PgfSequence* default_form = pgf_read_sequence(rdr); + PgfSymbols* default_form = pgf_read_symbols(rdr); gu_return_on_exn(rdr->err, gu_null_variant); size_t n_forms = pgf_read_len(rdr); @@ -732,21 +732,21 @@ pgf_read_symbol(PgfReader* rdr) return sym; } -static PgfSequence* -pgf_read_sequence(PgfReader* rdr) +static PgfSymbols* +pgf_read_symbols(PgfReader* rdr) { size_t len = pgf_read_len(rdr); gu_return_on_exn(rdr->err, NULL); - PgfSequence* seq = gu_new_seq(PgfSymbol, len, rdr->opool); + PgfSymbols* syms = gu_new_seq(PgfSymbol, len, rdr->opool); for (size_t i = 0; i < len; i++) { PgfSymbol sym = pgf_read_symbol(rdr); gu_return_on_exn(rdr->err, NULL); - gu_seq_set(seq, PgfSymbol, i, sym); + gu_seq_set(syms, PgfSymbol, i, sym); } - return seq; + return syms; } static PgfSequences* @@ -755,12 +755,14 @@ pgf_read_sequences(PgfReader* rdr) size_t len = pgf_read_len(rdr); gu_return_on_exn(rdr->err, NULL); - PgfSequences* seqs = gu_new_seq(PgfSequence*, len, rdr->opool); + PgfSequences* seqs = gu_new_seq(PgfSequence, len, rdr->opool); for (size_t i = 0; i < len; i++) { - PgfSequence* seq = pgf_read_sequence(rdr); + PgfSymbols* syms = pgf_read_symbols(rdr); gu_return_on_exn(rdr->err, NULL); - gu_seq_set(seqs, PgfSequence*, i, seq); + + gu_seq_index(seqs, PgfSequence, i)->syms = syms; + gu_seq_index(seqs, PgfSequence, i)->idx = NULL; } return seqs; @@ -793,7 +795,7 @@ pgf_read_cncfun(PgfReader* rdr, PgfAbstr* abstr, PgfConcr* concr, int funid) return NULL; } - cncfun->lins[i] = gu_seq_get(concr->sequences, PgfSequence*, seqid); + cncfun->lins[i] = gu_seq_index(concr->sequences, PgfSequence, seqid); } return cncfun; @@ -1149,8 +1151,6 @@ pgf_read_concrete(PgfReader* rdr, PgfAbstr* abstr) gu_new_int_map(PgfCCat*, &gu_null_struct, rdr->opool); concr->fun_indices = gu_map_type_new(PgfCncFunOverloadMap, rdr->opool); concr->coerce_idx = gu_map_type_new(PgfCncOverloadMap, rdr->opool); - concr->epsilon_idx = gu_map_type_new(PgfProductionIdx, rdr->opool); - concr->leftcorner_tok_idx = gu_map_type_new(PgfLeftcornerTokIdx,rdr->opool); pgf_read_lindefs(rdr, concr); pgf_read_ccats(rdr, concr); concr->cnccats = pgf_read_cnccats(rdr, abstr, concr); @@ -1200,7 +1200,7 @@ pgf_read_pgf(PgfReader* rdr) { pgf_read_abstract(rdr, &pgf->abstract); gu_return_on_exn(rdr->err, NULL); - + pgf->concretes = pgf_read_concretes(rdr, &pgf->abstract); gu_return_on_exn(rdr->err, NULL); diff --git a/src/runtime/c/utils/pgf-parse.c b/src/runtime/c/utils/pgf-parse.c index 2790e49de..cb5fc6c7e 100644 --- a/src/runtime/c/utils/pgf-parse.c +++ b/src/runtime/c/utils/pgf-parse.c @@ -110,12 +110,11 @@ int main(int argc, char* argv[]) { clock_t start = clock(); - GuIn *in = gu_string_in(line, ppool); - PgfLexer *lexer = pgf_new_simple_lexer(in, ppool); - GuEnum* result = pgf_parse_with_heuristics(concr, cat, lexer, heuristics, ppool, ppool); + GuExn* parse_err = gu_new_exn(NULL, gu_kind(type), ppool); + GuEnum* result = pgf_parse_with_heuristics(concr, cat, line, heuristics, parse_err, ppool, ppool); PgfExprProb* ep = NULL; - if (result != NULL) + if (gu_ok(parse_err)) ep = gu_next(result, PgfExprProb*, ppool); clock_t end = clock(); diff --git a/src/runtime/c/utils/pgf-translate.c b/src/runtime/c/utils/pgf-translate.c index aa98f1a7a..abd22267c 100644 --- a/src/runtime/c/utils/pgf-translate.c +++ b/src/runtime/c/utils/pgf-translate.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -153,23 +154,19 @@ int main(int argc, char* argv[]) { // sentence, so our memory usage doesn't increase over time. ppool = gu_new_pool(); - GuIn *in = - gu_string_in(line, ppool); - PgfLexer *lexer = - pgf_new_simple_lexer(in, ppool); - clock_t start = clock(); + GuExn* parse_err = gu_new_exn(NULL, gu_kind(type), ppool); result = - pgf_parse(from_concr, cat, lexer, ppool, ppool); - if (result == NULL) { - PgfToken tok = - pgf_lexer_current_token(lexer); - - if (*tok == 0) - gu_puts("Couldn't begin parsing", out, err); - else { + pgf_parse(from_concr, cat, line, parse_err, ppool, ppool); + if (!gu_ok(parse_err)) { + if (gu_exn_caught(parse_err) == gu_type(PgfExn)) { + GuString msg = gu_exn_caught_data(parse_err); + gu_string_write(msg, out, err); + gu_putc('\n', out, err); + } else if (gu_exn_caught(parse_err) == gu_type(PgfParseError)) { gu_puts("Unexpected token: \"", out, err); + GuString tok = gu_exn_caught_data(parse_err); gu_string_write(tok, out, err); gu_puts("\"\n", out, err); } diff --git a/src/runtime/java/jpgf.c b/src/runtime/java/jpgf.c index 6bc83ed9a..0367fb28e 100644 --- a/src/runtime/java/jpgf.c +++ b/src/runtime/java/jpgf.c @@ -1,6 +1,5 @@ #include #include -#include #include #include #include @@ -19,7 +18,7 @@ gu2j_string(JNIEnv *env, GuString s) { jchar* dst = utf16; while (s-utf8 < len) { GuUCS ucs = gu_utf8_decode((const uint8_t**) &s); - + if (ucs <= 0xFFFF) { *dst++ = ucs; } else { @@ -281,21 +280,21 @@ Java_org_grammaticalframework_pgf_Parser_parse GuString startCat = j2gu_string(env, jstartCat, pool); GuString s = j2gu_string(env, js, pool); - - GuIn* in = gu_string_in(s, pool); - PgfLexer *lexer = pgf_new_simple_lexer(in, pool); + GuExn* parse_err = gu_new_exn(NULL, gu_kind(type), pool); GuEnum* res = - pgf_parse(get_ref(env, concr), startCat, lexer, pool, out_pool); + pgf_parse(get_ref(env, concr), startCat, s, parse_err, pool, out_pool); - if (res == NULL) { - PgfToken tok = - pgf_lexer_current_token(lexer); - - if (*tok == 0) - throw_string_exception(env, "org/grammaticalframework/pgf/PGFError", "The sentence cannot be parsed"); - else - throw_jstring_exception(env, "org/grammaticalframework/pgf/ParseError", gu2j_string(env, tok)); + if (!gu_ok(parse_err)) { + if (gu_exn_caught(parse_err) == gu_type(PgfExn)) { + GuString msg = (GuString) gu_exn_caught_data(parse_err); + jstring jmsg = gu2j_string(env, msg); + throw_string_exception(env, "org/grammaticalframework/pgf/PGFError", jmsg); + } else if (gu_exn_caught(parse_err) == gu_type(PgfParseError)) { + GuString tok = (GuString) gu_exn_caught_data(parse_err); + jstring jtok = gu2j_string(env, tok); + throw_jstring_exception(env, "org/grammaticalframework/pgf/ParseError", jtok); + } gu_pool_free(pool); gu_pool_free(out_pool); diff --git a/src/runtime/python/pypgf.c b/src/runtime/python/pypgf.c index 056cdc924..d5d3e78b1 100644 --- a/src/runtime/python/pypgf.c +++ b/src/runtime/python/pypgf.c @@ -1046,48 +1046,6 @@ Concr_printName(ConcrObject* self, PyObject *args) return PyString_FromString(pgf_print_name(self->concr, name)); } -typedef struct { - PgfLexer base; - PyObject* pylexer; - GuPool* pool; -} PgfPythonLexer; - -GU_DEFINE_TYPE(PyPgfLexerExn, abstract, _); - -static PgfToken -pypgf_python_lexer_read_token(PgfLexer *base, GuExn* err) -{ - PgfPythonLexer* lexer = (PgfPythonLexer*) base; - lexer->base.tok = ""; - - PyObject* item = PyIter_Next(lexer->pylexer); - if (item == NULL) - if (PyErr_Occurred() != NULL) - gu_raise(err, PyPgfLexerExn); - else - gu_raise(err, GuEOF); - else { - const char* str = PyString_AsString(item); - if (str == NULL) - gu_raise(err, PyPgfLexerExn); - else - lexer->base.tok = gu_string_copy(str, lexer->pool); - } - - return lexer->base.tok; -} - -static PgfLexer* -pypgf_new_python_lexer(PyObject* pylexer, GuPool* pool) -{ - PgfPythonLexer* lexer = gu_new(PgfPythonLexer, pool); - lexer->base.read_token = pypgf_python_lexer_read_token; - lexer->base.tok = ""; - lexer->pylexer = pylexer; - lexer->pool = pool; - return ((PgfLexer*) lexer); -} - #if ( (PY_VERSION_HEX < 0x02070000) \ || ((PY_VERSION_HEX >= 0x03000000) \ && (PY_VERSION_HEX < 0x03010000)) ) @@ -1114,35 +1072,19 @@ void pypgf_container_descructor(PyObject *capsule) static IterObject* Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds) { - static char *kwlist[] = {"sentence", "tokens", "cat", "n", "heuristics", NULL}; + static char *kwlist[] = {"sentence", "cat", "n", "heuristics", NULL}; - int len; - const uint8_t *buf = NULL; - PyObject* py_lexer = NULL; + const char *sentence = NULL; PgfCId catname = pgf_start_cat(self->grammar->pgf); int max_count = -1; double heuristics = -1; - if (!PyArg_ParseTupleAndKeywords(args, keywds, "|s#Osid", kwlist, - &buf, &len, &py_lexer, &catname, &max_count, &heuristics)) + if (!PyArg_ParseTupleAndKeywords(args, keywds, "s|sid", kwlist, + &sentence, &catname, &max_count, &heuristics)) return NULL; - if ((buf == NULL && py_lexer == NULL) || - (buf != NULL && py_lexer != NULL)) { - PyErr_SetString(PyExc_TypeError, "either the sentence or the tokens argument must be provided"); - return NULL; - } - - if (py_lexer != NULL) { - // get an iterator out of the iterable object - py_lexer = PyObject_GetIter(py_lexer); - if (py_lexer == NULL) - return NULL; - } - IterObject* pyres = (IterObject*) pgf_IterType.tp_alloc(&pgf_IterType, 0); if (pyres == NULL) { - Py_XDECREF(py_lexer); return NULL; } @@ -1160,30 +1102,22 @@ Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds) pyres->counter = 0; pyres->fetch = Iter_fetch_expr; - PgfLexer *lexer = NULL; - if (buf != NULL) { - GuIn* in = gu_data_in(buf, len, pyres->pool); - lexer = pgf_new_simple_lexer(in, pyres->pool); - } - if (py_lexer != NULL) { - lexer = pypgf_new_python_lexer(py_lexer, pyres->pool); - } + GuExn* parse_err = gu_new_exn(NULL, gu_kind(type), pyres->pool); pyres->res = - pgf_parse_with_heuristics(self->concr, catname, lexer, - heuristics, pyres->pool, out_pool); + pgf_parse_with_heuristics(self->concr, catname, sentence, + heuristics, parse_err, + pyres->pool, out_pool); - if (pyres->res == NULL) { - PgfToken tok = - pgf_lexer_current_token(lexer); - - if (*tok == 0) - PyErr_SetString(PGFError, "The sentence cannot be parsed"); - else { + if (!gu_ok(parse_err)) { + if (gu_exn_caught(parse_err) == gu_type(PgfExn)) { + GuString msg = (GuString) gu_exn_caught_data(parse_err); + PyErr_SetString(PGFError, msg); + } else if (gu_exn_caught(parse_err) == gu_type(PgfParseError)) { + GuString tok = (GuString) gu_exn_caught_data(parse_err); PyObject* py_tok = PyString_FromString(tok); PyObject_SetAttrString(ParseError, "token", py_tok); - PyErr_Format(ParseError, "Unexpected token: \"%s\"", - PyString_AsString(py_tok)); + PyErr_Format(ParseError, "Unexpected token: \"%s\"", tok); Py_DECREF(py_tok); } @@ -1191,45 +1125,26 @@ Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds) pyres = NULL; } - Py_XDECREF(py_lexer); - return pyres; } static IterObject* Concr_complete(ConcrObject* self, PyObject *args, PyObject *keywds) { - static char *kwlist[] = {"sentence", "tokens", "cat", - "prefix", "n", NULL}; + static char *kwlist[] = {"sentence", "cat", "prefix", "n", NULL}; - int len; - const uint8_t *buf = NULL; - PyObject* py_lexer = NULL; + const char *sentence = NULL; GuString catname = pgf_start_cat(self->grammar->pgf); GuString prefix = ""; int max_count = -1; - if (!PyArg_ParseTupleAndKeywords(args, keywds, "|s#Ossi", kwlist, - &buf, &len, &py_lexer, &catname, + if (!PyArg_ParseTupleAndKeywords(args, keywds, "s|ssi", kwlist, + &sentence, &catname, &prefix, &max_count)) return NULL; - if ((buf == NULL && py_lexer == NULL) || - (buf != NULL && py_lexer != NULL)) { - PyErr_SetString(PyExc_TypeError, "either the sentence or the tokens argument must be provided"); - return NULL; - } - - if (py_lexer != NULL) { - // get an iterator out of the iterable object - py_lexer = PyObject_GetIter(py_lexer); - if (py_lexer == NULL) - return NULL; - } - IterObject* pyres = (IterObject*) pgf_IterType.tp_alloc(&pgf_IterType, 0); if (pyres == NULL) { - Py_XDECREF(py_lexer); return NULL; } @@ -1245,37 +1160,27 @@ Concr_complete(ConcrObject* self, PyObject *args, PyObject *keywds) GuPool *tmp_pool = gu_local_pool(); - PgfLexer *lexer = NULL; - if (buf != NULL) { - GuIn* in = gu_data_in(buf, len, tmp_pool); - lexer = pgf_new_simple_lexer(in, tmp_pool); - } - if (py_lexer != NULL) { - lexer = pypgf_new_python_lexer(py_lexer, tmp_pool); - } - + GuExn* parse_err = gu_new_exn(NULL, gu_kind(type), tmp_pool); + pyres->res = - pgf_complete(self->concr, catname, lexer, prefix, pyres->pool); + pgf_complete(self->concr, catname, sentence, prefix, parse_err, pyres->pool); - if (pyres->res == NULL) { + if (!gu_ok(parse_err)) { Py_DECREF(pyres); pyres = NULL; - PgfToken tok = - pgf_lexer_current_token(lexer); - - if (*tok == 0) - PyErr_SetString(PGFError, "The sentence cannot be parsed"); - else { + if (gu_exn_caught(parse_err) == gu_type(PgfExn)) { + GuString msg = (GuString) gu_exn_caught_data(parse_err); + PyErr_SetString(PGFError, msg); + } else if (gu_exn_caught(parse_err) == gu_type(PgfParseError)) { + GuString tok = (GuString) gu_exn_caught_data(parse_err); PyObject* py_tok = PyString_FromString(tok); PyObject_SetAttrString(ParseError, "token", py_tok); - PyErr_Format(ParseError, "Unexpected token: \"%s\"", - PyString_AsString(py_tok)); + PyErr_Format(ParseError, "Unexpected token: \"%s\"", tok); Py_DECREF(py_tok); } } - Py_XDECREF(py_lexer); gu_pool_free(tmp_pool); return pyres; @@ -1671,56 +1576,21 @@ pypgf_collect_morpho(PgfMorphoCallback* self, } static PyObject* -Concr_lookupMorpho(ConcrObject* self, PyObject *args, PyObject *keywds) { - static char *kwlist[] = {"sentence", "tokens", NULL}; - - int len; - const uint8_t *buf = NULL; - PyObject* py_lexer = NULL; - if (!PyArg_ParseTupleAndKeywords(args, keywds, "|s#O", kwlist, - &buf, &len, &py_lexer)) +Concr_lookupMorpho(ConcrObject* self, PyObject *args) { + GuString sent; + if (!PyArg_ParseTuple(args, "s", &sent)) return NULL; - if ((buf == NULL && py_lexer == NULL) || - (buf != NULL && py_lexer != NULL)) { - PyErr_SetString(PyExc_TypeError, "either the sentence or the tokens argument must be provided"); - return NULL; - } - - GuPool* tmp_pool = gu_local_pool(); - - PgfLexer *lexer = NULL; - if (buf != NULL) { - GuIn* in = gu_data_in(buf, len, tmp_pool); - lexer = pgf_new_simple_lexer(in, tmp_pool); - } - if (py_lexer != NULL) { - // get an iterator out of the iterable object - py_lexer = PyObject_GetIter(py_lexer); - if (py_lexer == NULL) { - gu_pool_free(tmp_pool); - return NULL; - } - - lexer = pypgf_new_python_lexer(py_lexer, tmp_pool); - } - - GuExn* err = gu_new_exn(NULL, gu_kind(type), tmp_pool); + GuPool *tmp_pool = gu_local_pool(); + GuExn* err = gu_new_exn(NULL, gu_kind(type), tmp_pool); PyObject* analyses = PyList_New(0); PyMorphoCallback callback = { { pypgf_collect_morpho }, analyses }; - pgf_lookup_morpho(self->concr, lexer, &callback.fn, err); - - Py_XDECREF(py_lexer); + pgf_lookup_morpho(self->concr, sent, &callback.fn, err); gu_pool_free(tmp_pool); - if (!gu_ok(err)) { - Py_DECREF(analyses); - return NULL; - } - return analyses; } @@ -1833,7 +1703,7 @@ static PyMethodDef Concr_methods[] = { {"graphvizParseTree", (PyCFunction)Concr_graphvizParseTree, METH_VARARGS, "Renders an abstract syntax tree as a parse tree in Graphviz format" }, - {"lookupMorpho", (PyCFunction)Concr_lookupMorpho, METH_VARARGS | METH_KEYWORDS, + {"lookupMorpho", (PyCFunction)Concr_lookupMorpho, METH_VARARGS, "Looks up a word in the lexicon of the grammar" }, {"fullFormLexicon", (PyCFunction)Concr_fullFormLexicon, METH_VARARGS,