mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-05-11 12:12:51 -06:00
Merge branch 'master' into c-runtime
This commit is contained in:
@@ -142,14 +142,14 @@ pgf_aligner_lzn_symbol_token(PgfLinFuncs** funcs, PgfToken tok)
|
||||
}
|
||||
|
||||
static void
|
||||
pgf_aligner_lzn_begin_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, size_t lindex, PgfCId fun)
|
||||
pgf_aligner_lzn_begin_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, GuString ann, PgfCId fun)
|
||||
{
|
||||
PgfAlignerLin* alin = gu_container(funcs, PgfAlignerLin, funcs);
|
||||
gu_buf_push(alin->parent_stack, int, fid);
|
||||
}
|
||||
|
||||
static void
|
||||
pgf_aligner_lzn_end_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, size_t lindex, PgfCId fun)
|
||||
pgf_aligner_lzn_end_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, GuString ann, PgfCId fun)
|
||||
{
|
||||
PgfAlignerLin* alin = gu_container(funcs, PgfAlignerLin, funcs);
|
||||
gu_buf_pop(alin->parent_stack, int);
|
||||
|
||||
@@ -322,7 +322,8 @@ typedef struct PgfProductionCoerce
|
||||
|
||||
typedef struct {
|
||||
PgfExprProb *ep;
|
||||
GuSeq* lins;
|
||||
size_t n_lins;
|
||||
PgfSymbols* lins[];
|
||||
} PgfProductionExtern;
|
||||
|
||||
typedef struct {
|
||||
|
||||
@@ -953,94 +953,6 @@ pgf_read_expr(GuIn* in, GuPool* pool, GuPool* tmp_pool, GuExn* err)
|
||||
return expr;
|
||||
}
|
||||
|
||||
PGF_API int
|
||||
pgf_read_expr_tuple(GuIn* in,
|
||||
size_t n_exprs, PgfExpr exprs[],
|
||||
GuPool* pool, GuExn* err)
|
||||
{
|
||||
GuPool* tmp_pool = gu_new_pool();
|
||||
PgfExprParser* parser =
|
||||
pgf_new_parser(in, pgf_expr_parser_in_getc, pool, tmp_pool, err);
|
||||
if (parser->token_tag != PGF_TOKEN_LTRIANGLE)
|
||||
goto fail;
|
||||
pgf_expr_parser_token(parser, false);
|
||||
for (size_t i = 0; i < n_exprs; i++) {
|
||||
if (i > 0) {
|
||||
if (parser->token_tag != PGF_TOKEN_COMMA)
|
||||
goto fail;
|
||||
pgf_expr_parser_token(parser, false);
|
||||
}
|
||||
|
||||
exprs[i] = pgf_expr_parser_expr(parser, false);
|
||||
if (gu_variant_is_null(exprs[i]))
|
||||
goto fail;
|
||||
}
|
||||
if (parser->token_tag != PGF_TOKEN_RTRIANGLE)
|
||||
goto fail;
|
||||
pgf_expr_parser_token(parser, false);
|
||||
if (parser->token_tag != PGF_TOKEN_EOF)
|
||||
goto fail;
|
||||
gu_pool_free(tmp_pool);
|
||||
|
||||
return 1;
|
||||
|
||||
fail:
|
||||
gu_pool_free(tmp_pool);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
 * Parse a matrix of expressions from `in`.
 *
 * The input is a PGF_TOKEN_LTRIANGLE token, zero or more rows separated
 * by PGF_TOKEN_SEMI tokens, a PGF_TOKEN_RTRIANGLE token and then
 * PGF_TOKEN_EOF. Each row holds exactly `n_exprs` expressions separated
 * by PGF_TOKEN_COMMA tokens. Rows are stored back to back in one flat
 * sequence of PgfExpr.
 *
 * @param in       input stream to read from
 * @param n_exprs  number of expressions per row
 * @param pool     pool for the parsed expressions and the result buffer
 * @param err      exception frame handed to the parser
 * @return the flat sequence of expressions, or NULL on a syntax error
 */
PGF_API GuSeq*
pgf_read_expr_matrix(GuIn* in,
                     size_t n_exprs,
                     GuPool* pool, GuExn* err)
{
	bool ok = false;
	GuBuf* rows = NULL;

	/* Scratch pool for the parser; released on every exit path. */
	GuPool* scratch = gu_new_pool();
	PgfExprParser* p =
		pgf_new_parser(in, pgf_expr_parser_in_getc, pool, scratch, err);

	if (p->token_tag != PGF_TOKEN_LTRIANGLE)
		goto done;
	pgf_expr_parser_token(p, false);

	rows = gu_new_buf(PgfExpr, pool);

	/* An immediately closed bracket yields an empty matrix. */
	if (p->token_tag != PGF_TOKEN_RTRIANGLE) {
		bool more_rows;
		do {
			PgfExpr* row = gu_buf_extend_n(rows, n_exprs);

			for (size_t col = 0; col < n_exprs; col++) {
				if (col != 0) {
					if (p->token_tag != PGF_TOKEN_COMMA)
						goto done;
					pgf_expr_parser_token(p, false);
				}

				row[col] = pgf_expr_parser_expr(p, false);
				if (gu_variant_is_null(row[col]))
					goto done;
			}

			/* A semicolon announces another row. */
			more_rows = (p->token_tag == PGF_TOKEN_SEMI);
			if (more_rows)
				pgf_expr_parser_token(p, false);
		} while (more_rows);

		if (p->token_tag != PGF_TOKEN_RTRIANGLE)
			goto done;
	}

	pgf_expr_parser_token(p, false);
	ok = (p->token_tag == PGF_TOKEN_EOF);

done:
	gu_pool_free(scratch);
	return ok ? gu_buf_data_seq(rows) : NULL;
}
|
||||
|
||||
PGF_API PgfType*
|
||||
pgf_read_type(GuIn* in, GuPool* pool, GuPool* tmp_pool, GuExn* err)
|
||||
{
|
||||
@@ -1758,19 +1670,6 @@ pgf_print_context(PgfHypos *hypos, PgfPrintContext* ctxt,
|
||||
}
|
||||
}
|
||||
|
||||
PGF_API void
|
||||
pgf_print_expr_tuple(size_t n_exprs, PgfExpr exprs[], PgfPrintContext* ctxt,
|
||||
GuOut* out, GuExn* err)
|
||||
{
|
||||
gu_putc('<', out, err);
|
||||
for (size_t i = 0; i < n_exprs; i++) {
|
||||
if (i > 0)
|
||||
gu_putc(',', out, err);
|
||||
pgf_print_expr(exprs[i], ctxt, 0, out, err);
|
||||
}
|
||||
gu_putc('>', out, err);
|
||||
}
|
||||
|
||||
PGF_API bool
|
||||
pgf_type_eq(PgfType* t1, PgfType* t2)
|
||||
{
|
||||
@@ -1806,6 +1705,168 @@ pgf_type_eq(PgfType* t1, PgfType* t2)
|
||||
return true;
|
||||
}
|
||||
|
||||
PGF_API PgfLiteral
|
||||
pgf_clone_literal(PgfLiteral lit, GuPool* pool)
|
||||
{
|
||||
PgfLiteral new_lit = gu_null_variant;
|
||||
|
||||
GuVariantInfo inf = gu_variant_open(lit);
|
||||
switch (inf.tag) {
|
||||
case PGF_LITERAL_STR: {
|
||||
PgfLiteralStr* lit_str = inf.data;
|
||||
PgfLiteralStr* new_lit_str =
|
||||
gu_new_flex_variant(PGF_LITERAL_STR,
|
||||
PgfLiteralStr,
|
||||
val, strlen(lit_str->val)+1,
|
||||
&new_lit, pool);
|
||||
strcpy(new_lit_str->val, lit_str->val);
|
||||
break;
|
||||
}
|
||||
case PGF_LITERAL_INT: {
|
||||
PgfLiteralInt *lit_int = inf.data;
|
||||
PgfLiteralInt *new_lit_int =
|
||||
gu_new_variant(PGF_LITERAL_INT,
|
||||
PgfLiteralInt,
|
||||
&new_lit, pool);
|
||||
new_lit_int->val = lit_int->val;
|
||||
break;
|
||||
}
|
||||
case PGF_LITERAL_FLT: {
|
||||
PgfLiteralFlt *lit_flt = inf.data;
|
||||
PgfLiteralFlt *new_lit_flt =
|
||||
gu_new_variant(PGF_LITERAL_FLT,
|
||||
PgfLiteralFlt,
|
||||
&new_lit, pool);
|
||||
new_lit_flt->val = lit_flt->val;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
gu_impossible();
|
||||
}
|
||||
|
||||
return new_lit;
|
||||
}
|
||||
|
||||
PGF_API PgfExpr
|
||||
pgf_clone_expr(PgfExpr expr, GuPool* pool)
|
||||
{
|
||||
PgfExpr new_expr = gu_null_variant;
|
||||
|
||||
GuVariantInfo inf = gu_variant_open(expr);
|
||||
switch (inf.tag) {
|
||||
case PGF_EXPR_ABS: {
|
||||
PgfExprAbs* abs = inf.data;
|
||||
PgfExprAbs* new_abs =
|
||||
gu_new_variant(PGF_EXPR_ABS,
|
||||
PgfExprAbs,
|
||||
&new_expr, pool);
|
||||
|
||||
new_abs->bind_type = abs->bind_type;
|
||||
new_abs->id = gu_string_copy(abs->id, pool);
|
||||
new_abs->body = pgf_clone_expr(abs->body,pool);
|
||||
break;
|
||||
}
|
||||
case PGF_EXPR_APP: {
|
||||
PgfExprApp* app = inf.data;
|
||||
PgfExprApp* new_app =
|
||||
gu_new_variant(PGF_EXPR_APP,
|
||||
PgfExprApp,
|
||||
&new_expr, pool);
|
||||
new_app->fun = pgf_clone_expr(app->fun, pool);
|
||||
new_app->arg = pgf_clone_expr(app->arg, pool);
|
||||
break;
|
||||
}
|
||||
case PGF_EXPR_LIT: {
|
||||
PgfExprLit* lit = inf.data;
|
||||
PgfExprLit* new_lit =
|
||||
gu_new_variant(PGF_EXPR_LIT,
|
||||
PgfExprLit,
|
||||
&new_expr, pool);
|
||||
new_lit->lit = pgf_clone_literal(lit->lit, pool);
|
||||
break;
|
||||
}
|
||||
case PGF_EXPR_META: {
|
||||
PgfExprMeta* meta = inf.data;
|
||||
PgfExprMeta* new_meta =
|
||||
gu_new_variant(PGF_EXPR_META,
|
||||
PgfExprMeta,
|
||||
&new_expr, pool);
|
||||
new_meta->id = meta->id;
|
||||
break;
|
||||
}
|
||||
case PGF_EXPR_FUN: {
|
||||
PgfExprFun* fun = inf.data;
|
||||
PgfExprFun* new_fun =
|
||||
gu_new_flex_variant(PGF_EXPR_FUN,
|
||||
PgfExprFun,
|
||||
fun, strlen(fun->fun)+1,
|
||||
&new_expr, pool);
|
||||
strcpy(new_fun->fun, fun->fun);
|
||||
break;
|
||||
}
|
||||
case PGF_EXPR_VAR: {
|
||||
PgfExprVar* var = inf.data;
|
||||
PgfExprVar* new_var =
|
||||
gu_new_variant(PGF_EXPR_VAR,
|
||||
PgfExprVar,
|
||||
&new_expr, pool);
|
||||
new_var->var = var->var;
|
||||
break;
|
||||
}
|
||||
case PGF_EXPR_TYPED: {
|
||||
PgfExprTyped* typed = inf.data;
|
||||
|
||||
PgfExprTyped *new_typed =
|
||||
gu_new_variant(PGF_EXPR_TYPED,
|
||||
PgfExprTyped,
|
||||
&new_expr, pool);
|
||||
new_typed->expr = pgf_clone_expr(typed->expr, pool);
|
||||
new_typed->type = pgf_clone_type(typed->type, pool);
|
||||
break;
|
||||
}
|
||||
case PGF_EXPR_IMPL_ARG: {
|
||||
PgfExprImplArg* impl = inf.data;
|
||||
PgfExprImplArg *new_impl =
|
||||
gu_new_variant(PGF_EXPR_IMPL_ARG,
|
||||
PgfExprImplArg,
|
||||
&new_expr, pool);
|
||||
new_impl->expr = pgf_clone_expr(impl->expr, pool);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
gu_impossible();
|
||||
}
|
||||
|
||||
return new_expr;
|
||||
}
|
||||
|
||||
/**
 * Recursively copy a type into `pool`.
 *
 * The copy has its own hypothesis sequence (each hypothesis with a
 * copied name and a recursively copied type), its own category name and
 * copies of all argument expressions.
 *
 * @param type  type to copy
 * @param pool  pool that owns the copy
 * @return the new type
 */
PGF_API PgfType*
pgf_clone_type(PgfType* type, GuPool* pool)
{
	size_t arg_count = type->n_exprs;
	PgfType* dup = gu_new_flex(pool, PgfType, exprs, arg_count);

	/* Hypotheses: one fresh slot per original, filled field by field. */
	size_t hypo_count = gu_seq_length(type->hypos);
	dup->hypos = gu_new_seq(PgfHypo, hypo_count, pool);
	for (size_t h = 0; h != hypo_count; h++) {
		PgfHypo* from = gu_seq_index(type->hypos, PgfHypo, h);
		PgfHypo* to   = gu_seq_index(dup->hypos, PgfHypo, h);

		to->bind_type = from->bind_type;
		to->cid       = gu_string_copy(from->cid, pool);
		to->type      = pgf_clone_type(from->type, pool);
	}

	dup->cid = gu_string_copy(type->cid, pool);

	/* Argument expressions live in the trailing flexible array. */
	dup->n_exprs = arg_count;
	for (size_t e = 0; e != arg_count; e++) {
		dup->exprs[e] = pgf_clone_expr(type->exprs[e], pool);
	}

	return dup;
}
|
||||
|
||||
PGF_API prob_t
|
||||
pgf_compute_tree_probability(PgfPGF *gr, PgfExpr expr)
|
||||
{
|
||||
|
||||
@@ -171,15 +171,6 @@ pgf_expr_unmeta(PgfExpr expr);
|
||||
PGF_API_DECL PgfExpr
|
||||
pgf_read_expr(GuIn* in, GuPool* pool, GuPool* tmp_pool, GuExn* err);
|
||||
|
||||
PGF_API_DECL int
|
||||
pgf_read_expr_tuple(GuIn* in,
|
||||
size_t n_exprs, PgfExpr exprs[],
|
||||
GuPool* pool, GuExn* err);
|
||||
|
||||
PGF_API_DECL GuSeq*
|
||||
pgf_read_expr_matrix(GuIn* in, size_t n_exprs,
|
||||
GuPool* pool, GuExn* err);
|
||||
|
||||
PGF_API_DECL PgfType*
|
||||
pgf_read_type(GuIn* in, GuPool* pool, GuPool* tmp_pool, GuExn* err);
|
||||
|
||||
@@ -239,9 +230,14 @@ PGF_API_DECL void
|
||||
pgf_print_context(PgfHypos *hypos, PgfPrintContext* ctxt,
|
||||
GuOut *out, GuExn *err);
|
||||
|
||||
PGF_API_DECL void
|
||||
pgf_print_expr_tuple(size_t n_exprs, PgfExpr exprs[], PgfPrintContext* ctxt,
|
||||
GuOut* out, GuExn* err);
|
||||
PGF_API PgfLiteral
|
||||
pgf_clone_literal(PgfLiteral lit, GuPool* pool);
|
||||
|
||||
PGF_API PgfExpr
|
||||
pgf_clone_expr(PgfExpr expr, GuPool* pool);
|
||||
|
||||
PGF_API PgfType*
|
||||
pgf_clone_type(PgfType* type, GuPool* pool);
|
||||
|
||||
PGF_API_DECL prob_t
|
||||
pgf_compute_tree_probability(PgfPGF *gr, PgfExpr expr);
|
||||
|
||||
@@ -155,7 +155,7 @@ pgf_bracket_lzn_symbol_token(PgfLinFuncs** funcs, PgfToken tok)
|
||||
}
|
||||
|
||||
static void
|
||||
pgf_bracket_lzn_begin_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, size_t lindex, PgfCId fun)
|
||||
pgf_bracket_lzn_begin_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, GuString ann, PgfCId fun)
|
||||
{
|
||||
PgfBracketLznState* state = gu_container(funcs, PgfBracketLznState, funcs);
|
||||
|
||||
@@ -192,7 +192,7 @@ pgf_bracket_lzn_begin_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, size_t li
|
||||
}
|
||||
|
||||
static void
|
||||
pgf_bracket_lzn_end_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, size_t lindex, PgfCId fun)
|
||||
pgf_bracket_lzn_end_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, GuString ann, PgfCId fun)
|
||||
{
|
||||
PgfBracketLznState* state = gu_container(funcs, PgfBracketLznState, funcs);
|
||||
|
||||
|
||||
@@ -628,7 +628,7 @@ typedef struct {
|
||||
PgfLzrCachedTag tag;
|
||||
PgfCId cat;
|
||||
int fid;
|
||||
int lin_idx;
|
||||
GuString ann;
|
||||
PgfCId fun;
|
||||
} PgfLzrCached;
|
||||
|
||||
@@ -666,7 +666,7 @@ pgf_lzr_cache_flush(PgfLzrCache* cache, PgfSymbols* form)
|
||||
cache->lzr->funcs,
|
||||
event->cat,
|
||||
event->fid,
|
||||
event->lin_idx,
|
||||
event->ann,
|
||||
event->fun);
|
||||
}
|
||||
break;
|
||||
@@ -676,7 +676,7 @@ pgf_lzr_cache_flush(PgfLzrCache* cache, PgfSymbols* form)
|
||||
cache->lzr->funcs,
|
||||
event->cat,
|
||||
event->fid,
|
||||
event->lin_idx,
|
||||
event->ann,
|
||||
event->fun);
|
||||
}
|
||||
break;
|
||||
@@ -731,27 +731,27 @@ found:
|
||||
}
|
||||
|
||||
static void
|
||||
pgf_lzr_cache_begin_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, size_t lin_idx, PgfCId fun)
|
||||
pgf_lzr_cache_begin_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, GuString ann, PgfCId fun)
|
||||
{
|
||||
PgfLzrCache* cache = gu_container(funcs, PgfLzrCache, funcs);
|
||||
PgfLzrCached* event = gu_buf_extend(cache->events);
|
||||
event->tag = PGF_CACHED_BEGIN;
|
||||
event->cat = cat;
|
||||
event->fid = fid;
|
||||
event->lin_idx = lin_idx;
|
||||
event->fun = fun;
|
||||
event->tag = PGF_CACHED_BEGIN;
|
||||
event->cat = cat;
|
||||
event->fid = fid;
|
||||
event->ann = ann;
|
||||
event->fun = fun;
|
||||
}
|
||||
|
||||
static void
|
||||
pgf_lzr_cache_end_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, size_t lin_idx, PgfCId fun)
|
||||
pgf_lzr_cache_end_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, GuString ann, PgfCId fun)
|
||||
{
|
||||
PgfLzrCache* cache = gu_container(funcs, PgfLzrCache, funcs);
|
||||
PgfLzrCached* event = gu_buf_extend(cache->events);
|
||||
event->tag = PGF_CACHED_END;
|
||||
event->cat = cat;
|
||||
event->fid = fid;
|
||||
event->lin_idx = lin_idx;
|
||||
event->fun = fun;
|
||||
event->tag = PGF_CACHED_END;
|
||||
event->cat = cat;
|
||||
event->fid = fid;
|
||||
event->ann = ann;
|
||||
event->fun = fun;
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -939,8 +939,8 @@ pgf_lzr_linearize_tree(PgfLzr* lzr, PgfCncTree ctree, size_t lin_idx)
|
||||
|
||||
if ((*lzr->funcs)->begin_phrase && fapp->ccat != NULL) {
|
||||
(*lzr->funcs)->begin_phrase(lzr->funcs,
|
||||
fun->absfun->type->cid,
|
||||
fapp->fid, lin_idx,
|
||||
fapp->ccat->cnccat->abscat->name,
|
||||
fapp->fid, fapp->ccat->cnccat->labels[lin_idx],
|
||||
fun->absfun->name);
|
||||
}
|
||||
|
||||
@@ -949,8 +949,8 @@ pgf_lzr_linearize_tree(PgfLzr* lzr, PgfCncTree ctree, size_t lin_idx)
|
||||
|
||||
if ((*lzr->funcs)->end_phrase && fapp->ccat != NULL) {
|
||||
(*lzr->funcs)->end_phrase(lzr->funcs,
|
||||
fun->absfun->type->cid,
|
||||
fapp->fid, lin_idx,
|
||||
fapp->ccat->cnccat->abscat->name,
|
||||
fapp->fid, fapp->ccat->cnccat->labels[lin_idx],
|
||||
fun->absfun->name);
|
||||
}
|
||||
break;
|
||||
@@ -979,7 +979,7 @@ pgf_lzr_linearize_tree(PgfLzr* lzr, PgfCncTree ctree, size_t lin_idx)
|
||||
|
||||
if ((*lzr->funcs)->begin_phrase) {
|
||||
(*lzr->funcs)->begin_phrase(lzr->funcs,
|
||||
cat, flit->fid, 0,
|
||||
cat, flit->fid, "s",
|
||||
"");
|
||||
}
|
||||
|
||||
@@ -1011,7 +1011,7 @@ pgf_lzr_linearize_tree(PgfLzr* lzr, PgfCncTree ctree, size_t lin_idx)
|
||||
|
||||
if ((*lzr->funcs)->end_phrase) {
|
||||
(*lzr->funcs)->end_phrase(lzr->funcs,
|
||||
cat, flit->fid, 0,
|
||||
cat, flit->fid, "s",
|
||||
"");
|
||||
}
|
||||
|
||||
|
||||
@@ -83,10 +83,10 @@ struct PgfLinFuncs
|
||||
void (*symbol_token)(PgfLinFuncs** self, PgfToken tok);
|
||||
|
||||
/// Begin phrase
|
||||
void (*begin_phrase)(PgfLinFuncs** self, PgfCId cat, int fid, size_t lindex, PgfCId fun);
|
||||
void (*begin_phrase)(PgfLinFuncs** self, PgfCId cat, int fid, GuString ann, PgfCId fun);
|
||||
|
||||
/// End phrase
|
||||
void (*end_phrase)(PgfLinFuncs** self, PgfCId cat, int fid, size_t lindex, PgfCId fun);
|
||||
void (*end_phrase)(PgfLinFuncs** self, PgfCId cat, int fid, GuString ann, PgfCId fun);
|
||||
|
||||
/// handling nonExist
|
||||
void (*symbol_ne)(PgfLinFuncs** self);
|
||||
|
||||
@@ -6,11 +6,12 @@
|
||||
|
||||
static PgfExprProb*
|
||||
pgf_match_string_lit(PgfLiteralCallback* self, PgfConcr* concr,
|
||||
size_t lin_idx,
|
||||
GuString ann,
|
||||
GuString sentence, size_t* poffset,
|
||||
GuPool *out_pool)
|
||||
{
|
||||
gu_assert(lin_idx == 0);
|
||||
if (strcmp(ann,"s") != 0)
|
||||
return NULL;
|
||||
|
||||
const uint8_t* buf = (uint8_t*) (sentence + *poffset);
|
||||
const uint8_t* p = buf;
|
||||
@@ -51,7 +52,7 @@ pgf_predict_empty_next(GuEnum* self, void* to, GuPool* pool)
|
||||
|
||||
static GuEnum*
|
||||
pgf_predict_empty(PgfLiteralCallback* self, PgfConcr* concr,
|
||||
size_t lin_idx,
|
||||
GuString ann,
|
||||
GuString prefix,
|
||||
GuPool *out_pool)
|
||||
{
|
||||
@@ -67,11 +68,12 @@ static PgfLiteralCallback pgf_string_literal_callback =
|
||||
|
||||
static PgfExprProb*
|
||||
pgf_match_int_lit(PgfLiteralCallback* self, PgfConcr* concr,
|
||||
size_t lin_idx,
|
||||
GuString ann,
|
||||
GuString sentence, size_t* poffset,
|
||||
GuPool *out_pool)
|
||||
{
|
||||
gu_assert(lin_idx == 0);
|
||||
if (strcmp(ann,"s") != 0)
|
||||
return NULL;
|
||||
|
||||
const uint8_t* buf = (uint8_t*) (sentence + *poffset);
|
||||
const uint8_t* p = buf;
|
||||
@@ -121,11 +123,12 @@ static PgfLiteralCallback pgf_int_literal_callback =
|
||||
|
||||
static PgfExprProb*
|
||||
pgf_match_float_lit(PgfLiteralCallback* self, PgfConcr* concr,
|
||||
size_t lin_idx,
|
||||
GuString ann,
|
||||
GuString sentence, size_t* poffset,
|
||||
GuPool *out_pool)
|
||||
{
|
||||
gu_assert(lin_idx == 0);
|
||||
if (strcmp(ann,"s") != 0)
|
||||
return NULL;
|
||||
|
||||
const uint8_t* buf = (uint8_t*) (sentence + *poffset);
|
||||
const uint8_t* p = buf;
|
||||
@@ -226,11 +229,11 @@ pgf_match_name_morpho_callback(PgfMorphoCallback* self_,
|
||||
|
||||
static PgfExprProb*
|
||||
pgf_match_name_lit(PgfLiteralCallback* self, PgfConcr* concr,
|
||||
size_t lin_idx,
|
||||
GuString ann,
|
||||
GuString sentence, size_t* poffset,
|
||||
GuPool *out_pool)
|
||||
{
|
||||
if (lin_idx != 0)
|
||||
if (strcmp(ann,"s") != 0)
|
||||
return NULL;
|
||||
|
||||
GuPool* tmp_pool = gu_local_pool();
|
||||
@@ -349,7 +352,7 @@ pgf_match_unknown_morpho_callback(PgfMorphoCallback* self_,
|
||||
|
||||
static PgfExprProb*
|
||||
pgf_match_unknown_lit(PgfLiteralCallback* self, PgfConcr* concr,
|
||||
size_t lin_idx,
|
||||
GuString ann,
|
||||
GuString sentence, size_t* poffset,
|
||||
GuPool *out_pool)
|
||||
{
|
||||
|
||||
@@ -869,7 +869,7 @@ pgf_lookup_symbol_token(PgfLinFuncs** self, PgfToken token)
|
||||
}
|
||||
|
||||
static void
|
||||
pgf_lookup_begin_phrase(PgfLinFuncs** self, PgfCId cat, int fid, size_t lindex, PgfCId funname)
|
||||
pgf_lookup_begin_phrase(PgfLinFuncs** self, PgfCId cat, int fid, GuString ann, PgfCId funname)
|
||||
{
|
||||
PgfLookupState* st = gu_container(self, PgfLookupState, funcs);
|
||||
|
||||
@@ -883,7 +883,7 @@ pgf_lookup_begin_phrase(PgfLinFuncs** self, PgfCId cat, int fid, size_t lindex,
|
||||
}
|
||||
|
||||
static void
|
||||
pgf_lookup_end_phrase(PgfLinFuncs** self, PgfCId cat, int fid, size_t lindex, PgfCId fun)
|
||||
pgf_lookup_end_phrase(PgfLinFuncs** self, PgfCId cat, int fid, GuString ann, PgfCId fun)
|
||||
{
|
||||
PgfLookupState* st = gu_container(self, PgfLookupState, funcs);
|
||||
st->curr_absfun = NULL;
|
||||
|
||||
@@ -61,6 +61,14 @@ typedef struct {
|
||||
|
||||
typedef enum { BIND_NONE, BIND_HARD, BIND_SOFT } BIND_TYPE;
|
||||
|
||||
/* One lexical match recorded in a parse state's lexicon index. */
typedef struct {
|
||||
/* Production index taken from the matching sequence (seq->idx). */
PgfProductionIdx* idx;
|
||||
/* Offset into the sentence associated with the match: the position
   reached by the comparison, or the state's start offset for the
   epsilon entry. */
size_t offset;
|
||||
/* Symbol index reported by pgf_symbols_cmp; 0 for the epsilon entry. */
size_t sym_idx;
|
||||
} PgfLexiconIdxEntry;
|
||||
|
||||
typedef GuBuf PgfLexiconIdx;
|
||||
|
||||
struct PgfParseState {
|
||||
PgfParseState* next;
|
||||
|
||||
@@ -74,6 +82,8 @@ struct PgfParseState {
|
||||
size_t end_offset;
|
||||
|
||||
prob_t viterbi_prob;
|
||||
|
||||
PgfLexiconIdx* lexicon_idx;
|
||||
};
|
||||
|
||||
typedef struct PgfAnswers {
|
||||
@@ -113,43 +123,10 @@ struct PgfItem {
|
||||
prob_t inside_prob;
|
||||
};
|
||||
|
||||
static PgfSymbol
|
||||
pgf_prev_extern_sym(PgfSymbol sym)
|
||||
{
|
||||
GuVariantInfo i = gu_variant_open(sym);
|
||||
switch (i.tag) {
|
||||
case PGF_SYMBOL_CAT:
|
||||
return *((PgfSymbol*) (((PgfSymbolCat*) i.data)+1));
|
||||
case PGF_SYMBOL_KP:
|
||||
return *((PgfSymbol*) (((PgfSymbolKP*) i.data)+1));
|
||||
case PGF_SYMBOL_KS: {
|
||||
PgfSymbolKS* sks = (PgfSymbolKS*) i.data;
|
||||
size_t tok_len = strlen(sks->token);
|
||||
return *((PgfSymbol*) (((uint8_t*) sks)+sizeof(PgfSymbolKS)+tok_len+1));
|
||||
}
|
||||
case PGF_SYMBOL_LIT:
|
||||
return *((PgfSymbol*) (((PgfSymbolLit*) i.data)+1));
|
||||
case PGF_SYMBOL_VAR:
|
||||
return *((PgfSymbol*) (((PgfSymbolVar*) i.data)+1));
|
||||
case PGF_SYMBOL_BIND:
|
||||
case PGF_SYMBOL_SOFT_BIND:
|
||||
case PGF_SYMBOL_SOFT_SPACE:
|
||||
return *((PgfSymbol*) (((PgfSymbolBIND*) i.data)+1));
|
||||
case PGF_SYMBOL_CAPIT:
|
||||
case PGF_SYMBOL_ALL_CAPIT:
|
||||
return *((PgfSymbol*) (((PgfSymbolCAPIT*) i.data)+1));
|
||||
case PGF_SYMBOL_NE:
|
||||
return *((PgfSymbol*) (((PgfSymbolNE*) i.data)+1));
|
||||
default:
|
||||
gu_impossible();
|
||||
return gu_null_variant;
|
||||
}
|
||||
}
|
||||
|
||||
static PgfSymbol
|
||||
static PgfSymbols*
|
||||
pgf_collect_extern_tok(PgfParsing* ps, size_t start_offset, size_t end_offset)
|
||||
{
|
||||
PgfSymbol sym = gu_null_variant;
|
||||
GuBuf* syms = gu_new_buf(PgfSymbol, ps->pool);
|
||||
|
||||
const uint8_t* start = (uint8_t*) ps->sentence+start_offset;
|
||||
const uint8_t* end = (uint8_t*) ps->sentence+end_offset;
|
||||
@@ -163,16 +140,15 @@ pgf_collect_extern_tok(PgfParsing* ps, size_t start_offset, size_t end_offset)
|
||||
ucs = gu_utf8_decode(&p);
|
||||
}
|
||||
|
||||
PgfSymbol new_sym;
|
||||
PgfSymbol sym;
|
||||
PgfSymbolKS* sks = (PgfSymbolKS*)
|
||||
gu_alloc_variant(PGF_SYMBOL_KS,
|
||||
sizeof(PgfSymbol)+sizeof(PgfSymbolKS)+len+1,
|
||||
gu_alignof(PgfSymbolKS),
|
||||
&new_sym, ps->pool);
|
||||
sizeof(PgfSymbolKS)+len+1,
|
||||
gu_alignof(PgfSymbolKS),
|
||||
&sym, ps->pool);
|
||||
memcpy((char*) sks->token, start, len);
|
||||
((char*) sks->token)[len] = 0;
|
||||
*((PgfSymbol*) (((uint8_t*) sks)+sizeof(PgfSymbolKS)+len+1)) = sym;
|
||||
sym = new_sym;
|
||||
gu_buf_push(syms, PgfSymbol, sym);
|
||||
|
||||
start = p;
|
||||
while (gu_ucs_is_space(ucs)) {
|
||||
@@ -181,68 +157,16 @@ pgf_collect_extern_tok(PgfParsing* ps, size_t start_offset, size_t end_offset)
|
||||
}
|
||||
}
|
||||
|
||||
return sym;
|
||||
}
|
||||
|
||||
static size_t
|
||||
pgf_item_symbols_length(PgfItem* item)
|
||||
{
|
||||
GuVariantInfo i = gu_variant_open(item->prod);
|
||||
switch (i.tag) {
|
||||
case PGF_PRODUCTION_APPLY: {
|
||||
PgfProductionApply* papp = i.data;
|
||||
return gu_seq_length(papp->fun->lins[item->conts->lin_idx]->syms);
|
||||
}
|
||||
case PGF_PRODUCTION_COERCE: {
|
||||
return 1;
|
||||
}
|
||||
case PGF_PRODUCTION_EXTERN: {
|
||||
PgfProductionExtern* pext = i.data;
|
||||
PgfSymbols* syms;
|
||||
|
||||
if (pext->lins != NULL &&
|
||||
(syms = gu_seq_get(pext->lins,PgfSymbols*,item->conts->lin_idx)) != NULL) {
|
||||
return gu_seq_length(syms);
|
||||
} else {
|
||||
int seq_len = 0;
|
||||
PgfSymbol sym = item->curr_sym;
|
||||
while (!gu_variant_is_null(sym)) {
|
||||
seq_len++;
|
||||
sym = pgf_prev_extern_sym(sym);
|
||||
}
|
||||
|
||||
return seq_len;
|
||||
}
|
||||
}
|
||||
default:
|
||||
gu_impossible();
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static PgfSymbols*
|
||||
pgf_extern_syms_get(PgfItem* item, GuPool* pool)
|
||||
{
|
||||
int syms_len = pgf_item_symbols_length(item);
|
||||
|
||||
PgfSymbols* syms =
|
||||
gu_new_seq(PgfSymbol, syms_len, pool);
|
||||
PgfSymbol sym = item->curr_sym;
|
||||
while (!gu_variant_is_null(sym)) {
|
||||
gu_seq_set(syms, PgfSymbol, --syms_len, sym);
|
||||
sym = pgf_prev_extern_sym(sym);
|
||||
}
|
||||
|
||||
return syms;
|
||||
return gu_buf_data_seq(syms);
|
||||
}
|
||||
|
||||
#ifdef PGF_PARSER_DEBUG
|
||||
PGF_INTERNAL void
|
||||
pgf_print_fid(int fid, GuOut* out, GuExn* err);
|
||||
|
||||
PGF_INTERNAL_DECL void
|
||||
pgf_print_symbol(PgfSymbol sym, GuOut *out, GuExn *err);
|
||||
|
||||
#ifdef PGF_PARSER_DEBUG
|
||||
static void
|
||||
pgf_item_symbols(PgfItem* item,
|
||||
size_t* lin_idx, PgfSymbols** syms,
|
||||
@@ -267,11 +191,7 @@ pgf_item_symbols(PgfItem* item,
|
||||
}
|
||||
case PGF_PRODUCTION_EXTERN: {
|
||||
PgfProductionExtern* pext = i.data;
|
||||
|
||||
if (pext->lins == NULL ||
|
||||
(*syms = gu_seq_get(pext->lins, PgfSymbols*, item->conts->lin_idx)) == NULL) {
|
||||
*syms = pgf_extern_syms_get(item, pool);
|
||||
}
|
||||
*syms = pext->lins[item->conts->lin_idx];
|
||||
break;
|
||||
}
|
||||
default:
|
||||
@@ -603,16 +523,11 @@ pgf_item_set_curr_symbol(PgfItem* item, GuPool* pool)
|
||||
case PGF_PRODUCTION_EXTERN: {
|
||||
PgfProductionExtern* pext = i.data;
|
||||
|
||||
PgfSymbols* syms;
|
||||
if (pext->lins != NULL &&
|
||||
(syms = gu_seq_get(pext->lins,PgfSymbols*,item->conts->lin_idx)) != NULL) {
|
||||
if (item->sym_idx == gu_seq_length(syms)) {
|
||||
item->curr_sym = gu_null_variant;
|
||||
} else {
|
||||
item->curr_sym = gu_seq_get(syms, PgfSymbol, item->sym_idx);
|
||||
}
|
||||
} else {
|
||||
PgfSymbols* syms = pext->lins[item->conts->lin_idx];
|
||||
if (item->sym_idx == gu_seq_length(syms)) {
|
||||
item->curr_sym = gu_null_variant;
|
||||
} else {
|
||||
item->curr_sym = gu_seq_get(syms, PgfSymbol, item->sym_idx);
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -781,16 +696,6 @@ pgf_result_production(PgfParsing* ps,
|
||||
static void
|
||||
pgf_parsing_complete(PgfParsing* ps, PgfItem* item, PgfExprProb *ep);
|
||||
|
||||
static void
|
||||
pgf_parsing_push_item(PgfParseState* state, PgfItem* item)
|
||||
{
|
||||
if (gu_buf_length(state->agenda) == 0) {
|
||||
state->viterbi_prob =
|
||||
item->inside_prob+item->conts->outside_prob;
|
||||
}
|
||||
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
|
||||
}
|
||||
|
||||
static void
|
||||
pgf_parsing_push_production(PgfParsing* ps, PgfParseState* state,
|
||||
PgfItemConts* conts, PgfProduction prod)
|
||||
@@ -822,7 +727,7 @@ pgf_parsing_combine(PgfParsing* ps,
|
||||
}
|
||||
|
||||
pgf_item_advance(item, ps->pool);
|
||||
pgf_parsing_push_item(before, item);
|
||||
gu_buf_heap_push(before->agenda, pgf_item_prob_order, &item);
|
||||
}
|
||||
|
||||
static PgfProduction
|
||||
@@ -851,36 +756,7 @@ pgf_parsing_new_production(PgfItem* item, PgfExprProb *ep, GuPool *pool)
|
||||
break;
|
||||
}
|
||||
case PGF_PRODUCTION_EXTERN: {
|
||||
PgfProductionExtern* pext = i.data;
|
||||
|
||||
if (pext->lins == NULL ||
|
||||
gu_seq_get(pext->lins,PgfSymbols*,item->conts->lin_idx) == NULL) {
|
||||
PgfSymbols* syms =
|
||||
pgf_extern_syms_get(item, pool);
|
||||
|
||||
size_t n_lins = item->conts->ccat->cnccat->n_lins;
|
||||
|
||||
PgfProductionExtern* new_pext = (PgfProductionExtern*)
|
||||
gu_new_variant(PGF_PRODUCTION_EXTERN,
|
||||
PgfProductionExtern,
|
||||
&prod, pool);
|
||||
new_pext->ep = ep;
|
||||
new_pext->lins = gu_new_seq(PgfSymbols*, n_lins, pool);
|
||||
|
||||
if (pext->lins == NULL) {
|
||||
for (size_t i = 0; i < n_lins; i++) {
|
||||
gu_seq_set(new_pext->lins,PgfSymbols*,i,NULL);
|
||||
}
|
||||
} else {
|
||||
for (size_t i = 0; i < n_lins; i++) {
|
||||
gu_seq_set(new_pext->lins,PgfSymbols*,i,
|
||||
gu_seq_get(pext->lins,PgfSymbols*,i));
|
||||
}
|
||||
}
|
||||
gu_seq_set(new_pext->lins,PgfSymbols*,item->conts->lin_idx,syms);
|
||||
} else {
|
||||
prod = item->prod;
|
||||
}
|
||||
prod = item->prod;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
@@ -1022,9 +898,65 @@ pgf_parsing_complete(PgfParsing* ps, PgfItem* item, PgfExprProb *ep)
|
||||
}
|
||||
}
|
||||
|
||||
PGF_INTERNAL_DECL int
|
||||
pgf_symbols_cmp(PgfCohortSpot* spot,
|
||||
PgfSymbols* syms, size_t* sym_idx,
|
||||
bool case_sensitive);
|
||||
|
||||
static void
|
||||
pgf_parsing_lookahead(PgfParsing *ps, PgfParseState* state,
|
||||
int i, int j, ptrdiff_t min, ptrdiff_t max)
|
||||
{
|
||||
// This is a variation of a binary search algorithm which
|
||||
// can retrieve all prefixes of a string with minimal
|
||||
// comparisons, i.e. there is no need to lookup every
|
||||
// prefix separately.
|
||||
|
||||
while (i <= j) {
|
||||
int k = (i+j) / 2;
|
||||
PgfSequence* seq = gu_seq_index(ps->concr->sequences, PgfSequence, k);
|
||||
|
||||
PgfCohortSpot start = {0, ps->sentence + state->end_offset};
|
||||
PgfCohortSpot current = start;
|
||||
size_t sym_idx = 0;
|
||||
int cmp = pgf_symbols_cmp(¤t, seq->syms, &sym_idx, ps->case_sensitive);
|
||||
if (cmp < 0) {
|
||||
j = k-1;
|
||||
} else if (cmp > 0) {
|
||||
ptrdiff_t len = current.ptr - start.ptr;
|
||||
|
||||
if (min <= len)
|
||||
pgf_parsing_lookahead(ps, state, i, k-1, min, len);
|
||||
|
||||
if (len+1 <= max)
|
||||
pgf_parsing_lookahead(ps, state, k+1, j, len+1, max);
|
||||
|
||||
break;
|
||||
} else {
|
||||
ptrdiff_t len = current.ptr - start.ptr;
|
||||
|
||||
if (min <= len-1)
|
||||
pgf_parsing_lookahead(ps, state, i, k-1, min, len-1);
|
||||
|
||||
if (seq->idx != NULL) {
|
||||
PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx);
|
||||
entry->idx = seq->idx;
|
||||
entry->offset = (size_t) (current.ptr - ps->sentence);
|
||||
entry->sym_idx = sym_idx;
|
||||
}
|
||||
|
||||
if (len+1 <= max)
|
||||
pgf_parsing_lookahead(ps, state, k+1, j, len+1, max);
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static PgfParseState*
|
||||
pgf_new_parse_state(PgfParsing* ps, size_t start_offset,
|
||||
BIND_TYPE bind_type)
|
||||
BIND_TYPE bind_type,
|
||||
prob_t viterbi_prob)
|
||||
{
|
||||
PgfParseState** pstate;
|
||||
if (ps->before == NULL && start_offset == 0)
|
||||
@@ -1077,172 +1009,36 @@ pgf_new_parse_state(PgfParsing* ps, size_t start_offset,
|
||||
(start_offset == end_offset);
|
||||
state->start_offset = start_offset;
|
||||
state->end_offset = end_offset;
|
||||
state->viterbi_prob = 0;
|
||||
state->viterbi_prob = viterbi_prob;
|
||||
state->lexicon_idx =
|
||||
gu_new_buf(PgfLexiconIdxEntry, ps->pool);
|
||||
|
||||
if (ps->before == NULL && start_offset == 0)
|
||||
state->needs_bind = false;
|
||||
|
||||
if (gu_seq_length(ps->concr->sequences) > 0) {
|
||||
// Add epsilon lexical rules to the bottom up index
|
||||
PgfSequence* seq = gu_seq_index(ps->concr->sequences, PgfSequence, 0);
|
||||
if (gu_seq_length(seq->syms) == 0 && seq->idx != NULL) {
|
||||
PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx);
|
||||
entry->idx = seq->idx;
|
||||
entry->offset = state->start_offset;
|
||||
entry->sym_idx= 0;
|
||||
}
|
||||
|
||||
// Add non-epsilon lexical rules to the bottom up index
|
||||
if (!state->needs_bind) {
|
||||
pgf_parsing_lookahead(ps, state,
|
||||
0, gu_seq_length(ps->concr->sequences)-1,
|
||||
1, strlen(ps->sentence)-state->end_offset);
|
||||
}
|
||||
}
|
||||
|
||||
*pstate = state;
|
||||
|
||||
return state;
|
||||
}
|
||||
|
||||
PGF_INTERNAL_DECL int
|
||||
pgf_symbols_cmp(PgfCohortSpot* spot,
|
||||
PgfSymbols* syms, size_t* sym_idx,
|
||||
bool case_sensitive);
|
||||
|
||||
static bool
|
||||
pgf_parsing_scan_helper(PgfParsing *ps, PgfParseState* state,
|
||||
int i, int j, ptrdiff_t min, ptrdiff_t max)
|
||||
{
|
||||
// This is a variation of a binary search algorithm which
|
||||
// can retrieve all prefixes of a string with minimal
|
||||
// comparisons, i.e. there is no need to lookup every
|
||||
// prefix separately.
|
||||
|
||||
bool found = false;
|
||||
while (i <= j) {
|
||||
int k = (i+j) / 2;
|
||||
PgfSequence* seq = gu_seq_index(ps->concr->sequences, PgfSequence, k);
|
||||
|
||||
PgfCohortSpot start = {0, ps->sentence+state->end_offset};
|
||||
PgfCohortSpot current = start;
|
||||
|
||||
size_t sym_idx = 0;
|
||||
int cmp = pgf_symbols_cmp(¤t, seq->syms, &sym_idx, ps->case_sensitive);
|
||||
if (cmp < 0) {
|
||||
j = k-1;
|
||||
} else if (cmp > 0) {
|
||||
ptrdiff_t len = current.ptr - start.ptr;
|
||||
|
||||
if (min <= len)
|
||||
if (pgf_parsing_scan_helper(ps, state, i, k-1, min, len))
|
||||
found = true;
|
||||
|
||||
if (len+1 <= max)
|
||||
if (pgf_parsing_scan_helper(ps, state, k+1, j, len+1, max))
|
||||
found = true;
|
||||
|
||||
break;
|
||||
} else {
|
||||
ptrdiff_t len = current.ptr - start.ptr;
|
||||
|
||||
if (min <= len)
|
||||
if (pgf_parsing_scan_helper(ps, state, i, k-1, min, len))
|
||||
found = true;
|
||||
|
||||
// Here we do bottom-up prediction for all lexical categories.
|
||||
// The epsilon productions will be predicted in top-down
|
||||
// fashion while parsing.
|
||||
if (seq->idx != NULL && len > 0) {
|
||||
found = true;
|
||||
|
||||
// A new state will mark the end of the current match
|
||||
PgfParseState* new_state =
|
||||
pgf_new_parse_state(ps, (size_t) (current.ptr - ps->sentence), BIND_NONE);
|
||||
|
||||
// Bottom-up prediction for lexical rules
|
||||
size_t n_entries = gu_buf_length(seq->idx);
|
||||
for (size_t i = 0; i < n_entries; i++) {
|
||||
PgfProductionIdxEntry* entry =
|
||||
gu_buf_index(seq->idx, PgfProductionIdxEntry, i);
|
||||
|
||||
PgfItemConts* conts =
|
||||
pgf_parsing_get_conts(state,
|
||||
entry->ccat, entry->lin_idx,
|
||||
ps->pool);
|
||||
|
||||
// Create the new category if it doesn't exist yet
|
||||
PgfCCat* tmp_ccat = pgf_parsing_get_completed(new_state, conts);
|
||||
PgfCCat* ccat = tmp_ccat;
|
||||
if (ccat == NULL) {
|
||||
ccat = pgf_parsing_create_completed(ps, new_state, conts, INFINITY);
|
||||
}
|
||||
|
||||
// Add the production
|
||||
if (ccat->prods == NULL || ccat->n_synprods >= gu_seq_length(ccat->prods)) {
|
||||
ccat->prods = gu_realloc_seq(ccat->prods, PgfProduction, ccat->n_synprods+1);
|
||||
}
|
||||
GuVariantInfo i;
|
||||
i.tag = PGF_PRODUCTION_APPLY;
|
||||
i.data = entry->papp;
|
||||
PgfProduction prod = gu_variant_close(i);
|
||||
gu_seq_set(ccat->prods, PgfProduction, ccat->n_synprods++, prod);
|
||||
|
||||
// Update the category's probability to be minimum
|
||||
if (ccat->viterbi_prob > entry->papp->fun->ep->prob)
|
||||
ccat->viterbi_prob = entry->papp->fun->ep->prob;
|
||||
|
||||
#ifdef PGF_PARSER_DEBUG
|
||||
GuPool* tmp_pool = gu_new_pool();
|
||||
GuOut* out = gu_file_out(stderr, tmp_pool);
|
||||
GuExn* err = gu_exn(tmp_pool);
|
||||
if (tmp_ccat == NULL) {
|
||||
gu_printf(out, err, "[");
|
||||
pgf_print_range(state, new_state, out, err);
|
||||
gu_puts("; ", out, err);
|
||||
pgf_print_fid(conts->ccat->fid, out, err);
|
||||
gu_printf(out, err, "; %d; ",
|
||||
conts->lin_idx);
|
||||
pgf_print_fid(ccat->fid, out, err);
|
||||
gu_puts("] ", out, err);
|
||||
pgf_print_fid(ccat->fid, out, err);
|
||||
gu_printf(out, err, ".chunk_count=%d\n", ccat->chunk_count);
|
||||
}
|
||||
pgf_print_production(ccat->fid, prod, out, err);
|
||||
gu_pool_free(tmp_pool);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
if (len <= max)
|
||||
if (pgf_parsing_scan_helper(ps, state, k+1, j, len, max))
|
||||
found = true;
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return found;
|
||||
}
|
||||
|
||||
static void
|
||||
pgf_parsing_scan(PgfParsing *ps)
|
||||
{
|
||||
size_t len = strlen(ps->sentence);
|
||||
|
||||
PgfParseState* state =
|
||||
pgf_new_parse_state(ps, 0, BIND_SOFT);
|
||||
|
||||
while (state != NULL && state->end_offset < len) {
|
||||
if (state->needs_bind) {
|
||||
// We have encountered two tokens without space in between.
|
||||
// Those can be accepted only if there is a BIND token
|
||||
// in between. We encode this by having one more state
|
||||
// at the same offset. A transition between these two
|
||||
// states is possible only with the BIND token.
|
||||
state =
|
||||
pgf_new_parse_state(ps, state->end_offset, BIND_HARD);
|
||||
}
|
||||
|
||||
if (!pgf_parsing_scan_helper
|
||||
(ps, state,
|
||||
0, gu_seq_length(ps->concr->sequences)-1,
|
||||
1, len-state->end_offset)) {
|
||||
// skip one character and try again
|
||||
GuString s = ps->sentence+state->end_offset;
|
||||
gu_utf8_decode((const uint8_t**) &s);
|
||||
pgf_new_parse_state(ps, s-ps->sentence, BIND_NONE);
|
||||
}
|
||||
|
||||
if (state == ps->before)
|
||||
state = ps->after;
|
||||
else
|
||||
state = state->next;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item)
|
||||
{
|
||||
@@ -1262,8 +1058,9 @@ pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item)
|
||||
if (!ps->before->needs_bind && cmp_string(&current, tok, ps->case_sensitive) == 0) {
|
||||
PgfParseState* state =
|
||||
pgf_new_parse_state(ps, (current.ptr - ps->sentence),
|
||||
BIND_NONE);
|
||||
pgf_parsing_push_item(state, item);
|
||||
BIND_NONE,
|
||||
item->inside_prob+item->conts->outside_prob);
|
||||
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
|
||||
} else {
|
||||
pgf_item_free(ps, item);
|
||||
}
|
||||
@@ -1273,17 +1070,18 @@ pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item)
|
||||
static void
|
||||
pgf_parsing_predict_lexeme(PgfParsing* ps, PgfItemConts* conts,
|
||||
PgfProductionIdxEntry* entry,
|
||||
size_t offset)
|
||||
size_t offset, size_t sym_idx)
|
||||
{
|
||||
GuVariantInfo i = { PGF_PRODUCTION_APPLY, entry->papp };
|
||||
PgfProduction prod = gu_variant_close(i);
|
||||
PgfItem* item =
|
||||
pgf_new_item(ps, conts, prod);
|
||||
PgfSymbols* syms = entry->papp->fun->lins[conts->lin_idx]->syms;
|
||||
item->sym_idx = gu_seq_length(syms);
|
||||
item->sym_idx = sym_idx;
|
||||
pgf_item_set_curr_symbol(item, ps->pool);
|
||||
prob_t prob = item->inside_prob+item->conts->outside_prob;
|
||||
PgfParseState* state =
|
||||
pgf_new_parse_state(ps, offset, BIND_NONE);
|
||||
pgf_new_parse_state(ps, offset, BIND_NONE, prob);
|
||||
if (state->viterbi_prob > prob) {
|
||||
state->viterbi_prob = prob;
|
||||
}
|
||||
@@ -1337,36 +1135,34 @@ pgf_parsing_td_predict(PgfParsing* ps,
|
||||
pgf_parsing_push_production(ps, ps->before, conts, prod);
|
||||
}
|
||||
|
||||
// Top-down prediction for epsilon lexical rules if any
|
||||
PgfSequence* seq = gu_seq_index(ps->concr->sequences, PgfSequence, 0);
|
||||
if (gu_seq_length(seq->syms) == 0 && seq->idx != NULL) {
|
||||
// Bottom-up prediction for lexical and epsilon rules
|
||||
size_t n_idcs = gu_buf_length(ps->before->lexicon_idx);
|
||||
for (size_t i = 0; i < n_idcs; i++) {
|
||||
PgfLexiconIdxEntry* lentry =
|
||||
gu_buf_index(ps->before->lexicon_idx, PgfLexiconIdxEntry, i);
|
||||
|
||||
PgfProductionIdxEntry key;
|
||||
key.ccat = ccat;
|
||||
key.lin_idx = lin_idx;
|
||||
key.papp = NULL;
|
||||
PgfProductionIdxEntry* value =
|
||||
gu_seq_binsearch(gu_buf_data_seq(seq->idx),
|
||||
gu_seq_binsearch(gu_buf_data_seq(lentry->idx),
|
||||
pgf_production_idx_entry_order,
|
||||
PgfProductionIdxEntry, &key);
|
||||
|
||||
if (value != NULL) {
|
||||
GuVariantInfo i = { PGF_PRODUCTION_APPLY, value->papp };
|
||||
PgfProduction prod = gu_variant_close(i);
|
||||
pgf_parsing_push_production(ps, ps->before, conts, prod);
|
||||
pgf_parsing_predict_lexeme(ps, conts, value, lentry->offset, lentry->sym_idx);
|
||||
|
||||
PgfProductionIdxEntry* start =
|
||||
gu_buf_data(seq->idx);
|
||||
gu_buf_data(lentry->idx);
|
||||
PgfProductionIdxEntry* end =
|
||||
start + gu_buf_length(seq->idx)-1;
|
||||
start + gu_buf_length(lentry->idx)-1;
|
||||
|
||||
PgfProductionIdxEntry* left = value-1;
|
||||
while (left >= start &&
|
||||
value->ccat->fid == left->ccat->fid &&
|
||||
value->lin_idx == left->lin_idx) {
|
||||
GuVariantInfo i = { PGF_PRODUCTION_APPLY, left->papp };
|
||||
PgfProduction prod = gu_variant_close(i);
|
||||
pgf_parsing_push_production(ps, ps->before, conts, prod);
|
||||
pgf_parsing_predict_lexeme(ps, conts, left, lentry->offset, lentry->sym_idx);
|
||||
left--;
|
||||
}
|
||||
|
||||
@@ -1374,9 +1170,7 @@ pgf_parsing_td_predict(PgfParsing* ps,
|
||||
while (right <= end &&
|
||||
value->ccat->fid == right->ccat->fid &&
|
||||
value->lin_idx == right->lin_idx) {
|
||||
GuVariantInfo i = { PGF_PRODUCTION_APPLY, right->papp };
|
||||
PgfProduction prod = gu_variant_close(i);
|
||||
pgf_parsing_push_production(ps, ps->before, conts, prod);
|
||||
pgf_parsing_predict_lexeme(ps, conts, right, lentry->offset, lentry->sym_idx);
|
||||
right++;
|
||||
}
|
||||
}
|
||||
@@ -1415,7 +1209,7 @@ pgf_parsing_pre(PgfParsing* ps, PgfItem* item, PgfSymbols* syms)
|
||||
} else {
|
||||
item->alt = 0;
|
||||
pgf_item_advance(item, ps->pool);
|
||||
pgf_parsing_push_item(ps->before, item);
|
||||
gu_buf_heap_push(ps->before->agenda, pgf_item_prob_order, &item);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1514,28 +1308,40 @@ pgf_parsing_symbol(PgfParsing* ps, PgfItem* item, PgfSymbol sym)
|
||||
|
||||
if (callback != NULL) {
|
||||
ep = callback->match(callback, ps->concr,
|
||||
slit->r,
|
||||
parg->ccat->cnccat->labels[slit->r],
|
||||
ps->sentence, &offset,
|
||||
ps->out_pool);
|
||||
}
|
||||
}
|
||||
|
||||
if (ep != NULL) {
|
||||
PgfSymbols* syms =
|
||||
pgf_collect_extern_tok(ps, start, offset);
|
||||
|
||||
size_t n_lins = conts->ccat->cnccat->n_lins;
|
||||
|
||||
PgfProduction prod;
|
||||
PgfProductionExtern* pext =
|
||||
gu_new_variant(PGF_PRODUCTION_EXTERN,
|
||||
PgfProductionExtern,
|
||||
&prod, ps->pool);
|
||||
pext->ep = ep;
|
||||
pext->lins = NULL;
|
||||
gu_new_flex_variant(PGF_PRODUCTION_EXTERN,
|
||||
PgfProductionExtern,
|
||||
lins, n_lins,
|
||||
&prod, ps->pool);
|
||||
pext->ep = ep;
|
||||
pext->n_lins = n_lins;
|
||||
|
||||
for (size_t i = 0; i < n_lins; i++) {
|
||||
pext->lins[i] = NULL;
|
||||
}
|
||||
pext->lins[conts->lin_idx] = syms;
|
||||
|
||||
PgfItem* item =
|
||||
pgf_new_item(ps, conts, prod);
|
||||
item->curr_sym = pgf_collect_extern_tok(ps,start,offset);
|
||||
item->sym_idx = pgf_item_symbols_length(item);
|
||||
item->curr_sym = gu_null_variant;
|
||||
item->sym_idx = gu_seq_length(syms);
|
||||
PgfParseState* state =
|
||||
pgf_new_parse_state(ps, offset, BIND_NONE);
|
||||
pgf_parsing_push_item(state, item);
|
||||
pgf_new_parse_state(ps, offset, BIND_NONE,
|
||||
item->inside_prob+item->conts->outside_prob);
|
||||
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
|
||||
match = true;
|
||||
}
|
||||
}
|
||||
@@ -1578,10 +1384,11 @@ pgf_parsing_symbol(PgfParsing* ps, PgfItem* item, PgfSymbol sym)
|
||||
if (ps->before->start_offset == ps->before->end_offset &&
|
||||
ps->before->needs_bind) {
|
||||
PgfParseState* state =
|
||||
pgf_new_parse_state(ps, ps->before->end_offset, BIND_HARD);
|
||||
pgf_new_parse_state(ps, ps->before->end_offset, BIND_HARD,
|
||||
item->inside_prob+item->conts->outside_prob);
|
||||
if (state != NULL) {
|
||||
pgf_item_advance(item, ps->pool);
|
||||
pgf_parsing_push_item(state, item);
|
||||
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
|
||||
} else {
|
||||
pgf_item_free(ps, item);
|
||||
}
|
||||
@@ -1595,10 +1402,11 @@ pgf_parsing_symbol(PgfParsing* ps, PgfItem* item, PgfSymbol sym)
|
||||
if (ps->before->start_offset == ps->before->end_offset) {
|
||||
if (ps->before->needs_bind) {
|
||||
PgfParseState* state =
|
||||
pgf_new_parse_state(ps, ps->before->end_offset, BIND_HARD);
|
||||
pgf_new_parse_state(ps, ps->before->end_offset, BIND_HARD,
|
||||
item->inside_prob+item->conts->outside_prob);
|
||||
if (state != NULL) {
|
||||
pgf_item_advance(item, ps->pool);
|
||||
pgf_parsing_push_item(state, item);
|
||||
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
|
||||
} else {
|
||||
pgf_item_free(ps, item);
|
||||
}
|
||||
@@ -1607,12 +1415,13 @@ pgf_parsing_symbol(PgfParsing* ps, PgfItem* item, PgfSymbol sym)
|
||||
}
|
||||
} else {
|
||||
pgf_item_advance(item, ps->pool);
|
||||
pgf_parsing_push_item(ps->before, item);
|
||||
gu_buf_heap_push(ps->before->agenda, pgf_item_prob_order, &item);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case PGF_SYMBOL_CAPIT:
|
||||
case PGF_SYMBOL_ALL_CAPIT: {
|
||||
printf("PGF_SYMBOL_CAPIT\n");
|
||||
pgf_item_advance(item, ps->pool);
|
||||
pgf_parsing_symbol(ps, item, item->curr_sym);
|
||||
break;
|
||||
@@ -1857,7 +1666,8 @@ pgf_parsing_init(PgfConcr* concr, PgfCId cat,
|
||||
ps->heuristic_factor = heuristic_factor;
|
||||
}
|
||||
|
||||
pgf_parsing_scan(ps);
|
||||
PgfParseState* state =
|
||||
pgf_new_parse_state(ps, 0, BIND_SOFT, 0);
|
||||
|
||||
int fidString = -1;
|
||||
PgfCCat* start_ccat = gu_new(PgfCCat, ps->pool);
|
||||
@@ -1879,7 +1689,7 @@ pgf_parsing_init(PgfConcr* concr, PgfCId cat,
|
||||
#endif
|
||||
|
||||
PgfItemConts* conts =
|
||||
pgf_parsing_get_conts(ps->before, start_ccat, 0, ps->pool);
|
||||
pgf_parsing_get_conts(state, start_ccat, 0, ps->pool);
|
||||
gu_buf_push(conts->items, PgfItem*, NULL);
|
||||
|
||||
size_t n_ccats = gu_seq_length(cnccat->cats);
|
||||
@@ -2218,6 +2028,8 @@ pgf_process_generated_cat(PgfParsing* ps,
|
||||
children[i] = pcoerce->coerce;
|
||||
break;
|
||||
}
|
||||
case PGF_PRODUCTION_EXTERN:
|
||||
just_coercions = false;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2363,6 +2175,104 @@ pgf_parse_with_heuristics(PgfConcr* concr, PgfType* typ, GuString sentence,
|
||||
return &ps->en;
|
||||
}
|
||||
|
||||
/*
 * Parse `sentence` with the concrete syntax `concr`, starting from the
 * category of `typ`, and return the parsing context (the chart) instead of
 * an enumeration of expressions.
 *
 * Parsing proceeds until ps->expr_queue holds at least `n_roots` entries
 * or pgf_parsing_proceed reports that it cannot continue.
 *
 * Parameters:
 *   concr      - the concrete syntax; must have its sequences and
 *                concrete categories loaded.
 *   typ        - the start type; only typ->cid is used here.
 *   sentence   - the input string.
 *   heuristics - heuristic factor passed on to pgf_parsing_init.
 *   callbacks  - literal callbacks passed on to pgf_parsing_init.
 *   n_roots    - number of results to wait for before returning.
 *   err        - exception frame; a PgfExn is raised when the concrete
 *                syntax is not loaded.
 *   pool,
 *   out_pool   - memory pools passed on to pgf_parsing_init.
 *
 * Returns the parsing context, or NULL if the concrete syntax is not
 * loaded or pgf_parsing_init fails.
 */
PGF_API PgfParsing*
pgf_parse_to_chart(PgfConcr* concr, PgfType* typ, GuString sentence,
                   double heuristics,
                   PgfCallbacksMap* callbacks,
                   size_t n_roots,
                   GuExn* err,
                   GuPool* pool, GuPool* out_pool)
{
	if (concr->sequences == NULL ||
	    concr->cnccats == NULL) {
		GuExnData* err_data = gu_raise(err, PgfExn);
		if (err_data) {
			err_data->data = "The concrete syntax is not loaded";
		}
		// Bail out whether or not the exception frame accepted a
		// message: continuing would dereference the missing tables.
		return NULL;
	}

	// Begin parsing a sentence with the specified category
	PgfParsing* ps =
		pgf_parsing_init(concr, typ->cid, sentence, heuristics, callbacks, NULL, err, pool, out_pool);
	if (ps == NULL) {
		return NULL;
	}

#ifdef PGF_COUNTS_DEBUG
	pgf_parsing_print_counts(ps);
#endif

	while (gu_buf_length(ps->expr_queue) < n_roots) {
		if (!pgf_parsing_proceed(ps)) {
			break;
		}

#ifdef PGF_COUNTS_DEBUG
		pgf_parsing_print_counts(ps);
#endif
	}

	return ps;
}
|
||||
|
||||
/*
 * Collect the distinct categories referenced by the entries of
 * ps->expr_queue (via ->answers->ccat), in order of first appearance.
 *
 * The result is allocated from `pool` with room for one slot per queue
 * entry; its length is then trimmed to the number of distinct categories.
 */
PGF_API PgfCCats*
pgf_get_parse_roots(PgfParsing* ps, GuPool* pool)
{
	size_t n_answers = gu_buf_length(ps->expr_queue);
	GuSeq* roots = gu_new_seq(PgfCCat*, n_answers, pool);
	size_t n_unique = 0;

	for (size_t i = 0; i < n_answers; i++) {
		PgfExprState* answer = gu_buf_get(ps->expr_queue, PgfExprState*, i);
		PgfCCat* candidate = answer->answers->ccat;

		// Linear scan over the roots gathered so far.
		size_t j = 0;
		while (j < n_unique && gu_seq_get(roots, PgfCCat*, j) != candidate) {
			j++;
		}

		if (j == n_unique) {
			// Not seen before: append it.
			gu_seq_set(roots, PgfCCat*, n_unique, candidate);
			n_unique++;
		}
	}

	roots->len = n_unique;
	return roots;
}
|
||||
|
||||
/*
 * Follow the chain ccat -> ccat->conts->ccat -> ... and, for every link
 * that covers a non-empty stretch of the input, record a PgfParseRange
 * (start offset, end offset, linearization field label).
 *
 * The search for the state in which a category was completed starts at
 * ps->before and is NOT restarted for each link: it resumes from the state
 * where the previous link was found.
 *
 * Returns the collected ranges as a sequence allocated from `pool`.
 */
PGF_API GuSeq*
pgf_ccat_to_range(PgfParsing* ps, PgfCCat* ccat, GuPool* pool)
{
	GuBuf* ranges = gu_new_buf(PgfParseRange, pool);
	PgfParseState* state = ps->before;

	for (PgfItemConts* conts = ccat->conts; conts != NULL; conts = ccat->conts) {
		size_t start = conts->state->end_offset;
		size_t end   = start;

		for (; state != NULL; state = state->next) {
			if (pgf_parsing_get_completed(state, conts) != ccat) {
				continue;
			}

			if (state->start_offset >= start) {
				end = state->start_offset;
			}
			break;
		}

		if (start != end) {
			PgfParseRange* range = gu_buf_extend(ranges);
			range->start = start;
			range->end   = end;
			range->field = ccat->cnccat->labels[conts->lin_idx];
		}

		// Move one step up the chain of continuations.
		ccat = conts->ccat;
	}

	return gu_buf_data_seq(ranges);
}
|
||||
|
||||
PGF_API PgfExprEnum*
|
||||
pgf_parse_with_oracle(PgfConcr* concr, PgfType* typ,
|
||||
GuString sentence,
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
typedef struct {
|
||||
int start, end;
|
||||
PgfCId cat;
|
||||
size_t lin_idx;
|
||||
GuString ann;
|
||||
} PgfPhrase;
|
||||
|
||||
typedef struct {
|
||||
@@ -46,14 +46,14 @@ pgf_metrics_lzn_symbol_token(PgfLinFuncs** funcs, PgfToken tok)
|
||||
}
|
||||
|
||||
static void
|
||||
pgf_metrics_lzn_begin_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, size_t lin_index, PgfCId fun)
|
||||
pgf_metrics_lzn_begin_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, GuString ann, PgfCId fun)
|
||||
{
|
||||
PgfMetricsLznState* state = gu_container(funcs, PgfMetricsLznState, funcs);
|
||||
gu_buf_push(state->marks, int, state->pos);
|
||||
}
|
||||
|
||||
static void
|
||||
pgf_metrics_lzn_end_phrase1(PgfLinFuncs** funcs, PgfCId cat, int fid, size_t lin_idx, PgfCId fun)
|
||||
pgf_metrics_lzn_end_phrase1(PgfLinFuncs** funcs, PgfCId cat, int fid, GuString ann, PgfCId fun)
|
||||
{
|
||||
PgfMetricsLznState* state = gu_container(funcs, PgfMetricsLznState, funcs);
|
||||
|
||||
@@ -65,7 +65,7 @@ pgf_metrics_lzn_end_phrase1(PgfLinFuncs** funcs, PgfCId cat, int fid, size_t lin
|
||||
phrase->start = start;
|
||||
phrase->end = end;
|
||||
phrase->cat = cat;
|
||||
phrase->lin_idx = lin_idx;
|
||||
phrase->ann = ann;
|
||||
gu_buf_push(state->phrases, PgfPhrase*, phrase);
|
||||
}
|
||||
}
|
||||
@@ -85,7 +85,7 @@ pgf_metrics_symbol_bind(PgfLinFuncs** funcs)
|
||||
}
|
||||
|
||||
static void
|
||||
pgf_metrics_lzn_end_phrase2(PgfLinFuncs** funcs, PgfCId cat, int fid, size_t lin_idx, PgfCId fun)
|
||||
pgf_metrics_lzn_end_phrase2(PgfLinFuncs** funcs, PgfCId cat, int fid, GuString ann, PgfCId fun)
|
||||
{
|
||||
PgfMetricsLznState* state = gu_container(funcs, PgfMetricsLznState, funcs);
|
||||
|
||||
@@ -100,7 +100,7 @@ pgf_metrics_lzn_end_phrase2(PgfLinFuncs** funcs, PgfCId cat, int fid, size_t lin
|
||||
if (phrase->start == start &&
|
||||
phrase->end == end &&
|
||||
strcmp(phrase->cat, cat) == 0 &&
|
||||
phrase->lin_idx == lin_idx) {
|
||||
strcmp(phrase->ann, ann) == 0) {
|
||||
state->matches++;
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -220,6 +220,20 @@ pgf_category_prob(PgfPGF* pgf, PgfCId catname)
|
||||
return abscat->prob;
|
||||
}
|
||||
|
||||
/*
 * Return the labels of the linearization fields of category `catname` in
 * the concrete syntax `concr`.
 *
 * Parameters:
 *   concr   - the concrete syntax whose category table is consulted.
 *   catname - the name of the category to look up.
 *   n_lins  - out parameter; receives the number of labels, or 0 when the
 *             category is not found.
 *
 * Returns a pointer to the first label, or NULL when `catname` has no
 * entry in concr->cnccats. The array belongs to the category record; it
 * is not copied.
 */
PGF_API GuString*
pgf_category_fields(PgfConcr* concr, PgfCId catname, size_t *n_lins)
{
	PgfCncCat* cnccat =
		gu_map_get(concr->cnccats, catname, PgfCncCat*);
	if (!cnccat) {
		*n_lins = 0;
		return NULL;
	}

	*n_lins = cnccat->n_lins;
	// Return the label array itself (it decays to a pointer to its first
	// element); taking its address would give a pointer of the wrong type.
	return cnccat->labels;
}
|
||||
|
||||
PGF_API GuString
|
||||
pgf_language_code(PgfConcr* concr)
|
||||
{
|
||||
|
||||
@@ -90,6 +90,9 @@ pgf_category_context(PgfPGF *gr, PgfCId catname);
|
||||
PGF_API_DECL prob_t
|
||||
pgf_category_prob(PgfPGF* pgf, PgfCId catname);
|
||||
|
||||
PGF_API GuString*
|
||||
pgf_category_fields(PgfConcr* concr, PgfCId catname, size_t *n_lins);
|
||||
|
||||
PGF_API_DECL void
|
||||
pgf_iter_functions(PgfPGF* pgf, GuMapItor* itor, GuExn* err);
|
||||
|
||||
@@ -163,8 +166,8 @@ pgf_lookup_morpho(PgfConcr *concr, GuString sentence,
|
||||
PgfMorphoCallback* callback, GuExn* err);
|
||||
|
||||
typedef struct {
|
||||
size_t pos;
|
||||
GuString ptr;
|
||||
size_t pos; // position in Unicode characters
|
||||
GuString ptr; // pointer into the string
|
||||
} PgfCohortSpot;
|
||||
|
||||
typedef struct {
|
||||
@@ -203,6 +206,12 @@ pgf_parse_with_heuristics(PgfConcr* concr, PgfType* typ,
|
||||
GuExn* err,
|
||||
GuPool* pool, GuPool* out_pool);
|
||||
|
||||
typedef struct {
|
||||
size_t start;
|
||||
size_t end;
|
||||
GuString field;
|
||||
} PgfParseRange;
|
||||
|
||||
typedef struct PgfOracleCallback PgfOracleCallback;
|
||||
|
||||
struct PgfOracleCallback {
|
||||
@@ -243,11 +252,11 @@ typedef struct PgfLiteralCallback PgfLiteralCallback;
|
||||
|
||||
struct PgfLiteralCallback {
|
||||
PgfExprProb* (*match)(PgfLiteralCallback* self, PgfConcr* concr,
|
||||
size_t lin_idx,
|
||||
GuString ann,
|
||||
GuString sentence, size_t* poffset,
|
||||
GuPool *out_pool);
|
||||
GuEnum* (*predict)(PgfLiteralCallback* self, PgfConcr* concr,
|
||||
size_t lin_idx,
|
||||
GuString ann,
|
||||
GuString prefix,
|
||||
GuPool *out_pool);
|
||||
};
|
||||
|
||||
@@ -114,7 +114,7 @@ pgf_morpho_iter(PgfProductionIdx* idx,
|
||||
|
||||
PgfCId lemma = entry->papp->fun->absfun->name;
|
||||
GuString analysis = entry->ccat->cnccat->labels[entry->lin_idx];
|
||||
|
||||
|
||||
prob_t prob = entry->ccat->cnccat->abscat->prob +
|
||||
entry->papp->fun->absfun->ep.prob;
|
||||
callback->callback(callback,
|
||||
@@ -234,12 +234,13 @@ typedef struct {
|
||||
GuEnum en;
|
||||
PgfConcr* concr;
|
||||
GuString sentence;
|
||||
GuString current;
|
||||
size_t len;
|
||||
PgfMorphoCallback* callback;
|
||||
GuExn* err;
|
||||
bool case_sensitive;
|
||||
GuBuf* spots;
|
||||
GuBuf* skip_spots;
|
||||
GuBuf* empty_buf;
|
||||
GuBuf* found;
|
||||
} PgfCohortsState;
|
||||
|
||||
@@ -255,6 +256,23 @@ cmp_cohort_spot(GuOrder* self, const void* a, const void* b)
|
||||
static GuOrder
|
||||
pgf_cohort_spot_order[1] = {{ cmp_cohort_spot }};
|
||||
|
||||
static void
|
||||
pgf_lookup_cohorts_report_skip(PgfCohortsState *state,
|
||||
PgfCohortSpot* spot)
|
||||
{
|
||||
size_t n_spots = gu_buf_length(state->skip_spots);
|
||||
for (size_t i = 0; i < n_spots; i++) {
|
||||
PgfCohortSpot* skip_spot =
|
||||
gu_buf_index(state->skip_spots, PgfCohortSpot, i);
|
||||
|
||||
PgfCohortRange* range = gu_buf_insert(state->found, 0);
|
||||
range->start = *skip_spot;
|
||||
range->end = *spot;
|
||||
range->buf = state->empty_buf;
|
||||
}
|
||||
gu_buf_flush(state->skip_spots);
|
||||
}
|
||||
|
||||
static void
|
||||
pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot,
|
||||
int i, int j, ptrdiff_t min, ptrdiff_t max)
|
||||
@@ -291,18 +309,23 @@ pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot,
|
||||
pgf_lookup_cohorts_helper(state, spot, i, k-1, min, len);
|
||||
|
||||
if (seq->idx != NULL && gu_buf_length(seq->idx) > 0) {
|
||||
// Report unknown words
|
||||
pgf_lookup_cohorts_report_skip(state, spot);
|
||||
|
||||
// Report the actual hit
|
||||
PgfCohortRange* range = gu_buf_insert(state->found, 0);
|
||||
range->start = *spot;
|
||||
range->end = current;
|
||||
range->buf = seq->idx;
|
||||
}
|
||||
|
||||
while (*current.ptr != 0) {
|
||||
if (!skip_space(&current.ptr, &current.pos))
|
||||
break;
|
||||
}
|
||||
// Schedule the next search spot
|
||||
while (*current.ptr != 0) {
|
||||
if (!skip_space(&current.ptr, &current.pos))
|
||||
break;
|
||||
}
|
||||
|
||||
gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &current);
|
||||
gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &current);
|
||||
}
|
||||
|
||||
if (len <= max)
|
||||
pgf_lookup_cohorts_helper(state, spot, k+1, j, len, max);
|
||||
@@ -318,29 +341,67 @@ pgf_lookup_cohorts_enum_next(GuEnum* self, void* to, GuPool* pool)
|
||||
PgfCohortsState* state = gu_container(self, PgfCohortsState, en);
|
||||
|
||||
while (gu_buf_length(state->found) == 0 &&
|
||||
gu_buf_length(state->spots) > 0) {
|
||||
gu_buf_length(state->spots) > 0) {
|
||||
PgfCohortSpot spot;
|
||||
gu_buf_heap_pop(state->spots, pgf_cohort_spot_order, &spot);
|
||||
|
||||
if (spot.ptr == state->current)
|
||||
continue;
|
||||
GuString next_ptr = state->sentence+state->len;
|
||||
while (gu_buf_length(state->spots) > 0) {
|
||||
GuString ptr =
|
||||
gu_buf_index(state->spots, PgfCohortSpot, 0)->ptr;
|
||||
if (ptr > spot.ptr) {
|
||||
next_ptr = ptr;
|
||||
break;
|
||||
}
|
||||
gu_buf_heap_pop(state->spots, pgf_cohort_spot_order, &spot);
|
||||
}
|
||||
|
||||
if (*spot.ptr == 0)
|
||||
break;
|
||||
bool needs_report = true;
|
||||
while (next_ptr > spot.ptr) {
|
||||
pgf_lookup_cohorts_helper
|
||||
(state, &spot,
|
||||
0, gu_seq_length(state->concr->sequences)-1,
|
||||
1, (state->sentence+state->len)-spot.ptr);
|
||||
|
||||
pgf_lookup_cohorts_helper
|
||||
(state, &spot,
|
||||
0, gu_seq_length(state->concr->sequences)-1,
|
||||
1, (state->sentence+state->len)-spot.ptr);
|
||||
|
||||
if (gu_buf_length(state->found) == 0) {
|
||||
// skip one character and try again
|
||||
gu_utf8_decode((const uint8_t**) &spot.ptr);
|
||||
spot.pos++;
|
||||
gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &spot);
|
||||
// got a hit -> exit
|
||||
if (gu_buf_length(state->found) > 0)
|
||||
break;
|
||||
|
||||
if (needs_report) {
|
||||
// no hit, but the word must be reported as unknown.
|
||||
gu_buf_push(state->skip_spots, PgfCohortSpot, spot);
|
||||
needs_report = false;
|
||||
}
|
||||
|
||||
// skip one character
|
||||
const uint8_t* ptr = (const uint8_t*) spot.ptr;
|
||||
GuUCS c = gu_utf8_decode(&ptr);
|
||||
if (gu_ucs_is_space(c)) {
|
||||
// We have encountered a space and we must report
|
||||
// a new unknown word.
|
||||
pgf_lookup_cohorts_report_skip(state, &spot);
|
||||
|
||||
spot.ptr = (GuString) ptr;
|
||||
spot.pos++;
|
||||
|
||||
// Schedule the next search spot
|
||||
while (*spot.ptr != 0) {
|
||||
if (!skip_space(&spot.ptr, &spot.pos))
|
||||
break;
|
||||
}
|
||||
|
||||
gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &spot);
|
||||
break;
|
||||
} else {
|
||||
spot.ptr = (GuString) ptr;
|
||||
spot.pos++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
PgfCohortSpot end_spot = {state->len, state->sentence+state->len};
|
||||
pgf_lookup_cohorts_report_skip(state, &end_spot);
|
||||
|
||||
PgfCohortRange* pRes = (PgfCohortRange*)to;
|
||||
|
||||
if (gu_buf_length(state->found) == 0) {
|
||||
@@ -349,15 +410,19 @@ pgf_lookup_cohorts_enum_next(GuEnum* self, void* to, GuPool* pool)
|
||||
pRes->end.pos = 0;
|
||||
pRes->end.ptr = NULL;
|
||||
pRes->buf = NULL;
|
||||
state->current = NULL;
|
||||
return;
|
||||
} else do {
|
||||
} else for (;;) {
|
||||
*pRes = gu_buf_pop(state->found, PgfCohortRange);
|
||||
state->current = pRes->start.ptr;
|
||||
pgf_morpho_iter(pRes->buf, state->callback, state->err);
|
||||
} while (gu_buf_length(state->found) > 0 &&
|
||||
gu_buf_index_last(state->found, PgfCohortRange)->end.ptr == pRes->end.ptr);
|
||||
|
||||
|
||||
if (gu_buf_length(state->found) <= 0)
|
||||
break;
|
||||
|
||||
PgfCohortRange* last =
|
||||
gu_buf_index_last(state->found, PgfCohortRange);
|
||||
if (last->start.ptr != pRes->start.ptr ||
|
||||
last->end.ptr != pRes->end.ptr)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
PGF_API GuEnum*
|
||||
@@ -374,15 +439,17 @@ pgf_lookup_cohorts(PgfConcr *concr, GuString sentence,
|
||||
}
|
||||
|
||||
PgfCohortsState* state = gu_new(PgfCohortsState, pool);
|
||||
state->en.next = pgf_lookup_cohorts_enum_next;
|
||||
state->concr = concr;
|
||||
state->sentence= sentence;
|
||||
state->len = strlen(sentence);
|
||||
state->callback= callback;
|
||||
state->err = err;
|
||||
state->case_sensitive = pgf_is_case_sensitive(concr);
|
||||
state->spots = gu_new_buf(PgfCohortSpot, pool);
|
||||
state->found = gu_new_buf(PgfCohortRange, pool);
|
||||
state->en.next = pgf_lookup_cohorts_enum_next;
|
||||
state->concr = concr;
|
||||
state->sentence = sentence;
|
||||
state->len = strlen(sentence);
|
||||
state->callback = callback;
|
||||
state->err = err;
|
||||
state->case_sensitive= pgf_is_case_sensitive(concr);
|
||||
state->spots = gu_new_buf(PgfCohortSpot, pool);
|
||||
state->skip_spots = gu_new_buf(PgfCohortSpot, pool);
|
||||
state->empty_buf = gu_new_buf(PgfProductionIdxEntry, pool);
|
||||
state->found = gu_new_buf(PgfCohortRange, pool);
|
||||
|
||||
PgfCohortSpot spot = {0,sentence};
|
||||
while (*spot.ptr != 0) {
|
||||
|
||||
Reference in New Issue
Block a user