Merge branch 'master' into c-runtime

This commit is contained in:
krangelov
2021-07-30 11:20:04 +02:00
211 changed files with 7161 additions and 58549 deletions

View File

@@ -142,14 +142,14 @@ pgf_aligner_lzn_symbol_token(PgfLinFuncs** funcs, PgfToken tok)
}
static void
pgf_aligner_lzn_begin_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, size_t lindex, PgfCId fun)
pgf_aligner_lzn_begin_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, GuString ann, PgfCId fun)
{
PgfAlignerLin* alin = gu_container(funcs, PgfAlignerLin, funcs);
gu_buf_push(alin->parent_stack, int, fid);
}
static void
pgf_aligner_lzn_end_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, size_t lindex, PgfCId fun)
pgf_aligner_lzn_end_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, GuString ann, PgfCId fun)
{
PgfAlignerLin* alin = gu_container(funcs, PgfAlignerLin, funcs);
gu_buf_pop(alin->parent_stack, int);

View File

@@ -322,7 +322,8 @@ typedef struct PgfProductionCoerce
typedef struct {
PgfExprProb *ep;
GuSeq* lins;
size_t n_lins;
PgfSymbols* lins[];
} PgfProductionExtern;
typedef struct {

View File

@@ -953,94 +953,6 @@ pgf_read_expr(GuIn* in, GuPool* pool, GuPool* tmp_pool, GuExn* err)
return expr;
}
// Parse a fixed-size tuple of expressions from `in`.
//
// The token stream must consist of an opening triangle-bracket token,
// exactly n_exprs expressions separated by comma tokens, a closing
// triangle-bracket token, and then end of input.
//
// Each parsed expression is written to exprs[k] as soon as it is read,
// so on failure the leading entries of exprs may already be overwritten.
// Parsed expressions are built with `pool`; the parser itself works in
// a temporary pool that is released before returning.
//
// Returns 1 on success and 0 on any mismatch.
PGF_API int
pgf_read_expr_tuple(GuIn* in,
                    size_t n_exprs, PgfExpr exprs[],
                    GuPool* pool, GuExn* err)
{
    int ok = 0;
    GuPool* scratch = gu_new_pool();

    PgfExprParser* parser =
        pgf_new_parser(in, pgf_expr_parser_in_getc, pool, scratch, err);

    if (parser->token_tag != PGF_TOKEN_LTRIANGLE)
        goto done;
    pgf_expr_parser_token(parser, false);

    for (size_t k = 0; k < n_exprs; k++) {
        // Every element except the first is preceded by a comma.
        if (k != 0) {
            if (parser->token_tag != PGF_TOKEN_COMMA)
                goto done;
            pgf_expr_parser_token(parser, false);
        }

        exprs[k] = pgf_expr_parser_expr(parser, false);
        if (gu_variant_is_null(exprs[k]))
            goto done;
    }

    if (parser->token_tag != PGF_TOKEN_RTRIANGLE)
        goto done;
    pgf_expr_parser_token(parser, false);

    // Nothing may follow the closing bracket.
    if (parser->token_tag == PGF_TOKEN_EOF)
        ok = 1;

done:
    gu_pool_free(scratch);
    return ok;
}
// Parse a matrix of expressions from `in`.
//
// The input is an opening triangle-bracket token, zero or more rows
// separated by semicolon tokens, a closing triangle-bracket token and
// then end of input. Every row holds exactly n_exprs expressions
// separated by comma tokens.
//
// The expressions are appended row after row to a single buffer
// allocated from `pool`; the buffer's data sequence is returned on
// success. NULL is returned on any mismatch. The parser's temporary
// pool is released before returning in both cases.
PGF_API GuSeq*
pgf_read_expr_matrix(GuIn* in,
                     size_t n_exprs,
                     GuPool* pool, GuExn* err)
{
    GuSeq* result = NULL;
    GuBuf* cells = NULL;
    GuPool* scratch = gu_new_pool();

    PgfExprParser* parser =
        pgf_new_parser(in, pgf_expr_parser_in_getc, pool, scratch, err);

    if (parser->token_tag != PGF_TOKEN_LTRIANGLE)
        goto done;
    pgf_expr_parser_token(parser, false);

    cells = gu_new_buf(PgfExpr, pool);

    // An immediately following closing bracket means an empty matrix.
    if (parser->token_tag != PGF_TOKEN_RTRIANGLE) {
        bool more_rows = true;
        while (more_rows) {
            PgfExpr* row = gu_buf_extend_n(cells, n_exprs);

            for (size_t col = 0; col < n_exprs; col++) {
                if (col != 0) {
                    if (parser->token_tag != PGF_TOKEN_COMMA)
                        goto done;
                    pgf_expr_parser_token(parser, false);
                }

                row[col] = pgf_expr_parser_expr(parser, false);
                if (gu_variant_is_null(row[col]))
                    goto done;
            }

            // A semicolon announces another row; anything else ends the list.
            more_rows = (parser->token_tag == PGF_TOKEN_SEMI);
            if (more_rows)
                pgf_expr_parser_token(parser, false);
        }

        if (parser->token_tag != PGF_TOKEN_RTRIANGLE)
            goto done;
    }
    pgf_expr_parser_token(parser, false);

    if (parser->token_tag == PGF_TOKEN_EOF)
        result = gu_buf_data_seq(cells);

done:
    gu_pool_free(scratch);
    return result;
}
PGF_API PgfType*
pgf_read_type(GuIn* in, GuPool* pool, GuPool* tmp_pool, GuExn* err)
{
@@ -1758,19 +1670,6 @@ pgf_print_context(PgfHypos *hypos, PgfPrintContext* ctxt,
}
}
// Print n_exprs expressions as a tuple: '<', the expressions separated
// by ',', then '>'. Each expression is printed with pgf_print_expr
// using the given context and precedence 0.
PGF_API void
pgf_print_expr_tuple(size_t n_exprs, PgfExpr exprs[], PgfPrintContext* ctxt,
                     GuOut* out, GuExn* err)
{
    gu_putc('<', out, err);

    if (n_exprs > 0) {
        // The first element carries no leading separator.
        pgf_print_expr(exprs[0], ctxt, 0, out, err);

        for (size_t k = 1; k < n_exprs; k++) {
            gu_putc(',', out, err);
            pgf_print_expr(exprs[k], ctxt, 0, out, err);
        }
    }

    gu_putc('>', out, err);
}
PGF_API bool
pgf_type_eq(PgfType* t1, PgfType* t2)
{
@@ -1806,6 +1705,168 @@ pgf_type_eq(PgfType* t1, PgfType* t2)
return true;
}
// Create a copy of `lit` whose storage is allocated from `pool`.
//
// String literals get their own copy of the character data (including
// the terminating NUL); integer and float literals copy the value.
// Any other tag is treated as impossible.
PGF_API PgfLiteral
pgf_clone_literal(PgfLiteral lit, GuPool* pool)
{
    PgfLiteral copy = gu_null_variant;

    GuVariantInfo src = gu_variant_open(lit);
    switch (src.tag) {
    case PGF_LITERAL_STR: {
        PgfLiteralStr* from = src.data;
        // Size of the string including its terminator.
        size_t n_bytes = strlen(from->val) + 1;
        PgfLiteralStr* to =
            gu_new_flex_variant(PGF_LITERAL_STR,
                                PgfLiteralStr,
                                val, n_bytes,
                                &copy, pool);
        memcpy(to->val, from->val, n_bytes);
        break;
    }
    case PGF_LITERAL_INT: {
        PgfLiteralInt* from = src.data;
        PgfLiteralInt* to =
            gu_new_variant(PGF_LITERAL_INT,
                           PgfLiteralInt,
                           &copy, pool);
        to->val = from->val;
        break;
    }
    case PGF_LITERAL_FLT: {
        PgfLiteralFlt* from = src.data;
        PgfLiteralFlt* to =
            gu_new_variant(PGF_LITERAL_FLT,
                           PgfLiteralFlt,
                           &copy, pool);
        to->val = from->val;
        break;
    }
    default:
        gu_impossible();
    }

    return copy;
}
// Recursively copy the expression tree `expr` into `pool`.
//
// Every node is re-allocated from `pool`; sub-expressions are cloned
// with pgf_clone_expr, embedded literals with pgf_clone_literal and
// type annotations with pgf_clone_type. Identifier strings of
// abstractions are duplicated with gu_string_copy, and function names
// are copied into the new node's flexible array. Any tag other than
// the ones handled below is treated as impossible.
PGF_API PgfExpr
pgf_clone_expr(PgfExpr expr, GuPool* pool)
{
    PgfExpr copy = gu_null_variant;

    GuVariantInfo src = gu_variant_open(expr);
    switch (src.tag) {
    case PGF_EXPR_ABS: {
        PgfExprAbs* from = src.data;
        PgfExprAbs* to =
            gu_new_variant(PGF_EXPR_ABS,
                           PgfExprAbs,
                           &copy, pool);
        to->bind_type = from->bind_type;
        to->id = gu_string_copy(from->id, pool);
        to->body = pgf_clone_expr(from->body, pool);
        break;
    }
    case PGF_EXPR_APP: {
        PgfExprApp* from = src.data;
        PgfExprApp* to =
            gu_new_variant(PGF_EXPR_APP,
                           PgfExprApp,
                           &copy, pool);
        to->fun = pgf_clone_expr(from->fun, pool);
        to->arg = pgf_clone_expr(from->arg, pool);
        break;
    }
    case PGF_EXPR_LIT: {
        PgfExprLit* from = src.data;
        PgfExprLit* to =
            gu_new_variant(PGF_EXPR_LIT,
                           PgfExprLit,
                           &copy, pool);
        to->lit = pgf_clone_literal(from->lit, pool);
        break;
    }
    case PGF_EXPR_META: {
        PgfExprMeta* from = src.data;
        PgfExprMeta* to =
            gu_new_variant(PGF_EXPR_META,
                           PgfExprMeta,
                           &copy, pool);
        to->id = from->id;
        break;
    }
    case PGF_EXPR_FUN: {
        PgfExprFun* from = src.data;
        // Name length including the terminating NUL.
        size_t n_bytes = strlen(from->fun) + 1;
        PgfExprFun* to =
            gu_new_flex_variant(PGF_EXPR_FUN,
                                PgfExprFun,
                                fun, n_bytes,
                                &copy, pool);
        memcpy(to->fun, from->fun, n_bytes);
        break;
    }
    case PGF_EXPR_VAR: {
        PgfExprVar* from = src.data;
        PgfExprVar* to =
            gu_new_variant(PGF_EXPR_VAR,
                           PgfExprVar,
                           &copy, pool);
        to->var = from->var;
        break;
    }
    case PGF_EXPR_TYPED: {
        PgfExprTyped* from = src.data;
        PgfExprTyped* to =
            gu_new_variant(PGF_EXPR_TYPED,
                           PgfExprTyped,
                           &copy, pool);
        to->expr = pgf_clone_expr(from->expr, pool);
        to->type = pgf_clone_type(from->type, pool);
        break;
    }
    case PGF_EXPR_IMPL_ARG: {
        PgfExprImplArg* from = src.data;
        PgfExprImplArg* to =
            gu_new_variant(PGF_EXPR_IMPL_ARG,
                           PgfExprImplArg,
                           &copy, pool);
        to->expr = pgf_clone_expr(from->expr, pool);
        break;
    }
    default:
        gu_impossible();
    }

    return copy;
}
// Recursively copy `type` into `pool`.
//
// The result has the same number of hypotheses and argument
// expressions as the original. Each hypothesis keeps its bind type and
// gets a copied identifier and a cloned type; the category identifier
// is copied and every argument expression is cloned with
// pgf_clone_expr.
PGF_API PgfType*
pgf_clone_type(PgfType* type, GuPool* pool)
{
    PgfType* copy =
        gu_new_flex(pool, PgfType, exprs, type->n_exprs);

    // Hypotheses first, in their original order.
    size_t hypo_count = gu_seq_length(type->hypos);
    copy->hypos = gu_new_seq(PgfHypo, hypo_count, pool);
    for (size_t h = 0; h < hypo_count; h++) {
        PgfHypo* from = gu_seq_index(type->hypos, PgfHypo, h);
        PgfHypo* to   = gu_seq_index(copy->hypos, PgfHypo, h);

        to->bind_type = from->bind_type;
        to->cid       = gu_string_copy(from->cid, pool);
        to->type      = pgf_clone_type(from->type, pool);
    }

    copy->cid = gu_string_copy(type->cid, pool);

    // Then the argument expressions of the result category.
    copy->n_exprs = type->n_exprs;
    for (size_t a = 0; a < copy->n_exprs; a++) {
        copy->exprs[a] = pgf_clone_expr(type->exprs[a], pool);
    }

    return copy;
}
PGF_API prob_t
pgf_compute_tree_probability(PgfPGF *gr, PgfExpr expr)
{

View File

@@ -171,15 +171,6 @@ pgf_expr_unmeta(PgfExpr expr);
PGF_API_DECL PgfExpr
pgf_read_expr(GuIn* in, GuPool* pool, GuPool* tmp_pool, GuExn* err);
PGF_API_DECL int
pgf_read_expr_tuple(GuIn* in,
size_t n_exprs, PgfExpr exprs[],
GuPool* pool, GuExn* err);
PGF_API_DECL GuSeq*
pgf_read_expr_matrix(GuIn* in, size_t n_exprs,
GuPool* pool, GuExn* err);
PGF_API_DECL PgfType*
pgf_read_type(GuIn* in, GuPool* pool, GuPool* tmp_pool, GuExn* err);
@@ -239,9 +230,14 @@ PGF_API_DECL void
pgf_print_context(PgfHypos *hypos, PgfPrintContext* ctxt,
GuOut *out, GuExn *err);
PGF_API_DECL void
pgf_print_expr_tuple(size_t n_exprs, PgfExpr exprs[], PgfPrintContext* ctxt,
GuOut* out, GuExn* err);
PGF_API PgfLiteral
pgf_clone_literal(PgfLiteral lit, GuPool* pool);
PGF_API PgfExpr
pgf_clone_expr(PgfExpr expr, GuPool* pool);
PGF_API PgfType*
pgf_clone_type(PgfType* type, GuPool* pool);
PGF_API_DECL prob_t
pgf_compute_tree_probability(PgfPGF *gr, PgfExpr expr);

View File

@@ -155,7 +155,7 @@ pgf_bracket_lzn_symbol_token(PgfLinFuncs** funcs, PgfToken tok)
}
static void
pgf_bracket_lzn_begin_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, size_t lindex, PgfCId fun)
pgf_bracket_lzn_begin_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, GuString ann, PgfCId fun)
{
PgfBracketLznState* state = gu_container(funcs, PgfBracketLznState, funcs);
@@ -192,7 +192,7 @@ pgf_bracket_lzn_begin_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, size_t li
}
static void
pgf_bracket_lzn_end_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, size_t lindex, PgfCId fun)
pgf_bracket_lzn_end_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, GuString ann, PgfCId fun)
{
PgfBracketLznState* state = gu_container(funcs, PgfBracketLznState, funcs);

View File

@@ -628,7 +628,7 @@ typedef struct {
PgfLzrCachedTag tag;
PgfCId cat;
int fid;
int lin_idx;
GuString ann;
PgfCId fun;
} PgfLzrCached;
@@ -666,7 +666,7 @@ pgf_lzr_cache_flush(PgfLzrCache* cache, PgfSymbols* form)
cache->lzr->funcs,
event->cat,
event->fid,
event->lin_idx,
event->ann,
event->fun);
}
break;
@@ -676,7 +676,7 @@ pgf_lzr_cache_flush(PgfLzrCache* cache, PgfSymbols* form)
cache->lzr->funcs,
event->cat,
event->fid,
event->lin_idx,
event->ann,
event->fun);
}
break;
@@ -731,27 +731,27 @@ found:
}
static void
pgf_lzr_cache_begin_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, size_t lin_idx, PgfCId fun)
pgf_lzr_cache_begin_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, GuString ann, PgfCId fun)
{
PgfLzrCache* cache = gu_container(funcs, PgfLzrCache, funcs);
PgfLzrCached* event = gu_buf_extend(cache->events);
event->tag = PGF_CACHED_BEGIN;
event->cat = cat;
event->fid = fid;
event->lin_idx = lin_idx;
event->fun = fun;
event->tag = PGF_CACHED_BEGIN;
event->cat = cat;
event->fid = fid;
event->ann = ann;
event->fun = fun;
}
static void
pgf_lzr_cache_end_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, size_t lin_idx, PgfCId fun)
pgf_lzr_cache_end_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, GuString ann, PgfCId fun)
{
PgfLzrCache* cache = gu_container(funcs, PgfLzrCache, funcs);
PgfLzrCached* event = gu_buf_extend(cache->events);
event->tag = PGF_CACHED_END;
event->cat = cat;
event->fid = fid;
event->lin_idx = lin_idx;
event->fun = fun;
event->tag = PGF_CACHED_END;
event->cat = cat;
event->fid = fid;
event->ann = ann;
event->fun = fun;
}
static void
@@ -939,8 +939,8 @@ pgf_lzr_linearize_tree(PgfLzr* lzr, PgfCncTree ctree, size_t lin_idx)
if ((*lzr->funcs)->begin_phrase && fapp->ccat != NULL) {
(*lzr->funcs)->begin_phrase(lzr->funcs,
fun->absfun->type->cid,
fapp->fid, lin_idx,
fapp->ccat->cnccat->abscat->name,
fapp->fid, fapp->ccat->cnccat->labels[lin_idx],
fun->absfun->name);
}
@@ -949,8 +949,8 @@ pgf_lzr_linearize_tree(PgfLzr* lzr, PgfCncTree ctree, size_t lin_idx)
if ((*lzr->funcs)->end_phrase && fapp->ccat != NULL) {
(*lzr->funcs)->end_phrase(lzr->funcs,
fun->absfun->type->cid,
fapp->fid, lin_idx,
fapp->ccat->cnccat->abscat->name,
fapp->fid, fapp->ccat->cnccat->labels[lin_idx],
fun->absfun->name);
}
break;
@@ -979,7 +979,7 @@ pgf_lzr_linearize_tree(PgfLzr* lzr, PgfCncTree ctree, size_t lin_idx)
if ((*lzr->funcs)->begin_phrase) {
(*lzr->funcs)->begin_phrase(lzr->funcs,
cat, flit->fid, 0,
cat, flit->fid, "s",
"");
}
@@ -1011,7 +1011,7 @@ pgf_lzr_linearize_tree(PgfLzr* lzr, PgfCncTree ctree, size_t lin_idx)
if ((*lzr->funcs)->end_phrase) {
(*lzr->funcs)->end_phrase(lzr->funcs,
cat, flit->fid, 0,
cat, flit->fid, "s",
"");
}

View File

@@ -83,10 +83,10 @@ struct PgfLinFuncs
void (*symbol_token)(PgfLinFuncs** self, PgfToken tok);
/// Begin phrase
void (*begin_phrase)(PgfLinFuncs** self, PgfCId cat, int fid, size_t lindex, PgfCId fun);
void (*begin_phrase)(PgfLinFuncs** self, PgfCId cat, int fid, GuString ann, PgfCId fun);
/// End phrase
void (*end_phrase)(PgfLinFuncs** self, PgfCId cat, int fid, size_t lindex, PgfCId fun);
void (*end_phrase)(PgfLinFuncs** self, PgfCId cat, int fid, GuString ann, PgfCId fun);
/// handling nonExist
void (*symbol_ne)(PgfLinFuncs** self);

View File

@@ -6,11 +6,12 @@
static PgfExprProb*
pgf_match_string_lit(PgfLiteralCallback* self, PgfConcr* concr,
size_t lin_idx,
GuString ann,
GuString sentence, size_t* poffset,
GuPool *out_pool)
{
gu_assert(lin_idx == 0);
if (strcmp(ann,"s") != 0)
return NULL;
const uint8_t* buf = (uint8_t*) (sentence + *poffset);
const uint8_t* p = buf;
@@ -51,7 +52,7 @@ pgf_predict_empty_next(GuEnum* self, void* to, GuPool* pool)
static GuEnum*
pgf_predict_empty(PgfLiteralCallback* self, PgfConcr* concr,
size_t lin_idx,
GuString ann,
GuString prefix,
GuPool *out_pool)
{
@@ -67,11 +68,12 @@ static PgfLiteralCallback pgf_string_literal_callback =
static PgfExprProb*
pgf_match_int_lit(PgfLiteralCallback* self, PgfConcr* concr,
size_t lin_idx,
GuString ann,
GuString sentence, size_t* poffset,
GuPool *out_pool)
{
gu_assert(lin_idx == 0);
if (strcmp(ann,"s") != 0)
return NULL;
const uint8_t* buf = (uint8_t*) (sentence + *poffset);
const uint8_t* p = buf;
@@ -121,11 +123,12 @@ static PgfLiteralCallback pgf_int_literal_callback =
static PgfExprProb*
pgf_match_float_lit(PgfLiteralCallback* self, PgfConcr* concr,
size_t lin_idx,
GuString ann,
GuString sentence, size_t* poffset,
GuPool *out_pool)
{
gu_assert(lin_idx == 0);
if (strcmp(ann,"s") != 0)
return NULL;
const uint8_t* buf = (uint8_t*) (sentence + *poffset);
const uint8_t* p = buf;
@@ -226,11 +229,11 @@ pgf_match_name_morpho_callback(PgfMorphoCallback* self_,
static PgfExprProb*
pgf_match_name_lit(PgfLiteralCallback* self, PgfConcr* concr,
size_t lin_idx,
GuString ann,
GuString sentence, size_t* poffset,
GuPool *out_pool)
{
if (lin_idx != 0)
if (strcmp(ann,"s") != 0)
return NULL;
GuPool* tmp_pool = gu_local_pool();
@@ -349,7 +352,7 @@ pgf_match_unknown_morpho_callback(PgfMorphoCallback* self_,
static PgfExprProb*
pgf_match_unknown_lit(PgfLiteralCallback* self, PgfConcr* concr,
size_t lin_idx,
GuString ann,
GuString sentence, size_t* poffset,
GuPool *out_pool)
{

View File

@@ -869,7 +869,7 @@ pgf_lookup_symbol_token(PgfLinFuncs** self, PgfToken token)
}
static void
pgf_lookup_begin_phrase(PgfLinFuncs** self, PgfCId cat, int fid, size_t lindex, PgfCId funname)
pgf_lookup_begin_phrase(PgfLinFuncs** self, PgfCId cat, int fid, GuString ann, PgfCId funname)
{
PgfLookupState* st = gu_container(self, PgfLookupState, funcs);
@@ -883,7 +883,7 @@ pgf_lookup_begin_phrase(PgfLinFuncs** self, PgfCId cat, int fid, size_t lindex,
}
static void
pgf_lookup_end_phrase(PgfLinFuncs** self, PgfCId cat, int fid, size_t lindex, PgfCId fun)
pgf_lookup_end_phrase(PgfLinFuncs** self, PgfCId cat, int fid, GuString ann, PgfCId fun)
{
PgfLookupState* st = gu_container(self, PgfLookupState, funcs);
st->curr_absfun = NULL;

View File

@@ -61,6 +61,14 @@ typedef struct {
typedef enum { BIND_NONE, BIND_HARD, BIND_SOFT } BIND_TYPE;
// One lexical match found while looking ahead from a parse state.
// Entries are appended to PgfParseState.lexicon_idx and later consulted
// during prediction.
typedef struct {
// Production index of the matched sequence (taken from seq->idx).
PgfProductionIdx* idx;
// Offset into the sentence that is handed to pgf_new_parse_state when
// the lexeme is predicted; for non-epsilon matches this is the position
// reached by the comparison, for epsilon rules the state's start offset.
size_t offset;
// Symbol index reported by the sequence comparison; it becomes the
// predicted item's sym_idx (0 for epsilon rules).
size_t sym_idx;
} PgfLexiconIdxEntry;
// Buffer of PgfLexiconIdxEntry values.
typedef GuBuf PgfLexiconIdx;
struct PgfParseState {
PgfParseState* next;
@@ -74,6 +82,8 @@ struct PgfParseState {
size_t end_offset;
prob_t viterbi_prob;
PgfLexiconIdx* lexicon_idx;
};
typedef struct PgfAnswers {
@@ -113,43 +123,10 @@ struct PgfItem {
prob_t inside_prob;
};
// Return the PgfSymbol that is stored in memory directly behind the
// payload of `sym`.
//
// For fixed-size symbol kinds the link sits right after the struct;
// for KS symbols it sits after the struct, the token text and its
// terminating NUL. An unknown tag is treated as impossible and yields
// the null variant.
static PgfSymbol
pgf_prev_extern_sym(PgfSymbol sym)
{
    GuVariantInfo info = gu_variant_open(sym);
    uint8_t* tail = NULL;

    switch (info.tag) {
    case PGF_SYMBOL_CAT:
        tail = (uint8_t*) (((PgfSymbolCat*) info.data) + 1);
        break;
    case PGF_SYMBOL_KP:
        tail = (uint8_t*) (((PgfSymbolKP*) info.data) + 1);
        break;
    case PGF_SYMBOL_KS: {
        PgfSymbolKS* ks = (PgfSymbolKS*) info.data;
        size_t token_len = strlen(ks->token);
        // Skip the struct, the token characters and the NUL byte.
        tail = ((uint8_t*) ks) + sizeof(PgfSymbolKS) + token_len + 1;
        break;
    }
    case PGF_SYMBOL_LIT:
        tail = (uint8_t*) (((PgfSymbolLit*) info.data) + 1);
        break;
    case PGF_SYMBOL_VAR:
        tail = (uint8_t*) (((PgfSymbolVar*) info.data) + 1);
        break;
    case PGF_SYMBOL_BIND:
    case PGF_SYMBOL_SOFT_BIND:
    case PGF_SYMBOL_SOFT_SPACE:
        tail = (uint8_t*) (((PgfSymbolBIND*) info.data) + 1);
        break;
    case PGF_SYMBOL_CAPIT:
    case PGF_SYMBOL_ALL_CAPIT:
        tail = (uint8_t*) (((PgfSymbolCAPIT*) info.data) + 1);
        break;
    case PGF_SYMBOL_NE:
        tail = (uint8_t*) (((PgfSymbolNE*) info.data) + 1);
        break;
    default:
        gu_impossible();
        return gu_null_variant;
    }

    return *((PgfSymbol*) tail);
}
static PgfSymbol
static PgfSymbols*
pgf_collect_extern_tok(PgfParsing* ps, size_t start_offset, size_t end_offset)
{
PgfSymbol sym = gu_null_variant;
GuBuf* syms = gu_new_buf(PgfSymbol, ps->pool);
const uint8_t* start = (uint8_t*) ps->sentence+start_offset;
const uint8_t* end = (uint8_t*) ps->sentence+end_offset;
@@ -163,16 +140,15 @@ pgf_collect_extern_tok(PgfParsing* ps, size_t start_offset, size_t end_offset)
ucs = gu_utf8_decode(&p);
}
PgfSymbol new_sym;
PgfSymbol sym;
PgfSymbolKS* sks = (PgfSymbolKS*)
gu_alloc_variant(PGF_SYMBOL_KS,
sizeof(PgfSymbol)+sizeof(PgfSymbolKS)+len+1,
gu_alignof(PgfSymbolKS),
&new_sym, ps->pool);
sizeof(PgfSymbolKS)+len+1,
gu_alignof(PgfSymbolKS),
&sym, ps->pool);
memcpy((char*) sks->token, start, len);
((char*) sks->token)[len] = 0;
*((PgfSymbol*) (((uint8_t*) sks)+sizeof(PgfSymbolKS)+len+1)) = sym;
sym = new_sym;
gu_buf_push(syms, PgfSymbol, sym);
start = p;
while (gu_ucs_is_space(ucs)) {
@@ -181,68 +157,16 @@ pgf_collect_extern_tok(PgfParsing* ps, size_t start_offset, size_t end_offset)
}
}
return sym;
}
// Number of symbols in the sequence that `item` is traversing.
//
//  - APPLY:  length of the function's symbol sequence for the item's
//            linearization index.
//  - COERCE: always 1.
//  - EXTERN: length of the stored sequence for the item's linearization
//            index when there is one; otherwise the number of symbols
//            reachable from item->curr_sym by following
//            pgf_prev_extern_sym until the null variant.
// Any other production tag is treated as impossible and yields 0.
static size_t
pgf_item_symbols_length(PgfItem* item)
{
    GuVariantInfo info = gu_variant_open(item->prod);

    switch (info.tag) {
    case PGF_PRODUCTION_APPLY: {
        PgfProductionApply* papp = info.data;
        return gu_seq_length(papp->fun->lins[item->conts->lin_idx]->syms);
    }
    case PGF_PRODUCTION_COERCE: {
        return 1;
    }
    case PGF_PRODUCTION_EXTERN: {
        PgfProductionExtern* pext = info.data;

        PgfSymbols* stored = NULL;
        if (pext->lins != NULL)
            stored = gu_seq_get(pext->lins, PgfSymbols*, item->conts->lin_idx);
        if (stored != NULL)
            return gu_seq_length(stored);

        // No stored sequence: count the chained symbols instead.
        int count = 0;
        for (PgfSymbol cursor = item->curr_sym;
             !gu_variant_is_null(cursor);
             cursor = pgf_prev_extern_sym(cursor)) {
            count++;
        }
        return count;
    }
    default:
        gu_impossible();
        return 0;
    }
}
static PgfSymbols*
pgf_extern_syms_get(PgfItem* item, GuPool* pool)
{
int syms_len = pgf_item_symbols_length(item);
PgfSymbols* syms =
gu_new_seq(PgfSymbol, syms_len, pool);
PgfSymbol sym = item->curr_sym;
while (!gu_variant_is_null(sym)) {
gu_seq_set(syms, PgfSymbol, --syms_len, sym);
sym = pgf_prev_extern_sym(sym);
}
return syms;
return gu_buf_data_seq(syms);
}
#ifdef PGF_PARSER_DEBUG
PGF_INTERNAL void
pgf_print_fid(int fid, GuOut* out, GuExn* err);
PGF_INTERNAL_DECL void
pgf_print_symbol(PgfSymbol sym, GuOut *out, GuExn *err);
#ifdef PGF_PARSER_DEBUG
static void
pgf_item_symbols(PgfItem* item,
size_t* lin_idx, PgfSymbols** syms,
@@ -267,11 +191,7 @@ pgf_item_symbols(PgfItem* item,
}
case PGF_PRODUCTION_EXTERN: {
PgfProductionExtern* pext = i.data;
if (pext->lins == NULL ||
(*syms = gu_seq_get(pext->lins, PgfSymbols*, item->conts->lin_idx)) == NULL) {
*syms = pgf_extern_syms_get(item, pool);
}
*syms = pext->lins[item->conts->lin_idx];
break;
}
default:
@@ -603,16 +523,11 @@ pgf_item_set_curr_symbol(PgfItem* item, GuPool* pool)
case PGF_PRODUCTION_EXTERN: {
PgfProductionExtern* pext = i.data;
PgfSymbols* syms;
if (pext->lins != NULL &&
(syms = gu_seq_get(pext->lins,PgfSymbols*,item->conts->lin_idx)) != NULL) {
if (item->sym_idx == gu_seq_length(syms)) {
item->curr_sym = gu_null_variant;
} else {
item->curr_sym = gu_seq_get(syms, PgfSymbol, item->sym_idx);
}
} else {
PgfSymbols* syms = pext->lins[item->conts->lin_idx];
if (item->sym_idx == gu_seq_length(syms)) {
item->curr_sym = gu_null_variant;
} else {
item->curr_sym = gu_seq_get(syms, PgfSymbol, item->sym_idx);
}
break;
}
@@ -781,16 +696,6 @@ pgf_result_production(PgfParsing* ps,
static void
pgf_parsing_complete(PgfParsing* ps, PgfItem* item, PgfExprProb *ep);
// Push `item` onto the agenda heap of `state`.
//
// When the agenda is empty before the push, the state's viterbi_prob is
// first set to the item's inside probability plus the outside
// probability of its continuations.
static void
pgf_parsing_push_item(PgfParseState* state, PgfItem* item)
{
    bool agenda_was_empty = (gu_buf_length(state->agenda) == 0);

    if (agenda_was_empty) {
        prob_t estimate = item->inside_prob + item->conts->outside_prob;
        state->viterbi_prob = estimate;
    }

    gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
}
static void
pgf_parsing_push_production(PgfParsing* ps, PgfParseState* state,
PgfItemConts* conts, PgfProduction prod)
@@ -822,7 +727,7 @@ pgf_parsing_combine(PgfParsing* ps,
}
pgf_item_advance(item, ps->pool);
pgf_parsing_push_item(before, item);
gu_buf_heap_push(before->agenda, pgf_item_prob_order, &item);
}
static PgfProduction
@@ -851,36 +756,7 @@ pgf_parsing_new_production(PgfItem* item, PgfExprProb *ep, GuPool *pool)
break;
}
case PGF_PRODUCTION_EXTERN: {
PgfProductionExtern* pext = i.data;
if (pext->lins == NULL ||
gu_seq_get(pext->lins,PgfSymbols*,item->conts->lin_idx) == NULL) {
PgfSymbols* syms =
pgf_extern_syms_get(item, pool);
size_t n_lins = item->conts->ccat->cnccat->n_lins;
PgfProductionExtern* new_pext = (PgfProductionExtern*)
gu_new_variant(PGF_PRODUCTION_EXTERN,
PgfProductionExtern,
&prod, pool);
new_pext->ep = ep;
new_pext->lins = gu_new_seq(PgfSymbols*, n_lins, pool);
if (pext->lins == NULL) {
for (size_t i = 0; i < n_lins; i++) {
gu_seq_set(new_pext->lins,PgfSymbols*,i,NULL);
}
} else {
for (size_t i = 0; i < n_lins; i++) {
gu_seq_set(new_pext->lins,PgfSymbols*,i,
gu_seq_get(pext->lins,PgfSymbols*,i));
}
}
gu_seq_set(new_pext->lins,PgfSymbols*,item->conts->lin_idx,syms);
} else {
prod = item->prod;
}
prod = item->prod;
break;
}
default:
@@ -1022,9 +898,65 @@ pgf_parsing_complete(PgfParsing* ps, PgfItem* item, PgfExprProb *ep)
}
}
PGF_INTERNAL_DECL int
pgf_symbols_cmp(PgfCohortSpot* spot,
PgfSymbols* syms, size_t* sym_idx,
bool case_sensitive);
// Search the grammar's sequences with indices in [i, j] for those that
// match the sentence text starting at state->end_offset, and append one
// PgfLexiconIdxEntry to state->lexicon_idx for every exact match that
// has a production index.
//
// min and max bound the match lengths (pointer distance advanced by the
// comparison) that are still worth searching for in the given index
// range; they are narrowed on each recursive call.
// NOTE(review): this relies on ps->concr->sequences being ordered in a
// way compatible with pgf_symbols_cmp -- confirm against the loader.
static void
pgf_parsing_lookahead(PgfParsing *ps, PgfParseState* state,
int i, int j, ptrdiff_t min, ptrdiff_t max)
{
// This is a variation of a binary search algorithm which
// can retrieve all prefixes of a string with minimal
// comparisons, i.e. there is no need to lookup every
// prefix separately.
while (i <= j) {
int k = (i+j) / 2;
PgfSequence* seq = gu_seq_index(ps->concr->sequences, PgfSequence, k);
// Both spots start at the current end of the state; `current` is
// advanced by the comparison, `start` is kept to measure the distance.
PgfCohortSpot start = {0, ps->sentence + state->end_offset};
PgfCohortSpot current = start;
size_t sym_idx = 0;
int cmp = pgf_symbols_cmp(&current, seq->syms, &sym_idx, ps->case_sensitive);
if (cmp < 0) {
// Continue in the lower half only.
j = k-1;
} else if (cmp > 0) {
// No match at k: split the search. The lower half is limited to
// lengths up to `len`, the upper half to lengths above it.
ptrdiff_t len = current.ptr - start.ptr;
if (min <= len)
pgf_parsing_lookahead(ps, state, i, k-1, min, len);
if (len+1 <= max)
pgf_parsing_lookahead(ps, state, k+1, j, len+1, max);
break;
} else {
// Exact match at k: shorter matches are looked up below k, longer
// ones above k, and the match itself is recorded in between so that
// entries are appended in that order.
ptrdiff_t len = current.ptr - start.ptr;
if (min <= len-1)
pgf_parsing_lookahead(ps, state, i, k-1, min, len-1);
if (seq->idx != NULL) {
PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx);
entry->idx = seq->idx;
entry->offset = (size_t) (current.ptr - ps->sentence);
entry->sym_idx = sym_idx;
}
if (len+1 <= max)
pgf_parsing_lookahead(ps, state, k+1, j, len+1, max);
break;
}
}
}
static PgfParseState*
pgf_new_parse_state(PgfParsing* ps, size_t start_offset,
BIND_TYPE bind_type)
BIND_TYPE bind_type,
prob_t viterbi_prob)
{
PgfParseState** pstate;
if (ps->before == NULL && start_offset == 0)
@@ -1077,172 +1009,36 @@ pgf_new_parse_state(PgfParsing* ps, size_t start_offset,
(start_offset == end_offset);
state->start_offset = start_offset;
state->end_offset = end_offset;
state->viterbi_prob = 0;
state->viterbi_prob = viterbi_prob;
state->lexicon_idx =
gu_new_buf(PgfLexiconIdxEntry, ps->pool);
if (ps->before == NULL && start_offset == 0)
state->needs_bind = false;
if (gu_seq_length(ps->concr->sequences) > 0) {
// Add epsilon lexical rules to the bottom up index
PgfSequence* seq = gu_seq_index(ps->concr->sequences, PgfSequence, 0);
if (gu_seq_length(seq->syms) == 0 && seq->idx != NULL) {
PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx);
entry->idx = seq->idx;
entry->offset = state->start_offset;
entry->sym_idx= 0;
}
// Add non-epsilon lexical rules to the bottom up index
if (!state->needs_bind) {
pgf_parsing_lookahead(ps, state,
0, gu_seq_length(ps->concr->sequences)-1,
1, strlen(ps->sentence)-state->end_offset);
}
}
*pstate = state;
return state;
}
PGF_INTERNAL_DECL int
pgf_symbols_cmp(PgfCohortSpot* spot,
PgfSymbols* syms, size_t* sym_idx,
bool case_sensitive);
// Search the grammar's sequences with indices in [i, j] for those that
// match the sentence text starting at state->end_offset. For every
// non-empty exact match that has a production index, a parse state is
// created at the end of the match and the indexed productions are added
// to the corresponding completed categories.
//
// min and max bound the match lengths that are still searched for in
// the given index range. Returns true when at least one such match was
// found in this call or in any recursive call.
static bool
pgf_parsing_scan_helper(PgfParsing *ps, PgfParseState* state,
int i, int j, ptrdiff_t min, ptrdiff_t max)
{
// This is a variation of a binary search algorithm which
// can retrieve all prefixes of a string with minimal
// comparisons, i.e. there is no need to lookup every
// prefix separately.
bool found = false;
while (i <= j) {
int k = (i+j) / 2;
PgfSequence* seq = gu_seq_index(ps->concr->sequences, PgfSequence, k);
// `current` is advanced by the comparison; `start` is kept to measure
// how far it moved.
PgfCohortSpot start = {0, ps->sentence+state->end_offset};
PgfCohortSpot current = start;
size_t sym_idx = 0;
int cmp = pgf_symbols_cmp(&current, seq->syms, &sym_idx, ps->case_sensitive);
if (cmp < 0) {
// Continue in the lower half only.
j = k-1;
} else if (cmp > 0) {
// No match at k: search both halves with the length bounds split at len.
ptrdiff_t len = current.ptr - start.ptr;
if (min <= len)
if (pgf_parsing_scan_helper(ps, state, i, k-1, min, len))
found = true;
if (len+1 <= max)
if (pgf_parsing_scan_helper(ps, state, k+1, j, len+1, max))
found = true;
break;
} else {
ptrdiff_t len = current.ptr - start.ptr;
if (min <= len)
if (pgf_parsing_scan_helper(ps, state, i, k-1, min, len))
found = true;
// Here we do bottom-up prediction for all lexical categories.
// The epsilon productions will be predicted in top-down
// fashion while parsing.
if (seq->idx != NULL && len > 0) {
found = true;
// A new state will mark the end of the current match
PgfParseState* new_state =
pgf_new_parse_state(ps, (size_t) (current.ptr - ps->sentence), BIND_NONE);
// Bottom-up prediction for lexical rules
size_t n_entries = gu_buf_length(seq->idx);
// NOTE: this loop index shadows the parameter `i` inside the loop.
for (size_t i = 0; i < n_entries; i++) {
PgfProductionIdxEntry* entry =
gu_buf_index(seq->idx, PgfProductionIdxEntry, i);
PgfItemConts* conts =
pgf_parsing_get_conts(state,
entry->ccat, entry->lin_idx,
ps->pool);
// Create the new category if it doesn't exist yet
PgfCCat* tmp_ccat = pgf_parsing_get_completed(new_state, conts);
PgfCCat* ccat = tmp_ccat;
if (ccat == NULL) {
ccat = pgf_parsing_create_completed(ps, new_state, conts, INFINITY);
}
// Add the production
if (ccat->prods == NULL || ccat->n_synprods >= gu_seq_length(ccat->prods)) {
ccat->prods = gu_realloc_seq(ccat->prods, PgfProduction, ccat->n_synprods+1);
}
// NOTE: from here to the end of the loop body `i` names this
// GuVariantInfo, shadowing the loop index declared above.
GuVariantInfo i;
i.tag = PGF_PRODUCTION_APPLY;
i.data = entry->papp;
PgfProduction prod = gu_variant_close(i);
gu_seq_set(ccat->prods, PgfProduction, ccat->n_synprods++, prod);
// Update the category's probability to be minimum
if (ccat->viterbi_prob > entry->papp->fun->ep->prob)
ccat->viterbi_prob = entry->papp->fun->ep->prob;
#ifdef PGF_PARSER_DEBUG
// Debug trace on stderr: the newly created category (only when it was
// created in this iteration) followed by the added production.
GuPool* tmp_pool = gu_new_pool();
GuOut* out = gu_file_out(stderr, tmp_pool);
GuExn* err = gu_exn(tmp_pool);
if (tmp_ccat == NULL) {
gu_printf(out, err, "[");
pgf_print_range(state, new_state, out, err);
gu_puts("; ", out, err);
pgf_print_fid(conts->ccat->fid, out, err);
gu_printf(out, err, "; %d; ",
conts->lin_idx);
pgf_print_fid(ccat->fid, out, err);
gu_puts("] ", out, err);
pgf_print_fid(ccat->fid, out, err);
gu_printf(out, err, ".chunk_count=%d\n", ccat->chunk_count);
}
pgf_print_production(ccat->fid, prod, out, err);
gu_pool_free(tmp_pool);
#endif
}
}
// Matches at least as long as this one are looked up above k.
if (len <= max)
if (pgf_parsing_scan_helper(ps, state, k+1, j, len, max))
found = true;
break;
}
}
return found;
}
// Walk over the parse states, starting with a fresh state at offset 0,
// and run the lexical scan from each of them until a state reaches the
// end of the sentence or no further state exists.
//
// When the scan from a state finds nothing, one UTF-8 character is
// skipped and a new state is created right after it so that scanning
// can resume from there.
static void
pgf_parsing_scan(PgfParsing *ps)
{
    size_t sentence_len = strlen(ps->sentence);

    PgfParseState* cur =
        pgf_new_parse_state(ps, 0, BIND_SOFT);

    while (cur != NULL && cur->end_offset < sentence_len) {
        if (cur->needs_bind) {
            // We have encountered two tokens without space in between.
            // Those can be accepted only if there is a BIND token
            // in between. We encode this by having one more state
            // at the same offset. A transition between these two
            // states is possible only with the BIND token.
            cur = pgf_new_parse_state(ps, cur->end_offset, BIND_HARD);
        }

        bool matched =
            pgf_parsing_scan_helper(ps, cur,
                                    0, gu_seq_length(ps->concr->sequences)-1,
                                    1, sentence_len-cur->end_offset);
        if (!matched) {
            // skip one character and try again
            GuString rest = ps->sentence+cur->end_offset;
            gu_utf8_decode((const uint8_t**) &rest);
            pgf_new_parse_state(ps, rest-ps->sentence, BIND_NONE);
        }

        // The state list continues in ps->after once ps->before is reached.
        cur = (cur == ps->before) ? ps->after : cur->next;
    }
}
static void
pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item)
{
@@ -1262,8 +1058,9 @@ pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item)
if (!ps->before->needs_bind && cmp_string(&current, tok, ps->case_sensitive) == 0) {
PgfParseState* state =
pgf_new_parse_state(ps, (current.ptr - ps->sentence),
BIND_NONE);
pgf_parsing_push_item(state, item);
BIND_NONE,
item->inside_prob+item->conts->outside_prob);
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
} else {
pgf_item_free(ps, item);
}
@@ -1273,17 +1070,18 @@ pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item)
static void
pgf_parsing_predict_lexeme(PgfParsing* ps, PgfItemConts* conts,
PgfProductionIdxEntry* entry,
size_t offset)
size_t offset, size_t sym_idx)
{
GuVariantInfo i = { PGF_PRODUCTION_APPLY, entry->papp };
PgfProduction prod = gu_variant_close(i);
PgfItem* item =
pgf_new_item(ps, conts, prod);
PgfSymbols* syms = entry->papp->fun->lins[conts->lin_idx]->syms;
item->sym_idx = gu_seq_length(syms);
item->sym_idx = sym_idx;
pgf_item_set_curr_symbol(item, ps->pool);
prob_t prob = item->inside_prob+item->conts->outside_prob;
PgfParseState* state =
pgf_new_parse_state(ps, offset, BIND_NONE);
pgf_new_parse_state(ps, offset, BIND_NONE, prob);
if (state->viterbi_prob > prob) {
state->viterbi_prob = prob;
}
@@ -1337,36 +1135,34 @@ pgf_parsing_td_predict(PgfParsing* ps,
pgf_parsing_push_production(ps, ps->before, conts, prod);
}
// Top-down prediction for epsilon lexical rules if any
PgfSequence* seq = gu_seq_index(ps->concr->sequences, PgfSequence, 0);
if (gu_seq_length(seq->syms) == 0 && seq->idx != NULL) {
// Bottom-up prediction for lexical and epsilon rules
size_t n_idcs = gu_buf_length(ps->before->lexicon_idx);
for (size_t i = 0; i < n_idcs; i++) {
PgfLexiconIdxEntry* lentry =
gu_buf_index(ps->before->lexicon_idx, PgfLexiconIdxEntry, i);
PgfProductionIdxEntry key;
key.ccat = ccat;
key.lin_idx = lin_idx;
key.papp = NULL;
PgfProductionIdxEntry* value =
gu_seq_binsearch(gu_buf_data_seq(seq->idx),
gu_seq_binsearch(gu_buf_data_seq(lentry->idx),
pgf_production_idx_entry_order,
PgfProductionIdxEntry, &key);
if (value != NULL) {
GuVariantInfo i = { PGF_PRODUCTION_APPLY, value->papp };
PgfProduction prod = gu_variant_close(i);
pgf_parsing_push_production(ps, ps->before, conts, prod);
pgf_parsing_predict_lexeme(ps, conts, value, lentry->offset, lentry->sym_idx);
PgfProductionIdxEntry* start =
gu_buf_data(seq->idx);
gu_buf_data(lentry->idx);
PgfProductionIdxEntry* end =
start + gu_buf_length(seq->idx)-1;
start + gu_buf_length(lentry->idx)-1;
PgfProductionIdxEntry* left = value-1;
while (left >= start &&
value->ccat->fid == left->ccat->fid &&
value->lin_idx == left->lin_idx) {
GuVariantInfo i = { PGF_PRODUCTION_APPLY, left->papp };
PgfProduction prod = gu_variant_close(i);
pgf_parsing_push_production(ps, ps->before, conts, prod);
pgf_parsing_predict_lexeme(ps, conts, left, lentry->offset, lentry->sym_idx);
left--;
}
@@ -1374,9 +1170,7 @@ pgf_parsing_td_predict(PgfParsing* ps,
while (right <= end &&
value->ccat->fid == right->ccat->fid &&
value->lin_idx == right->lin_idx) {
GuVariantInfo i = { PGF_PRODUCTION_APPLY, right->papp };
PgfProduction prod = gu_variant_close(i);
pgf_parsing_push_production(ps, ps->before, conts, prod);
pgf_parsing_predict_lexeme(ps, conts, right, lentry->offset, lentry->sym_idx);
right++;
}
}
@@ -1415,7 +1209,7 @@ pgf_parsing_pre(PgfParsing* ps, PgfItem* item, PgfSymbols* syms)
} else {
item->alt = 0;
pgf_item_advance(item, ps->pool);
pgf_parsing_push_item(ps->before, item);
gu_buf_heap_push(ps->before->agenda, pgf_item_prob_order, &item);
}
}
@@ -1514,28 +1308,40 @@ pgf_parsing_symbol(PgfParsing* ps, PgfItem* item, PgfSymbol sym)
if (callback != NULL) {
ep = callback->match(callback, ps->concr,
slit->r,
parg->ccat->cnccat->labels[slit->r],
ps->sentence, &offset,
ps->out_pool);
}
}
if (ep != NULL) {
PgfSymbols* syms =
pgf_collect_extern_tok(ps, start, offset);
size_t n_lins = conts->ccat->cnccat->n_lins;
PgfProduction prod;
PgfProductionExtern* pext =
gu_new_variant(PGF_PRODUCTION_EXTERN,
PgfProductionExtern,
&prod, ps->pool);
pext->ep = ep;
pext->lins = NULL;
gu_new_flex_variant(PGF_PRODUCTION_EXTERN,
PgfProductionExtern,
lins, n_lins,
&prod, ps->pool);
pext->ep = ep;
pext->n_lins = n_lins;
for (size_t i = 0; i < n_lins; i++) {
pext->lins[i] = NULL;
}
pext->lins[conts->lin_idx] = syms;
PgfItem* item =
pgf_new_item(ps, conts, prod);
item->curr_sym = pgf_collect_extern_tok(ps,start,offset);
item->sym_idx = pgf_item_symbols_length(item);
item->curr_sym = gu_null_variant;
item->sym_idx = gu_seq_length(syms);
PgfParseState* state =
pgf_new_parse_state(ps, offset, BIND_NONE);
pgf_parsing_push_item(state, item);
pgf_new_parse_state(ps, offset, BIND_NONE,
item->inside_prob+item->conts->outside_prob);
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
match = true;
}
}
@@ -1578,10 +1384,11 @@ pgf_parsing_symbol(PgfParsing* ps, PgfItem* item, PgfSymbol sym)
if (ps->before->start_offset == ps->before->end_offset &&
ps->before->needs_bind) {
PgfParseState* state =
pgf_new_parse_state(ps, ps->before->end_offset, BIND_HARD);
pgf_new_parse_state(ps, ps->before->end_offset, BIND_HARD,
item->inside_prob+item->conts->outside_prob);
if (state != NULL) {
pgf_item_advance(item, ps->pool);
pgf_parsing_push_item(state, item);
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
} else {
pgf_item_free(ps, item);
}
@@ -1595,10 +1402,11 @@ pgf_parsing_symbol(PgfParsing* ps, PgfItem* item, PgfSymbol sym)
if (ps->before->start_offset == ps->before->end_offset) {
if (ps->before->needs_bind) {
PgfParseState* state =
pgf_new_parse_state(ps, ps->before->end_offset, BIND_HARD);
pgf_new_parse_state(ps, ps->before->end_offset, BIND_HARD,
item->inside_prob+item->conts->outside_prob);
if (state != NULL) {
pgf_item_advance(item, ps->pool);
pgf_parsing_push_item(state, item);
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
} else {
pgf_item_free(ps, item);
}
@@ -1607,12 +1415,13 @@ pgf_parsing_symbol(PgfParsing* ps, PgfItem* item, PgfSymbol sym)
}
} else {
pgf_item_advance(item, ps->pool);
pgf_parsing_push_item(ps->before, item);
gu_buf_heap_push(ps->before->agenda, pgf_item_prob_order, &item);
}
break;
}
case PGF_SYMBOL_CAPIT:
case PGF_SYMBOL_ALL_CAPIT: {
printf("PGF_SYMBOL_CAPIT\n");
pgf_item_advance(item, ps->pool);
pgf_parsing_symbol(ps, item, item->curr_sym);
break;
@@ -1857,7 +1666,8 @@ pgf_parsing_init(PgfConcr* concr, PgfCId cat,
ps->heuristic_factor = heuristic_factor;
}
pgf_parsing_scan(ps);
PgfParseState* state =
pgf_new_parse_state(ps, 0, BIND_SOFT, 0);
int fidString = -1;
PgfCCat* start_ccat = gu_new(PgfCCat, ps->pool);
@@ -1879,7 +1689,7 @@ pgf_parsing_init(PgfConcr* concr, PgfCId cat,
#endif
PgfItemConts* conts =
pgf_parsing_get_conts(ps->before, start_ccat, 0, ps->pool);
pgf_parsing_get_conts(state, start_ccat, 0, ps->pool);
gu_buf_push(conts->items, PgfItem*, NULL);
size_t n_ccats = gu_seq_length(cnccat->cats);
@@ -2218,6 +2028,8 @@ pgf_process_generated_cat(PgfParsing* ps,
children[i] = pcoerce->coerce;
break;
}
case PGF_PRODUCTION_EXTERN:
just_coercions = false;
}
}
@@ -2363,6 +2175,104 @@ pgf_parse_with_heuristics(PgfConcr* concr, PgfType* typ, GuString sentence,
return &ps->en;
}
/**
 * Parse `sentence` and return the parsing state itself instead of an
 * enumeration of expressions.
 *
 * The parser is advanced until the expression queue of the state holds
 * at least `n_roots` entries or until pgf_parsing_proceed() reports
 * that it cannot continue.
 *
 * @param concr      concrete syntax; its sequences and cnccats must be loaded
 * @param typ        start type; only `typ->cid` is used here
 * @param sentence   the input to parse
 * @param heuristics heuristic factor forwarded to pgf_parsing_init()
 * @param callbacks  literal callbacks forwarded to pgf_parsing_init()
 * @param n_roots    number of queued results after which parsing stops
 * @param err        receives a PgfExn when the concrete syntax is not loaded
 * @param pool       forwarded to pgf_parsing_init()
 * @param out_pool   forwarded to pgf_parsing_init()
 * @return the parsing state, or NULL when the concrete syntax is not
 *         loaded or pgf_parsing_init() fails
 */
PGF_API PgfParsing*
pgf_parse_to_chart(PgfConcr* concr, PgfType* typ, GuString sentence,
                   double heuristics,
                   PgfCallbacksMap* callbacks,
                   size_t n_roots,
                   GuExn* err,
                   GuPool* pool, GuPool* out_pool)
{
	if (concr->sequences == NULL ||
	    concr->cnccats == NULL) {
		GuExnData* err_data = gu_raise(err, PgfExn);
		if (err_data) {
			err_data->data = "The concrete syntax is not loaded";
		}
		// Always bail out here: even if gu_raise() gave us no data
		// slot for the message, continuing would parse against the
		// NULL sequences/cnccats tables.
		return NULL;
	}

	// Begin parsing a sentence with the specified category
	PgfParsing* ps =
		pgf_parsing_init(concr, typ->cid, sentence, heuristics, callbacks, NULL, err, pool, out_pool);
	if (ps == NULL) {
		return NULL;
	}

#ifdef PGF_COUNTS_DEBUG
	pgf_parsing_print_counts(ps);
#endif

	while (gu_buf_length(ps->expr_queue) < n_roots) {
		if (!pgf_parsing_proceed(ps)) {
			break;
		}

#ifdef PGF_COUNTS_DEBUG
		pgf_parsing_print_counts(ps);
#endif
	}

	return ps;
}
/**
 * Collect the distinct categories referenced by the entries of the
 * expression queue of `ps`.
 *
 * Each queue entry contributes `entry->answers->ccat`; a category that
 * occurs several times is stored only once, in order of first
 * occurrence. The sequence is allocated from `pool` with room for one
 * element per queue entry and its length is then trimmed to the number
 * of distinct categories.
 */
PGF_API PgfCCats*
pgf_get_parse_roots(PgfParsing* ps, GuPool* pool)
{
	size_t n_entries = gu_buf_length(ps->expr_queue);

	// Worst case: every queue entry has a different category.
	GuSeq* uniq = gu_new_seq(PgfCCat*, n_entries, pool);
	size_t n_uniq = 0;

	for (size_t i = 0; i < n_entries; i++) {
		PgfCCat* candidate =
			gu_buf_get(ps->expr_queue, PgfExprState*, i)->answers->ccat;

		// Linear scan over the categories collected so far.
		size_t j = 0;
		while (j < n_uniq) {
			if (gu_seq_get(uniq, PgfCCat*, j) == candidate) {
				break;
			}
			j++;
		}

		// j reached the end only when the candidate was not seen yet.
		if (j == n_uniq) {
			gu_seq_set(uniq, PgfCCat*, n_uniq, candidate);
			n_uniq++;
		}
	}

	uniq->len = n_uniq;
	return uniq;
}
/**
 * Build the list of sentence ranges associated with `ccat` and with the
 * categories reached by repeatedly following `ccat->conts->ccat`.
 *
 * For every category on that chain the range starts at the end offset
 * of the state owning its continuations. The end is taken from the
 * first parse state (searching from `ps->before` along `next`) in which
 * the category is recorded as completed for those continuations,
 * provided that state's start offset is not smaller than the range
 * start. Empty ranges (start == end) are not emitted.
 *
 * The result is a sequence of PgfParseRange allocated from `pool`.
 */
PGF_API GuSeq*
pgf_ccat_to_range(PgfParsing* ps, PgfCCat* ccat, GuPool* pool)
{
	GuBuf* ranges = gu_new_buf(PgfParseRange, pool);

	// The cursor is deliberately NOT reset between categories: each
	// search resumes from the state where the previous one stopped.
	PgfParseState* cursor = ps->before;

	for (; ccat->conts != NULL; ccat = ccat->conts->ccat) {
		size_t from = ccat->conts->state->end_offset;
		size_t to   = from;

		for (; cursor != NULL; cursor = cursor->next) {
			if (pgf_parsing_get_completed(cursor, ccat->conts) != ccat) {
				continue;
			}

			if (cursor->start_offset >= from) {
				to = cursor->start_offset;
			}
			// Leave the cursor on the matching state.
			break;
		}

		if (to != from) {
			PgfParseRange* r = gu_buf_extend(ranges);
			r->start = from;
			r->end   = to;
			r->field = ccat->cnccat->labels[ccat->conts->lin_idx];
		}
	}

	return gu_buf_data_seq(ranges);
}
PGF_API PgfExprEnum*
pgf_parse_with_oracle(PgfConcr* concr, PgfType* typ,
GuString sentence,

View File

@@ -6,7 +6,7 @@
typedef struct {
int start, end;
PgfCId cat;
size_t lin_idx;
GuString ann;
} PgfPhrase;
typedef struct {
@@ -46,14 +46,14 @@ pgf_metrics_lzn_symbol_token(PgfLinFuncs** funcs, PgfToken tok)
}
static void
pgf_metrics_lzn_begin_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, size_t lin_index, PgfCId fun)
pgf_metrics_lzn_begin_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, GuString ann, PgfCId fun)
{
PgfMetricsLznState* state = gu_container(funcs, PgfMetricsLznState, funcs);
gu_buf_push(state->marks, int, state->pos);
}
static void
pgf_metrics_lzn_end_phrase1(PgfLinFuncs** funcs, PgfCId cat, int fid, size_t lin_idx, PgfCId fun)
pgf_metrics_lzn_end_phrase1(PgfLinFuncs** funcs, PgfCId cat, int fid, GuString ann, PgfCId fun)
{
PgfMetricsLznState* state = gu_container(funcs, PgfMetricsLznState, funcs);
@@ -65,7 +65,7 @@ pgf_metrics_lzn_end_phrase1(PgfLinFuncs** funcs, PgfCId cat, int fid, size_t lin
phrase->start = start;
phrase->end = end;
phrase->cat = cat;
phrase->lin_idx = lin_idx;
phrase->ann = ann;
gu_buf_push(state->phrases, PgfPhrase*, phrase);
}
}
@@ -85,7 +85,7 @@ pgf_metrics_symbol_bind(PgfLinFuncs** funcs)
}
static void
pgf_metrics_lzn_end_phrase2(PgfLinFuncs** funcs, PgfCId cat, int fid, size_t lin_idx, PgfCId fun)
pgf_metrics_lzn_end_phrase2(PgfLinFuncs** funcs, PgfCId cat, int fid, GuString ann, PgfCId fun)
{
PgfMetricsLznState* state = gu_container(funcs, PgfMetricsLznState, funcs);
@@ -100,7 +100,7 @@ pgf_metrics_lzn_end_phrase2(PgfLinFuncs** funcs, PgfCId cat, int fid, size_t lin
if (phrase->start == start &&
phrase->end == end &&
strcmp(phrase->cat, cat) == 0 &&
phrase->lin_idx == lin_idx) {
strcmp(phrase->ann, ann) == 0) {
state->matches++;
break;
}

View File

@@ -220,6 +220,20 @@ pgf_category_prob(PgfPGF* pgf, PgfCId catname)
return abscat->prob;
}
/**
 * Look up the field labels of the concrete category `catname`.
 *
 * @param concr   concrete syntax whose category table is consulted
 * @param catname name of the category
 * @param n_lins  out: number of labels in the returned array; set to 0
 *                when no labels are available
 * @return pointer to the first of `*n_lins` labels owned by the
 *         category, or NULL when the category table is not loaded or
 *         does not contain `catname`
 */
PGF_API GuString*
pgf_category_fields(PgfConcr* concr, PgfCId catname, size_t *n_lins)
{
	// A NULL category table means the concrete syntax is not loaded;
	// report "no fields" instead of looking up in a NULL map.
	if (concr->cnccats == NULL) {
		*n_lins = 0;
		return NULL;
	}

	PgfCncCat* cnccat =
		gu_map_get(concr->cnccats, catname, PgfCncCat*);
	if (!cnccat) {
		*n_lins = 0;
		return NULL;
	}

	*n_lins = cnccat->n_lins;
	// `labels` is indexed as an array of GuString elsewhere, so the
	// array itself (decayed to a pointer to its first element) is the
	// GuString* to return; `&cnccat->labels` has a different type.
	return cnccat->labels;
}
PGF_API GuString
pgf_language_code(PgfConcr* concr)
{

View File

@@ -90,6 +90,9 @@ pgf_category_context(PgfPGF *gr, PgfCId catname);
PGF_API_DECL prob_t
pgf_category_prob(PgfPGF* pgf, PgfCId catname);
PGF_API GuString*
pgf_category_fields(PgfConcr* concr, PgfCId catname, size_t *n_lins);
PGF_API_DECL void
pgf_iter_functions(PgfPGF* pgf, GuMapItor* itor, GuExn* err);
@@ -163,8 +166,8 @@ pgf_lookup_morpho(PgfConcr *concr, GuString sentence,
PgfMorphoCallback* callback, GuExn* err);
typedef struct {
size_t pos;
GuString ptr;
size_t pos; // position in Unicode characters
GuString ptr; // pointer into the string
} PgfCohortSpot;
typedef struct {
@@ -203,6 +206,12 @@ pgf_parse_with_heuristics(PgfConcr* concr, PgfType* typ,
GuExn* err,
GuPool* pool, GuPool* out_pool);
// One range of the parsed sentence together with the category field
// it belongs to; produced by pgf_ccat_to_range().
typedef struct {
size_t start; // offset where the range begins (taken from a parse state's end_offset)
size_t end; // offset where the range ends (taken from a parse state's start_offset)
GuString field; // label of the category field, from the category's labels array
} PgfParseRange;
typedef struct PgfOracleCallback PgfOracleCallback;
struct PgfOracleCallback {
@@ -243,11 +252,11 @@ typedef struct PgfLiteralCallback PgfLiteralCallback;
struct PgfLiteralCallback {
PgfExprProb* (*match)(PgfLiteralCallback* self, PgfConcr* concr,
size_t lin_idx,
GuString ann,
GuString sentence, size_t* poffset,
GuPool *out_pool);
GuEnum* (*predict)(PgfLiteralCallback* self, PgfConcr* concr,
size_t lin_idx,
GuString ann,
GuString prefix,
GuPool *out_pool);
};

View File

@@ -114,7 +114,7 @@ pgf_morpho_iter(PgfProductionIdx* idx,
PgfCId lemma = entry->papp->fun->absfun->name;
GuString analysis = entry->ccat->cnccat->labels[entry->lin_idx];
prob_t prob = entry->ccat->cnccat->abscat->prob +
entry->papp->fun->absfun->ep.prob;
callback->callback(callback,
@@ -234,12 +234,13 @@ typedef struct {
GuEnum en;
PgfConcr* concr;
GuString sentence;
GuString current;
size_t len;
PgfMorphoCallback* callback;
GuExn* err;
bool case_sensitive;
GuBuf* spots;
GuBuf* skip_spots;
GuBuf* empty_buf;
GuBuf* found;
} PgfCohortsState;
@@ -255,6 +256,23 @@ cmp_cohort_spot(GuOrder* self, const void* a, const void* b)
static GuOrder
pgf_cohort_spot_order[1] = {{ cmp_cohort_spot }};
/*
 * Turn every pending skip spot into a range in the `found` buffer.
 *
 * Each reported range starts at the recorded skip spot, ends at `spot`
 * and carries the state's empty buffer as its `buf`. Ranges are
 * inserted at index 0 of `found`. The list of pending skip spots is
 * flushed afterwards.
 */
static void
pgf_lookup_cohorts_report_skip(PgfCohortsState *state,
                               PgfCohortSpot* spot)
{
	size_t n_pending = gu_buf_length(state->skip_spots);
	size_t k = 0;

	while (k < n_pending) {
		PgfCohortSpot* from =
			gu_buf_index(state->skip_spots, PgfCohortSpot, k);

		PgfCohortRange* unknown = gu_buf_insert(state->found, 0);
		unknown->buf   = state->empty_buf;
		unknown->start = *from;
		unknown->end   = *spot;

		k++;
	}

	gu_buf_flush(state->skip_spots);
}
static void
pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot,
int i, int j, ptrdiff_t min, ptrdiff_t max)
@@ -291,18 +309,23 @@ pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot,
pgf_lookup_cohorts_helper(state, spot, i, k-1, min, len);
if (seq->idx != NULL && gu_buf_length(seq->idx) > 0) {
// Report unknown words
pgf_lookup_cohorts_report_skip(state, spot);
// Report the actual hit
PgfCohortRange* range = gu_buf_insert(state->found, 0);
range->start = *spot;
range->end = current;
range->buf = seq->idx;
}
while (*current.ptr != 0) {
if (!skip_space(&current.ptr, &current.pos))
break;
}
// Schedule the next search spot
while (*current.ptr != 0) {
if (!skip_space(&current.ptr, &current.pos))
break;
}
gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &current);
gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &current);
}
if (len <= max)
pgf_lookup_cohorts_helper(state, spot, k+1, j, len, max);
@@ -318,29 +341,67 @@ pgf_lookup_cohorts_enum_next(GuEnum* self, void* to, GuPool* pool)
PgfCohortsState* state = gu_container(self, PgfCohortsState, en);
while (gu_buf_length(state->found) == 0 &&
gu_buf_length(state->spots) > 0) {
gu_buf_length(state->spots) > 0) {
PgfCohortSpot spot;
gu_buf_heap_pop(state->spots, pgf_cohort_spot_order, &spot);
if (spot.ptr == state->current)
continue;
GuString next_ptr = state->sentence+state->len;
while (gu_buf_length(state->spots) > 0) {
GuString ptr =
gu_buf_index(state->spots, PgfCohortSpot, 0)->ptr;
if (ptr > spot.ptr) {
next_ptr = ptr;
break;
}
gu_buf_heap_pop(state->spots, pgf_cohort_spot_order, &spot);
}
if (*spot.ptr == 0)
break;
bool needs_report = true;
while (next_ptr > spot.ptr) {
pgf_lookup_cohorts_helper
(state, &spot,
0, gu_seq_length(state->concr->sequences)-1,
1, (state->sentence+state->len)-spot.ptr);
pgf_lookup_cohorts_helper
(state, &spot,
0, gu_seq_length(state->concr->sequences)-1,
1, (state->sentence+state->len)-spot.ptr);
if (gu_buf_length(state->found) == 0) {
// skip one character and try again
gu_utf8_decode((const uint8_t**) &spot.ptr);
spot.pos++;
gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &spot);
// got a hit -> exit
if (gu_buf_length(state->found) > 0)
break;
if (needs_report) {
// no hit, but the word must be reported as unknown.
gu_buf_push(state->skip_spots, PgfCohortSpot, spot);
needs_report = false;
}
// skip one character
const uint8_t* ptr = (const uint8_t*) spot.ptr;
GuUCS c = gu_utf8_decode(&ptr);
if (gu_ucs_is_space(c)) {
// We have encountered a space and we must report
// a new unknown word.
pgf_lookup_cohorts_report_skip(state, &spot);
spot.ptr = (GuString) ptr;
spot.pos++;
// Schedule the next search spot
while (*spot.ptr != 0) {
if (!skip_space(&spot.ptr, &spot.pos))
break;
}
gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &spot);
break;
} else {
spot.ptr = (GuString) ptr;
spot.pos++;
}
}
}
PgfCohortSpot end_spot = {state->len, state->sentence+state->len};
pgf_lookup_cohorts_report_skip(state, &end_spot);
PgfCohortRange* pRes = (PgfCohortRange*)to;
if (gu_buf_length(state->found) == 0) {
@@ -349,15 +410,19 @@ pgf_lookup_cohorts_enum_next(GuEnum* self, void* to, GuPool* pool)
pRes->end.pos = 0;
pRes->end.ptr = NULL;
pRes->buf = NULL;
state->current = NULL;
return;
} else do {
} else for (;;) {
*pRes = gu_buf_pop(state->found, PgfCohortRange);
state->current = pRes->start.ptr;
pgf_morpho_iter(pRes->buf, state->callback, state->err);
} while (gu_buf_length(state->found) > 0 &&
gu_buf_index_last(state->found, PgfCohortRange)->end.ptr == pRes->end.ptr);
if (gu_buf_length(state->found) <= 0)
break;
PgfCohortRange* last =
gu_buf_index_last(state->found, PgfCohortRange);
if (last->start.ptr != pRes->start.ptr ||
last->end.ptr != pRes->end.ptr)
break;
}
}
PGF_API GuEnum*
@@ -374,15 +439,17 @@ pgf_lookup_cohorts(PgfConcr *concr, GuString sentence,
}
PgfCohortsState* state = gu_new(PgfCohortsState, pool);
state->en.next = pgf_lookup_cohorts_enum_next;
state->concr = concr;
state->sentence= sentence;
state->len = strlen(sentence);
state->callback= callback;
state->err = err;
state->case_sensitive = pgf_is_case_sensitive(concr);
state->spots = gu_new_buf(PgfCohortSpot, pool);
state->found = gu_new_buf(PgfCohortRange, pool);
state->en.next = pgf_lookup_cohorts_enum_next;
state->concr = concr;
state->sentence = sentence;
state->len = strlen(sentence);
state->callback = callback;
state->err = err;
state->case_sensitive= pgf_is_case_sensitive(concr);
state->spots = gu_new_buf(PgfCohortSpot, pool);
state->skip_spots = gu_new_buf(PgfCohortSpot, pool);
state->empty_buf = gu_new_buf(PgfProductionIdxEntry, pool);
state->found = gu_new_buf(PgfCohortRange, pool);
PgfCohortSpot spot = {0,sentence};
while (*spot.ptr != 0) {