diff --git a/src/runtime/c/pgf/literals.c b/src/runtime/c/pgf/literals.c index 9008ccdc7..ac57b150f 100644 --- a/src/runtime/c/pgf/literals.c +++ b/src/runtime/c/pgf/literals.c @@ -177,15 +177,15 @@ typedef struct { PgfExpr expr; bool is_known; GuPool* out_pool; -} PgfMatchNameMorphoCallback; +} PgfMatchMorphoCallback; -void +static void pgf_match_name_morpho_callback(PgfMorphoCallback* self_, PgfCId lemma, GuString analysis, prob_t prob, GuExn* err) { - PgfMatchNameMorphoCallback* self = - gu_container(self_, PgfMatchNameMorphoCallback, callback); + PgfMatchMorphoCallback* self = + gu_container(self_, PgfMatchMorphoCallback, callback); PgfAbsFun* absfun = gu_seq_binsearch(self->abstract->funs, pgf_absfun_order, PgfAbsFun, lemma); @@ -277,12 +277,12 @@ pgf_match_name_lit(PgfLiteralCallback* self, PgfConcr* concr, } } - PgfMatchNameMorphoCallback clo = { { pgf_match_name_morpho_callback }, - concr->abstr, - gu_null_variant, - false, - out_pool - }; + PgfMatchMorphoCallback clo = { { pgf_match_name_morpho_callback }, + concr->abstr, + gu_null_variant, + false, + out_pool + }; pgf_lookup_morpho(concr, name, &clo.callback, NULL); if (clo.is_known) { @@ -337,6 +337,90 @@ pgf_match_name_lit(PgfLiteralCallback* self, PgfConcr* concr, PgfLiteralCallback pgf_nerc_literal_callback = { pgf_match_name_lit, pgf_predict_empty } ; +static void +pgf_match_unknown_morpho_callback(PgfMorphoCallback* self_, + PgfCId lemma, GuString analysis, prob_t prob, + GuExn* err) +{ + PgfMatchMorphoCallback* self = + gu_container(self_, PgfMatchMorphoCallback, callback); + self->is_known = true; +} + +static PgfExprProb* +pgf_match_unknown_lit(PgfLiteralCallback* self, PgfConcr* concr, + size_t lin_idx, + GuString sentence, size_t* poffset, + GuPool *out_pool) +{ + const uint8_t* buf = (uint8_t*) (sentence + *poffset); + const uint8_t* p = buf; + + PgfExprProb* ep = NULL; + + GuUCS ucs = gu_utf8_decode(&p); + if (!gu_ucs_is_upper(ucs)) { + GuPool* tmp_pool = gu_local_pool(); + GuStringBuf *sbuf = gu_string_buf(tmp_pool); + GuOut* out = gu_string_buf_out(sbuf); + GuExn* err = gu_new_exn(tmp_pool); + + gu_out_utf8(ucs, out, err); + *poffset = p - ((uint8_t*) sentence); + + ucs = gu_utf8_decode(&p); + while (ucs != 0 && !gu_ucs_is_space(ucs)) { + gu_out_utf8(ucs, out, err); + *poffset = p - ((uint8_t*) sentence); + + ucs = gu_utf8_decode(&p); + } + + GuString word = gu_string_buf_freeze(sbuf, tmp_pool); + + PgfMatchMorphoCallback clo = { { pgf_match_unknown_morpho_callback }, + concr->abstr, + gu_null_variant, + false, + out_pool + }; + pgf_lookup_morpho(concr, word, &clo.callback, NULL); + + if (!clo.is_known) { + ep = gu_new(PgfExprProb, out_pool); + ep->prob = 0; + + PgfExprApp *expr_app = + gu_new_variant(PGF_EXPR_APP, + PgfExprApp, + &ep->expr, out_pool); + GuString con = "MkSymb"; + PgfExprFun *expr_fun = + gu_new_flex_variant(PGF_EXPR_FUN, + PgfExprFun, + fun, strlen(con)+1, + &expr_app->fun, out_pool); + strcpy(expr_fun->fun, con); + PgfExprLit *expr_lit = + gu_new_variant(PGF_EXPR_LIT, + PgfExprLit, + &expr_app->arg, out_pool); + PgfLiteralStr *lit_str = + gu_new_flex_variant(PGF_LITERAL_STR, + PgfLiteralStr, + val, strlen(word)+1, + &expr_lit->lit, out_pool); + strcpy(lit_str->val, word); + } + + gu_pool_free(tmp_pool); + } + + return ep; +} + +PgfLiteralCallback pgf_unknown_literal_callback = + { pgf_match_unknown_lit, pgf_predict_empty } ; PgfCallbacksMap* pgf_new_callbacks_map(PgfConcr* concr, GuPool *pool) diff --git a/src/runtime/c/pgf/literals.h b/src/runtime/c/pgf/literals.h index bf071c202..7de17b121 100644 --- a/src/runtime/c/pgf/literals.h +++ b/src/runtime/c/pgf/literals.h @@ -6,6 +6,9 @@ // literal for named entities recognition extern PgfLiteralCallback pgf_nerc_literal_callback; +// literal for finding unknown words +extern PgfLiteralCallback pgf_unknown_literal_callback; + PgfCCat* pgf_literal_cat(PgfConcr* concr, PgfLiteral lit); diff --git a/src/runtime/c/utils/pgf-translate.c b/src/runtime/c/utils/pgf-translate.c index b99a8f3fd..00506e4cc 100644 --- a/src/runtime/c/utils/pgf-translate.c +++ b/src/runtime/c/utils/pgf-translate.c @@ -91,6 +91,8 @@ int main(int argc, char* argv[]) { pgf_new_callbacks_map(from_concr, pool); pgf_callbacks_map_add_literal(from_concr, callbacks, "PN", &pgf_nerc_literal_callback); + pgf_callbacks_map_add_literal(from_concr, callbacks, + "Symb", &pgf_unknown_literal_callback); // Create an output stream for stdout GuOut* out = gu_file_out(stdout, pool);