From a8ded63c95cd87a175ab6ab2da511f563b422ab9 Mon Sep 17 00:00:00 2001
From: "kr.angelov"
Date: Fri, 16 Aug 2013 15:02:24 +0000
Subject: [PATCH] implemented lookupMorpho for C and Python

---
 src/runtime/c/pgf/parser.c | 136 +++++++++++++++++++++++++++++++++++++
 src/runtime/c/pgf/pgf.h    |  18 +++++
 src/runtime/python/pypgf.c |  84 ++++++++++++++++++++++
 3 files changed, 238 insertions(+)

diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c
index 52fa0cf9a..ac863db0a 100644
--- a/src/runtime/c/pgf/parser.c
+++ b/src/runtime/c/pgf/parser.c
@@ -2311,6 +2311,142 @@ pgf_parser_add_literal(PgfConcr *concr, PgfCId cat,
 	           PgfLiteralCallback*, callback);
 }
 
+/* Closure handed to gu_map_iter() by pgf_lookup_morpho(). `fn` must
+ * remain the first member: pgf_morpho_iter() casts the GuMapItor* it
+ * receives back to a PgfMorphoFn*. */
+typedef struct {
+	GuMapItor fn;
+	PgfTokens tokens;
+	PgfMorphoCallback* callback;
+} PgfMorphoFn;
+
+/* Returns 1 when the PGF_SYMBOL_KS tokens of `seq`, taken in order, are
+ * exactly the tokens in `tokens`, and 0 otherwise. Symbols of any other
+ * kind are skipped and do not take part in the comparison. */
+static int
+pgf_morpho_match_seq(PgfSequence seq, PgfTokens tokens)
+{
+	size_t n_tokens = gu_seq_length(tokens);
+	size_t pos = 0;
+
+	size_t n_syms = gu_seq_length(seq);
+	for (size_t i = 0; i < n_syms; i++) {
+		PgfSymbol sym = gu_seq_get(seq, PgfSymbol, i);
+
+		GuVariantInfo si = gu_variant_open(sym);
+		if (si.tag != PGF_SYMBOL_KS)
+			continue;
+
+		PgfSymbolKS* symks = si.data;
+		size_t n_ks = gu_seq_length(symks->tokens);
+		for (size_t j = 0; j < n_ks; j++) {
+			// the production has more tokens than the input
+			if (pos >= n_tokens)
+				return 0;
+
+			PgfToken tok1 = gu_seq_get(symks->tokens, PgfToken, j);
+			PgfToken tok2 = gu_seq_get(tokens, PgfToken, pos++);
+			if (!gu_string_eq(tok1, tok2))
+				return 0;
+		}
+	}
+
+	// every input token must have been consumed as well
+	return (pos == n_tokens);
+}
+
+/* gu_map_iter() callback used by pgf_lookup_morpho().
+ *
+ * `key` points to a PgfCFCat and `value` to the PgfProductionSeq stored
+ * for it. For every PGF_PRODUCTION_APPLY production whose linearization
+ * at cfc.lin_idx matches the looked-up tokens, the user callback is
+ * called with the abstract function name as lemma, the label of that
+ * linearization field as analysis, and the function's probability.
+ * Iteration over the productions stops as soon as the callback has
+ * raised an exception on `err`. */
+static void
+pgf_morpho_iter(GuMapItor* fn, const void* key, void* value, GuExn* err)
+{
+	PgfMorphoFn* clo = (PgfMorphoFn*) fn;
+	PgfCFCat cfc = *((PgfCFCat*) key);
+	PgfProductionSeq prods = *((PgfProductionSeq*) value);
+
+	if (gu_seq_is_null(prods))
+		return;
+
+	GuString analysis = cfc.ccat->cnccat->labels[cfc.lin_idx];
+
+	size_t n_prods = gu_seq_length(prods);
+	for (size_t i = 0; i < n_prods; i++) {
+		PgfProduction prod =
+			gu_seq_get(prods, PgfProduction, i);
+
+		GuVariantInfo pi = gu_variant_open(prod);
+		if (pi.tag != PGF_PRODUCTION_APPLY)
+			continue;
+
+		PgfProductionApply* papp = pi.data;
+		if (!pgf_morpho_match_seq(papp->fun->lins[cfc.lin_idx],
+		                          clo->tokens))
+			continue;
+
+		PgfCId lemma = papp->fun->absfun->name;
+		prob_t prob = papp->fun->absfun->ep.prob;
+		clo->callback->callback(clo->callback, clo->tokens,
+		                        lemma, analysis, prob, err);
+
+		// the callback failed: do not invoke it again
+		if (gu_exn_is_raised(err))
+			return;
+	}
+}
+
+/* Looks up the tokens produced by `lexer` in the lexicon of `concr` and
+ * reports every analysis found through `callback`.
+ *
+ * The first token selects a production index in
+ * concr->leftcorner_tok_idx; when there is none, nothing is reported.
+ * Otherwise the rest of the input is read and every entry of the index
+ * is matched against the complete token sequence.
+ *
+ * PgfExn is raised on `err` when the first token cannot be read.
+ * NOTE(review): any later exception from the lexer is taken to mean
+ * "end of input" - confirm the lexer cannot fail in another way here. */
+void
+pgf_lookup_morpho(PgfConcr *concr, PgfLexer *lexer,
+                  PgfMorphoCallback* callback, GuExn* err)
+{
+	GuPool* tmp_pool = gu_local_pool();
+
+	GuBuf* tokens = gu_new_buf(PgfToken, tmp_pool);
+	GuExn* lex_err = gu_new_exn(NULL, gu_kind(type), tmp_pool);
+
+	PgfToken tok = pgf_lexer_read_token(lexer, lex_err);
+	if (gu_exn_is_raised(lex_err)) {
+		gu_raise(err, PgfExn);
+		gu_pool_free(tmp_pool);
+		return;
+	}
+
+	PgfProductionIdx* lexicon_idx =
+		gu_map_get(concr->leftcorner_tok_idx, &tok, PgfProductionIdx*);
+	if (lexicon_idx == NULL) {
+		gu_pool_free(tmp_pool);
+		return;
+	}
+
+	// collect the whole input; the loop ends when the lexer raises
+	do {
+		gu_buf_push(tokens, PgfToken, tok);
+		tok = pgf_lexer_read_token(lexer, lex_err);
+	} while (!gu_exn_is_raised(lex_err));
+
+	PgfMorphoFn clo = { { pgf_morpho_iter }, gu_buf_seq(tokens), callback };
+	gu_map_iter(lexicon_idx, &clo.fn, err);
+
+	gu_pool_free(tmp_pool);
+}
+
 static void
 pgf_parser_leftcorner_add_token(PgfConcr* concr,
                                 PgfTokens tokens, PgfItem* item,
diff --git a/src/runtime/c/pgf/pgf.h b/src/runtime/c/pgf/pgf.h
index a575a5188..40ae1ca45 100644
--- a/src/runtime/c/pgf/pgf.h
+++ b/src/runtime/c/pgf/pgf.h
@@ -124,6 +124,24 @@ PgfExprEnum*
 pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer,
           GuPool* pool, GuPool* out_pool);
 
+/* Callback object for pgf_lookup_morpho(). `callback` is invoked once
+ * for every analysis found, with the looked-up tokens, the lemma, the
+ * analysis string and the probability; `self` is the object itself, so
+ * the struct can be embedded at the start of a larger closure. */
+typedef struct PgfMorphoCallback PgfMorphoCallback;
+struct PgfMorphoCallback {
+	void (*callback)(PgfMorphoCallback* self, PgfTokens tokens,
+	                 PgfCId lemma, GuString analysis, prob_t prob,
+	                 GuExn* err);
+};
+
+/* Reads tokens from `lexer`, looks them up in the lexicon of `concr`
+ * and reports every analysis through `callback`. Failures are
+ * reported on `err`. */
+void
+pgf_lookup_morpho(PgfConcr *concr, PgfLexer *lexer,
+                  PgfMorphoCallback* callback, GuExn* err);
+
 PgfExprEnum*
 pgf_parse_with_heuristics(PgfConcr* concr, PgfCId cat, PgfLexer *lexer,
                           double heuristics,
diff --git a/src/runtime/python/pypgf.c b/src/runtime/python/pypgf.c
index fd2b39906..5b4ae9193 100644
--- a/src/runtime/python/pypgf.c
+++ b/src/runtime/python/pypgf.c
@@ -1684,6 +1684,118 @@ Concr_graphvizParseTree(ConcrObject* self, PyObject *args) {
 	return pystr;
 }
 
+/* Closure for pgf_lookup_morpho(): the callback header followed by the
+ * Python list that collects the results. `fn` must stay the first
+ * member because pypgf_collect_morpho() casts the PgfMorphoCallback*
+ * back to a PyMorphoCallback*. */
+typedef struct {
+	PgfMorphoCallback fn;
+	PyObject* analyses;
+} PyMorphoCallback;
+
+/* Appends the tuple (lemma, analysis, prob) to the result list of the
+ * enclosing PyMorphoCallback. PgfExn is raised on `err` when one of
+ * the Python objects cannot be created or appended; the Python
+ * exception set by the failing call is left pending. */
+static void
+pypgf_collect_morpho(PgfMorphoCallback* self, PgfTokens tokens,
+                     PgfCId lemma, GuString analysis, prob_t prob,
+                     GuExn* err)
+{
+	PyMorphoCallback* callback = (PyMorphoCallback*) self;
+
+	PyObject* py_lemma = gu2py_string(lemma);
+	PyObject* py_analysis = gu2py_string(analysis);
+	PyObject* res = NULL;
+
+	// NOTE(review): gu2py_string() is assumed to return NULL on failure
+	if (py_lemma != NULL && py_analysis != NULL)
+		res = Py_BuildValue("OOf", py_lemma, py_analysis, prob);
+
+	if (res == NULL ||
+	    PyList_Append(callback->analyses, res) != 0) {
+		gu_raise(err, PgfExn);
+	}
+
+	// any of these may be NULL after a failure above
+	Py_XDECREF(py_lemma);
+	Py_XDECREF(py_analysis);
+	Py_XDECREF(res);
+}
+
+/* Concr.lookupMorpho(sentence=None, tokens=None)
+ *
+ * Exactly one of the two arguments must be given: `sentence` is a
+ * string that is read through pgf_new_simple_lexer(), `tokens` is an
+ * iterable that is read through pypgf_new_python_lexer(). Returns the
+ * list of (lemma, analysis, prob) tuples collected from
+ * pgf_lookup_morpho(), or NULL with a Python exception set on failure. */
+static PyObject*
+Concr_lookupMorpho(ConcrObject* self, PyObject *args, PyObject *keywds) {
+	static char *kwlist[] = {"sentence", "tokens", NULL};
+
+	// NOTE(review): "s#" stores an int only when PY_SSIZE_T_CLEAN is
+	// not defined before Python.h is included - confirm for this file
+	int len;
+	const uint8_t *buf = NULL;
+	PyObject* py_lexer = NULL;
+	if (!PyArg_ParseTupleAndKeywords(args, keywds, "|s#O", kwlist,
+	                                 &buf, &len, &py_lexer))
+		return NULL;
+
+	if ((buf == NULL && py_lexer == NULL) ||
+	    (buf != NULL && py_lexer != NULL)) {
+		PyErr_SetString(PyExc_TypeError, "either the sentence or the tokens argument must be provided");
+		return NULL;
+	}
+
+	PyObject* analyses = PyList_New(0);
+	if (analyses == NULL)
+		return NULL;
+
+	GuPool* tmp_pool = gu_local_pool();
+
+	PgfLexer *lexer = NULL;
+	if (buf != NULL) {
+		GuIn* in = gu_data_in(buf, len, tmp_pool);
+		GuReader* rdr = gu_new_utf8_reader(in, tmp_pool);
+		lexer = pgf_new_simple_lexer(rdr, tmp_pool);
+	}
+	if (py_lexer != NULL) {
+		// get an iterator out of the iterable object
+		py_lexer = PyObject_GetIter(py_lexer);
+		if (py_lexer == NULL) {
+			gu_pool_free(tmp_pool);
+			Py_DECREF(analyses);
+			return NULL;
+		}
+
+		lexer = pypgf_new_python_lexer(py_lexer, tmp_pool);
+	}
+
+	GuExn* err = gu_new_exn(NULL, gu_kind(type), tmp_pool);
+
+	PyMorphoCallback callback = { { pypgf_collect_morpho }, analyses };
+	pgf_lookup_morpho(self->concr, lexer, &callback.fn, err);
+
+	// err is allocated from tmp_pool, so its state must be read
+	// before the pool is released
+	int failed = !gu_ok(err);
+
+	Py_XDECREF(py_lexer);
+	gu_pool_free(tmp_pool);
+
+	if (failed) {
+		Py_DECREF(analyses);
+		// never return NULL without a pending Python exception
+		if (!PyErr_Occurred())
+			PyErr_SetString(PyExc_RuntimeError, "morphological lookup failed");
+		return NULL;
+	}
+
+	return analyses;
+}
+
 static PyGetSetDef Concr_getseters[] = {
     {"name", 
      (getter)Concr_getName, NULL,
@@ -1726,6 +1838,9 @@ static PyMethodDef Concr_methods[] = {
     {"graphvizParseTree", (PyCFunction)Concr_graphvizParseTree, METH_VARARGS,
      "Renders an abstract syntax tree as a parse tree in Graphviz format"
     },
+    {"lookupMorpho", (PyCFunction)Concr_lookupMorpho, METH_VARARGS | METH_KEYWORDS,
+     "Looks up a word in the lexicon of the grammar"
+    },
     {NULL}  /* Sentinel */
 };
 