From 5e2474e3467e51d46314745d855d64921e4e0e4c Mon Sep 17 00:00:00 2001 From: "kr.angelov" Date: Fri, 1 Feb 2013 09:29:43 +0000 Subject: [PATCH] This patch removes Gregoire's parse_tokens function in the python binding and adds another implementation which builds on the existing API for lexers in the C runtime. Now it is possible to write incremental Lexers in Python --- src/runtime/c/pgf/lexer.c | 46 ++++---- src/runtime/c/pgf/lexer.h | 7 +- src/runtime/c/pgf/pgf.c | 30 +----- src/runtime/c/pgf/pgf.h | 3 - src/runtime/c/utils/pgf-chunk.c | 2 +- src/runtime/c/utils/pgf-parse.c | 2 +- src/runtime/c/utils/pgf-translate.c | 2 +- src/runtime/python/pypgf.c | 158 +++++++++++++--------------- 8 files changed, 113 insertions(+), 137 deletions(-) diff --git a/src/runtime/c/pgf/lexer.c b/src/runtime/c/pgf/lexer.c index 15caab151..d50098072 100644 --- a/src/runtime/c/pgf/lexer.c +++ b/src/runtime/c/pgf/lexer.c @@ -3,26 +3,15 @@ #include #include -struct PgfLexer { +typedef struct { + PgfLexer base; GuReader* rdr; GuPool* pool; GuUCS ucs; - PgfToken tok; -}; - -PgfLexer* -pgf_new_lexer(GuReader *rdr, GuPool *pool) -{ - PgfLexer* lexer = gu_new(PgfLexer, pool); - lexer->rdr = rdr; - lexer->pool = pool; - lexer->ucs = ' '; - lexer->tok = gu_empty_string; - return lexer; -} +} PgfSimpleLexer; static void -pgf_lexer_read_ucs(PgfLexer *lexer, GuExn* err) +pgf_lexer_read_ucs(PgfSimpleLexer *lexer, GuExn* err) { lexer->ucs = gu_read_ucs(lexer->rdr, err); if (gu_exn_is_raised(err)) { @@ -31,9 +20,10 @@ pgf_lexer_read_ucs(PgfLexer *lexer, GuExn* err) } } -PgfToken -pgf_lexer_read_token(PgfLexer *lexer, GuExn* err) +static PgfToken +pgf_simple_lexer_read_token(PgfLexer *base, GuExn* err) { + PgfSimpleLexer* lexer = (PgfSimpleLexer*) base; GuPool* tmp_pool = gu_new_pool(); GuStringBuf* buf = gu_string_buf(tmp_pool); @@ -107,10 +97,28 @@ pgf_lexer_read_token(PgfLexer *lexer, GuExn* err) } stop: - lexer->tok = gu_string_buf_freeze(buf, lexer->pool); + lexer->base.tok = gu_string_buf_freeze(buf, lexer->pool); gu_pool_free(tmp_pool); - return lexer->tok; + return lexer->base.tok; +} + +PgfLexer* +pgf_new_simple_lexer(GuReader *rdr, GuPool *pool) +{ + PgfSimpleLexer* lexer = gu_new(PgfSimpleLexer, pool); + lexer->base.read_token = pgf_simple_lexer_read_token; + lexer->base.tok = gu_empty_string; + lexer->rdr = rdr; + lexer->pool = pool; + lexer->ucs = ' '; + return ((PgfLexer*) lexer); +} + +PgfToken +pgf_lexer_read_token(PgfLexer *lexer, GuExn* err) +{ + return lexer->read_token(lexer, err); } PgfToken diff --git a/src/runtime/c/pgf/lexer.h b/src/runtime/c/pgf/lexer.h index 6f01d4d10..f89629cea 100644 --- a/src/runtime/c/pgf/lexer.h +++ b/src/runtime/c/pgf/lexer.h @@ -6,10 +6,13 @@ /// A single lexical token typedef GuString PgfToken; -typedef struct PgfLexer PgfLexer; +typedef struct { + PgfToken (*read_token)(); + PgfToken tok; +} PgfLexer; PgfLexer* -pgf_new_lexer(GuReader *rdr, GuPool *pool); +pgf_new_simple_lexer(GuReader *rdr, GuPool *pool); PgfToken pgf_lexer_read_token(PgfLexer *lexer, GuExn* err); diff --git a/src/runtime/c/pgf/pgf.c b/src/runtime/c/pgf/pgf.c index 2b720f093..6c6872867 100644 --- a/src/runtime/c/pgf/pgf.c +++ b/src/runtime/c/pgf/pgf.c @@ -223,37 +223,13 @@ pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool) tok = pgf_lexer_read_token(lexer, lex_err); } + if (gu_exn_caught(lex_err) != gu_type(GuEOF)) + return NULL; + // Now begin enumerating the resulting syntax trees return pgf_parse_result(state, pool); } -// Same as previous but accept a list of tokens as input instead of a -// lexer -GuEnum* -pgf_parse_tokens(PgfConcr* concr, PgfCId cat, char **tokens, int len, GuPool* pool) -{ - // Begin parsing a sentence of the specified category - PgfParseState* state = - pgf_parser_init_state(concr, cat, 0, pool); - if (state == NULL) { - return NULL; - } - - // Parsing - PgfToken tok; - for (int i = 0; i < len; i++) { - tok = gu_str_string(tokens[i], pool); - - state = pgf_parser_next_state(state, tok, pool); - if (state == NULL) { - return NULL; - } - } - - // Now begin enumerating the resulting syntax trees - return pgf_parse_result(state, pool); -} - void pgf_print_chunks(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool) { diff --git a/src/runtime/c/pgf/pgf.h b/src/runtime/c/pgf/pgf.h index afef6ec48..1f3947bff 100644 --- a/src/runtime/c/pgf/pgf.h +++ b/src/runtime/c/pgf/pgf.h @@ -115,9 +115,6 @@ pgf_linearize(PgfConcr* concr, PgfExpr expr, GuWriter* wtr, GuExn* err); PgfExprEnum* pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool); -PgfExprEnum* -pgf_parse_tokens(PgfConcr* concr, PgfCId cat, char* tokens[], int len, GuPool* pool); - PgfExprEnum* pgf_generate(PgfPGF* pgf, PgfCId cat, GuPool* pool); diff --git a/src/runtime/c/utils/pgf-chunk.c b/src/runtime/c/utils/pgf-chunk.c index fada1c0b4..5f4b8972a 100644 --- a/src/runtime/c/utils/pgf-chunk.c +++ b/src/runtime/c/utils/pgf-chunk.c @@ -98,7 +98,7 @@ int main(int argc, char* argv[]) { GuReader *rdr = gu_string_reader(gu_str_string(line, ppool), ppool); PgfLexer *lexer = - pgf_new_lexer(rdr, ppool); + pgf_new_simple_lexer(rdr, ppool); pgf_print_chunks(from_concr, cat, lexer, ppool); diff --git a/src/runtime/c/utils/pgf-parse.c b/src/runtime/c/utils/pgf-parse.c index 4e1444806..648295312 100644 --- a/src/runtime/c/utils/pgf-parse.c +++ b/src/runtime/c/utils/pgf-parse.c @@ -123,7 +123,7 @@ int main(int argc, char* argv[]) { } GuReader *rdr = gu_string_reader(gu_str_string(line, ppool), ppool); - PgfLexer *lexer = pgf_new_lexer(rdr, ppool); + PgfLexer *lexer = pgf_new_simple_lexer(rdr, ppool); GuEnum* result = pgf_parse(concr, cat, lexer, ppool); PgfExprProb* ep = NULL; diff --git a/src/runtime/c/utils/pgf-translate.c b/src/runtime/c/utils/pgf-translate.c index ea3cca5af..ac427cb0e 100644 --- a/src/runtime/c/utils/pgf-translate.c +++ b/src/runtime/c/utils/pgf-translate.c @@ -164,7 +164,7 @@ int main(int argc, char* argv[]) { GuReader *rdr = gu_string_reader(gu_str_string(line, ppool), ppool); PgfLexer *lexer = - pgf_new_lexer(rdr, ppool); + pgf_new_simple_lexer(rdr, ppool); clock_t start = clock(); diff --git a/src/runtime/python/pypgf.c b/src/runtime/python/pypgf.c index 997f3d3f7..4b2ab5891 100644 --- a/src/runtime/python/pypgf.c +++ b/src/runtime/python/pypgf.c @@ -581,22 +581,79 @@ Concr_printName(ConcrObject* self, PyObject *args) return pyname; } +typedef struct { + PgfLexer base; + PyObject* pylexer; + GuPool* pool; +} PgfPythonLexer; + +GU_DEFINE_TYPE(PyPgfLexerExn, abstract, _); + +static PgfToken +pypgf_python_lexer_read_token(PgfLexer *base, GuExn* err) +{ + PgfPythonLexer* lexer = (PgfPythonLexer*) base; + lexer->base.tok = gu_empty_string; + + PyObject* item = PyIter_Next(lexer->pylexer); + if (item == NULL) + if (PyErr_Occurred() != NULL) + gu_raise(err, PyPgfLexerExn); + else + gu_raise(err, GuEOF); + else { + const char* str = PyString_AsString(item); + if (str == NULL) + gu_raise(err, PyPgfLexerExn); + else + lexer->base.tok = gu_str_string(str, lexer->pool); + } + + return lexer->base.tok; +} + +static PgfLexer* +pypgf_new_python_lexer(PyObject* pylexer, GuPool* pool) +{ + PgfPythonLexer* lexer = gu_new(PgfPythonLexer, pool); + lexer->base.read_token = pypgf_python_lexer_read_token; + lexer->base.tok = gu_empty_string; + lexer->pylexer = pylexer; + lexer->pool = pool; + return ((PgfLexer*) lexer); +} + static ExprIterObject* Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds) { - static char *kwlist[] = {"sentence", "cat", "n", NULL}; + static char *kwlist[] = {"sentence", "tokens", "cat", "n", NULL}; size_t len; - const uint8_t *buf; + const uint8_t *buf = NULL; + PyObject* py_lexer = NULL; const char *catname_s = NULL; int max_count = -1; - if (!PyArg_ParseTupleAndKeywords(args, keywds, "s#|si", kwlist, - &buf, &len, &catname_s, &max_count)) + if (!PyArg_ParseTupleAndKeywords(args, keywds, "|s#Osi", kwlist, + &buf, &len, &py_lexer, &catname_s, &max_count)) return NULL; + if ((buf == NULL && py_lexer == NULL) || + (buf != NULL && py_lexer != NULL)) { + PyErr_SetString(PyExc_TypeError, "either the sentence or the tokens argument must be provided"); + return NULL; + } + + if (py_lexer != NULL) { + // get an iterator out of the iterable object + py_lexer = PyObject_GetIter(py_lexer); + if (py_lexer == NULL) + return NULL; + } + ExprIterObject* pyres = (ExprIterObject*) pgf_ExprIterType.tp_alloc(&pgf_ExprIterType, 0); if (pyres == NULL) { + Py_XDECREF(py_lexer); return NULL; } @@ -608,18 +665,26 @@ Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds) pyres->counter = 0; GuPool *tmp_pool = gu_local_pool(); - GuString catname = + GuString catname = (catname_s == NULL) ? pgf_start_cat(self->grammar->pgf, tmp_pool) : gu_str_string(catname_s, tmp_pool); - GuIn* in = gu_data_in(buf, len, tmp_pool); - GuReader* rdr = gu_new_utf8_reader(in, tmp_pool); - PgfLexer *lexer = - pgf_new_lexer(rdr, tmp_pool); + + PgfLexer *lexer = NULL; + if (buf != NULL) { + GuIn* in = gu_data_in(buf, len, tmp_pool); + GuReader* rdr = gu_new_utf8_reader(in, tmp_pool); + lexer = pgf_new_simple_lexer(rdr, tmp_pool); + } + if (py_lexer != NULL) { + lexer = pypgf_new_python_lexer(py_lexer, tmp_pool); + } pyres->res = pgf_parse(self->concr, catname, lexer, pyres->pool); + if (pyres->res == NULL) { Py_DECREF(pyres); + pyres = NULL; PgfToken tok = pgf_lexer_current_token(lexer); @@ -633,84 +698,14 @@ Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds) PyString_AsString(py_tok)); Py_DECREF(py_tok); } - - gu_pool_free(tmp_pool); - return NULL; } + Py_XDECREF(py_lexer); gu_pool_free(tmp_pool); return pyres; } -// Concr_parse_tokens is the same as the above function but -// instead of a string it expect a sequence of tokens as argument. -// This is usefull if you want to implement your own tokenizer in -// python. -static ExprIterObject* -Concr_parse_tokens(ConcrObject* self, PyObject *args, PyObject *keywds) -{ - static char *kwlist[] = {"tokens", "cat", "n", NULL}; - // Variable for the input list of tokens - PyObject* obj; - PyObject* seq; - int len; - const char *catname_s = NULL; - int max_count = -1; - - // Parsing arguments: the tokens is a python object (O), - // cat is a string (s) and n an integer (i) - if (!PyArg_ParseTupleAndKeywords(args, keywds, "O|si", kwlist, - &obj, &catname_s, &max_count)) - return NULL; - // The python object should be a sequence - seq = PySequence_Fast(obj, "expected a sequence"); - len = PySequence_Size(obj); - - ExprIterObject* pyres = (ExprIterObject*) - pgf_ExprIterType.tp_alloc(&pgf_ExprIterType, 0); - if (pyres == NULL) { - return NULL; - } - - pyres->pool = gu_new_pool(); - pyres->max_count = max_count; - pyres->counter = 0; - - GuPool *tmp_pool = gu_local_pool(); - GuString catname = - (catname_s == NULL) ? pgf_start_cat(self->grammar->pgf, tmp_pool) - : gu_str_string(catname_s, tmp_pool); - - // turn the (python) list of tokens into a string array - char* tokens[len]; - for (int i = 0; i < len; i++) { - tokens[i] = PyString_AsString(PySequence_Fast_GET_ITEM(seq, i)); - if (tokens[i] == NULL) { - // Note: if the list item is not a string, - // PyString_AsString raises TypeError itself - // so we just have to return - gu_pool_free(tmp_pool); - return NULL; - } - } - Py_DECREF(seq); - - pyres->res = - pgf_parse_tokens(self->concr, catname, tokens, len, pyres->pool); - - if (pyres->res == NULL) { - Py_DECREF(pyres); - - PyErr_SetString(PGFError, "Something went wrong during parsing"); - gu_pool_free(tmp_pool); - return NULL; - } - - gu_pool_free(tmp_pool); - return pyres; -} - static PyObject* Concr_linearize(ConcrObject* self, PyObject *args) { @@ -743,9 +738,6 @@ static PyMethodDef Concr_methods[] = { {"parse", (PyCFunction)Concr_parse, METH_VARARGS | METH_KEYWORDS, "Parses a string and returns an iterator over the abstract trees for this sentence" }, - {"parse_tokens", (PyCFunction)Concr_parse_tokens, METH_VARARGS | METH_KEYWORDS, - "Parses list of tokens and returns an iterator over the abstract trees for this sentence. Allows you to write your own tokenizer in python." - }, {"linearize", (PyCFunction)Concr_linearize, METH_VARARGS, "Takes an abstract tree and linearizes it to a sentence" },