From 5e2474e3467e51d46314745d855d64921e4e0e4c Mon Sep 17 00:00:00 2001
From: "kr.angelov" <kr.angelov@gmail.com>
Date: Fri, 1 Feb 2013 09:29:43 +0000
Subject: [PATCH] This patch removes Gregoire's parse_tokens function in the
 python binding and adds another implementation which builds on the existing
 API for lexers in the C runtime. Now it is possible to write incremental
 Lexers in Python

---
 src/runtime/c/pgf/lexer.c           |  46 ++++----
 src/runtime/c/pgf/lexer.h           |   7 +-
 src/runtime/c/pgf/pgf.c             |  30 +-----
 src/runtime/c/pgf/pgf.h             |   3 -
 src/runtime/c/utils/pgf-chunk.c     |   2 +-
 src/runtime/c/utils/pgf-parse.c     |   2 +-
 src/runtime/c/utils/pgf-translate.c |   2 +-
 src/runtime/python/pypgf.c          | 158 +++++++++++++---------------
 8 files changed, 113 insertions(+), 137 deletions(-)

diff --git a/src/runtime/c/pgf/lexer.c b/src/runtime/c/pgf/lexer.c
index 15caab151..d50098072 100644
--- a/src/runtime/c/pgf/lexer.c
+++ b/src/runtime/c/pgf/lexer.c
@@ -3,26 +3,15 @@
 #include <pgf/data.h>
 #include <wctype.h>
 
-struct PgfLexer {
+typedef struct {
+	PgfLexer base;
 	GuReader* rdr;
 	GuPool* pool;
 	GuUCS ucs;
-	PgfToken tok;
-};
-
-PgfLexer*
-pgf_new_lexer(GuReader *rdr, GuPool *pool)
-{
-	PgfLexer* lexer = gu_new(PgfLexer, pool);
-	lexer->rdr = rdr;
-	lexer->pool = pool;
-	lexer->ucs = ' ';
-	lexer->tok = gu_empty_string;
-	return lexer;
-}
+} PgfSimpleLexer;
 
 static void
-pgf_lexer_read_ucs(PgfLexer *lexer, GuExn* err)
+pgf_lexer_read_ucs(PgfSimpleLexer *lexer, GuExn* err)
 {
 	lexer->ucs = gu_read_ucs(lexer->rdr, err);
 	if (gu_exn_is_raised(err)) {
@@ -31,9 +20,10 @@ pgf_lexer_read_ucs(PgfLexer *lexer, GuExn* err)
 	}
 }
 
-PgfToken
-pgf_lexer_read_token(PgfLexer *lexer, GuExn* err)
+static PgfToken
+pgf_simple_lexer_read_token(PgfLexer *base, GuExn* err)
 {
+	PgfSimpleLexer* lexer = (PgfSimpleLexer*) base;
 	GuPool* tmp_pool = gu_new_pool();
 
 	GuStringBuf* buf = gu_string_buf(tmp_pool);
@@ -107,10 +97,28 @@ pgf_lexer_read_token(PgfLexer *lexer, GuExn* err)
 	}
 
 stop:
-	lexer->tok = gu_string_buf_freeze(buf, lexer->pool);
+	lexer->base.tok = gu_string_buf_freeze(buf, lexer->pool);
 
 	gu_pool_free(tmp_pool);
-	return lexer->tok;
+	return lexer->base.tok;
+}
+
+PgfLexer*
+pgf_new_simple_lexer(GuReader *rdr, GuPool *pool)	// allocates a PgfSimpleLexer in pool and returns it as its PgfLexer base
+{
+	PgfSimpleLexer* lexer = gu_new(PgfSimpleLexer, pool);
+	lexer->base.read_token = pgf_simple_lexer_read_token;	// callback invoked by pgf_lexer_read_token
+	lexer->base.tok = gu_empty_string;	// no token has been read yet
+	lexer->rdr = rdr;
+	lexer->pool = pool;	// tokens are frozen into this pool by the read_token callback
+	lexer->ucs = ' ';	// NOTE(review): presumably seeded with a space so the first read skips it as whitespace -- confirm
+	return ((PgfLexer*) lexer);	// valid because base is the first member of PgfSimpleLexer
+}
+
+PgfToken
+pgf_lexer_read_token(PgfLexer *lexer, GuExn* err)	// reads the next token through the lexer's own read_token callback
+{
+	return lexer->read_token(lexer, err);
+}
 
 PgfToken
diff --git a/src/runtime/c/pgf/lexer.h b/src/runtime/c/pgf/lexer.h
index 6f01d4d10..f89629cea 100644
--- a/src/runtime/c/pgf/lexer.h
+++ b/src/runtime/c/pgf/lexer.h
@@ -6,10 +6,13 @@
 /// A single lexical token			      
 typedef GuString PgfToken;
 
-typedef struct PgfLexer PgfLexer;
+typedef struct PgfLexer {	// base type for lexers; concrete lexers embed it as their first member
+	PgfToken (*read_token)(struct PgfLexer* lexer, GuExn* err);	// returns the next token; raises on err at end of input or on failure
+	PgfToken tok;	// most recently read token
+} PgfLexer;
 
 PgfLexer*
-pgf_new_lexer(GuReader *rdr, GuPool *pool);
+pgf_new_simple_lexer(GuReader *rdr, GuPool *pool);
 
 PgfToken
 pgf_lexer_read_token(PgfLexer *lexer, GuExn* err);
diff --git a/src/runtime/c/pgf/pgf.c b/src/runtime/c/pgf/pgf.c
index 2b720f093..6c6872867 100644
--- a/src/runtime/c/pgf/pgf.c
+++ b/src/runtime/c/pgf/pgf.c
@@ -223,37 +223,13 @@ pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool)
 		tok = pgf_lexer_read_token(lexer, lex_err);
 	}
 
+	if (gu_exn_caught(lex_err) != gu_type(GuEOF))
+		return NULL;
+
 	// Now begin enumerating the resulting syntax trees
 	return pgf_parse_result(state, pool);
 }
 
-// Same as previous but accept a list of tokens as input instead of a 
-// lexer
-GuEnum*
-pgf_parse_tokens(PgfConcr* concr, PgfCId cat, char **tokens, int len, GuPool* pool)
-{
-    // Begin parsing a sentence of the specified category
-    PgfParseState* state =
-        pgf_parser_init_state(concr, cat, 0, pool);
-    if (state == NULL) {
-        return NULL;
-    }
-
-    // Parsing
-    PgfToken tok;
-    for (int i = 0; i < len; i++) {
-        tok = gu_str_string(tokens[i], pool);
-
-        state = pgf_parser_next_state(state, tok, pool);
-        if (state == NULL) {
-            return NULL;
-        }
-    }
-
-    // Now begin enumerating the resulting syntax trees
-    return pgf_parse_result(state, pool);
-}
-
 void
 pgf_print_chunks(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool)
 {
diff --git a/src/runtime/c/pgf/pgf.h b/src/runtime/c/pgf/pgf.h
index afef6ec48..1f3947bff 100644
--- a/src/runtime/c/pgf/pgf.h
+++ b/src/runtime/c/pgf/pgf.h
@@ -115,9 +115,6 @@ pgf_linearize(PgfConcr* concr, PgfExpr expr, GuWriter* wtr, GuExn* err);
 PgfExprEnum*
 pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool);
 
-PgfExprEnum*
-pgf_parse_tokens(PgfConcr* concr, PgfCId cat, char* tokens[], int len, GuPool* pool);
-
 PgfExprEnum*
 pgf_generate(PgfPGF* pgf, PgfCId cat, GuPool* pool);
 
diff --git a/src/runtime/c/utils/pgf-chunk.c b/src/runtime/c/utils/pgf-chunk.c
index fada1c0b4..5f4b8972a 100644
--- a/src/runtime/c/utils/pgf-chunk.c
+++ b/src/runtime/c/utils/pgf-chunk.c
@@ -98,7 +98,7 @@ int main(int argc, char* argv[]) {
 		GuReader *rdr =
 			gu_string_reader(gu_str_string(line, ppool), ppool);
 		PgfLexer *lexer =
-			pgf_new_lexer(rdr, ppool);
+			pgf_new_simple_lexer(rdr, ppool);
 
 		pgf_print_chunks(from_concr, cat, lexer, ppool);
 		
diff --git a/src/runtime/c/utils/pgf-parse.c b/src/runtime/c/utils/pgf-parse.c
index 4e1444806..648295312 100644
--- a/src/runtime/c/utils/pgf-parse.c
+++ b/src/runtime/c/utils/pgf-parse.c
@@ -123,7 +123,7 @@ int main(int argc, char* argv[]) {
     }
 
     GuReader *rdr = gu_string_reader(gu_str_string(line, ppool), ppool);
-    PgfLexer *lexer = pgf_new_lexer(rdr, ppool);
+    PgfLexer *lexer = pgf_new_simple_lexer(rdr, ppool);
     GuEnum* result = pgf_parse(concr, cat, lexer, ppool);
 
     PgfExprProb* ep = NULL;
diff --git a/src/runtime/c/utils/pgf-translate.c b/src/runtime/c/utils/pgf-translate.c
index ea3cca5af..ac427cb0e 100644
--- a/src/runtime/c/utils/pgf-translate.c
+++ b/src/runtime/c/utils/pgf-translate.c
@@ -164,7 +164,7 @@ int main(int argc, char* argv[]) {
 		GuReader *rdr =
 			gu_string_reader(gu_str_string(line, ppool), ppool);
 		PgfLexer *lexer =
-			pgf_new_lexer(rdr, ppool);
+			pgf_new_simple_lexer(rdr, ppool);
 
 		clock_t start = clock();
 
diff --git a/src/runtime/python/pypgf.c b/src/runtime/python/pypgf.c
index 997f3d3f7..4b2ab5891 100644
--- a/src/runtime/python/pypgf.c
+++ b/src/runtime/python/pypgf.c
@@ -581,22 +581,79 @@ Concr_printName(ConcrObject* self, PyObject *args)
 	return pyname;
 }
 
+typedef struct {	// lexer that takes its tokens from a Python iterator
+	PgfLexer base;	// must stay first: the struct is cast to and from PgfLexer*
+	PyObject* pylexer;	// Python iterator yielding the tokens; stored without Py_INCREF, Concr_parse releases it
+	GuPool* pool;	// pool in which the token strings are created
+} PgfPythonLexer;
+
+GU_DEFINE_TYPE(PyPgfLexerExn, abstract, _);	// exception type raised on err when the Python-side lexer fails
+
+// Lexer callback: next token from the Python iterator; raises GuEOF when exhausted, PyPgfLexerExn on a Python error.
+static PgfToken
+pypgf_python_lexer_read_token(PgfLexer *base, GuExn* err)
+{
+	PgfPythonLexer* lexer = (PgfPythonLexer*) base;
+	lexer->base.tok = gu_empty_string;	// returned unchanged on every error path
+	PyObject* item = PyIter_Next(lexer->pylexer);
+	if (item == NULL) {
+		if (PyErr_Occurred() != NULL)
+			gu_raise(err, PyPgfLexerExn);	// the iterator itself raised
+		else
+			gu_raise(err, GuEOF);	// iterator exhausted normally
+	} else {
+		const char* str = PyString_AsString(item);
+		if (str == NULL)
+			gu_raise(err, PyPgfLexerExn);	// item is not a string; a Python exception is already set
+		else
+			lexer->base.tok = gu_str_string(str, lexer->pool);
+		Py_DECREF(item);	// PyIter_Next returns a new reference; NOTE(review): assumes gu_str_string copied str -- confirm
+	}
+	return lexer->base.tok;
+}
+
+static PgfLexer*
+pypgf_new_python_lexer(PyObject* pylexer, GuPool* pool)	// wraps a Python iterator in a PgfLexer allocated from pool
+{
+	PgfPythonLexer* lexer = gu_new(PgfPythonLexer, pool);
+	lexer->base.read_token = pypgf_python_lexer_read_token;	// callback invoked by pgf_lexer_read_token
+	lexer->base.tok = gu_empty_string;	// no token has been read yet
+	lexer->pylexer = pylexer;	// stored without Py_INCREF; the caller keeps ownership of the reference
+	lexer->pool = pool;
+	return ((PgfLexer*) lexer);	// valid because base is the first member of PgfPythonLexer
+}
+
 static ExprIterObject*
 Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds)
 {
-	static char *kwlist[] = {"sentence", "cat", "n", NULL};
+	static char *kwlist[] = {"sentence", "tokens", "cat", "n", NULL};
 
 	size_t len;
-	const uint8_t *buf;
+	const uint8_t *buf = NULL;
+	PyObject* py_lexer = NULL;
 	const char *catname_s = NULL;
 	int max_count = -1;
-    if (!PyArg_ParseTupleAndKeywords(args, keywds, "s#|si", kwlist,
-                                     &buf, &len, &catname_s, &max_count))
+    if (!PyArg_ParseTupleAndKeywords(args, keywds, "|s#Osi", kwlist,
+                                     &buf, &len, &py_lexer, &catname_s, &max_count))
         return NULL;
 
+    if ((buf == NULL && py_lexer == NULL) || 
+        (buf != NULL && py_lexer != NULL)) {
+		PyErr_SetString(PyExc_TypeError, "either the sentence or the tokens argument must be provided");
+		return NULL;
+	}
+
+	if (py_lexer != NULL) {
+		// get an iterator out of the iterable object
+		py_lexer = PyObject_GetIter(py_lexer);
+		if (py_lexer == NULL)
+			return NULL;
+	}
+
 	ExprIterObject* pyres = (ExprIterObject*) 
 		pgf_ExprIterType.tp_alloc(&pgf_ExprIterType, 0);
 	if (pyres == NULL) {
+		Py_XDECREF(py_lexer);
 		return NULL;
 	}
 
@@ -608,18 +665,26 @@ Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds)
 	pyres->counter   = 0;
 
 	GuPool *tmp_pool = gu_local_pool();
-    GuString catname = 
+    GuString catname =
 		(catname_s == NULL) ? pgf_start_cat(self->grammar->pgf, tmp_pool)
 		                    : gu_str_string(catname_s, tmp_pool);
-	GuIn* in = gu_data_in(buf, len, tmp_pool);
-	GuReader* rdr = gu_new_utf8_reader(in, tmp_pool);
-	PgfLexer *lexer =
-		pgf_new_lexer(rdr, tmp_pool);
+
+	PgfLexer *lexer = NULL;
+	if (buf != NULL) {
+		GuIn* in = gu_data_in(buf, len, tmp_pool);
+		GuReader* rdr = gu_new_utf8_reader(in, tmp_pool);
+		lexer = pgf_new_simple_lexer(rdr, tmp_pool);
+	} 
+	if (py_lexer != NULL) {
+		lexer = pypgf_new_python_lexer(py_lexer, tmp_pool);
+	}
 
 	pyres->res =
 		pgf_parse(self->concr, catname, lexer, pyres->pool);
+
 	if (pyres->res == NULL) {
 		Py_DECREF(pyres);
+		pyres = NULL;
 
 		PgfToken tok =
 			pgf_lexer_current_token(lexer);
@@ -633,84 +698,14 @@ Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds)
 										PyString_AsString(py_tok));
 			Py_DECREF(py_tok);
 		}
-
-		gu_pool_free(tmp_pool);
-		return NULL;
 	}
 
+	Py_XDECREF(py_lexer);
 	gu_pool_free(tmp_pool);
 
 	return pyres;
 }
 
-// Concr_parse_tokens is the same as the above function but
-// instead of a string it expect a sequence of tokens as argument.
-// This is usefull if you want to implement your own tokenizer in
-// python.
-static ExprIterObject*
-Concr_parse_tokens(ConcrObject* self, PyObject *args, PyObject *keywds)
-{
-    static char *kwlist[] = {"tokens", "cat", "n", NULL};
-    // Variable for the input list of tokens
-    PyObject* obj;
-    PyObject* seq;
-    int len;
-    const char *catname_s = NULL;
-    int max_count = -1;
-
-    // Parsing arguments: the tokens is a python object (O),
-    // cat is a string (s) and n an integer (i)
-    if (!PyArg_ParseTupleAndKeywords(args, keywds, "O|si", kwlist,
-                                    &obj, &catname_s, &max_count))
-        return NULL;
-    // The python object should be a sequence
-    seq = PySequence_Fast(obj, "expected a sequence");
-    len = PySequence_Size(obj);
-
-    ExprIterObject* pyres = (ExprIterObject*) 
-        pgf_ExprIterType.tp_alloc(&pgf_ExprIterType, 0);
-    if (pyres == NULL) {
-        return NULL;
-    }
-
-    pyres->pool = gu_new_pool();
-    pyres->max_count = max_count;
-    pyres->counter   = 0;
-
-    GuPool *tmp_pool = gu_local_pool();
-    GuString catname = 
-        (catname_s == NULL) ? pgf_start_cat(self->grammar->pgf, tmp_pool)
-                            : gu_str_string(catname_s, tmp_pool);
-
-    // turn the (python) list of tokens into a string array
-    char* tokens[len];
-    for (int i = 0; i < len; i++) {
-        tokens[i] = PyString_AsString(PySequence_Fast_GET_ITEM(seq, i));
-        if (tokens[i] == NULL) {
-            // Note: if the list item is not a string, 
-            // PyString_AsString raises TypeError itself
-            // so we just have to return
-            gu_pool_free(tmp_pool);
-            return NULL;
-        }
-    }
-    Py_DECREF(seq);
-    
-    pyres->res =
-        pgf_parse_tokens(self->concr, catname, tokens, len, pyres->pool);
-
-    if (pyres->res == NULL) {
-        Py_DECREF(pyres);
-
-        PyErr_SetString(PGFError, "Something went wrong during parsing");
-        gu_pool_free(tmp_pool);
-        return NULL;
-    }
-
-    gu_pool_free(tmp_pool);
-    return pyres;
-}
-
 static PyObject*
 Concr_linearize(ConcrObject* self, PyObject *args)
 {
@@ -743,9 +738,6 @@ static PyMethodDef Concr_methods[] = {
     {"parse", (PyCFunction)Concr_parse, METH_VARARGS | METH_KEYWORDS,
      "Parses a string and returns an iterator over the abstract trees for this sentence"
     },
-    {"parse_tokens", (PyCFunction)Concr_parse_tokens, METH_VARARGS | METH_KEYWORDS,
-     "Parses list of tokens and returns an iterator over the abstract trees for this sentence. Allows you to write your own tokenizer in python."
-    },
     {"linearize", (PyCFunction)Concr_linearize, METH_VARARGS,
      "Takes an abstract tree and linearizes it to a sentence"
     },