This patch removes Gregoire's parse_tokens function from the Python binding and adds another implementation that builds on the existing lexer API in the C runtime. It is now possible to write incremental lexers in Python.

2013-02-01 09:29:43 +00:00
parent eca4a28563
commit e9b5557c6c
8 changed files with 113 additions and 137 deletions
@@ -3,26 +3,15 @@
 #include <pgf/data.h>
 #include <wctype.h>
-struct PgfLexer {
+typedef struct {
 	PgfLexer base;
 	GuReader* rdr;
 	GuPool* pool;
 	GuUCS ucs;
-	PgfToken tok;
+} PgfSimpleLexer;
 };
 // Allocates a lexer from `pool` that reads characters from `rdr`.
 // The lookahead character starts out as a space and the current
 // token as the empty string.
 PgfLexer*
 pgf_new_lexer(GuReader *rdr, GuPool *pool)
 {
 	PgfLexer* fresh = gu_new(PgfLexer, pool);
 	// initial lexing state
 	fresh->tok = gu_empty_string;
 	fresh->ucs = ' ';
 	// input source and the pool stored alongside it
 	fresh->pool = pool;
 	fresh->rdr = rdr;
 	return fresh;
 }
 static void
-pgf_lexer_read_ucs(PgfLexer *lexer, GuExn* err)
+pgf_lexer_read_ucs(PgfSimpleLexer *lexer, GuExn* err)
 {
 	lexer->ucs = gu_read_ucs(lexer->rdr, err);
 	if (gu_exn_is_raised(err)) {
@@ -31,9 +20,10 @@ pgf_lexer_read_ucs(PgfLexer *lexer, GuExn* err)
 	}
 }
-PgfToken
+static PgfToken
-pgf_lexer_read_token(PgfLexer *lexer, GuExn* err)
+pgf_simple_lexer_read_token(PgfLexer *base, GuExn* err)
 {
 	PgfSimpleLexer* lexer = (PgfSimpleLexer*) base;
 	GuPool* tmp_pool = gu_new_pool();
 	GuStringBuf* buf = gu_string_buf(tmp_pool);
@@ -107,10 +97,28 @@ pgf_lexer_read_token(PgfLexer *lexer, GuExn* err)
 	}
 stop:
-	lexer->tok = gu_string_buf_freeze(buf, lexer->pool);
+	lexer->base.tok = gu_string_buf_freeze(buf, lexer->pool);
 	gu_pool_free(tmp_pool);
-	return lexer->tok;
+	return lexer->base.tok;
 }
 // Creates a PgfSimpleLexer in `pool` that reads from `rdr` and returns
 // it through the generic PgfLexer interface. The read_token callback
 // is set to pgf_simple_lexer_read_token; the lookahead character
 // starts as a space and the current token as the empty string.
 PgfLexer*
 pgf_new_simple_lexer(GuReader *rdr, GuPool *pool)
 {
 	PgfSimpleLexer* simple = gu_new(PgfSimpleLexer, pool);
 	// state specific to the simple lexer
 	simple->ucs = ' ';
 	simple->pool = pool;
 	simple->rdr = rdr;
 	// generic part shared by every lexer implementation
 	simple->base.tok = gu_empty_string;
 	simple->base.read_token = pgf_simple_lexer_read_token;
 	// `base` is the first member, so its address is the object's address
 	return &simple->base;
 }
 // Reads the next token by dispatching to the implementation-specific
 // callback stored in the lexer. Whatever the callback reports through
 // `err` is left untouched for the caller.
 PgfToken
 pgf_lexer_read_token(PgfLexer *lexer, GuExn* err)
 {
 	PgfToken next = lexer->read_token(lexer, err);
 	return next;
 }
 PgfToken
@@ -6,10 +6,13 @@
 /// A single lexical token			      
 typedef GuString PgfToken;
-typedef struct PgfLexer PgfLexer;
+typedef struct {
 	PgfToken (*read_token)();
 	PgfToken tok;
 } PgfLexer;
 PgfLexer*
-pgf_new_lexer(GuReader *rdr, GuPool *pool);
+pgf_new_simple_lexer(GuReader *rdr, GuPool *pool);
 PgfToken
 pgf_lexer_read_token(PgfLexer *lexer, GuExn* err);
@@ -223,37 +223,13 @@ pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool)
 		tok = pgf_lexer_read_token(lexer, lex_err);
 	}
 	if (gu_exn_caught(lex_err) != gu_type(GuEOF))
 		return NULL;
 	// Now begin enumerating the resulting syntax trees
 	return pgf_parse_result(state, pool);
 }
 // Variant of pgf_parse that takes an already tokenized sentence:
 // `tokens` is an array of `len` C strings instead of a lexer.
 // Returns NULL when the initial parse state cannot be created or
 // when pgf_parser_next_state returns NULL for one of the tokens.
 GuEnum*
 pgf_parse_tokens(PgfConcr* concr, PgfCId cat, char **tokens, int len, GuPool* pool)
 {
    // Start parsing a sentence of the requested category
    PgfParseState* ps = pgf_parser_init_state(concr, cat, 0, pool);
    if (ps == NULL)
        return NULL;
    // Feed the tokens to the parser one at a time
    int pos = 0;
    while (pos < len) {
        PgfToken next = gu_str_string(tokens[pos], pool);
        ps = pgf_parser_next_state(ps, next, pool);
        if (ps == NULL)
            return NULL;
        pos++;
    }
    // Enumerate the resulting syntax trees
    return pgf_parse_result(ps, pool);
 }
 void
 pgf_print_chunks(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool)
 {
@@ -115,9 +115,6 @@ pgf_linearize(PgfConcr* concr, PgfExpr expr, GuWriter* wtr, GuExn* err);
 PgfExprEnum*
 pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool);
 PgfExprEnum*
 pgf_parse_tokens(PgfConcr* concr, PgfCId cat, char* tokens[], int len, GuPool* pool);
 PgfExprEnum*
 pgf_generate(PgfPGF* pgf, PgfCId cat, GuPool* pool);
@@ -98,7 +98,7 @@ int main(int argc, char* argv[]) {
 		GuReader *rdr =
 			gu_string_reader(gu_str_string(line, ppool), ppool);
 		PgfLexer *lexer =
-			pgf_new_lexer(rdr, ppool);
+			pgf_new_simple_lexer(rdr, ppool);
 		pgf_print_chunks(from_concr, cat, lexer, ppool);
@@ -123,7 +123,7 @@ int main(int argc, char* argv[]) {
    }
    GuReader *rdr = gu_string_reader(gu_str_string(line, ppool), ppool);
-    PgfLexer *lexer = pgf_new_lexer(rdr, ppool);
+    PgfLexer *lexer = pgf_new_simple_lexer(rdr, ppool);
    GuEnum* result = pgf_parse(concr, cat, lexer, ppool);
    PgfExprProb* ep = NULL;
@@ -164,7 +164,7 @@ int main(int argc, char* argv[]) {
 		GuReader *rdr =
 			gu_string_reader(gu_str_string(line, ppool), ppool);
 		PgfLexer *lexer =
-			pgf_new_lexer(rdr, ppool);
+			pgf_new_simple_lexer(rdr, ppool);
 		clock_t start = clock();
@@ -581,22 +581,79 @@ Concr_printName(ConcrObject* self, PyObject *args)
 	return pyname;
 }
 // Lexer implementation whose tokens come from a Python object.
 typedef struct {
 	PgfLexer base;      // kept first: the read_token callback casts PgfLexer* to PgfPythonLexer*
 	PyObject* pylexer;  // object handed to PyIter_Next to fetch each token
 	GuPool* pool;       // pool passed to gu_str_string when a token string is created
 } PgfPythonLexer;
 // Exception type raised on the GuExn when fetching or converting a
 // Python token fails.
 GU_DEFINE_TYPE(PyPgfLexerExn, abstract, _);
 // read_token callback for PgfPythonLexer.
 //
 // Fetches the next item from the Python iterator and stores it as the
 // current token. The current token is reset to the empty string first,
 // so that is what is returned on every failure path.
 //
 // Raises on `err`:
 //   GuEOF          - the iterator is exhausted
 //   PyPgfLexerExn  - the iterator raised a Python exception, or the
 //                    item could not be converted by PyString_AsString
 //                    (the Python error indicator stays set)
 static PgfToken
 pypgf_python_lexer_read_token(PgfLexer *base, GuExn* err)
 {
 	PgfPythonLexer* lexer = (PgfPythonLexer*) base;
 	lexer->base.tok = gu_empty_string;
 	PyObject* item = PyIter_Next(lexer->pylexer);
 	if (item == NULL) {
 		// NULL means either a Python error or plain end of iteration
 		if (PyErr_Occurred() != NULL) {
 			gu_raise(err, PyPgfLexerExn);
 		} else {
 			gu_raise(err, GuEOF);
 		}
 		return lexer->base.tok;
 	}
 	const char* str = PyString_AsString(item);
 	if (str == NULL) {
 		gu_raise(err, PyPgfLexerExn);
 	} else {
 		// NOTE(review): assumes gu_str_string copies `str` into the
 		// pool, so the token stays valid after `item` is released
 		lexer->base.tok = gu_str_string(str, lexer->pool);
 	}
 	// PyIter_Next returned a new reference; release it to avoid
 	// leaking one object per token
 	Py_DECREF(item);
 	return lexer->base.tok;
 }
 // Wraps the Python object `pylexer` in a PgfLexer allocated from
 // `pool`. The pointer to `pylexer` is stored as is; this function
 // does not change its reference count.
 static PgfLexer*
 pypgf_new_python_lexer(PyObject* pylexer, GuPool* pool)
 {
 	PgfPythonLexer* wrapper = gu_new(PgfPythonLexer, pool);
 	// Python-specific state
 	wrapper->pool = pool;
 	wrapper->pylexer = pylexer;
 	// generic lexer part: empty current token and the Python callback
 	wrapper->base.tok = gu_empty_string;
 	wrapper->base.read_token = pypgf_python_lexer_read_token;
 	// `base` is the first member, so its address is the object's address
 	return &wrapper->base;
 }
 static ExprIterObject*
 Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds)
 {
-	static char *kwlist[] = {"sentence", "cat", "n", NULL};
+	static char *kwlist[] = {"sentence", "tokens", "cat", "n", NULL};
 	size_t len;
-	const uint8_t *buf;
+	const uint8_t *buf = NULL;
 	PyObject* py_lexer = NULL;
 	const char *catname_s = NULL;
 	int max_count = -1;
-    if (!PyArg_ParseTupleAndKeywords(args, keywds, "s#|si", kwlist,
+    if (!PyArg_ParseTupleAndKeywords(args, keywds, "|s#Osi", kwlist,
-                                     &buf, &len, &catname_s, &max_count))
+                                     &buf, &len, &py_lexer, &catname_s, &max_count))
        return NULL;
    if ((buf == NULL && py_lexer == NULL) || 
        (buf != NULL && py_lexer != NULL)) {
 		PyErr_SetString(PyExc_TypeError, "either the sentence or the tokens argument must be provided");
 		return NULL;
 	}
 	if (py_lexer != NULL) {
 		// get an iterator out of the iterable object
 		py_lexer = PyObject_GetIter(py_lexer);
 		if (py_lexer == NULL)
 			return NULL;
 	}
 	ExprIterObject* pyres = (ExprIterObject*) 
 		pgf_ExprIterType.tp_alloc(&pgf_ExprIterType, 0);
 	if (pyres == NULL) {
 		Py_XDECREF(py_lexer);
 		return NULL;
 	}
@@ -608,18 +665,26 @@ Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds)
 	pyres->counter   = 0;
 	GuPool *tmp_pool = gu_local_pool();
-    GuString catname = 
+    GuString catname =
 		(catname_s == NULL) ? pgf_start_cat(self->grammar->pgf, tmp_pool)
 		                    : gu_str_string(catname_s, tmp_pool);
-	GuIn* in = gu_data_in(buf, len, tmp_pool);
+
-	GuReader* rdr = gu_new_utf8_reader(in, tmp_pool);
+	PgfLexer *lexer = NULL;
-	PgfLexer *lexer =
+	if (buf != NULL) {
-		pgf_new_lexer(rdr, tmp_pool);
+		GuIn* in = gu_data_in(buf, len, tmp_pool);
 		GuReader* rdr = gu_new_utf8_reader(in, tmp_pool);
 		lexer = pgf_new_simple_lexer(rdr, tmp_pool);
 	} 
 	if (py_lexer != NULL) {
 		lexer = pypgf_new_python_lexer(py_lexer, tmp_pool);
 	}
 	pyres->res =
 		pgf_parse(self->concr, catname, lexer, pyres->pool);
 	if (pyres->res == NULL) {
 		Py_DECREF(pyres);
 		pyres = NULL;
 		PgfToken tok =
 			pgf_lexer_current_token(lexer);
@@ -633,84 +698,14 @@ Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds)
 										PyString_AsString(py_tok));
 			Py_DECREF(py_tok);
 		}
 		gu_pool_free(tmp_pool);
 		return NULL;
 	}
 	Py_XDECREF(py_lexer);
 	gu_pool_free(tmp_pool);
 	return pyres;
 }
 // Concr_parse_tokens does the same as Concr_parse, but instead of a
 // string it expects a sequence of tokens as its argument.
 // This is useful if you want to implement your own tokenizer in
 // Python.
 //
 // Python arguments: tokens (sequence of strings), cat (optional
 // category name, defaults to the grammar's start category) and
 // n (optional int stored as max_count).
 // Returns a new ExprIterObject, or NULL with a Python exception set.
 static ExprIterObject*
 Concr_parse_tokens(ConcrObject* self, PyObject *args, PyObject *keywds)
 {
    static char *kwlist[] = {"tokens", "cat", "n", NULL};
    // Variable for the input list of tokens
    PyObject* obj;
    const char *catname_s = NULL;
    int max_count = -1;
    // Parsing arguments: the tokens is a python object (O),
    // cat is a string (s) and n an integer (i)
    if (!PyArg_ParseTupleAndKeywords(args, keywds, "O|si", kwlist,
                                     &obj, &catname_s, &max_count))
        return NULL;
    // The python object should be a sequence; on failure
    // PySequence_Fast returns NULL with TypeError already set
    PyObject* seq = PySequence_Fast(obj, "expected a sequence");
    if (seq == NULL)
        return NULL;
    // Take the length from the list/tuple that is actually indexed below
    int len = (int) PySequence_Fast_GET_SIZE(seq);
    ExprIterObject* pyres = (ExprIterObject*) 
        pgf_ExprIterType.tp_alloc(&pgf_ExprIterType, 0);
    if (pyres == NULL) {
        Py_DECREF(seq);
        return NULL;
    }
    pyres->pool = gu_new_pool();
    pyres->max_count = max_count;
    pyres->counter   = 0;
    GuPool *tmp_pool = gu_local_pool();
    GuString catname = 
        (catname_s == NULL) ? pgf_start_cat(self->grammar->pgf, tmp_pool)
                            : gu_str_string(catname_s, tmp_pool);
    // turn the (python) list of tokens into a string array;
    // keep at least one slot because a zero-length VLA is undefined
    char* tokens[len > 0 ? len : 1];
    for (int i = 0; i < len; i++) {
        tokens[i] = PyString_AsString(PySequence_Fast_GET_ITEM(seq, i));
        if (tokens[i] == NULL) {
            // Note: if the list item is not a string, 
            // PyString_AsString raises TypeError itself
            // so we just have to release what we hold and return
            Py_DECREF(seq);
            Py_DECREF(pyres);
            gu_pool_free(tmp_pool);
            return NULL;
        }
    }
    pyres->res =
        pgf_parse_tokens(self->concr, catname, tokens, len, pyres->pool);
    // The char* pointers in tokens[] are borrowed from the items of
    // seq, so seq may only be released once parsing has consumed them
    Py_DECREF(seq);
    if (pyres->res == NULL) {
        Py_DECREF(pyres);
        PyErr_SetString(PGFError, "Something went wrong during parsing");
        gu_pool_free(tmp_pool);
        return NULL;
    }
    gu_pool_free(tmp_pool);
    return pyres;
 }
 static PyObject*
 Concr_linearize(ConcrObject* self, PyObject *args)
 {
@@ -743,9 +738,6 @@ static PyMethodDef Concr_methods[] = {
    {"parse", (PyCFunction)Concr_parse, METH_VARARGS | METH_KEYWORDS,
     "Parses a string and returns an iterator over the abstract trees for this sentence"
    },
    {"parse_tokens", (PyCFunction)Concr_parse_tokens, METH_VARARGS | METH_KEYWORDS,
     "Parses list of tokens and returns an iterator over the abstract trees for this sentence. Allows you to write your own tokenizer in python."
    },
    {"linearize", (PyCFunction)Concr_linearize, METH_VARARGS,
     "Takes an abstract tree and linearizes it to a sentence"
    },