forked from GitHub/gf-core
This patch removes Gregoire's parse_tokens function from the Python binding and adds another implementation that builds on the existing lexer API in the C runtime. It is now possible to write incremental lexers in Python.
This commit is contained in:
@@ -3,26 +3,15 @@
|
|||||||
#include <pgf/data.h>
|
#include <pgf/data.h>
|
||||||
#include <wctype.h>
|
#include <wctype.h>
|
||||||
|
|
||||||
struct PgfLexer {
|
typedef struct {
|
||||||
|
PgfLexer base;
|
||||||
GuReader* rdr;
|
GuReader* rdr;
|
||||||
GuPool* pool;
|
GuPool* pool;
|
||||||
GuUCS ucs;
|
GuUCS ucs;
|
||||||
PgfToken tok;
|
} PgfSimpleLexer;
|
||||||
};
|
|
||||||
|
|
||||||
PgfLexer*
|
|
||||||
pgf_new_lexer(GuReader *rdr, GuPool *pool)
|
|
||||||
{
|
|
||||||
PgfLexer* lexer = gu_new(PgfLexer, pool);
|
|
||||||
lexer->rdr = rdr;
|
|
||||||
lexer->pool = pool;
|
|
||||||
lexer->ucs = ' ';
|
|
||||||
lexer->tok = gu_empty_string;
|
|
||||||
return lexer;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
pgf_lexer_read_ucs(PgfLexer *lexer, GuExn* err)
|
pgf_lexer_read_ucs(PgfSimpleLexer *lexer, GuExn* err)
|
||||||
{
|
{
|
||||||
lexer->ucs = gu_read_ucs(lexer->rdr, err);
|
lexer->ucs = gu_read_ucs(lexer->rdr, err);
|
||||||
if (gu_exn_is_raised(err)) {
|
if (gu_exn_is_raised(err)) {
|
||||||
@@ -31,9 +20,10 @@ pgf_lexer_read_ucs(PgfLexer *lexer, GuExn* err)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
PgfToken
|
static PgfToken
|
||||||
pgf_lexer_read_token(PgfLexer *lexer, GuExn* err)
|
pgf_simple_lexer_read_token(PgfLexer *base, GuExn* err)
|
||||||
{
|
{
|
||||||
|
PgfSimpleLexer* lexer = (PgfSimpleLexer*) base;
|
||||||
GuPool* tmp_pool = gu_new_pool();
|
GuPool* tmp_pool = gu_new_pool();
|
||||||
|
|
||||||
GuStringBuf* buf = gu_string_buf(tmp_pool);
|
GuStringBuf* buf = gu_string_buf(tmp_pool);
|
||||||
@@ -107,10 +97,28 @@ pgf_lexer_read_token(PgfLexer *lexer, GuExn* err)
|
|||||||
}
|
}
|
||||||
|
|
||||||
stop:
|
stop:
|
||||||
lexer->tok = gu_string_buf_freeze(buf, lexer->pool);
|
lexer->base.tok = gu_string_buf_freeze(buf, lexer->pool);
|
||||||
|
|
||||||
gu_pool_free(tmp_pool);
|
gu_pool_free(tmp_pool);
|
||||||
return lexer->tok;
|
return lexer->base.tok;
|
||||||
|
}
|
||||||
|
|
||||||
|
PgfLexer*
|
||||||
|
pgf_new_simple_lexer(GuReader *rdr, GuPool *pool)
|
||||||
|
{
|
||||||
|
PgfSimpleLexer* lexer = gu_new(PgfSimpleLexer, pool);
|
||||||
|
lexer->base.read_token = pgf_simple_lexer_read_token;
|
||||||
|
lexer->base.tok = gu_empty_string;
|
||||||
|
lexer->rdr = rdr;
|
||||||
|
lexer->pool = pool;
|
||||||
|
lexer->ucs = ' ';
|
||||||
|
return ((PgfLexer*) lexer);
|
||||||
|
}
|
||||||
|
|
||||||
|
PgfToken
|
||||||
|
pgf_lexer_read_token(PgfLexer *lexer, GuExn* err)
|
||||||
|
{
|
||||||
|
return lexer->read_token(lexer, err);
|
||||||
}
|
}
|
||||||
|
|
||||||
PgfToken
|
PgfToken
|
||||||
|
|||||||
@@ -6,10 +6,13 @@
|
|||||||
/// A single lexical token
|
/// A single lexical token
|
||||||
typedef GuString PgfToken;
|
typedef GuString PgfToken;
|
||||||
|
|
||||||
typedef struct PgfLexer PgfLexer;
|
typedef struct {
|
||||||
|
PgfToken (*read_token)();
|
||||||
|
PgfToken tok;
|
||||||
|
} PgfLexer;
|
||||||
|
|
||||||
PgfLexer*
|
PgfLexer*
|
||||||
pgf_new_lexer(GuReader *rdr, GuPool *pool);
|
pgf_new_simple_lexer(GuReader *rdr, GuPool *pool);
|
||||||
|
|
||||||
PgfToken
|
PgfToken
|
||||||
pgf_lexer_read_token(PgfLexer *lexer, GuExn* err);
|
pgf_lexer_read_token(PgfLexer *lexer, GuExn* err);
|
||||||
|
|||||||
@@ -223,37 +223,13 @@ pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool)
|
|||||||
tok = pgf_lexer_read_token(lexer, lex_err);
|
tok = pgf_lexer_read_token(lexer, lex_err);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (gu_exn_caught(lex_err) != gu_type(GuEOF))
|
||||||
|
return NULL;
|
||||||
|
|
||||||
// Now begin enumerating the resulting syntax trees
|
// Now begin enumerating the resulting syntax trees
|
||||||
return pgf_parse_result(state, pool);
|
return pgf_parse_result(state, pool);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Same as previous but accept a list of tokens as input instead of a
|
|
||||||
// lexer
|
|
||||||
GuEnum*
|
|
||||||
pgf_parse_tokens(PgfConcr* concr, PgfCId cat, char **tokens, int len, GuPool* pool)
|
|
||||||
{
|
|
||||||
// Begin parsing a sentence of the specified category
|
|
||||||
PgfParseState* state =
|
|
||||||
pgf_parser_init_state(concr, cat, 0, pool);
|
|
||||||
if (state == NULL) {
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Parsing
|
|
||||||
PgfToken tok;
|
|
||||||
for (int i = 0; i < len; i++) {
|
|
||||||
tok = gu_str_string(tokens[i], pool);
|
|
||||||
|
|
||||||
state = pgf_parser_next_state(state, tok, pool);
|
|
||||||
if (state == NULL) {
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Now begin enumerating the resulting syntax trees
|
|
||||||
return pgf_parse_result(state, pool);
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
void
|
||||||
pgf_print_chunks(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool)
|
pgf_print_chunks(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool)
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -115,9 +115,6 @@ pgf_linearize(PgfConcr* concr, PgfExpr expr, GuWriter* wtr, GuExn* err);
|
|||||||
PgfExprEnum*
|
PgfExprEnum*
|
||||||
pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool);
|
pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool);
|
||||||
|
|
||||||
PgfExprEnum*
|
|
||||||
pgf_parse_tokens(PgfConcr* concr, PgfCId cat, char* tokens[], int len, GuPool* pool);
|
|
||||||
|
|
||||||
PgfExprEnum*
|
PgfExprEnum*
|
||||||
pgf_generate(PgfPGF* pgf, PgfCId cat, GuPool* pool);
|
pgf_generate(PgfPGF* pgf, PgfCId cat, GuPool* pool);
|
||||||
|
|
||||||
|
|||||||
@@ -98,7 +98,7 @@ int main(int argc, char* argv[]) {
|
|||||||
GuReader *rdr =
|
GuReader *rdr =
|
||||||
gu_string_reader(gu_str_string(line, ppool), ppool);
|
gu_string_reader(gu_str_string(line, ppool), ppool);
|
||||||
PgfLexer *lexer =
|
PgfLexer *lexer =
|
||||||
pgf_new_lexer(rdr, ppool);
|
pgf_new_simple_lexer(rdr, ppool);
|
||||||
|
|
||||||
pgf_print_chunks(from_concr, cat, lexer, ppool);
|
pgf_print_chunks(from_concr, cat, lexer, ppool);
|
||||||
|
|
||||||
|
|||||||
@@ -123,7 +123,7 @@ int main(int argc, char* argv[]) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
GuReader *rdr = gu_string_reader(gu_str_string(line, ppool), ppool);
|
GuReader *rdr = gu_string_reader(gu_str_string(line, ppool), ppool);
|
||||||
PgfLexer *lexer = pgf_new_lexer(rdr, ppool);
|
PgfLexer *lexer = pgf_new_simple_lexer(rdr, ppool);
|
||||||
GuEnum* result = pgf_parse(concr, cat, lexer, ppool);
|
GuEnum* result = pgf_parse(concr, cat, lexer, ppool);
|
||||||
|
|
||||||
PgfExprProb* ep = NULL;
|
PgfExprProb* ep = NULL;
|
||||||
|
|||||||
@@ -164,7 +164,7 @@ int main(int argc, char* argv[]) {
|
|||||||
GuReader *rdr =
|
GuReader *rdr =
|
||||||
gu_string_reader(gu_str_string(line, ppool), ppool);
|
gu_string_reader(gu_str_string(line, ppool), ppool);
|
||||||
PgfLexer *lexer =
|
PgfLexer *lexer =
|
||||||
pgf_new_lexer(rdr, ppool);
|
pgf_new_simple_lexer(rdr, ppool);
|
||||||
|
|
||||||
clock_t start = clock();
|
clock_t start = clock();
|
||||||
|
|
||||||
|
|||||||
@@ -581,22 +581,79 @@ Concr_printName(ConcrObject* self, PyObject *args)
|
|||||||
return pyname;
|
return pyname;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
PgfLexer base;
|
||||||
|
PyObject* pylexer;
|
||||||
|
GuPool* pool;
|
||||||
|
} PgfPythonLexer;
|
||||||
|
|
||||||
|
GU_DEFINE_TYPE(PyPgfLexerExn, abstract, _);
|
||||||
|
|
||||||
|
static PgfToken
|
||||||
|
pypgf_python_lexer_read_token(PgfLexer *base, GuExn* err)
|
||||||
|
{
|
||||||
|
PgfPythonLexer* lexer = (PgfPythonLexer*) base;
|
||||||
|
lexer->base.tok = gu_empty_string;
|
||||||
|
|
||||||
|
PyObject* item = PyIter_Next(lexer->pylexer);
|
||||||
|
if (item == NULL)
|
||||||
|
if (PyErr_Occurred() != NULL)
|
||||||
|
gu_raise(err, PyPgfLexerExn);
|
||||||
|
else
|
||||||
|
gu_raise(err, GuEOF);
|
||||||
|
else {
|
||||||
|
const char* str = PyString_AsString(item);
|
||||||
|
if (str == NULL)
|
||||||
|
gu_raise(err, PyPgfLexerExn);
|
||||||
|
else
|
||||||
|
lexer->base.tok = gu_str_string(str, lexer->pool);
|
||||||
|
}
|
||||||
|
|
||||||
|
return lexer->base.tok;
|
||||||
|
}
|
||||||
|
|
||||||
|
static PgfLexer*
|
||||||
|
pypgf_new_python_lexer(PyObject* pylexer, GuPool* pool)
|
||||||
|
{
|
||||||
|
PgfPythonLexer* lexer = gu_new(PgfPythonLexer, pool);
|
||||||
|
lexer->base.read_token = pypgf_python_lexer_read_token;
|
||||||
|
lexer->base.tok = gu_empty_string;
|
||||||
|
lexer->pylexer = pylexer;
|
||||||
|
lexer->pool = pool;
|
||||||
|
return ((PgfLexer*) lexer);
|
||||||
|
}
|
||||||
|
|
||||||
static ExprIterObject*
|
static ExprIterObject*
|
||||||
Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds)
|
Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds)
|
||||||
{
|
{
|
||||||
static char *kwlist[] = {"sentence", "cat", "n", NULL};
|
static char *kwlist[] = {"sentence", "tokens", "cat", "n", NULL};
|
||||||
|
|
||||||
size_t len;
|
size_t len;
|
||||||
const uint8_t *buf;
|
const uint8_t *buf = NULL;
|
||||||
|
PyObject* py_lexer = NULL;
|
||||||
const char *catname_s = NULL;
|
const char *catname_s = NULL;
|
||||||
int max_count = -1;
|
int max_count = -1;
|
||||||
if (!PyArg_ParseTupleAndKeywords(args, keywds, "s#|si", kwlist,
|
if (!PyArg_ParseTupleAndKeywords(args, keywds, "|s#Osi", kwlist,
|
||||||
&buf, &len, &catname_s, &max_count))
|
&buf, &len, &py_lexer, &catname_s, &max_count))
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
|
if ((buf == NULL && py_lexer == NULL) ||
|
||||||
|
(buf != NULL && py_lexer != NULL)) {
|
||||||
|
PyErr_SetString(PyExc_TypeError, "either the sentence or the tokens argument must be provided");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (py_lexer != NULL) {
|
||||||
|
// get an iterator out of the iterable object
|
||||||
|
py_lexer = PyObject_GetIter(py_lexer);
|
||||||
|
if (py_lexer == NULL)
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
ExprIterObject* pyres = (ExprIterObject*)
|
ExprIterObject* pyres = (ExprIterObject*)
|
||||||
pgf_ExprIterType.tp_alloc(&pgf_ExprIterType, 0);
|
pgf_ExprIterType.tp_alloc(&pgf_ExprIterType, 0);
|
||||||
if (pyres == NULL) {
|
if (pyres == NULL) {
|
||||||
|
Py_XDECREF(py_lexer);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -608,18 +665,26 @@ Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds)
|
|||||||
pyres->counter = 0;
|
pyres->counter = 0;
|
||||||
|
|
||||||
GuPool *tmp_pool = gu_local_pool();
|
GuPool *tmp_pool = gu_local_pool();
|
||||||
GuString catname =
|
GuString catname =
|
||||||
(catname_s == NULL) ? pgf_start_cat(self->grammar->pgf, tmp_pool)
|
(catname_s == NULL) ? pgf_start_cat(self->grammar->pgf, tmp_pool)
|
||||||
: gu_str_string(catname_s, tmp_pool);
|
: gu_str_string(catname_s, tmp_pool);
|
||||||
GuIn* in = gu_data_in(buf, len, tmp_pool);
|
|
||||||
GuReader* rdr = gu_new_utf8_reader(in, tmp_pool);
|
PgfLexer *lexer = NULL;
|
||||||
PgfLexer *lexer =
|
if (buf != NULL) {
|
||||||
pgf_new_lexer(rdr, tmp_pool);
|
GuIn* in = gu_data_in(buf, len, tmp_pool);
|
||||||
|
GuReader* rdr = gu_new_utf8_reader(in, tmp_pool);
|
||||||
|
lexer = pgf_new_simple_lexer(rdr, tmp_pool);
|
||||||
|
}
|
||||||
|
if (py_lexer != NULL) {
|
||||||
|
lexer = pypgf_new_python_lexer(py_lexer, tmp_pool);
|
||||||
|
}
|
||||||
|
|
||||||
pyres->res =
|
pyres->res =
|
||||||
pgf_parse(self->concr, catname, lexer, pyres->pool);
|
pgf_parse(self->concr, catname, lexer, pyres->pool);
|
||||||
|
|
||||||
if (pyres->res == NULL) {
|
if (pyres->res == NULL) {
|
||||||
Py_DECREF(pyres);
|
Py_DECREF(pyres);
|
||||||
|
pyres = NULL;
|
||||||
|
|
||||||
PgfToken tok =
|
PgfToken tok =
|
||||||
pgf_lexer_current_token(lexer);
|
pgf_lexer_current_token(lexer);
|
||||||
@@ -633,84 +698,14 @@ Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds)
|
|||||||
PyString_AsString(py_tok));
|
PyString_AsString(py_tok));
|
||||||
Py_DECREF(py_tok);
|
Py_DECREF(py_tok);
|
||||||
}
|
}
|
||||||
|
|
||||||
gu_pool_free(tmp_pool);
|
|
||||||
return NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Py_XDECREF(py_lexer);
|
||||||
gu_pool_free(tmp_pool);
|
gu_pool_free(tmp_pool);
|
||||||
|
|
||||||
return pyres;
|
return pyres;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Concr_parse_tokens is the same as the above function but
|
|
||||||
// instead of a string it expects a sequence of tokens as argument.
|
|
||||||
// This is useful if you want to implement your own tokenizer in
|
|
||||||
// python.
|
|
||||||
static ExprIterObject*
|
|
||||||
Concr_parse_tokens(ConcrObject* self, PyObject *args, PyObject *keywds)
|
|
||||||
{
|
|
||||||
static char *kwlist[] = {"tokens", "cat", "n", NULL};
|
|
||||||
// Variable for the input list of tokens
|
|
||||||
PyObject* obj;
|
|
||||||
PyObject* seq;
|
|
||||||
int len;
|
|
||||||
const char *catname_s = NULL;
|
|
||||||
int max_count = -1;
|
|
||||||
|
|
||||||
// Parsing arguments: the tokens is a python object (O),
|
|
||||||
// cat is a string (s) and n an integer (i)
|
|
||||||
if (!PyArg_ParseTupleAndKeywords(args, keywds, "O|si", kwlist,
|
|
||||||
&obj, &catname_s, &max_count))
|
|
||||||
return NULL;
|
|
||||||
// The python object should be a sequence
|
|
||||||
seq = PySequence_Fast(obj, "expected a sequence");
|
|
||||||
len = PySequence_Size(obj);
|
|
||||||
|
|
||||||
ExprIterObject* pyres = (ExprIterObject*)
|
|
||||||
pgf_ExprIterType.tp_alloc(&pgf_ExprIterType, 0);
|
|
||||||
if (pyres == NULL) {
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
pyres->pool = gu_new_pool();
|
|
||||||
pyres->max_count = max_count;
|
|
||||||
pyres->counter = 0;
|
|
||||||
|
|
||||||
GuPool *tmp_pool = gu_local_pool();
|
|
||||||
GuString catname =
|
|
||||||
(catname_s == NULL) ? pgf_start_cat(self->grammar->pgf, tmp_pool)
|
|
||||||
: gu_str_string(catname_s, tmp_pool);
|
|
||||||
|
|
||||||
// turn the (python) list of tokens into a string array
|
|
||||||
char* tokens[len];
|
|
||||||
for (int i = 0; i < len; i++) {
|
|
||||||
tokens[i] = PyString_AsString(PySequence_Fast_GET_ITEM(seq, i));
|
|
||||||
if (tokens[i] == NULL) {
|
|
||||||
// Note: if the list item is not a string,
|
|
||||||
// PyString_AsString raises TypeError itself
|
|
||||||
// so we just have to return
|
|
||||||
gu_pool_free(tmp_pool);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Py_DECREF(seq);
|
|
||||||
|
|
||||||
pyres->res =
|
|
||||||
pgf_parse_tokens(self->concr, catname, tokens, len, pyres->pool);
|
|
||||||
|
|
||||||
if (pyres->res == NULL) {
|
|
||||||
Py_DECREF(pyres);
|
|
||||||
|
|
||||||
PyErr_SetString(PGFError, "Something went wrong during parsing");
|
|
||||||
gu_pool_free(tmp_pool);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
gu_pool_free(tmp_pool);
|
|
||||||
return pyres;
|
|
||||||
}
|
|
||||||
|
|
||||||
static PyObject*
|
static PyObject*
|
||||||
Concr_linearize(ConcrObject* self, PyObject *args)
|
Concr_linearize(ConcrObject* self, PyObject *args)
|
||||||
{
|
{
|
||||||
@@ -743,9 +738,6 @@ static PyMethodDef Concr_methods[] = {
|
|||||||
{"parse", (PyCFunction)Concr_parse, METH_VARARGS | METH_KEYWORDS,
|
{"parse", (PyCFunction)Concr_parse, METH_VARARGS | METH_KEYWORDS,
|
||||||
"Parses a string and returns an iterator over the abstract trees for this sentence"
|
"Parses a string and returns an iterator over the abstract trees for this sentence"
|
||||||
},
|
},
|
||||||
{"parse_tokens", (PyCFunction)Concr_parse_tokens, METH_VARARGS | METH_KEYWORDS,
|
|
||||||
"Parses list of tokens and returns an iterator over the abstract trees for this sentence. Allows you to write your own tokenizer in python."
|
|
||||||
},
|
|
||||||
{"linearize", (PyCFunction)Concr_linearize, METH_VARARGS,
|
{"linearize", (PyCFunction)Concr_linearize, METH_VARARGS,
|
||||||
"Takes an abstract tree and linearizes it to a sentence"
|
"Takes an abstract tree and linearizes it to a sentence"
|
||||||
},
|
},
|
||||||
|
|||||||
Reference in New Issue
Block a user