mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-23 11:42:49 -06:00
Python binding: add a parsing function that directly accepts a list of tokens.
This allows defining a tokenizer in Python (or using an existing one, from NLTK for instance).
This commit is contained in:
@@ -227,6 +227,33 @@ pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool)
|
|||||||
return pgf_parse_result(state, pool);
|
return pgf_parse_result(state, pool);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Same as previous but accept a list of tokens as input instead of a
|
||||||
|
// lexer
|
||||||
|
GuEnum*
|
||||||
|
pgf_parse_tokens(PgfConcr* concr, PgfCId cat, char **tokens, int len, GuPool* pool)
|
||||||
|
{
|
||||||
|
// Begin parsing a sentence of the specified category
|
||||||
|
PgfParseState* state =
|
||||||
|
pgf_parser_init_state(concr, cat, 0, pool);
|
||||||
|
if (state == NULL) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parsing
|
||||||
|
PgfToken tok;
|
||||||
|
for (int i = 0; i < len; i++) {
|
||||||
|
tok = gu_str_string(tokens[i], pool);
|
||||||
|
|
||||||
|
state = pgf_parser_next_state(state, tok, pool);
|
||||||
|
if (state == NULL) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now begin enumerating the resulting syntax trees
|
||||||
|
return pgf_parse_result(state, pool);
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
pgf_print_chunks(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool)
|
pgf_print_chunks(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool)
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -115,6 +115,9 @@ pgf_linearize(PgfConcr* concr, PgfExpr expr, GuWriter* wtr, GuExn* err);
|
|||||||
PgfExprEnum*
|
PgfExprEnum*
|
||||||
pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool);
|
pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool);
|
||||||
|
|
||||||
|
PgfExprEnum*
|
||||||
|
pgf_parse_tokens(PgfConcr* concr, PgfCId cat, char* tokens[], int len, GuPool* pool);
|
||||||
|
|
||||||
PgfExprEnum*
|
PgfExprEnum*
|
||||||
pgf_generate(PgfPGF* pgf, PgfCId cat, GuPool* pool);
|
pgf_generate(PgfPGF* pgf, PgfCId cat, GuPool* pool);
|
||||||
|
|
||||||
|
|||||||
@@ -362,6 +362,74 @@ Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds)
|
|||||||
return pyres;
|
return pyres;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Concr_parse_tokens is the same as the above function but
|
||||||
|
// instead of a string it expect a sequence of tokens as argument.
|
||||||
|
// This is usefull if you want to implement your own tokenizer in
|
||||||
|
// python.
|
||||||
|
static ExprIterObject*
|
||||||
|
Concr_parse_tokens(ConcrObject* self, PyObject *args, PyObject *keywds)
|
||||||
|
{
|
||||||
|
static char *kwlist[] = {"tokens", "cat", "n", NULL};
|
||||||
|
// Variable for the input list of tokens
|
||||||
|
PyObject* obj;
|
||||||
|
PyObject* seq;
|
||||||
|
int len;
|
||||||
|
const char *catname_s = NULL;
|
||||||
|
int max_count = -1;
|
||||||
|
|
||||||
|
// Parsing arguments: the tokens is a python object (O),
|
||||||
|
// cat is a string (s) and n an integer (i)
|
||||||
|
if (!PyArg_ParseTupleAndKeywords(args, keywds, "O|si", kwlist,
|
||||||
|
&obj, &catname_s, &max_count))
|
||||||
|
return NULL;
|
||||||
|
// The python object should be a sequence
|
||||||
|
seq = PySequence_Fast(obj, "expected a sequence");
|
||||||
|
len = PySequence_Size(obj);
|
||||||
|
|
||||||
|
ExprIterObject* pyres = (ExprIterObject*)
|
||||||
|
pgf_ExprType.tp_alloc(&pgf_ExprIterType, 0);
|
||||||
|
if (pyres == NULL) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
pyres->pool = gu_new_pool();
|
||||||
|
pyres->max_count = max_count;
|
||||||
|
pyres->counter = 0;
|
||||||
|
|
||||||
|
GuPool *tmp_pool = gu_local_pool();
|
||||||
|
GuString catname =
|
||||||
|
(catname_s == NULL) ? pgf_start_cat(self->grammar->pgf, tmp_pool)
|
||||||
|
: gu_str_string(catname_s, tmp_pool);
|
||||||
|
|
||||||
|
// turn the (python) list of tokens into a string array
|
||||||
|
char* tokens[len];
|
||||||
|
for (int i = 0; i < len; i++) {
|
||||||
|
tokens[i] = PyString_AsString(PySequence_Fast_GET_ITEM(seq, i));
|
||||||
|
if (tokens[i] == NULL) {
|
||||||
|
// Note: if the list item is not a string,
|
||||||
|
// PyString_AsString raises TypeError itself
|
||||||
|
// so we just have to return
|
||||||
|
gu_pool_free(tmp_pool);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Py_DECREF(seq);
|
||||||
|
|
||||||
|
pyres->res =
|
||||||
|
pgf_parse_tokens(self->concr, catname, tokens, len, pyres->pool);
|
||||||
|
|
||||||
|
if (pyres->res == NULL) {
|
||||||
|
Py_DECREF(pyres);
|
||||||
|
|
||||||
|
PyErr_SetString(PGFError, "Something went wrong during parsing");
|
||||||
|
gu_pool_free(tmp_pool);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
gu_pool_free(tmp_pool);
|
||||||
|
return pyres;
|
||||||
|
}
|
||||||
|
|
||||||
static PyObject*
|
static PyObject*
|
||||||
Concr_linearize(ConcrObject* self, PyObject *args)
|
Concr_linearize(ConcrObject* self, PyObject *args)
|
||||||
{
|
{
|
||||||
@@ -394,6 +462,9 @@ static PyMethodDef Concr_methods[] = {
|
|||||||
{"parse", (PyCFunction)Concr_parse, METH_VARARGS | METH_KEYWORDS,
|
{"parse", (PyCFunction)Concr_parse, METH_VARARGS | METH_KEYWORDS,
|
||||||
"Parses a string and returns an iterator over the abstract trees for this sentence"
|
"Parses a string and returns an iterator over the abstract trees for this sentence"
|
||||||
},
|
},
|
||||||
|
{"parse_tokens", (PyCFunction)Concr_parse_tokens, METH_VARARGS | METH_KEYWORDS,
|
||||||
|
"Parses list of tokens and returns an iterator over the abstract trees for this sentence. Allows you to write your own tokenizer in python."
|
||||||
|
},
|
||||||
{"linearize", (PyCFunction)Concr_linearize, METH_VARARGS,
|
{"linearize", (PyCFunction)Concr_linearize, METH_VARARGS,
|
||||||
"Takes an abstract tree and linearizes it to a sentence"
|
"Takes an abstract tree and linearizes it to a sentence"
|
||||||
},
|
},
|
||||||
|
|||||||
Reference in New Issue
Block a user