diff --git a/src/runtime/c/gu/string.c b/src/runtime/c/gu/string.c index 526a6871c..7dc90e1e3 100644 --- a/src/runtime/c/gu/string.c +++ b/src/runtime/c/gu/string.c @@ -311,6 +311,58 @@ gu_string_to_double(GuString s, double *res) return true; } +bool +gu_string_is_prefix(GuString s1, GuString s2) +{ /* Reports whether s1 is a byte-wise prefix of s2. */ + GuWord w1 = s1.w_; + uint8_t buf1[sizeof(GuWord)]; + size_t sz1; + char* str1; + if (w1 & 1) { /* low bit set: length and bytes are packed into the word itself */ + sz1 = (w1 & 0xff) >> 1; + gu_assert(sz1 <= sizeof(GuWord)); + size_t i = sz1; + while (i > 0) { + w1 >>= 8; + buf1[--i] = w1 & 0xff; + } + str1 = (char*) buf1; + } else { /* otherwise the word is a pointer; p[0] is the length, or 0 when the length is stored as a size_t just before p */ + uint8_t* p = (void*) w1; + sz1 = (p[0] == 0) ? ((size_t*) p)[-1] : p[0]; + str1 = (char*) &p[1]; + } + + GuWord w2 = s2.w_; + uint8_t buf2[sizeof(GuWord)]; + size_t sz2; + char* str2; + if (w2 & 1) { + sz2 = (w2 & 0xff) >> 1; + gu_assert(sz2 <= sizeof(GuWord)); + size_t i = sz2; + while (i > 0) { + w2 >>= 8; + buf2[--i] = w2 & 0xff; + } + str2 = (char*) buf2; + } else { + uint8_t* p = (void*) w2; + sz2 = (p[0] == 0) ? ((size_t*) p)[-1] : p[0]; + str2 = (char*) &p[1]; + } + + while (sz1 > 0 && sz2 > 0) { /* compare until either string runs out */ + if (*str1 != *str2) + return false; + + str1++; sz1--; + str2++; sz2--; + } + + return sz1 == 0; /* s1 must be fully consumed; if s2 ran out first, s1 is longer and cannot be its prefix */ +} + GuWord gu_string_hash(GuString s) { diff --git a/src/runtime/c/gu/string.h b/src/runtime/c/gu/string.h index b041518b3..37cb56ac2 100644 --- a/src/runtime/c/gu/string.h +++ b/src/runtime/c/gu/string.h @@ -69,6 +69,10 @@ gu_string_to_int(GuString s, int *res); bool gu_string_to_double(GuString s, double *res); + +bool +gu_string_is_prefix(GuString s1, GuString s2); + #endif // GU_STRING_H_ #if defined(GU_HASH_H_) && !defined(GU_STRING_H_HASH_) diff --git a/src/runtime/c/pgf/lexer.h b/src/runtime/c/pgf/lexer.h index 270a7949b..d2992ee06 100644 --- a/src/runtime/c/pgf/lexer.h +++ b/src/runtime/c/pgf/lexer.h @@ -2,11 +2,17 @@ #define PGF_LEXER_H_ #include +#include /// A single lexical token typedef GuString PgfToken; typedef GuSeq PgfTokens; // -> PgfToken +typedef struct { + prob_t prob; + PgfToken tok; +} 
PgfTokenProb; + typedef struct { PgfToken (*read_token)(); PgfToken tok; diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c index f5e8afcde..c1362b88d 100644 --- a/src/runtime/c/pgf/parser.c +++ b/src/runtime/c/pgf/parser.c @@ -46,7 +46,6 @@ typedef struct { PgfConcr* concr; GuPool* pool; GuBuf* expr_queue; - PgfItem* target; PgfExpr meta_var; PgfProduction meta_prod; int max_fid; @@ -76,11 +75,18 @@ GU_DEFINE_TYPE(PgfProductionIdx, GuMap, gu_type(PgfCFCat), &pgf_cfcat_hasher, gu_type(PgfProductionSeq), &gu_null_seq); +typedef struct PgfTokenState PgfTokenState; + typedef struct { - PgfToken tok; - PgfProductionIdx* lexicon_idx; + bool (*match_token)(PgfTokenState* ts, PgfToken tok, PgfItem* item); + PgfToken (*get_token)(PgfTokenState* ts); + PgfProductionIdx* (*get_lexicon_idx)(PgfTokenState* ts); +} PgfTokenFn; + +struct PgfTokenState { + PgfTokenFn* fn; prob_t lexical_prob; -} PgfTokenState; +}; struct PgfParseState { PgfParseState* next; @@ -785,9 +791,8 @@ static void pgf_parsing_add_transition(PgfParseState* before, PgfParseState* after, PgfToken tok, PgfItem* item) { - if (gu_string_eq(tok, after->ts->tok)) { + if (after->ts->fn->match_token(after->ts, tok, item)) { if (after->next == NULL) { - after->ps->target = item; after->viterbi_prob = item->inside_prob+item->conts->outside_prob; } @@ -1076,20 +1081,31 @@ pgf_parsing_td_predict(PgfParseState* before, PgfParseState* after, item->inside_prob-conts->ccat->viterbi_prob+ item->conts->outside_prob; + size_t n_prods = ccat->n_synprods; + PgfProductionIdx* lexicon_idx = NULL; + if (after != NULL) { + lexicon_idx = after->ts->fn->get_lexicon_idx(after->ts); + + // we don't know the current token. 
+ // probably we just compute the list of completions + if (lexicon_idx == NULL) + n_prods = gu_seq_length(ccat->prods); + } + // Top-down prediction for syntactic rules - PgfProductionSeq prods = ccat->prods; - for (size_t i = 0; i < ccat->n_synprods; i++) { + for (size_t i = 0; i < n_prods; i++) { PgfProduction prod = - gu_seq_get(prods, PgfProduction, i); + gu_seq_get(ccat->prods, PgfProduction, i); pgf_parsing_production(before, conts, prod); } // Bottom-up prediction for lexical rules - if (after != NULL && after->ts->lexicon_idx != NULL) { + + if (lexicon_idx != NULL) { PgfCFCat cfc = {ccat, lin_idx}; PgfProductionSeq tok_prods = - gu_map_get(after->ts->lexicon_idx, &cfc, PgfProductionSeq); - + gu_map_get(lexicon_idx, &cfc, PgfProductionSeq); + if (!gu_seq_is_null(tok_prods)) { size_t n_prods = gu_seq_length(tok_prods); for (size_t i = 0; i < n_prods; i++) { @@ -1141,20 +1157,24 @@ static void pgf_parsing_meta_scan(PgfParseState* before, PgfParseState* after, PgfItem* meta_item, prob_t meta_prob) { - PgfItem* item = pgf_item_copy(meta_item, before->ps->pool, before->ps); - item->inside_prob += meta_prob; + PgfToken tok = after->ts->fn->get_token(after->ts); + + if (!gu_string_eq(tok, gu_empty_string)) { + PgfItem* item = pgf_item_copy(meta_item, before->ps->pool, before->ps); + item->inside_prob += meta_prob; - PgfSymbol prev = item->curr_sym; - PgfSymbolKS* sks = (PgfSymbolKS*) - gu_alloc_variant(PGF_SYMBOL_KS, - sizeof(PgfSymbolKS)+sizeof(PgfSymbol), - gu_alignof(PgfSymbolKS), - &item->curr_sym, after->ps->pool); - *((PgfSymbol*)(sks+1)) = prev; - sks->tokens = gu_new_seq(PgfToken, 1, after->ps->pool); - gu_seq_set(sks->tokens, PgfToken, 0, after->ts->tok); + PgfSymbol prev = item->curr_sym; + PgfSymbolKS* sks = (PgfSymbolKS*) + gu_alloc_variant(PGF_SYMBOL_KS, + sizeof(PgfSymbolKS)+sizeof(PgfSymbol), + gu_alignof(PgfSymbolKS), + &item->curr_sym, after->ps->pool); + *((PgfSymbol*)(sks+1)) = prev; + sks->tokens = gu_new_seq(PgfToken, 1, after->ps->pool); + 
gu_seq_set(sks->tokens, PgfToken, 0, tok); - gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item); + gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item); + } } typedef struct { @@ -1468,8 +1488,9 @@ pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item) pgf_parsing_symbol(before, after, item, sym); } } else { - PgfToken tok = (after != NULL) ? after->ts->tok - : gu_empty_string; + PgfToken tok = (after != NULL) + ? after->ts->fn->get_token(after->ts) + : gu_empty_string; PgfExprProb *ep = NULL; bool accepted = @@ -1563,7 +1584,7 @@ pgf_parsing_proceed(PgfParseState* state) before = st; } } - + prob_t state_delta = (st->viterbi_prob-(st->next ? st->next->viterbi_prob : 0))* state->ps->beam_size; @@ -1623,7 +1644,6 @@ pgf_new_parsing(PgfConcr* concr, GuPool* pool) ps->concr = concr; ps->pool = pool; ps->expr_queue = gu_new_buf(PgfExprState*, pool); - ps->target = NULL; ps->max_fid = concr->total_cats; #ifdef PGF_COUNTS_DEBUG ps->item_full_count = 0; @@ -1702,20 +1722,14 @@ pgf_parser_compute_lexicon_prob(GuMapItor* fn, const void* key, void* value, GuE } } +#define pgf_new_token_state(ty, pool) \ + (ty*) pgf_new_token_state_(&pgf_tsfn_##ty, (PgfTokenState*) gu_new(ty, pool)) + static PgfTokenState* -pgf_new_token_state(PgfConcr *concr, PgfToken tok, GuPool* pool) +pgf_new_token_state_(PgfTokenFn* fn, PgfTokenState* ts) { - PgfTokenState* ts = gu_new(PgfTokenState, pool); - ts->tok = tok; - ts->lexicon_idx = gu_map_get(concr->leftcorner_tok_idx, - &tok, PgfProductionIdx*); - ts->lexical_prob = INFINITY; - if (ts->lexicon_idx != NULL) { - PgfLexiconFn clo = { { pgf_parser_compute_lexicon_prob }, ts }; - gu_map_iter(ts->lexicon_idx, &clo.fn, NULL); - } - if (ts->lexical_prob == INFINITY) - ts->lexical_prob = 0; + ts->fn = fn; + ts->lexical_prob = INFINITY; return ts; } @@ -1731,6 +1745,34 @@ void pgf_parsing_print_counts(PgfParsing* ps) } #endif +typedef struct { + PgfTokenState ts; + PgfToken tok; + PgfProductionIdx 
*lexicon_idx; +} PgfRealTokenState; + +static bool +pgf_real_match_token(PgfTokenState* ts, PgfToken tok, PgfItem* item) +{ + return gu_string_eq(gu_container(ts, PgfRealTokenState, ts)->tok, tok); +} + +static PgfToken +pgf_real_get_token(PgfTokenState* ts) { + return gu_container(ts, PgfRealTokenState, ts)->tok; +} + +static PgfProductionIdx* +pgf_real_get_lexicon_idx(PgfTokenState* ts) { + return gu_container(ts, PgfRealTokenState, ts)->lexicon_idx; +} + +static PgfTokenFn pgf_tsfn_PgfRealTokenState = { + pgf_real_match_token, + pgf_real_get_token, + pgf_real_get_lexicon_idx +}; + PgfParseState* pgf_parser_next_state(PgfParseState* prev, PgfToken tok) { @@ -1738,21 +1780,102 @@ pgf_parser_next_state(PgfParseState* prev, PgfToken tok) pgf_parsing_print_counts(prev->ps); #endif - PgfTokenState* ts = - pgf_new_token_state(prev->ps->concr,tok,prev->ps->pool); - PgfParseState* state = - pgf_new_parse_state(prev->ps, prev, ts, prev->ps->pool); + PgfRealTokenState* ts = + pgf_new_token_state(PgfRealTokenState, prev->ps->pool); + ts->tok = tok; + ts->lexicon_idx = gu_map_get(prev->ps->concr->leftcorner_tok_idx, + &tok, PgfProductionIdx*); + if (ts->lexicon_idx != NULL) { + PgfLexiconFn clo = { { pgf_parser_compute_lexicon_prob }, &ts->ts }; + gu_map_iter(ts->lexicon_idx, &clo.fn, NULL); + } + if (ts->ts.lexical_prob == INFINITY) + ts->ts.lexical_prob = 0; - state->ps->target = NULL; - while (state->ps->target == NULL) { + PgfParseState* state = + pgf_new_parse_state(prev->ps, prev, &ts->ts, prev->ps->pool); + + while (gu_buf_length(state->agenda) == 0) { if (!pgf_parsing_proceed(state)) + return NULL; + } + + return state; +} + +typedef struct { + PgfTokenState ts; + GuEnum en; + GuString prefix; + PgfTokenProb* tp; + GuPool* pool; + PgfParseState* state; +} PgfPrefixTokenState; + +static bool +pgf_prefix_match_token(PgfTokenState* ts0, PgfToken tok, PgfItem* item) +{ + PgfPrefixTokenState* ts = + gu_container(ts0, PgfPrefixTokenState, ts); + + if 
(gu_string_is_prefix(ts->prefix, tok)) { + ts->tp = gu_new(PgfTokenProb, ts->pool); + ts->tp->tok = tok; + ts->tp->prob = item->inside_prob+item->conts->outside_prob; + } + + return false; +} + +static PgfToken +pgf_prefix_get_token(PgfTokenState* ts) { + return gu_empty_string; +} + +static PgfProductionIdx* +pgf_prefix_get_lexicon_idx(PgfTokenState* ts) { + return NULL; +} + +static PgfTokenFn pgf_tsfn_PgfPrefixTokenState = { + pgf_prefix_match_token, + pgf_prefix_get_token, + pgf_prefix_get_lexicon_idx +}; + +static void +pgf_parser_completions_next(GuEnum* self, void* to, GuPool* pool) +{ + PgfPrefixTokenState* ts = + gu_container(self, PgfPrefixTokenState, en); + + ts->tp = NULL; + ts->pool = pool; + while (ts->tp == NULL) { + if (!pgf_parsing_proceed(ts->state)) break; } - if (state->ps->target != NULL) { - return state; - } - return NULL; + *((PgfTokenProb**)to) = ts->tp; +} + +GuEnum* +pgf_parser_completions(PgfParseState* prev, GuString prefix, + GuPool* pool) +{ +#ifdef PGF_COUNTS_DEBUG + pgf_parsing_print_counts(prev->ps); +#endif + + PgfPrefixTokenState* ts = + pgf_new_token_state(PgfPrefixTokenState, pool); + ts->en.next = pgf_parser_completions_next; + ts->prefix = prefix; + ts->tp = NULL; + ts->state = + pgf_new_parse_state(prev->ps, prev, &ts->ts, pool); + + return &ts->en; } static int diff --git a/src/runtime/c/pgf/parser.h b/src/runtime/c/pgf/parser.h index 8c9839298..9fae0a565 100644 --- a/src/runtime/c/pgf/parser.h +++ b/src/runtime/c/pgf/parser.h @@ -66,6 +66,10 @@ pgf_parser_next_state(PgfParseState* prev, PgfToken tok); * the pool used to create \parse. 
*/ +GuEnum* +pgf_parser_completions(PgfParseState* prev, GuString prefix, + GuPool* pool); + void pgf_parser_set_beam_size(PgfParseState* state, double beam_size); diff --git a/src/runtime/c/pgf/pgf.c b/src/runtime/c/pgf/pgf.c index 39d3fcfbf..24d330981 100644 --- a/src/runtime/c/pgf/pgf.c +++ b/src/runtime/c/pgf/pgf.c @@ -236,6 +236,37 @@ pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool) return pgf_parse_result(state, pool); } +GuEnum* +pgf_get_completions(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, + GuString prefix, GuPool* pool) +{ /* Feeds every token read from lexer to the parser, then returns the enumeration produced by pgf_parser_completions for prefix. Returns NULL when the initial state cannot be created, when pgf_parser_next_state returns NULL for a token, or when lexing stops with anything other than GuEOF. */ + // Begin parsing a sentence of the specified category + PgfParseState* state = + pgf_parser_init_state(concr, cat, 0, pool); + if (state == NULL) { + return NULL; + } + + // Tokenization + GuExn* lex_err = gu_new_exn(NULL, gu_kind(type), pool); + PgfToken tok = pgf_lexer_read_token(lexer, lex_err); + while (!gu_exn_is_raised(lex_err)) { + // feed the token to get a new parse state + state = pgf_parser_next_state(state, tok); + if (state == NULL) { + return NULL; + } + + tok = pgf_lexer_read_token(lexer, lex_err); + } + + if (gu_exn_caught(lex_err) != gu_type(GuEOF)) + return NULL; + + // Now begin enumerating the possible completions of the prefix + return pgf_parser_completions(state, prefix, pool); +} + void pgf_print_chunks(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool) { diff --git a/src/runtime/c/pgf/pgf.h b/src/runtime/c/pgf/pgf.h index 03f1d4d48..39dc0dd04 100644 --- a/src/runtime/c/pgf/pgf.h +++ b/src/runtime/c/pgf/pgf.h @@ -116,6 +116,10 @@ pgf_linearize(PgfConcr* concr, PgfExpr expr, GuWriter* wtr, GuExn* err); PgfExprEnum* pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool); +GuEnum* +pgf_get_completions(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, + GuString prefix, GuPool* pool); + bool pgf_parseval(PgfConcr* concr, PgfExpr expr, PgfCId cat, double *precision, double *recall, double *exact); diff --git a/src/runtime/python/pypgf.c b/src/runtime/python/pypgf.c index 
46c6a67be..f9428f96d 100644 --- a/src/runtime/python/pypgf.c +++ b/src/runtime/python/pypgf.c @@ -432,62 +432,19 @@ Expr_getattro(ExprObject *self, PyObject *attr_name) { return PyObject_GenericGetAttr((PyObject*)self, attr_name); } -typedef struct { +typedef struct IterObject { PyObject_HEAD PGFObject* grammar; GuPool* pool; int max_count; int counter; GuEnum* res; -} ExprIterObject; + PyObject* (*fetch)(struct IterObject* self); +} IterObject; -static ExprIterObject* -ExprIter_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +PyObject* +Iter_fetch_expr(IterObject* self) { - ExprIterObject* self = (ExprIterObject *)type->tp_alloc(type, 0); - if (self != NULL) { - self->grammar = NULL; - self->pool = NULL; - self->max_count = -1; - self->counter = 0; - self->res = NULL; - } - - return self; -} - -static void -ExprIter_dealloc(ExprIterObject* self) -{ - if (self->pool != NULL) - gu_pool_free(self->pool); - - Py_XDECREF(self->grammar); - - self->ob_type->tp_free((PyObject*)self); -} - -static int -ExprIter_init(ExprIterObject *self, PyObject *args, PyObject *kwds) -{ - return -1; -} - -static PyObject* -ExprIter_iter(ExprIterObject *self) -{ - Py_INCREF(self); - return (PyObject*) self; -} - -static PyObject* -ExprIter_iternext(ExprIterObject *self) -{ - if (self->max_count >= 0 && self->counter >= self->max_count) { - return NULL; - } - self->counter++; - PgfExprProb* ep = gu_next(self->res, PgfExprProb*, self->pool); if (ep == NULL) return NULL; @@ -506,17 +463,81 @@ ExprIter_iternext(ExprIterObject *self) return res; } -static PyMethodDef ExprIter_methods[] = { +PyObject* +Iter_fetch_token(IterObject* self) +{ + PgfTokenProb* tp = gu_next(self->res, PgfTokenProb*, self->pool); + if (tp == NULL) + return NULL; + + PyObject* ty_tok = gu2py_string(tp->tok); + PyObject* res = Py_BuildValue("(f,O)", tp->prob, ty_tok); + Py_DECREF(ty_tok); + + return res; +} + +static IterObject* +Iter_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +{ + IterObject* self 
= (IterObject *)type->tp_alloc(type, 0); + if (self != NULL) { + self->grammar = NULL; + self->pool = NULL; + self->max_count = -1; + self->counter = 0; + self->res = NULL; + } + + return self; +} + +static void +Iter_dealloc(IterObject* self) +{ + if (self->pool != NULL) + gu_pool_free(self->pool); + + Py_XDECREF(self->grammar); + + self->ob_type->tp_free((PyObject*)self); +} + +static int +Iter_init(IterObject *self, PyObject *args, PyObject *kwds) +{ + return -1; +} + +static PyObject* +Iter_iter(IterObject *self) +{ + Py_INCREF(self); + return (PyObject*) self; +} + +static PyObject* +Iter_iternext(IterObject *self) +{ + if (self->max_count >= 0 && self->counter >= self->max_count) { + return NULL; + } + self->counter++; + + return self->fetch(self); +} + +static PyMethodDef Iter_methods[] = { {NULL} /* Sentinel */ }; static PyTypeObject pgf_ExprIterType = { PyObject_HEAD_INIT(NULL) 0, /*ob_size*/ - "pgf.ExprIter", /*tp_name*/ - sizeof(ExprIterObject), /*tp_basicsize*/ + "pgf.Iter", /*tp_name*/ + sizeof(IterObject), /*tp_basicsize*/ 0, /*tp_itemsize*/ - (destructor)ExprIter_dealloc, /*tp_dealloc*/ + (destructor)Iter_dealloc, /*tp_dealloc*/ 0, /*tp_print*/ 0, /*tp_getattr*/ 0, /*tp_setattr*/ @@ -537,9 +558,9 @@ static PyTypeObject pgf_ExprIterType = { 0, /*tp_clear */ 0, /*tp_richcompare */ 0, /*tp_weaklistoffset */ - (getiterfunc) ExprIter_iter, /*tp_iter */ - (iternextfunc) ExprIter_iternext, /*tp_iternext */ - ExprIter_methods, /*tp_methods */ + (getiterfunc) Iter_iter, /*tp_iter */ + (iternextfunc) Iter_iternext, /*tp_iternext */ + Iter_methods, /*tp_methods */ 0, /*tp_members */ 0, /*tp_getset */ 0, /*tp_base */ @@ -547,9 +568,9 @@ static PyTypeObject pgf_ExprIterType = { 0, /*tp_descr_get */ 0, /*tp_descr_set */ 0, /*tp_dictoffset */ - (initproc)ExprIter_init, /*tp_init */ + (initproc)Iter_init, /*tp_init */ 0, /*tp_alloc */ - (newfunc) ExprIter_new, /*tp_new */ + (newfunc) Iter_new, /*tp_new */ }; typedef struct { @@ -640,7 +661,7 @@ 
pypgf_new_python_lexer(PyObject* pylexer, GuPool* pool) return ((PgfLexer*) lexer); } -static ExprIterObject* +static IterObject* Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds) { static char *kwlist[] = {"sentence", "tokens", "cat", "n", NULL}; @@ -667,7 +688,7 @@ Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds) return NULL; } - ExprIterObject* pyres = (ExprIterObject*) + IterObject* pyres = (IterObject*) pgf_ExprIterType.tp_alloc(&pgf_ExprIterType, 0); if (pyres == NULL) { Py_XDECREF(py_lexer); @@ -680,6 +701,7 @@ Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds) pyres->pool = gu_new_pool(); pyres->max_count = max_count; pyres->counter = 0; + pyres->fetch = Iter_fetch_expr; GuPool *tmp_pool = gu_local_pool(); GuString catname = @@ -723,6 +745,98 @@ Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds) return pyres; } +static IterObject* +Concr_getCompletions(ConcrObject* self, PyObject *args, PyObject *keywds) +{ + static char *kwlist[] = {"sentence", "tokens", "cat", + "prefix", "n", NULL}; + + size_t len; + const uint8_t *buf = NULL; + PyObject* py_lexer = NULL; + const char *catname_s = NULL; + const char *prefix_s = NULL; + int max_count = -1; + if (!PyArg_ParseTupleAndKeywords(args, keywds, "|s#Ossi", kwlist, + &buf, &len, &py_lexer, &catname_s, + &prefix_s, &max_count)) + return NULL; + + if ((buf == NULL && py_lexer == NULL) || + (buf != NULL && py_lexer != NULL)) { + PyErr_SetString(PyExc_TypeError, "either the sentence or the tokens argument must be provided"); + return NULL; + } + + if (py_lexer != NULL) { + // get an iterator out of the iterable object + py_lexer = PyObject_GetIter(py_lexer); + if (py_lexer == NULL) + return NULL; + } + + IterObject* pyres = (IterObject*) + pgf_ExprIterType.tp_alloc(&pgf_ExprIterType, 0); + if (pyres == NULL) { + Py_XDECREF(py_lexer); + return NULL; + } + + pyres->grammar = self->grammar; + Py_XINCREF(pyres->grammar); + + pyres->pool = gu_new_pool(); + 
pyres->max_count = max_count; + pyres->counter = 0; + pyres->fetch = Iter_fetch_token; + + GuPool *tmp_pool = gu_local_pool(); + + GuString catname = + (catname_s == NULL) ? pgf_start_cat(self->grammar->pgf, tmp_pool) + : gu_str_string(catname_s, tmp_pool); + + GuString prefix = + (prefix_s == NULL) ? gu_empty_string + : gu_str_string(prefix_s, pyres->pool); + + PgfLexer *lexer = NULL; + if (buf != NULL) { + GuIn* in = gu_data_in(buf, len, tmp_pool); + GuReader* rdr = gu_new_utf8_reader(in, tmp_pool); + lexer = pgf_new_simple_lexer(rdr, tmp_pool); + } + if (py_lexer != NULL) { + lexer = pypgf_new_python_lexer(py_lexer, tmp_pool); + } + + pyres->res = + pgf_get_completions(self->concr, catname, lexer, prefix, pyres->pool); + + if (pyres->res == NULL) { + Py_DECREF(pyres); + pyres = NULL; + + PgfToken tok = + pgf_lexer_current_token(lexer); + + if (gu_string_eq(tok, gu_empty_string)) + PyErr_SetString(PGFError, "The sentence cannot be parsed"); + else { + PyObject* py_tok = gu2py_string(tok); + PyObject_SetAttrString(ParseError, "token", py_tok); + PyErr_Format(ParseError, "Unexpected token: \"%s\"", + PyString_AsString(py_tok)); + Py_DECREF(py_tok); + } + } + + Py_XDECREF(py_lexer); + gu_pool_free(tmp_pool); + + return pyres; +} + static PyObject* Concr_parseval(ConcrObject* self, PyObject *args) { ExprObject* pyexpr = NULL; @@ -747,6 +861,26 @@ Concr_parseval(ConcrObject* self, PyObject *args) { return Py_BuildValue("ddd", precision, recall, exact); } +static PyObject* +Concr_addLiteral(ConcrObject* self, PyObject *args) { + ExprObject* pyexpr = NULL; + const char* s_cat = NULL; + if (!PyArg_ParseTuple(args, "sO!", &s_cat, &pgf_ExprType, &pyexpr)) + return NULL; +/* + PgfLiteralCallback* callback = NULL; + + GuPool* tmp_pool = gu_local_pool(); + + PgfCId cat = gu_str_string(s_cat, tmp_pool); + + pgf_parser_add_literal(self->concr, cat, callback); + + gu_pool_free(tmp_pool); +*/ + Py_RETURN_NONE; +} + static PyObject* Concr_linearize(ConcrObject* self, PyObject 
*args) { @@ -1045,9 +1179,15 @@ static PyMethodDef Concr_methods[] = { {"parse", (PyCFunction)Concr_parse, METH_VARARGS | METH_KEYWORDS, "Parses a string and returns an iterator over the abstract trees for this sentence" }, + {"getCompletions", (PyCFunction)Concr_getCompletions, METH_VARARGS | METH_KEYWORDS, + "Parses a partial string and returns a list with the top n possible next tokens" + }, {"parseval", (PyCFunction)Concr_parseval, METH_VARARGS, "Computes precision, recall and exact match for the parser on a given abstract tree" }, + {"addLiteral", (PyCFunction)Concr_addLiteral, METH_VARARGS, + "adds callbacks for custom literals in the grammar" + }, {"linearize", (PyCFunction)Concr_linearize, METH_VARARGS, "Takes an abstract tree and linearizes it to a string" }, @@ -1335,7 +1475,7 @@ PGF_functionsByCat(PGFObject* self, PyObject *args) return functions; } -static ExprIterObject* +static IterObject* PGF_generate(PGFObject* self, PyObject *args, PyObject *keywds) { static char *kwlist[] = {"cat", "n", NULL}; @@ -1346,7 +1486,7 @@ PGF_generate(PGFObject* self, PyObject *args, PyObject *keywds) &catname_s, &max_count)) return NULL; - ExprIterObject* pyres = (ExprIterObject*) + IterObject* pyres = (IterObject*) pgf_ExprIterType.tp_alloc(&pgf_ExprIterType, 0); if (pyres == NULL) { return NULL; @@ -1358,6 +1498,7 @@ PGF_generate(PGFObject* self, PyObject *args, PyObject *keywds) pyres->pool = gu_new_pool(); pyres->max_count = max_count; pyres->counter = 0; + pyres->fetch = Iter_fetch_expr; GuPool *tmp_pool = gu_local_pool(); GuString catname = gu_str_string(catname_s, tmp_pool); diff --git a/src/runtime/python/test.py b/src/runtime/python/test.py index 437c29776..93a3c4a2d 100644 --- a/src/runtime/python/test.py +++ b/src/runtime/python/test.py @@ -1,26 +1,63 @@ -import sys import pgf +import sys +import sets +import readline sys.stdout.write("loading...") sys.stdout.flush(); gr = pgf.readPGF("../../../treebanks/PennTreebank/ParseEngAbs.pgf") sys.stdout.write("\n") 
+source_lang = gr.languages["ParseEng"] +target_lang = gr.languages["ParseBul"] + we = pgf.readExpr("UttImpSg PPos (ImpVP (UseV try_V))") -print gr.languages["ParseEng"].linearize(we) +print source_lang.linearize(we) sys.stdout.write("start cat: "+gr.startCat+"\n\n") +class Completer(): # readline completer that offers the tokens yielded by lang.getCompletions + def __init__(self, lang): + self.gr = lang + + def complete(self, prefix, state): + if state == 0: # first call for this prefix: start a fresh completion iterator + line = readline.get_line_buffer() + line = line[0:readline.get_begidx()] + self.i = self.gr.getCompletions(line, prefix=prefix) # use the language given to __init__ rather than the global source_lang + self.tokens = sets.Set() + + if len(self.tokens) > 50: # stop offering candidates once more than 50 distinct tokens were returned + return None + + while True: + try: + (p,t) = self.i.next() + if t not in self.tokens: + self.tokens.add(t) + return t + except StopIteration: + return None + +completer = Completer(source_lang) +readline.set_completer(completer.complete) +readline.parse_and_bind("tab: complete") + while True: - sys.stdout.write("> ") - line = sys.stdin.readline(); - if line == '': + try: + line = raw_input("> "); + except EOFError: sys.stdout.write("\n") - break; + readline.set_completer(None) + break + except KeyboardInterrupt: + sys.stdout.write("\n") + readline.set_completer(None) + break try: - for (p,e) in gr.languages["ParseEng"].parse(line, n=5): + for (p,e) in source_lang.parse(line, n=1): sys.stdout.write("["+str(p)+"] "+str(e)+"\n") - print gr.languages["ParseEngBul"].linearize(e) + print target_lang.linearize(e) except pgf.ParseError as e: print e.message