From e9eeefa6f05babcf9b9e2ca471797368672f806c Mon Sep 17 00:00:00 2001 From: "kr.angelov" Date: Mon, 19 Aug 2013 12:16:30 +0000 Subject: [PATCH] fullFormLexicon in C and Python --- src/runtime/c/pgf/parser.c | 202 ++++++++++++++++++++++++++++++++----- src/runtime/c/pgf/pgf.h | 14 ++- src/runtime/python/pypgf.c | 72 ++++++++++++- 3 files changed, 259 insertions(+), 29 deletions(-) diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c index ac863db0a..f792fdaa8 100644 --- a/src/runtime/c/pgf/parser.c +++ b/src/runtime/c/pgf/parser.c @@ -2339,40 +2339,42 @@ pgf_morpho_iter(GuMapItor* fn, const void* key, void* value, GuExn* err) case PGF_PRODUCTION_APPLY: { PgfProductionApply* papp = i.data; - // match the tokens with the production - size_t pos = 0; - PgfSequence seq = papp->fun->lins[cfc.lin_idx]; - size_t len = gu_seq_length(seq); - for (size_t i = 0; i < len; i++) { - PgfSymbol sym = gu_seq_get(seq, PgfSymbol, i); + if (!gu_seq_is_null(clo->tokens)) { + // match the tokens with the production + size_t pos = 0; + PgfSequence seq = papp->fun->lins[cfc.lin_idx]; + size_t len = gu_seq_length(seq); + for (size_t i = 0; i < len; i++) { + PgfSymbol sym = gu_seq_get(seq, PgfSymbol, i); - GuVariantInfo i = gu_variant_open(sym); - switch (i.tag) { - case PGF_SYMBOL_KS: { - PgfSymbolKS* symks = i.data; - size_t len = gu_seq_length(symks->tokens); - for (size_t i = 0; i < len; i++) { - if (pos >= gu_seq_length(clo->tokens)) - goto cont; + GuVariantInfo i = gu_variant_open(sym); + switch (i.tag) { + case PGF_SYMBOL_KS: { + PgfSymbolKS* symks = i.data; + size_t len = gu_seq_length(symks->tokens); + for (size_t i = 0; i < len; i++) { + if (pos >= gu_seq_length(clo->tokens)) + goto cont; - PgfToken tok1 = gu_seq_get(symks->tokens, PgfToken, i); - PgfToken tok2 = gu_seq_get(clo->tokens, PgfToken, pos++); - - if (!gu_string_eq(tok1, tok2)) - goto cont; + PgfToken tok1 = gu_seq_get(symks->tokens, PgfToken, i); + PgfToken tok2 = gu_seq_get(clo->tokens, PgfToken, pos++); + + if (!gu_string_eq(tok1, tok2)) + goto cont; + } + } + default: + continue; } } - default: - continue; - } + + if (pos != gu_seq_length(clo->tokens)) + goto cont; } - - if (pos != gu_seq_length(clo->tokens)) - goto cont; PgfCId lemma = papp->fun->absfun->name; prob_t prob = papp->fun->absfun->ep.prob; - clo->callback->callback(clo->callback, clo->tokens, + clo->callback->callback(clo->callback, lemma, analysis, prob, err); } } @@ -2410,10 +2412,156 @@ pgf_lookup_morpho(PgfConcr *concr, PgfLexer *lexer, PgfMorphoFn clo = { { pgf_morpho_iter }, gu_buf_seq(tokens), callback }; gu_map_iter(lexicon_idx, &clo.fn, err); - + gu_pool_free(tmp_pool); } +typedef struct { + GuEnum en; + GuEnum* map_en1; + GuEnum* map_en2; + + GuMapItor fn; + PgfLeftcornerTokIdx* new_idx; + + GuPool* pool; +} PgfFullFormState; + +static void +pgf_fullform_iter(GuMapItor* fn, const void* key, void* value, GuExn* err) +{ + PgfFullFormState* st = gu_container(fn, PgfFullFormState, fn); + PgfCFCat cfc = *((PgfCFCat*) key); + PgfProductionSeq prods = *((PgfProductionSeq*) value); + + if (gu_seq_is_null(prods)) + return; + + size_t n_prods = gu_seq_length(prods); + for (size_t i = 0; i < n_prods; i++) { + PgfProduction prod = + gu_seq_get(prods, PgfProduction, i); + + GuVariantInfo i = gu_variant_open(prod); + switch (i.tag) { + case PGF_PRODUCTION_APPLY: { + PgfProductionApply* papp = i.data; + + GuPool* tmp_pool = gu_new_pool(); + GuExn* err = gu_new_exn(NULL, gu_kind(type), tmp_pool); + GuStringBuf* sbuf = gu_string_buf(tmp_pool); + GuWriter* wtr = gu_string_buf_writer(sbuf); + + // collect the tokens in the production + PgfSequence seq = papp->fun->lins[cfc.lin_idx]; + size_t len = gu_seq_length(seq); + for (size_t i = 0; i < len; i++) { + PgfSymbol sym = gu_seq_get(seq, PgfSymbol, i); + + GuVariantInfo i = gu_variant_open(sym); + switch (i.tag) { + case PGF_SYMBOL_KS: { + PgfSymbolKS* symks = i.data; + size_t len = gu_seq_length(symks->tokens); + for (size_t i = 0; i < len; i++) { + if (i > 0) { + gu_putc(' ', wtr, err); + } + + PgfToken tok = gu_seq_get(symks->tokens, PgfToken, i); + gu_string_write(tok, wtr, err); + } + } + default: + continue; + } + } + GuString tokens = gu_string_buf_freeze(sbuf, st->pool); + + // create a new production index with keys that + // are multiword units + PgfProductionIdx* lexicon_idx = + gu_map_get(st->new_idx, &tokens, PgfProductionIdx*); + if (lexicon_idx == NULL) { + lexicon_idx = gu_map_type_new(PgfProductionIdx, st->pool); + gu_map_put(st->new_idx, &tokens, PgfProductionIdx*, lexicon_idx); + } + + PgfProductionSeq prods = + gu_map_get(lexicon_idx, &cfc, PgfProductionSeq); + if (gu_seq_is_null(prods)) { + prods = gu_buf_seq(gu_new_buf(PgfProduction, st->pool)); + gu_map_put(lexicon_idx, &cfc, PgfProductionSeq, prods); + } + + gu_buf_push(gu_seq_buf(prods), PgfProduction, prod); + } + } + } +} + +static void +gu_fullform_enum_next(GuEnum* self, void* to, GuPool* pool) +{ + PgfFullFormState* st = gu_container(self, PgfFullFormState, en); + + for (;;) { + if (st->new_idx == NULL) { + GuMapKeyValue* kv = gu_next(st->map_en1, GuMapKeyValue*, pool); + if (kv == NULL) { + *((PgfFullFormEntry**)to) = NULL; + return; + } + + PgfProductionIdx* lexicon_idx = *((PgfProductionIdx**) kv->value); + + // we have an index by the first token but we must re-index + // by taking into account the multiword units + st->pool = pool; + st->new_idx = gu_map_type_new(PgfLeftcornerTokIdx, pool); + st->fn.fn = pgf_fullform_iter; + gu_map_iter(lexicon_idx, &st->fn, NULL); + + st->map_en2 = gu_map_enum(st->new_idx, pool); + } + PgfFullFormEntry* entry = + gu_next(st->map_en2, PgfFullFormEntry*, pool); + if (entry != NULL) { + *((PgfFullFormEntry**)to) = entry; + break; + } + + st->new_idx = NULL; + } +} + +GuEnum* +pgf_fullform_lexicon(PgfConcr *concr, GuPool* pool) +{ + PgfFullFormState* st = gu_new(PgfFullFormState, pool); + st->en.next = gu_fullform_enum_next; + st->map_en1 = gu_map_enum(concr->leftcorner_tok_idx, pool); + st->map_en2 = NULL; + st->new_idx = NULL; + st->pool = NULL; + return &st->en; +} + +GuString +pgf_fullform_get_string(PgfFullFormEntry* entry) +{ + return *((GuString*) entry->key); +} + +void +pgf_fullform_get_analyses(PgfFullFormEntry* entry, + PgfMorphoCallback* callback, GuExn* err) +{ + PgfProductionIdx* lexicon_idx = *((PgfProductionIdx**) entry->value); + PgfMorphoFn clo = { { pgf_morpho_iter }, gu_null_seq, callback }; + gu_map_iter(lexicon_idx, &clo.fn, err); +} + static void pgf_parser_leftcorner_add_token(PgfConcr* concr, PgfTokens tokens, PgfItem* item, diff --git a/src/runtime/c/pgf/pgf.h b/src/runtime/c/pgf/pgf.h index 40ae1ca45..51998c7b8 100644 --- a/src/runtime/c/pgf/pgf.h +++ b/src/runtime/c/pgf/pgf.h @@ -126,7 +126,7 @@ pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, typedef struct PgfMorphoCallback PgfMorphoCallback; struct PgfMorphoCallback { - void (*callback)(PgfMorphoCallback* self, PgfTokens tokens, + void (*callback)(PgfMorphoCallback* self, PgfCId lemma, GuString analysis, prob_t prob, GuExn* err); }; @@ -135,6 +135,18 @@ void pgf_lookup_morpho(PgfConcr *concr, PgfLexer *lexer, PgfMorphoCallback* callback, GuExn* err); +typedef GuMapKeyValue PgfFullFormEntry; + +GuEnum* +pgf_fullform_lexicon(PgfConcr *concr, GuPool* pool); + +GuString +pgf_fullform_get_string(PgfFullFormEntry* entry); + +void +pgf_fullform_get_analyses(PgfFullFormEntry* entry, + PgfMorphoCallback* callback, GuExn* err); + PgfExprEnum* pgf_parse_with_heuristics(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, double heuristics, diff --git a/src/runtime/python/pypgf.c b/src/runtime/python/pypgf.c index 5b4ae9193..eb6725050 100644 --- a/src/runtime/python/pypgf.c +++ b/src/runtime/python/pypgf.c @@ -1690,7 +1690,7 @@ typedef struct { } PyMorphoCallback; static void -pypgf_collect_morpho(PgfMorphoCallback* self, PgfTokens tokens, +pypgf_collect_morpho(PgfMorphoCallback* self, PgfCId lemma, GuString analysis, prob_t prob, GuExn* err) { @@ -1765,6 +1765,73 @@ Concr_lookupMorpho(ConcrObject* self, PyObject *args, PyObject *keywds) { return analyses; } +PyObject* +Iter_fetch_fullform(IterObject* self) +{ + PgfFullFormEntry* entry = + gu_next(self->res, PgfFullFormEntry*, self->pool); + if (entry == NULL) + return NULL; + + PyObject* res = NULL; + PyObject* py_tokens = NULL; + PyObject* py_analyses = NULL; + + GuString tokens = + pgf_fullform_get_string(entry); + + py_tokens = gu2py_string(tokens); + if (py_tokens == NULL) + goto done; + + py_analyses = PyList_New(0); + if (py_analyses == NULL) + goto done; + + GuPool* tmp_pool = gu_local_pool(); + GuExn* err = gu_new_exn(NULL, gu_kind(type), tmp_pool); + + PyMorphoCallback callback = { { pypgf_collect_morpho }, py_analyses }; + pgf_fullform_get_analyses(entry, &callback.fn, err); + + if (!gu_ok(err)) + goto done; + + res = Py_BuildValue("OO", py_tokens, py_analyses); + +done: + Py_XDECREF(py_tokens); + Py_XDECREF(py_analyses); + + return res; +} + +static PyObject* +Concr_fullFormLexicon(ConcrObject* self, PyObject *args) +{ + IterObject* pyres = (IterObject*) + pgf_IterType.tp_alloc(&pgf_IterType, 0); + if (pyres == NULL) + return NULL; + + pyres->grammar = self->grammar; + Py_XINCREF(pyres->grammar); + + pyres->container = NULL; + pyres->pool = gu_new_pool(); + pyres->max_count = -1; + pyres->counter = 0; + pyres->fetch = Iter_fetch_fullform; + + pyres->res = pgf_fullform_lexicon(self->concr, pyres->pool); + if (pyres->res == NULL) { + Py_DECREF(pyres); + return NULL; + } + + return (PyObject*) pyres; +} + static PyGetSetDef Concr_getseters[] = { {"name", (getter)Concr_getName, NULL, @@ -1810,6 +1877,9 @@ static PyMethodDef Concr_methods[] = { {"lookupMorpho", (PyCFunction)Concr_lookupMorpho, METH_VARARGS | METH_KEYWORDS, "Looks up a word in the lexicon of the grammar" }, + {"fullFormLexicon", (PyCFunction)Concr_fullFormLexicon, METH_VARARGS, + "Enumerates all words in the lexicon (useful for extracting full form lexicons)" + }, {NULL} /* Sentinel */ };