mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-23 11:42:49 -06:00
fullFormLexicon in C and Python
This commit is contained in:
@@ -2339,40 +2339,42 @@ pgf_morpho_iter(GuMapItor* fn, const void* key, void* value, GuExn* err)
|
|||||||
case PGF_PRODUCTION_APPLY: {
|
case PGF_PRODUCTION_APPLY: {
|
||||||
PgfProductionApply* papp = i.data;
|
PgfProductionApply* papp = i.data;
|
||||||
|
|
||||||
// match the tokens with the production
|
if (!gu_seq_is_null(clo->tokens)) {
|
||||||
size_t pos = 0;
|
// match the tokens with the production
|
||||||
PgfSequence seq = papp->fun->lins[cfc.lin_idx];
|
size_t pos = 0;
|
||||||
size_t len = gu_seq_length(seq);
|
PgfSequence seq = papp->fun->lins[cfc.lin_idx];
|
||||||
for (size_t i = 0; i < len; i++) {
|
size_t len = gu_seq_length(seq);
|
||||||
PgfSymbol sym = gu_seq_get(seq, PgfSymbol, i);
|
for (size_t i = 0; i < len; i++) {
|
||||||
|
PgfSymbol sym = gu_seq_get(seq, PgfSymbol, i);
|
||||||
|
|
||||||
GuVariantInfo i = gu_variant_open(sym);
|
GuVariantInfo i = gu_variant_open(sym);
|
||||||
switch (i.tag) {
|
switch (i.tag) {
|
||||||
case PGF_SYMBOL_KS: {
|
case PGF_SYMBOL_KS: {
|
||||||
PgfSymbolKS* symks = i.data;
|
PgfSymbolKS* symks = i.data;
|
||||||
size_t len = gu_seq_length(symks->tokens);
|
size_t len = gu_seq_length(symks->tokens);
|
||||||
for (size_t i = 0; i < len; i++) {
|
for (size_t i = 0; i < len; i++) {
|
||||||
if (pos >= gu_seq_length(clo->tokens))
|
if (pos >= gu_seq_length(clo->tokens))
|
||||||
goto cont;
|
goto cont;
|
||||||
|
|
||||||
PgfToken tok1 = gu_seq_get(symks->tokens, PgfToken, i);
|
PgfToken tok1 = gu_seq_get(symks->tokens, PgfToken, i);
|
||||||
PgfToken tok2 = gu_seq_get(clo->tokens, PgfToken, pos++);
|
PgfToken tok2 = gu_seq_get(clo->tokens, PgfToken, pos++);
|
||||||
|
|
||||||
if (!gu_string_eq(tok1, tok2))
|
if (!gu_string_eq(tok1, tok2))
|
||||||
goto cont;
|
goto cont;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
default:
|
|
||||||
continue;
|
if (pos != gu_seq_length(clo->tokens))
|
||||||
}
|
goto cont;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pos != gu_seq_length(clo->tokens))
|
|
||||||
goto cont;
|
|
||||||
|
|
||||||
PgfCId lemma = papp->fun->absfun->name;
|
PgfCId lemma = papp->fun->absfun->name;
|
||||||
prob_t prob = papp->fun->absfun->ep.prob;
|
prob_t prob = papp->fun->absfun->ep.prob;
|
||||||
clo->callback->callback(clo->callback, clo->tokens,
|
clo->callback->callback(clo->callback,
|
||||||
lemma, analysis, prob, err);
|
lemma, analysis, prob, err);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -2410,10 +2412,156 @@ pgf_lookup_morpho(PgfConcr *concr, PgfLexer *lexer,
|
|||||||
|
|
||||||
PgfMorphoFn clo = { { pgf_morpho_iter }, gu_buf_seq(tokens), callback };
|
PgfMorphoFn clo = { { pgf_morpho_iter }, gu_buf_seq(tokens), callback };
|
||||||
gu_map_iter(lexicon_idx, &clo.fn, err);
|
gu_map_iter(lexicon_idx, &clo.fn, err);
|
||||||
|
|
||||||
gu_pool_free(tmp_pool);
|
gu_pool_free(tmp_pool);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
GuEnum en;
|
||||||
|
GuEnum* map_en1;
|
||||||
|
GuEnum* map_en2;
|
||||||
|
|
||||||
|
GuMapItor fn;
|
||||||
|
PgfLeftcornerTokIdx* new_idx;
|
||||||
|
|
||||||
|
GuPool* pool;
|
||||||
|
} PgfFullFormState;
|
||||||
|
|
||||||
|
static void
|
||||||
|
pgf_fullform_iter(GuMapItor* fn, const void* key, void* value, GuExn* err)
|
||||||
|
{
|
||||||
|
PgfFullFormState* st = gu_container(fn, PgfFullFormState, fn);
|
||||||
|
PgfCFCat cfc = *((PgfCFCat*) key);
|
||||||
|
PgfProductionSeq prods = *((PgfProductionSeq*) value);
|
||||||
|
|
||||||
|
if (gu_seq_is_null(prods))
|
||||||
|
return;
|
||||||
|
|
||||||
|
size_t n_prods = gu_seq_length(prods);
|
||||||
|
for (size_t i = 0; i < n_prods; i++) {
|
||||||
|
PgfProduction prod =
|
||||||
|
gu_seq_get(prods, PgfProduction, i);
|
||||||
|
|
||||||
|
GuVariantInfo i = gu_variant_open(prod);
|
||||||
|
switch (i.tag) {
|
||||||
|
case PGF_PRODUCTION_APPLY: {
|
||||||
|
PgfProductionApply* papp = i.data;
|
||||||
|
|
||||||
|
GuPool* tmp_pool = gu_new_pool();
|
||||||
|
GuExn* err = gu_new_exn(NULL, gu_kind(type), tmp_pool);
|
||||||
|
GuStringBuf* sbuf = gu_string_buf(tmp_pool);
|
||||||
|
GuWriter* wtr = gu_string_buf_writer(sbuf);
|
||||||
|
|
||||||
|
// collect the tokens in the production
|
||||||
|
PgfSequence seq = papp->fun->lins[cfc.lin_idx];
|
||||||
|
size_t len = gu_seq_length(seq);
|
||||||
|
for (size_t i = 0; i < len; i++) {
|
||||||
|
PgfSymbol sym = gu_seq_get(seq, PgfSymbol, i);
|
||||||
|
|
||||||
|
GuVariantInfo i = gu_variant_open(sym);
|
||||||
|
switch (i.tag) {
|
||||||
|
case PGF_SYMBOL_KS: {
|
||||||
|
PgfSymbolKS* symks = i.data;
|
||||||
|
size_t len = gu_seq_length(symks->tokens);
|
||||||
|
for (size_t i = 0; i < len; i++) {
|
||||||
|
if (i > 0) {
|
||||||
|
gu_putc(' ', wtr, err);
|
||||||
|
}
|
||||||
|
|
||||||
|
PgfToken tok = gu_seq_get(symks->tokens, PgfToken, i);
|
||||||
|
gu_string_write(tok, wtr, err);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
GuString tokens = gu_string_buf_freeze(sbuf, st->pool);
|
||||||
|
|
||||||
|
// create a new production index with keys that
|
||||||
|
// are multiword units
|
||||||
|
PgfProductionIdx* lexicon_idx =
|
||||||
|
gu_map_get(st->new_idx, &tokens, PgfProductionIdx*);
|
||||||
|
if (lexicon_idx == NULL) {
|
||||||
|
lexicon_idx = gu_map_type_new(PgfProductionIdx, st->pool);
|
||||||
|
gu_map_put(st->new_idx, &tokens, PgfProductionIdx*, lexicon_idx);
|
||||||
|
}
|
||||||
|
|
||||||
|
PgfProductionSeq prods =
|
||||||
|
gu_map_get(lexicon_idx, &cfc, PgfProductionSeq);
|
||||||
|
if (gu_seq_is_null(prods)) {
|
||||||
|
prods = gu_buf_seq(gu_new_buf(PgfProduction, st->pool));
|
||||||
|
gu_map_put(lexicon_idx, &cfc, PgfProductionSeq, prods);
|
||||||
|
}
|
||||||
|
|
||||||
|
gu_buf_push(gu_seq_buf(prods), PgfProduction, prod);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
gu_fullform_enum_next(GuEnum* self, void* to, GuPool* pool)
|
||||||
|
{
|
||||||
|
PgfFullFormState* st = gu_container(self, PgfFullFormState, en);
|
||||||
|
|
||||||
|
for (;;) {
|
||||||
|
if (st->new_idx == NULL) {
|
||||||
|
GuMapKeyValue* kv = gu_next(st->map_en1, GuMapKeyValue*, pool);
|
||||||
|
if (kv == NULL) {
|
||||||
|
*((PgfFullFormEntry**)to) = NULL;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
PgfProductionIdx* lexicon_idx = *((PgfProductionIdx**) kv->value);
|
||||||
|
|
||||||
|
// we have an index by the first token but we must re-index
|
||||||
|
// by taking into account the multiword units
|
||||||
|
st->pool = pool;
|
||||||
|
st->new_idx = gu_map_type_new(PgfLeftcornerTokIdx, pool);
|
||||||
|
st->fn.fn = pgf_fullform_iter;
|
||||||
|
gu_map_iter(lexicon_idx, &st->fn, NULL);
|
||||||
|
|
||||||
|
st->map_en2 = gu_map_enum(st->new_idx, pool);
|
||||||
|
}
|
||||||
|
PgfFullFormEntry* entry =
|
||||||
|
gu_next(st->map_en2, PgfFullFormEntry*, pool);
|
||||||
|
if (entry != NULL) {
|
||||||
|
*((PgfFullFormEntry**)to) = entry;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
st->new_idx = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
GuEnum*
|
||||||
|
pgf_fullform_lexicon(PgfConcr *concr, GuPool* pool)
|
||||||
|
{
|
||||||
|
PgfFullFormState* st = gu_new(PgfFullFormState, pool);
|
||||||
|
st->en.next = gu_fullform_enum_next;
|
||||||
|
st->map_en1 = gu_map_enum(concr->leftcorner_tok_idx, pool);
|
||||||
|
st->map_en2 = NULL;
|
||||||
|
st->new_idx = NULL;
|
||||||
|
st->pool = NULL;
|
||||||
|
return &st->en;
|
||||||
|
}
|
||||||
|
|
||||||
|
GuString
|
||||||
|
pgf_fullform_get_string(PgfFullFormEntry* entry)
|
||||||
|
{
|
||||||
|
return *((GuString*) entry->key);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
pgf_fullform_get_analyses(PgfFullFormEntry* entry,
|
||||||
|
PgfMorphoCallback* callback, GuExn* err)
|
||||||
|
{
|
||||||
|
PgfProductionIdx* lexicon_idx = *((PgfProductionIdx**) entry->value);
|
||||||
|
PgfMorphoFn clo = { { pgf_morpho_iter }, gu_null_seq, callback };
|
||||||
|
gu_map_iter(lexicon_idx, &clo.fn, err);
|
||||||
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
pgf_parser_leftcorner_add_token(PgfConcr* concr,
|
pgf_parser_leftcorner_add_token(PgfConcr* concr,
|
||||||
PgfTokens tokens, PgfItem* item,
|
PgfTokens tokens, PgfItem* item,
|
||||||
|
|||||||
@@ -126,7 +126,7 @@ pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer,
|
|||||||
|
|
||||||
typedef struct PgfMorphoCallback PgfMorphoCallback;
|
typedef struct PgfMorphoCallback PgfMorphoCallback;
|
||||||
struct PgfMorphoCallback {
|
struct PgfMorphoCallback {
|
||||||
void (*callback)(PgfMorphoCallback* self, PgfTokens tokens,
|
void (*callback)(PgfMorphoCallback* self,
|
||||||
PgfCId lemma, GuString analysis, prob_t prob,
|
PgfCId lemma, GuString analysis, prob_t prob,
|
||||||
GuExn* err);
|
GuExn* err);
|
||||||
};
|
};
|
||||||
@@ -135,6 +135,18 @@ void
|
|||||||
pgf_lookup_morpho(PgfConcr *concr, PgfLexer *lexer,
|
pgf_lookup_morpho(PgfConcr *concr, PgfLexer *lexer,
|
||||||
PgfMorphoCallback* callback, GuExn* err);
|
PgfMorphoCallback* callback, GuExn* err);
|
||||||
|
|
||||||
|
typedef GuMapKeyValue PgfFullFormEntry;
|
||||||
|
|
||||||
|
GuEnum*
|
||||||
|
pgf_fullform_lexicon(PgfConcr *concr, GuPool* pool);
|
||||||
|
|
||||||
|
GuString
|
||||||
|
pgf_fullform_get_string(PgfFullFormEntry* entry);
|
||||||
|
|
||||||
|
void
|
||||||
|
pgf_fullform_get_analyses(PgfFullFormEntry* entry,
|
||||||
|
PgfMorphoCallback* callback, GuExn* err);
|
||||||
|
|
||||||
PgfExprEnum*
|
PgfExprEnum*
|
||||||
pgf_parse_with_heuristics(PgfConcr* concr, PgfCId cat, PgfLexer *lexer,
|
pgf_parse_with_heuristics(PgfConcr* concr, PgfCId cat, PgfLexer *lexer,
|
||||||
double heuristics,
|
double heuristics,
|
||||||
|
|||||||
@@ -1690,7 +1690,7 @@ typedef struct {
|
|||||||
} PyMorphoCallback;
|
} PyMorphoCallback;
|
||||||
|
|
||||||
static void
|
static void
|
||||||
pypgf_collect_morpho(PgfMorphoCallback* self, PgfTokens tokens,
|
pypgf_collect_morpho(PgfMorphoCallback* self,
|
||||||
PgfCId lemma, GuString analysis, prob_t prob,
|
PgfCId lemma, GuString analysis, prob_t prob,
|
||||||
GuExn* err)
|
GuExn* err)
|
||||||
{
|
{
|
||||||
@@ -1765,6 +1765,73 @@ Concr_lookupMorpho(ConcrObject* self, PyObject *args, PyObject *keywds) {
|
|||||||
return analyses;
|
return analyses;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PyObject*
|
||||||
|
Iter_fetch_fullform(IterObject* self)
|
||||||
|
{
|
||||||
|
PgfFullFormEntry* entry =
|
||||||
|
gu_next(self->res, PgfFullFormEntry*, self->pool);
|
||||||
|
if (entry == NULL)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
PyObject* res = NULL;
|
||||||
|
PyObject* py_tokens = NULL;
|
||||||
|
PyObject* py_analyses = NULL;
|
||||||
|
|
||||||
|
GuString tokens =
|
||||||
|
pgf_fullform_get_string(entry);
|
||||||
|
|
||||||
|
py_tokens = gu2py_string(tokens);
|
||||||
|
if (py_tokens == NULL)
|
||||||
|
goto done;
|
||||||
|
|
||||||
|
py_analyses = PyList_New(0);
|
||||||
|
if (py_analyses == NULL)
|
||||||
|
goto done;
|
||||||
|
|
||||||
|
GuPool* tmp_pool = gu_local_pool();
|
||||||
|
GuExn* err = gu_new_exn(NULL, gu_kind(type), tmp_pool);
|
||||||
|
|
||||||
|
PyMorphoCallback callback = { { pypgf_collect_morpho }, py_analyses };
|
||||||
|
pgf_fullform_get_analyses(entry, &callback.fn, err);
|
||||||
|
|
||||||
|
if (!gu_ok(err))
|
||||||
|
goto done;
|
||||||
|
|
||||||
|
res = Py_BuildValue("OO", py_tokens, py_analyses);
|
||||||
|
|
||||||
|
done:
|
||||||
|
Py_XDECREF(py_tokens);
|
||||||
|
Py_XDECREF(py_analyses);
|
||||||
|
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
static PyObject*
|
||||||
|
Concr_fullFormLexicon(ConcrObject* self, PyObject *args)
|
||||||
|
{
|
||||||
|
IterObject* pyres = (IterObject*)
|
||||||
|
pgf_IterType.tp_alloc(&pgf_IterType, 0);
|
||||||
|
if (pyres == NULL)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
pyres->grammar = self->grammar;
|
||||||
|
Py_XINCREF(pyres->grammar);
|
||||||
|
|
||||||
|
pyres->container = NULL;
|
||||||
|
pyres->pool = gu_new_pool();
|
||||||
|
pyres->max_count = -1;
|
||||||
|
pyres->counter = 0;
|
||||||
|
pyres->fetch = Iter_fetch_fullform;
|
||||||
|
|
||||||
|
pyres->res = pgf_fullform_lexicon(self->concr, pyres->pool);
|
||||||
|
if (pyres->res == NULL) {
|
||||||
|
Py_DECREF(pyres);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
return (PyObject*) pyres;
|
||||||
|
}
|
||||||
|
|
||||||
static PyGetSetDef Concr_getseters[] = {
|
static PyGetSetDef Concr_getseters[] = {
|
||||||
{"name",
|
{"name",
|
||||||
(getter)Concr_getName, NULL,
|
(getter)Concr_getName, NULL,
|
||||||
@@ -1810,6 +1877,9 @@ static PyMethodDef Concr_methods[] = {
|
|||||||
{"lookupMorpho", (PyCFunction)Concr_lookupMorpho, METH_VARARGS | METH_KEYWORDS,
|
{"lookupMorpho", (PyCFunction)Concr_lookupMorpho, METH_VARARGS | METH_KEYWORDS,
|
||||||
"Looks up a word in the lexicon of the grammar"
|
"Looks up a word in the lexicon of the grammar"
|
||||||
},
|
},
|
||||||
|
{"fullFormLexicon", (PyCFunction)Concr_fullFormLexicon, METH_VARARGS,
|
||||||
|
"Enumerates all words in the lexicon (useful for extracting full form lexicons)"
|
||||||
|
},
|
||||||
{NULL} /* Sentinel */
|
{NULL} /* Sentinel */
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user