libpgf: preliminary version of the statistical ranking. We use a naive statistical model with random weights for the meta variables.

This commit is contained in:
kr.angelov
2012-03-02 19:25:01 +00:00
parent c2289b5f21
commit fdf6dd7798
3 changed files with 193 additions and 126 deletions

View File

@@ -4,6 +4,7 @@
#include <gu/assert.h> #include <gu/assert.h>
#include <gu/log.h> #include <gu/log.h>
#include <gu/file.h> #include <gu/file.h>
#include <math.h>
#include <stdlib.h> #include <stdlib.h>
typedef struct PgfItem PgfItem; typedef struct PgfItem PgfItem;
@@ -14,17 +15,34 @@ typedef GuList(PgfItemBuf*) PgfItemBufs;
typedef GuBuf PgfCCatBuf; typedef GuBuf PgfCCatBuf;
struct PgfParse { struct PgfParse {
PgfAbstr* abstr;
PgfConcr* concr; PgfConcr* concr;
PgfItemBuf* agenda; PgfItemBuf* agenda;
int max_fid; int max_fid;
}; };
typedef struct PgfExprState PgfExprState;
/* Entry stored in PgfParseResult.pqueue: a partially built expression
 * together with the score the queue is ordered by (cmp_expr_prob compares
 * only `prob`). */
typedef struct {
/* Accumulated score. For function applications it is the parent's score
 * minus log(fun_decl->prob); meta productions add a large constant plus
 * rand(). NOTE(review): presumably lower means more likely -- confirm
 * against the gu_buf_heap_* ordering convention. */
double prob;
/* The expression state this score belongs to. */
PgfExprState *state;
} PgfExprPState;
/* One frame of an expression under construction while enumerating
 * parse results. */
struct PgfExprState {
/* NOTE(review): not assigned anywhere in the code visible here; the score
 * actually used for ordering lives in PgfExprPState.prob -- confirm
 * whether this field is needed. */
double prob;
/* Frame of the enclosing expression that is waiting for this one as its
 * next argument; NULL for a top-level result. */
PgfExprState* prev;
/* Expression built so far (function or meta, applied to the arguments
 * already processed). */
PgfExpr expr;
/* Arguments of the production that created this frame. */
PgfPArgs args;
/* Index into `args` of the next argument still to be expanded; the frame
 * is complete once it reaches the length of `args`. */
size_t arg_idx;
};
typedef struct PgfParseResult PgfParseResult; typedef struct PgfParseResult PgfParseResult;
struct PgfParseResult { struct PgfParseResult {
PgfAbstr* abstr;
PgfConcr* concr; PgfConcr* concr;
PgfCCatBuf* completed; GuPool *tmp_pool;
GuChoice* choice; GuBuf *pqueue;
PgfExprEnum en; PgfExprEnum en;
}; };
@@ -101,7 +119,6 @@ struct PgfParsing {
int max_fid; int max_fid;
}; };
#ifdef PGF_PARSER_DEBUG #ifdef PGF_PARSER_DEBUG
static void static void
pgf_print_production(int fid, PgfProduction prod, GuWriter *wtr, GuExn* err) pgf_print_production(int fid, PgfProduction prod, GuWriter *wtr, GuExn* err)
@@ -835,9 +852,11 @@ pgf_new_parsing(PgfConcr* concr, PgfLexCallback* callback, int max_fid,
} }
static PgfParse* static PgfParse*
pgf_new_parse(PgfConcr* concr, int max_fid, GuPool* pool) pgf_new_parse(PgfAbstr* abstr, PgfConcr* concr,
int max_fid, GuPool* pool)
{ {
PgfParse* parse = gu_new(PgfParse, pool); PgfParse* parse = gu_new(PgfParse, pool);
parse->abstr = abstr;
parse->concr = concr; parse->concr = concr;
parse->agenda = NULL; parse->agenda = NULL;
parse->max_fid = max_fid; parse->max_fid = max_fid;
@@ -950,7 +969,7 @@ pgf_parse_token(PgfParse* parse, PgfToken tok, bool robust, GuPool* pool)
PgfParse* next_parse = NULL; PgfParse* next_parse = NULL;
if (gu_buf_length(agenda) > 0) { if (gu_buf_length(agenda) > 0) {
next_parse = pgf_new_parse(parse->concr, parse->max_fid, pool); next_parse = pgf_new_parse(parse->abstr, parse->concr, parse->max_fid, pool);
next_parse->agenda = agenda; next_parse->agenda = agenda;
next_parse->max_fid= parsing->max_fid; next_parse->max_fid= parsing->max_fid;
} }
@@ -959,131 +978,150 @@ pgf_parse_token(PgfParse* parse, PgfToken tok, bool robust, GuPool* pool)
return next_parse; return next_parse;
} }
static PgfExpr
pgf_cat_to_expr(PgfConcr* concr, PgfCCat* cat,
PgfCCatBuf* visited, GuChoice* choice,
GuPool* pool);
static PgfExpr int cmp_expr_prob(GuOrder* self, const void* a, const void* b)
pgf_production_to_expr(PgfConcr* concr, PgfProduction prod,
PgfCCatBuf* visited, GuChoice* choice,
GuPool* pool)
{ {
GuVariantInfo pi = gu_variant_open(prod); PgfExprPState *s1 = (PgfExprPState *) a;
switch (pi.tag) { PgfExprPState *s2 = (PgfExprPState *) b;
case PGF_PRODUCTION_APPLY: {
PgfProductionApply* papp = pi.data; if (s1->prob < s2->prob)
PgfExpr expr = gu_new_variant_i(pool, PGF_EXPR_FUN, return -1;
PgfExprFun, else if (s1->prob > s2->prob)
.fun = papp->fun->fun); return 1;
size_t n_args = gu_seq_length(papp->args); else
for (size_t i = 0; i < n_args; i++) { return 0;
PgfPArg* parg = gu_seq_index(papp->args, PgfPArg, i);
gu_assert(!parg->hypos || !parg->hypos->len);
PgfExpr earg = pgf_cat_to_expr(concr, parg->ccat, visited, choice, pool);
if (gu_variant_is_null(earg))
return gu_null_variant;
expr = gu_new_variant_i(pool, PGF_EXPR_APP,
PgfExprApp,
.fun = expr, .arg = earg);
}
return expr;
}
case PGF_PRODUCTION_COERCE: {
PgfProductionCoerce* pcoerce = pi.data;
return pgf_cat_to_expr(concr, pcoerce->coerce, visited, choice, pool);
}
case PGF_PRODUCTION_META: {
PgfProductionMeta* pmeta = pi.data;
PgfExpr expr = gu_new_variant_i(pool, PGF_EXPR_META,
PgfExprMeta,
.id = 0);
size_t n_args = gu_seq_length(pmeta->args);
for (size_t i = 0; i < n_args; i++) {
PgfPArg* parg = gu_seq_index(pmeta->args, PgfPArg, i);
gu_assert(!parg->hypos || !parg->hypos->len);
PgfExpr earg = pgf_cat_to_expr(concr, parg->ccat, visited, choice, pool);
if (gu_variant_is_null(earg))
return gu_null_variant;
expr = gu_new_variant_i(pool, PGF_EXPR_APP,
PgfExprApp,
.fun = expr, .arg = earg);
}
return expr;
}
default:
gu_impossible();
}
return gu_null_variant;
} }
static PgfExpr static GuOrder
pgf_cat_to_expr(PgfConcr* concr, PgfCCat* cat, pgf_expr_state_order = { cmp_expr_prob };
PgfCCatBuf* visited, GuChoice* choice,
GuPool* pool)
{
if (cat->fid < concr->total_cats) {
// XXX: What should the PgfMetaId be?
return gu_new_variant_i(pool, PGF_EXPR_META,
PgfExprMeta,
.id = 0);
}
size_t n_prods = gu_seq_length(cat->prods); static void
for (size_t i = 0; i < gu_buf_length(visited); i++) { pgf_parse_result_from_ccat(PgfParseResult *result, PgfCCat *ccat,
if (gu_buf_get(visited, PgfCCat*, i) == cat) { PgfExprPState *ps, GuPool *pool)
n_prods = 0; {
size_t n_prods = gu_seq_length(ccat->prods);
for (size_t i = 0; i < n_prods; i++) {
PgfProduction prod =
gu_seq_get(ccat->prods, PgfProduction, i);
GuVariantInfo pi = gu_variant_open(prod);
switch (pi.tag) {
case PGF_PRODUCTION_APPLY: {
PgfProductionApply* papp = pi.data;
PgfFunDecl *fun_decl =
gu_map_get(result->abstr->funs, &papp->fun->fun, PgfFunDecl*);
gu_assert(fun_decl != NULL);
double prob = ps->prob - log(fun_decl->prob);
PgfExprState *state = gu_new(PgfExprState, result->tmp_pool);
state->prev = ps->state;
PgfExprFun *expr_fun =
gu_new_variant(PGF_EXPR_FUN,
PgfExprFun,
&state->expr, pool);
expr_fun->fun = papp->fun->fun;
state->args = papp->args;
state->arg_idx = 0;
PgfExprPState ps1 = { prob, state };
gu_buf_heap_push(result->pqueue, &pgf_expr_state_order, &ps1);
break; break;
} }
} case PGF_PRODUCTION_COERCE: {
gu_buf_push(visited, PgfCCat*, cat); PgfProductionCoerce* pcoerce = pi.data;
pgf_parse_result_from_ccat(result, pcoerce->coerce, ps, pool);
break;
}
case PGF_PRODUCTION_META: {
PgfProductionMeta* pmeta = pi.data;
int i = gu_choice_next(choice, n_prods); double prob = ps->prob + 100000000000 + rand();
if (i == -1) {
return gu_null_variant; PgfExprState *state = gu_new(PgfExprState, result->tmp_pool);
state->prev = ps->state;
PgfExprMeta *expr_meta =
gu_new_variant(PGF_EXPR_META,
PgfExprMeta,
&state->expr, pool);
expr_meta->id = 0;
state->args = pmeta->args;
state->arg_idx = 0;
PgfExprPState ps1 = { prob, state };
gu_buf_heap_push(result->pqueue, &pgf_expr_state_order, &ps1);
break;
}
}
} }
PgfProduction prod = gu_seq_get(cat->prods, PgfProduction, i);
return pgf_production_to_expr(concr, prod, visited, choice, pool);
} }
static PgfExpr static PgfExpr
pgf_parse_result_next(PgfParseResult* pr, GuPool* pool) pgf_parse_result_next(PgfParseResult *result, GuPool *pool)
{ {
if (pr->choice == NULL) { if (result->pqueue == NULL)
return gu_null_variant; return gu_null_variant;
while (gu_buf_length(result->pqueue) > 0) {
PgfExprPState ps;
gu_buf_heap_pop(result->pqueue, &pgf_expr_state_order, &ps);
if (ps.state->arg_idx >= gu_seq_length(ps.state->args)) {
PgfExprState *prev = ps.state->prev;
if (prev == NULL)
return ps.state->expr;
PgfExprState *next = gu_new(PgfExprState, result->tmp_pool);
next->prev = prev->prev;
PgfExprApp *expr_apply =
gu_new_variant(PGF_EXPR_APP,
PgfExprApp,
&next->expr, pool);
expr_apply->fun = prev->expr;
expr_apply->arg = ps.state->expr;
next->args = prev->args;
next->arg_idx = prev->arg_idx;
PgfExprPState ps1 = { ps.prob, next };
gu_buf_heap_push(result->pqueue, &pgf_expr_state_order, &ps1);
} else {
PgfPArg *arg = gu_seq_index(ps.state->args, PgfPArg, ps.state->arg_idx++);
if (arg->ccat->fid < result->concr->total_cats) {
PgfExpr fun = ps.state->expr;
PgfExpr arg;
PgfExprMeta *expr_meta =
gu_new_variant(PGF_EXPR_META,
PgfExprMeta,
&arg, pool);
expr_meta->id = 0;
PgfExprApp *expr_apply =
gu_new_variant(PGF_EXPR_APP,
PgfExprApp,
&ps.state->expr, pool);
expr_apply->fun = fun;
expr_apply->arg = arg;
gu_buf_heap_push(result->pqueue, &pgf_expr_state_order, &ps);
} else {
pgf_parse_result_from_ccat(result, arg->ccat, &ps, pool);
}
}
} }
PgfExpr ret = gu_null_variant; gu_pool_free(result->tmp_pool);
result->tmp_pool = NULL;
do { result->pqueue = NULL;
size_t n_results = gu_buf_length(pr->completed); return gu_null_variant;
GuChoiceMark mark = gu_choice_mark(pr->choice);
int i = gu_choice_next(pr->choice, n_results);
if (i == -1) {
return gu_null_variant;
}
PgfCCat* cat = gu_buf_get(pr->completed, PgfCCat*, i);
GuPool* tmp_pool = gu_new_pool();
PgfCCatBuf* visited = gu_new_buf(PgfCCat*, tmp_pool);
ret = pgf_cat_to_expr(pr->concr, cat, visited, pr->choice, pool);
gu_pool_free(tmp_pool);
gu_choice_reset(pr->choice, mark);
if (!gu_choice_advance(pr->choice)) {
pr->choice = NULL;
};
} while (gu_variant_is_null(ret));
return ret;
} }
static void static void
pgf_parse_result_enum_next(GuEnum* self, void* to, GuPool* pool) pgf_parse_result_enum_next(GuEnum* self, void* to, GuPool* pool)
{ {
PgfParseResult* pr = gu_container(self, PgfParseResult, en); PgfParseResult* result = gu_container(self, PgfParseResult, en);
*(PgfExpr*)to = pgf_parse_result_next(pr, pool); *(PgfExpr*)to = pgf_parse_result_next(result, pool);
} }
static static
@@ -1096,30 +1134,41 @@ pgf_parse_result(PgfParse* parse, GuPool* pool)
{ {
PgfLexCallback fn = { pgf_noop }; PgfLexCallback fn = { pgf_noop };
GuPool* tmp_pool = gu_new_pool(); GuPool* parsing_pool = gu_new_pool();
PgfParsing* parsing = pgf_new_parsing(parse->concr, &fn, parse->max_fid, pool, tmp_pool); PgfParsing* parsing = pgf_new_parsing(parse->concr, &fn, parse->max_fid,
pool, parsing_pool);
size_t n_items = gu_buf_length(parse->agenda); size_t n_items = gu_buf_length(parse->agenda);
for (size_t i = 0; i < n_items; i++) { for (size_t i = 0; i < n_items; i++) {
PgfItem* item = gu_buf_get(parse->agenda, PgfItem*, i); PgfItem* item = gu_buf_get(parse->agenda, PgfItem*, i);
pgf_parsing_item(parsing, item); pgf_parsing_item(parsing, item);
} }
PgfCCatBuf *completed = parsing->completed;
gu_pool_free(parsing_pool);
PgfExprEnum* en = GuPool *states_pool = gu_new_pool();
&gu_new_i(pool, PgfParseResult, PgfParseResult *result =
gu_new_i(pool, PgfParseResult,
.abstr = parse->abstr,
.concr = parse->concr, .concr = parse->concr,
.completed = parsing->completed, .tmp_pool = states_pool,
.choice = gu_new_choice(pool), .pqueue = gu_new_buf(PgfExprPState, states_pool),
.en.next = pgf_parse_result_enum_next)->en; .en.next = pgf_parse_result_enum_next);
gu_pool_free(tmp_pool); size_t n_completed = gu_buf_length(completed);
return en; for (size_t i = 0; i < n_completed; i++) {
PgfCCat *ccat = gu_buf_get(completed, PgfCCat*, i);
PgfExprPState ps = { 0, NULL };
pgf_parse_result_from_ccat(result, ccat, &ps, pool);
}
return &result->en;
} }
// TODO: s/CId/Cat, add the cid to Cat, make Cat the key to CncCat // TODO: s/CId/Cat, add the cid to Cat, make Cat the key to CncCat
PgfParse* PgfParse*
pgf_parser_parse(PgfConcr* concr, PgfCId cat, size_t lin_idx, GuPool* pool) pgf_parser_parse(PgfAbstr* abstr, PgfConcr* concr, PgfCId cat, size_t lin_idx, GuPool* pool)
{ {
PgfCncCat* cnccat = PgfCncCat* cnccat =
gu_map_get(concr->cnccats, &cat, PgfCncCat*); gu_map_get(concr->cnccats, &cat, PgfCncCat*);
@@ -1129,7 +1178,7 @@ pgf_parser_parse(PgfConcr* concr, PgfCId cat, size_t lin_idx, GuPool* pool)
} }
gu_assert(lin_idx < cnccat->n_lins); gu_assert(lin_idx < cnccat->n_lins);
PgfParse* parse = pgf_new_parse(concr, concr->max_fid, pool); PgfParse* parse = pgf_new_parse(abstr, concr, concr->max_fid, pool);
parse->agenda = gu_new_buf(PgfItem*, pool); parse->agenda = gu_new_buf(PgfItem*, pool);
PgfItemBuf* conts = gu_new_buf(PgfItem*, pool); PgfItemBuf* conts = gu_new_buf(PgfItem*, pool);

View File

@@ -33,7 +33,7 @@ typedef struct PgfParse PgfParse;
/// Begin parsing /// Begin parsing
PgfParse* PgfParse*
pgf_parser_parse(PgfConcr* concr, PgfCId cat, size_t lin_idx, GuPool* pool); pgf_parser_parse(PgfAbstr* abstr, PgfConcr* concr, PgfCId cat, size_t lin_idx, GuPool* pool);
/**< /**<
* @param parser The parser to use * @param parser The parser to use
* *

View File

@@ -24,14 +24,23 @@ int main(int argc, char* argv[]) {
GuPool* pool = gu_new_pool(); GuPool* pool = gu_new_pool();
int status = EXIT_SUCCESS; int status = EXIT_SUCCESS;
if (argc != 5) { if (argc != 5) {
fprintf(stderr, "usage: %s pgf cat from_lang to_lang\n", argv[0]); fprintf(stderr, "usage: %s pgf [.]cat from_lang to_lang\n", argv[0]);
status = EXIT_FAILURE; status = EXIT_FAILURE;
goto fail; goto fail;
} }
char* filename = argv[1]; char* filename = argv[1];
// Transform C strings to libgu strings GuString cat;
GuString cat = gu_str_string(argv[2], pool); bool robust_mode;
if (argv[2][0] == '.') {
printf("%s\n", argv[2]+1);
cat = gu_str_string(argv[2]+1, pool);
robust_mode = true;
} else {
cat = gu_str_string(argv[2], pool);
robust_mode = false;
}
GuString from_lang = gu_str_string(argv[3], pool); GuString from_lang = gu_str_string(argv[3], pool);
GuString to_lang = gu_str_string(argv[4], pool); GuString to_lang = gu_str_string(argv[4], pool);
@@ -107,7 +116,7 @@ int main(int argc, char* argv[]) {
// Begin parsing a sentence of the specified category // Begin parsing a sentence of the specified category
PgfParse* parse = PgfParse* parse =
pgf_parser_parse(from_concr, cat, lin_idx, pool); pgf_parser_parse(&pgf->abstract, from_concr, cat, lin_idx, pool);
if (parse == NULL) { if (parse == NULL) {
fprintf(stderr, "Couldn't begin parsing\n"); fprintf(stderr, "Couldn't begin parsing\n");
status = EXIT_FAILURE; status = EXIT_FAILURE;
@@ -120,7 +129,7 @@ int main(int argc, char* argv[]) {
GuString tok_s = gu_str_string(tok, pool); GuString tok_s = gu_str_string(tok, pool);
gu_debug("parsing token \"%s\"", tok); gu_debug("parsing token \"%s\"", tok);
// feed the token to get a new parse state // feed the token to get a new parse state
parse = pgf_parse_token(parse, tok_s, true, ppool); parse = pgf_parse_token(parse, tok_s, robust_mode, ppool);
if (!parse) { if (!parse) {
fprintf(stderr, fprintf(stderr,
"Unexpected token: \"%s\"\n", tok); "Unexpected token: \"%s\"\n", tok);
@@ -139,6 +148,9 @@ int main(int argc, char* argv[]) {
while (true) { while (true) {
PgfExpr expr = gu_next(result, PgfExpr, ppool); PgfExpr expr = gu_next(result, PgfExpr, ppool);
clock_t end = clock();
// The enumerator will return a null variant at the // The enumerator will return a null variant at the
// end of the results. // end of the results.
if (gu_variant_is_null(expr)) { if (gu_variant_is_null(expr)) {
@@ -149,6 +161,12 @@ int main(int argc, char* argv[]) {
pgf_print_expr(expr, 0, wtr, err); pgf_print_expr(expr, 0, wtr, err);
gu_putc('\n', wtr, err); gu_putc('\n', wtr, err);
if (robust_mode) {
double cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;
printf("%.2f sec\n", cpu_time_used);
break;
}
// Enumerate the concrete syntax trees corresponding // Enumerate the concrete syntax trees corresponding
// to the abstract tree. // to the abstract tree.
GuEnum* cts = pgf_lzr_concretize(to_concr, expr, ppool); GuEnum* cts = pgf_lzr_concretize(to_concr, expr, ppool);