diff --git a/src/runtime/c/Makefile.am b/src/runtime/c/Makefile.am index e9dc866a5..47dd082fc 100644 --- a/src/runtime/c/Makefile.am +++ b/src/runtime/c/Makefile.am @@ -121,8 +121,7 @@ libpgf_la_SOURCES = \ bin_PROGRAMS = \ utils/pgf-print \ utils/pgf-translate \ - utils/pgf-parse \ - utils/pgf-chunk + utils/pgf-parse utils_pgf_print_SOURCES = utils/pgf-print.c utils_pgf_print_LDADD = libpgf.la libgu.la @@ -133,9 +132,6 @@ utils_pgf_translate_LDADD = libpgf.la libgu.la utils_pgf_parse_SOURCES = utils/pgf-parse.c utils_pgf_parse_LDADD = libpgf.la libgu.la -utils_pgf_chunk_SOURCES = utils/pgf-chunk.c -utils_pgf_chunk_LDADD = libpgf.la libgu.la - AUTOMAKE_OPTIONS = foreign subdir-objects dist-bzip2 ACLOCAL_AMFLAGS = -I m4 include doxygen.am diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c index a1d8084ed..899628f6a 100644 --- a/src/runtime/c/pgf/parser.c +++ b/src/runtime/c/pgf/parser.c @@ -1638,7 +1638,8 @@ pgf_parsing_default_beam_size(PgfConcr* concr) } static PgfParsing* -pgf_new_parsing(PgfConcr* concr, GuPool* pool, GuPool* out_pool) +pgf_new_parsing(PgfConcr* concr, double heuristics, + GuPool* pool, GuPool* out_pool) { PgfParsing* ps = gu_new(PgfParsing, pool); ps->concr = concr; @@ -1654,7 +1655,7 @@ pgf_new_parsing(PgfConcr* concr, GuPool* pool, GuPool* out_pool) ps->prod_full_count = 0; #endif ps->free_item = NULL; - ps->beam_size = pgf_parsing_default_beam_size(concr); + ps->beam_size = heuristics; PgfExprMeta *expr_meta = gu_new_variant(PGF_EXPR_META, @@ -2214,6 +2215,7 @@ pgf_parse_print_chunks(PgfParseState* state) // TODO: s/CId/Cat, add the cid to Cat, make Cat the key to CncCat PgfParseState* pgf_parser_init_state(PgfConcr* concr, PgfCId cat, size_t lin_idx, + double heuristics, GuPool* pool, GuPool* out_pool) { PgfCncCat* cnccat = @@ -2223,8 +2225,12 @@ pgf_parser_init_state(PgfConcr* concr, PgfCId cat, size_t lin_idx, gu_assert(lin_idx < cnccat->n_lins); + if (heuristics < 0) { + heuristics = pgf_parsing_default_beam_size(concr); + } + PgfParsing* ps = - pgf_new_parsing(concr, pool, out_pool); + pgf_new_parsing(concr, heuristics, pool, out_pool); PgfParseState* state = pgf_new_parse_state(ps, NULL, NULL, pool); @@ -2269,12 +2275,6 @@ pgf_parser_init_state(PgfConcr* concr, PgfCId cat, size_t lin_idx, return state; } -void -pgf_parser_set_beam_size(PgfParseState* state, double beam_size) -{ - state->ps->beam_size = beam_size; -} - void pgf_parser_add_literal(PgfConcr *concr, PgfCId cat, PgfLiteralCallback* callback) diff --git a/src/runtime/c/pgf/parser.h b/src/runtime/c/pgf/parser.h index 8c4ba77e9..b49cba868 100644 --- a/src/runtime/c/pgf/parser.h +++ b/src/runtime/c/pgf/parser.h @@ -34,6 +34,7 @@ typedef struct PgfParseState PgfParseState; /// Begin parsing PgfParseState* pgf_parser_init_state(PgfConcr* concr, PgfCId cat, size_t lin_idx, + double heuristics, GuPool* pool, GuPool* out_pool); /**< * @param parser The parser to use @@ -69,9 +70,6 @@ pgf_parser_next_state(PgfParseState* prev, PgfToken tok); GuEnum* pgf_parser_completions(PgfParseState* prev, GuString prefix); -void -pgf_parser_set_beam_size(PgfParseState* state, double beam_size); - void pgf_parser_add_literal(PgfConcr *concr, PgfCId cat, PgfLiteralCallback* callback); diff --git a/src/runtime/c/pgf/parseval.c b/src/runtime/c/pgf/parseval.c index 70b2666fd..eed216b82 100644 --- a/src/runtime/c/pgf/parseval.c +++ b/src/runtime/c/pgf/parseval.c @@ -157,7 +157,7 @@ pgf_parseval(PgfConcr* concr, PgfExpr expr, PgfCId cat, PgfMetricsLznState state; state.funcs = &pgf_metrics_lin_funcs1; - state.ps = pgf_parser_init_state(concr, cat, 0, pool, pool); + state.ps = pgf_parser_init_state(concr, cat, 0, -1, pool, pool); state.marks = gu_new_buf(int, pool); state.pos = 0; state.phrases = gu_new_buf(PgfPhrase*, pool); diff --git a/src/runtime/c/pgf/pgf.c b/src/runtime/c/pgf/pgf.c index f1b85cae3..95b2132f5 100644 --- a/src/runtime/c/pgf/pgf.c +++ b/src/runtime/c/pgf/pgf.c @@ -210,10 +210,18 @@ pgf_linearize(PgfConcr* concr, PgfExpr expr, GuWriter* wtr, GuExn* err) GuEnum* pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool, GuPool* out_pool) +{ + return pgf_parse_with_heuristics(concr, cat, lexer, -1.0, pool, out_pool); +} + +GuEnum* +pgf_parse_with_heuristics(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, + double heuristics, + GuPool* pool, GuPool* out_pool) { // Begin parsing a sentence of the specified category PgfParseState* state = - pgf_parser_init_state(concr, cat, 0, pool, out_pool); + pgf_parser_init_state(concr, cat, 0, heuristics, pool, out_pool); if (state == NULL) { return NULL; } @@ -244,7 +252,7 @@ pgf_get_completions(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, { // Begin parsing a sentence of the specified category PgfParseState* state = - pgf_parser_init_state(concr, cat, 0, pool, pool); + pgf_parser_init_state(concr, cat, 0, -1, pool, pool); if (state == NULL) { return NULL; } @@ -268,31 +276,3 @@ pgf_get_completions(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, // Now begin enumerating the resulting syntax trees return pgf_parser_completions(state, prefix); } - -void -pgf_print_chunks(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool) -{ - // Begin parsing a sentence of the specified category - PgfParseState* state = - pgf_parser_init_state(concr, cat, 0, pool, pool); - if (state == NULL) { - printf("\n"); - return; - } - - // Tokenization - GuExn* lex_err = gu_new_exn(NULL, gu_kind(type), pool); - PgfToken tok = pgf_lexer_read_token(lexer, lex_err); - while (!gu_exn_is_raised(lex_err)) { - // feed the token to get a new parse state - state = pgf_parser_next_state(state, tok); - if (state == NULL) { - printf("\n"); - return; - } - - tok = pgf_lexer_read_token(lexer, lex_err); - } - - pgf_parse_print_chunks(state); -} diff --git a/src/runtime/c/pgf/pgf.h b/src/runtime/c/pgf/pgf.h index d83598cc0..2e7e43584 100644 --- a/src/runtime/c/pgf/pgf.h +++ b/src/runtime/c/pgf/pgf.h @@ -117,6 +117,11 @@ PgfExprEnum* pgf_parse(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool, GuPool* out_pool); +PgfExprEnum* +pgf_parse_with_heuristics(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, + double heuristics, + GuPool* pool, GuPool* out_pool); + GuEnum* pgf_get_completions(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuString prefix, GuPool* pool); @@ -128,11 +133,6 @@ pgf_parseval(PgfConcr* concr, PgfExpr expr, PgfCId cat, PgfExprEnum* pgf_generate(PgfPGF* pgf, PgfCId cat, GuPool* pool); -// an experimental function. Please don't use it -void -pgf_print_chunks(PgfConcr* concr, PgfCId cat, PgfLexer *lexer, GuPool* pool); - - /// @} void diff --git a/src/runtime/c/utils/pgf-chunk.c b/src/runtime/c/utils/pgf-chunk.c deleted file mode 100644 index 5f4b8972a..000000000 --- a/src/runtime/c/utils/pgf-chunk.c +++ /dev/null @@ -1,112 +0,0 @@ -// Don't give too much hope to this script. It is doing the wrong thing -// but let's see how far we can get with it. - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -int main(int argc, char* argv[]) { - // Set the character locale, so we can produce proper output. - setlocale(LC_CTYPE, ""); - - // Create the pool that is used to allocate everything - GuPool* pool = gu_new_pool(); - int status = EXIT_SUCCESS; - if (argc != 4) { - fprintf(stderr, "usage: %s pgf cat from_lang\n", argv[0]); - status = EXIT_FAILURE; - goto fail; - } - char* filename = argv[1]; - - GuString cat = gu_str_string(argv[2], pool); - - GuString from_lang = gu_str_string(argv[3], pool); - - // Create an exception frame that catches all errors. - GuExn* err = gu_new_exn(NULL, gu_kind(type), pool); - - // Read the PGF grammar. - PgfPGF* pgf = pgf_read(filename, pool, err); - - // If an error occured, it shows in the exception frame - if (!gu_ok(err)) { - fprintf(stderr, "Reading PGF failed\n"); - status = EXIT_FAILURE; - goto fail; - } - - pgf_load_meta_child_probs(pgf, "../../../treebanks/PennTreebank/ParseEngAbs3.probs", pool, err); - if (!gu_ok(err)) { - fprintf(stderr, "Loading meta child probs failed\n"); - status = EXIT_FAILURE; - goto fail; - } - - // Look up the source and destination concrete categories - PgfConcr* from_concr = pgf_get_language(pgf, from_lang); - if (!from_concr) { - fprintf(stderr, "Unknown language\n"); - status = EXIT_FAILURE; - goto fail_concr; - } - - // Register a callback for the literal category Symbol - pgf_parser_add_literal(from_concr, gu_str_string("Symb", pool), - &pgf_nerc_literal_callback); - - // We will keep the latest results in the 'ppool' and - // we will iterate over them by using 'result'. - GuPool* ppool = NULL; - - // The interactive translation loop. - // XXX: This currently reads stdin directly, so it doesn't support - // encodings properly. TODO: use a locale reader for input - while (true) { - char buf[4096]; - char* line = fgets(buf, sizeof(buf), stdin); - if (line == NULL) { - if (ferror(stdin)) { - fprintf(stderr, "Input error\n"); - status = EXIT_FAILURE; - } - break; - } else if (strcmp(line, "") == 0) { - // End nicely on empty input - break; - } - - // We create a temporary pool for translating a single - // sentence, so our memory usage doesn't increase over time. - ppool = gu_new_pool(); - - GuReader *rdr = - gu_string_reader(gu_str_string(line, ppool), ppool); - PgfLexer *lexer = - pgf_new_simple_lexer(rdr, ppool); - - pgf_print_chunks(from_concr, cat, lexer, ppool); - - // Free all resources allocated during parsing and linearization - gu_pool_free(ppool); - } -fail_concr: -fail: - gu_pool_free(pool); - return status; -} diff --git a/src/runtime/c/utils/pgf-parse.c b/src/runtime/c/utils/pgf-parse.c index a05d7988b..ba1088890 100644 --- a/src/runtime/c/utils/pgf-parse.c +++ b/src/runtime/c/utils/pgf-parse.c @@ -25,8 +25,8 @@ int main(int argc, char* argv[]) { // Create the pool that is used to allocate everything GuPool* pool = gu_new_pool(); int status = EXIT_SUCCESS; - if (argc != 4) { - fprintf(stderr, "usage: %s pgf-file start-cat cnc-lang\n", argv[0]); + if (argc < 4 || argc > 5) { + fprintf(stderr, "usage: %s pgf-file start-cat cnc-lang [heuristics]\n(0.0 <= heuristics < 1.0, default: 0.95)\n", argv[0]); status = EXIT_FAILURE; goto fail; } @@ -34,6 +34,11 @@ int main(int argc, char* argv[]) { GuString cat = gu_str_string(argv[2], pool); GuString lang = gu_str_string(argv[3], pool); + double heuristics = 0.95; + if (argc == 5) { + heuristics = atof(argv[4]); + } + // Create an exception frame that catches all errors. GuExn* err = gu_new_exn(NULL, gu_kind(type), pool); @@ -65,7 +70,7 @@ int main(int argc, char* argv[]) { clock_t end = clock(); double cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC; - fprintf(stderr, "(%.0f ms) Ready to parse!\n", 1000.0 * cpu_time_used); + fprintf(stderr, "(%.0f ms) Ready to parse [heuristics=%.2f]!\n", 1000.0 * cpu_time_used, heuristics); // Create an output stream for stdout GuOut* out = gu_file_out(stdout, pool); @@ -113,18 +118,9 @@ int main(int argc, char* argv[]) { clock_t start = clock(); - // Begin parsing a sentence of the specified category - PgfParseState* state = - pgf_parser_init_state(concr, cat, 0, ppool, ppool); - if (state == NULL) { - fprintf(stderr, "Couldn't begin parsing\n"); - status = EXIT_FAILURE; - break; - } - GuReader *rdr = gu_string_reader(gu_str_string(line, ppool), ppool); PgfLexer *lexer = pgf_new_simple_lexer(rdr, ppool); - GuEnum* result = pgf_parse(concr, cat, lexer, ppool, ppool); + GuEnum* result = pgf_parse_with_heuristics(concr, cat, lexer, heuristics, ppool, ppool); PgfExprProb* ep = NULL; if (result != NULL) diff --git a/src/runtime/python/pypgf.c b/src/runtime/python/pypgf.c index dc2d18bfa..1c7cd5edc 100644 --- a/src/runtime/python/pypgf.c +++ b/src/runtime/python/pypgf.c @@ -692,15 +692,16 @@ void pypgf_container_descructor(PyObject *capsule) static IterObject* Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds) { - static char *kwlist[] = {"sentence", "tokens", "cat", "n", NULL}; + static char *kwlist[] = {"sentence", "tokens", "cat", "n", "heuristics", NULL}; int len; const uint8_t *buf = NULL; PyObject* py_lexer = NULL; const char *catname_s = NULL; int max_count = -1; - if (!PyArg_ParseTupleAndKeywords(args, keywds, "|s#Osi", kwlist, - &buf, &len, &py_lexer, &catname_s, &max_count)) + double heuristics = -1; + if (!PyArg_ParseTupleAndKeywords(args, keywds, "|s#Osid", kwlist, + &buf, &len, &py_lexer, &catname_s, &max_count, &heuristics)) return NULL; if ((buf == NULL && py_lexer == NULL) || @@ -752,7 +753,8 @@ Concr_parse(ConcrObject* self, PyObject *args, PyObject *keywds) } pyres->res = - pgf_parse(self->concr, catname, lexer, pyres->pool, out_pool); + pgf_parse_with_heuristics(self->concr, catname, lexer, + heuristics, pyres->pool, out_pool); if (pyres->res == NULL) { Py_DECREF(pyres); @@ -1217,7 +1219,12 @@ static PyMethodDef Concr_methods[] = { "Returns the print name of a function or category" }, {"parse", (PyCFunction)Concr_parse, METH_VARARGS | METH_KEYWORDS, - "Parses a string and returns an iterator over the abstract trees for this sentence" + "Parses a string and returns an iterator over the abstract trees for this sentence\n\n" + "Named arguments:\n" + "- sentence (string) or tokens (list of strings)\n" + "- cat (string); OPTIONAL, default: the startcat of the grammar\n" + "- n (int), max. trees; OPTIONAL, default: extract all trees\n" + "- heuristics (double >= 0.0); OPTIONAL, default: taken from the flags in the grammar" }, {"getCompletions", (PyCFunction)Concr_getCompletions, METH_VARARGS | METH_KEYWORDS, "Parses a partial string and returns a list with the top n possible next tokens"