From 1863e4c3d6babe2176e5444f9f6a01f942e257e5 Mon Sep 17 00:00:00 2001 From: "kr.angelov" Date: Mon, 3 Dec 2012 10:07:54 +0000 Subject: [PATCH] added experimental script for chunking in the C runtime --- src/runtime/c/Makefile.am | 6 +- src/runtime/c/pgf/parser.c | 100 +++++++++++++++++++- src/runtime/c/pgf/parser.h | 9 +- src/runtime/c/utils/pgf-chunk.c | 160 ++++++++++++++++++++++++++++++++ 4 files changed, 267 insertions(+), 8 deletions(-) create mode 100644 src/runtime/c/utils/pgf-chunk.c diff --git a/src/runtime/c/Makefile.am b/src/runtime/c/Makefile.am index f30a909ee..3757dbbf4 100644 --- a/src/runtime/c/Makefile.am +++ b/src/runtime/c/Makefile.am @@ -116,7 +116,8 @@ libpgf_la_SOURCES = \ bin_PROGRAMS = \ utils/pgf2yaml \ utils/pgf-print \ - utils/pgf-translate + utils/pgf-translate \ + utils/pgf-chunk utils_pgf2yaml_SOURCES = utils/pgf2yaml.c utils_pgf2yaml_LDADD = libpgf.la libgu.la @@ -127,6 +128,9 @@ utils_pgf_print_LDADD = libpgf.la libgu.la utils_pgf_translate_SOURCES = utils/pgf-translate.c utils_pgf_translate_LDADD = libpgf.la libgu.la +utils_pgf_chunk_SOURCES = utils/pgf-chunk.c +utils_pgf_chunk_LDADD = libpgf.la libgu.la + AUTOMAKE_OPTIONS = foreign subdir-objects dist-bzip2 ACLOCAL_AMFLAGS = -I m4 include doxygen.am diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c index ad792ebb4..6159ab859 100644 --- a/src/runtime/c/pgf/parser.c +++ b/src/runtime/c/pgf/parser.c @@ -85,9 +85,7 @@ struct PgfParseState { PgfItem* meta_item; PgfContsMap* conts_map; PgfGenCatMap* generated_cats; -#ifdef PGF_PARSER_DEBUG unsigned short offset; -#endif prob_t viterbi_prob; @@ -1630,9 +1628,7 @@ pgf_new_parse_state(PgfParsing* ps, state->meta_item = NULL; state->generated_cats = gu_map_type_new(PgfGenCatMap, pool); state->conts_map = gu_map_type_new(PgfContsMap, pool); -#ifdef PGF_PARSER_DEBUG state->offset = next ? 
next->offset+1 : 0; -#endif state->viterbi_prob = 0; state->ps = ps; state->ts = ts; @@ -1884,6 +1880,102 @@ pgf_parse_result(PgfParseState* state, GuPool* pool) return en; } +void +pgf_parse_print_chunks(PgfParseState* state) +{ + if (state->ps->completed == NULL) { + while (state->ps->completed == NULL) { + if (!pgf_parsing_proceed(state)) + break; + } + if (state->ps->completed == NULL) + return; + } + + GuPool* tmp_pool = gu_new_pool(); + GuOut* out = gu_file_out(stdout, tmp_pool); + GuWriter* wtr = gu_new_utf8_writer(out, tmp_pool); + GuExn* err = gu_exn(NULL, type, tmp_pool); + + PgfCCat* completed = state->ps->completed; + if (gu_seq_length(completed->prods) == 0) + return; + + size_t n_args = 0; + size_t arg_idx = 0; + PgfCCat* ccat = NULL; + PgfProductionMeta* pmeta = NULL; + + PgfProduction prod = gu_seq_get(completed->prods, PgfProduction, 0); + GuVariantInfo pi = gu_variant_open(prod); + switch (pi.tag) { + case PGF_PRODUCTION_APPLY: + n_args = 1; + arg_idx = 0; + ccat = completed; + break; + case PGF_PRODUCTION_META: + pmeta = pi.data; + n_args = gu_seq_length(pmeta->args); + arg_idx = 0; + ccat = gu_seq_index(pmeta->args, PgfPArg, arg_idx)->ccat; + break; + } + + PgfParseState* next = NULL; + while (state != NULL) { + PgfParseState* tmp = state->next; + state->next = next; + next = state; + state = tmp; + } + + int offset = 0; + + state = next; + next = NULL; + while (state != NULL) { + if (state->ts != NULL) + { + if (ccat != NULL && + offset == ((ccat->conts->state != NULL) ? ccat->conts->state->offset : 0)) { + PgfCCat *ccat2 = ccat; + while (ccat2->conts != NULL) { + ccat2 = ccat2->conts->ccat; + } + + gu_putc('(', wtr, err); + gu_string_write(ccat2->cnccat->abscat->name, wtr, err); + gu_putc(' ', wtr, err); + } + + gu_string_write(state->ts->tok, wtr, err); + offset++; + + if (ccat != NULL && + ccat == + gu_map_get(state->generated_cats, ccat->conts, PgfCCat*)) { + gu_putc(')', wtr, err); + + arg_idx++; + ccat = + (arg_idx >= n_args) ? 
+ NULL : + gu_seq_index(pmeta->args, PgfPArg, arg_idx)->ccat; + } + + gu_putc(' ', wtr, err); + } + + PgfParseState* tmp = state->next; + state->next = next; + next = state; + state = tmp; + } + gu_putc('\n', wtr, err); + + gu_pool_free(tmp_pool); +} // TODO: s/CId/Cat, add the cid to Cat, make Cat the key to CncCat PgfParseState* diff --git a/src/runtime/c/pgf/parser.h b/src/runtime/c/pgf/parser.h index 65997f601..dcc3ca3af 100644 --- a/src/runtime/c/pgf/parser.h +++ b/src/runtime/c/pgf/parser.h @@ -102,9 +102,12 @@ pgf_parse_result(PgfParseState* state, GuPool* pool); * succesful, or ambiguously successful. */ -PgfExpr -pgf_parse_best_result(PgfParseState* state, GuPool* pool); - +// Use this procedure only at your own risk. +// It is dirty and it will probably be removed or replaced +// with something else. Currently it is here only for experimental +// purposes. +void +pgf_parse_print_chunks(PgfParseState* state); size_t pgf_item_lin_idx(PgfItem* item); diff --git a/src/runtime/c/utils/pgf-chunk.c b/src/runtime/c/utils/pgf-chunk.c new file mode 100644 index 000000000..c4d0d0b3f --- /dev/null +++ b/src/runtime/c/utils/pgf-chunk.c @@ -0,0 +1,160 @@ +// Don't give too much hope to this script. It is doing the wrong thing +// but let's see how far we can get with it. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char* argv[]) { + // Set the character locale, so we can produce proper output. 
+ setlocale(LC_CTYPE, ""); + + // Create the pool that is used to allocate everything + GuPool* pool = gu_new_pool(); + int status = EXIT_SUCCESS; + if (argc != 4) { + fprintf(stderr, "usage: %s pgf cat from_lang\n", argv[0]); + status = EXIT_FAILURE; + goto fail; + } + char* filename = argv[1]; + + GuString cat = gu_str_string(argv[2], pool); + + GuString from_lang = gu_str_string(argv[3], pool); + + FILE* infile = fopen(filename, "r"); + if (infile == NULL) { + fprintf(stderr, "couldn't open %s\n", filename); + status = EXIT_FAILURE; + goto fail; + } + + // Create an input stream from the input file + GuIn* in = gu_file_in(infile, pool); + + // Create an exception frame that catches all errors. + GuExn* err = gu_new_exn(NULL, gu_kind(type), pool); + + // Read the PGF grammar. + PgfPGF* pgf = pgf_read(in, pool, err); + + // If an error occurred, it shows in the exception frame + if (!gu_ok(err)) { + fprintf(stderr, "Reading PGF failed\n"); + status = EXIT_FAILURE; + goto fail_read; + } + + if (!pgf_load_meta_child_probs(pgf, "../../../treebanks/PennTreebank/ParseEngAbs3.probs", pool)) { + fprintf(stderr, "Loading meta child probs failed\n"); + status = EXIT_FAILURE; + goto fail_read; + } + + // Look up the concrete syntax for the source language + PgfConcr* from_concr = + gu_map_get(pgf->concretes, &from_lang, PgfConcr*); + if (!from_concr) { + fprintf(stderr, "Unknown language\n"); + status = EXIT_FAILURE; + goto fail_concr; + } + + // Register a callback for the literal category Symb + pgf_parser_add_literal(from_concr, gu_str_string("Symb", pool), + &pgf_nerc_literal_callback); + + // Create an output stream for stdout + GuOut* out = gu_file_out(stdout, pool); + + // Locale-encoding writers are currently unsupported + // GuWriter* wtr = gu_locale_writer(out, pool); + // Use a writer with hard-coded utf-8 encoding for now. 
+ GuWriter* wtr = gu_new_utf8_writer(out, pool); + + // We will keep the latest results in the 'ppool' and + // we will iterate over them by using 'result'. + GuPool* ppool = NULL; + + // The interactive translation loop. + // XXX: This currently reads stdin directly, so it doesn't support + // encodings properly. TODO: use a locale reader for input + while (true) { + char buf[4096]; + char* line = fgets(buf, sizeof(buf), stdin); + if (line == NULL) { + if (ferror(stdin)) { + fprintf(stderr, "Input error\n"); + status = EXIT_FAILURE; + } + break; + } else if (strcmp(line, "") == 0) { + // End nicely on empty input + break; + } + + // We create a temporary pool for translating a single + // sentence, so our memory usage doesn't increase over time. + ppool = gu_new_pool(); + + // Begin parsing a sentence of the specified category + PgfParseState* state = + pgf_parser_init_state(from_concr, cat, 0, ppool); + if (state == NULL) { + fprintf(stderr, "Couldn't begin parsing\n"); + status = EXIT_FAILURE; + break; + } + + GuReader *rdr = + gu_string_reader(gu_str_string(line, ppool), ppool); + PgfLexer *lexer = + pgf_new_lexer(rdr, ppool); + + // Tokenization + GuExn* lex_err = gu_new_exn(NULL, gu_kind(type), ppool); + PgfToken tok = pgf_lexer_next_token(lexer, lex_err, ppool); + while (!gu_exn_is_raised(lex_err)) { + // feed the token to get a new parse state + state = pgf_parser_next_state(state, tok, ppool); + if (!state) { + gu_puts("Unexpected token: \"", wtr, err); + gu_string_write(tok, wtr, err); + gu_puts("\"\n", wtr, err); + goto fail_parse; + } + + tok = pgf_lexer_next_token(lexer, lex_err, ppool); + } + + pgf_parse_print_chunks(state); + continue; + fail_parse: + // Free all resources allocated during parsing and linearization + gu_pool_free(ppool); + ppool = NULL; + } +fail_concr: +fail_read: + fclose(infile); +fail: + gu_pool_free(pool); + return status; +}