forked from GitHub/gf-core
added experimental script for chunking in the C runtime
This commit is contained in:
@@ -116,7 +116,8 @@ libpgf_la_SOURCES = \
|
||||
bin_PROGRAMS = \
|
||||
utils/pgf2yaml \
|
||||
utils/pgf-print \
|
||||
utils/pgf-translate
|
||||
utils/pgf-translate \
|
||||
utils/pgf-chunk
|
||||
|
||||
utils_pgf2yaml_SOURCES = utils/pgf2yaml.c
|
||||
utils_pgf2yaml_LDADD = libpgf.la libgu.la
|
||||
@@ -127,6 +128,9 @@ utils_pgf_print_LDADD = libpgf.la libgu.la
|
||||
utils_pgf_translate_SOURCES = utils/pgf-translate.c
|
||||
utils_pgf_translate_LDADD = libpgf.la libgu.la
|
||||
|
||||
utils_pgf_chunk_SOURCES = utils/pgf-chunk.c
|
||||
utils_pgf_chunk_LDADD = libpgf.la libgu.la
|
||||
|
||||
AUTOMAKE_OPTIONS = foreign subdir-objects dist-bzip2
|
||||
ACLOCAL_AMFLAGS = -I m4
|
||||
include doxygen.am
|
||||
|
||||
@@ -85,9 +85,7 @@ struct PgfParseState {
|
||||
PgfItem* meta_item;
|
||||
PgfContsMap* conts_map;
|
||||
PgfGenCatMap* generated_cats;
|
||||
#ifdef PGF_PARSER_DEBUG
|
||||
unsigned short offset;
|
||||
#endif
|
||||
|
||||
prob_t viterbi_prob;
|
||||
|
||||
@@ -1630,9 +1628,7 @@ pgf_new_parse_state(PgfParsing* ps,
|
||||
state->meta_item = NULL;
|
||||
state->generated_cats = gu_map_type_new(PgfGenCatMap, pool);
|
||||
state->conts_map = gu_map_type_new(PgfContsMap, pool);
|
||||
#ifdef PGF_PARSER_DEBUG
|
||||
state->offset = next ? next->offset+1 : 0;
|
||||
#endif
|
||||
state->viterbi_prob = 0;
|
||||
state->ps = ps;
|
||||
state->ts = ts;
|
||||
@@ -1884,6 +1880,102 @@ pgf_parse_result(PgfParseState* state, GuPool* pool)
|
||||
return en;
|
||||
}
|
||||
|
||||
void
|
||||
pgf_parse_print_chunks(PgfParseState* state)
|
||||
{
|
||||
if (state->ps->completed == NULL) {
|
||||
while (state->ps->completed == NULL) {
|
||||
if (!pgf_parsing_proceed(state))
|
||||
break;
|
||||
}
|
||||
if (state->ps->completed == NULL)
|
||||
return;
|
||||
}
|
||||
|
||||
GuPool* tmp_pool = gu_new_pool();
|
||||
GuOut* out = gu_file_out(stdout, tmp_pool);
|
||||
GuWriter* wtr = gu_new_utf8_writer(out, tmp_pool);
|
||||
GuExn* err = gu_exn(NULL, type, tmp_pool);
|
||||
|
||||
PgfCCat* completed = state->ps->completed;
|
||||
if (gu_seq_length(completed->prods) == 0)
|
||||
return;
|
||||
|
||||
size_t n_args = 0;
|
||||
size_t arg_idx = 0;
|
||||
PgfCCat* ccat = NULL;
|
||||
PgfProductionMeta* pmeta = NULL;
|
||||
|
||||
PgfProduction prod = gu_seq_get(completed->prods, PgfProduction, 0);
|
||||
GuVariantInfo pi = gu_variant_open(prod);
|
||||
switch (pi.tag) {
|
||||
case PGF_PRODUCTION_APPLY:
|
||||
n_args = 1;
|
||||
arg_idx = 0;
|
||||
ccat = completed;
|
||||
break;
|
||||
case PGF_PRODUCTION_META:
|
||||
pmeta = pi.data;
|
||||
n_args = gu_seq_length(pmeta->args);
|
||||
arg_idx = 0;
|
||||
ccat = gu_seq_index(pmeta->args, PgfPArg, arg_idx)->ccat;
|
||||
break;
|
||||
}
|
||||
|
||||
PgfParseState* next = NULL;
|
||||
while (state != NULL) {
|
||||
PgfParseState* tmp = state->next;
|
||||
state->next = next;
|
||||
next = state;
|
||||
state = tmp;
|
||||
}
|
||||
|
||||
int offset = 0;
|
||||
|
||||
state = next;
|
||||
next = NULL;
|
||||
while (state != NULL) {
|
||||
if (state->ts != NULL)
|
||||
{
|
||||
if (ccat != NULL &&
|
||||
offset == ((ccat->conts->state != NULL) ? ccat->conts->state->offset : 0)) {
|
||||
PgfCCat *ccat2 = ccat;
|
||||
while (ccat2->conts != NULL) {
|
||||
ccat2 = ccat2->conts->ccat;
|
||||
}
|
||||
|
||||
gu_putc('(', wtr, err);
|
||||
gu_string_write(ccat2->cnccat->abscat->name, wtr, err);
|
||||
gu_putc(' ', wtr, err);
|
||||
}
|
||||
|
||||
gu_string_write(state->ts->tok, wtr, err);
|
||||
offset++;
|
||||
|
||||
if (ccat != NULL &&
|
||||
ccat ==
|
||||
gu_map_get(state->generated_cats, ccat->conts, PgfCCat*)) {
|
||||
gu_putc(')', wtr, err);
|
||||
|
||||
arg_idx++;
|
||||
ccat =
|
||||
(arg_idx >= n_args) ?
|
||||
NULL :
|
||||
gu_seq_index(pmeta->args, PgfPArg, arg_idx)->ccat;
|
||||
}
|
||||
|
||||
gu_putc(' ', wtr, err);
|
||||
}
|
||||
|
||||
PgfParseState* tmp = state->next;
|
||||
state->next = next;
|
||||
next = state;
|
||||
state = tmp;
|
||||
}
|
||||
gu_putc('\n', wtr, err);
|
||||
|
||||
gu_pool_free(tmp_pool);
|
||||
}
|
||||
|
||||
// TODO: s/CId/Cat, add the cid to Cat, make Cat the key to CncCat
|
||||
PgfParseState*
|
||||
|
||||
@@ -102,9 +102,12 @@ pgf_parse_result(PgfParseState* state, GuPool* pool);
|
||||
* successful, or ambiguously successful.
|
||||
*/
|
||||
|
||||
PgfExpr
|
||||
pgf_parse_best_result(PgfParseState* state, GuPool* pool);
|
||||
|
||||
// Use this procedure only at your own risk.
|
||||
// It is dirty and it will probably be removed or replaced
|
||||
// with something else. Currently it is here only for experimental
|
||||
// purposes.
|
||||
void
|
||||
pgf_parse_print_chunks(PgfParseState* state);
|
||||
|
||||
size_t
|
||||
pgf_item_lin_idx(PgfItem* item);
|
||||
|
||||
160
src/runtime/c/utils/pgf-chunk.c
Normal file
160
src/runtime/c/utils/pgf-chunk.c
Normal file
@@ -0,0 +1,160 @@
|
||||
// Don't expect too much from this program. It is doing the wrong thing
|
||||
// but let's see how far we can get with it.
|
||||
|
||||
#include <gu/variant.h>
|
||||
#include <gu/map.h>
|
||||
#include <gu/dump.h>
|
||||
#include <gu/log.h>
|
||||
#include <gu/enum.h>
|
||||
#include <gu/file.h>
|
||||
#include <pgf/pgf.h>
|
||||
#include <pgf/data.h>
|
||||
#include <pgf/parser.h>
|
||||
#include <pgf/lexer.h>
|
||||
#include <pgf/literals.h>
|
||||
#include <pgf/linearize.h>
|
||||
#include <pgf/expr.h>
|
||||
#include <pgf/edsl.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <locale.h>
|
||||
#include <time.h>
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
// Set the character locale, so we can produce proper output.
|
||||
setlocale(LC_CTYPE, "");
|
||||
|
||||
// Create the pool that is used to allocate everything
|
||||
GuPool* pool = gu_new_pool();
|
||||
int status = EXIT_SUCCESS;
|
||||
if (argc != 4) {
|
||||
fprintf(stderr, "usage: %s pgf cat from_lang\n", argv[0]);
|
||||
status = EXIT_FAILURE;
|
||||
goto fail;
|
||||
}
|
||||
char* filename = argv[1];
|
||||
|
||||
GuString cat = gu_str_string(argv[2], pool);
|
||||
|
||||
GuString from_lang = gu_str_string(argv[3], pool);
|
||||
|
||||
FILE* infile = fopen(filename, "r");
|
||||
if (infile == NULL) {
|
||||
fprintf(stderr, "couldn't open %s\n", filename);
|
||||
status = EXIT_FAILURE;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
// Create an input stream from the input file
|
||||
GuIn* in = gu_file_in(infile, pool);
|
||||
|
||||
// Create an exception frame that catches all errors.
|
||||
GuExn* err = gu_new_exn(NULL, gu_kind(type), pool);
|
||||
|
||||
// Read the PGF grammar.
|
||||
PgfPGF* pgf = pgf_read(in, pool, err);
|
||||
|
||||
// If an error occured, it shows in the exception frame
|
||||
if (!gu_ok(err)) {
|
||||
fprintf(stderr, "Reading PGF failed\n");
|
||||
status = EXIT_FAILURE;
|
||||
goto fail_read;
|
||||
}
|
||||
|
||||
if (!pgf_load_meta_child_probs(pgf, "../../../treebanks/PennTreebank/ParseEngAbs3.probs", pool)) {
|
||||
fprintf(stderr, "Loading meta child probs failed\n");
|
||||
status = EXIT_FAILURE;
|
||||
goto fail_read;
|
||||
}
|
||||
|
||||
// Look up the source and destination concrete categories
|
||||
PgfConcr* from_concr =
|
||||
gu_map_get(pgf->concretes, &from_lang, PgfConcr*);
|
||||
if (!from_concr) {
|
||||
fprintf(stderr, "Unknown language\n");
|
||||
status = EXIT_FAILURE;
|
||||
goto fail_concr;
|
||||
}
|
||||
|
||||
// Register a callback for the literal category Symbol
|
||||
pgf_parser_add_literal(from_concr, gu_str_string("Symb", pool),
|
||||
&pgf_nerc_literal_callback);
|
||||
|
||||
// Create an output stream for stdout
|
||||
GuOut* out = gu_file_out(stdout, pool);
|
||||
|
||||
// Locale-encoding writers are currently unsupported
|
||||
// GuWriter* wtr = gu_locale_writer(out, pool);
|
||||
// Use a writer with hard-coded utf-8 encoding for now.
|
||||
GuWriter* wtr = gu_new_utf8_writer(out, pool);
|
||||
|
||||
// We will keep the latest results in the 'ppool' and
|
||||
// we will iterate over them by using 'result'.
|
||||
GuPool* ppool = NULL;
|
||||
|
||||
// The interactive translation loop.
|
||||
// XXX: This currently reads stdin directly, so it doesn't support
|
||||
// encodings properly. TODO: use a locale reader for input
|
||||
while (true) {
|
||||
char buf[4096];
|
||||
char* line = fgets(buf, sizeof(buf), stdin);
|
||||
if (line == NULL) {
|
||||
if (ferror(stdin)) {
|
||||
fprintf(stderr, "Input error\n");
|
||||
status = EXIT_FAILURE;
|
||||
}
|
||||
break;
|
||||
} else if (strcmp(line, "") == 0) {
|
||||
// End nicely on empty input
|
||||
break;
|
||||
}
|
||||
|
||||
// We create a temporary pool for translating a single
|
||||
// sentence, so our memory usage doesn't increase over time.
|
||||
ppool = gu_new_pool();
|
||||
|
||||
// Begin parsing a sentence of the specified category
|
||||
PgfParseState* state =
|
||||
pgf_parser_init_state(from_concr, cat, 0, ppool);
|
||||
if (state == NULL) {
|
||||
fprintf(stderr, "Couldn't begin parsing\n");
|
||||
status = EXIT_FAILURE;
|
||||
break;
|
||||
}
|
||||
|
||||
GuReader *rdr =
|
||||
gu_string_reader(gu_str_string(line, ppool), ppool);
|
||||
PgfLexer *lexer =
|
||||
pgf_new_lexer(rdr, ppool);
|
||||
|
||||
// Tokenization
|
||||
GuExn* lex_err = gu_new_exn(NULL, gu_kind(type), ppool);
|
||||
PgfToken tok = pgf_lexer_next_token(lexer, lex_err, ppool);
|
||||
while (!gu_exn_is_raised(lex_err)) {
|
||||
// feed the token to get a new parse state
|
||||
state = pgf_parser_next_state(state, tok, ppool);
|
||||
if (!state) {
|
||||
gu_puts("Unexpected token: \"", wtr, err);
|
||||
gu_string_write(tok, wtr, err);
|
||||
gu_puts("\"\n", wtr, err);
|
||||
goto fail_parse;
|
||||
}
|
||||
|
||||
tok = pgf_lexer_next_token(lexer, lex_err, ppool);
|
||||
}
|
||||
|
||||
pgf_parse_print_chunks(state);
|
||||
continue;
|
||||
fail_parse:
|
||||
// Free all resources allocated during parsing and linearization
|
||||
gu_pool_free(ppool);
|
||||
ppool = NULL;
|
||||
}
|
||||
fail_concr:
|
||||
fail_read:
|
||||
fclose(infile);
|
||||
fail:
|
||||
gu_pool_free(pool);
|
||||
return status;
|
||||
}
|
||||
Reference in New Issue
Block a user