From 73e9eb934a3f6d74b4acfea3aa474650b3bd2c91 Mon Sep 17 00:00:00 2001 From: krasimir Date: Thu, 4 May 2017 07:55:11 +0000 Subject: [PATCH] merging spines --- src/runtime/c/pgf/lookup.c | 287 ++++++++++++++++++++++++++++++++----- src/runtime/c/pgf/pgf.h | 2 +- 2 files changed, 253 insertions(+), 36 deletions(-) diff --git a/src/runtime/c/pgf/lookup.c b/src/runtime/c/pgf/lookup.c index 9e6f58bb7..ac1e14a0d 100644 --- a/src/runtime/c/pgf/lookup.c +++ b/src/runtime/c/pgf/lookup.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -30,6 +31,22 @@ pgf_print_abs_production(PgfMetaId id, } gu_putc('\n',out,err); } + +static void +pgf_print_abs_productions(GuBuf* prods, + GuOut* out, GuExn* err) +{ + size_t n_prods = gu_buf_length(prods); + for (size_t id = 1; id < n_prods; id++) { + GuBuf* id_prods = gu_buf_get(prods, GuBuf*, id); + size_t n_id_prods = gu_buf_length(id_prods); + for (size_t i = 0; i < n_id_prods; i++) { + PgfAbsProduction* prod = + gu_buf_get(id_prods, PgfAbsProduction*, i); + pgf_print_abs_production(id, prod, out, err); + } + } +} #endif static void @@ -70,14 +87,14 @@ pgf_lookup_index_syms(GuMap* lexicon_idx, PgfSymbols* syms, PgfProductionIdx* id typedef struct { GuMap* function_idx; GuMap* cat_ids; - PgfMetaId next_id; + GuBuf* spine; GuPool* pool; } PgfSpineBuilder; static PgfAbsProduction* -pgf_lookup_new_production(PgfSpineBuilder* builder, PgfAbsFun* fun) { +pgf_lookup_new_production(PgfAbsFun* fun, GuPool *pool) { size_t n_hypos = gu_seq_length(fun->type->hypos); - PgfAbsProduction* prod = gu_new_flex(builder->pool, PgfAbsProduction, args, n_hypos); + PgfAbsProduction* prod = gu_new_flex(pool, PgfAbsProduction, args, n_hypos); prod->fun = fun; for (size_t i = 0; i < n_hypos; i++) { prod->args[i] = 0; @@ -85,6 +102,13 @@ pgf_lookup_new_production(PgfSpineBuilder* builder, PgfAbsFun* fun) { return prod; } +static void +pgf_lookup_add_production(PgfSpineBuilder* builder, PgfMetaId id, PgfAbsProduction* prod) +{ + GuBuf* prods = gu_buf_get(builder->spine, GuBuf*, id); + gu_buf_push(prods, PgfAbsProduction*, prod); +} + static PgfMetaId pgf_lookup_add_spine_nodes(PgfSpineBuilder* builder, PgfCId cat) { PgfMetaId cat_id = gu_map_get(builder->cat_ids, cat, PgfMetaId); @@ -92,7 +116,9 @@ pgf_lookup_add_spine_nodes(PgfSpineBuilder* builder, PgfCId cat) { return cat_id; } - cat_id = ++builder->next_id; + cat_id = gu_buf_length(builder->spine); + gu_buf_push(builder->spine, GuBuf*, gu_new_buf(PgfAbsProduction*, builder->pool)); + gu_map_put(builder->cat_ids, cat, PgfMetaId, cat_id); GuBuf* entries = gu_map_get(builder->function_idx, cat, GuBuf*); @@ -103,16 +129,10 @@ pgf_lookup_add_spine_nodes(PgfSpineBuilder* builder, PgfCId cat) { PgfMetaId id = pgf_lookup_add_spine_nodes(builder, entry->fun->type->cid); - PgfAbsProduction* prod = pgf_lookup_new_production(builder, entry->fun); + PgfAbsProduction* prod = pgf_lookup_new_production(entry->fun, builder->pool); prod->args[entry->arg_idx] = cat_id; - -#ifdef PGF_LOOKUP_DEBUG - GuPool* tmp_pool = gu_new_pool(); - GuOut* out = gu_file_out(stderr, tmp_pool); - GuExn* err = gu_exn(tmp_pool); - pgf_print_abs_production(id, prod, out, err); - gu_pool_free(tmp_pool); -#endif + + pgf_lookup_add_production(builder, id, prod); } } @@ -123,19 +143,189 @@ static void pgf_lookup_add_spine_leaf(PgfSpineBuilder* builder, PgfAbsFun *fun) { PgfMetaId id = pgf_lookup_add_spine_nodes(builder, fun->type->cid); - PgfAbsProduction* prod = pgf_lookup_new_production(builder, fun); + PgfAbsProduction* prod = pgf_lookup_new_production(fun, builder->pool); + + pgf_lookup_add_production(builder, id, prod); +} -#ifdef PGF_LOOKUP_DEBUG - GuPool* tmp_pool = gu_new_pool(); - GuOut* out = gu_file_out(stderr, tmp_pool); - GuExn* err = gu_exn(tmp_pool); - pgf_print_abs_production(id, prod, out, err); - gu_pool_free(tmp_pool); -#endif +static GuBuf* +pgf_lookup_build_spine(GuMap* lexicon_idx, GuMap* function_idx, + GuString tok, PgfType* typ, PgfMetaId* cat_id, + GuPool* pool) +{ + PgfSpineBuilder builder; + builder.function_idx = function_idx; + builder.cat_ids = gu_new_string_map(PgfMetaId, &gu_null_struct, pool); + builder.spine = gu_new_buf(GuBuf*, pool); + builder.pool = pool; + + gu_buf_push(builder.spine, GuBuf*, NULL); + + GuBuf* funs = gu_map_get(lexicon_idx, tok, GuBuf*); + if (funs != NULL) { + size_t n_funs = gu_buf_length(funs); + for (size_t i = 0; i < n_funs; i++) { + PgfAbsFun* absfun = + gu_buf_get(funs, PgfAbsFun*, i); + pgf_lookup_add_spine_leaf(&builder, absfun); + } + } + + *cat_id = gu_map_get(builder.cat_ids, typ->cid, PgfMetaId); + + return builder.spine; +} + +typedef PgfMetaId PgfPair[2]; + +static bool +pgf_pair_eq_fn(GuEquality* self, const void* p1, const void* p2) +{ + (void) self; + const PgfMetaId* ip1 = p1; + const PgfMetaId* ip2 = p2; + return (ip1[0] == ip2[0] && ip1[1] == ip2[1]); +} + +static GuHash +pgf_pair_hash_fn(GuHasher* self, const void* p) +{ + (void) self; + const PgfMetaId* ip = p; + return (GuHash) (((ip[1] & 0xFFFF) << 16) | (ip[0] & 0xFFFF)); +} + +static GuHasher +pgf_pair_hasher[1] = { + { + { pgf_pair_eq_fn }, + pgf_pair_hash_fn + } +}; + +static PgfMetaId +pgf_lookup_merge_cats(GuBuf* spine, GuMap* pairs, + PgfMetaId cat_id1, GuBuf* spine1, + PgfMetaId cat_id2, GuBuf* spine2, + GuPool* pool) +{ + if (cat_id1 == 0 && cat_id2 == 0) + return 0; + + PgfPair pair; + pair[0] = cat_id1; + pair[1] = cat_id2; + PgfMetaId cat_id = gu_map_get(pairs, &pair, PgfMetaId); + if (cat_id != 0) + return cat_id; + + cat_id = gu_buf_length(spine); + GuBuf* id_prods = gu_new_buf(PgfAbsProduction*, pool); + gu_buf_push(spine, GuBuf*, id_prods); + + gu_map_put(pairs, &pair, PgfMetaId, cat_id); + + GuBuf* id_prods1 = gu_buf_get(spine1, GuBuf*, cat_id1); + GuBuf* id_prods2 = gu_buf_get(spine2, GuBuf*, cat_id2); + size_t n_id_prods1 = (cat_id1 == 0) ? 0 : gu_buf_length(id_prods1); + size_t n_id_prods2 = (cat_id2 == 0) ? 0 : gu_buf_length(id_prods2); + + if (cat_id1 != 0) { + for (size_t i = 0; i < n_id_prods1; i++) { + PgfAbsProduction* prod1 = + gu_buf_get(id_prods1, PgfAbsProduction*, i); + int count = 0; + for (size_t j = 0; j < n_id_prods2; j++) { + PgfAbsProduction* prod2 = + gu_buf_get(id_prods2, PgfAbsProduction*, j); + if (prod1->fun == prod2->fun) { + PgfAbsProduction* prod = + pgf_lookup_new_production(prod1->fun, pool); + size_t n_hypos = gu_seq_length(prod->fun->type->hypos); + for (size_t l = 0; l < n_hypos; l++) { + prod->args[l] = + pgf_lookup_merge_cats(spine, pairs, + prod1->args[l], spine1, + prod2->args[l], spine2, + pool); + } + gu_buf_push(id_prods, PgfAbsProduction*, prod); + + count++; + } + } + + if (count == 0) { + PgfAbsProduction* prod = + pgf_lookup_new_production(prod1->fun, pool); + size_t n_hypos = gu_seq_length(prod->fun->type->hypos); + for (size_t l = 0; l < n_hypos; l++) { + prod->args[l] = + pgf_lookup_merge_cats(spine, pairs, + prod1->args[l], spine1, + 0, spine2, + pool); + } + gu_buf_push(id_prods, PgfAbsProduction*, prod); + } + } + } + + if (cat_id2 != 0) { + for (size_t i = 0; i < n_id_prods2; i++) { + PgfAbsProduction* prod2 = + gu_buf_get(id_prods2, PgfAbsProduction*, i); + bool found = false; + for (size_t j = 0; j < n_id_prods1; j++) { + PgfAbsProduction* prod1 = + gu_buf_get(id_prods1, PgfAbsProduction*, j); + if (prod1->fun == prod2->fun) { + found = true; + break; + } + } + + if (!found) { + PgfAbsProduction* prod = + pgf_lookup_new_production(prod2->fun, pool); + size_t n_hypos = gu_seq_length(prod->fun->type->hypos); + for (size_t l = 0; l < n_hypos; l++) { + prod->args[l] = + pgf_lookup_merge_cats(spine, pairs, + 0, spine1, + prod2->args[l], spine2, + pool); + } + gu_buf_push(id_prods, PgfAbsProduction*, prod); + } + } + } + + return cat_id; +} + +static GuBuf* +pgf_lookup_merge(PgfMetaId cat_id1, GuBuf* spine1, + PgfMetaId cat_id2, GuBuf* spine2, + PgfMetaId* cat_id, + GuPool* pool) +{ + GuBuf* spine = gu_new_buf(GuBuf*, pool); + gu_buf_push(spine, GuBuf*, NULL); + + GuMap* pairs = gu_new_map(PgfPair, pgf_pair_hasher, PgfMetaId, &gu_null_struct, pool); + + *cat_id = + pgf_lookup_merge_cats(spine, pairs, + cat_id1, spine1, + cat_id2, spine2, + pool); + + return spine; } PGF_API GuEnum* -pgf_lookup_sentence(PgfConcr* concr, GuString sentence, GuPool* pool, GuPool* out_pool) +pgf_lookup_sentence(PgfConcr* concr, PgfType* typ, GuString sentence, GuPool* pool, GuPool* out_pool) { //// building search indices // GuMap* lexicon_idx = gu_new_string_map(GuBuf*, &gu_null_struct, pool); @@ -168,21 +358,48 @@ pgf_lookup_sentence(PgfConcr* concr, GuString sentence, GuPool* pool, GuPool* ou } } /////////////////////////////// - - PgfSpineBuilder builder; - builder.function_idx = function_idx; - builder.cat_ids = gu_new_string_map(PgfMetaId, &gu_null_struct, pool); - builder.next_id = 0; - builder.pool = pool; - GuBuf* funs = gu_map_get(lexicon_idx, sentence, GuBuf*); - if (funs != NULL) { - size_t n_funs = gu_buf_length(funs); - for (size_t i = 0; i < n_funs; i++) { - PgfAbsFun* absfun = - gu_buf_get(funs, PgfAbsFun*, i); - pgf_lookup_add_spine_leaf(&builder, absfun); + PgfMetaId cat_id1 = 0; + GuBuf* join = gu_new_buf(GuBuf*, pool); + gu_buf_push(join, GuBuf*, NULL); + + GuUCS c = ' '; + const uint8_t* p = (const uint8_t*) sentence; + for (;;) { + while (gu_ucs_is_space(c)) { + c = gu_utf8_decode(&p); } + if (c == 0) + break; + + const uint8_t* start = p-1; + while (c != 0 && !gu_ucs_is_space(c)) { + c = gu_utf8_decode(&p); + } + const uint8_t* end = p-1; + + size_t len = end-start; + GuString tok = gu_malloc(pool, len+1); + memcpy((uint8_t*) tok, start, len); + ((uint8_t*) tok)[len] = 0; + + PgfMetaId cat_id2 = 0; + GuBuf* spine = + pgf_lookup_build_spine(lexicon_idx, function_idx, + tok, typ, &cat_id2, + pool); + + join = pgf_lookup_merge(cat_id1, join, cat_id2, spine, &cat_id1, pool); } + +#ifdef PGF_LOOKUP_DEBUG + GuPool* tmp_pool = gu_new_pool(); + GuOut* out = gu_file_out(stderr, tmp_pool); + GuExn* err = gu_exn(tmp_pool); + pgf_print_abs_productions(join, out, err); + gu_putc('\n',out,err); + gu_pool_free(tmp_pool); +#endif + return NULL; } diff --git a/src/runtime/c/pgf/pgf.h b/src/runtime/c/pgf/pgf.h index c5e9cd52a..cf2d85212 100644 --- a/src/runtime/c/pgf/pgf.h +++ b/src/runtime/c/pgf/pgf.h @@ -127,7 +127,7 @@ pgf_parse(PgfConcr* concr, PgfType* typ, GuString sentence, GuExn* err, GuPool* pool, GuPool* out_pool); PGF_API_DECL GuEnum* -pgf_lookup_sentence(PgfConcr* concr, GuString sentence, GuPool* pool, GuPool* out_pool); +pgf_lookup_sentence(PgfConcr* concr, PgfType* typ, GuString sentence, GuPool* pool, GuPool* out_pool); typedef struct PgfMorphoCallback PgfMorphoCallback; struct PgfMorphoCallback {