1
0
forked from GitHub/gf-core

finally a smoothed and more precise ranking for lookups

This commit is contained in:
krasimir
2017-05-30 11:42:32 +00:00
parent 4c59db0dac
commit bf3557cf90

View File

@@ -20,7 +20,6 @@ typedef struct {
typedef struct { typedef struct {
PgfAbsFun* fun; PgfAbsFun* fun;
size_t count;
PgfMetaId args[0]; PgfMetaId args[0];
} PgfAbsProduction; } PgfAbsProduction;
@@ -35,7 +34,7 @@ pgf_print_abs_production(PgfMetaId id,
for (size_t i = 0; i < n_hypos; i++) { for (size_t i = 0; i < n_hypos; i++) {
gu_printf(out,err," ?%d", prod->args[i]); gu_printf(out,err," ?%d", prod->args[i]);
} }
gu_printf(out,err," (%d)\n",prod->count); gu_putc('\n',out,err);
} }
static void static void
@@ -112,12 +111,18 @@ typedef struct {
GuPool* pool; GuPool* pool;
} PgfSpineBuilder; } PgfSpineBuilder;
/*
 * A single token paired with the abstract functions associated with it.
 * The same record is used for tokens cut out of the input sentence and
 * for tokens emitted while linearizing a candidate tree, so that two
 * tokens can be compared both by text and by shared abstract function.
 */
typedef struct {
/* NUL-terminated token text. */
PgfToken token;
/* Number of entries in funs; 0 when no function is associated. */
size_t n_funs;
/* Array of n_funs abstract functions, or NULL when there are none. */
PgfAbsFun** funs;
} PgfInputToken;
static PgfAbsProduction* static PgfAbsProduction*
pgf_lookup_new_production(PgfAbsFun* fun, GuPool *pool) { pgf_lookup_new_production(PgfAbsFun* fun, GuPool *pool)
{
size_t n_hypos = gu_seq_length(fun->type->hypos); size_t n_hypos = gu_seq_length(fun->type->hypos);
PgfAbsProduction* prod = gu_new_flex(pool, PgfAbsProduction, args, n_hypos); PgfAbsProduction* prod = gu_new_flex(pool, PgfAbsProduction, args, n_hypos);
prod->fun = fun; prod->fun = fun;
prod->count = 0;
for (size_t i = 0; i < n_hypos; i++) { for (size_t i = 0; i < n_hypos; i++) {
prod->args[i] = 0; prod->args[i] = 0;
} }
@@ -166,14 +171,13 @@ pgf_lookup_add_spine_leaf(PgfSpineBuilder* builder, PgfAbsFun *fun)
{ {
PgfMetaId id = pgf_lookup_add_spine_nodes(builder, fun->type->cid); PgfMetaId id = pgf_lookup_add_spine_nodes(builder, fun->type->cid);
PgfAbsProduction* prod = pgf_lookup_new_production(fun, builder->pool); PgfAbsProduction* prod = pgf_lookup_new_production(fun, builder->pool);
prod->count = 1;
pgf_lookup_add_production(builder, id, prod); pgf_lookup_add_production(builder, id, prod);
} }
static GuBuf* static GuBuf*
pgf_lookup_build_spine(GuMap* lexicon_idx, GuMap* function_idx, pgf_lookup_build_spine(GuMap* function_idx,
GuString tok, PgfType* typ, PgfMetaId* meta_id, PgfInputToken* tok, PgfType* typ, PgfMetaId* meta_id,
GuPool* pool) GuPool* pool)
{ {
PgfSpineBuilder builder; PgfSpineBuilder builder;
@@ -184,14 +188,8 @@ pgf_lookup_build_spine(GuMap* lexicon_idx, GuMap* function_idx,
gu_buf_push(builder.spine, GuBuf*, NULL); gu_buf_push(builder.spine, GuBuf*, NULL);
GuBuf* funs = gu_map_get(lexicon_idx, tok, GuBuf*); for (size_t i = 0; i < tok->n_funs; i++) {
if (funs != NULL) { pgf_lookup_add_spine_leaf(&builder, tok->funs[i]);
size_t n_funs = gu_buf_length(funs);
for (size_t i = 0; i < n_funs; i++) {
PgfAbsFun* absfun =
gu_buf_get(funs, PgfAbsFun*, i);
pgf_lookup_add_spine_leaf(&builder, absfun);
}
} }
*meta_id = gu_map_get(builder.meta_ids, typ->cid, PgfMetaId); *meta_id = gu_map_get(builder.meta_ids, typ->cid, PgfMetaId);
@@ -264,7 +262,6 @@ pgf_lookup_merge_cats(GuBuf* spine, GuMap* pairs,
if (prod1->fun == prod2->fun) { if (prod1->fun == prod2->fun) {
PgfAbsProduction* prod = PgfAbsProduction* prod =
pgf_lookup_new_production(prod1->fun, pool); pgf_lookup_new_production(prod1->fun, pool);
prod->count = prod1->count+prod2->count;
size_t n_hypos = gu_seq_length(prod->fun->type->hypos); size_t n_hypos = gu_seq_length(prod->fun->type->hypos);
for (size_t l = 0; l < n_hypos; l++) { for (size_t l = 0; l < n_hypos; l++) {
prod->args[l] = prod->args[l] =
@@ -282,7 +279,6 @@ pgf_lookup_merge_cats(GuBuf* spine, GuMap* pairs,
if (count == 0) { if (count == 0) {
PgfAbsProduction* prod = PgfAbsProduction* prod =
pgf_lookup_new_production(prod1->fun, pool); pgf_lookup_new_production(prod1->fun, pool);
prod->count = prod1->count;
size_t n_hypos = gu_seq_length(prod->fun->type->hypos); size_t n_hypos = gu_seq_length(prod->fun->type->hypos);
for (size_t l = 0; l < n_hypos; l++) { for (size_t l = 0; l < n_hypos; l++) {
prod->args[l] = prod->args[l] =
@@ -313,7 +309,6 @@ pgf_lookup_merge_cats(GuBuf* spine, GuMap* pairs,
if (!found) { if (!found) {
PgfAbsProduction* prod = PgfAbsProduction* prod =
pgf_lookup_new_production(prod2->fun, pool); pgf_lookup_new_production(prod2->fun, pool);
prod->count = prod2->count;
size_t n_hypos = gu_seq_length(prod->fun->type->hypos); size_t n_hypos = gu_seq_length(prod->fun->type->hypos);
for (size_t l = 0; l < n_hypos; l++) { for (size_t l = 0; l < n_hypos; l++) {
prod->args[l] = prod->args[l] =
@@ -359,7 +354,7 @@ typedef struct {
GuBuf* stack; GuBuf* stack;
GuBuf* expr_tokens; GuBuf* expr_tokens;
GuBuf* ctrees; GuBuf* ctrees;
int fid; PgfAbsFun** curr_absfun;
GuPool* pool; GuPool* pool;
} PgfLookupState; } PgfLookupState;
@@ -380,7 +375,6 @@ pgf_lookup_extract_app(PgfLookupState* st,
size_t n_args, PgfMetaId* args) size_t n_args, PgfMetaId* args)
{ {
GuChoiceMark mark = gu_choice_mark(st->choice); GuChoiceMark mark = gu_choice_mark(st->choice);
int save_fid = st->fid;
PgfCncTree ret = gu_null_variant; PgfCncTree ret = gu_null_variant;
PgfCncTreeApp* capp = PgfCncTreeApp* capp =
@@ -417,7 +411,6 @@ redo:;
} else { } else {
int index = gu_choice_next(st->choice, gu_buf_length(coercions)); int index = gu_choice_next(st->choice, gu_buf_length(coercions));
if (index < 0) { if (index < 0) {
st->fid = save_fid;
gu_choice_reset(st->choice, mark); gu_choice_reset(st->choice, mark);
if (!gu_choice_advance(st->choice)) if (!gu_choice_advance(st->choice))
return gu_null_variant; return gu_null_variant;
@@ -499,7 +492,7 @@ pgf_lookup_extract(PgfLookupState* st, PgfMetaId meta_id, PgfCCat *ccat)
args, 1, &ret, st->pool); args, 1, &ret, st->pool);
capp->ccat = ccat; capp->ccat = ccat;
capp->fun = gu_seq_get(ccat->lindefs, PgfCncFun*, index); capp->fun = gu_seq_get(ccat->lindefs, PgfCncFun*, index);
capp->fid = st->fid++; capp->fid = 0;
capp->n_vars = 0; capp->n_vars = 0;
capp->context = NULL; capp->context = NULL;
capp->n_args = 1; capp->n_args = 1;
@@ -567,12 +560,12 @@ done:
} }
static GuBuf* static GuBuf*
pgf_lookup_tokenize(GuString buf, size_t len, GuPool* pool) pgf_lookup_tokenize(GuMap* lexicon_idx, GuString sentence, GuPool* pool)
{ {
GuBuf* tokens = gu_new_buf(GuString, pool); GuBuf* tokens = gu_new_buf(PgfInputToken, pool);
GuUCS c = ' '; GuUCS c = ' ';
const uint8_t* p = (const uint8_t*) buf; const uint8_t* p = (const uint8_t*) sentence;
for (;;) { for (;;) {
while (gu_ucs_is_space(c)) { while (gu_ucs_is_space(c)) {
c = gu_utf8_decode(&p); c = gu_utf8_decode(&p);
@@ -586,12 +579,21 @@ pgf_lookup_tokenize(GuString buf, size_t len, GuPool* pool)
} }
const uint8_t* end = p-1; const uint8_t* end = p-1;
size_t len = end-start; PgfInputToken* tok = gu_buf_extend(tokens);
GuString tok = gu_malloc(pool, len+1);
memcpy((uint8_t*) tok, start, len);
((uint8_t*) tok)[len] = 0;
gu_buf_push(tokens, GuString, tok); size_t len = end-start;
tok->token = gu_malloc(pool, len+1);
memcpy((uint8_t*) tok->token, start, len);
((uint8_t*) tok->token)[len] = 0;
GuBuf* funs = gu_map_get(lexicon_idx, tok->token, GuBuf*);
if (funs != NULL) {
tok->n_funs = gu_buf_length(funs);
tok->funs = gu_buf_data(funs);
} else {
tok->n_funs = 0;
tok->funs = NULL;
}
} }
return tokens; return tokens;
@@ -610,11 +612,25 @@ pgf_lookup_compute_kernel_helper(GuBuf* sentence_tokens, GuBuf* expr_tokens,
for (size_t l = 0; l < i; l++) { for (size_t l = 0; l < i; l++) {
matrix[l + dim*j] = score; matrix[l + dim*j] = score;
for (size_t k = j; k > 0; k--) { for (size_t k = j; k > 0; k--) {
GuString sentence_token = gu_buf_get(sentence_tokens, GuString, l); PgfInputToken* sentence_token = gu_buf_index(sentence_tokens, PgfInputToken, l);
GuString expr_token = gu_buf_get(expr_tokens, GuString, k-1); PgfInputToken* expr_token = gu_buf_index(expr_tokens, PgfInputToken, k-1);
if (strcmp(sentence_token, expr_token) == 0) { if (strcmp(sentence_token->token, expr_token->token) == 0) {
score += 1 + pgf_lookup_compute_kernel_helper(sentence_tokens, expr_tokens, matrix, l, k-1); score += 1 + pgf_lookup_compute_kernel_helper(sentence_tokens, expr_tokens, matrix, l, k-1);
} else {
bool match = false;
for (size_t i = 0; i < sentence_token->n_funs; i++) {
for (size_t j = 0; j < expr_token->n_funs; j++) {
if (sentence_token->funs[i] == expr_token->funs[j]) {
match = true;
goto done;
}
}
}
done:
if (match) {
score += 0.5 + pgf_lookup_compute_kernel_helper(sentence_tokens, expr_tokens, matrix, l, k-1);
}
} }
} }
} }
@@ -720,16 +736,40 @@ pgf_lookup_enum_next(GuEnum* self, void* to, GuPool* pool)
} }
static void static void
pgf_lookup_symbol_token(PgfLinFuncs** funcs, PgfToken tok) pgf_lookup_symbol_token(PgfLinFuncs** self, PgfToken token)
{ {
PgfLookupState* st = gu_container(funcs, PgfLookupState, funcs); PgfLookupState* st = gu_container(self, PgfLookupState, funcs);
gu_buf_push(st->expr_tokens, PgfToken, tok); PgfInputToken* tok = gu_buf_extend(st->expr_tokens);
tok->token = token;
tok->n_funs = st->curr_absfun ? 1 : 0;
tok->funs = st->curr_absfun;
}
/*
 * Linearization callback invoked when a phrase starts.
 *
 * Resolves funname against the abstract syntax function table of the
 * current concrete syntax and records the result in st->curr_absfun so
 * that subsequently emitted tokens can be tagged with it. When the name
 * is found, a one-element cell holding the function pointer is allocated
 * from st->pool; when it is not found, curr_absfun is cleared.
 *
 * cat, fid and lindex are not used by this callback.
 */
static void
pgf_lookup_begin_phrase(PgfLinFuncs** self, PgfCId cat, int fid, int lindex, PgfCId funname)
{
	PgfLookupState* st = gu_container(self, PgfLookupState, funcs);

	PgfAbsFun* absfun = gu_seq_binsearch(st->concr->abstr->funs, pgf_absfun_order, PgfAbsFun, funname);
	if (absfun == NULL) {
		/* Unknown function name: tokens of this phrase carry no function. */
		st->curr_absfun = NULL;
		return;
	}

	/* Tokens keep a PgfAbsFun** (array view), so the pointer needs its own cell. */
	PgfAbsFun** cell = gu_new(PgfAbsFun*, st->pool);
	*cell = absfun;
	st->curr_absfun = cell;
}
static void
pgf_lookup_end_phrase(PgfLinFuncs** self, PgfCId cat, int fid, int lindex, PgfCId fun)
{
PgfLookupState* st = gu_container(self, PgfLookupState, funcs);
st->curr_absfun = NULL;
} }
static PgfLinFuncs pgf_lookup_lin_funcs = { static PgfLinFuncs pgf_lookup_lin_funcs = {
.symbol_token = pgf_lookup_symbol_token, .symbol_token = pgf_lookup_symbol_token,
.begin_phrase = NULL, .begin_phrase = pgf_lookup_begin_phrase,
.end_phrase = NULL, .end_phrase = pgf_lookup_end_phrase,
.symbol_ne = NULL, .symbol_ne = NULL,
.symbol_bind = NULL, .symbol_bind = NULL,
.symbol_capit = NULL .symbol_capit = NULL
@@ -773,9 +813,7 @@ pgf_lookup_sentence(PgfConcr* concr, PgfType* typ, GuString sentence, GuPool* po
GuPool *work_pool = gu_new_pool(); GuPool *work_pool = gu_new_pool();
GuBuf* sentence_tokens = GuBuf* sentence_tokens =
pgf_lookup_tokenize(sentence, pgf_lookup_tokenize(lexicon_idx, sentence, work_pool);
strlen(sentence),
work_pool);
PgfMetaId meta_id1 = 0; PgfMetaId meta_id1 = 0;
GuBuf* join = gu_new_buf(GuBuf*, pool); GuBuf* join = gu_new_buf(GuBuf*, pool);
@@ -783,11 +821,11 @@ pgf_lookup_sentence(PgfConcr* concr, PgfType* typ, GuString sentence, GuPool* po
size_t n_tokens = gu_buf_length(sentence_tokens); size_t n_tokens = gu_buf_length(sentence_tokens);
for (size_t i = 0; i < n_tokens; i++) { for (size_t i = 0; i < n_tokens; i++) {
GuString tok = gu_buf_get(sentence_tokens, GuString, i); PgfInputToken* tok = gu_buf_index(sentence_tokens, PgfInputToken, i);
PgfMetaId meta_id2 = 0; PgfMetaId meta_id2 = 0;
GuBuf* spine = GuBuf* spine =
pgf_lookup_build_spine(lexicon_idx, function_idx, pgf_lookup_build_spine(function_idx,
tok, typ, &meta_id2, tok, typ, &meta_id2,
work_pool); work_pool);
@@ -810,9 +848,9 @@ pgf_lookup_sentence(PgfConcr* concr, PgfType* typ, GuString sentence, GuPool* po
st.start_id= meta_id1; st.start_id= meta_id1;
st.choice = gu_new_choice(work_pool); st.choice = gu_new_choice(work_pool);
st.stack = gu_new_buf(PgfMetaId, work_pool); st.stack = gu_new_buf(PgfMetaId, work_pool);
st.expr_tokens=gu_new_buf(GuString, work_pool); st.expr_tokens=gu_new_buf(PgfInputToken, work_pool);
st.ctrees = gu_new_buf(PgfCncTreeScore, pool); st.ctrees = gu_new_buf(PgfCncTreeScore, pool);
st.fid = 0; st.curr_absfun= NULL;
st.pool = pool; st.pool = pool;
GuChoiceMark mark = gu_choice_mark(st.choice); GuChoiceMark mark = gu_choice_mark(st.choice);