forked from GitHub/gf-core
finally a smoothed and more precise ranking for lookups
This commit is contained in:
@@ -20,7 +20,6 @@ typedef struct {
|
|||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
PgfAbsFun* fun;
|
PgfAbsFun* fun;
|
||||||
size_t count;
|
|
||||||
PgfMetaId args[0];
|
PgfMetaId args[0];
|
||||||
} PgfAbsProduction;
|
} PgfAbsProduction;
|
||||||
|
|
||||||
@@ -35,7 +34,7 @@ pgf_print_abs_production(PgfMetaId id,
|
|||||||
for (size_t i = 0; i < n_hypos; i++) {
|
for (size_t i = 0; i < n_hypos; i++) {
|
||||||
gu_printf(out,err," ?%d", prod->args[i]);
|
gu_printf(out,err," ?%d", prod->args[i]);
|
||||||
}
|
}
|
||||||
gu_printf(out,err," (%d)\n",prod->count);
|
gu_putc('\n',out,err);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
@@ -112,12 +111,18 @@ typedef struct {
|
|||||||
GuPool* pool;
|
GuPool* pool;
|
||||||
} PgfSpineBuilder;
|
} PgfSpineBuilder;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
PgfToken token;
|
||||||
|
size_t n_funs;
|
||||||
|
PgfAbsFun** funs;
|
||||||
|
} PgfInputToken;
|
||||||
|
|
||||||
static PgfAbsProduction*
|
static PgfAbsProduction*
|
||||||
pgf_lookup_new_production(PgfAbsFun* fun, GuPool *pool) {
|
pgf_lookup_new_production(PgfAbsFun* fun, GuPool *pool)
|
||||||
|
{
|
||||||
size_t n_hypos = gu_seq_length(fun->type->hypos);
|
size_t n_hypos = gu_seq_length(fun->type->hypos);
|
||||||
PgfAbsProduction* prod = gu_new_flex(pool, PgfAbsProduction, args, n_hypos);
|
PgfAbsProduction* prod = gu_new_flex(pool, PgfAbsProduction, args, n_hypos);
|
||||||
prod->fun = fun;
|
prod->fun = fun;
|
||||||
prod->count = 0;
|
|
||||||
for (size_t i = 0; i < n_hypos; i++) {
|
for (size_t i = 0; i < n_hypos; i++) {
|
||||||
prod->args[i] = 0;
|
prod->args[i] = 0;
|
||||||
}
|
}
|
||||||
@@ -166,14 +171,13 @@ pgf_lookup_add_spine_leaf(PgfSpineBuilder* builder, PgfAbsFun *fun)
|
|||||||
{
|
{
|
||||||
PgfMetaId id = pgf_lookup_add_spine_nodes(builder, fun->type->cid);
|
PgfMetaId id = pgf_lookup_add_spine_nodes(builder, fun->type->cid);
|
||||||
PgfAbsProduction* prod = pgf_lookup_new_production(fun, builder->pool);
|
PgfAbsProduction* prod = pgf_lookup_new_production(fun, builder->pool);
|
||||||
prod->count = 1;
|
|
||||||
|
|
||||||
pgf_lookup_add_production(builder, id, prod);
|
pgf_lookup_add_production(builder, id, prod);
|
||||||
}
|
}
|
||||||
|
|
||||||
static GuBuf*
|
static GuBuf*
|
||||||
pgf_lookup_build_spine(GuMap* lexicon_idx, GuMap* function_idx,
|
pgf_lookup_build_spine(GuMap* function_idx,
|
||||||
GuString tok, PgfType* typ, PgfMetaId* meta_id,
|
PgfInputToken* tok, PgfType* typ, PgfMetaId* meta_id,
|
||||||
GuPool* pool)
|
GuPool* pool)
|
||||||
{
|
{
|
||||||
PgfSpineBuilder builder;
|
PgfSpineBuilder builder;
|
||||||
@@ -184,14 +188,8 @@ pgf_lookup_build_spine(GuMap* lexicon_idx, GuMap* function_idx,
|
|||||||
|
|
||||||
gu_buf_push(builder.spine, GuBuf*, NULL);
|
gu_buf_push(builder.spine, GuBuf*, NULL);
|
||||||
|
|
||||||
GuBuf* funs = gu_map_get(lexicon_idx, tok, GuBuf*);
|
for (size_t i = 0; i < tok->n_funs; i++) {
|
||||||
if (funs != NULL) {
|
pgf_lookup_add_spine_leaf(&builder, tok->funs[i]);
|
||||||
size_t n_funs = gu_buf_length(funs);
|
|
||||||
for (size_t i = 0; i < n_funs; i++) {
|
|
||||||
PgfAbsFun* absfun =
|
|
||||||
gu_buf_get(funs, PgfAbsFun*, i);
|
|
||||||
pgf_lookup_add_spine_leaf(&builder, absfun);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
*meta_id = gu_map_get(builder.meta_ids, typ->cid, PgfMetaId);
|
*meta_id = gu_map_get(builder.meta_ids, typ->cid, PgfMetaId);
|
||||||
@@ -264,7 +262,6 @@ pgf_lookup_merge_cats(GuBuf* spine, GuMap* pairs,
|
|||||||
if (prod1->fun == prod2->fun) {
|
if (prod1->fun == prod2->fun) {
|
||||||
PgfAbsProduction* prod =
|
PgfAbsProduction* prod =
|
||||||
pgf_lookup_new_production(prod1->fun, pool);
|
pgf_lookup_new_production(prod1->fun, pool);
|
||||||
prod->count = prod1->count+prod2->count;
|
|
||||||
size_t n_hypos = gu_seq_length(prod->fun->type->hypos);
|
size_t n_hypos = gu_seq_length(prod->fun->type->hypos);
|
||||||
for (size_t l = 0; l < n_hypos; l++) {
|
for (size_t l = 0; l < n_hypos; l++) {
|
||||||
prod->args[l] =
|
prod->args[l] =
|
||||||
@@ -282,7 +279,6 @@ pgf_lookup_merge_cats(GuBuf* spine, GuMap* pairs,
|
|||||||
if (count == 0) {
|
if (count == 0) {
|
||||||
PgfAbsProduction* prod =
|
PgfAbsProduction* prod =
|
||||||
pgf_lookup_new_production(prod1->fun, pool);
|
pgf_lookup_new_production(prod1->fun, pool);
|
||||||
prod->count = prod1->count;
|
|
||||||
size_t n_hypos = gu_seq_length(prod->fun->type->hypos);
|
size_t n_hypos = gu_seq_length(prod->fun->type->hypos);
|
||||||
for (size_t l = 0; l < n_hypos; l++) {
|
for (size_t l = 0; l < n_hypos; l++) {
|
||||||
prod->args[l] =
|
prod->args[l] =
|
||||||
@@ -313,7 +309,6 @@ pgf_lookup_merge_cats(GuBuf* spine, GuMap* pairs,
|
|||||||
if (!found) {
|
if (!found) {
|
||||||
PgfAbsProduction* prod =
|
PgfAbsProduction* prod =
|
||||||
pgf_lookup_new_production(prod2->fun, pool);
|
pgf_lookup_new_production(prod2->fun, pool);
|
||||||
prod->count = prod2->count;
|
|
||||||
size_t n_hypos = gu_seq_length(prod->fun->type->hypos);
|
size_t n_hypos = gu_seq_length(prod->fun->type->hypos);
|
||||||
for (size_t l = 0; l < n_hypos; l++) {
|
for (size_t l = 0; l < n_hypos; l++) {
|
||||||
prod->args[l] =
|
prod->args[l] =
|
||||||
@@ -359,7 +354,7 @@ typedef struct {
|
|||||||
GuBuf* stack;
|
GuBuf* stack;
|
||||||
GuBuf* expr_tokens;
|
GuBuf* expr_tokens;
|
||||||
GuBuf* ctrees;
|
GuBuf* ctrees;
|
||||||
int fid;
|
PgfAbsFun** curr_absfun;
|
||||||
GuPool* pool;
|
GuPool* pool;
|
||||||
} PgfLookupState;
|
} PgfLookupState;
|
||||||
|
|
||||||
@@ -380,7 +375,6 @@ pgf_lookup_extract_app(PgfLookupState* st,
|
|||||||
size_t n_args, PgfMetaId* args)
|
size_t n_args, PgfMetaId* args)
|
||||||
{
|
{
|
||||||
GuChoiceMark mark = gu_choice_mark(st->choice);
|
GuChoiceMark mark = gu_choice_mark(st->choice);
|
||||||
int save_fid = st->fid;
|
|
||||||
|
|
||||||
PgfCncTree ret = gu_null_variant;
|
PgfCncTree ret = gu_null_variant;
|
||||||
PgfCncTreeApp* capp =
|
PgfCncTreeApp* capp =
|
||||||
@@ -417,7 +411,6 @@ redo:;
|
|||||||
} else {
|
} else {
|
||||||
int index = gu_choice_next(st->choice, gu_buf_length(coercions));
|
int index = gu_choice_next(st->choice, gu_buf_length(coercions));
|
||||||
if (index < 0) {
|
if (index < 0) {
|
||||||
st->fid = save_fid;
|
|
||||||
gu_choice_reset(st->choice, mark);
|
gu_choice_reset(st->choice, mark);
|
||||||
if (!gu_choice_advance(st->choice))
|
if (!gu_choice_advance(st->choice))
|
||||||
return gu_null_variant;
|
return gu_null_variant;
|
||||||
@@ -499,7 +492,7 @@ pgf_lookup_extract(PgfLookupState* st, PgfMetaId meta_id, PgfCCat *ccat)
|
|||||||
args, 1, &ret, st->pool);
|
args, 1, &ret, st->pool);
|
||||||
capp->ccat = ccat;
|
capp->ccat = ccat;
|
||||||
capp->fun = gu_seq_get(ccat->lindefs, PgfCncFun*, index);
|
capp->fun = gu_seq_get(ccat->lindefs, PgfCncFun*, index);
|
||||||
capp->fid = st->fid++;
|
capp->fid = 0;
|
||||||
capp->n_vars = 0;
|
capp->n_vars = 0;
|
||||||
capp->context = NULL;
|
capp->context = NULL;
|
||||||
capp->n_args = 1;
|
capp->n_args = 1;
|
||||||
@@ -567,12 +560,12 @@ done:
|
|||||||
}
|
}
|
||||||
|
|
||||||
static GuBuf*
|
static GuBuf*
|
||||||
pgf_lookup_tokenize(GuString buf, size_t len, GuPool* pool)
|
pgf_lookup_tokenize(GuMap* lexicon_idx, GuString sentence, GuPool* pool)
|
||||||
{
|
{
|
||||||
GuBuf* tokens = gu_new_buf(GuString, pool);
|
GuBuf* tokens = gu_new_buf(PgfInputToken, pool);
|
||||||
|
|
||||||
GuUCS c = ' ';
|
GuUCS c = ' ';
|
||||||
const uint8_t* p = (const uint8_t*) buf;
|
const uint8_t* p = (const uint8_t*) sentence;
|
||||||
for (;;) {
|
for (;;) {
|
||||||
while (gu_ucs_is_space(c)) {
|
while (gu_ucs_is_space(c)) {
|
||||||
c = gu_utf8_decode(&p);
|
c = gu_utf8_decode(&p);
|
||||||
@@ -586,12 +579,21 @@ pgf_lookup_tokenize(GuString buf, size_t len, GuPool* pool)
|
|||||||
}
|
}
|
||||||
const uint8_t* end = p-1;
|
const uint8_t* end = p-1;
|
||||||
|
|
||||||
size_t len = end-start;
|
PgfInputToken* tok = gu_buf_extend(tokens);
|
||||||
GuString tok = gu_malloc(pool, len+1);
|
|
||||||
memcpy((uint8_t*) tok, start, len);
|
|
||||||
((uint8_t*) tok)[len] = 0;
|
|
||||||
|
|
||||||
gu_buf_push(tokens, GuString, tok);
|
size_t len = end-start;
|
||||||
|
tok->token = gu_malloc(pool, len+1);
|
||||||
|
memcpy((uint8_t*) tok->token, start, len);
|
||||||
|
((uint8_t*) tok->token)[len] = 0;
|
||||||
|
|
||||||
|
GuBuf* funs = gu_map_get(lexicon_idx, tok->token, GuBuf*);
|
||||||
|
if (funs != NULL) {
|
||||||
|
tok->n_funs = gu_buf_length(funs);
|
||||||
|
tok->funs = gu_buf_data(funs);
|
||||||
|
} else {
|
||||||
|
tok->n_funs = 0;
|
||||||
|
tok->funs = NULL;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return tokens;
|
return tokens;
|
||||||
@@ -610,11 +612,25 @@ pgf_lookup_compute_kernel_helper(GuBuf* sentence_tokens, GuBuf* expr_tokens,
|
|||||||
for (size_t l = 0; l < i; l++) {
|
for (size_t l = 0; l < i; l++) {
|
||||||
matrix[l + dim*j] = score;
|
matrix[l + dim*j] = score;
|
||||||
for (size_t k = j; k > 0; k--) {
|
for (size_t k = j; k > 0; k--) {
|
||||||
GuString sentence_token = gu_buf_get(sentence_tokens, GuString, l);
|
PgfInputToken* sentence_token = gu_buf_index(sentence_tokens, PgfInputToken, l);
|
||||||
GuString expr_token = gu_buf_get(expr_tokens, GuString, k-1);
|
PgfInputToken* expr_token = gu_buf_index(expr_tokens, PgfInputToken, k-1);
|
||||||
|
|
||||||
if (strcmp(sentence_token, expr_token) == 0) {
|
if (strcmp(sentence_token->token, expr_token->token) == 0) {
|
||||||
score += 1 + pgf_lookup_compute_kernel_helper(sentence_tokens, expr_tokens, matrix, l, k-1);
|
score += 1 + pgf_lookup_compute_kernel_helper(sentence_tokens, expr_tokens, matrix, l, k-1);
|
||||||
|
} else {
|
||||||
|
bool match = false;
|
||||||
|
for (size_t i = 0; i < sentence_token->n_funs; i++) {
|
||||||
|
for (size_t j = 0; j < expr_token->n_funs; j++) {
|
||||||
|
if (sentence_token->funs[i] == expr_token->funs[j]) {
|
||||||
|
match = true;
|
||||||
|
goto done;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
done:
|
||||||
|
if (match) {
|
||||||
|
score += 0.5 + pgf_lookup_compute_kernel_helper(sentence_tokens, expr_tokens, matrix, l, k-1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -720,16 +736,40 @@ pgf_lookup_enum_next(GuEnum* self, void* to, GuPool* pool)
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
pgf_lookup_symbol_token(PgfLinFuncs** funcs, PgfToken tok)
|
pgf_lookup_symbol_token(PgfLinFuncs** self, PgfToken token)
|
||||||
{
|
{
|
||||||
PgfLookupState* st = gu_container(funcs, PgfLookupState, funcs);
|
PgfLookupState* st = gu_container(self, PgfLookupState, funcs);
|
||||||
gu_buf_push(st->expr_tokens, PgfToken, tok);
|
PgfInputToken* tok = gu_buf_extend(st->expr_tokens);
|
||||||
|
tok->token = token;
|
||||||
|
tok->n_funs = st->curr_absfun ? 1 : 0;
|
||||||
|
tok->funs = st->curr_absfun;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
pgf_lookup_begin_phrase(PgfLinFuncs** self, PgfCId cat, int fid, int lindex, PgfCId funname)
|
||||||
|
{
|
||||||
|
PgfLookupState* st = gu_container(self, PgfLookupState, funcs);
|
||||||
|
|
||||||
|
PgfAbsFun* absfun = gu_seq_binsearch(st->concr->abstr->funs, pgf_absfun_order, PgfAbsFun, funname);
|
||||||
|
if (absfun != NULL) {
|
||||||
|
st->curr_absfun = gu_new(PgfAbsFun*, st->pool);
|
||||||
|
*st->curr_absfun = absfun;
|
||||||
|
} else {
|
||||||
|
st->curr_absfun = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
pgf_lookup_end_phrase(PgfLinFuncs** self, PgfCId cat, int fid, int lindex, PgfCId fun)
|
||||||
|
{
|
||||||
|
PgfLookupState* st = gu_container(self, PgfLookupState, funcs);
|
||||||
|
st->curr_absfun = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
static PgfLinFuncs pgf_lookup_lin_funcs = {
|
static PgfLinFuncs pgf_lookup_lin_funcs = {
|
||||||
.symbol_token = pgf_lookup_symbol_token,
|
.symbol_token = pgf_lookup_symbol_token,
|
||||||
.begin_phrase = NULL,
|
.begin_phrase = pgf_lookup_begin_phrase,
|
||||||
.end_phrase = NULL,
|
.end_phrase = pgf_lookup_end_phrase,
|
||||||
.symbol_ne = NULL,
|
.symbol_ne = NULL,
|
||||||
.symbol_bind = NULL,
|
.symbol_bind = NULL,
|
||||||
.symbol_capit = NULL
|
.symbol_capit = NULL
|
||||||
@@ -773,9 +813,7 @@ pgf_lookup_sentence(PgfConcr* concr, PgfType* typ, GuString sentence, GuPool* po
|
|||||||
GuPool *work_pool = gu_new_pool();
|
GuPool *work_pool = gu_new_pool();
|
||||||
|
|
||||||
GuBuf* sentence_tokens =
|
GuBuf* sentence_tokens =
|
||||||
pgf_lookup_tokenize(sentence,
|
pgf_lookup_tokenize(lexicon_idx, sentence, work_pool);
|
||||||
strlen(sentence),
|
|
||||||
work_pool);
|
|
||||||
|
|
||||||
PgfMetaId meta_id1 = 0;
|
PgfMetaId meta_id1 = 0;
|
||||||
GuBuf* join = gu_new_buf(GuBuf*, pool);
|
GuBuf* join = gu_new_buf(GuBuf*, pool);
|
||||||
@@ -783,11 +821,11 @@ pgf_lookup_sentence(PgfConcr* concr, PgfType* typ, GuString sentence, GuPool* po
|
|||||||
|
|
||||||
size_t n_tokens = gu_buf_length(sentence_tokens);
|
size_t n_tokens = gu_buf_length(sentence_tokens);
|
||||||
for (size_t i = 0; i < n_tokens; i++) {
|
for (size_t i = 0; i < n_tokens; i++) {
|
||||||
GuString tok = gu_buf_get(sentence_tokens, GuString, i);
|
PgfInputToken* tok = gu_buf_index(sentence_tokens, PgfInputToken, i);
|
||||||
|
|
||||||
PgfMetaId meta_id2 = 0;
|
PgfMetaId meta_id2 = 0;
|
||||||
GuBuf* spine =
|
GuBuf* spine =
|
||||||
pgf_lookup_build_spine(lexicon_idx, function_idx,
|
pgf_lookup_build_spine(function_idx,
|
||||||
tok, typ, &meta_id2,
|
tok, typ, &meta_id2,
|
||||||
work_pool);
|
work_pool);
|
||||||
|
|
||||||
@@ -810,9 +848,9 @@ pgf_lookup_sentence(PgfConcr* concr, PgfType* typ, GuString sentence, GuPool* po
|
|||||||
st.start_id= meta_id1;
|
st.start_id= meta_id1;
|
||||||
st.choice = gu_new_choice(work_pool);
|
st.choice = gu_new_choice(work_pool);
|
||||||
st.stack = gu_new_buf(PgfMetaId, work_pool);
|
st.stack = gu_new_buf(PgfMetaId, work_pool);
|
||||||
st.expr_tokens=gu_new_buf(GuString, work_pool);
|
st.expr_tokens=gu_new_buf(PgfInputToken, work_pool);
|
||||||
st.ctrees = gu_new_buf(PgfCncTreeScore, pool);
|
st.ctrees = gu_new_buf(PgfCncTreeScore, pool);
|
||||||
st.fid = 0;
|
st.curr_absfun= NULL;
|
||||||
st.pool = pool;
|
st.pool = pool;
|
||||||
|
|
||||||
GuChoiceMark mark = gu_choice_mark(st.choice);
|
GuChoiceMark mark = gu_choice_mark(st.choice);
|
||||||
|
|||||||
Reference in New Issue
Block a user