1
0
forked from GitHub/gf-core

a major refactoring in the C and the Haskell runtimes. Note incompatible change in the PGF format!!!

The following are the outcomes:

   - Predef.nonExist is fully supported by both the Haskell and the C runtimes

   - Predef.BIND is now an internal compiler defined token. For now
     it behaves just as usual for the Haskell runtime, i.e. it generates &+.
     However, the special treatment will let us to handle it properly in 
     the C runtime.

   - This required a major change in the PGF format since both 
     nonExist and BIND may appear inside 'pre' and this was not supported
     before.
This commit is contained in:
kr.angelov
2013-09-27 15:09:48 +00:00
parent 3f65253f0e
commit efa4bc4d62
30 changed files with 332 additions and 372 deletions

View File

@@ -133,8 +133,8 @@ struct PgfItem {
PgfPArgs* args;
PgfSymbol curr_sym;
uint16_t seq_idx;
uint8_t tok_idx;
uint8_t alt;
uint8_t alt_idx; // position in the pre alternative
uint8_t alt; // the number of the alternative
prob_t inside_prob;
};
@@ -694,7 +694,7 @@ pgf_new_item(PgfItemConts* conts, PgfProduction prod,
item->prod = prod;
item->curr_sym = gu_null_variant;
item->seq_idx = 0;
item->tok_idx = 0;
item->alt_idx = 0;
item->alt = 0;
conts->ref_count++;
@@ -758,8 +758,12 @@ pgf_item_update_arg(PgfItem* item, size_t d, PgfCCat *new_ccat,
static void
pgf_item_advance(PgfItem* item, GuPool* pool)
{
item->seq_idx++;
pgf_item_set_curr_symbol(item, pool);
if (GU_LIKELY(item->alt == 0)) {
item->seq_idx++;
pgf_item_set_curr_symbol(item, pool);
}
else
item->alt_idx++;
}
static void
@@ -1133,8 +1137,7 @@ pgf_parsing_meta_scan(PgfParseState* before, PgfParseState* after,
gu_alignof(PgfSymbolKS),
&item->curr_sym, after->ps->pool);
*((PgfSymbol*)(sks+1)) = prev;
sks->tokens = gu_new_seq(PgfToken, 1, after->ps->pool);
gu_seq_set(sks->tokens, PgfToken, 0, tok);
sks->token = tok;
gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item);
}
@@ -1218,76 +1221,54 @@ pgf_parsing_symbol(PgfParseState* before, PgfParseState* after,
case PGF_SYMBOL_KS: {
if (after != NULL) {
PgfSymbolKS* sks = gu_variant_data(sym);
gu_assert(item->tok_idx < gu_seq_length(sks->tokens));
PgfToken tok =
gu_seq_get(sks->tokens, PgfToken, item->tok_idx++);
if (item->tok_idx == gu_seq_length(sks->tokens)) {
item->tok_idx = 0;
pgf_item_advance(item, after->ps->pool);
}
pgf_parsing_add_transition(before, after, tok, item);
pgf_item_advance(item, after->ps->pool);
pgf_parsing_add_transition(before, after, sks->token, item);
}
break;
}
case PGF_SYMBOL_KP: {
if (after != NULL) {
PgfSymbolKP* skp = gu_variant_data(sym);
size_t idx = item->tok_idx;
uint8_t alt = item->alt;
gu_assert(idx < gu_seq_length(skp->default_form));
if (idx == 0) {
PgfToken tok;
PgfSymbol sym;
if (item->alt == 0) {
PgfItem* new_item;
tok = gu_seq_get(skp->default_form, PgfToken, 0);
new_item = pgf_item_copy(item, after->ps->pool, after->ps);
new_item->tok_idx++;
if (new_item->tok_idx == gu_seq_length(skp->default_form)) {
new_item->tok_idx = 0;
pgf_item_advance(new_item, after->ps->pool);
}
pgf_parsing_add_transition(before, after, tok, new_item);
new_item->alt = 1;
new_item->alt_idx = 0;
sym = gu_seq_get(skp->default_form, PgfSymbol, new_item->alt_idx);
pgf_parsing_symbol(before, after, new_item, sym);
for (size_t i = 0; i < skp->n_forms; i++) {
// XXX: do nubbing properly
PgfTokens* toks = skp->forms[i].form;
PgfTokens* toks2 = skp->default_form;
bool skip = pgf_tokens_equal(toks, toks2);
PgfSequence* syms = skp->forms[i].form;
PgfSequence* syms2 = skp->default_form;
bool skip = false; /*pgf_tokens_equal(toks, toks2);
for (size_t j = 0; j < i; j++) {
PgfTokens* toks2 = skp->forms[j].form;
skip |= pgf_tokens_equal(toks, toks2);
}
}*/
if (!skip) {
tok = gu_seq_get(toks, PgfToken, 0);
new_item = pgf_item_copy(item, after->ps->pool, after->ps);
new_item->tok_idx++;
new_item->alt = i;
if (new_item->tok_idx == gu_seq_length(toks)) {
new_item->tok_idx = 0;
pgf_item_advance(new_item, after->ps->pool);
}
pgf_parsing_add_transition(before, after, tok, new_item);
new_item->alt = i+2;
new_item->alt_idx = 0;
sym = gu_seq_get(syms, PgfSymbol, new_item->alt_idx);
pgf_parsing_symbol(before, after, new_item, sym);
}
}
} else if (alt == 0) {
PgfToken tok =
gu_seq_get(skp->default_form, PgfToken, idx);
item->tok_idx++;
if (item->tok_idx == gu_seq_length(skp->default_form)) {
item->tok_idx = 0;
pgf_item_advance(item, after->ps->pool);
}
pgf_parsing_add_transition(before, after, tok, item);
} else {
gu_assert(alt <= skp->n_forms);
PgfTokens* toks = skp->forms[alt - 1].form;
PgfToken tok = gu_seq_get(toks, PgfToken, idx);
item->tok_idx++;
if (item->tok_idx == gu_seq_length(toks)) {
item->tok_idx = 0;
PgfSequence* syms =
(item->alt == 1) ? skp->default_form :
skp->forms[item->alt-2].form;
if (item->alt_idx < gu_seq_length(syms)) {
sym = gu_seq_get(syms, PgfSymbol, item->alt_idx);
pgf_parsing_symbol(before, after, item, sym);
} else {
item->alt = 0;
pgf_item_advance(item, after->ps->pool);
gu_buf_heap_push(before->agenda, &pgf_item_prob_order, &item);
}
pgf_parsing_add_transition(before, after, tok, item);
}
}
break;
@@ -1357,7 +1338,7 @@ pgf_parsing_symbol(PgfParseState* before, PgfParseState* after,
// XXX TODO proper support
break;
case PGF_SYMBOL_NE: {
// Nothing to be done here
pgf_item_free(before, after, item);
break;
}
default:
@@ -1450,8 +1431,7 @@ pgf_parsing_item(PgfParseState* before, PgfParseState* after, PgfItem* item)
gu_alignof(PgfSymbolKS),
&item->curr_sym, after->ps->pool);
*((PgfSymbol*)(sks+1)) = prev;
sks->tokens = gu_new_seq(PgfToken, 1, after->ps->pool);
gu_seq_set(sks->tokens, PgfToken, 0, tok);
sks->token = tok;
item->seq_idx++;
pgf_parsing_add_transition(before, after, tok, item);
@@ -1755,9 +1735,7 @@ typedef struct {
} PgfPrefixTokenState;
static GuString
pgf_get_tokens(PgfSequence* seq,
uint16_t seq_idx, uint8_t tok_idx,
GuPool* pool)
pgf_get_tokens(PgfSequence* seq, uint16_t seq_idx, GuPool* pool)
{
GuPool* tmp_pool = gu_new_pool();
GuExn* err = gu_new_exn(NULL, gu_kind(type), tmp_pool);
@@ -1773,17 +1751,7 @@ pgf_get_tokens(PgfSequence* seq,
switch (i.tag) {
case PGF_SYMBOL_KS: {
PgfSymbolKS* symks = i.data;
size_t len = gu_seq_length(symks->tokens);
for (size_t i = tok_idx; i < len; i++) {
if (i > 0) {
gu_putc(' ', out, err);
}
PgfToken tok = gu_seq_get(symks->tokens, PgfToken, i);
gu_string_write(tok, out, err);
}
tok_idx = 0;
gu_string_write(symks->token, out, err);
}
default:
goto end;
@@ -1809,18 +1777,9 @@ pgf_prefix_match_token(PgfTokenState* ts0, PgfToken tok, PgfItem* item)
PgfSequence* seq;
pgf_item_sequence(item, &lin_idx, &seq, ts->pool);
uint16_t seq_idx = item->seq_idx;
uint8_t tok_idx = item->tok_idx;
// go one token back
if (tok_idx > 0)
tok_idx--;
else
seq_idx--;
ts->tp = gu_new(PgfTokenProb, ts->pool);
ts->tp->tok =
pgf_get_tokens(seq, seq_idx, tok_idx, ts->pool);
pgf_get_tokens(seq, item->seq_idx-1, ts->pool);
ts->tp->cat = item->conts->ccat->cnccat->abscat->name;
ts->tp->prob = item->inside_prob+item->conts->outside_prob;
}
@@ -2346,17 +2305,15 @@ pgf_morpho_iter(GuMapItor* fn, const void* key, void* value, GuExn* err)
switch (i.tag) {
case PGF_SYMBOL_KS: {
PgfSymbolKS* symks = i.data;
size_t len = gu_seq_length(symks->tokens);
for (size_t i = 0; i < len; i++) {
if (pos >= gu_seq_length(clo->tokens))
goto cont;
if (pos >= gu_seq_length(clo->tokens))
goto cont;
PgfToken tok1 = gu_seq_get(symks->tokens, PgfToken, i);
PgfToken tok2 = gu_seq_get(clo->tokens, PgfToken, pos++);
if (!gu_string_eq(tok1, tok2))
goto cont;
}
PgfToken tok1 = symks->token;
PgfToken tok2 = gu_seq_get(clo->tokens, PgfToken, pos++);
if (!gu_string_eq(tok1, tok2))
goto cont;
}
default:
continue;
@@ -2443,7 +2400,7 @@ pgf_fullform_iter(GuMapItor* fn, const void* key, void* value, GuExn* err)
PgfProductionApply* papp = i.data;
PgfSequence* seq = papp->fun->lins[cfc.lin_idx];
GuString tokens = pgf_get_tokens(seq, 0, 0, st->pool);
GuString tokens = pgf_get_tokens(seq, 0, st->pool);
// create a new production index with keys that
// are multiword units
@@ -2531,12 +2488,10 @@ pgf_fullform_get_analyses(PgfFullFormEntry* entry,
static void
pgf_parser_index_token(PgfConcr* concr,
PgfTokens* tokens,
PgfToken tok,
PgfCCat* ccat, size_t lin_idx, PgfProduction prod,
GuPool *pool)
{
PgfToken tok = gu_seq_get(tokens, PgfToken, 0);
PgfProductionIdx* set =
gu_map_get(concr->leftcorner_tok_idx, &tok, PgfProductionIdx*);
if (set == NULL) {
@@ -2570,6 +2525,47 @@ pgf_parser_index_epsilon(PgfConcr* concr,
gu_buf_push(prods, PgfProduction, prod);
}
static void
pgf_parser_index_symbol(PgfConcr* concr, PgfSymbol sym,
PgfCCat* ccat, size_t lin_idx, PgfProduction prod,
GuPool *pool)
{
GuVariantInfo i = gu_variant_open(sym);
switch (i.tag) {
case PGF_SYMBOL_KS: {
PgfSymbolKS* sks = i.data;
pgf_parser_index_token(concr,
sks->token,
ccat, lin_idx, prod,
pool);
break;
}
case PGF_SYMBOL_KP: {
PgfSymbolKP* skp = i.data;
PgfSymbol sym =
gu_seq_get(skp->default_form, PgfSymbol, 0);
pgf_parser_index_symbol(concr, sym,
ccat, lin_idx, prod,
pool);
for (size_t i = 0; i < skp->n_forms; i++) {
sym = gu_seq_get(skp->forms[i].form, PgfSymbol, 0);
pgf_parser_index_symbol(concr, sym,
ccat, lin_idx, prod,
pool);
}
break;
}
case PGF_SYMBOL_CAT:
case PGF_SYMBOL_LIT:
case PGF_SYMBOL_NE:
case PGF_SYMBOL_VAR:
// Nothing to be done here
break;
default:
gu_impossible();
}
}
void
pgf_parser_index(PgfConcr* concr,
PgfCCat* ccat, PgfProduction prod,
@@ -2586,39 +2582,9 @@ pgf_parser_index(PgfConcr* concr,
PgfSequence* seq = papp->fun->lins[lin_idx];
if (gu_seq_length(seq) > 0) {
GuVariantInfo i = gu_variant_open(gu_seq_get(seq, PgfSymbol, 0));
switch (i.tag) {
case PGF_SYMBOL_KS: {
PgfSymbolKS* sks = i.data;
pgf_parser_index_token(concr,
sks->tokens,
ccat, lin_idx, prod,
pool);
break;
}
case PGF_SYMBOL_KP: {
PgfSymbolKP* skp = i.data;
pgf_parser_index_token(concr,
skp->default_form,
ccat, lin_idx, prod,
pool);
for (size_t i = 0; i < skp->n_forms; i++) {
pgf_parser_index_token(concr,
skp->forms[i].form,
ccat, lin_idx, prod,
pool);
}
break;
}
case PGF_SYMBOL_CAT:
case PGF_SYMBOL_LIT:
case PGF_SYMBOL_NE:
case PGF_SYMBOL_VAR:
// Nothing to be done here
break;
default:
gu_impossible();
}
pgf_parser_index_symbol(concr, gu_seq_get(seq, PgfSymbol, 0),
ccat, lin_idx, prod,
pool);
} else {
pgf_parser_index_epsilon(concr,
ccat, lin_idx, prod,