1
0
forked from GitHub/gf-core

support syntagmatic words

This commit is contained in:
Krasimir Angelov
2024-01-02 16:31:22 +01:00
parent 51ea3926a5
commit 68da9226b1
8 changed files with 272 additions and 76 deletions

View File

@@ -269,6 +269,12 @@ struct PGF_INTERNAL_DECL PgfLRShift {
size_t r;
};
// A transition of the LR automaton that is taken by matching a run of
// consecutive token symbols (PgfSymbolKS) against the input.
struct PGF_INTERNAL_DECL PgfLRShiftKS {
// Id of the state that the transition leads to.
size_t next_state;
// The sequence that contains the tokens to be matched.
ref<PgfSequence> seq;
// Index in seq->syms of the first token of the run.
size_t sym_idx;
};
struct PgfLRReduceArg;
struct PGF_INTERNAL_DECL PgfLRProduction {
@@ -300,6 +306,7 @@ struct PGF_INTERNAL_DECL PgfLRReduce {
// One state of the LR table: its outgoing transitions and its reductions.
struct PGF_INTERNAL_DECL PgfLRState {
// Transitions of type PgfLRShift.
ref<Vector<PgfLRShift>> shifts;
// Transitions over runs of tokens. The table maker stores a null (0)
// reference here when the state has no such transitions, so check the
// reference before dereferencing it.
ref<Vector<PgfLRShiftKS>> tokens;
// Reductions that apply in this state.
ref<Vector<PgfLRReduce>> reductions;
};

View File

@@ -4,8 +4,8 @@
#include <algorithm>
// Debug tracing switches. Keep all of them commented out in committed
// code: when defined, the table maker and the parser write traces of
// every state and transition to stderr.
//#define DEBUG_STATE_CREATION
//#define DEBUG_AUTOMATON
//#define DEBUG_PARSER
//#define DEBUG_GENERATOR
struct PgfLRTableMaker::CCat {
@@ -356,12 +356,39 @@ void *PgfLRTableMaker::Item::operator new(size_t size, Item *item) {
return new_item;
}
// Strict weak ordering over Key3 values. Each key denotes the run of
// consecutive token symbols that starts at index `second` of sequence
// `first`. The runs are compared token by token using the first result
// slot of texticmp; a run that is a proper prefix of the other one is
// ordered before it.
bool PgfLRTableMaker::CompareKey3::operator() (const Key3& k1, const Key3& k2) const {
    // True while `pos` still points at a token symbol of the key's sequence.
    auto at_token = [](const Key3 &key, size_t pos) -> bool {
        return pos < key.first->syms.len &&
               ref<PgfSymbol>::get_tag(key.first->syms.data[pos]) == PgfSymbolKS::tag;
    };

    size_t pos1 = k1.second;
    size_t pos2 = k2.second;
    while (at_token(k1, pos1)) {
        // The second run ended first, so k1 is not smaller.
        if (!at_token(k2, pos2))
            return false;

        auto tok1 = ref<PgfSymbolKS>::untagged(k1.first->syms.data[pos1]);
        auto tok2 = ref<PgfSymbolKS>::untagged(k2.first->syms.data[pos2]);

        int res[2] = {0,0};
        texticmp(&tok1->token, &tok2->token, res);
        if (res[0] != 0)
            return (res[0] < 0);

        pos1++;
        pos2++;
    }

    // The first run is exhausted: it is smaller only if the second
    // run still has tokens left.
    return at_token(k2, pos2);
}
struct PgfLRTableMaker::State {
size_t id;
std::vector<Item*> items;
std::vector<Item*> completed;
std::map<Key1,State*,CompareKey1> ccats1;
std::map<Key2,State*,CompareKey2> ccats2;
std::map<Key3,State*,CompareKey3> tokens;
State() {
this->id = 0;
@@ -651,9 +678,19 @@ void PgfLRTableMaker::symbol(State *state, Fold fold, Item *item, PgfSymbol sym)
auto symks = ref<PgfSymbolKS>::untagged(sym);
if (fold == PROBE) {
item->ccat->productive = true;
} else {
auto &next_state = state->tokens[Key3(item->seq,item->sym_idx)];
if (next_state == NULL) {
next_state = new State;
}
while (item->sym_idx < item->seq->syms.len) {
if (ref<PgfSymbol>::get_tag(item->seq->syms.data[item->sym_idx]) != PgfSymbolKS::tag)
break;
item->sym_idx++;
}
item->stk_size++;
next_state->push_item(item);
}
if (item->ref_cnt == 0)
delete item;
break;
}
default:
@@ -879,7 +916,7 @@ void PgfLRTableMaker::complete(State *state, Fold fold, Item *item)
}
}
void PgfLRTableMaker::transition(PgfConcrLincat *lincat, size_t lin_idx, State *&state)
void PgfLRTableMaker::internalize_state(State *&state)
{
MD5Context ctxt;
auto begin = state->items.begin();
@@ -912,11 +949,6 @@ void PgfLRTableMaker::transition(PgfConcrLincat *lincat, size_t lin_idx, State *
delete state;
state = next_state;
}
#if defined(DEBUG_AUTOMATON)
fprintf(stderr, "%s.%zu: state %ld\n",
lincat->name.text, lin_idx, state->id);
#endif
}
ref<PgfLRTable> PgfLRTableMaker::make()
@@ -945,10 +977,38 @@ ref<PgfLRTable> PgfLRTableMaker::make()
}
for (auto &i : state->ccats1) {
transition(i.first.first, i.first.second, i.second);
internalize_state(i.second);
#if defined(DEBUG_AUTOMATON)
fprintf(stderr, "%s.%zu: state %ld\n",
i.first.first->name.text, i.first.second, i.second->id);
#endif
}
for (auto &i : state->ccats2) {
transition(i.first.first->lincat, i.first.second, i.second);
internalize_state(i.second);
#if defined(DEBUG_AUTOMATON)
fprintf(stderr, "%s.%zu: state %ld\n",
i.first.first->lincat->name.text, i.first.second, i.second->id);
#endif
}
for (auto &i : state->tokens) {
internalize_state(i.second);
#if defined(DEBUG_AUTOMATON)
PgfPrinter printer(NULL, 0, NULL);
size_t sym_idx = i.first.second;
ref<PgfSequence> seq = i.first.first;
while (sym_idx < seq->syms.len) {
PgfSymbol sym = seq->syms.data[sym_idx];
if (ref<PgfSymbol>::get_tag(sym) != PgfSymbolKS::tag)
break;
printer.symbol(sym);
sym_idx++;
}
printer.nprintf(64, ": state %ld\n", i.second->id);
PgfText *text = printer.get_text();
fputs(text->text, stderr);
free(text);
#endif
}
}
@@ -971,6 +1031,18 @@ ref<PgfLRTable> PgfLRTableMaker::make()
shift->next_state = i.second->id;
}
ref<Vector<PgfLRShiftKS>> tokens = 0;
if (state->tokens.size() > 0) {
size_t index = 0;
tokens = vector_new<PgfLRShiftKS>(state->tokens.size());
for (auto i : state->tokens) {
ref<PgfLRShiftKS> shift = vector_elem(tokens,index++);
shift->seq = i.first.first;
shift->sym_idx = i.first.second;
shift->next_state = i.second->id;
}
}
auto reductions = vector_new<PgfLRReduce>(state->completed.size());
for (size_t i = 0; i < state->completed.size(); i++) {
Item *item = state->completed[i];
@@ -993,6 +1065,7 @@ ref<PgfLRTable> PgfLRTableMaker::make()
ref<PgfLRState> lrstate = vector_elem(lrtable, state->id);
lrstate->shifts = shifts;
lrstate->tokens = tokens;
lrstate->reductions = reductions;
}
return lrtable;
@@ -1111,19 +1184,38 @@ void PgfParser::print_prod(Choice *choice, Production *prod)
free(text);
}
void PgfParser::print_transition(StackNode *source, StackNode *target, Stage *stage)
void PgfParser::print_transition(StackNode *source, StackNode *target, Stage *stage, ref<PgfLRShiftKS> shift)
{
fprintf(stderr, "state %ld --- ?%d ---> state %ld (position %zu-%zu, nodes %zu)\n",
source->state_id, target->choice->fid, target->state_id,
stage->start.pos, stage->end.pos,
stage->nodes.size());
PgfPrinter printer(NULL, 0, m);
printer.nprintf(64, "state %ld --- ", source->state_id);
if (target->choice != 0) {
printer.nprintf(32, "?%d", target->choice->fid);
}
if (shift != 0) {
size_t sym_idx = shift->sym_idx;
ref<PgfSequence> seq = shift->seq;
while (sym_idx < seq->syms.len) {
PgfSymbol sym = seq->syms.data[sym_idx];
if (ref<PgfSymbol>::get_tag(sym) != PgfSymbolKS::tag)
break;
printer.symbol(sym);
sym_idx++;
}
}
printer.nprintf(80, " ---> state %ld (position %zu-%zu, nodes %zu)\n",
target->state_id,
stage->start.pos, stage->end.pos, stage->nodes.size());
PgfText *text = printer.get_text();
fputs(text->text, stderr);
free(text);
}
#endif
PgfParser::PgfParser(ref<PgfConcr> concr, ref<PgfConcrLincat> start, PgfText *sentence, PgfMarshaller *m, PgfUnmarshaller *u)
PgfParser::PgfParser(ref<PgfConcr> concr, ref<PgfConcrLincat> start, PgfText *sentence, bool case_sensitive, PgfMarshaller *m, PgfUnmarshaller *u)
{
this->concr = concr;
this->sentence = sentence;
this->case_sensitive = case_sensitive;
this->m = m;
this->u = u;
this->last_fid = 0;
@@ -1134,12 +1226,12 @@ PgfParser::PgfParser(ref<PgfConcr> concr, ref<PgfConcrLincat> start, PgfText *se
spot.pos = 0;
spot.ptr = (uint8_t*) sentence->text;
this->before = NULL;
this->before = new Stage(spot);
this->after = NULL;
this->ahead = new Stage(spot);
this->ahead = NULL;
StackNode *node = new StackNode(ahead, 0);
this->ahead->nodes.push_back(node);
StackNode *node = new StackNode(before, 0);
this->before->nodes.push_back(node);
}
void PgfParser::shift(StackNode *parent, ref<PgfConcrLincat> lincat, size_t r, Production *prod,
@@ -1172,7 +1264,7 @@ void PgfParser::shift(StackNode *parent, ref<PgfConcrLincat> lincat, size_t r, P
if (std::find(node->parents.begin(), node->parents.end(), parent) == node->parents.end()) {
node->parents.push_back(parent);
#ifdef DEBUG_PARSER
print_transition(parent,node,after);
print_transition(parent,node,after,0);
#endif
}
@@ -1181,6 +1273,48 @@ void PgfParser::shift(StackNode *parent, ref<PgfConcrLincat> lincat, size_t r, P
}
}
// Tries every token transition of the LR state of `parent`. Each
// transition whose token run matches the sentence text starting at
// before->end produces (or reuses) a stack node in the `after` stage and
// links `parent` to it.
void PgfParser::shift(StackNode *parent, Stage *before)
{
    ref<Vector<PgfLRShiftKS>> ks_shifts =
        vector_elem(concr->lrtable,parent->state_id)->tokens;
    // States without token transitions carry a null reference.
    if (ks_shifts == 0)
        return;

    const uint8_t *sent_end = (const uint8_t *) &sentence->text[sentence->size];

    for (size_t k = 0; k < ks_shifts->len; k++) {
        ref<PgfLRShiftKS> ks = vector_elem(ks_shifts, k);

        // Match on a copy of the spot so that every transition starts
        // from the same position in the sentence.
        PgfTextSpot spot = before->end;
        size_t sym_idx = ks->sym_idx;
        int cmp =
            text_sequence_cmp(&spot, sent_end,
                              ks->seq, &sym_idx,
                              case_sensitive, SM_PARTIAL);
        if (cmp != 0)
            continue;

        start_matches(&spot, NULL);

        // Reuse a node with the same origin stage and target state
        // if the `after` stage already has one.
        auto found =
            std::find_if(after->nodes.begin(), after->nodes.end(),
                         [&](StackNode *n) {
                             return n->stage == before &&
                                    n->state_id == ks->next_state;
                         });

        StackNode *target;
        if (found != after->nodes.end()) {
            target = *found;
        } else {
            target = new StackNode(before, ks->next_state);
            target->choice = NULL;
            after->nodes.push_back(target);
        }

        bool already_linked =
            std::find(target->parents.begin(), target->parents.end(), parent) != target->parents.end();
        if (!already_linked) {
            target->parents.push_back(parent);
#ifdef DEBUG_PARSER
            print_transition(parent,target,after,ks);
#endif
        }

        end_matches(&spot, NULL);
    }
}
PgfParser::Choice *PgfParser::intersect_choice(Choice *choice1, Choice *choice2, intersection_map &im)
{
if (choice1 == NULL)
@@ -1352,6 +1486,7 @@ void PgfParser::space(PgfTextSpot *start, PgfTextSpot *end, PgfExn* err)
while (i < before->nodes.size()) {
StackNode *node = before->nodes[i++];
reduce_all(node);
shift(node, before);
}
}

View File

@@ -53,6 +53,12 @@ class PGF_INTERNAL_DECL PgfLRTableMaker
}
};
// Key for the token transitions of a state: a sequence together with the
// index of a symbol in it.
typedef std::pair<ref<PgfSequence>,size_t> Key3;
// Ordering used for maps keyed by Key3; the operator is defined out of line.
struct PGF_INTERNAL_DECL CompareKey3 : std::less<Key3> {
bool operator() (const Key3& k1, const Key3& k2) const;
};
ref<PgfAbstr> abstr;
ref<PgfConcr> concr;
@@ -81,7 +87,7 @@ class PGF_INTERNAL_DECL PgfLRTableMaker
void print_production(CCat *ccat, Production *prod);
void print_item(Item *item);
void transition(PgfConcrLincat *lincat, size_t lin_idx, State *&state);
void internalize_state(State *&state);
public:
PgfLRTableMaker(ref<PgfAbstr> abstr, ref<PgfConcr> concr);
@@ -95,6 +101,7 @@ class PGF_INTERNAL_DECL PgfParser : public PgfPhraseScanner, public PgfExprEnum
{
ref<PgfConcr> concr;
PgfText *sentence;
bool case_sensitive;
PgfMarshaller *m;
PgfUnmarshaller *u;
@@ -119,6 +126,7 @@ class PGF_INTERNAL_DECL PgfParser : public PgfPhraseScanner, public PgfExprEnum
void shift(StackNode *parent, ref<PgfConcrLincat> lincat, size_t r, Production *prod,
Stage *before, Stage *after);
void shift(StackNode *parent, Stage *before);
void reduce(StackNode *parent, ref<PgfConcrLin> lin, ref<PgfLRReduce> red,
size_t n, std::vector<Choice*> &args,
Stage *before, Stage *after);
@@ -127,7 +135,7 @@ class PGF_INTERNAL_DECL PgfParser : public PgfPhraseScanner, public PgfExprEnum
size_t n, std::vector<Choice*> &args);
void reduce_all(StackNode *state);
void print_prod(Choice *choice, Production *prod);
void print_transition(StackNode *source, StackNode *target, Stage *stage);
void print_transition(StackNode *source, StackNode *target, Stage *stage, ref<PgfLRShiftKS> shift);
typedef std::map<std::pair<Choice*,Choice*>,Choice*> intersection_map;
@@ -144,7 +152,7 @@ class PGF_INTERNAL_DECL PgfParser : public PgfPhraseScanner, public PgfExprEnum
void release_expr_state(ExprState *state);
public:
PgfParser(ref<PgfConcr> concr, ref<PgfConcrLincat> start, PgfText *sentence, PgfMarshaller *m, PgfUnmarshaller *u);
PgfParser(ref<PgfConcr> concr, ref<PgfConcrLincat> start, PgfText *sentence, bool case_sensitive, PgfMarshaller *m, PgfUnmarshaller *u);
virtual void space(PgfTextSpot *start, PgfTextSpot *end, PgfExn* err);
virtual void start_matches(PgfTextSpot *end, PgfExn* err);

View File

@@ -2743,7 +2743,7 @@ PgfExprEnum *pgf_parse(PgfDB *db, PgfConcrRevision revision,
if (lincat_u.lincat == 0)
return 0;
PgfParser *parser = new PgfParser(concr, lincat_u.lincat, sentence, m, u);
PgfParser *parser = new PgfParser(concr, lincat_u.lincat, sentence, case_sensitive, m, u);
phrasetable_lookup_cohorts(concr->phrasetable,
sentence, case_sensitive,
parser, err);
@@ -3170,6 +3170,24 @@ pgf_graphviz_lr_automaton(PgfDB *db, PgfConcrRevision revision,
printer.efun(&shift->lincat->name);
printer.nprintf(16, ".%zu\"];\n", shift->r);
}
for (size_t j = 0; j < state->tokens->len; j++) {
ref<PgfLRShiftKS> shift = vector_elem(state->tokens, j);
printer.nprintf(16, " s%zu -> s%zu [label=\"", i, shift->next_state);
size_t sym_idx = shift->sym_idx;
while (sym_idx < shift->seq->syms.len) {
if (ref<PgfSymbol>::get_tag(shift->seq->syms.data[sym_idx]) != PgfSymbolKS::tag)
break;
if (sym_idx > shift->sym_idx)
printer.puts(" ");
auto symks = ref<PgfSymbolKS>::untagged(shift->seq->syms.data[sym_idx]);
printer.puts("\\\"");
printer.put_esc_str(&symks->token);
printer.puts("\\\"");
sym_idx++;
}
printer.puts("\"];\n");
}
}
printer.puts("}");

View File

@@ -228,28 +228,33 @@ int sequence_cmp(ref<PgfSequence> seq1, ref<PgfSequence> seq2)
return 0;
}
static
PGF_INTERNAL
int text_sequence_cmp(PgfTextSpot *spot, const uint8_t *end,
ref<PgfSequence> seq,
bool case_sensitive, bool full_match)
ref<PgfSequence> seq, size_t *p_i,
bool case_sensitive, SeqMatch sm)
{
int res1 = 0;
size_t i = 0;
const uint8_t *s2 = NULL;
const uint8_t *e2 = NULL;
uint8_t t = 0xff;
if (*p_i < seq->syms.len) {
t = ref<PgfSymbol>::get_tag(seq->syms.data[*p_i]);
}
size_t count = 0;
for (;;) {
if (spot->ptr >= end) {
if (s2 < e2 || i < seq->syms.len)
if (s2 < e2 || t == PgfSymbolKS::tag)
return -1;
return case_sensitive ? res1 : 0;
}
if (s2 >= e2 && i >= seq->syms.len)
return full_match ? 1 : 0;
if (s2 >= e2 && t != PgfSymbolKS::tag) {
return (sm == SM_FULL_MATCH) ? 1 : 0;
}
uint32_t ucs1 = pgf_utf8_decode(&spot->ptr); spot->pos++;
uint32_t ucs1i = pgf_utf8_to_upper(ucs1);
@@ -268,16 +273,21 @@ int text_sequence_cmp(PgfTextSpot *spot, const uint8_t *end,
}
}
uint8_t t = ref<PgfSymbol>::get_tag(seq->syms.data[i]);
if (t != PgfSymbolKS::tag) {
if (sm == SM_PARTIAL)
return 0;
return ((int) PgfSymbolKS::tag) - ((int) t);
}
auto sym_ks = ref<PgfSymbolKS>::untagged(seq->syms.data[i]);
auto sym_ks = ref<PgfSymbolKS>::untagged(seq->syms.data[*p_i]);
s2 = (uint8_t *) &sym_ks->token.text;
e2 = s2+sym_ks->token.size;
i++;
(*p_i)++;
t = 0xff;
if (*p_i < seq->syms.len) {
t = ref<PgfSymbol>::get_tag(seq->syms.data[*p_i]);
}
}
uint32_t ucs2 = pgf_utf8_decode(&s2);
@@ -552,7 +562,8 @@ void phrasetable_lookup(PgfPhrasetable table,
current.pos = 0;
current.ptr = (uint8_t *) sentence->text;
const uint8_t *end = current.ptr+sentence->size;
int cmp = text_sequence_cmp(&current,end,table->value.seq,case_sensitive,true);
size_t sym_idx = 0;
int cmp = text_sequence_cmp(&current,end,table->value.seq,&sym_idx,case_sensitive,SM_FULL_MATCH);
if (cmp < 0) {
phrasetable_lookup(table->left,sentence,case_sensitive,scanner,err);
} else if (cmp > 0) {
@@ -662,7 +673,8 @@ void phrasetable_lookup_prefixes(PgfCohortsState *state,
return;
PgfTextSpot current = state->spot;
int cmp = text_sequence_cmp(&current,state->end,table->value.seq,state->case_sensitive,false);
size_t sym_idx = 0;
int cmp = text_sequence_cmp(&current,state->end,table->value.seq,&sym_idx,state->case_sensitive,SM_PREFIX);
if (cmp < 0) {
phrasetable_lookup_prefixes(state,table->left,min,max);
} else if (cmp > 0) {

View File

@@ -115,4 +115,13 @@ void phrasetable_iter(PgfConcr *concr,
PGF_INTERNAL_DECL
void phrasetable_release(PgfPhrasetable table);
// The following are used internally in the parser
// Matching mode passed to text_sequence_cmp. SM_FULL_MATCH and SM_PREFIX
// take the place of the former boolean full_match argument (true and
// false respectively); SM_PARTIAL is the mode used by the parser when it
// matches the token run of an LR transition.
enum SeqMatch { SM_FULL_MATCH, SM_PREFIX, SM_PARTIAL };
// Compares the text between spot->ptr and end with the tokens of seq,
// starting from the symbol with index *p_i. Both *spot and *p_i are
// advanced as text and tokens are consumed. Callers treat a negative,
// zero or positive result as the text being ordered before, matching,
// or ordered after the sequence.
PGF_INTERNAL_DECL
int text_sequence_cmp(PgfTextSpot *spot, const uint8_t *end,
ref<PgfSequence> seq, size_t *p_i,
bool case_sensitive, SeqMatch sm);
#endif

View File

@@ -45,6 +45,47 @@ void PgfPrinter::puts(const char *s)
}
}
// Prints the content of v with backslash escapes for backslash, double
// quote, newline, carriage return, backspace, tab and NUL. Every other
// code point is copied byte for byte. No surrounding quotes are printed.
void PgfPrinter::put_esc_str(PgfText *v)
{
    // Scratch text that holds the bytes of a single code point.
    PgfText *raw = (PgfText *) alloca(sizeof(PgfText)+7);

    const uint8_t *cur = (uint8_t*) v->text;
    const uint8_t *const stop = cur + v->size;
    while (cur < stop) {
        const uint8_t *next = cur;
        const uint32_t ch = pgf_utf8_decode(&next);

        // Pick the escape sequence, if the code point needs one.
        const char *escape = NULL;
        if (ch == '\\')
            escape = "\\\\";
        else if (ch == '"')
            escape = "\\\"";
        else if (ch == '\n')
            escape = "\\n";
        else if (ch == '\r')
            escape = "\\r";
        else if (ch == '\b')
            escape = "\\b";
        else if (ch == '\t')
            escape = "\\t";
        else if (ch == '\0')
            escape = "\\0";

        if (escape != NULL) {
            puts(escape);
        } else {
            // Copy the encoded bytes unchanged and print them as a text.
            raw->size = next-cur;
            memcpy(raw->text, cur, raw->size);
            raw->text[raw->size] = 0;
            puts(raw);
        }

        cur = next;
    }
}
void PgfPrinter::nprintf(size_t buf_size, const char *format, ...)
{
again: {
@@ -348,44 +389,8 @@ PgfLiteral PgfPrinter::lflt(double v)
PgfLiteral PgfPrinter::lstr(PgfText *v)
{
PgfText *charbuf = (PgfText *) alloca(sizeof(PgfText)+7);
puts("\"");
const uint8_t* start = (uint8_t*) v->text;
const uint8_t* end = start + v->size;
while (start < end) {
const uint8_t* s = start;
uint32_t c = pgf_utf8_decode(&s);
switch (c) {
case '\\':
puts("\\\\");
break;
case '"':
puts("\\\"");
break;
case '\n':
puts("\\n");
break;
case '\r':
puts("\\r");
break;
case '\b':
puts("\\b");
break;
case '\t':
puts("\\t");
break;
case '\0':
puts("\\0");
break;
default:
charbuf->size = s-start;
memcpy(charbuf->text, start, charbuf->size);
charbuf->text[charbuf->size] = 0;
puts(charbuf);
}
start = s;
}
put_esc_str(v);
puts("\"");
return 0;
}

View File

@@ -46,6 +46,8 @@ public:
void puts(PgfText *s);
void puts(const char *s);
void put_esc_str(PgfText *v);
// buf_size is the expected buffer size. If larger is needed,
// it will be allocated automatically.
#if defined(_MSC_VER)