diff --git a/src/runtime/c/pgf/data.h b/src/runtime/c/pgf/data.h index 5dfd97336..8ad6e34dd 100644 --- a/src/runtime/c/pgf/data.h +++ b/src/runtime/c/pgf/data.h @@ -269,6 +269,12 @@ struct PGF_INTERNAL_DECL PgfLRShift { size_t r; }; +struct PGF_INTERNAL_DECL PgfLRShiftKS { + size_t next_state; + ref seq; + size_t sym_idx; +}; + struct PgfLRReduceArg; struct PGF_INTERNAL_DECL PgfLRProduction { @@ -300,6 +306,7 @@ struct PGF_INTERNAL_DECL PgfLRReduce { struct PGF_INTERNAL_DECL PgfLRState { ref> shifts; + ref> tokens; ref> reductions; }; diff --git a/src/runtime/c/pgf/parser.cxx b/src/runtime/c/pgf/parser.cxx index b5850a889..b862f3016 100644 --- a/src/runtime/c/pgf/parser.cxx +++ b/src/runtime/c/pgf/parser.cxx @@ -4,8 +4,8 @@ #include //#define DEBUG_STATE_CREATION -//#define DEBUG_AUTOMATON -//#define DEBUG_PARSER +#define DEBUG_AUTOMATON +#define DEBUG_PARSER //#define DEBUG_GENERATOR struct PgfLRTableMaker::CCat { @@ -356,12 +356,39 @@ void *PgfLRTableMaker::Item::operator new(size_t size, Item *item) { return new_item; } +bool PgfLRTableMaker::CompareKey3::operator() (const Key3& k1, const Key3& k2) const { + size_t i = k1.second; + size_t j = k2.second; + for (;;) { + if (i >= k1.first->syms.len || ref::get_tag(k1.first->syms.data[i]) != PgfSymbolKS::tag) + return (j < k2.first->syms.len && ref::get_tag(k2.first->syms.data[j]) == PgfSymbolKS::tag); + + if (j >= k2.first->syms.len || ref::get_tag(k2.first->syms.data[j]) != PgfSymbolKS::tag) + return false; + + auto symks1 = ref::untagged(k1.first->syms.data[i]); + auto symks2 = ref::untagged(k2.first->syms.data[j]); + + int res[2] = {0,0}; + texticmp(&symks1->token, &symks2->token, res); + if (res[0] < 0) + return true; + if (res[0] > 0) + return false; + + i++; j++; + } + + return false; +} + struct PgfLRTableMaker::State { size_t id; std::vector items; std::vector completed; std::map ccats1; std::map ccats2; + std::map tokens; State() { this->id = 0; @@ -651,9 +678,19 @@ void PgfLRTableMaker::symbol(State 
*state, Fold fold, Item *item, PgfSymbol sym) auto symks = ref::untagged(sym); if (fold == PROBE) { item->ccat->productive = true; + } else { + auto &next_state = state->tokens[Key3(item->seq,item->sym_idx)]; + if (next_state == NULL) { + next_state = new State; + } + while (item->sym_idx < item->seq->syms.len) { + if (ref::get_tag(item->seq->syms.data[item->sym_idx]) != PgfSymbolKS::tag) + break; + item->sym_idx++; + } + item->stk_size++; + next_state->push_item(item); } - if (item->ref_cnt == 0) - delete item; break; } default: @@ -879,7 +916,7 @@ void PgfLRTableMaker::complete(State *state, Fold fold, Item *item) } } -void PgfLRTableMaker::transition(PgfConcrLincat *lincat, size_t lin_idx, State *&state) +void PgfLRTableMaker::internalize_state(State *&state) { MD5Context ctxt; auto begin = state->items.begin(); @@ -912,11 +949,6 @@ void PgfLRTableMaker::transition(PgfConcrLincat *lincat, size_t lin_idx, State * delete state; state = next_state; } - -#if defined(DEBUG_AUTOMATON) - fprintf(stderr, "%s.%zu: state %ld\n", - lincat->name.text, lin_idx, state->id); -#endif } ref PgfLRTableMaker::make() @@ -945,10 +977,38 @@ ref PgfLRTableMaker::make() } for (auto &i : state->ccats1) { - transition(i.first.first, i.first.second, i.second); + internalize_state(i.second); +#if defined(DEBUG_AUTOMATON) + fprintf(stderr, "%s.%zu: state %ld\n", + i.first.first->name.text, i.first.second, i.second->id); +#endif } for (auto &i : state->ccats2) { - transition(i.first.first->lincat, i.first.second, i.second); + internalize_state(i.second); +#if defined(DEBUG_AUTOMATON) + fprintf(stderr, "%s.%zu: state %ld\n", + i.first.first->lincat->name.text, i.first.second, i.second->id); +#endif + } + for (auto &i : state->tokens) { + internalize_state(i.second); +#if defined(DEBUG_AUTOMATON) + PgfPrinter printer(NULL, 0, NULL); + size_t sym_idx = i.first.second; + ref seq = i.first.first; + while (sym_idx < seq->syms.len) { + PgfSymbol sym = seq->syms.data[sym_idx]; + if 
(ref::get_tag(sym) != PgfSymbolKS::tag) + break; + printer.symbol(sym); + sym_idx++; + } + printer.nprintf(64, ": state %ld\n", i.second->id); + + PgfText *text = printer.get_text(); + fputs(text->text, stderr); + free(text); +#endif } } @@ -971,6 +1031,18 @@ ref PgfLRTableMaker::make() shift->next_state = i.second->id; } + ref> tokens = 0; + if (state->tokens.size() > 0) { + size_t index = 0; + tokens = vector_new(state->tokens.size()); + for (auto i : state->tokens) { + ref shift = vector_elem(tokens,index++); + shift->seq = i.first.first; + shift->sym_idx = i.first.second; + shift->next_state = i.second->id; + } + } + auto reductions = vector_new(state->completed.size()); for (size_t i = 0; i < state->completed.size(); i++) { Item *item = state->completed[i]; @@ -993,6 +1065,7 @@ ref PgfLRTableMaker::make() ref lrstate = vector_elem(lrtable, state->id); lrstate->shifts = shifts; + lrstate->tokens = tokens; lrstate->reductions = reductions; } return lrtable; @@ -1111,19 +1184,38 @@ void PgfParser::print_prod(Choice *choice, Production *prod) free(text); } -void PgfParser::print_transition(StackNode *source, StackNode *target, Stage *stage) +void PgfParser::print_transition(StackNode *source, StackNode *target, Stage *stage, ref shift) { - fprintf(stderr, "state %ld --- ?%d ---> state %ld (position %zu-%zu, nodes %zu)\n", - source->state_id, target->choice->fid, target->state_id, - stage->start.pos, stage->end.pos, - stage->nodes.size()); + PgfPrinter printer(NULL, 0, m); + printer.nprintf(64, "state %ld --- ", source->state_id); + if (target->choice != 0) { + printer.nprintf(32, "?%d", target->choice->fid); + } + if (shift != 0) { + size_t sym_idx = shift->sym_idx; + ref seq = shift->seq; + while (sym_idx < seq->syms.len) { + PgfSymbol sym = seq->syms.data[sym_idx]; + if (ref::get_tag(sym) != PgfSymbolKS::tag) + break; + printer.symbol(sym); + sym_idx++; + } + } + printer.nprintf(80, " ---> state %ld (position %zu-%zu, nodes %zu)\n", + target->state_id, + 
stage->start.pos, stage->end.pos, stage->nodes.size()); + PgfText *text = printer.get_text(); + fputs(text->text, stderr); + free(text); } #endif -PgfParser::PgfParser(ref concr, ref start, PgfText *sentence, PgfMarshaller *m, PgfUnmarshaller *u) +PgfParser::PgfParser(ref concr, ref start, PgfText *sentence, bool case_sensitive, PgfMarshaller *m, PgfUnmarshaller *u) { this->concr = concr; this->sentence = sentence; + this->case_sensitive = case_sensitive; this->m = m; this->u = u; this->last_fid = 0; @@ -1134,12 +1226,12 @@ PgfParser::PgfParser(ref concr, ref start, PgfText *se spot.pos = 0; spot.ptr = (uint8_t*) sentence->text; - this->before = NULL; + this->before = new Stage(spot); this->after = NULL; - this->ahead = new Stage(spot); + this->ahead = NULL; - StackNode *node = new StackNode(ahead, 0); - this->ahead->nodes.push_back(node); + StackNode *node = new StackNode(before, 0); + this->before->nodes.push_back(node); } void PgfParser::shift(StackNode *parent, ref lincat, size_t r, Production *prod, @@ -1172,7 +1264,7 @@ void PgfParser::shift(StackNode *parent, ref lincat, size_t r, P if (std::find(node->parents.begin(), node->parents.end(), parent) == node->parents.end()) { node->parents.push_back(parent); #ifdef DEBUG_PARSER - print_transition(parent,node,after); + print_transition(parent,node,after,0); #endif } @@ -1181,6 +1273,48 @@ void PgfParser::shift(StackNode *parent, ref lincat, size_t r, P } } +void PgfParser::shift(StackNode *parent, Stage *before) +{ + ref> shifts = vector_elem(concr->lrtable,parent->state_id)->tokens; + if (shifts != 0) { + const uint8_t *sent_end = (const uint8_t *) &sentence->text[sentence->size]; + for (size_t i = 0; i < shifts->len; i++) { + ref shift = vector_elem(shifts, i); + PgfTextSpot spot = before->end; + size_t sym_idx = shift->sym_idx; + int cmp = + text_sequence_cmp(&spot, sent_end, + shift->seq, &sym_idx, + case_sensitive, SM_PARTIAL); + if (cmp == 0) { + start_matches(&spot, NULL); + + StackNode *node = NULL; + 
for (StackNode *n : after->nodes) { + if (n->stage == before && n->state_id == shift->next_state) { + node = n; + break; + } + } + if (node == NULL) { + node = new StackNode(before, shift->next_state); + node->choice = NULL; + after->nodes.push_back(node); + } + + if (std::find(node->parents.begin(), node->parents.end(), parent) == node->parents.end()) { + node->parents.push_back(parent); +#ifdef DEBUG_PARSER + print_transition(parent,node,after,shift); +#endif + } + + end_matches(&spot, NULL); + } + } + } +} + PgfParser::Choice *PgfParser::intersect_choice(Choice *choice1, Choice *choice2, intersection_map &im) { if (choice1 == NULL) @@ -1352,6 +1486,7 @@ void PgfParser::space(PgfTextSpot *start, PgfTextSpot *end, PgfExn* err) while (i < before->nodes.size()) { StackNode *node = before->nodes[i++]; reduce_all(node); + shift(node, before); } } diff --git a/src/runtime/c/pgf/parser.h b/src/runtime/c/pgf/parser.h index 2e35fa191..0e8654bd4 100644 --- a/src/runtime/c/pgf/parser.h +++ b/src/runtime/c/pgf/parser.h @@ -53,6 +53,12 @@ class PGF_INTERNAL_DECL PgfLRTableMaker } }; + typedef std::pair,size_t> Key3; + + struct PGF_INTERNAL_DECL CompareKey3 : std::less { + bool operator() (const Key3& k1, const Key3& k2) const; + }; + ref abstr; ref concr; @@ -81,7 +87,7 @@ class PGF_INTERNAL_DECL PgfLRTableMaker void print_production(CCat *ccat, Production *prod); void print_item(Item *item); - void transition(PgfConcrLincat *lincat, size_t lin_idx, State *&state); + void internalize_state(State *&state); public: PgfLRTableMaker(ref abstr, ref concr); @@ -95,6 +101,7 @@ class PGF_INTERNAL_DECL PgfParser : public PgfPhraseScanner, public PgfExprEnum { ref concr; PgfText *sentence; + bool case_sensitive; PgfMarshaller *m; PgfUnmarshaller *u; @@ -119,6 +126,7 @@ class PGF_INTERNAL_DECL PgfParser : public PgfPhraseScanner, public PgfExprEnum void shift(StackNode *parent, ref lincat, size_t r, Production *prod, Stage *before, Stage *after); + void shift(StackNode *parent, Stage 
*before); void reduce(StackNode *parent, ref lin, ref red, size_t n, std::vector &args, Stage *before, Stage *after); @@ -127,7 +135,7 @@ class PGF_INTERNAL_DECL PgfParser : public PgfPhraseScanner, public PgfExprEnum size_t n, std::vector &args); void reduce_all(StackNode *state); void print_prod(Choice *choice, Production *prod); - void print_transition(StackNode *source, StackNode *target, Stage *stage); + void print_transition(StackNode *source, StackNode *target, Stage *stage, ref shift); typedef std::map,Choice*> intersection_map; @@ -144,7 +152,7 @@ class PGF_INTERNAL_DECL PgfParser : public PgfPhraseScanner, public PgfExprEnum void release_expr_state(ExprState *state); public: - PgfParser(ref concr, ref start, PgfText *sentence, PgfMarshaller *m, PgfUnmarshaller *u); + PgfParser(ref concr, ref start, PgfText *sentence, bool case_sensitive, PgfMarshaller *m, PgfUnmarshaller *u); virtual void space(PgfTextSpot *start, PgfTextSpot *end, PgfExn* err); virtual void start_matches(PgfTextSpot *end, PgfExn* err); diff --git a/src/runtime/c/pgf/pgf.cxx b/src/runtime/c/pgf/pgf.cxx index adf95efe7..c2c5bd9b7 100644 --- a/src/runtime/c/pgf/pgf.cxx +++ b/src/runtime/c/pgf/pgf.cxx @@ -2743,7 +2743,7 @@ PgfExprEnum *pgf_parse(PgfDB *db, PgfConcrRevision revision, if (lincat_u.lincat == 0) return 0; - PgfParser *parser = new PgfParser(concr, lincat_u.lincat, sentence, m, u); + PgfParser *parser = new PgfParser(concr, lincat_u.lincat, sentence, case_sensitive, m, u); phrasetable_lookup_cohorts(concr->phrasetable, sentence, case_sensitive, parser, err); @@ -3170,6 +3170,24 @@ pgf_graphviz_lr_automaton(PgfDB *db, PgfConcrRevision revision, printer.efun(&shift->lincat->name); printer.nprintf(16, ".%zu\"];\n", shift->r); } + + for (size_t j = 0; j < state->tokens->len; j++) { + ref shift = vector_elem(state->tokens, j); + printer.nprintf(16, " s%zu -> s%zu [label=\"", i, shift->next_state); + size_t sym_idx = shift->sym_idx; + while (sym_idx < shift->seq->syms.len) { + if 
(ref::get_tag(shift->seq->syms.data[sym_idx]) != PgfSymbolKS::tag) + break; + if (sym_idx > shift->sym_idx) + printer.puts(" "); + auto symks = ref::untagged(shift->seq->syms.data[sym_idx]); + printer.puts("\\\""); + printer.put_esc_str(&symks->token); + printer.puts("\\\""); + sym_idx++; + } + printer.puts("\"];\n"); + } } printer.puts("}"); diff --git a/src/runtime/c/pgf/phrasetable.cxx b/src/runtime/c/pgf/phrasetable.cxx index ade41844a..1c2506ddc 100644 --- a/src/runtime/c/pgf/phrasetable.cxx +++ b/src/runtime/c/pgf/phrasetable.cxx @@ -228,28 +228,33 @@ int sequence_cmp(ref seq1, ref seq2) return 0; } -static +PGF_INTERNAL int text_sequence_cmp(PgfTextSpot *spot, const uint8_t *end, - ref seq, - bool case_sensitive, bool full_match) + ref seq, size_t *p_i, + bool case_sensitive, SeqMatch sm) { int res1 = 0; - size_t i = 0; const uint8_t *s2 = NULL; const uint8_t *e2 = NULL; + uint8_t t = 0xff; + if (*p_i < seq->syms.len) { + t = ref::get_tag(seq->syms.data[*p_i]); + } + size_t count = 0; for (;;) { if (spot->ptr >= end) { - if (s2 < e2 || i < seq->syms.len) + if (s2 < e2 || t == PgfSymbolKS::tag) return -1; return case_sensitive ? res1 : 0; } - if (s2 >= e2 && i >= seq->syms.len) - return full_match ? 1 : 0; + if (s2 >= e2 && t != PgfSymbolKS::tag) { + return (sm == SM_FULL_MATCH) ? 
1 : 0; + } uint32_t ucs1 = pgf_utf8_decode(&spot->ptr); spot->pos++; uint32_t ucs1i = pgf_utf8_to_upper(ucs1); @@ -268,16 +273,21 @@ int text_sequence_cmp(PgfTextSpot *spot, const uint8_t *end, } } - uint8_t t = ref::get_tag(seq->syms.data[i]); if (t != PgfSymbolKS::tag) { + if (sm == SM_PARTIAL) + return 0; return ((int) PgfSymbolKS::tag) - ((int) t); } - auto sym_ks = ref::untagged(seq->syms.data[i]); + auto sym_ks = ref::untagged(seq->syms.data[*p_i]); s2 = (uint8_t *) &sym_ks->token.text; e2 = s2+sym_ks->token.size; - i++; + (*p_i)++; + t = 0xff; + if (*p_i < seq->syms.len) { + t = ref::get_tag(seq->syms.data[*p_i]); + } } uint32_t ucs2 = pgf_utf8_decode(&s2); @@ -552,7 +562,8 @@ void phrasetable_lookup(PgfPhrasetable table, current.pos = 0; current.ptr = (uint8_t *) sentence->text; const uint8_t *end = current.ptr+sentence->size; - int cmp = text_sequence_cmp(&current,end,table->value.seq,case_sensitive,true); + size_t sym_idx = 0; + int cmp = text_sequence_cmp(&current,end,table->value.seq,&sym_idx,case_sensitive,SM_FULL_MATCH); if (cmp < 0) { phrasetable_lookup(table->left,sentence,case_sensitive,scanner,err); } else if (cmp > 0) { @@ -662,7 +673,8 @@ void phrasetable_lookup_prefixes(PgfCohortsState *state, return; PgfTextSpot current = state->spot; - int cmp = text_sequence_cmp(&current,state->end,table->value.seq,state->case_sensitive,false); + size_t sym_idx = 0; + int cmp = text_sequence_cmp(&current,state->end,table->value.seq,&sym_idx,state->case_sensitive,SM_PREFIX); if (cmp < 0) { phrasetable_lookup_prefixes(state,table->left,min,max); } else if (cmp > 0) { diff --git a/src/runtime/c/pgf/phrasetable.h b/src/runtime/c/pgf/phrasetable.h index 6d4236df9..b5553d2b4 100644 --- a/src/runtime/c/pgf/phrasetable.h +++ b/src/runtime/c/pgf/phrasetable.h @@ -115,4 +115,13 @@ void phrasetable_iter(PgfConcr *concr, PGF_INTERNAL_DECL void phrasetable_release(PgfPhrasetable table); +// The following are used internally in the parser + +enum SeqMatch { SM_FULL_MATCH, SM_PREFIX, SM_PARTIAL
}; + +PGF_INTERNAL_DECL +int text_sequence_cmp(PgfTextSpot *spot, const uint8_t *end, + ref seq, size_t *p_i, + bool case_sensitive, SeqMatch sm); + #endif diff --git a/src/runtime/c/pgf/printer.cxx b/src/runtime/c/pgf/printer.cxx index c9675a4f9..e185dc335 100644 --- a/src/runtime/c/pgf/printer.cxx +++ b/src/runtime/c/pgf/printer.cxx @@ -45,6 +45,47 @@ void PgfPrinter::puts(const char *s) } } +void PgfPrinter::put_esc_str(PgfText *v) +{ + PgfText *charbuf = (PgfText *) alloca(sizeof(PgfText)+7); + + const uint8_t* start = (uint8_t*) v->text; + const uint8_t* end = start + v->size; + while (start < end) { + const uint8_t* s = start; + uint32_t c = pgf_utf8_decode(&s); + switch (c) { + case '\\': + puts("\\\\"); + break; + case '"': + puts("\\\""); + break; + case '\n': + puts("\\n"); + break; + case '\r': + puts("\\r"); + break; + case '\b': + puts("\\b"); + break; + case '\t': + puts("\\t"); + break; + case '\0': + puts("\\0"); + break; + default: + charbuf->size = s-start; + memcpy(charbuf->text, start, charbuf->size); + charbuf->text[charbuf->size] = 0; + puts(charbuf); + } + start = s; + } +} + void PgfPrinter::nprintf(size_t buf_size, const char *format, ...) 
{ again: { @@ -348,44 +389,8 @@ PgfLiteral PgfPrinter::lflt(double v) PgfLiteral PgfPrinter::lstr(PgfText *v) { - PgfText *charbuf = (PgfText *) alloca(sizeof(PgfText)+7); - puts("\""); - const uint8_t* start = (uint8_t*) v->text; - const uint8_t* end = start + v->size; - while (start < end) { - const uint8_t* s = start; - uint32_t c = pgf_utf8_decode(&s); - switch (c) { - case '\\': - puts("\\\\"); - break; - case '"': - puts("\\\""); - break; - case '\n': - puts("\\n"); - break; - case '\r': - puts("\\r"); - break; - case '\b': - puts("\\b"); - break; - case '\t': - puts("\\t"); - break; - case '\0': - puts("\\0"); - break; - default: - charbuf->size = s-start; - memcpy(charbuf->text, start, charbuf->size); - charbuf->text[charbuf->size] = 0; - puts(charbuf); - } - start = s; - } + put_esc_str(v); puts("\""); return 0; } diff --git a/src/runtime/c/pgf/printer.h b/src/runtime/c/pgf/printer.h index db069a667..9d13f202b 100644 --- a/src/runtime/c/pgf/printer.h +++ b/src/runtime/c/pgf/printer.h @@ -46,6 +46,8 @@ public: void puts(PgfText *s); void puts(const char *s); + void put_esc_str(PgfText *v); + // buf_size is the expected buffer size. If larger is needed, // it will be allocated automatically. #if defined(_MSC_VER)