diff --git a/src/runtime/c/Makefile.am b/src/runtime/c/Makefile.am index af20f62a1..3fd9b2383 100644 --- a/src/runtime/c/Makefile.am +++ b/src/runtime/c/Makefile.am @@ -42,7 +42,9 @@ libpgf_la_SOURCES = \ pgf/probspace.cxx \ pgf/probspace.h \ pgf/generator.cxx \ - pgf/generator.h + pgf/generator.h \ + pgf/md5.cxx \ + pgf/md5.h libpgf_la_LDFLAGS = -no-undefined -version-info 4:0:0 libpgf_la_CXXFLAGS = -fno-rtti -std=c++11 -DCOMPILING_PGF diff --git a/src/runtime/c/pgf/data.cxx b/src/runtime/c/pgf/data.cxx index e0d8b7cce..d188f3133 100644 --- a/src/runtime/c/pgf/data.cxx +++ b/src/runtime/c/pgf/data.cxx @@ -48,9 +48,9 @@ void PgfConcr::release(ref concr) void PgfConcrLincat::release(ref lincat) { for (size_t i = 0; i < lincat->fields->len; i++) { - PgfLincatField::release(vector_elem(lincat->fields, i)); + text_db_release(*vector_elem(lincat->fields, i)); } - Vector::release(lincat->fields); + Vector>::release(lincat->fields); for (size_t i = 0; i < lincat->args->len; i++) { PgfLParam::release(vector_elem(lincat->args, i)->param); @@ -67,13 +67,6 @@ void PgfConcrLincat::release(ref lincat) PgfDB::free(lincat, lincat->name.size+1); } -void PgfLincatField::release(ref field) -{ - text_db_release(field->name); - if (field->backrefs != 0) - Vector::release(field->backrefs); -} - void PgfLParam::release(ref param) { PgfDB::free(param, param->n_terms*sizeof(param->terms[0])); diff --git a/src/runtime/c/pgf/data.h b/src/runtime/c/pgf/data.h index a9818df0e..2542bb9f4 100644 --- a/src/runtime/c/pgf/data.h +++ b/src/runtime/c/pgf/data.h @@ -224,19 +224,6 @@ struct PGF_INTERNAL_DECL PgfSymbolALLCAPIT { static const uint8_t tag = 10; }; -struct PGF_INTERNAL_DECL PgfConcrLincat; -struct PGF_INTERNAL_DECL PgfLincatBackref; -struct PGF_INTERNAL_DECL PgfLincatEpsilon; - -struct PGF_INTERNAL_DECL PgfLincatField { - ref lincat; - ref name; - ref> backrefs; - ref> epsilons; - - static void release(ref field); -}; - struct PGF_INTERNAL_DECL PgfConcrLincat { static const uint8_t tag = 
0; @@ -246,7 +233,7 @@ struct PGF_INTERNAL_DECL PgfConcrLincat { ref> args; ref>> res; ref>> seqs; - ref> fields; + ref>> fields; PgfText name; @@ -268,18 +255,6 @@ struct PGF_INTERNAL_DECL PgfConcrLin { static void release(ref lin); }; -struct PGF_INTERNAL_DECL PgfLinSeqIndex { - ref lin; - size_t seq_index; -}; - -struct PGF_INTERNAL_DECL PgfLincatBackref : public PgfLinSeqIndex { - size_t dot; -}; - -struct PGF_INTERNAL_DECL PgfLincatEpsilon : public PgfLinSeqIndex { -}; - struct PGF_INTERNAL_DECL PgfConcrPrintname { ref printname; PgfText name; @@ -287,6 +262,25 @@ struct PGF_INTERNAL_DECL PgfConcrPrintname { static void release(ref printname); }; +struct PGF_INTERNAL_DECL PgfLRShift { + size_t next_state; + ref lincat; + size_t r; + bool is_epsilon; +}; + +struct PGF_INTERNAL_DECL PgfLRReduce { + object lin_obj; + size_t seq_index; +}; + +struct PGF_INTERNAL_DECL PgfLRState { + ref> shifts; + ref> reductions; +}; + +typedef Vector PgfLRTable; + struct PGF_INTERNAL_DECL PgfConcr { static const uint8_t tag = 1; @@ -296,6 +290,8 @@ struct PGF_INTERNAL_DECL PgfConcr { PgfPhrasetable phrasetable; Namespace printnames; + ref lrtable; + PgfText name; static void release(ref pgf); diff --git a/src/runtime/c/pgf/linearizer.cxx b/src/runtime/c/pgf/linearizer.cxx index fdf434f70..3df419763 100644 --- a/src/runtime/c/pgf/linearizer.cxx +++ b/src/runtime/c/pgf/linearizer.cxx @@ -287,7 +287,7 @@ void PgfLinearizer::TreeLinNode::check_category(PgfLinearizer *linearizer, PgfTe void PgfLinearizer::TreeLinNode::linearize(PgfLinearizationOutputIface *out, PgfLinearizer *linearizer, size_t lindex) { PgfText *cat = &lin->absfun->type->name; - PgfText *field = &*(vector_elem(lin->lincat->fields, lindex)->name); + PgfText *field = &**vector_elem(lin->lincat->fields, lindex); if (linearizer->pre_stack == NULL) out->begin_phrase(cat, fid, field, &lin->name); @@ -390,7 +390,7 @@ void PgfLinearizer::TreeLindefNode::linearize_arg(PgfLinearizationOutputIface *o void 
PgfLinearizer::TreeLindefNode::linearize(PgfLinearizationOutputIface *out, PgfLinearizer *linearizer, size_t lindex) { if (lincat != 0) { - PgfText *field = &*(vector_elem(lincat->fields, lindex)->name); + PgfText *field = &**vector_elem(lincat->fields, lindex); if (linearizer->pre_stack == NULL) out->begin_phrase(&lincat->name, fid, field, fun); else { @@ -543,7 +543,7 @@ void PgfLinearizer::TreeLitNode::linearize(PgfLinearizationOutputIface *out, Pgf { PgfText *field = NULL; if (lincat != 0) { - field = &*(vector_elem(lincat->fields, lindex)->name); + field = &**vector_elem(lincat->fields, lindex); } linearizer->flush_pre_stack(out, literal); diff --git a/src/runtime/c/pgf/md5.cxx b/src/runtime/c/pgf/md5.cxx new file mode 100644 index 000000000..6a8b4c8cb --- /dev/null +++ b/src/runtime/c/pgf/md5.cxx @@ -0,0 +1,197 @@ +/* + * Derived from the RSA Data Security, Inc. MD5 Message-Digest Algorithm + * and modified slightly to be functionally identical but condensed into control structures. 
+ */ + +#include "data.h" +#include "md5.h" + +/* + * Constants defined by the MD5 algorithm + */ +#define A 0x67452301 +#define B 0xefcdab89 +#define C 0x98badcfe +#define D 0x10325476 + +static uint32_t S[] = {7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, + 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, + 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, + 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21}; + +static uint32_t K[] = {0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, + 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501, + 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, + 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, + 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa, + 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8, + 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, + 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a, + 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, + 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, + 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05, + 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665, + 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, + 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1, + 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, + 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391}; + +/* + * Padding used to make the size (in bits) of the input congruent to 448 mod 512 + */ +static uint8_t PADDING[] = {0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; + +/* + * Bit-manipulation functions defined by the MD5 algorithm + */ +#define F(X, Y, Z) ((X & Y) | (~X & Z)) +#define G(X, Y, Z) ((X & Z) | (Y & ~Z)) +#define H(X, Y, Z) (X ^ Y ^ Z) +#define I(X, 
Y, Z) (Y ^ (X | ~Z)) + +/* + * Rotates a 32-bit word left by n bits + */ +uint32_t rotateLeft(uint32_t x, uint32_t n){ + return (x << n) | (x >> (32 - n)); +} + +/* + * Initialize a context + */ +MD5Context::MD5Context() +{ + size = (uint64_t)0; + + buffer[0] = (uint32_t)A; + buffer[1] = (uint32_t)B; + buffer[2] = (uint32_t)C; + buffer[3] = (uint32_t)D; +} + +/* + * Step on 512 bits of input with the main MD5 algorithm. + */ +static +void md5Step(uint32_t *buffer, uint32_t *input){ + uint32_t AA = buffer[0]; + uint32_t BB = buffer[1]; + uint32_t CC = buffer[2]; + uint32_t DD = buffer[3]; + + uint32_t E; + + unsigned int j; + + for(unsigned int i = 0; i < 64; ++i){ + switch(i / 16){ + case 0: + E = F(BB, CC, DD); + j = i; + break; + case 1: + E = G(BB, CC, DD); + j = ((i * 5) + 1) % 16; + break; + case 2: + E = H(BB, CC, DD); + j = ((i * 3) + 5) % 16; + break; + default: + E = I(BB, CC, DD); + j = (i * 7) % 16; + break; + } + + uint32_t temp = DD; + DD = CC; + CC = BB; + BB = BB + rotateLeft(AA + E + K[i] + input[j], S[i]); + AA = temp; + } + + buffer[0] += AA; + buffer[1] += BB; + buffer[2] += CC; + buffer[3] += DD; +} + +/* + * Add some amount of input to the context + * + * If the input fills out a block of 512 bits, apply the algorithm (md5Step) + * and save the result in the buffer. Also updates the overall size. + */ +void MD5Context::update(uint8_t *input_buffer, size_t input_len) +{ + uint32_t input[16]; + unsigned int offset = this->size % 64; + this->size += (uint64_t)input_len; + + // Copy each byte in input_buffer into the next space in our context input + for (unsigned int i = 0; i < input_len; ++i) { + this->input[offset++] = (uint8_t)*(input_buffer + i); + + // If we've filled our context input, copy it into our local array input + // then reset the offset to 0 and fill in a new buffer. 
+ // Every time we fill out a chunk, we run it through the algorithm + // to enable some back and forth between cpu and i/o + if (offset % 64 == 0){ + for (unsigned int j = 0; j < 16; ++j) { + // Convert to little-endian + // The local variable `input` holds our 512-bit chunk separated into 32-bit words + // we can use in calculations + input[j] = (uint32_t)(this->input[(j * 4) + 3]) << 24 | + (uint32_t)(this->input[(j * 4) + 2]) << 16 | + (uint32_t)(this->input[(j * 4) + 1]) << 8 | + (uint32_t)(this->input[(j * 4)]); + } + md5Step(this->buffer, input); + offset = 0; + } + } +} + +/* + * Pad the current input so that its size in bits is congruent to 448 mod 512, append the size in bits to the very end, + * and save the result of the final iteration into digest. + */ +void MD5Context::finalize(MD5Digest *digest) +{ + uint32_t input[16]; + unsigned int offset = this->size % 64; + unsigned int padding_length = offset < 56 ? 56 - offset : (56 + 64) - offset; + + // Fill in the padding and undo the changes to size that resulted from the update + update(PADDING, padding_length); + this->size -= (uint64_t)padding_length; + + // Do a final update (internal to this function) + // Last two 32-bit words are the two halves of the size (converted from bytes to bits) + for(unsigned int j = 0; j < 14; ++j) + { + input[j] = (uint32_t)(this->input[(j * 4) + 3]) << 24 | + (uint32_t)(this->input[(j * 4) + 2]) << 16 | + (uint32_t)(this->input[(j * 4) + 1]) << 8 | + (uint32_t)(this->input[(j * 4)]); + } + input[14] = (uint32_t)(this->size * 8); + input[15] = (uint32_t)((this->size * 8) >> 32); + + md5Step(this->buffer, input); + + // Move the result into digest (convert from little-endian) + for(unsigned int i = 0; i < 4; ++i){ + digest->b[(i * 4) + 0] = (uint8_t)((this->buffer[i] & 0x000000FF)); + digest->b[(i * 4) + 1] = (uint8_t)((this->buffer[i] & 0x0000FF00) >> 8); + digest->b[(i * 4) + 2] = (uint8_t)((this->buffer[i] & 0x00FF0000) >> 16); + digest->b[(i * 4) + 3] = (uint8_t)((this->buffer[i] & 0xFF000000) >> 24); + }
+} diff --git a/src/runtime/c/pgf/md5.h b/src/runtime/c/pgf/md5.h new file mode 100644 index 000000000..1e33226ec --- /dev/null +++ b/src/runtime/c/pgf/md5.h @@ -0,0 +1,30 @@ +#ifndef MD5_H +#define MD5_H + +struct PGF_INTERNAL_DECL MD5Digest { + uint8_t b[16]; +}; + +inline bool operator < (const MD5Digest &d1, const MD5Digest &d2) { + return memcmp(d1.b, d2.b, 16) < 0; +} + +class PGF_INTERNAL_DECL MD5Context { + uint64_t size; // Size of input in bytes + uint32_t buffer[4]; // Current accumulation of hash + uint8_t input[64]; // Input to be used in the next step + +public: + MD5Context(); + void update(uint8_t *input, size_t input_len); + + template + void update(T &input) + { + update((uint8_t *) &input, sizeof(T)); + } + + void finalize(MD5Digest *digest); +}; + +#endif diff --git a/src/runtime/c/pgf/parser.cxx b/src/runtime/c/pgf/parser.cxx index 2f5f3236c..2bdb277a3 100644 --- a/src/runtime/c/pgf/parser.cxx +++ b/src/runtime/c/pgf/parser.cxx @@ -1,803 +1,980 @@ #include "data.h" #include "printer.h" #include "parser.h" -#include "math.h" -#include -#include -#include -#include -#define PARSER_DEBUG +//#define DEBUG_STATE_CREATION +//#define DEBUG_AUTOMATON +//#define DEBUG_PARSER +//#define DEBUG_GENERATOR -class PGF_INTERNAL_DECL PgfParser::CFGCat { -public: - ref field; - size_t value; - - // copy assignment - bool operator<(const CFGCat& other) const - { - if (field < other.field) - return true; - else if (field == other.field) - return (value < other.value); - else - return false; - } -}; - -struct PGF_INTERNAL_DECL PgfParser::Choice -{ - ParseItemConts* conts; +struct PgfLRTableMaker::State { size_t id; - prob_t viterbi_prob; - bool is_chunk; - std::vector prods; - std::vector items; - std::vector> exprs; + Predictions *preds; + std::vector seed; + std::vector shifts; + std::vector reductions; - Choice(ParseItemConts* conts, size_t id, prob_t prob) - { - this->conts = conts; - this->id = id; - this->viterbi_prob = prob; - this->is_chunk = true; + 
State(Predictions *preds) { + this->id = 0; + this->preds = preds; } - - void trace(State *state); }; -class PGF_INTERNAL_DECL PgfParser::Production -{ -public: - void trace(Choice *res); - - ref lin; +struct PgfLRTableMaker::Item { + Item *next; + object lin_obj; size_t seq_index; - Choice *args[]; + ref seq; + size_t dot; }; -struct PGF_INTERNAL_DECL PgfParser::ParseItemConts { - State *state; - ref field; - size_t value; - std::vector items; +struct PgfLRTableMaker::Predictions { + ref lincat; + size_t r; + bool is_epsilon; + Item *items; }; -class PGF_INTERNAL_DECL PgfParser::State +struct PgfLRTableMaker::CompareItem : std::less { + bool operator() (const Item *item1, const Item *item2) const { + if (item1->lin_obj < item2->lin_obj) + return true; + else if (item1->lin_obj > item2->lin_obj) + return false; + + if (item1->seq_index < item2->seq_index) + return true; + else if (item1->seq_index > item2->seq_index) + return false; + + return (item1->dot < item2->dot); + } +}; + +PgfLRTableMaker::PgfLRTableMaker(ref abstr, ref concr) { -public: - ParseItemConts *get_conts(ref field, size_t value) - { - ParseItemConts *conts; - CFGCat cfg_cat = {field, value}; - auto itr1 = contss.find(cfg_cat); - if (itr1 == contss.end()) { - conts = new ParseItemConts(); - conts->state = this; - conts->field = field; - conts->value = value; - contss.insert(std::pair(cfg_cat, conts)); - } else { - conts = itr1->second; - } - return conts; - } + this->abstr = abstr; + this->concr = concr; -public: - PgfTextSpot start, end; - State *prev, *next; + PgfText *startcat = (PgfText *) + alloca(sizeof(PgfText)+9); + startcat->size = 8; + strcpy(startcat->text, "startcat"); - prob_t viterbi_prob; + ref flag = + namespace_lookup(abstr->aflags, startcat); - class ResultComparator : std::less { - public: - bool operator()(Item* &lhs, Item* &rhs) const - { - return lhs->get_prob() > rhs->get_prob(); - } - }; + ref lincat = 0; + if (flag != 0) { + switch (ref::get_tag(flag->value)) { + case 
PgfLiteralStr::tag: { + auto lstr = ref::untagged(flag->value); - std::map contss; - std::map choices; - std::priority_queue,ResultComparator> queue; -}; + State *state = new State(NULL); -class PGF_INTERNAL_DECL PgfParser::ParseItem : public Item -{ -public: - void* operator new(size_t size, PgfLinSeqIndex *r) - { - size_t n_args = r->lin->absfun->type->hypos->len; - size_t ex_size = sizeof(Choice*)*n_args; - ParseItem *item = (ParseItem *) malloc(size+ex_size); - memset(item->args, 0, ex_size); - return item; - } + lincat = + namespace_lookup(concr->lincats, &lstr->val); - void* operator new(size_t size, ParseItem *item) - { - size_t n_args = item->lin->absfun->type->hypos->len; - size_t ex_size = sizeof(Choice*)*n_args; - ParseItem *new_item = (ParseItem *) malloc(size+ex_size); - memcpy(new_item, item, size+ex_size); - return new_item; - } + MD5Context ctxt; - ParseItem(ParseItemConts *conts, size_t values, ref lin, size_t seq_index) - { - this->outside_prob = lin->lincat->abscat->prob; - this->inside_prob = lin->absfun->prob; - this->conts = conts; - this->lin = lin; - this->seq_index = seq_index; - this->dot = lin->seqs->data[seq_index]->syms.len; - this->values = values; - } + for (size_t i = lincat->n_lindefs; i < lincat->res->len; i++) { + ref res = *vector_elem(lincat->res, i); + size_t seq_index = + lincat->n_lindefs*lincat->fields->len + i-lincat->n_lindefs; + ref seq = *vector_elem(lincat->seqs, seq_index); + Item *item = new Item; + item->next = NULL; + item->lin_obj = lincat.tagged(); + item->seq_index = seq_index; + item->seq = seq; + item->dot = 0; - ParseItem(ParseItemConts *conts, PgfLincatBackref *backref, - size_t d, Choice *choice) - { - this->outside_prob = backref->lin->lincat->abscat->prob; - this->inside_prob = backref->lin->absfun->prob + choice->viterbi_prob; - this->conts = conts; - this->lin = backref->lin; - this->seq_index = backref->seq_index; - this->dot = backref->dot+1; - this->args[d] = choice; - } + ctxt.update(item->lin_obj); 
+ ctxt.update(item->seq_index); + ctxt.update(item->dot); - ParseItem(ParseItemConts *conts, PgfLincatEpsilon *epsilon, prob_t outside_prob) - { - this->outside_prob = outside_prob; - this->inside_prob = epsilon->lin->absfun->prob; - this->conts = conts; - this->lin = epsilon->lin; - this->seq_index = epsilon->seq_index; - this->dot = 0; - } - - ParseItem(size_t d, Choice *choice) - { - this->inside_prob += choice->viterbi_prob; - this->dot += 1; - this->args[d] = choice; - } - - static void bu_predict(ref field, State *state, Choice *choice) - { - if (field->backrefs == 0) - return; - - for (size_t i = 0; i < field->backrefs->len; i++) { - ref backref = vector_elem(field->backrefs, i); - - ref seq = - *vector_elem(backref->lin->seqs, backref->seq_index); - PgfSymbol sym = seq->syms.data[backref->dot]; - ref symcat = ref::untagged(sym); - - size_t index = backref->seq_index % backref->lin->lincat->fields->len; - ref up_field = vector_elem(backref->lin->lincat->fields, index); - ParseItemConts *conts = choice->conts->state->get_conts(up_field, 0); - - state->queue.push(new(&*backref) ParseItem(conts, backref, - symcat->d, choice)); - } - } - - static void eps_predict(ref field, State *state, ParseItemConts *conts, prob_t outside_prob) - { - if (field->epsilons == 0) - return; - - for (size_t i = 0; i < field->epsilons->len; i++) { - ref epsilon = vector_elem(field->epsilons, i); - state->queue.push(new(&*epsilon) ParseItem(conts, epsilon, outside_prob)); - } - } - - void combine(State *state, Choice *choice) - { - ref seq = lin->seqs->data[seq_index]; - PgfSymbol sym = *vector_elem(&seq->syms,dot); - auto sym_cat = ref::untagged(sym); - state->queue.push(new(this) ParseItem(sym_cat->d, choice)); - } - - void complete(PgfParser *parser, ref seq) - { - // the last child as a non-chunk - size_t dot = seq->syms.len; - while (dot > 0) { - dot--; - PgfSymbol sym = *vector_elem(&seq->syms,dot); - if (ref::get_tag(sym) == PgfSymbolCat::tag) { - auto sym_cat = 
ref::untagged(sym); - Choice *last = args[sym_cat->d]; - if (last != NULL) { - if (last->conts == conts) - continue; -#ifdef PARSER_DEBUG - if (last->is_chunk) { - fprintf(stderr, "not-chunk(?%ld)\n", last->id); - } -#endif - last->is_chunk = false; - } + state->seed.push_back(item); } + + MD5Digest digest; + ctxt.finalize(&digest); + + states[digest] = state; + todo.push_back(state); + } + } + } +} + +#if defined(DEBUG_STATE_CREATION) || defined(DEBUG_AUTOMATON) +void PgfLRTableMaker::print_item(Item *item) +{ + PgfPrinter printer(NULL, 0, NULL); + + switch (ref::get_tag(item->lin_obj)) { + case PgfConcrLin::tag: { + auto lin = + ref::untagged(item->lin_obj); + ref type = lin->absfun->type; + printer.puts(&type->name); + printer.puts(" -> "); + printer.puts(&lin->name); + printer.puts("["); + PgfDBMarshaller m; + for (size_t i = 0; i < type->hypos->len; i++) { + if (i > 0) + printer.puts(","); + m.match_type(&printer, vector_elem(type->hypos, i)->type.as_object()); + } + printer.nprintf(32, "]; %ld : ", item->seq_index / lin->lincat->fields->len); + break; + } + case PgfConcrLincat::tag: { + auto lincat = + ref::untagged(item->lin_obj); + printer.puts("linref "); + printer.puts(&lincat->name); + printer.puts("["); + printer.puts(&lincat->name); + printer.nprintf(32, "]; %ld : ", item->seq_index - lincat->fields->len*lincat->n_lindefs); + break; + } + } + + if (item->dot == 0) + printer.puts(". "); + + for (size_t i = 0; i < item->seq->syms.len; i++) { + PgfSymbol sym = item->seq->syms.data[i]; + printer.symbol(sym); + + if (i+1 == item->dot) + printer.puts(" . 
"); + } + printer.puts("\n"); + + PgfText *text = printer.get_text(); + fputs(text->text, stderr); + free(text); +} +#endif + +void PgfLRTableMaker::process(Item *item) +{ +#ifdef DEBUG_STATE_CREATION + print_item(item); +#endif + + if (item->dot < item->seq->syms.len) { + PgfSymbol sym = item->seq->syms.data[item->dot]; + symbol(item, sym); + } else { + complete(item); + } +} + +void PgfLRTableMaker::symbol(Item *item, PgfSymbol sym) +{ + switch (ref::get_tag(sym)) { + case PgfSymbolCat::tag: { + auto symcat = ref::untagged(sym); + + switch (ref::get_tag(item->lin_obj)) { + case PgfConcrLin::tag: { + auto lin = + ref::untagged(item->lin_obj); + ref res = + *vector_elem(lin->res, item->seq_index / lin->lincat->fields->len); + ref hypo = vector_elem(lin->absfun->type->hypos, symcat->d); + predict(item, ref::from_ptr(&hypo->type->name), res->vars, &symcat->r); break; } - - // Create a new choice - Choice *choice; - auto itr2 = parser->after->choices.find(conts); - if (itr2 == parser->after->choices.end()) { - choice = new Choice(conts, ++parser->last_choice_id, inside_prob); - choice->trace(parser->after); - parser->after->choices.insert(std::pair(conts, choice)); - } else { - choice = itr2->second; + case PgfConcrLincat::tag: { + auto lincat = + ref::untagged(item->lin_obj); + ref res = + *vector_elem(lincat->res, + item->seq_index - lincat->n_lindefs*lincat->fields->len); + predict(item, ref::from_ptr(&lincat->name), res->vars, &symcat->r); + break; } - - // Create a new production - size_t n_args = lin->absfun->type->hypos->len; - - Production *prod = (Production*) - malloc(sizeof(Production)+sizeof(Choice*)*n_args); - prod->lin = lin; - prod->seq_index = seq_index; - memcpy(prod->args, this+1, sizeof(Choice*)*n_args); - - prod->trace(choice); - choice->prods.push_back(prod); - - // If this the first time when we complete this category - if (itr2 == parser->after->choices.end()) { - // Combine with top-down predicted rules - for (ParseItem *item : conts->items) { 
- item->combine(parser->after,choice); - } - if (conts->state != parser->after) { - // Bottom up prediction if this is not an epsilon rule - bu_predict(conts->field,parser->after,choice); - } } + break; } - - void symbol(PgfParser *parser, PgfSymbol sym) { - switch (ref::get_tag(sym)) { - case PgfSymbolCat::tag: { - auto sym_cat = ref::untagged(sym); - Choice *arg = args[sym_cat->d]; - if (arg == NULL) { - ref ty = - vector_elem(lin->absfun->type->hypos, sym_cat->d)->type; - ref lincat = - namespace_lookup(parser->concr->lincats, &ty->name); - if (lincat != 0) { - ref field = vector_elem(lincat->fields, sym_cat->r.i0); - ParseItemConts *conts = parser->after->get_conts(field, 0); - conts->items.push_back(this); - - if (conts->items.size() == 1) { - eps_predict(field, parser->after, conts, inside_prob+outside_prob); - } - } - } - } - default:; - // Nothing - } } +} - virtual State *proceed(PgfParser *parser, PgfUnmarshaller *u) - { - ref seq = lin->seqs->data[seq_index]; - - if (dot >= seq->syms.len) { - complete(parser, seq); - } else { - PgfSymbol sym = *vector_elem(&seq->syms,dot); - symbol(parser, sym); - } - - return NULL; - } - - virtual bool combine(PgfParser *parser, ParseItemConts *conts, PgfExpr expr, prob_t prob, PgfUnmarshaller *u) - { - return false; - } - - virtual void print1(PgfPrinter *printer, State *state, PgfMarshaller *m) - { -#ifdef PARSER_DEBUG - printer->nprintf(32,"%ld-%ld; ", conts->state->end.pos, state->start.pos); - - size_t index = seq_index / lin->lincat->fields->len; - ref res = *vector_elem(lin->res, index); - ref ty = lin->absfun->type; - - if (res->vars != 0) { - printer->puts("{"); - size_t values = this->values; - for (size_t i = 0; i < res->vars->len; i++) { - if (i > 0) - printer->puts(", "); - - printer->lvar(res->vars->data[i].var); - size_t val = values / res->vars->data[i].range; - printer->nprintf(32,"=%ld",val); - - values = values % res->vars->data[i].range; - } - printer->puts("} . 
"); - } - - printer->efun(&ty->name); - printer->puts("("); - printer->lparam(ref::from_ptr(&res->param)); - printer->puts(") -> "); - - printer->efun(&lin->name); - printer->puts("["); - size_t n_args = lin->args->len / lin->res->len; - for (size_t i = 0; i < n_args; i++) { - if (i > 0) - printer->puts(","); - - if (args[i] == NULL) - printer->parg(vector_elem(ty->hypos, i)->type, - vector_elem(lin->args, index*n_args + i)); - else - printer->nprintf(10, "?%ld", args[i]->id); - } - - printer->nprintf(10, "]; %ld : ", seq_index % lin->lincat->fields->len); - ref seq = *vector_elem(lin->seqs, seq_index); - for (size_t i = 0; i < seq->syms.len; i++) { - if (i > 0) - printer->puts(" "); - if (dot == i) - printer->puts(". "); - printer->symbol(*vector_elem(&seq->syms, i)); - } - - if (dot == seq->syms.len) - printer->puts(" . "); -#endif - } - - virtual void print2(PgfPrinter *printer, State *state, int x, PgfMarshaller *m) - { - } - - virtual PgfExpr get_expr(PgfUnmarshaller *u) - { - return 0; - } - -private: - ParseItemConts *conts; - ref lin; - size_t seq_index; - size_t dot; - size_t values; - Choice *args[]; +struct PGF_INTERNAL_DECL PgfVariableValue { + size_t range; + size_t value; }; -class PgfParser::ExprItem : public Item +void PgfLRTableMaker::predict(Item *item, ref cat, + ref> vars, PgfLParam *r) { -public: - ExprItem(Choice *parent, Production *prod, prob_t outside_prob, PgfUnmarshaller *u) + PgfVariableValue *values = (PgfVariableValue *) + alloca(sizeof(PgfVariableValue)*r->n_terms); + for (size_t i = 0; i < r->n_terms; i++) { - this->parent = parent; - this->outside_prob = outside_prob; - this->inside_prob = prod->lin->absfun->prob; - this->prod = prod; - this->arg_index = 0; - this->expr = u->efun(&prod->lin->name); + size_t var = r->terms[i].var; + for (size_t j = 0; j < vars->len; j++) + { + ref range = vector_elem(vars, j); + if (range->var == var) { + values[i].range = range->range; + values[i].value = 0; + break; + } + } + } - size_t n_args = 
prod->lin->absfun->type->hypos->len; + size_t index = r->i0; + for (;;) { + predict(item, cat, index); + + size_t i = r->n_terms; + while (i > 0) { + i--; + values[i].value++; + if (values[i].value < values[i].range) { + index += r->terms[i].factor; + i++; + break; + } + + index -= (values[i].value-1) * r->terms[i].factor; + values[i].value = 0; + } + + if (i == 0) { + break; + } + } +} + +void PgfLRTableMaker::predict(Item *item, ref cat, size_t r) +{ + Predictions *&preds = predictions[Key(cat,r)]; + Predictions *tmp_preds = preds; + if (preds == NULL) { + preds = new Predictions(); + preds->lincat = 0; + preds->r = r; + preds->is_epsilon = false; + preds->items = NULL; + } + + State *&next_state = continuations[preds]; + State *tmp = next_state; + if (next_state == NULL) { + next_state = new State(preds); + } + Item *next_item = new Item(); + next_item->next = NULL; + next_item->lin_obj = item->lin_obj; + next_item->seq_index = item->seq_index; + next_item->seq = item->seq; + next_item->dot = item->dot+1; + next_state->seed.push_back(next_item); + push_heap(next_state->seed.begin(), next_state->seed.end(), compare_item); + + if (tmp == NULL) { + if (tmp_preds == NULL) { + std::function)> f = + [this,preds](ref fun) { + predict(fun, preds); + return true; + }; + probspace_iter(abstr->funs_by_cat, cat, f, false); + } else { + Item *new_item = preds->items; + while (new_item != NULL) { + process(new_item); + new_item = new_item->next; + } + } + } +} + +void PgfLRTableMaker::predict(ref absfun, Predictions *preds) +{ + ref lin = + namespace_lookup(concr->lins, &absfun->name); + + if (lin != 0) { + preds->lincat = lin->lincat; + + size_t n_args = absfun->type->hypos->len; + size_t n_fields = lin->lincat->fields->len; + for (size_t i = 0; i < lin->res->len; i++) { + size_t seq_index = n_fields * i + preds->r; + ref seq = *vector_elem(lin->seqs,seq_index); + + if (seq->syms.len == 0) { + preds->is_epsilon = true; + } else { + Item *item = new Item; + item->next = 
preds->items; + item->lin_obj = lin.tagged(); + item->seq_index = seq_index; + item->seq = seq; + item->dot = 0; + preds->items = item; + process(item); + } + } + } +} + +void PgfLRTableMaker::complete(Item *item) +{ + completed.push_back(item); +} + +ref PgfLRTableMaker::make() +{ + size_t state_id = 0; + while (!todo.empty()) { + State *state = todo.back(); todo.pop_back(); + +#if defined(DEBUG_AUTOMATON) + fprintf(stderr, "--------------- state %ld ---------------\n", state->id); +#endif + + while (!state->seed.empty()) { + Item *item = state->seed.back(); state->seed.pop_back(); + +#if defined(DEBUG_AUTOMATON) && !defined(DEBUG_STATE_CREATION) + // The order in which we process the items should not matter, + // For debugging however it is useful to see them in the same order. + pop_heap(state->seed.begin(),state->seed.end(),compare_item); + print_item(item); +#endif + + process(item); + } + state->seed.shrink_to_fit(); + + for (auto i : continuations) { + MD5Context ctxt; + auto begin = i.second->seed.begin(); + auto end = i.second->seed.end(); + while (begin != end) { + Item *item = *(--end); + ctxt.update(item->lin_obj); + ctxt.update(item->seq_index); + ctxt.update(item->dot); + + pop_heap(begin,end,compare_item); + } + + MD5Digest digest; + ctxt.finalize(&digest); + + State *&next_state = states[digest]; + if (next_state == NULL) { + next_state = i.second; + next_state->id = ++state_id; + todo.push_back(next_state); + } else { + delete i.second; + } + + PgfLRShift shift; + shift.lincat = next_state->preds->lincat; + shift.r = next_state->preds->r; + shift.next_state = next_state->id; + shift.is_epsilon = next_state->preds->is_epsilon; + state->shifts.push_back(shift); +#if defined(DEBUG_AUTOMATON) + fprintf(stderr, "%s.%zu: state %ld%s\n", + shift.lincat->name.text, shift.r, shift.next_state, + shift.is_epsilon ? 
" (epsilon)" : "" + ); +#endif + } + continuations.clear(); + + for (Item *item : completed) { + PgfLRReduce red; + red.lin_obj = item->lin_obj; + red.seq_index = item->seq_index; + state->reductions.push_back(red); +#if defined(DEBUG_AUTOMATON) + switch (ref::get_tag(red.lin_obj)) { + case PgfConcrLin::tag: { + auto lin = + ref::untagged(red.lin_obj); + fprintf(stderr, "reduce %s/%zu\n", lin->name.text, red.seq_index); + break; + } + case PgfConcrLincat::tag: { + auto lincat = + ref::untagged(red.lin_obj); + fprintf(stderr, "reduce linref %s/%zu\n", lincat->name.text, red.seq_index); + break; + } + } +#endif + } + completed.clear(); + } + + ref lrtable = vector_new(states.size()); + for (auto v : states) { + State *state = v.second; + ref lrstate = vector_elem(lrtable, state->id); + + lrstate->shifts = vector_new(state->shifts.size()); + for (size_t i = 0; i < state->shifts.size(); i++) { + *vector_elem(lrstate->shifts,i) = state->shifts[i]; + } + + lrstate->reductions = vector_new(state->reductions.size()); + for (size_t i = 0; i < state->reductions.size(); i++) { + *vector_elem(lrstate->reductions,i) = state->reductions[i]; + } + } + return lrtable; +} + +struct PgfParser::Choice { + int fid; + std::vector prods; + Result* res; + + Choice(int fid) { + this->fid = fid; + this->res = NULL; + } +}; + +struct PgfParser::Production { + ref lin; + size_t n_args; + Choice *args[]; + + void *operator new(size_t size, ref lin) { + size_t n_args = lin->args->len / lin->res->len; + Production *prod = (Production *) + malloc(size+sizeof(Choice*)*n_args); + prod->lin = lin; + prod->n_args = n_args; for (size_t i = 0; i < n_args; i++) { - if (prod->args[i] != NULL) - this->inside_prob += prod->args[i]->viterbi_prob; + prod->args[i] = NULL; } + return prod; } - ExprItem(ExprItem *prev, PgfExpr arg, prob_t prob, PgfUnmarshaller *u) - { - this->parent = prev->parent; - this->outside_prob = prev->outside_prob; - this->inside_prob = prev->inside_prob; - this->prod = prev->prod; - 
this->arg_index = prev->arg_index + 1; - this->expr = u->eapp(prev->expr,arg); - - this->inside_prob -= prod->args[prev->arg_index]->viterbi_prob; - this->inside_prob += prob; + Production() { + // If there is no constructor, GCC will zero the object, + // while it has already been initialized in the new operator. } - virtual State *proceed(PgfParser *parser, PgfUnmarshaller *u) - { - size_t n_args = prod->lin->absfun->type->hypos->len; - while (arg_index < n_args) { - Choice *choice = prod->args[arg_index]; - - if (choice != NULL) { - choice->items.push_back(this); - - if (choice->items.size() == 1) { - prob_t outside_prob = get_prob()-choice->viterbi_prob; - for (auto prod : choice->prods) { - parser->before->queue.push(new ExprItem(choice,prod,outside_prob,u)); - } - } else { - for (auto ep : choice->exprs) { - combine(parser,choice->conts,ep.first,ep.second,u); - } - } - return parser->before; - } - - PgfExpr arg = u->emeta(0); - expr = u->eapp(expr,arg); - u->free_ref(arg); - arg_index++; - } - - State *prev = parser->before; - parent->exprs.push_back(std::pair(expr,inside_prob)); - for (auto item : parent->items) { - if (item->combine(parser,parent->conts,expr,inside_prob,u)) { - prev = parent->conts->state; - } - } - - return prev; + void operator delete(void *p) { + free(p); } +}; - virtual bool combine(PgfParser *parser, ParseItemConts *conts, PgfExpr expr, prob_t prob, PgfUnmarshaller *u) - { - parser->before->queue.push(new ExprItem(this,expr,prob,u)); - return false; +struct PgfParser::StackNode { + size_t state_id; + Choice *choice; + std::vector parents; + + StackNode(size_t state_id) { + this->state_id = state_id; + this->choice = NULL; } +}; - virtual void print1(PgfPrinter *printer, State *state, PgfMarshaller *m) - { -#ifdef PARSER_DEBUG - parent->items[0]->print1(printer,state,m); +struct PgfParser::ParseState { + ParseState *next; + PgfTextSpot start; + PgfTextSpot end; + std::vector stacks; - printer->puts(" "); - - size_t n_args = 
prod->lin->absfun->type->hypos->len; - if (n_args > 0) - printer->puts("("); - m->match_expr(printer,expr); -#endif + ParseState(PgfTextSpot spot) { + next = NULL; + start = spot; + end = spot; } +}; - virtual void print2(PgfPrinter *printer, State *state, int x, PgfMarshaller *m) - { -#ifdef PARSER_DEBUG - size_t n_args = prod->lin->absfun->type->hypos->len; - for (size_t i = arg_index+x; i < n_args; i++) { - if (prod->args[i]) - printer->nprintf(10," ?%ld",prod->args[i]->id); - else - printer->puts(" ?"); - } - if (n_args > 0) - printer->puts(")"); +struct PgfParser::ExprState { + Result *res; + prob_t prob; - parent->items[0]->print2(printer,state,1,m); -#endif - } - - virtual PgfExpr get_expr(PgfUnmarshaller *u) - { - return expr; - } - -private: - Choice *parent; Production *prod; - size_t arg_index; + size_t n_args; PgfExpr expr; }; -class PgfParser::MetaItem : public Item -{ -public: - MetaItem(State *state, - PgfExpr arg, - prob_t inside_prob, - MetaItem *next) - { - this->outside_prob = state->viterbi_prob; - this->inside_prob = inside_prob; - this->state = state; - this->arg = arg; - this->next = next; +struct PgfParser::ExprInstance { + PgfExpr expr; + prob_t prob; + + ExprInstance(PgfExpr expr, prob_t prob) { + this->expr = expr; + this->prob = prob; } - - virtual State *proceed(PgfParser *parser, PgfUnmarshaller *u) - { - if (state->prev == NULL) - return NULL; - - if (state->choices.size() == 0) { - State *prev = state; - while (prev->prev != NULL && prev->choices.size() == 0) { - prev = prev->prev; - } - - size_t size = state->start.ptr-prev->end.ptr; - PgfText *token = (PgfText *) alloca(sizeof(PgfText)+size+1); - token->size = size; - memcpy(token->text,prev->end.ptr,size); - token->text[size] = 0; - - PgfExpr expr = u->elit(u->lstr(token)); - prev->queue.push(new MetaItem(prev, expr, - inside_prob, - this)); - return prev; - } else { - for (auto it : state->choices) { - ParseItemConts *conts = it.first; - Choice *choice = it.second; - - if 
(!choice->is_chunk) - continue; - - choice->items.push_back(this); - - if (choice->items.size() == 1) { - prob_t prob = conts->state->viterbi_prob+inside_prob; - for (Production *prod : choice->prods) { - parser->before->queue.push(new ExprItem(choice, - prod, prob+prod->lin->lincat->abscat->prob, u)); - } - } else { - for (auto ep : choice->exprs) { - combine(parser,conts,ep.first,ep.second,u); - } - } - } - return parser->before; - } - } - - virtual bool combine(PgfParser *parser, ParseItemConts *conts, PgfExpr expr, prob_t prob, PgfUnmarshaller *u) - { - conts->state->queue.push(new MetaItem(conts->state, - expr, - this->inside_prob+conts->field->lincat->abscat->prob+prob, - this)); - return true; - } - - virtual void print1(PgfPrinter *printer, State *state, PgfMarshaller *m) - { -#ifdef PARSER_DEBUG - printer->puts("?"); -#endif - } - - virtual void print2(PgfPrinter *printer, State *state, int x, PgfMarshaller *m) - { -#ifdef PARSER_DEBUG - MetaItem *res = this; - while (res->arg != 0) { - printer->puts(" ("); - m->match_expr(printer, res->arg); - printer->puts(")"); - res = res->next; - } -#endif - } - - virtual PgfExpr get_expr(PgfUnmarshaller *u) - { - MetaItem *res = this; - PgfExpr expr = u->emeta(0); - while (res->arg != 0) { - PgfExpr expr1 = u->eapp(expr, res->arg); - u->free_ref(expr); - expr = expr1; - res = res->next; - } - return expr; - } - -private: - State *state; - PgfExpr arg; - MetaItem *next; }; -void PgfParser::Item::trace(State *state, PgfMarshaller *m) +struct PgfParser::Result { + std::vector states; + std::vector exprs; +}; + +#if defined(DEBUG_STATE_CREATION) || defined(DEBUG_AUTOMATON) || defined(DEBUG_PARSER) +void PgfParser::print_prod(Choice *choice, Production *prod) { -#ifdef PARSER_DEBUG - PgfPrinter printer(NULL,0,m); - printer.puts("["); - print1(&printer, state, m); - print2(&printer, state, 0, m); - printer.nprintf(40,"; %f+%f=%f]\n",inside_prob,outside_prob,inside_prob+outside_prob); - printer.dump(); -#endif -} + 
PgfPrinter printer(NULL, 0, m); -void PgfParser::Choice::trace(State *state) -{ -#ifdef PARSER_DEBUG - size_t seq_index = conts->field-conts->field->lincat->fields->data; + printer.nprintf(32, "?%d -> ", choice->fid); - PgfPrinter printer(NULL,0,NULL); - printer.nprintf(40,"[%ld-%ld; ", conts->state->end.pos, state->start.pos); - printer.efun(&conts->field->lincat->name); - printer.nprintf(30,"(%ld); %ld", conts->value, seq_index); - printer.nprintf(40,"; ?%ld; %f]\n", id, viterbi_prob); - printer.dump(); -#endif -} - -void PgfParser::Production::trace(PgfParser::Choice *res) { -#ifdef PARSER_DEBUG - PgfPrinter printer(NULL,0,NULL); - printer.nprintf(10, "?%ld = ", res->id); - printer.puts(&lin->name); - - printer.puts("["); - auto hypos = lin->absfun->type->hypos; - for (size_t i = 0; i < hypos->len; i++) { + ref type = prod->lin->absfun->type; + printer.puts(&prod->lin->name); + printer.nprintf(32,"["); + PgfDBMarshaller m; + for (size_t i = 0; i < prod->n_args; i++) { + Choice *choice = prod->args[i]; if (i > 0) printer.puts(","); - - if (args[i] == NULL) - printer.efun(&hypos->data[i].type->name); - else - printer.nprintf(10, "?%ld", args[i]->id); + if (choice == NULL) { + m.match_type(&printer, vector_elem(type->hypos, i)->type.as_object()); + } else { + printer.nprintf(32, "?%d", choice->fid); + } } printer.puts("]\n"); - printer.dump(); -#endif + + PgfText *text = printer.get_text(); + fputs(text->text, stderr); + free(text); } -PgfParser::PgfParser(ref concr, ref start, PgfText *sentence, - PgfMarshaller *m, PgfUnmarshaller *u) +void PgfParser::print_transition(StackNode *source, StackNode *target, ParseState *state) +{ + fprintf(stderr, "state %ld --- ?%d ---> state %ld (position %zu-%zu, count %zu)\n", + source->state_id, target->choice->fid, target->state_id, + state->start.pos, state->end.pos, + state->stacks.size()); +} +#endif + +PgfParser::PgfParser(ref concr, ref start, PgfText *sentence, PgfMarshaller *m, PgfUnmarshaller *u) { this->concr = concr; 
- this->start = start; - this->sentence = textdup(sentence); - this->last_choice_id = 0; - this->before = NULL; - this->after = NULL; + this->sentence = sentence; this->m = m; this->u = u; + this->last_fid = 0; + this->top_res = NULL; + this->top_res_index = 0; + + PgfTextSpot spot; + spot.pos = 0; + spot.ptr = (uint8_t*) sentence->text; + + this->before = NULL; + this->after = NULL; + this->ahead = new ParseState(spot); + + StackNode *node = new StackNode(0); + this->ahead->stacks.push_back(node); +} + +void PgfParser::shift(StackNode *parent, ref lincat, size_t r, Production *prod, + ParseState *state) +{ + ref> shifts = vector_elem(concr->lrtable,parent->state_id)->shifts; + for (size_t i = 0; i < shifts->len; i++) { + ref shift = vector_elem(shifts,i); + if (lincat == shift->lincat && r == shift->r) { + StackNode *node = NULL; + for (StackNode *n : state->stacks) { + if (n->state_id == shift->next_state) { + node = n; + break; + } + } + if (node == NULL) { + node = new StackNode(shift->next_state); + node->choice = new Choice(++last_fid); + node->parents.push_back(parent); + state->stacks.push_back(node); + } + node->choice->prods.push_back(prod); + +#ifdef DEBUG_PARSER + print_prod(node->choice, prod); + print_transition(parent,node,state); +#endif + break; + } + } +} + +PgfParser::Choice *PgfParser::intersect_choice(Choice *choice1, Choice *choice2, intersection_map &im) +{ + if (choice1 == NULL) + return choice2; + if (choice2 == NULL) + return choice1; + if (choice1 == choice2) + return choice1; + + std::pair key(choice1,choice2); + auto it = im.find(key); + if (it != im.end()) { + return it->second; + } + + Choice *choice = new Choice(++last_fid); + im[key] = choice; + for (Production *prod1 : choice1->prods) { + for (Production *prod2 : choice2->prods) { + if (prod1->lin == prod2->lin) { + Production *prod = new(prod1->lin) Production(); + choice->prods.push_back(prod); + + for (size_t i = 0; i < prod->n_args; i++) { + Choice *arg = 
intersect_choice(prod1->args[i],prod2->args[i],im); + if (arg == NULL) { + //delete choice; + return NULL; + } + prod->args[i] = arg; + } + } + } + } + + return choice; +} + +void PgfParser::reduce(StackNode *parent, ref lin, size_t seq_index, + size_t n, std::vector &args) +{ + if (n == 0) { + ref lincat = lin->lincat; + size_t r = seq_index % lincat->fields->len; + + Production *prod = new(lin) Production(); + + ref seq = *vector_elem(lin->seqs, seq_index); + for (size_t i = 0; i < seq->syms.len; i++) { + PgfSymbol sym = seq->syms.data[i]; + switch (ref::get_tag(sym)) { + case PgfSymbolCat::tag: { + auto symcat = + ref::untagged(sym); + Choice *choice = args[seq->syms.len-i-1]; + intersection_map im; + choice = intersect_choice(choice, prod->args[symcat->d], im); + if (choice == NULL) { + //delete prod; + return; + } + prod->args[symcat->d] = choice; + break; + } + } + } + + shift(parent, lincat, r, prod, before); + return; + } + + args.push_back(parent->choice); + for (auto node : parent->parents) { + reduce(node, lin, seq_index, n-1, args); + } + args.pop_back(); +} + +void PgfParser::complete(StackNode *parent, ref lincat, size_t seq_index, + size_t n, std::vector &args) +{ + if (n == 0) { + Choice *choice = args[0]; + if (top_res == NULL) + top_res = new Result(); + choice->res = top_res; + predict_expr_states(choice, 0); + return; + } + + args.push_back(parent->choice); + for (auto node : parent->parents) { + complete(node, lincat, seq_index, n-1, args); + } + args.pop_back(); +} + +void PgfParser::reduce_all(StackNode *node) +{ + ref> shifts = vector_elem(concr->lrtable,node->state_id)->shifts; + for (size_t j = 0; j < shifts->len; j++) { + ref shift = vector_elem(shifts,j); + if (shift->is_epsilon) { + StackNode *new_node = NULL; + for (StackNode *n : before->stacks) { + if (n->state_id == shift->next_state) { + new_node = n; + break; + } + } + + if (new_node == NULL) { + new_node = new StackNode(shift->next_state); + new_node->choice = new 
Choice(++last_fid); + new_node->parents.push_back(node); + before->stacks.push_back(new_node); + + std::function, size_t seq_index)> f = + [this,new_node](ref lin, size_t seq_index) { + Production *prod = new(lin) Production(); + new_node->choice->prods.push_back(prod); +#ifdef DEBUG_PARSER + print_prod(new_node->choice, prod); +#endif + }; + phrasetable_lookup_epsilons(concr->phrasetable, + shift->lincat, shift->r, f); + } + +#ifdef DEBUG_PARSER + print_transition(node,new_node,before); +#endif + } + } + + ref> reductions = vector_elem(concr->lrtable,node->state_id)->reductions; + for (size_t j = 0; j < reductions->len; j++) { + ref red = vector_elem(reductions,j); + switch (ref::get_tag(red->lin_obj)) { + case PgfConcrLin::tag: { + auto lin = + ref::untagged(red->lin_obj); + ref seq = *vector_elem(lin->seqs,red->seq_index); + std::vector args; + reduce(node, lin, red->seq_index, seq->syms.len, args); + break; + } + case PgfConcrLincat::tag: { + auto lincat = + ref::untagged(red->lin_obj); + ref seq = *vector_elem(lincat->seqs,red->seq_index); + std::vector args; + if (before->end.pos == sentence->size) { + complete(node, lincat, red->seq_index, seq->syms.len, args); + } + } + } + } } void PgfParser::space(PgfTextSpot *start, PgfTextSpot *end, PgfExn* err) { - State *prev = NULL; - State *next = before; - while (next != NULL && next->start.pos < start->pos) { - prev = next; - next = next->next; +#ifdef DEBUG_PARSER + fprintf(stderr, "------------------ position %zu-%zu ------------------\n", + start->pos, end->pos); +#endif + + while (ahead != NULL && ahead->start.pos <= start->pos) { + ParseState *tmp = ahead->next; + ahead->next = before; + before = ahead; + ahead = tmp; } - if (next == NULL || next->start.pos != start->pos) { - before = new State(); - before->start = *start; - before->end = *end; - before->prev = prev; - before->next = next; - before->viterbi_prob = prev ? 
prev->viterbi_prob : 0; + before->end = *end; - if (prev != NULL) prev->next = before; - if (next != NULL) next->prev = before; - } else { - before = next; - before->end = *end; + size_t i = 0; + while (i < before->stacks.size()) { + StackNode *node = before->stacks[i++]; + reduce_all(node); } } void PgfParser::start_matches(PgfTextSpot *end, PgfExn* err) { - State *prev = NULL; - State *next = before; - while (next != NULL && next->start.pos < end->pos) { - prev = next; - next = next->next; + ParseState **last = &ahead; after = *last; + while (after != NULL && after->start.pos < end->pos) { + last = &after->next; after = *last; } - if (next == NULL || next->start.pos != end->pos) { - after = new State(); - after->start = *end; - after->end = *end; - after->prev = prev; - after->next = next; - after->viterbi_prob = INFINITY; - - if (prev != NULL) prev->next = after; - if (next != NULL) next->prev = after; - } else { - after = next; + if (after == NULL) { + *last = new ParseState(*end); + after = *last; } } void PgfParser::match(ref lin, size_t seq_index, PgfExn* err) { - ref field = vector_elem(lin->lincat->fields, seq_index % lin->lincat->fields->len); - ref result = *vector_elem(lin->res, seq_index / lin->lincat->fields->len); + size_t r = seq_index % lin->lincat->fields->len; - ParseItemConts *conts = before->get_conts(field, result->param.i0); - PgfLinSeqIndex r = {lin, seq_index}; - after->queue.push(new(&r) ParseItem(conts, result->param.i0, lin, seq_index)); + Production *prod = new(lin) Production(); + + for (StackNode *parent : before->stacks) { + shift(parent, lin->lincat, r, prod, after); + } } void PgfParser::end_matches(PgfTextSpot *end, PgfExn* err) { - while (!after->queue.empty()) { - Item *item = after->queue.top(); - after->queue.pop(); +} - item->trace(after,m); - item->proceed(this,NULL); +bool PgfParser::CompareExprState::operator() (const ExprState *state1, const ExprState *state2) const { + return state1->prob > state2->prob; +} + +void 
PgfParser::predict_expr_states(Choice *choice, prob_t outside_prob) +{ + for (Production *prod : choice->prods) { + ExprState *state = new ExprState; + state->res = choice->res; + state->prod = prod; + state->n_args = 0; + state->expr = u->efun(&prod->lin->name); + state->prob = outside_prob+prod->lin->absfun->prob; + queue.push(state); + } +} + +#ifdef DEBUG_GENERATOR +void PgfParser::print_expr_state_before(PgfPrinter *printer, ExprState *state) +{ + if (state->res->states.size() > 0) { + ExprState *parent = state->res->states[0]; + print_expr_state_before(printer, parent); + printer->puts(" ["); + } + m->match_expr(printer, state->expr); +} + +void PgfParser::print_expr_state_after(PgfPrinter *printer, ExprState *state) +{ + for (size_t i = state->n_args; i < state->prod->n_args; i++) { + printer->puts(" ?"); } - for (auto i : after->choices) { - ParseItemConts *conts = i.first; - Choice *choice = i.second; + if (state->res->states.size() > 0) { + printer->puts("]"); + ExprState *parent = state->res->states[0]; + print_expr_state_after(printer, parent); + } +} - if (choice->is_chunk) { - prob_t viterbi_prob = conts->state->viterbi_prob+ - conts->field->lincat->abscat->prob+ - choice->viterbi_prob; - if (after->viterbi_prob > viterbi_prob) - after->viterbi_prob = viterbi_prob; +void PgfParser::print_expr_state(ExprState *state) +{ + PgfPrinter printer(NULL, 0, m); + + printer.nprintf(16, "[%f] ", state->prob); + print_expr_state_before(&printer, state); + print_expr_state_after(&printer, state); + printer.puts("\n"); + + PgfText *text = printer.get_text(); + fputs(text->text, stderr); + free(text); +} +#endif + +bool PgfParser::process_expr_state(ExprState *state) +{ + if (state->n_args >= state->prod->n_args) { + complete_expr_state(state); + return true; + } + + Choice *choice = state->prod->args[state->n_args]; + if (choice == NULL) { + PgfExpr meta = u->emeta(0); + PgfExpr app = u->eapp(state->expr, meta); + u->free_ref(state->expr); + u->free_ref(meta); + 
state->expr = app; + } else { + Result *tmp = choice->res; + if (choice->res == NULL) { + choice->res = new Result(); + } + choice->res->states.push_back(state); + + if (tmp == NULL) { + predict_expr_states(choice, state->prob); + } else { + for (ExprInstance p : choice->res->exprs) { + combine_expr_state(state,p); + } } } - if (isinf(after->viterbi_prob)) - after->viterbi_prob = before->viterbi_prob; + return false; } -void PgfParser::prepare() +void PgfParser::complete_expr_state(ExprState *state) { - after->queue.push(new MetaItem(after,0,0,NULL)); - before = after; + Result *res = state->res; + + prob_t outside_prob; + if (res == top_res) + outside_prob = 0; + else + outside_prob = res->states[0]->prob; + + prob_t inside_prob = state->prob-outside_prob; + res->exprs.emplace_back(state->expr,inside_prob); + for (ExprState *state : res->states) { + combine_expr_state(state,res->exprs.back()); + } +} + +void PgfParser::combine_expr_state(ExprState *state, ExprInstance &inst) +{ + PgfExpr app = u->eapp(state->expr, inst.expr); + + ExprState *app_state = new ExprState(); + app_state->res = state->res; + app_state->prob = state->prob + inst.prob; + app_state->prod = state->prod; + app_state->n_args = state->n_args+1; + app_state->expr = app; + queue.push(app_state); +} + +void PgfParser::release_expr_state(ExprState *state) +{ + } PgfExpr PgfParser::fetch(PgfDB *db, prob_t *prob) -{ +{ DB_scope scope(db, READER_SCOPE); - while (before != NULL && before->queue.empty()) { - before = before->next; - } + if (top_res == NULL) + return 0; - while (!before->queue.empty()) { - Item *item = before->queue.top(); - before->queue.pop(); - - item->trace(after,m); - - if (before->prev == NULL) { - *prob = item->get_prob(); - return item->get_expr(u); + for (;;) { + if (top_res_index < top_res->exprs.size()) { + auto inst = top_res->exprs[top_res_index++]; + *prob = inst.prob; + return inst.expr; } - before = item->proceed(this,u); + if (queue.empty()) + return 0; + + ExprState 
*state = queue.top(); queue.pop(); +#ifdef DEBUG_GENERATOR + print_expr_state(state); +#endif + + if (process_expr_state(state)) { + release_expr_state(state); + } } return 0; -} - -PgfParser::~PgfParser() -{ - free(sentence); - printf("~PgfParser()\n"); -} +} \ No newline at end of file diff --git a/src/runtime/c/pgf/parser.h b/src/runtime/c/pgf/parser.h index 8adc93b03..3d69260ef 100644 --- a/src/runtime/c/pgf/parser.h +++ b/src/runtime/c/pgf/parser.h @@ -1,59 +1,114 @@ -#ifndef PARSER_H -#define PARSER_H +#ifndef LR_TABLE_H +#define LR_TABLE_H + +#include "md5.h" + +class PGF_INTERNAL_DECL PgfLRTableMaker +{ + struct State; + struct Item; + struct Predictions; + + struct CompareItem; + static const CompareItem compare_item; + + typedef std::pair,size_t> Key; + + struct PGF_INTERNAL_DECL CompareKey : std::less { + bool operator() (const Key& k1, const Key& k2) const { + int cmp = textcmp(k1.first,k2.first); + if (cmp < 0) + return true; + else if (cmp > 0) + return false; + + return (k1.second < k2.second); + } + }; + + ref abstr; + ref concr; + + std::vector todo; + std::map states; + std::map predictions; + std::map continuations; + std::vector completed; + + void process(Item *item); + void symbol(Item *item, PgfSymbol sym); + void predict(Item *item, ref cat, + ref> vars, PgfLParam *r); + void predict(Item *item, ref cat, size_t r); + void predict(ref absfun, Predictions *preds); + void complete(Item *item); + + static void print_item(Item *item); + +public: + PgfLRTableMaker(ref abstr, ref concr); + ref make(); +}; + +class PgfPrinter; + +class PGF_INTERNAL_DECL PgfParser : public PgfPhraseScanner, public PgfExprEnum +{ + ref concr; + PgfText *sentence; + PgfMarshaller *m; + PgfUnmarshaller *u; + + struct Choice; + struct Production; + struct StackNode; + struct ParseState; + struct ExprState; + struct ExprInstance; + struct Result; + struct CompareExprState : std::less { + bool operator() (const ExprState *state1, const ExprState *state2) const; + }; + + 
ParseState *before, *after, *ahead; + std::priority_queue, CompareExprState> queue; + int last_fid; + + Result *top_res; + size_t top_res_index; + + void shift(StackNode *parent, ref lincat, size_t r, Production *prod, + ParseState *state); + void reduce(StackNode *parent, ref lin, size_t seq_index, + size_t n, std::vector &args); + void complete(StackNode *parent, ref lincat, size_t seq_index, + size_t n, std::vector &args); + void reduce_all(StackNode *state); + void print_prod(Choice *choice, Production *prod); + void print_transition(StackNode *source, StackNode *target, ParseState *state); + + typedef std::map,Choice*> intersection_map; + + Choice *intersect_choice(Choice *choice1, Choice *choice2, intersection_map &im); + + void print_expr_state_before(PgfPrinter *printer, ExprState *state); + void print_expr_state_after(PgfPrinter *printer, ExprState *state); + void print_expr_state(ExprState *state); + + void predict_expr_states(Choice *choice, prob_t outside_prob); + bool process_expr_state(ExprState *state); + void complete_expr_state(ExprState *state); + void combine_expr_state(ExprState *state, ExprInstance &inst); + void release_expr_state(ExprState *state); -class PGF_INTERNAL_DECL PgfParser : public PgfPhraseScanner, public PgfExprEnum { public: PgfParser(ref concr, ref start, PgfText *sentence, PgfMarshaller *m, PgfUnmarshaller *u); - void space(PgfTextSpot *start, PgfTextSpot *end, PgfExn* err); - void start_matches(PgfTextSpot *end, PgfExn* err); - void match(ref lin, size_t seq_index, PgfExn* err); - void end_matches(PgfTextSpot *end, PgfExn* err); + virtual void space(PgfTextSpot *start, PgfTextSpot *end, PgfExn* err); + virtual void start_matches(PgfTextSpot *end, PgfExn* err); + virtual void match(ref lin, size_t seq_index, PgfExn* err); + virtual void end_matches(PgfTextSpot *end, PgfExn* err); - void prepare(); PgfExpr fetch(PgfDB *db, prob_t *prob); - - virtual ~PgfParser(); - -private: - class CFGCat; - class State; - class Choice; - class 
Production; - - class ParseItemConts; - - class Item { - public: - prob_t get_prob() { return inside_prob + outside_prob; }; - - virtual State *proceed(PgfParser *parser, PgfUnmarshaller *u) = 0; - virtual bool combine(PgfParser *parser, ParseItemConts *conts, PgfExpr expr, prob_t inside_prob, PgfUnmarshaller *u) = 0; - virtual void print1(PgfPrinter *printer, State *state, PgfMarshaller *m) = 0; - virtual void print2(PgfPrinter *printer, State *state, int x, PgfMarshaller *m) = 0; - virtual PgfExpr get_expr(PgfUnmarshaller *u) = 0; - - void trace(State *state, PgfMarshaller *m); - - protected: - prob_t inside_prob; - prob_t outside_prob; - }; - - class ParseItem; - class ExprItem; - class MetaItem; - - ref concr; - ref start; - PgfText *sentence; - - size_t last_choice_id; - - State *before, *after; - - PgfMarshaller *m; - PgfUnmarshaller *u; }; - #endif diff --git a/src/runtime/c/pgf/pgf.cxx b/src/runtime/c/pgf/pgf.cxx index 55c5c6978..602239ba7 100644 --- a/src/runtime/c/pgf/pgf.cxx +++ b/src/runtime/c/pgf/pgf.cxx @@ -861,9 +861,9 @@ public: virtual void match(ref lin, size_t seq_index, PgfExn* err) { - ref field = - vector_elem(lin->lincat->fields, seq_index % lin->lincat->fields->len); - callback->fn(callback, &lin->absfun->name, &(*field->name), lin->lincat->abscat->prob+lin->absfun->prob, err); + ref field = + *vector_elem(lin->lincat->fields, seq_index % lin->lincat->fields->len); + callback->fn(callback, &lin->absfun->name, field, lin->lincat->abscat->prob+lin->absfun->prob, err); } virtual void end_matches(PgfTextSpot *end, PgfExn* err) @@ -909,9 +909,9 @@ public: virtual void match(ref lin, size_t seq_index, PgfExn* err) { - ref field = - vector_elem(lin->lincat->fields, seq_index % lin->lincat->fields->len); - callback->morpho.fn(&callback->morpho, &lin->absfun->name, &(*field->name), lin->lincat->abscat->prob+lin->absfun->prob, err); + ref field = + *vector_elem(lin->lincat->fields, seq_index % lin->lincat->fields->len); + 
callback->morpho.fn(&callback->morpho, &lin->absfun->name, field, lin->lincat->abscat->prob+lin->absfun->prob, err); } virtual void end_matches(PgfTextSpot *end, PgfExn* err) @@ -976,7 +976,7 @@ PGF_API PgfText *pgf_get_lincat_field_internal(object o, size_t i) { ref lincat = o; - return &*(vector_elem(lincat->fields, i)->name); + return &**vector_elem(lincat->fields, i); } PGF_API @@ -1654,6 +1654,7 @@ class PGF_INTERNAL PgfLinBuilder : public PgfLinBuilderIface ref>> seqs; object container; // what are we building? + ref container_lincat; size_t var_index; size_t arg_index; @@ -1712,17 +1713,15 @@ public: lincat->seqs = seqs; lincat->n_lindefs = n_lindefs; - ref> db_fields = vector_new(n_fields); + ref>> db_fields = vector_new>(n_fields); for (size_t i = 0; i < n_fields; i++) { ref name = textdup_db(fields[i]); - vector_elem(db_fields, i)->lincat = lincat; - vector_elem(db_fields, i)->name = name; - vector_elem(db_fields, i)->backrefs = 0; - vector_elem(db_fields, i)->epsilons = 0; + *vector_elem(db_fields, i) = name; } lincat->fields = db_fields; this->container = lincat.tagged(); + this->container_lincat = 0; build->build(this, err); if (err->type == PGF_EXN_NONE && res_index != res->len) { @@ -1760,6 +1759,7 @@ public: lin->seqs = seqs; this->container = lin.tagged(); + this->container_lincat = lincat; build->build(this, err); if (err->type == PGF_EXN_NONE && res_index != res->len) { @@ -2149,7 +2149,7 @@ public: PgfPhrasetable phrasetable = phrasetable_internalize(concr->phrasetable, - seq, container, seq_index, + seq, container_lincat, container, seq_index, &entry); concr->phrasetable = phrasetable; *vector_elem(seqs, seq_index) = entry->seq; @@ -2418,7 +2418,7 @@ PgfText **pgf_category_fields(PgfDB *db, PgfConcrRevision revision, if (fields == 0) throw pgf_systemerror(ENOMEM); for (size_t i = 0; i < n_fields; i++) { - fields[i] = textdup(vector_elem(lincat->fields, i)->name); + fields[i] = textdup(*vector_elem(lincat->fields, i)); } *p_n_fields = n_fields; 
return fields; @@ -2511,7 +2511,7 @@ PgfText **pgf_tabular_linearize(PgfDB *db, PgfConcrRevision revision, PgfText *text = out.get_text(); if (text != NULL) { - res[pos++] = textdup(&*(vector_elem(lincat->fields,i)->name)); + res[pos++] = textdup(&**vector_elem(lincat->fields,i)); res[pos++] = text; } } @@ -2550,7 +2550,7 @@ PgfText **pgf_tabular_linearize_all(PgfDB *db, PgfConcrRevision revision, PgfText *text = out.get_text(); if (text != NULL) { - res[pos++] = textdup(&*(vector_elem(lincat->fields, i)->name)); + res[pos++] = textdup(&**vector_elem(lincat->fields, i)); res[pos++] = text; } } @@ -2656,7 +2656,6 @@ PgfExprEnum *pgf_parse(PgfDB *db, PgfConcrRevision revision, phrasetable_lookup_cohorts(concr->phrasetable, sentence, case_sensitive, parser, err); - parser->prepare(); return parser; } PGF_API_END diff --git a/src/runtime/c/pgf/phrasetable.cxx b/src/runtime/c/pgf/phrasetable.cxx index 23507dcc2..ed0d6db6c 100644 --- a/src/runtime/c/pgf/phrasetable.cxx +++ b/src/runtime/c/pgf/phrasetable.cxx @@ -299,9 +299,87 @@ int text_sequence_cmp(PgfTextSpot *spot, const uint8_t *end, } } +static +int backref_cmp(ref backref, ref lincat, size_t r) +{ + int cmp = 0; + switch (ref::get_tag(backref->container)) { + case PgfConcrLin::tag: { + ref lin = ref::untagged(backref->container); + if (lincat.as_object() < lin->lincat.as_object()) + cmp = -1; + else if (lincat.as_object() > lin->lincat.as_object()) + cmp = 1; + break; + } + case PgfConcrLincat::tag: { + if (lincat.as_object() > 0) + cmp = 1; + break; + } + } + + if (cmp == 0) { + size_t r1 = + (lincat == 0) ? 0 + : backref->seq_index % lincat->fields->len; + if (r < r1) + cmp = -1; + else if (r > r1) + cmp = 1; + } + + return cmp; +} + +static +ref> phrasetable_update_backrefs(PgfPhrasetable table, + ref lincat, + object container, + size_t seq_index) +{ + size_t len = (table->value.backrefs != 0) + ? 
table->value.backrefs->len + : 0; + + ref> backrefs = + vector_resize(table->value.backrefs, len+1, table->txn_id); + ssize_t i = 0; + ssize_t j = len-1; + if (table->value.seq->syms.len == 0 && len > 0) { + // The backrefs for the epsilon sequence are sorted by lincat and r + + size_t r = (lincat!=0) ? (seq_index % lincat->fields->len) : 0; + while (i <= j) { + ssize_t k = (i + j) / 2; + ref backref = vector_elem(backrefs, k); + + int cmp = backref_cmp(backref, lincat, r); + if (cmp < 0) { + while (j >= k) { + backrefs->data[j+1] = backrefs->data[j]; + j--; + } + } else if (cmp > 0) { + i = k+1; + } else { + while (j > k) { + backrefs->data[j+1] = backrefs->data[j]; + j--; + } + break; + } + } + } + backrefs->data[j+1].container = container; + backrefs->data[j+1].seq_index = seq_index; + return backrefs; +} + PGF_INTERNAL PgfPhrasetable phrasetable_internalize(PgfPhrasetable table, ref seq, + ref lincat, object container, size_t seq_index, ref *pentry) @@ -321,6 +399,7 @@ PgfPhrasetable phrasetable_internalize(PgfPhrasetable table, if (cmp < 0) { PgfPhrasetable left = phrasetable_internalize(table->left, seq, + lincat, container, seq_index, pentry); @@ -329,6 +408,7 @@ PgfPhrasetable phrasetable_internalize(PgfPhrasetable table, } else if (cmp > 0) { PgfPhrasetable right = phrasetable_internalize(table->right, seq, + lincat, container, seq_index, pentry); @@ -342,9 +422,7 @@ PgfPhrasetable phrasetable_internalize(PgfPhrasetable table, : 0; ref> backrefs = - vector_resize(table->value.backrefs, len+1, table->txn_id); - backrefs->data[len].container = container; - backrefs->data[len].seq_index = seq_index; + phrasetable_update_backrefs(table,lincat,container,seq_index); PgfPhrasetable new_table = Node::upd_node(table, table->left, table->right); @@ -356,6 +434,7 @@ PgfPhrasetable phrasetable_internalize(PgfPhrasetable table, PGF_INTERNAL ref phrasetable_relink(PgfPhrasetable table, + ref lincat, object container, size_t seq_index, size_t seq_id) @@ -370,9 +449,7 @@ 
ref phrasetable_relink(PgfPhrasetable table, : table->value.backrefs->len; ref> backrefs = - vector_resize(table->value.backrefs, len+1, table->txn_id); - backrefs->data[len].container = container; - backrefs->data[len].seq_index = seq_index; + phrasetable_update_backrefs(table,lincat,container,seq_index); table->value.backrefs = backrefs; return table->value.seq; @@ -397,12 +474,16 @@ PgfPhrasetable phrasetable_delete(PgfPhrasetable table, PgfPhrasetable left = phrasetable_delete(table->left, container, seq_index, seq); + if (left == table->left) + return table; table = Node::upd_node(table,left,table->right); return Node::balanceR(table); } else if (cmp > 0) { PgfPhrasetable right = phrasetable_delete(table->right, container, seq_index, seq); + if (right == table->right) + return table; table = Node::upd_node(table,table->left,right); return Node::balanceL(table); } else { @@ -566,10 +647,10 @@ void finish_skipping(PgfCohortsState *state) { state->queue.pop(); } - +/* state->scanner->space(&state->spot, &state->spot, state->err); - +*/ state->last.pos = 0; state->last.ptr = NULL; state->skipping = false; @@ -740,6 +821,56 @@ void phrasetable_lookup_cohorts(PgfPhrasetable table, } } +PGF_INTERNAL +void phrasetable_lookup_epsilons(PgfPhrasetable table, + ref lincat, size_t r, + std::function,size_t)> &f) +{ + while (table->left != 0) { + table = table->left; + } + + if (table->value.seq->syms.len > 0) + return; + + size_t len = (table->value.backrefs != 0) + ? 
table->value.backrefs->len + : 0; + + ssize_t i = 0; + ssize_t j = len-1; + while (i <= j) { + ssize_t k = (i + j) / 2; + ref backref = vector_elem(table->value.backrefs, k); + + int cmp = backref_cmp(backref, lincat, r); + if (cmp < 0) { + j = k-1; + } else if (cmp > 0) { + i = k+1; + } else { + i = k; + while (i > 0) { + ref backref = vector_elem(table->value.backrefs, i-1); + if (backref_cmp(backref, lincat, r) != 0) + break; + f(ref::untagged(backref->container),backref->seq_index); + i--; + } + f(ref::untagged(backref->container),backref->seq_index); + j = k; + while (j < len-1) { + ref backref = vector_elem(table->value.backrefs, j+1); + if (backref_cmp(backref, lincat, r) != 0) + break; + f(ref::untagged(backref->container),backref->seq_index); + j++; + } + break; + } + } +} + PGF_INTERNAL void phrasetable_iter(PgfConcr *concr, PgfPhrasetable table, @@ -768,10 +899,10 @@ void phrasetable_iter(PgfConcr *concr, ref lincat = namespace_lookup(concr->lincats, &lin->absfun->type->name); if (lincat != 0) { - ref field = - vector_elem(lincat->fields, backref.seq_index % lincat->fields->len); + ref field = + *vector_elem(lincat->fields, backref.seq_index % lincat->fields->len); - callback->fn(callback, &lin->absfun->name, &(*field->name), lincat->abscat->prob+lin->absfun->prob, err); + callback->fn(callback, &lin->absfun->name, &*field, lincat->abscat->prob+lin->absfun->prob, err); if (err->type != PGF_EXN_NONE) return; } diff --git a/src/runtime/c/pgf/phrasetable.h b/src/runtime/c/pgf/phrasetable.h index c27f95333..2d8abc8cd 100644 --- a/src/runtime/c/pgf/phrasetable.h +++ b/src/runtime/c/pgf/phrasetable.h @@ -52,15 +52,19 @@ private: #pragma GCC diagnostic pop #endif +struct PgfConcrLincat; + PGF_INTERNAL_DECL PgfPhrasetable phrasetable_internalize(PgfPhrasetable table, ref seq, + ref lincat, object container, size_t seq_index, ref *pentry); PGF_INTERNAL_DECL ref phrasetable_relink(PgfPhrasetable table, + ref lincat, object container, size_t seq_index, size_t 
seq_id); @@ -101,6 +105,11 @@ void phrasetable_lookup_cohorts(PgfPhrasetable table, bool case_sensitive, PgfPhraseScanner *scanner, PgfExn* err); +PGF_INTERNAL_DECL +void phrasetable_lookup_epsilons(PgfPhrasetable table, + ref lincat, size_t r, + std::function, size_t)> &f); + PGF_INTERNAL_DECL void phrasetable_iter(PgfConcr *concr, PgfPhrasetable table, diff --git a/src/runtime/c/pgf/reader.cxx b/src/runtime/c/pgf/reader.cxx index 891346324..9e2010d7d 100644 --- a/src/runtime/c/pgf/reader.cxx +++ b/src/runtime/c/pgf/reader.cxx @@ -1,5 +1,6 @@ #include "data.h" #include "reader.h" +#include "parser.h" #include #include @@ -650,14 +651,14 @@ ref PgfReader::read_seq() return seq; } -ref>> PgfReader::read_seq_ids(object container) +ref>> PgfReader::read_seq_ids(ref lincat, object container) { size_t len = read_len(); ref>> vec = vector_new>(len); for (size_t i = 0; i < len; i++) { size_t seq_id = read_len(); ref seq = phrasetable_relink(concrete->phrasetable, - container, i, + lincat, container, i, seq_id); if (seq == 0) { throw pgf_error("Invalid sequence id"); @@ -701,7 +702,7 @@ ref PgfReader::read_lincat() auto n_lindefs = read_len(); auto args = read_vector(&PgfReader::read_parg); auto res = read_vector(&PgfReader::read_presult2); - auto seqs = read_seq_ids(lincat.tagged()); + auto seqs = read_seq_ids(0, lincat.tagged()); lincat->abscat = namespace_lookup(abstract->cats, &lincat->name); lincat->fields = fields; @@ -712,130 +713,35 @@ ref PgfReader::read_lincat() return lincat; } -ref> PgfReader::read_lincat_fields(ref lincat) +ref>> PgfReader::read_lincat_fields(ref lincat) { size_t len = read_len(); - ref> fields = vector_new(len); + ref>> fields = vector_new>(len); for (size_t i = 0; i < len; i++) { auto name = read_text(); - - ref field = vector_elem(fields,i); - field->lincat = lincat; - field->name = name; - field->backrefs = 0; - field->epsilons = 0; + *vector_elem(fields,i) = name; } return fields; } -static void add_to_index(ref concrete, ref lin, size_t 
seq_index, size_t dot) -{ - size_t n_fields = lin->lincat->fields->len; - ref seq = *vector_elem(lin->seqs,seq_index); - ref result = *vector_elem(lin->res, seq_index / n_fields); - ref field = vector_elem(lin->lincat->fields, seq_index % n_fields); - - if (dot >= seq->syms.len) { - ref> epsilons = field->epsilons; - epsilons = - vector_resize(epsilons, ((epsilons == 0) ? 0 : epsilons->len)+1, - PgfDB::get_txn_id()); - field->epsilons = epsilons; - ref epsilon = - vector_elem(epsilons,epsilons->len-1); - epsilon->lin = lin; - epsilon->seq_index = seq_index; - - if (epsilons->len == 1 && field->backrefs != 0) { - for (size_t i = 0; i < field->backrefs->len; i++) { - ref backref = vector_elem(field->backrefs,i); - add_to_index(concrete,backref->lin,backref->seq_index,backref->dot+1); - } - } - } else { - PgfSymbol sym = *vector_elem(&seq->syms,dot); - switch (ref::get_tag(sym)) { - case PgfSymbolCat::tag: { - auto sym_cat = ref::untagged(sym); - - ref hypo = - vector_elem(lin->absfun->type->hypos,sym_cat->d); - ref lincat = - namespace_lookup(concrete->lincats, - &hypo->type->name); - if (lincat == 0) - throw pgf_error("Found a lin which uses a category without a lincat"); - - size_t max_values = 1; - size_t *ranges = (size_t *) - alloca(sym_cat->r.n_terms*sizeof(size_t)); - for (size_t i = 0; i < sym_cat->r.n_terms; i++) { - for (size_t j = 0; j < result->vars->len; j++) { - auto var_range = vector_elem(result->vars, j); - if (var_range->var == sym_cat->r.terms[i].var) { - ranges[i] = vector_elem(result->vars, j)->range; - max_values *= var_range->range; - break; - } - } - } - - bool is_epsilon = false; - for (size_t values = 0; values < max_values; values++) { - size_t v = values; - size_t index = sym_cat->r.i0; - for (size_t i = 0; i < sym_cat->r.n_terms; i++) { - index += sym_cat->r.terms[i].factor * (v % ranges[i]); - v = v / ranges[i]; - } - - ref> backrefs = - vector_elem(lincat->fields,index)->backrefs; - backrefs = - vector_resize(backrefs, ((backrefs == 0) 
? 0 : backrefs->len)+1, - PgfDB::get_txn_id()); - vector_elem(lincat->fields,index)->backrefs = backrefs; - ref backref = - vector_elem(backrefs,backrefs->len-1); - backref->lin = lin; - backref->seq_index = seq_index; - backref->dot = dot; - - if (vector_elem(lincat->fields,index)->epsilons != 0) - is_epsilon = true; - } - - if (is_epsilon) - add_to_index(concrete,lin,seq_index,dot+1); - - break; - } - } - } -}; - ref PgfReader::read_lin() { ref lin = read_name(&PgfConcrLin::name); lin->absfun = namespace_lookup(abstract->funs, &lin->name); if (lin->absfun == 0) throw pgf_error("Found a lin without a fun"); - - auto args = read_vector(&PgfReader::read_parg); - auto res = read_vector(&PgfReader::read_presult2); - auto seqs = read_seq_ids(lin.tagged()); - - lin->args = args; - lin->res = res; - lin->seqs = seqs; lin->lincat = namespace_lookup(concrete->lincats, &lin->absfun->type->name); if (lin->lincat == 0) throw pgf_error("Found a lin which uses a category without a lincat"); - for (size_t seq_index = 0; seq_index < lin->seqs->len; seq_index++) { - add_to_index(concrete, lin, seq_index, 0); - } + auto args = read_vector(&PgfReader::read_parg); + auto res = read_vector(&PgfReader::read_presult2); + auto seqs = read_seq_ids(lin->lincat, lin.tagged()); + + lin->args = args; + lin->res = res; + lin->seqs = seqs; return lin; } @@ -866,6 +772,9 @@ ref PgfReader::read_concrete() auto printnames = read_namespace(&PgfReader::read_printname); concrete->printnames = printnames; + PgfLRTableMaker maker(abstract, concrete); + concrete->lrtable = maker.make(); + return concrete; } diff --git a/src/runtime/c/pgf/reader.h b/src/runtime/c/pgf/reader.h index 2099a6b7c..34eb64744 100644 --- a/src/runtime/c/pgf/reader.h +++ b/src/runtime/c/pgf/reader.h @@ -71,14 +71,14 @@ public: void merge_abstract(ref abstract); ref read_lincat(); - ref> read_lincat_fields(ref lincat); + ref>> read_lincat_fields(ref lincat); ref read_lparam(); void read_variable_range(ref var_info); void 
read_parg(ref parg); ref read_presult(); PgfSymbol read_symbol(); ref read_seq(); - ref>> read_seq_ids(object container); + ref>> read_seq_ids(ref lincat, object container); PgfPhrasetable read_phrasetable(size_t len); PgfPhrasetable read_phrasetable(); ref read_lin(); diff --git a/src/runtime/c/pgf/writer.cxx b/src/runtime/c/pgf/writer.cxx index 3ef32ec4d..b47dc9d35 100644 --- a/src/runtime/c/pgf/writer.cxx +++ b/src/runtime/c/pgf/writer.cxx @@ -391,9 +391,9 @@ void PgfWriter::write_lincat(ref lincat) write_vector(lincat->seqs, &PgfWriter::write_seq_id); } -void PgfWriter::write_lincat_field(ref field) +void PgfWriter::write_lincat_field(ref> field) { - write_text(field->name); + write_text(*field); } void PgfWriter::write_lin(ref lin) diff --git a/src/runtime/c/pgf/writer.h b/src/runtime/c/pgf/writer.h index c596b1fb5..5713ac675 100644 --- a/src/runtime/c/pgf/writer.h +++ b/src/runtime/c/pgf/writer.h @@ -39,7 +39,7 @@ public: void write_abstract(ref abstract); void write_lincat(ref lincat); - void write_lincat_field(ref field); + void write_lincat_field(ref> field); void write_variable_range(ref var); void write_lparam(ref lparam); void write_parg(ref linarg);