1
0
forked from GitHub/gf-core

first draft of an LR parser

This commit is contained in:
Krasimir Angelov
2023-05-10 12:01:48 +02:00
parent 54352b507a
commit 7eac9ea2ab
15 changed files with 1415 additions and 917 deletions

View File

@@ -42,7 +42,9 @@ libpgf_la_SOURCES = \
pgf/probspace.cxx \
pgf/probspace.h \
pgf/generator.cxx \
pgf/generator.h
pgf/generator.h \
pgf/md5.cxx \
pgf/md5.h
libpgf_la_LDFLAGS = -no-undefined -version-info 4:0:0
libpgf_la_CXXFLAGS = -fno-rtti -std=c++11 -DCOMPILING_PGF

View File

@@ -48,9 +48,9 @@ void PgfConcr::release(ref<PgfConcr> concr)
void PgfConcrLincat::release(ref<PgfConcrLincat> lincat)
{
for (size_t i = 0; i < lincat->fields->len; i++) {
PgfLincatField::release(vector_elem(lincat->fields, i));
text_db_release(*vector_elem(lincat->fields, i));
}
Vector<PgfLincatField>::release(lincat->fields);
Vector<ref<PgfText>>::release(lincat->fields);
for (size_t i = 0; i < lincat->args->len; i++) {
PgfLParam::release(vector_elem(lincat->args, i)->param);
@@ -67,13 +67,6 @@ void PgfConcrLincat::release(ref<PgfConcrLincat> lincat)
PgfDB::free(lincat, lincat->name.size+1);
}
void PgfLincatField::release(ref<PgfLincatField> field)
{
text_db_release(field->name);
if (field->backrefs != 0)
Vector<PgfLincatBackref>::release(field->backrefs);
}
void PgfLParam::release(ref<PgfLParam> param)
{
PgfDB::free(param, param->n_terms*sizeof(param->terms[0]));

View File

@@ -224,19 +224,6 @@ struct PGF_INTERNAL_DECL PgfSymbolALLCAPIT {
static const uint8_t tag = 10;
};
struct PGF_INTERNAL_DECL PgfConcrLincat;
struct PGF_INTERNAL_DECL PgfLincatBackref;
struct PGF_INTERNAL_DECL PgfLincatEpsilon;
struct PGF_INTERNAL_DECL PgfLincatField {
ref<PgfConcrLincat> lincat;
ref<PgfText> name;
ref<Vector<PgfLincatBackref>> backrefs;
ref<Vector<PgfLincatEpsilon>> epsilons;
static void release(ref<PgfLincatField> field);
};
struct PGF_INTERNAL_DECL PgfConcrLincat {
static const uint8_t tag = 0;
@@ -246,7 +233,7 @@ struct PGF_INTERNAL_DECL PgfConcrLincat {
ref<Vector<PgfPArg>> args;
ref<Vector<ref<PgfPResult>>> res;
ref<Vector<ref<PgfSequence>>> seqs;
ref<Vector<PgfLincatField>> fields;
ref<Vector<ref<PgfText>>> fields;
PgfText name;
@@ -268,18 +255,6 @@ struct PGF_INTERNAL_DECL PgfConcrLin {
static void release(ref<PgfConcrLin> lin);
};
struct PGF_INTERNAL_DECL PgfLinSeqIndex {
ref<PgfConcrLin> lin;
size_t seq_index;
};
struct PGF_INTERNAL_DECL PgfLincatBackref : public PgfLinSeqIndex {
size_t dot;
};
struct PGF_INTERNAL_DECL PgfLincatEpsilon : public PgfLinSeqIndex {
};
struct PGF_INTERNAL_DECL PgfConcrPrintname {
ref<PgfText> printname;
PgfText name;
@@ -287,6 +262,25 @@ struct PGF_INTERNAL_DECL PgfConcrPrintname {
static void release(ref<PgfConcrPrintname> printname);
};
// One shift entry of an LR state (stored in PgfLRState::shifts).
// NOTE(review): the field meanings below are inferred from the names and
// from the parser's shift(parent, lincat, r, ...) signature - confirm
// against the table construction code.
struct PGF_INTERNAL_DECL PgfLRShift {
// presumably the index in the PgfLRTable of the state entered by the shift
size_t next_state;
// linearization category paired with the field index `r` below
ref<PgfConcrLincat> lincat;
// presumably an index into lincat->fields - TODO confirm
size_t r;
// presumably set when the shifted constituent can be empty - TODO confirm
bool is_epsilon;
};
// One reduce entry of an LR state (stored in PgfLRState::reductions).
struct PGF_INTERNAL_DECL PgfLRReduce {
// Untyped database object.
// NOTE(review): presumably a tagged reference to a PgfConcrLin or
// PgfConcrLincat, like the `container` of a sequence backref - confirm.
object lin_obj;
// presumably the index of the reduced sequence inside lin_obj's seqs
size_t seq_index;
};
// A single state of the LR automaton: the shift entries and the reduce
// entries available in that state.
struct PGF_INTERNAL_DECL PgfLRState {
ref<Vector<PgfLRShift>> shifts;
ref<Vector<PgfLRReduce>> reductions;
};
// The whole LR table is a vector of states; PgfConcr::lrtable refers to it.
typedef Vector<PgfLRState> PgfLRTable;
struct PGF_INTERNAL_DECL PgfConcr {
static const uint8_t tag = 1;
@@ -296,6 +290,8 @@ struct PGF_INTERNAL_DECL PgfConcr {
PgfPhrasetable phrasetable;
Namespace<PgfConcrPrintname> printnames;
ref<PgfLRTable> lrtable;
PgfText name;
static void release(ref<PgfConcr> pgf);

View File

@@ -287,7 +287,7 @@ void PgfLinearizer::TreeLinNode::check_category(PgfLinearizer *linearizer, PgfTe
void PgfLinearizer::TreeLinNode::linearize(PgfLinearizationOutputIface *out, PgfLinearizer *linearizer, size_t lindex)
{
PgfText *cat = &lin->absfun->type->name;
PgfText *field = &*(vector_elem(lin->lincat->fields, lindex)->name);
PgfText *field = &**vector_elem(lin->lincat->fields, lindex);
if (linearizer->pre_stack == NULL)
out->begin_phrase(cat, fid, field, &lin->name);
@@ -390,7 +390,7 @@ void PgfLinearizer::TreeLindefNode::linearize_arg(PgfLinearizationOutputIface *o
void PgfLinearizer::TreeLindefNode::linearize(PgfLinearizationOutputIface *out, PgfLinearizer *linearizer, size_t lindex)
{
if (lincat != 0) {
PgfText *field = &*(vector_elem(lincat->fields, lindex)->name);
PgfText *field = &**vector_elem(lincat->fields, lindex);
if (linearizer->pre_stack == NULL)
out->begin_phrase(&lincat->name, fid, field, fun);
else {
@@ -543,7 +543,7 @@ void PgfLinearizer::TreeLitNode::linearize(PgfLinearizationOutputIface *out, Pgf
{
PgfText *field = NULL;
if (lincat != 0) {
field = &*(vector_elem(lincat->fields, lindex)->name);
field = &**vector_elem(lincat->fields, lindex);
}
linearizer->flush_pre_stack(out, literal);

197
src/runtime/c/pgf/md5.cxx Normal file
View File

@@ -0,0 +1,197 @@
/*
* Derived from the RSA Data Security, Inc. MD5 Message-Digest Algorithm
* and modified slightly to be functionally identical but condensed into control structures.
*/
#include "data.h"
#include "md5.h"
/*
 * Constants defined by the MD5 algorithm.
 *
 * A, B, C and D are the initial values of the four 32-bit state words;
 * the MD5Context constructor copies them into buffer[0..3].
 */
#define A 0x67452301
#define B 0xefcdab89
#define C 0x98badcfe
#define D 0x10325476
/*
 * Per-round left-rotation amounts, one entry for each of the 64 rounds.
 * The table is only ever read, hence const.
 */
static const uint32_t S[] = {7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22,
                             5,  9, 14, 20, 5,  9, 14, 20, 5,  9, 14, 20, 5,  9, 14, 20,
                             4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23,
                             6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21};

/*
 * Per-round additive constants, one entry for each of the 64 rounds.
 * The table is only ever read, hence const.
 */
static const uint32_t K[] = {0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
                             0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
                             0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
                             0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
                             0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
                             0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
                             0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
                             0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
                             0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
                             0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
                             0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05,
                             0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
                             0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
                             0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
                             0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
                             0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391};
/*
 * Padding used to make the size (in bits) of the input congruent to 448 mod 512
 *
 * The array is a single 0x80 byte followed by zero bytes. finalize() feeds a
 * prefix of between 1 and 64 bytes of it to update(), which is why the array
 * holds 64 bytes. It is not declared const because update() takes a
 * non-const uint8_t pointer.
 */
static uint8_t PADDING[] = {0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
/*
 * Bit-manipulation functions defined by the MD5 algorithm.
 *
 * Every argument is parenthesized in the expansion so that passing an
 * expression (for example `a | b`) cannot be re-associated by operator
 * precedence. Arguments may be evaluated more than once, so they must be
 * free of side effects.
 */
#define F(X, Y, Z) (((X) & (Y)) | (~(X) & (Z)))
#define G(X, Y, Z) (((X) & (Z)) | ((Y) & ~(Z)))
#define H(X, Y, Z) ((X) ^ (Y) ^ (Z))
#define I(X, Y, Z) ((Y) ^ ((X) | ~(Z)))
/*
 * Rotates a 32-bit word left by n bits.
 *
 * The count is reduced modulo 32, so n == 0 (or any multiple of 32) returns
 * x unchanged instead of evaluating x >> 32, which is undefined behaviour
 * for a 32-bit operand.
 */
uint32_t rotateLeft(uint32_t x, uint32_t n){
    n &= 31;
    return (x << n) | (x >> ((32 - n) & 31));
}
/*
* Initialize a context
*/
MD5Context::MD5Context()
{
size = (uint64_t)0;
buffer[0] = (uint32_t)A;
buffer[1] = (uint32_t)B;
buffer[2] = (uint32_t)C;
buffer[3] = (uint32_t)D;
}
/*
* Step on 512 bits of input with the main MD5 algorithm.
*/
static
void md5Step(uint32_t *buffer, uint32_t *input){
uint32_t AA = buffer[0];
uint32_t BB = buffer[1];
uint32_t CC = buffer[2];
uint32_t DD = buffer[3];
uint32_t E;
unsigned int j;
for(unsigned int i = 0; i < 64; ++i){
switch(i / 16){
case 0:
E = F(BB, CC, DD);
j = i;
break;
case 1:
E = G(BB, CC, DD);
j = ((i * 5) + 1) % 16;
break;
case 2:
E = H(BB, CC, DD);
j = ((i * 3) + 5) % 16;
break;
default:
E = I(BB, CC, DD);
j = (i * 7) % 16;
break;
}
uint32_t temp = DD;
DD = CC;
CC = BB;
BB = BB + rotateLeft(AA + E + K[i] + input[j], S[i]);
AA = temp;
}
buffer[0] += AA;
buffer[1] += BB;
buffer[2] += CC;
buffer[3] += DD;
}
/*
 * Add some amount of input to the context.
 *
 * input_buffer - the bytes to hash
 * input_len    - the number of bytes in input_buffer
 *
 * Bytes are appended to the context's 64-byte block. Every time the block
 * fills up (512 bits) it is run through the algorithm (md5Step) and the
 * result is accumulated in the state words. Also updates the overall size.
 */
void MD5Context::update(uint8_t *input_buffer, size_t input_len)
{
    uint32_t input[16];
    unsigned int offset = this->size % 64;
    this->size += (uint64_t)input_len;

    // The index has the same type as input_len: an `unsigned int` counter
    // wraps around before reaching lengths that do not fit in it, and the
    // loop would then never terminate.
    for (size_t i = 0; i < input_len; ++i) {
        this->input[offset++] = input_buffer[i];

        // Once the context block is full, process it right away and start
        // filling a new one.
        if (offset == 64) {
            for (unsigned int j = 0; j < 16; ++j) {
                // Assemble each group of four bytes into a 32-bit word,
                // least significant byte first. The local `input` is the
                // 512-bit block split into the sixteen words md5Step
                // operates on.
                input[j] = (uint32_t)(this->input[(j * 4) + 3]) << 24 |
                           (uint32_t)(this->input[(j * 4) + 2]) << 16 |
                           (uint32_t)(this->input[(j * 4) + 1]) <<  8 |
                           (uint32_t)(this->input[(j * 4)]);
            }
            md5Step(this->buffer, input);
            offset = 0;
        }
    }
}
/*
 * Finish the hash: pad the pending input so that its length is congruent
 * to 448 bits (56 bytes) modulo 512 bits, append the total input size in
 * bits as the last two 32-bit words, run the final step and store the
 * 16-byte result in digest.
 */
void MD5Context::finalize(MD5Digest *digest)
{
    // Number of bytes already waiting in the partial block.
    const unsigned int pending = this->size % 64;
    const unsigned int padding_length =
        (pending < 56) ? (56 - pending) : ((56 + 64) - pending);

    // update() counts the padding as input, so take it off again afterwards:
    // the encoded length must cover the caller's data only.
    update(PADDING, padding_length);
    this->size -= (uint64_t)padding_length;

    // Assemble the last block by hand: fourteen words from the padded
    // input, then the size in bits split into its low and high halves.
    uint32_t block[16];
    for (unsigned int w = 0; w < 14; ++w) {
        const uint8_t *bytes = &this->input[w * 4];
        block[w] = ((uint32_t)bytes[3] << 24) |
                   ((uint32_t)bytes[2] << 16) |
                   ((uint32_t)bytes[1] <<  8) |
                    (uint32_t)bytes[0];
    }
    const uint64_t size_in_bits = this->size * 8;
    block[14] = (uint32_t)size_in_bits;
    block[15] = (uint32_t)(size_in_bits >> 32);

    md5Step(this->buffer, block);

    // Write out each state word least significant byte first.
    for (unsigned int w = 0; w < 4; ++w) {
        const uint32_t word = this->buffer[w];
        for (unsigned int byte = 0; byte < 4; ++byte) {
            digest->b[(w * 4) + byte] = (uint8_t)(word >> (8 * byte));
        }
    }
}

30
src/runtime/c/pgf/md5.h Normal file
View File

@@ -0,0 +1,30 @@
#ifndef MD5_H
#define MD5_H
// The 16-byte result of an MD5 computation, as filled in by
// MD5Context::finalize.
struct PGF_INTERNAL_DECL MD5Digest {
uint8_t b[16];
};
// Strict ordering on digests: a byte-wise comparison of the 16 digest
// bytes, which lets MD5Digest serve as a key in ordered containers.
inline bool operator < (const MD5Digest &d1, const MD5Digest &d2) {
    const int order = memcmp(d1.b, d2.b, sizeof(d1.b));
    return order < 0;
}
// Incremental MD5 computation: construct a context, feed it data with
// update(), then call finalize() to obtain the digest.
class PGF_INTERNAL_DECL MD5Context {
uint64_t size; // Size of input in bytes
uint32_t buffer[4]; // Current accumulation of hash
uint8_t input[64]; // Input to be used in the next step
public:
// Starts a new computation with no input consumed.
MD5Context();
// Appends input_len bytes starting at input to the hashed data.
void update(uint8_t *input, size_t input_len);
// Convenience overload: hashes the sizeof(T) bytes of the object itself.
// NOTE(review): the result depends on the in-memory representation of T
// (padding bytes, pointer values) - confirm that callers only pass types
// for which that is intended.
template <class T>
void update(T &input)
{
update((uint8_t *) &input, sizeof(T));
}
// Pads the input, appends its length and writes the 16-byte result to
// *digest.
void finalize(MD5Digest *digest);
};
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -1,59 +1,114 @@
#ifndef PARSER_H
#define PARSER_H
#ifndef LR_TABLE_H
#define LR_TABLE_H
#include "md5.h"
// Builds the LR table (PgfLRTable) for a concrete syntax. The reader
// constructs a maker from the abstract and the concrete syntax and stores
// the result of make() in the concrete syntax' lrtable field.
class PGF_INTERNAL_DECL PgfLRTableMaker
{
// Helper types; only declared here.
struct State;
struct Item;
struct Predictions;
struct CompareItem;
static const CompareItem compare_item;
// Lookup key for `predictions`: a category name plus a size_t.
// NOTE(review): the size_t is presumably the field index `r` used by
// predict(Item*, ref<PgfText>, size_t) - confirm.
typedef std::pair<ref<PgfText>,size_t> Key;
// Orders keys by the text of the category name (textcmp) first and by the
// numeric component second.
struct PGF_INTERNAL_DECL CompareKey : std::less<Key> {
bool operator() (const Key& k1, const Key& k2) const {
int cmp = textcmp(k1.first,k2.first);
if (cmp < 0)
return true;
else if (cmp > 0)
return false;
return (k1.second < k2.second);
}
};
ref<PgfAbstr> abstr;
ref<PgfConcr> concr;
// NOTE(review): presumably the states that still have to be processed.
std::vector<State*> todo;
// States indexed by an MD5 digest.
// NOTE(review): presumably the digest is computed over a state's items so
// that identical states are shared - confirm.
std::map<MD5Digest,State*> states;
std::map<Key,Predictions*,CompareKey> predictions;
std::map<Predictions*,State*> continuations;
std::vector<Item*> completed;
void process(Item *item);
void symbol(Item *item, PgfSymbol sym);
void predict(Item *item, ref<PgfText> cat,
ref<Vector<PgfVariableRange>> vars, PgfLParam *r);
void predict(Item *item, ref<PgfText> cat, size_t r);
void predict(ref<PgfAbsFun> absfun, Predictions *preds);
void complete(Item *item);
static void print_item(Item *item);
public:
PgfLRTableMaker(ref<PgfAbstr> abstr, ref<PgfConcr> concr);
// Builds the table and returns a reference to it.
ref<PgfLRTable> make();
};
class PgfPrinter;
class PGF_INTERNAL_DECL PgfParser : public PgfPhraseScanner, public PgfExprEnum
{
ref<PgfConcr> concr;
PgfText *sentence;
PgfMarshaller *m;
PgfUnmarshaller *u;
struct Choice;
struct Production;
struct StackNode;
struct ParseState;
struct ExprState;
struct ExprInstance;
struct Result;
struct CompareExprState : std::less<ExprState*> {
bool operator() (const ExprState *state1, const ExprState *state2) const;
};
ParseState *before, *after, *ahead;
std::priority_queue<ExprState*, std::vector<ExprState*>, CompareExprState> queue;
int last_fid;
Result *top_res;
size_t top_res_index;
void shift(StackNode *parent, ref<PgfConcrLincat> lincat, size_t r, Production *prod,
ParseState *state);
void reduce(StackNode *parent, ref<PgfConcrLin> lin, size_t seq_index,
size_t n, std::vector<Choice*> &args);
void complete(StackNode *parent, ref<PgfConcrLincat> lincat, size_t seq_index,
size_t n, std::vector<Choice*> &args);
void reduce_all(StackNode *state);
void print_prod(Choice *choice, Production *prod);
void print_transition(StackNode *source, StackNode *target, ParseState *state);
typedef std::map<std::pair<Choice*,Choice*>,Choice*> intersection_map;
Choice *intersect_choice(Choice *choice1, Choice *choice2, intersection_map &im);
void print_expr_state_before(PgfPrinter *printer, ExprState *state);
void print_expr_state_after(PgfPrinter *printer, ExprState *state);
void print_expr_state(ExprState *state);
void predict_expr_states(Choice *choice, prob_t outside_prob);
bool process_expr_state(ExprState *state);
void complete_expr_state(ExprState *state);
void combine_expr_state(ExprState *state, ExprInstance &inst);
void release_expr_state(ExprState *state);
class PGF_INTERNAL_DECL PgfParser : public PgfPhraseScanner, public PgfExprEnum {
public:
PgfParser(ref<PgfConcr> concr, ref<PgfConcrLincat> start, PgfText *sentence, PgfMarshaller *m, PgfUnmarshaller *u);
void space(PgfTextSpot *start, PgfTextSpot *end, PgfExn* err);
void start_matches(PgfTextSpot *end, PgfExn* err);
void match(ref<PgfConcrLin> lin, size_t seq_index, PgfExn* err);
void end_matches(PgfTextSpot *end, PgfExn* err);
virtual void space(PgfTextSpot *start, PgfTextSpot *end, PgfExn* err);
virtual void start_matches(PgfTextSpot *end, PgfExn* err);
virtual void match(ref<PgfConcrLin> lin, size_t seq_index, PgfExn* err);
virtual void end_matches(PgfTextSpot *end, PgfExn* err);
void prepare();
PgfExpr fetch(PgfDB *db, prob_t *prob);
virtual ~PgfParser();
private:
class CFGCat;
class State;
class Choice;
class Production;
class ParseItemConts;
class Item {
public:
prob_t get_prob() { return inside_prob + outside_prob; };
virtual State *proceed(PgfParser *parser, PgfUnmarshaller *u) = 0;
virtual bool combine(PgfParser *parser, ParseItemConts *conts, PgfExpr expr, prob_t inside_prob, PgfUnmarshaller *u) = 0;
virtual void print1(PgfPrinter *printer, State *state, PgfMarshaller *m) = 0;
virtual void print2(PgfPrinter *printer, State *state, int x, PgfMarshaller *m) = 0;
virtual PgfExpr get_expr(PgfUnmarshaller *u) = 0;
void trace(State *state, PgfMarshaller *m);
protected:
prob_t inside_prob;
prob_t outside_prob;
};
class ParseItem;
class ExprItem;
class MetaItem;
ref<PgfConcr> concr;
ref<PgfConcrLincat> start;
PgfText *sentence;
size_t last_choice_id;
State *before, *after;
PgfMarshaller *m;
PgfUnmarshaller *u;
};
#endif

View File

@@ -861,9 +861,9 @@ public:
virtual void match(ref<PgfConcrLin> lin, size_t seq_index, PgfExn* err)
{
ref<PgfLincatField> field =
vector_elem(lin->lincat->fields, seq_index % lin->lincat->fields->len);
callback->fn(callback, &lin->absfun->name, &(*field->name), lin->lincat->abscat->prob+lin->absfun->prob, err);
ref<PgfText> field =
*vector_elem(lin->lincat->fields, seq_index % lin->lincat->fields->len);
callback->fn(callback, &lin->absfun->name, field, lin->lincat->abscat->prob+lin->absfun->prob, err);
}
virtual void end_matches(PgfTextSpot *end, PgfExn* err)
@@ -909,9 +909,9 @@ public:
virtual void match(ref<PgfConcrLin> lin, size_t seq_index, PgfExn* err)
{
ref<PgfLincatField> field =
vector_elem(lin->lincat->fields, seq_index % lin->lincat->fields->len);
callback->morpho.fn(&callback->morpho, &lin->absfun->name, &(*field->name), lin->lincat->abscat->prob+lin->absfun->prob, err);
ref<PgfText> field =
*vector_elem(lin->lincat->fields, seq_index % lin->lincat->fields->len);
callback->morpho.fn(&callback->morpho, &lin->absfun->name, field, lin->lincat->abscat->prob+lin->absfun->prob, err);
}
virtual void end_matches(PgfTextSpot *end, PgfExn* err)
@@ -976,7 +976,7 @@ PGF_API
PgfText *pgf_get_lincat_field_internal(object o, size_t i)
{
ref<PgfConcrLincat> lincat = o;
return &*(vector_elem(lincat->fields, i)->name);
return &**vector_elem(lincat->fields, i);
}
PGF_API
@@ -1654,6 +1654,7 @@ class PGF_INTERNAL PgfLinBuilder : public PgfLinBuilderIface
ref<Vector<ref<PgfSequence>>> seqs;
object container; // what are we building?
ref<PgfConcrLincat> container_lincat;
size_t var_index;
size_t arg_index;
@@ -1712,17 +1713,15 @@ public:
lincat->seqs = seqs;
lincat->n_lindefs = n_lindefs;
ref<Vector<PgfLincatField>> db_fields = vector_new<PgfLincatField>(n_fields);
ref<Vector<ref<PgfText>>> db_fields = vector_new<ref<PgfText>>(n_fields);
for (size_t i = 0; i < n_fields; i++) {
ref<PgfText> name = textdup_db(fields[i]);
vector_elem(db_fields, i)->lincat = lincat;
vector_elem(db_fields, i)->name = name;
vector_elem(db_fields, i)->backrefs = 0;
vector_elem(db_fields, i)->epsilons = 0;
*vector_elem(db_fields, i) = name;
}
lincat->fields = db_fields;
this->container = lincat.tagged();
this->container_lincat = 0;
build->build(this, err);
if (err->type == PGF_EXN_NONE && res_index != res->len) {
@@ -1760,6 +1759,7 @@ public:
lin->seqs = seqs;
this->container = lin.tagged();
this->container_lincat = lincat;
build->build(this, err);
if (err->type == PGF_EXN_NONE && res_index != res->len) {
@@ -2149,7 +2149,7 @@ public:
PgfPhrasetable phrasetable =
phrasetable_internalize(concr->phrasetable,
seq, container, seq_index,
seq, container_lincat, container, seq_index,
&entry);
concr->phrasetable = phrasetable;
*vector_elem(seqs, seq_index) = entry->seq;
@@ -2418,7 +2418,7 @@ PgfText **pgf_category_fields(PgfDB *db, PgfConcrRevision revision,
if (fields == 0)
throw pgf_systemerror(ENOMEM);
for (size_t i = 0; i < n_fields; i++) {
fields[i] = textdup(vector_elem(lincat->fields, i)->name);
fields[i] = textdup(*vector_elem(lincat->fields, i));
}
*p_n_fields = n_fields;
return fields;
@@ -2511,7 +2511,7 @@ PgfText **pgf_tabular_linearize(PgfDB *db, PgfConcrRevision revision,
PgfText *text = out.get_text();
if (text != NULL) {
res[pos++] = textdup(&*(vector_elem(lincat->fields,i)->name));
res[pos++] = textdup(&**vector_elem(lincat->fields,i));
res[pos++] = text;
}
}
@@ -2550,7 +2550,7 @@ PgfText **pgf_tabular_linearize_all(PgfDB *db, PgfConcrRevision revision,
PgfText *text = out.get_text();
if (text != NULL) {
res[pos++] = textdup(&*(vector_elem(lincat->fields, i)->name));
res[pos++] = textdup(&**vector_elem(lincat->fields, i));
res[pos++] = text;
}
}
@@ -2656,7 +2656,6 @@ PgfExprEnum *pgf_parse(PgfDB *db, PgfConcrRevision revision,
phrasetable_lookup_cohorts(concr->phrasetable,
sentence, case_sensitive,
parser, err);
parser->prepare();
return parser;
} PGF_API_END

View File

@@ -299,9 +299,87 @@ int text_sequence_cmp(PgfTextSpot *spot, const uint8_t *end,
}
}
/*
 * Three-way comparison of the search key (lincat, r) with a stored
 * back-reference. Returns -1, 0 or 1 when the key sorts before, equal to
 * or after the back-reference.
 *
 * The lincat is compared first:
 *  - for a back-reference whose container is a PgfConcrLin, the key's
 *    lincat object is compared with the lin's lincat object;
 *  - for a back-reference whose container is a PgfConcrLincat, any
 *    non-null key lincat sorts after it.
 * When that does not decide the order, r is compared with the
 * back-reference's seq_index reduced modulo the number of fields of the
 * key's lincat (0 when the key's lincat is null).
 */
static
int backref_cmp(ref<PgfSequenceBackref> backref, ref<PgfConcrLincat> lincat, size_t r)
{
    const auto kind = ref<PgfConcrLin>::get_tag(backref->container);

    if (kind == PgfConcrLin::tag) {
        ref<PgfConcrLin> lin = ref<PgfConcrLin>::untagged(backref->container);
        if (lincat.as_object() < lin->lincat.as_object())
            return -1;
        if (lincat.as_object() > lin->lincat.as_object())
            return 1;
    } else if (kind == PgfConcrLincat::tag) {
        if (lincat.as_object() > 0)
            return 1;
    }

    // Same lincat: fall back to the field index.
    size_t r1 = 0;
    if (!(lincat == 0))
        r1 = backref->seq_index % lincat->fields->len;

    if (r < r1)
        return -1;
    if (r > r1)
        return 1;
    return 0;
}
/*
 * Grows the back-reference vector of the phrase table node `table` by one
 * element and stores (container, seq_index) in it.
 *
 * For the empty (epsilon) sequence the vector is kept sorted according to
 * backref_cmp, so the new element is placed by a binary search which moves
 * the elements after the insertion point one slot to the right as it goes.
 * For every other sequence, and for the very first back-reference, the new
 * element is simply appended.
 *
 * Returns the resized vector; the function does not store it back into
 * table->value.backrefs, that is left to the caller.
 */
static
ref<Vector<PgfSequenceBackref>> phrasetable_update_backrefs(PgfPhrasetable table,
ref<PgfConcrLincat> lincat,
object container,
size_t seq_index)
{
size_t len = (table->value.backrefs != 0)
? table->value.backrefs->len
: 0;
ref<Vector<PgfSequenceBackref>> backrefs =
vector_resize<PgfSequenceBackref>(table->value.backrefs, len+1, table->txn_id);
// [i, j] is the range of old elements still to be searched. The new
// element always ends up in slot j+1, so with the initial j == len-1 the
// default is to append.
ssize_t i = 0;
ssize_t j = len-1;
if (table->value.seq->syms.len == 0 && len > 0) {
// The backrefs for the epsilon sequence are sorted by lincat and r
size_t r = (lincat!=0) ? (seq_index % lincat->fields->len) : 0;
while (i <= j) {
ssize_t k = (i + j) / 2;
ref<PgfSequenceBackref> backref = vector_elem(backrefs, k);
int cmp = backref_cmp(backref, lincat, r);
if (cmp < 0) {
// The new element sorts before element k: move elements k..j one slot
// to the right, which leaves j == k-1.
while (j >= k) {
backrefs->data[j+1] = backrefs->data[j];
j--;
}
} else if (cmp > 0) {
// The new element sorts after element k: continue in the upper half.
i = k+1;
} else {
// Equal to element k: move elements k+1..j to the right and insert
// directly after element k.
while (j > k) {
backrefs->data[j+1] = backrefs->data[j];
j--;
}
break;
}
}
}
backrefs->data[j+1].container = container;
backrefs->data[j+1].seq_index = seq_index;
return backrefs;
}
PGF_INTERNAL
PgfPhrasetable phrasetable_internalize(PgfPhrasetable table,
ref<PgfSequence> seq,
ref<PgfConcrLincat> lincat,
object container,
size_t seq_index,
ref<PgfPhrasetableEntry> *pentry)
@@ -321,6 +399,7 @@ PgfPhrasetable phrasetable_internalize(PgfPhrasetable table,
if (cmp < 0) {
PgfPhrasetable left = phrasetable_internalize(table->left,
seq,
lincat,
container,
seq_index,
pentry);
@@ -329,6 +408,7 @@ PgfPhrasetable phrasetable_internalize(PgfPhrasetable table,
} else if (cmp > 0) {
PgfPhrasetable right = phrasetable_internalize(table->right,
seq,
lincat,
container,
seq_index,
pentry);
@@ -342,9 +422,7 @@ PgfPhrasetable phrasetable_internalize(PgfPhrasetable table,
: 0;
ref<Vector<PgfSequenceBackref>> backrefs =
vector_resize<PgfSequenceBackref>(table->value.backrefs, len+1, table->txn_id);
backrefs->data[len].container = container;
backrefs->data[len].seq_index = seq_index;
phrasetable_update_backrefs(table,lincat,container,seq_index);
PgfPhrasetable new_table =
Node<PgfPhrasetableEntry>::upd_node(table, table->left, table->right);
@@ -356,6 +434,7 @@ PgfPhrasetable phrasetable_internalize(PgfPhrasetable table,
PGF_INTERNAL
ref<PgfSequence> phrasetable_relink(PgfPhrasetable table,
ref<PgfConcrLincat> lincat,
object container,
size_t seq_index,
size_t seq_id)
@@ -370,9 +449,7 @@ ref<PgfSequence> phrasetable_relink(PgfPhrasetable table,
: table->value.backrefs->len;
ref<Vector<PgfSequenceBackref>> backrefs =
vector_resize<PgfSequenceBackref>(table->value.backrefs, len+1, table->txn_id);
backrefs->data[len].container = container;
backrefs->data[len].seq_index = seq_index;
phrasetable_update_backrefs(table,lincat,container,seq_index);
table->value.backrefs = backrefs;
return table->value.seq;
@@ -397,12 +474,16 @@ PgfPhrasetable phrasetable_delete(PgfPhrasetable table,
PgfPhrasetable left = phrasetable_delete(table->left,
container, seq_index,
seq);
if (left == table->left)
return table;
table = Node<PgfPhrasetableEntry>::upd_node(table,left,table->right);
return Node<PgfPhrasetableEntry>::balanceR(table);
} else if (cmp > 0) {
PgfPhrasetable right = phrasetable_delete(table->right,
container, seq_index,
seq);
if (right == table->right)
return table;
table = Node<PgfPhrasetableEntry>::upd_node(table,table->left,right);
return Node<PgfPhrasetableEntry>::balanceL(table);
} else {
@@ -566,10 +647,10 @@ void finish_skipping(PgfCohortsState *state) {
state->queue.pop();
}
/*
state->scanner->space(&state->spot, &state->spot,
state->err);
*/
state->last.pos = 0;
state->last.ptr = NULL;
state->skipping = false;
@@ -740,6 +821,56 @@ void phrasetable_lookup_cohorts(PgfPhrasetable table,
}
}
/*
 * Calls f(lin, seq_index) for every back-reference of the empty (epsilon)
 * sequence whose lincat and field index compare equal to (lincat, r)
 * according to backref_cmp.
 *
 * The epsilon sequence, when present, is taken from the leftmost node of
 * the phrase table. Its back-references are sorted by lincat and r, so one
 * match is located with a binary search and the run of equal neighbours on
 * both sides of it is then reported as well: first the ones before the hit
 * (in descending order), then the hit itself, then the ones after it.
 *
 * Nothing is reported when the table is empty or when it contains no
 * epsilon sequence.
 */
PGF_INTERNAL
void phrasetable_lookup_epsilons(PgfPhrasetable table,
                                 ref<PgfConcrLincat> lincat, size_t r,
                                 std::function<void(ref<PgfConcrLin>,size_t)> &f)
{
    // An empty phrase table has no node to inspect; do not dereference it.
    if (table == 0)
        return;

    while (table->left != 0) {
        table = table->left;
    }

    if (table->value.seq->syms.len > 0)
        return;

    size_t len = (table->value.backrefs != 0)
                    ? table->value.backrefs->len
                    : 0;

    ssize_t i = 0;
    ssize_t j = len-1;
    while (i <= j) {
        ssize_t k = (i + j) / 2;
        ref<PgfSequenceBackref> backref = vector_elem(table->value.backrefs, k);

        int cmp = backref_cmp(backref, lincat, r);
        if (cmp < 0) {
            j = k-1;
        } else if (cmp > 0) {
            i = k+1;
        } else {
            // Equal neighbours before the hit.
            i = k;
            while (i > 0) {
                ref<PgfSequenceBackref> prev = vector_elem(table->value.backrefs, i-1);
                if (backref_cmp(prev, lincat, r) != 0)
                    break;
                f(ref<PgfConcrLin>::untagged(prev->container),prev->seq_index);
                i--;
            }

            // The hit itself.
            f(ref<PgfConcrLin>::untagged(backref->container),backref->seq_index);

            // Equal neighbours after the hit.
            j = k;
            while (j+1 < (ssize_t) len) {
                ref<PgfSequenceBackref> next = vector_elem(table->value.backrefs, j+1);
                if (backref_cmp(next, lincat, r) != 0)
                    break;
                f(ref<PgfConcrLin>::untagged(next->container),next->seq_index);
                j++;
            }
            break;
        }
    }
}
PGF_INTERNAL
void phrasetable_iter(PgfConcr *concr,
PgfPhrasetable table,
@@ -768,10 +899,10 @@ void phrasetable_iter(PgfConcr *concr,
ref<PgfConcrLincat> lincat =
namespace_lookup(concr->lincats, &lin->absfun->type->name);
if (lincat != 0) {
ref<PgfLincatField> field =
vector_elem(lincat->fields, backref.seq_index % lincat->fields->len);
ref<PgfText> field =
*vector_elem(lincat->fields, backref.seq_index % lincat->fields->len);
callback->fn(callback, &lin->absfun->name, &(*field->name), lincat->abscat->prob+lin->absfun->prob, err);
callback->fn(callback, &lin->absfun->name, &*field, lincat->abscat->prob+lin->absfun->prob, err);
if (err->type != PGF_EXN_NONE)
return;
}

View File

@@ -52,15 +52,19 @@ private:
#pragma GCC diagnostic pop
#endif
struct PgfConcrLincat;
// Records that `seq` is used as sequence number `seq_index` of `container`
// (a tagged reference to a lin or a lincat) and returns the resulting
// phrase table. `lincat` is used to order the back-references of the empty
// sequence; callers pass the lin's lincat when the container is a lin and
// 0 when it is a lincat.
// NOTE(review): *pentry presumably receives the table entry holding the
// sequence - confirm against the definition.
PGF_INTERNAL_DECL
PgfPhrasetable phrasetable_internalize(PgfPhrasetable table,
ref<PgfSequence> seq,
ref<PgfConcrLincat> lincat,
object container,
size_t seq_index,
ref<PgfPhrasetableEntry> *pentry);
// Looks up the sequence with id `seq_id`, adds a back-reference from it to
// (container, seq_index) and returns the sequence. The reader treats a
// result of 0 as an invalid sequence id. `lincat` is used to order the
// back-references of the empty sequence; the reader passes 0 when the
// container is a lincat.
PGF_INTERNAL_DECL
ref<PgfSequence> phrasetable_relink(PgfPhrasetable table,
ref<PgfConcrLincat> lincat,
object container,
size_t seq_index,
size_t seq_id);
@@ -101,6 +105,11 @@ void phrasetable_lookup_cohorts(PgfPhrasetable table,
bool case_sensitive,
PgfPhraseScanner *scanner, PgfExn* err);
// Calls f(lin, seq_index) for every back-reference of the empty sequence
// that matches the category `lincat` and the field index `r`.
PGF_INTERNAL_DECL
void phrasetable_lookup_epsilons(PgfPhrasetable table,
ref<PgfConcrLincat> lincat, size_t r,
std::function<void(ref<PgfConcrLin>, size_t)> &f);
PGF_INTERNAL_DECL
void phrasetable_iter(PgfConcr *concr,
PgfPhrasetable table,

View File

@@ -1,5 +1,6 @@
#include "data.h"
#include "reader.h"
#include "parser.h"
#include <math.h>
#include <string.h>
@@ -650,14 +651,14 @@ ref<PgfSequence> PgfReader::read_seq()
return seq;
}
ref<Vector<ref<PgfSequence>>> PgfReader::read_seq_ids(object container)
ref<Vector<ref<PgfSequence>>> PgfReader::read_seq_ids(ref<PgfConcrLincat> lincat, object container)
{
size_t len = read_len();
ref<Vector<ref<PgfSequence>>> vec = vector_new<ref<PgfSequence>>(len);
for (size_t i = 0; i < len; i++) {
size_t seq_id = read_len();
ref<PgfSequence> seq = phrasetable_relink(concrete->phrasetable,
container, i,
lincat, container, i,
seq_id);
if (seq == 0) {
throw pgf_error("Invalid sequence id");
@@ -701,7 +702,7 @@ ref<PgfConcrLincat> PgfReader::read_lincat()
auto n_lindefs = read_len();
auto args = read_vector(&PgfReader::read_parg);
auto res = read_vector(&PgfReader::read_presult2);
auto seqs = read_seq_ids(lincat.tagged());
auto seqs = read_seq_ids(0, lincat.tagged());
lincat->abscat = namespace_lookup(abstract->cats, &lincat->name);
lincat->fields = fields;
@@ -712,130 +713,35 @@ ref<PgfConcrLincat> PgfReader::read_lincat()
return lincat;
}
ref<Vector<PgfLincatField>> PgfReader::read_lincat_fields(ref<PgfConcrLincat> lincat)
ref<Vector<ref<PgfText>>> PgfReader::read_lincat_fields(ref<PgfConcrLincat> lincat)
{
size_t len = read_len();
ref<Vector<PgfLincatField>> fields = vector_new<PgfLincatField>(len);
ref<Vector<ref<PgfText>>> fields = vector_new<ref<PgfText>>(len);
for (size_t i = 0; i < len; i++) {
auto name = read_text();
ref<PgfLincatField> field = vector_elem(fields,i);
field->lincat = lincat;
field->name = name;
field->backrefs = 0;
field->epsilons = 0;
*vector_elem(fields,i) = name;
}
return fields;
}
static void add_to_index(ref<PgfConcr> concrete, ref<PgfConcrLin> lin, size_t seq_index, size_t dot)
{
size_t n_fields = lin->lincat->fields->len;
ref<PgfSequence> seq = *vector_elem(lin->seqs,seq_index);
ref<PgfPResult> result = *vector_elem(lin->res, seq_index / n_fields);
ref<PgfLincatField> field = vector_elem(lin->lincat->fields, seq_index % n_fields);
if (dot >= seq->syms.len) {
ref<Vector<PgfLincatEpsilon>> epsilons = field->epsilons;
epsilons =
vector_resize(epsilons, ((epsilons == 0) ? 0 : epsilons->len)+1,
PgfDB::get_txn_id());
field->epsilons = epsilons;
ref<PgfLincatEpsilon> epsilon =
vector_elem(epsilons,epsilons->len-1);
epsilon->lin = lin;
epsilon->seq_index = seq_index;
if (epsilons->len == 1 && field->backrefs != 0) {
for (size_t i = 0; i < field->backrefs->len; i++) {
ref<PgfLincatBackref> backref = vector_elem(field->backrefs,i);
add_to_index(concrete,backref->lin,backref->seq_index,backref->dot+1);
}
}
} else {
PgfSymbol sym = *vector_elem(&seq->syms,dot);
switch (ref<PgfSymbol>::get_tag(sym)) {
case PgfSymbolCat::tag: {
auto sym_cat = ref<PgfSymbolCat>::untagged(sym);
ref<PgfHypo> hypo =
vector_elem(lin->absfun->type->hypos,sym_cat->d);
ref<PgfConcrLincat> lincat =
namespace_lookup(concrete->lincats,
&hypo->type->name);
if (lincat == 0)
throw pgf_error("Found a lin which uses a category without a lincat");
size_t max_values = 1;
size_t *ranges = (size_t *)
alloca(sym_cat->r.n_terms*sizeof(size_t));
for (size_t i = 0; i < sym_cat->r.n_terms; i++) {
for (size_t j = 0; j < result->vars->len; j++) {
auto var_range = vector_elem(result->vars, j);
if (var_range->var == sym_cat->r.terms[i].var) {
ranges[i] = vector_elem(result->vars, j)->range;
max_values *= var_range->range;
break;
}
}
}
bool is_epsilon = false;
for (size_t values = 0; values < max_values; values++) {
size_t v = values;
size_t index = sym_cat->r.i0;
for (size_t i = 0; i < sym_cat->r.n_terms; i++) {
index += sym_cat->r.terms[i].factor * (v % ranges[i]);
v = v / ranges[i];
}
ref<Vector<PgfLincatBackref>> backrefs =
vector_elem(lincat->fields,index)->backrefs;
backrefs =
vector_resize(backrefs, ((backrefs == 0) ? 0 : backrefs->len)+1,
PgfDB::get_txn_id());
vector_elem(lincat->fields,index)->backrefs = backrefs;
ref<PgfLincatBackref> backref =
vector_elem(backrefs,backrefs->len-1);
backref->lin = lin;
backref->seq_index = seq_index;
backref->dot = dot;
if (vector_elem(lincat->fields,index)->epsilons != 0)
is_epsilon = true;
}
if (is_epsilon)
add_to_index(concrete,lin,seq_index,dot+1);
break;
}
}
}
};
ref<PgfConcrLin> PgfReader::read_lin()
{
ref<PgfConcrLin> lin = read_name(&PgfConcrLin::name);
lin->absfun = namespace_lookup(abstract->funs, &lin->name);
if (lin->absfun == 0)
throw pgf_error("Found a lin without a fun");
auto args = read_vector(&PgfReader::read_parg);
auto res = read_vector(&PgfReader::read_presult2);
auto seqs = read_seq_ids(lin.tagged());
lin->args = args;
lin->res = res;
lin->seqs = seqs;
lin->lincat =
namespace_lookup(concrete->lincats, &lin->absfun->type->name);
if (lin->lincat == 0)
throw pgf_error("Found a lin which uses a category without a lincat");
for (size_t seq_index = 0; seq_index < lin->seqs->len; seq_index++) {
add_to_index(concrete, lin, seq_index, 0);
}
auto args = read_vector(&PgfReader::read_parg);
auto res = read_vector(&PgfReader::read_presult2);
auto seqs = read_seq_ids(lin->lincat, lin.tagged());
lin->args = args;
lin->res = res;
lin->seqs = seqs;
return lin;
}
@@ -866,6 +772,9 @@ ref<PgfConcr> PgfReader::read_concrete()
auto printnames = read_namespace<PgfConcrPrintname>(&PgfReader::read_printname);
concrete->printnames = printnames;
PgfLRTableMaker maker(abstract, concrete);
concrete->lrtable = maker.make();
return concrete;
}

View File

@@ -71,14 +71,14 @@ public:
void merge_abstract(ref<PgfAbstr> abstract);
ref<PgfConcrLincat> read_lincat();
ref<Vector<PgfLincatField>> read_lincat_fields(ref<PgfConcrLincat> lincat);
ref<Vector<ref<PgfText>>> read_lincat_fields(ref<PgfConcrLincat> lincat);
ref<PgfLParam> read_lparam();
void read_variable_range(ref<PgfVariableRange> var_info);
void read_parg(ref<PgfPArg> parg);
ref<PgfPResult> read_presult();
PgfSymbol read_symbol();
ref<PgfSequence> read_seq();
ref<Vector<ref<PgfSequence>>> read_seq_ids(object container);
ref<Vector<ref<PgfSequence>>> read_seq_ids(ref<PgfConcrLincat> lincat, object container);
PgfPhrasetable read_phrasetable(size_t len);
PgfPhrasetable read_phrasetable();
ref<PgfConcrLin> read_lin();

View File

@@ -391,9 +391,9 @@ void PgfWriter::write_lincat(ref<PgfConcrLincat> lincat)
write_vector(lincat->seqs, &PgfWriter::write_seq_id);
}
void PgfWriter::write_lincat_field(ref<PgfLincatField> field)
void PgfWriter::write_lincat_field(ref<ref<PgfText>> field)
{
write_text(field->name);
write_text(*field);
}
void PgfWriter::write_lin(ref<PgfConcrLin> lin)

View File

@@ -39,7 +39,7 @@ public:
void write_abstract(ref<PgfAbstr> abstract);
void write_lincat(ref<PgfConcrLincat> lincat);
void write_lincat_field(ref<PgfLincatField> field);
void write_lincat_field(ref<ref<PgfText>> field);
void write_variable_range(ref<PgfVariableRange> var);
void write_lparam(ref<PgfLParam> lparam);
void write_parg(ref<PgfPArg> linarg);