#include "data.h"
#include "reader.h"
#include "parser.h"
// NOTE(review): the targets of the two system includes and all template
// argument lists were not visible in the reviewed text. They are restored
// here from how the names are used below; confirm against the repository copy.
#include <math.h>
#include <string.h>
#include <stdlib.h>
#include <functional>

// NOTE(review): throughout this file a value is first read into a local and
// only then stored through a ref (`auto x = read_...(); obj->field = x;`).
// Presumably reading may allocate and relocate the database mapping, so the
// destination must not be evaluated before the read — keep that order.

/// Reads `size` bytes from `in` into `buf` as a single item.
///
/// @throws pgf_error(eof_msg) if the stream is at end-of-file after the read.
/// @throws pgf_error if the stream's error flag is set or the read was short.
static void read_exact(FILE *in, void *buf, size_t size, const char *eof_msg)
{
    size_t n_items = fread(buf, size, 1, in);
    if (feof(in))
        throw pgf_error(eof_msg);
    // fread() returns 0 for a zero-sized item; that is not a failure.
    if (ferror(in) || (n_items != 1 && size != 0))
        throw pgf_error("an error occured while reading the grammar");
}

/// Creates a reader over `in`. `probs_callback` may be NULL; when it is set,
/// read_prob() takes probabilities from it instead of from the file.
PgfReader::PgfReader(FILE *in,PgfProbsCallback *probs_callback)
{
    this->in = in;
    this->probs_callback = probs_callback;
    this->abstract = 0;
    this->concrete = 0;
}

/// Reads one byte.
uint8_t PgfReader::read_uint8()
{
    uint8_t b;
    read_exact(in, &b, sizeof(b),
               "reached end of file while reading the grammar");
    return b;
}

/// Reads a 16-bit unsigned integer stored most significant byte first.
uint16_t PgfReader::read_u16be()
{
    uint8_t buf[2];
    read_exact(in, buf, sizeof(buf),
               "reached end of file while reading a grammar");

    return (((uint16_t) buf[0]) << 8 | buf[1]);
}

/// Reads a 64-bit unsigned integer stored most significant byte first.
uint64_t PgfReader::read_u64be()
{
    uint8_t buf[8];
    read_exact(in, buf, sizeof(buf),
               "reached end of file while reading a grammar");

    return (((uint64_t) buf[0]) << 56 |
            ((uint64_t) buf[1]) << 48 |
            ((uint64_t) buf[2]) << 40 |
            ((uint64_t) buf[3]) << 32 |
            ((uint64_t) buf[4]) << 24 |
            ((uint64_t) buf[5]) << 16 |
            ((uint64_t) buf[6]) << 8  |
            ((uint64_t) buf[7]));
}

/// Reads a 64-bit big-endian word and decodes it as a double from its
/// sign bit (63), 11-bit exponent (62..52) and 52-bit mantissa (51..0).
double PgfReader::read_double()
{
    uint64_t u = read_u64be();

    bool sign = u >> 63;
    unsigned rawexp = u >> 52 & 0x7ff;
    uint64_t mantissa = u & 0xfffffffffffff;

    double ret;
    if (rawexp == 0x7ff) {
        ret = (mantissa == 0) ? INFINITY : NAN;
    } else {
        // A zero exponent field has no implicit leading one; the mantissa is
        // doubled so that the same exponent bias can be used for both cases.
        uint64_t m = rawexp ? ((1ULL << 52) | mantissa) : (mantissa << 1);
        // Subtract in signed arithmetic: rawexp is unsigned and < 1075.
        ret = ldexp((double) m, (int) rawexp - 1075);
    }
    return sign ? copysign(ret, -1.0) : ret;
}

/// Reads a probability and returns its negated logarithm.
/// The stored double is always consumed; if a probabilities callback is
/// installed, its value for `name` is used instead of the stored one.
prob_t PgfReader::read_prob(PgfText *name)
{
    double d = read_double();
    if (probs_callback != NULL) {
        d = probs_callback->fn(probs_callback, name);
    }
    return - logf(d);
}

/// Reads a variable-length unsigned integer: seven payload bits per byte,
/// least significant group first, high bit set on every byte but the last.
///
/// @throws pgf_error if the encoding continues past 64 bits.
uint64_t PgfReader::read_uint()
{
    uint64_t u = 0;
    int shift = 0;
    uint8_t b = 0;
    do {
        // Shifting a 64-bit value by 64 or more is undefined behaviour.
        if (shift >= 64)
            throw pgf_error("an integer in the grammar is too large");

        b = read_uint8();
        // Widen before shifting; an int shift would lose the bits above 31.
        u |= ((uint64_t) (b & 0x7f)) << shift;
        shift += 7;
    } while (b & 0x80);

    return u;
}

/// Allocates `struct_size` bytes followed by a PgfText and reads the text
/// (a length, then that many bytes) into it. A terminating zero byte is
/// appended. Returns the offset of the start of the allocation.
object PgfReader::read_text_internal(size_t struct_size)
{
    size_t size = read_len();
    object offs = current_db->malloc_internal(struct_size+sizeof(PgfText)+size+1);
    PgfText* ptext = (PgfText*) (current_base+offs+struct_size);
    ptext->size = size;

    // If reading the extra bytes causes EOF, it is an encoding
    // error, not a legitimate end of character stream.
    read_exact(in, ptext->text, size, "utf8 decoding error");

    ptext->text[size] = 0;

    return offs;
}

/// Reads `len` values with `read_value` and builds a tree from them: the
/// first len/2 values form the left subtree, the next value is the root and
/// the remaining values form the right subtree.
template<class V>
Namespace<V> PgfReader::read_namespace(ref<V> (PgfReader::*read_value)(), size_t len)
{
    if (len == 0)
        return 0;

    size_t half = len/2;
    Namespace<V> left  = read_namespace(read_value, half);
    ref<V> value = (this->*read_value)();
    Namespace<V> right = read_namespace(read_value, len-half-1);

    Namespace<V> node = Node<ref<V>>::new_node(value);
    node->sz    = 1+Node<ref<V>>::size(left)+Node<ref<V>>::size(right);
    node->left  = left;
    node->right = right;
    return node;
}

/// Reads a length followed by that many values into a namespace.
template<class V>
Namespace<V> PgfReader::read_namespace(ref<V> (PgfReader::*read_value)())
{
    size_t len = read_len();
    return read_namespace(read_value, len);
}

/// Reads a length followed by that many values, releasing each value right
/// after it is read. Nothing is stored.
template<class V>
void PgfReader::merge_namespace(ref<V> (PgfReader::*read_value)())
{
    size_t len = read_len();
    for (size_t i = 0; i < len; i++) {
        ref<V> value = (this->*read_value)();
        V::release(value);
    }
}

/// Reads a length, allocates a `C` whose vector member `field` has that many
/// elements, and fills each element with `read_value`.
template<class C, class V>
ref<C> PgfReader::read_vector(Vector<V> C::* field, void (PgfReader::*read_value)(ref<V> val))
{
    size_t len = read_len();
    ref<C> loc = vector_new(field,len);
    for (size_t i = 0; i < len; i++) {
        (this->*read_value)(vector_elem(ref<Vector<V>>::from_ptr(&(loc->*field)),i));
    }
    return loc;
}

/// Reads a length, allocates a vector of that many elements, and fills each
/// element with `read_value`.
template<class V>
ref<Vector<V>> PgfReader::read_vector(void (PgfReader::*read_value)(ref<V> val))
{
    size_t len = read_len();
    ref<Vector<V>> vec = vector_new<V>(len);
    for (size_t i = 0; i < len; i++) {
        (this->*read_value)(vector_elem(vec,i));
    }
    return vec;
}

/// Reads a tagged literal (string, integer or float).
///
/// @throws pgf_error on an unknown tag.
PgfLiteral PgfReader::read_literal()
{
    PgfLiteral lit = 0;

    uint8_t tag = read_tag();
    switch (tag) {
    case PgfLiteralStr::tag: {
        ref<PgfLiteralStr> lit_str = read_text(&PgfLiteralStr::val);
        lit = lit_str.tagged();
        break;
    }
    case PgfLiteralInt::tag: {
        // The integer is stored as `size` words, each read with read_uint().
        size_t size = read_len();
        ref<PgfLiteralInt> lit_int =
            PgfDB::malloc<PgfLiteralInt>(sizeof(uintmax_t)*size);
        lit_int->size = size;
        for (size_t i = 0; i < size; i++) {
            lit_int->val[i] = (uintmax_t) read_uint();
        }
        lit = lit_int.tagged();
        break;
    }
    case PgfLiteralFlt::tag: {
        ref<PgfLiteralFlt> lit_flt = current_db->malloc<PgfLiteralFlt>();
        lit_flt->val = read_double();
        lit = lit_flt.tagged();
        break;
    }
    default:
        throw pgf_error("Unknown literal tag");
    }
    return lit;
}

/// Reads a flag: a name followed by a literal value.
ref<PgfFlag> PgfReader::read_flag()
{
    ref<PgfFlag> flag = read_name(&PgfFlag::name);
    flag->value = read_literal();
    return flag;
}

/// Reads a tagged expression, recursing into sub-expressions.
///
/// @throws pgf_error on an unknown tag.
PgfExpr PgfReader::read_expr()
{
    PgfExpr expr = 0;

    uint8_t tag = read_tag();
    switch (tag) {
    case PgfExprAbs::tag: {
        PgfBindType bind_type = (PgfBindType) read_tag();
        ref<PgfExprAbs> eabs = read_name(&PgfExprAbs::name);
        eabs->bind_type = bind_type;
        PgfExpr body = read_expr();
        eabs->body = body;
        expr = eabs.tagged();
        break;
    }
    case PgfExprApp::tag: {
        PgfExpr fun = read_expr();
        PgfExpr arg = read_expr();
        ref<PgfExprApp> eapp = PgfDB::malloc<PgfExprApp>();
        eapp->fun = fun;
        eapp->arg = arg;
        expr = eapp.tagged();
        break;
    }
    case PgfExprLit::tag: {
        PgfExpr lit = read_literal();
        ref<PgfExprLit> elit = PgfDB::malloc<PgfExprLit>();
        elit->lit = lit;
        expr = elit.tagged();
        break;
    }
    case PgfExprMeta::tag: {
        ref<PgfExprMeta> emeta = PgfDB::malloc<PgfExprMeta>();
        emeta->id = read_int();
        expr = emeta.tagged();
        break;
    }
    case PgfExprFun::tag: {
        ref<PgfExprFun> efun = read_name(&PgfExprFun::name);
        expr = efun.tagged();
        break;
    }
    case PgfExprVar::tag: {
        ref<PgfExprVar> evar = PgfDB::malloc<PgfExprVar>();
        evar->var = read_int();
        expr = evar.tagged();
        break;
    }
    case PgfExprTyped::tag: {
        // The sub-expression must not be named `expr`: that would shadow the
        // result variable and the function would return 0 for this case.
        auto sub_expr = read_expr();
        auto type = read_type();
        ref<PgfExprTyped> etyped = PgfDB::malloc<PgfExprTyped>();
        etyped->expr = sub_expr;
        etyped->type = type.as_object();
        expr = etyped.tagged();
        break;
    }
    case PgfExprImplArg::tag: {
        // See the note on shadowing in the PgfExprTyped case.
        auto sub_expr = read_expr();
        ref<PgfExprImplArg> eimpl = current_db->malloc<PgfExprImplArg>();
        eimpl->expr = sub_expr;
        expr = eimpl.tagged();
        break;
    }
    default:
        throw pgf_error("Unknown expression tag");
    }

    return expr;
}

/// Reads a hypothesis (bind type, name, type) into `hypo`.
void PgfReader::read_hypo(ref<PgfHypo> hypo)
{
    hypo->bind_type = (PgfBindType) read_tag();
    auto cid = read_name();
    hypo->cid = cid;
    auto type = read_type();
    hypo->type = type;
}

/// Reads a type: a vector of hypotheses, a name and a vector of expressions.
ref<PgfDTyp> PgfReader::read_type()
{
    auto hypos = read_vector(&PgfReader::read_hypo);
    ref<PgfDTyp> tp = read_name(&PgfDTyp::name);
    tp->hypos = hypos;
    auto exprs = read_vector<PgfExpr>(&PgfReader::read_expr);
    tp->exprs = exprs;
    return tp;
}

/// Reads an abstract function (name, type, arity, bytecode marker and
/// probability) without registering it anywhere.
///
/// @throws pgf_error if the bytecode marker is neither 0 nor 1.
ref<PgfAbsFun> PgfReader::read_absfun_only()
{
    ref<PgfAbsFun> absfun = read_name(&PgfAbsFun::name);
    auto type = read_type();
    absfun->type = type;
    absfun->arity = read_int();

    uint8_t tag = read_tag();
    switch (tag) {
    case 0:
        absfun->bytecode = 0;
        break;
    case 1: {
        // A length is consumed and a zero-sized allocation is stored, so the
        // field is non-null.
        // NOTE(review): the element type of the allocation was not visible
        // in the reviewed text; `char` is assumed — confirm.
        read_len();
        auto dummy = PgfDB::malloc<char>(0);
        absfun->bytecode = dummy;
        break;
    }
    default:
        throw pgf_error("Unknown tag, 0 or 1 expected");
    }

    absfun->prob = read_prob(&absfun->name);
    return absfun;
}

/// Reads an abstract function and inserts it into abstract->funs_by_cat.
ref<PgfAbsFun> PgfReader::read_absfun()
{
    ref<PgfAbsFun> absfun = read_absfun_only();

    PgfProbspace funs_by_cat =
        probspace_insert(abstract->funs_by_cat, absfun);
    abstract->funs_by_cat = funs_by_cat;

    return absfun;
}

/// Reads an abstract function while merging and checks that a function with
/// the same name already exists in abstract->funs.
///
/// @throws pgf_error if the function is not already present.
ref<PgfAbsFun> PgfReader::merge_absfun()
{
    ref<PgfAbsFun> absfun = read_absfun_only();

    if (namespace_lookup(abstract->funs, &absfun->name) == 0) {
        throw pgf_error("The set of abstract functions is merged PGFs must be the same");
    }

    return absfun;
}

/// Reads an abstract category: name, context hypotheses and probability.
ref<PgfAbsCat> PgfReader::read_abscat()
{
    ref<PgfAbsCat> abscat = read_name(&PgfAbsCat::name);
    auto context = read_vector(&PgfReader::read_hypo);
    abscat->context = context;
    abscat->prob = read_prob(&abscat->name);
    return abscat;
}

/// Per-category bookkeeping used by read_abstract() to fill in the
/// probabilities of functions whose probability is NaN.
struct PGF_INTERNAL_DECL PgfAbsCatCounts
{
    PgfText *name;       // category name; the search key in find_counts()
    size_t n_nan_probs;  // number of functions in the category with NaN prob
    double probs_sum;    // sum of exp(-prob) over the other functions
    prob_t prob;         // value assigned to each NaN function
};

/// Binary search for the entry named `name` in `cats`, comparing with
/// textcmp(). Returns NULL if there is no such entry (including when `cats`
/// is empty).
/// NOTE(review): assumes `cats` is ordered consistently with textcmp(), as
/// the name namespace_to_sorted_names() suggests — confirm.
static
PgfAbsCatCounts *find_counts(Vector<PgfAbsCatCounts> *cats, PgfText *name)
{
    // Half-open interval [lo, hi): never needs hi = mid-1, which would wrap
    // around for an unsigned index when mid == 0.
    size_t lo = 0;
    size_t hi = cats->len;
    while (lo < hi) {
        size_t mid = lo + (hi-lo)/2;
        PgfAbsCatCounts *counts = &cats->data[mid];
        int cmp = textcmp(name, counts->name);
        if (cmp < 0) {
            hi = mid;
        } else if (cmp > 0) {
            lo = mid+1;
        } else {
            return counts;
        }
    }

    return NULL;
}

/// Reads the abstract syntax (name, flags, functions, categories) into
/// `abstract`. If a probabilities callback is installed, functions whose
/// probability is NaN afterwards share what is left of their category's
/// probability mass equally.
void PgfReader::read_abstract(ref<PgfAbstr> abstract)
{
    this->abstract = abstract;
    abstract->funs_by_cat = 0;

    auto name = read_name();
    auto aflags = read_namespace(&PgfReader::read_flag);
    auto funs = read_namespace(&PgfReader::read_absfun);
    auto cats = read_namespace(&PgfReader::read_abscat);
    abstract->name = name;
    abstract->aflags = aflags;
    abstract->funs = funs;
    abstract->cats = cats;

    if (probs_callback != NULL) {
        // NOTE(review): the template arguments of this call were not visible
        // in the reviewed text; <PgfAbsCat,PgfAbsCatCounts> is assumed.
        Vector<PgfAbsCatCounts> *cats =
            namespace_to_sorted_names<PgfAbsCat,PgfAbsCatCounts>(abstract->cats);

        // Pass 1: per category, count the NaN probabilities and sum the
        // probabilities that are known.
        std::function<bool(ref<PgfAbsFun>)> collect_counts =
            [cats](ref<PgfAbsFun> absfun) {
                PgfAbsCatCounts *counts =
                    find_counts(cats, &absfun->type->name);
                if (counts != NULL) {
                    if (isnan(absfun->prob)) {
                        counts->n_nan_probs++;
                    } else {
                        counts->probs_sum += exp(-absfun->prob);
                    }
                }
                return true;
            };
        namespace_iter(abstract->funs, collect_counts);

        // Pass 2: split the remaining mass equally among the NaN functions.
        // For a category without NaN functions the result is never read.
        for (size_t i = 0; i < cats->len; i++) {
            PgfAbsCatCounts *counts = &cats->data[i];
            counts->prob = - logf((1-counts->probs_sum) / counts->n_nan_probs);
        }

        // Pass 3: write the computed value into every NaN function.
        std::function<bool(ref<PgfAbsFun>)> pad_probs =
            [cats](ref<PgfAbsFun> absfun) {
                if (isnan(absfun->prob)) {
                    PgfAbsCatCounts *counts =
                        find_counts(cats, &absfun->type->name);
                    if (counts != NULL) {
                        absfun->prob = counts->prob;
                    }
                }
                return true;
            };
        namespace_iter(abstract->funs, pad_probs);

        free(cats);
    }
}

/// Reads an abstract syntax from the input and checks it against the already
/// loaded `abstract`: the names must be equal and every function read must
/// already exist. The flags, functions and categories read are released.
///
/// @throws pgf_error if the names differ or a function is missing.
void PgfReader::merge_abstract(ref<PgfAbstr> abstract)
{
    this->abstract = abstract;

    ref<PgfText> name = read_name();
    int cmp = textcmp(&(*abstract->name), &(*name));
    text_db_release(name);
    if (cmp != 0)
        throw pgf_error("The abstract syntax names doesn't match");

    merge_namespace(&PgfReader::read_flag);
    merge_namespace(&PgfReader::merge_absfun);
    merge_namespace(&PgfReader::read_abscat);
}

/// Reads a linear parameter: a constant `i0` and `n_terms` (factor, var)
/// pairs stored inline after the struct.
ref<PgfLParam> PgfReader::read_lparam()
{
    size_t i0 = read_int();
    size_t n_terms = read_len();
    ref<PgfLParam> lparam =
        PgfDB::malloc<PgfLParam>(n_terms*sizeof(PgfLParam::terms[0]));
    lparam->i0 = i0;
    lparam->n_terms = n_terms;

    for (size_t i = 0; i < n_terms; i++) {
        lparam->terms[i].factor = read_int();
        lparam->terms[i].var    = read_int();
    }

    return lparam;
}

/// Reads a variable index and its range into `var_info`.
void PgfReader::read_variable_range(ref<PgfVariableRange> var_info)
{
    var_info->var   = read_int();
    var_info->range = read_int();
}

/// Reads one production argument (a single linear parameter) into `parg`.
void PgfReader::read_parg(ref<PgfPArg> parg)
{
    auto param = read_lparam();
    parg->param = param;
}

/// Reads a production result: an optional vector of variable ranges (left
/// as 0 when the count is zero) followed by an inline linear parameter.
ref<PgfPResult> PgfReader::read_presult()
{
    ref<Vector<PgfVariableRange>> vars = 0;
    size_t n_vars = read_len();
    if (n_vars > 0) {
        vars = vector_new<PgfVariableRange>(n_vars);
        for (size_t i = 0; i < n_vars; i++) {
            read_variable_range(vector_elem(vars,i));
        }
    }

    size_t i0 = read_int();
    size_t n_terms = read_len();
    ref<PgfPResult> res =
        PgfDB::malloc<PgfPResult>(n_terms*sizeof(PgfLParam::terms[0]));
    res->vars = vars;
    res->param.i0 = i0;
    res->param.n_terms = n_terms;

    for (size_t i = 0; i < n_terms; i++) {
        res->param.terms[i].factor = read_int();
        res->param.terms[i].var    = read_int();
    }

    return res;
}

/// Reads an indexed symbol of type `I`: an argument index `d` followed by
/// an inline linear parameter `r`.
template<class I>
ref<I> PgfReader::read_symbol_idx()
{
    size_t d = read_int();
    size_t i0 = read_int();
    size_t n_terms = read_len();
    ref<I> sym_idx =
        PgfDB::malloc<I>(n_terms*sizeof(PgfLParam::terms[0]));
    sym_idx->d = d;
    sym_idx->r.i0 = i0;
    sym_idx->r.n_terms = n_terms;

    for (size_t i = 0; i < n_terms; i++) {
        sym_idx->r.terms[i].factor = read_int();
        sym_idx->r.terms[i].var    = read_int();
    }

    return sym_idx;
}

/// Reads a tagged symbol. Symbols that carry no data are represented by a
/// tagged null reference.
///
/// @throws pgf_error on an unknown tag.
PgfSymbol PgfReader::read_symbol()
{
    PgfSymbol sym = 0;

    uint8_t tag = read_tag();
    switch (tag) {
    case PgfSymbolCat::tag: {
        ref<PgfSymbolCat> sym_cat = read_symbol_idx<PgfSymbolCat>();
        sym = sym_cat.tagged();
        break;
    }
    case PgfSymbolLit::tag: {
        ref<PgfSymbolLit> sym_lit = read_symbol_idx<PgfSymbolLit>();
        sym = sym_lit.tagged();
        break;
    }
    case PgfSymbolVar::tag: {
        ref<PgfSymbolVar> sym_var = PgfDB::malloc<PgfSymbolVar>();
        sym_var->d = read_int();
        sym_var->r = read_int();
        sym = sym_var.tagged();
        break;
    }
    case PgfSymbolKS::tag: {
        ref<PgfSymbolKS> sym_ks = read_text(&PgfSymbolKS::token);
        sym = sym_ks.tagged();
        break;
    }
    case PgfSymbolKP::tag: {
        size_t n_alts = read_len();
        ref<PgfSymbolKP> sym_kp =
            PgfDB::malloc<PgfSymbolKP>(n_alts*sizeof(PgfAlternative));
        sym_kp->alts.len = n_alts;

        for (size_t i = 0; i < n_alts; i++) {
            auto form     = read_seq();
            auto prefixes = read_vector(&PgfReader::read_text2);

            sym_kp->alts.data[i].form     = form;
            sym_kp->alts.data[i].prefixes = prefixes;
        }

        auto default_form = read_seq();
        sym_kp->default_form = default_form;

        sym = sym_kp.tagged();
        break;
    }
    case PgfSymbolBIND::tag: {
        sym = ref<PgfSymbolBIND>(0).tagged();
        break;
    }
    case PgfSymbolSOFTBIND::tag: {
        sym = ref<PgfSymbolSOFTBIND>(0).tagged();
        break;
    }
    case PgfSymbolNE::tag: {
        sym = ref<PgfSymbolNE>(0).tagged();
        break;
    }
    case PgfSymbolSOFTSPACE::tag: {
        sym = ref<PgfSymbolSOFTSPACE>(0).tagged();
        break;
    }
    case PgfSymbolCAPIT::tag: {
        sym = ref<PgfSymbolCAPIT>(0).tagged();
        break;
    }
    case PgfSymbolALLCAPIT::tag: {
        sym = ref<PgfSymbolALLCAPIT>(0).tagged();
        break;
    }
    default:
        throw pgf_error("Unknown symbol tag");
    }

    return sym;
}

/// Reads a sequence: a length followed by that many symbols.
ref<PgfSequence> PgfReader::read_seq()
{
    size_t n_syms = read_len();

    ref<PgfSequence> seq =
        PgfDB::malloc<PgfSequence>(n_syms*sizeof(PgfSymbol));
    seq->syms.len = n_syms;

    for (size_t i = 0; i < n_syms; i++) {
        PgfSymbol sym = read_symbol();
        *vector_elem(&seq->syms,i) = sym;
    }

    return seq;
}

/// Reads a vector of sequence ids and resolves each one to a sequence with
/// phrasetable_relink() on the current concrete syntax's phrase table.
///
/// @throws pgf_error if an id does not resolve to a sequence.
ref<Vector<ref<PgfSequence>>> PgfReader::read_seq_ids(ref<PgfConcrLincat> lincat, object container)
{
    size_t len = read_len();
    ref<Vector<ref<PgfSequence>>> vec = vector_new<ref<PgfSequence>>(len);
    for (size_t i = 0; i < len; i++) {
        size_t seq_id = read_len();
        ref<PgfSequence> seq =
            phrasetable_relink(concrete->phrasetable,
                               lincat, container, i,
                               seq_id);
        if (seq == 0) {
            throw pgf_error("Invalid sequence id");
        }
        *vector_elem(vec,i) = seq;
    }
    return vec;
}

/// Reads `len` sequences into a phrase table tree, using the same
/// left-half / root / right-half layout as read_namespace(). Every entry
/// starts with no back references.
PgfPhrasetable PgfReader::read_phrasetable(size_t len)
{
    if (len == 0)
        return 0;

    PgfPhrasetableEntry value;

    size_t half = len/2;
    PgfPhrasetable left  = read_phrasetable(half);
    value.seq = read_seq();
    value.backrefs = 0;
    PgfPhrasetable right = read_phrasetable(len-half-1);

    PgfPhrasetable table = Node<PgfPhrasetableEntry>::new_node(value);
    table->sz    = 1+Node<PgfPhrasetableEntry>::size(left)
                    +Node<PgfPhrasetableEntry>::size(right);
    table->left  = left;
    table->right = right;
    return table;
}

/// Reads a length followed by a phrase table of that many entries.
PgfPhrasetable PgfReader::read_phrasetable()
{
    size_t len = read_len();
    return read_phrasetable(len);
}

/// Reads a linearization category and links it to the abstract category of
/// the same name. The result of that lookup is stored without being checked.
ref<PgfConcrLincat> PgfReader::read_lincat()
{
    ref<PgfConcrLincat> lincat = read_name(&PgfConcrLincat::name);
    auto fields = read_lincat_fields(lincat);
    auto n_lindefs = read_len();
    auto args = read_vector(&PgfReader::read_parg);
    auto res  = read_vector(&PgfReader::read_presult2);
    auto seqs = read_seq_ids(0, lincat.tagged());

    lincat->abscat = namespace_lookup(abstract->cats, &lincat->name);
    lincat->fields = fields;
    lincat->n_lindefs = n_lindefs;
    lincat->args = args;
    lincat->res  = res;
    lincat->seqs = seqs;

    return lincat;
}

/// Reads the field names of a linearization category. `lincat` is not used.
ref<Vector<ref<PgfText>>> PgfReader::read_lincat_fields(ref<PgfConcrLincat> lincat)
{
    size_t len = read_len();
    ref<Vector<ref<PgfText>>> fields = vector_new<ref<PgfText>>(len);
    for (size_t i = 0; i < len; i++) {
        auto name = read_text();
        *vector_elem(fields,i) = name;
    }
    return fields;
}

/// Reads a linearization and links it to the abstract function of the same
/// name and to the lincat of that function's result category.
///
/// @throws pgf_error if the function or the lincat cannot be found.
ref<PgfConcrLin> PgfReader::read_lin()
{
    ref<PgfConcrLin> lin = read_name(&PgfConcrLin::name);

    lin->absfun = namespace_lookup(abstract->funs, &lin->name);
    if (lin->absfun == 0)
        throw pgf_error("Found a lin without a fun");

    lin->lincat = namespace_lookup(concrete->lincats, &lin->absfun->type->name);
    if (lin->lincat == 0)
        throw pgf_error("Found a lin which uses a category without a lincat");

    auto args = read_vector(&PgfReader::read_parg);
    auto res  = read_vector(&PgfReader::read_presult2);
    auto seqs = read_seq_ids(lin->lincat, lin.tagged());

    lin->args = args;
    lin->res  = res;
    lin->seqs = seqs;

    return lin;
}

/// Reads a print name: the name it belongs to followed by its text.
ref<PgfConcrPrintname> PgfReader::read_printname()
{
    ref<PgfConcrPrintname> printname = read_name(&PgfConcrPrintname::name);
    printname->printname = read_text();
    return printname;
}

/// Reads a concrete syntax (flags, phrase table, lincats, lins, print names)
/// and then builds its LR table. The phrase table is stored before the
/// lincats and lins are read because read_seq_ids() looks sequences up in it;
/// the lincats are stored before the lins because read_lin() looks them up.
ref<PgfConcr> PgfReader::read_concrete()
{
    concrete = read_name(&PgfConcr::name);

    auto cflags = read_namespace(&PgfReader::read_flag);
    concrete->cflags = cflags;

    auto phrasetable = read_phrasetable();
    concrete->phrasetable = phrasetable;

    auto lincats = read_namespace(&PgfReader::read_lincat);
    concrete->lincats = lincats;

    auto lins = read_namespace(&PgfReader::read_lin);
    concrete->lins = lins;

    auto printnames = read_namespace(&PgfReader::read_printname);
    concrete->printnames = printnames;

    PgfLRTableMaker maker(abstract, concrete);
    concrete->lrtable = maker.make();

    return concrete;
}

/// Reads a complete PGF: format version, global flags, the abstract syntax
/// and all concrete syntaxes.
///
/// @throws pgf_error if the format version is not the supported one.
ref<PgfPGF> PgfReader::read_pgf()
{
    ref<PgfPGF> pgf = PgfDB::malloc<PgfPGF>();

    pgf->major_version = read_u16be();
    pgf->minor_version = read_u16be();

    if (pgf->major_version != PGF_MAJOR_VERSION ||
        pgf->minor_version != PGF_MINOR_VERSION) {
        throw pgf_error("Unsupported format version");
    }

    auto gflags = read_namespace(&PgfReader::read_flag);
    pgf->gflags = gflags;

    read_abstract(ref<PgfAbstr>::from_ptr(&pgf->abstract));

    auto concretes = read_namespace(&PgfReader::read_concrete);
    pgf->concretes = concretes;

    return pgf;
}

/// Reads another PGF from the input and merges its concrete syntaxes into
/// the already loaded `pgf`. The abstract syntax read must match the loaded
/// one (see merge_abstract()).
///
/// @throws pgf_error if the version of the file being merged is unsupported,
///         or if a concrete syntax cannot be inserted into pgf->concretes.
void PgfReader::merge_pgf(ref<PgfPGF> pgf)
{
    uint16_t major_version = read_u16be();
    uint16_t minor_version = read_u16be();

    // Validate the version of the file being merged, not the version already
    // stored in `pgf`, which was checked when `pgf` itself was read.
    if (major_version != PGF_MAJOR_VERSION ||
        minor_version != PGF_MINOR_VERSION) {
        throw pgf_error("Unsupported format version");
    }

    merge_namespace(&PgfReader::read_flag); // ??

    merge_abstract(ref<PgfAbstr>::from_ptr(&pgf->abstract));

    size_t len = read_len();
    for (size_t i = 0; i < len; i++) {
        ref<PgfConcr> concr = PgfReader::read_concrete();
        Namespace<PgfConcr> concretes =
            namespace_insert(pgf->concretes, concr);
        if (concretes == 0)
            throw pgf_error("One and the same concrete syntax is included in several PGF files");
        pgf->concretes = concretes;
    }
}