#include "data.h"
#include "reader.h"
// NOTE(review): in the copy of this file that was reviewed, the two system
// header names and every template argument list had been stripped (text
// between angle brackets was missing). They are restored below from how the
// names are used. The following could NOT be read off the code itself and
// must be confirmed against data.h / reader.h:
//   - type names PgfHypo, PgfAbstr, PgfVariableRange, PgfPArg, PgfPResult,
//     PgfPGF, and the element type `char` used for PgfAbsFun::bytecode;
//   - the names and order of the template parameters (V, C, I).
#include <math.h>
#include <string.h>
#include <string>

// Reads exactly n_bytes from `in` into dst.
// Throws pgf_error(eof_msg) if the stream ends first, and a generic read
// error otherwise. A zero-length request is a no-op.
static void read_bytes_or_throw(FILE *in, void *dst, size_t n_bytes,
                                const char *eof_msg)
{
    // fread with a zero size returns 0 without reading anything, so an empty
    // request must not be mistaken for a short read.
    if (n_bytes == 0)
        return;

    if (fread(dst, n_bytes, 1, in) == 1)
        return;

    if (feof(in))
        throw pgf_error(eof_msg);
    throw pgf_error("an error occured while reading the grammar");
}

// Binds the reader to an already opened stream; no abstract syntax is
// associated with the reader until read_abstract() is called.
PgfReader::PgfReader(FILE *in)
{
    this->in = in;
    this->abstract = 0;
}

// Reads one byte. Throws pgf_error on EOF or on a stream error.
uint8_t PgfReader::read_uint8()
{
    uint8_t b;
    read_bytes_or_throw(in, &b, sizeof(b),
                        "reached end of file while reading the grammar");
    return b;
}

// Reads a 16-bit big-endian unsigned integer.
uint16_t PgfReader::read_u16be()
{
    uint8_t buf[2];
    read_bytes_or_throw(in, buf, sizeof(buf),
                        "reached end of file while reading a grammar");
    return (((uint16_t) buf[0]) << 8 | buf[1]);
}

// Reads a 64-bit big-endian unsigned integer.
uint64_t PgfReader::read_u64be()
{
    uint8_t buf[8];
    read_bytes_or_throw(in, buf, sizeof(buf),
                        "reached end of file while reading a grammar");
    return (((uint64_t) buf[0]) << 56 |
            ((uint64_t) buf[1]) << 48 |
            ((uint64_t) buf[2]) << 40 |
            ((uint64_t) buf[3]) << 32 |
            ((uint64_t) buf[4]) << 24 |
            ((uint64_t) buf[5]) << 16 |
            ((uint64_t) buf[6]) << 8  |
            ((uint64_t) buf[7]));
}

// Reads a double stored as a big-endian 64-bit word and decodes it field by
// field (1 sign bit, 11 exponent bits, 52 mantissa bits) instead of
// reinterpreting the bytes.
double PgfReader::read_double()
{
    uint64_t u = read_u64be();

    bool sign = u >> 63;
    // Kept signed so that the bias subtraction below is ordinary signed
    // arithmetic rather than an unsigned wrap-around converted to int.
    int rawexp = (int) (u >> 52 & 0x7ff);
    uint64_t mantissa = u & 0xfffffffffffff;
    double ret;

    if (rawexp == 0x7ff) {
        ret = (mantissa == 0) ? INFINITY : NAN;
    } else {
        // A non-zero exponent gets the implicit leading 1; a zero exponent
        // has none, and the mantissa is doubled to compensate for the
        // exponent being one lower than the denormal scale.
        uint64_t m = rawexp ? 1ULL << 52 | mantissa : mantissa << 1;
        ret = ldexp((double) m, rawexp - 1075);
    }
    return sign ? copysign(ret, -1.0) : ret;
}

// Reads a variable-length unsigned integer: 7 payload bits per byte, least
// significant group first, with the high bit of each byte set when another
// byte follows.
// Throws pgf_error if the encoding runs past what fits in 64 bits.
uint64_t PgfReader::read_uint()
{
    uint64_t u = 0;
    int shift = 0;
    uint8_t b = 0;
    do {
        // Ten groups of 7 bits already cover 64 bits; shifting by 64 or more
        // would be undefined.
        if (shift >= 64)
            throw pgf_error("integer overflow while reading the grammar");

        b = read_uint8();
        // Widen BEFORE shifting: `b & 0x7f` alone is an int, and shifting an
        // int by 32 or more loses the payload.
        u |= ((uint64_t) (b & 0x7f)) << shift;
        shift += 7;
    } while (b & 0x80);
    return u;
}

// Allocates struct_size + a PgfText big enough for the name, reads a
// length-prefixed sequence of raw bytes into the PgfText that sits at offset
// struct_size, NUL-terminates it and returns the offset of the allocation.
object PgfReader::read_name_internal(size_t struct_size)
{
    size_t size = read_len();
    object offs = current_db->malloc_internal(struct_size+sizeof(PgfText)+size+1);
    PgfText* ptext = (PgfText*) (current_base+offs+struct_size);
    ptext->size = size;

    // If reading the extra bytes causes EOF, it is an encoding
    // error, not a legitimate end of character stream.
    read_bytes_or_throw(in, ptext->text, size, "utf8 decoding error");

    ptext->text[size] = 0;
    return offs;
}

// Like read_name_internal, but the length prefix counts characters rather
// than bytes: each iteration reads one lead byte plus the continuation bytes
// that the lead byte announces.
// Throws pgf_error("utf8 decoding error") on an invalid lead byte or when the
// stream ends in the middle of the text.
object PgfReader::read_text_internal(size_t struct_size)
{
    size_t n_chars = read_len();

    // The character count comes from the file, so the staging buffer grows
    // with the bytes actually read instead of being sized up front.
    std::string buf;

    for (size_t i = 0; i < n_chars; i++) {
        uint8_t c = read_uint8();
        buf.push_back((char) c);

        if (c < 0x80) {
            continue;
        }
        if (c < 0xc2) {
            throw pgf_error("utf8 decoding error");
        }

        size_t n_extra = (c < 0xe0 ? 1 :
                          c < 0xf0 ? 2 :
                          c < 0xf8 ? 3 :
                          c < 0xfc ? 4 :
                                     5
                          );

        // If reading the extra bytes causes EOF, it is an encoding
        // error, not a legitimate end of character stream.
        char extra[5];
        read_bytes_or_throw(in, extra, n_extra, "utf8 decoding error");
        buf.append(extra, n_extra);
    }

    size_t size = buf.size();

    object offs = current_db->malloc_internal(struct_size+sizeof(PgfText)+size+1);
    PgfText* ptext = (PgfText*) (current_base+offs+struct_size);
    ptext->size = size;
    // c_str() is NUL-terminated, so size+1 bytes also copy the terminator.
    memcpy(ptext->text, buf.c_str(), size+1);

    return offs;
}

// Reads `len` values with read_value and arranges them into a tree: the first
// half of the entries becomes the left subtree, the next entry the node value
// and the remainder the right subtree. Returns 0 for an empty range.
template<class V>
Namespace<V> PgfReader::read_namespace(ref<V> (PgfReader::*read_value)(), size_t len)
{
    if (len == 0)
        return 0;

    size_t half = len/2;
    Namespace<V> left  = read_namespace(read_value, half);
    ref<V> value = (this->*read_value)();
    Namespace<V> right = read_namespace(read_value, len-half-1);

    return Node<V>::new_node(value, left, right);
}

// Reads a length prefix and then a namespace of that many values.
template<class V>
Namespace<V> PgfReader::read_namespace(ref<V> (PgfReader::*read_value)())
{
    size_t len = read_len();
    return read_namespace(read_value, len);
}

// Reads a length-prefixed vector that is stored inline as member `field` of a
// freshly allocated C; read_value fills each element in place.
template<class C, class V>
ref<C> PgfReader::read_vector(Vector<V> C::* field, void (PgfReader::*read_value)(ref<V> val))
{
    size_t len = read_len();
    ref<C> loc = vector_new(field,len);
    for (size_t i = 0; i < len; i++) {
        (this->*read_value)(vector_elem(ref<Vector<V>>::from_ptr(&(loc->*field)),i));
    }
    return loc;
}

// Reads a length-prefixed stand-alone vector; read_value fills each element
// in place.
template<class V>
ref<Vector<V>> PgfReader::read_vector(void (PgfReader::*read_value)(ref<V> val))
{
    size_t len = read_len();
    ref<Vector<V>> vec = vector_new<V>(len);
    for (size_t i = 0; i < len; i++) {
        (this->*read_value)(vector_elem(vec,i));
    }
    return vec;
}

// Reads a tagged literal (string, integer or float).
// Throws pgf_error on an unknown tag.
PgfLiteral PgfReader::read_literal()
{
    PgfLiteral lit = 0;

    uint8_t tag = read_tag();
    switch (tag) {
    case PgfLiteralStr::tag: {
        ref<PgfLiteralStr> lit_str = read_text(&PgfLiteralStr::val);
        lit = ref<PgfLiteralStr>::tagged(lit_str);
        break;
    }
    case PgfLiteralInt::tag: {
        // An integer literal is a length-prefixed array of words.
        size_t size = read_len();
        ref<PgfLiteralInt> lit_int =
            PgfDB::malloc<PgfLiteralInt>(sizeof(uintmax_t)*size);
        lit_int->size = size;
        for (size_t i = 0; i < size; i++) {
            lit_int->val[i] = (uintmax_t) read_uint();
        }
        lit = ref<PgfLiteralInt>::tagged(lit_int);
        break;
    }
    case PgfLiteralFlt::tag: {
        ref<PgfLiteralFlt> lit_flt = current_db->malloc<PgfLiteralFlt>();
        lit_flt->val = read_double();
        lit = ref<PgfLiteralFlt>::tagged(lit_flt);
        break;
    }
    default:
        throw pgf_error("Unknown literal tag");
    }
    return lit;
}

// Reads a flag: its name followed by a literal value.
ref<PgfFlag> PgfReader::read_flag()
{
    ref<PgfFlag> flag = read_name(&PgfFlag::name);
    flag->ref_count = 1;
    flag->value = read_literal();
    return flag;
}

// Reads a tagged expression, recursing into sub-expressions.
// Throws pgf_error on an unknown tag.
PgfExpr PgfReader::read_expr()
{
    PgfExpr expr = 0;
    uint8_t tag = read_tag();

    switch (tag) {
    case PgfExprAbs::tag: {
        // The bind type precedes the name in the stream but the node is only
        // allocated by read_name, so it is held in a local first.
        PgfBindType bind_type = (PgfBindType) read_tag();
        ref<PgfExprAbs> eabs = read_name(&PgfExprAbs::name);
        eabs->bind_type = bind_type;
        eabs->body = read_expr();
        expr = ref<PgfExprAbs>::tagged(eabs);
        break;
    }
    case PgfExprApp::tag: {
        ref<PgfExprApp> eapp = PgfDB::malloc<PgfExprApp>();
        eapp->fun = read_expr();
        eapp->arg = read_expr();
        expr = ref<PgfExprApp>::tagged(eapp);
        break;
    }
    case PgfExprLit::tag: {
        ref<PgfExprLit> elit = PgfDB::malloc<PgfExprLit>();
        elit->lit = read_literal();
        expr = ref<PgfExprLit>::tagged(elit);
        break;
    }
    case PgfExprMeta::tag: {
        ref<PgfExprMeta> emeta = PgfDB::malloc<PgfExprMeta>();
        emeta->id = read_int();
        expr = ref<PgfExprMeta>::tagged(emeta);
        break;
    }
    case PgfExprFun::tag: {
        ref<PgfExprFun> efun = read_name(&PgfExprFun::name);
        expr = ref<PgfExprFun>::tagged(efun);
        break;
    }
    case PgfExprVar::tag: {
        ref<PgfExprVar> evar = PgfDB::malloc<PgfExprVar>();
        evar->var = read_int();
        expr = ref<PgfExprVar>::tagged(evar);
        break;
    }
    case PgfExprTyped::tag: {
        ref<PgfExprTyped> etyped = PgfDB::malloc<PgfExprTyped>();
        etyped->expr = read_expr();
        etyped->type = read_type();
        expr = ref<PgfExprTyped>::tagged(etyped);
        break;
    }
    case PgfExprImplArg::tag: {
        ref<PgfExprImplArg> eimpl = current_db->malloc<PgfExprImplArg>();
        eimpl->expr = read_expr();
        expr = ref<PgfExprImplArg>::tagged(eimpl);
        break;
    }
    default:
        throw pgf_error("Unknown expression tag");
    }

    return expr;
}

// Fills one hypothesis in place: bind type, variable name, type.
void PgfReader::read_hypo(ref<PgfHypo> hypo)
{
    hypo->bind_type = (PgfBindType) read_tag();
    hypo->cid = read_name();
    hypo->type = read_type();
}

// Reads a type: a vector of hypotheses, the category name, then a vector of
// argument expressions.
ref<PgfDTyp> PgfReader::read_type()
{
    // The hypotheses come first in the stream, before the node that owns
    // them exists; they are attached once read_name has allocated it.
    ref<Vector<PgfHypo>> hypos = read_vector(&PgfReader::read_hypo);
    ref<PgfDTyp> tp = read_name(&PgfDTyp::name);
    tp->hypos = hypos;
    tp->exprs = read_vector(&PgfReader::read_expr);
    return tp;
}

// Reads an abstract function: name, type, arity, an optional bytecode
// section and a probability that is stored as its negative logarithm.
// Throws pgf_error if the bytecode tag is neither 0 nor 1.
ref<PgfAbsFun> PgfReader::read_absfun()
{
    ref<PgfAbsFun> absfun = read_name(&PgfAbsFun::name);
    absfun->ref_count = 1;
    // Not used further in this function; kept as in the original.
    ref<PgfExprFun> efun =
        ref<PgfExprFun>::from_ptr((PgfExprFun*) &absfun->name);
    absfun->type = read_type();
    absfun->arity = read_int();

    uint8_t tag = read_tag();
    switch (tag) {
    case 0:
        absfun->bytecode = 0;
        break;
    case 1: {
        // Only the length prefix is consumed here and a zero-sized
        // allocation is stored as the bytecode.
        read_len();
        absfun->bytecode = PgfDB::malloc<char>(0);
        break;
    }
    default:
        throw pgf_error("Unknown tag, 0 or 1 expected");
    }
    absfun->prob = - log(read_double());
    return absfun;
}

// Reads an abstract category: name, context, a per-category function list
// (consumed but not stored) and a probability stored as its negative
// logarithm.
ref<PgfAbsCat> PgfReader::read_abscat()
{
    ref<PgfAbsCat> abscat = read_name(&PgfAbsCat::name);
    abscat->ref_count = 1;
    abscat->context = read_vector(&PgfReader::read_hypo);

    // for now we just read the set of functions per category and ignore them
    size_t n_funs = read_len();
    for (size_t i = 0; i < n_funs; i++) {
        read_double();
        read_name();
    }

    abscat->prob = - log(read_double());
    return abscat;
}

// Fills the abstract syntax in place and remembers it in this->abstract so
// that read_lincat/read_lin can look up categories and functions in it.
void PgfReader::read_abstract(ref<PgfAbstr> abstract)
{
    this->abstract = abstract;

    abstract->name = read_name();
    abstract->aflags = read_namespace(&PgfReader::read_flag);
    abstract->funs = read_namespace(&PgfReader::read_absfun);
    abstract->cats = read_namespace(&PgfReader::read_abscat);
}

// Reads a linear parameter: constant i0 followed by n_terms (factor, var)
// pairs stored in the trailing terms array.
ref<PgfLParam> PgfReader::read_lparam()
{
    size_t i0 = read_int();
    size_t n_terms = read_len();
    ref<PgfLParam> lparam =
        PgfDB::malloc<PgfLParam>(n_terms*sizeof(PgfLParam::terms[0]));
    lparam->i0 = i0;
    lparam->n_terms = n_terms;

    for (size_t i = 0; i < n_terms; i++) {
        lparam->terms[i].factor = read_int();
        lparam->terms[i].var = read_int();
    }

    return lparam;
}

// Fills one (var, range) pair in place.
void PgfReader::read_variable_range(ref<PgfVariableRange> var_info)
{
    var_info->var = read_int();
    var_info->range = read_int();
}

// Fills one production argument in place; it consists of a single lparam.
void PgfReader::read_parg(ref<PgfPArg> parg)
{
    parg->param = read_lparam();
}

// Reads a production result: an optional vector of variable ranges (left as
// 0 when the count is zero) followed by an inline linear parameter.
ref<PgfPResult> PgfReader::read_presult()
{
    ref<Vector<PgfVariableRange>> vars = 0;
    size_t n_vars = read_len();
    if (n_vars > 0) {
        vars = vector_new<PgfVariableRange>(n_vars);
        for (size_t i = 0; i < n_vars; i++) {
            read_variable_range(vector_elem(vars,i));
        }
    }

    size_t i0 = read_int();
    size_t n_terms = read_len();
    ref<PgfPResult> res =
        PgfDB::malloc<PgfPResult>(n_terms*sizeof(PgfLParam::terms[0]));
    res->vars = vars;
    res->param.i0 = i0;
    res->param.n_terms = n_terms;

    for (size_t i = 0; i < n_terms; i++) {
        res->param.terms[i].factor = read_int();
        res->param.terms[i].var = read_int();
    }

    return res;
}

// Reads a symbol of type I that carries an argument index `d` and an inline
// linear parameter `r`.
template<class I>
ref<I> PgfReader::read_symbol_idx()
{
    size_t d = read_int();
    size_t i0 = read_int();
    size_t n_terms = read_len();
    ref<I> sym_idx =
        PgfDB::malloc<I>(n_terms*sizeof(PgfLParam::terms[0]));
    sym_idx->d = d;
    sym_idx->r.i0 = i0;
    sym_idx->r.n_terms = n_terms;

    for (size_t i = 0; i < n_terms; i++) {
        sym_idx->r.terms[i].factor = read_int();
        sym_idx->r.terms[i].var = read_int();
    }

    return sym_idx;
}

// Reads a tagged symbol. The symbols without payload (BIND, SOFT_BIND, ...)
// are represented by a tagged null reference.
// Throws pgf_error on an unknown tag.
PgfSymbol PgfReader::read_symbol()
{
    PgfSymbol sym = 0;

    uint8_t tag = read_tag();
    switch (tag) {
    case PgfSymbolCat::tag: {
        ref<PgfSymbolCat> sym_cat = read_symbol_idx<PgfSymbolCat>();
        sym = ref<PgfSymbolCat>::tagged(sym_cat);
        break;
    }
    case PgfSymbolLit::tag: {
        ref<PgfSymbolLit> sym_lit = read_symbol_idx<PgfSymbolLit>();
        sym = ref<PgfSymbolLit>::tagged(sym_lit);
        break;
    }
    case PgfSymbolVar::tag: {
        ref<PgfSymbolVar> sym_var = PgfDB::malloc<PgfSymbolVar>();
        sym_var->d = read_int();
        sym_var->r = read_int();
        sym = ref<PgfSymbolVar>::tagged(sym_var);
        break;
    }
    case PgfSymbolKS::tag: {
        ref<PgfSymbolKS> sym_ks = read_text(&PgfSymbolKS::token);
        sym = ref<PgfSymbolKS>::tagged(sym_ks);
        break;
    }
    case PgfSymbolKP::tag: {
        size_t n_alts = read_len();
        ref<PgfSymbolKP> sym_kp =
            PgfDB::malloc<PgfSymbolKP>(n_alts*sizeof(PgfAlternative));
        sym_kp->alts.len = n_alts;

        for (size_t i = 0; i < n_alts; i++) {
            // Both vectors are read into locals before sym_kp is touched
            // again, exactly as in the original statement order.
            auto form     = read_vector(&PgfReader::read_symbol2);
            auto prefixes = read_vector(&PgfReader::read_text2);

            sym_kp->alts.data[i].form     = form;
            sym_kp->alts.data[i].prefixes = prefixes;
        }

        auto default_form = read_vector(&PgfReader::read_symbol2);
        sym_kp->default_form = default_form;

        sym = ref<PgfSymbolKP>::tagged(sym_kp);
        break;
    }
    case PgfSymbolBIND::tag: {
        sym = ref<PgfSymbolBIND>::tagged(0);
        break;
    }
    case PgfSymbolSOFTBIND::tag: {
        sym = ref<PgfSymbolSOFTBIND>::tagged(0);
        break;
    }
    case PgfSymbolNE::tag: {
        sym = ref<PgfSymbolNE>::tagged(0);
        break;
    }
    case PgfSymbolSOFTSPACE::tag: {
        sym = ref<PgfSymbolSOFTSPACE>::tagged(0);
        break;
    }
    case PgfSymbolCAPIT::tag: {
        sym = ref<PgfSymbolCAPIT>::tagged(0);
        break;
    }
    case PgfSymbolALLCAPIT::tag: {
        sym = ref<PgfSymbolALLCAPIT>::tagged(0);
        break;
    }
    default:
        throw pgf_error("Unknown symbol tag");
    }

    return sym;
}

// Reads a linearization category and links it to the abstract category of
// the same name in the abstract syntax set by read_abstract().
ref<PgfConcrLincat> PgfReader::read_lincat()
{
    ref<PgfConcrLincat> lincat = read_name(&PgfConcrLincat::name);
    lincat->ref_count = 1;
    lincat->abscat = namespace_lookup(abstract->cats, &lincat->name);
    lincat->fields = read_vector(&PgfReader::read_text2);
    lincat->n_lindefs = read_len();
    lincat->args = read_vector(&PgfReader::read_parg);
    lincat->res  = read_vector(&PgfReader::read_presult2);
    lincat->seqs = read_vector(&PgfReader::read_seq2);
    return lincat;
}

// Reads a linearization and links it to the abstract function of the same
// name in the abstract syntax set by read_abstract().
ref<PgfConcrLin> PgfReader::read_lin()
{
    ref<PgfConcrLin> lin = read_name(&PgfConcrLin::name);
    lin->ref_count = 1;
    lin->absfun = namespace_lookup(abstract->funs, &lin->name);
    lin->args = read_vector(&PgfReader::read_parg);
    lin->res  = read_vector(&PgfReader::read_presult2);
    lin->seqs = read_vector(&PgfReader::read_seq2);
    return lin;
}

// Reads a print name entry: the identifier followed by its display text.
ref<PgfConcrPrintname> PgfReader::read_printname()
{
    ref<PgfConcrPrintname> printname = read_name(&PgfConcrPrintname::name);
    printname->ref_count = 1;
    printname->printname = read_text();
    return printname;
}

// Reads one concrete syntax: name, flags, lincats, lins and print names.
// The prev/next links are left null.
ref<PgfConcr> PgfReader::read_concrete()
{
    ref<PgfConcr> concr = read_name(&PgfConcr::name);
    concr->ref_count    = 1;
    concr->ref_count_ex = 0;
    concr->cflags = read_namespace(&PgfReader::read_flag);
    concr->lincats = read_namespace(&PgfReader::read_lincat);
    concr->lins = read_namespace(&PgfReader::read_lin);
    concr->printnames = read_namespace(&PgfReader::read_printname);
    concr->prev = 0;
    concr->next = 0;
    return concr;
}

// Reads a whole grammar: version header, global flags, the abstract syntax
// and all concrete syntaxes. The result is named with master_text.
// Throws pgf_error("Unsupported format version") unless both version numbers
// match PGF_MAJOR_VERSION / PGF_MINOR_VERSION exactly.
ref<PgfPGF> PgfReader::read_pgf()
{
    // master_size+1 extra bytes hold the name text and its terminator.
    ref<PgfPGF> pgf = PgfDB::malloc<PgfPGF>(master_size+1);
    pgf->ref_count = 1;

    pgf->major_version = read_u16be();
    pgf->minor_version = read_u16be();

    if (pgf->major_version != PGF_MAJOR_VERSION ||
        pgf->minor_version != PGF_MINOR_VERSION) {
        throw pgf_error("Unsupported format version");
    }

    pgf->gflags = read_namespace(&PgfReader::read_flag);

    read_abstract(ref<PgfAbstr>::from_ptr(&pgf->abstract));

    pgf->concretes = read_namespace(&PgfReader::read_concrete);

    pgf->prev = 0;
    pgf->next = 0;

    pgf->name.size = master_size;
    memcpy(&pgf->name.text, master_text, master_size+1);

    return pgf;
}