1
0
forked from GitHub/gf-core

the parser is now fully Unicode compatible

This commit is contained in:
krasimir
2015-05-08 09:23:29 +00:00
parent b961e9a255
commit 365c7bb1d8
3 changed files with 79 additions and 86 deletions

View File

@@ -40,10 +40,4 @@ gu_utf8_encode(GuUCS ucs, uint8_t** buf);
void void
gu_in_utf8_buf(uint8_t** buf, GuIn* in, GuExn* err); gu_in_utf8_buf(uint8_t** buf, GuIn* in, GuExn* err);
/*
 * Report whether the byte `c` is one of the six ASCII whitespace
 * characters: space, horizontal tab, line feed, vertical tab,
 * form feed or carriage return.
 *
 * Only a single byte is inspected, so any value outside that set
 * (including every byte >= 0x80) yields false.
 */
static inline bool
gu_is_space(uint8_t c) {
	switch (c) {
	case ' ':
	case '\t':
	case '\n':
	case '\v':
	case '\f':
	case '\r':
		return true;
	default:
		return false;
	}
}
#endif // GU_UTF8_H_ #endif // GU_UTF8_H_

View File

@@ -12,11 +12,13 @@ pgf_match_string_lit(PgfLiteralCallback* self,
{ {
gu_assert(lin_idx == 0); gu_assert(lin_idx == 0);
size_t offset = *poffset; const uint8_t* buf = (uint8_t*) (sentence + *poffset);
while (sentence[offset] && !gu_is_space(sentence[offset])) const uint8_t* p = buf;
offset++; size_t len = 0;
while (*p && !gu_ucs_is_space(gu_utf8_decode(&p))) {
len = p - buf;
}
size_t len = offset - *poffset;
if (len > 0) { if (len > 0) {
PgfExprProb* ep = gu_new(PgfExprProb, out_pool); PgfExprProb* ep = gu_new(PgfExprProb, out_pool);
ep->prob = 0; ep->prob = 0;
@@ -31,10 +33,10 @@ pgf_match_string_lit(PgfLiteralCallback* self,
PgfLiteralStr, PgfLiteralStr,
val, len+1, val, len+1,
&expr_lit->lit, out_pool); &expr_lit->lit, out_pool);
memcpy(lit_str->val, sentence+*poffset, len); memcpy(lit_str->val, buf, len);
lit_str->val[len] = 0; lit_str->val[len] = 0;
*poffset = offset; *poffset += len;
return ep; return ep;
} else { } else {
return NULL; return NULL;
@@ -71,15 +73,17 @@ pgf_match_int_lit(PgfLiteralCallback* self,
{ {
gu_assert(lin_idx == 0); gu_assert(lin_idx == 0);
size_t offset = *poffset; const uint8_t* buf = (uint8_t*) (sentence + *poffset);
while (sentence[offset] && !gu_is_space(sentence[offset])) const uint8_t* p = buf;
offset++; size_t len = 0;
while (*p && !gu_ucs_is_space(gu_utf8_decode(&p))) {
len = p - buf;
}
size_t len = offset - *poffset;
if (len > 0) { if (len > 0) {
GuPool* tmp_pool = gu_local_pool(); GuPool* tmp_pool = gu_local_pool();
PgfToken tok = gu_malloc(tmp_pool, len+1); PgfToken tok = gu_malloc(tmp_pool, len+1);
memcpy((char*) tok, sentence+*poffset, len); memcpy((char*) tok, buf, len);
((char*) tok)[len] = 0; ((char*) tok)[len] = 0;
int val; int val;
@@ -103,7 +107,7 @@ pgf_match_int_lit(PgfLiteralCallback* self,
&expr_lit->lit, out_pool); &expr_lit->lit, out_pool);
lit_int->val = val; lit_int->val = val;
*poffset = offset; *poffset += len;
return ep; return ep;
} else { } else {
return NULL; return NULL;
@@ -123,15 +127,17 @@ pgf_match_float_lit(PgfLiteralCallback* self,
{ {
gu_assert(lin_idx == 0); gu_assert(lin_idx == 0);
size_t offset = *poffset; const uint8_t* buf = (uint8_t*) (sentence + *poffset);
while (sentence[offset] && !gu_is_space(sentence[offset])) const uint8_t* p = buf;
offset++; size_t len = 0;
while (*p && !gu_ucs_is_space(gu_utf8_decode(&p))) {
len = p - buf;
}
size_t len = offset - *poffset;
if (len > 0) { if (len > 0) {
GuPool* tmp_pool = gu_local_pool(); GuPool* tmp_pool = gu_local_pool();
PgfToken tok = gu_malloc(tmp_pool, len+1); PgfToken tok = gu_malloc(tmp_pool, len+1);
memcpy((char*) tok, sentence+*poffset, len); memcpy((char*) tok, buf, len);
((char*) tok)[len] = 0; ((char*) tok)[len] = 0;
double val; double val;
@@ -155,7 +161,7 @@ pgf_match_float_lit(PgfLiteralCallback* self,
&expr_lit->lit, out_pool); &expr_lit->lit, out_pool);
lit_flt->val = val; lit_flt->val = val;
*poffset = offset; *poffset += len;
return ep; return ep;
} else { } else {
return NULL; return NULL;
@@ -181,30 +187,27 @@ pgf_match_name_lit(PgfLiteralCallback* self,
GuOut* out = gu_string_buf_out(sbuf); GuOut* out = gu_string_buf_out(sbuf);
GuExn* err = gu_new_exn(tmp_pool); GuExn* err = gu_new_exn(tmp_pool);
size_t offset = *poffset; const uint8_t* buf = (uint8_t*) (sentence + *poffset);
const uint8_t* p = buf;
int i = 0; int i = 0;
while (iswupper(sentence[offset])) { GuUCS ucs = gu_utf8_decode(&p);
size_t len = 0; while (gu_ucs_is_upper(ucs)) {
while (!gu_is_space(sentence[offset+len])) {
len++;
}
PgfToken tok = gu_malloc(tmp_pool, len+1);
memcpy((char*) tok, sentence+offset, len);
((char*) tok)[len] = 0;
if (i > 0) if (i > 0)
gu_putc(' ', out, err); gu_putc(' ', out, err);
gu_string_write(tok, out, err); gu_out_utf8(ucs, out, err);
ucs = gu_utf8_decode(&p);
while (ucs != 0 && !gu_ucs_is_space(ucs)) {
gu_out_utf8(ucs, out, err);
*poffset = p - ((uint8_t*) sentence);
ucs = gu_utf8_decode(&p);
}
i++; i++;
offset += len; while (gu_ucs_is_space(ucs))
*poffset = offset; ucs = gu_utf8_decode(&p);
while (gu_is_space(sentence[offset]))
offset++;
} }
PgfExprProb* ep = NULL; PgfExprProb* ep = NULL;

View File

@@ -145,41 +145,39 @@ pgf_prev_extern_sym(PgfSymbol sym)
} }
} }
static void static PgfSymbol
pgf_add_extern_tok(PgfSymbol* psym, PgfToken tok, GuPool* pool) { pgf_collect_extern_tok(PgfParsing* ps, size_t start_offset, size_t end_offset)
PgfSymbol new_sym;
size_t tok_len = strlen(tok);
PgfSymbolKS* sks = (PgfSymbolKS*)
gu_alloc_variant(PGF_SYMBOL_KS,
sizeof(PgfSymbol)+sizeof(PgfSymbolKS)+tok_len+1,
gu_alignof(PgfSymbolKS),
&new_sym, pool);
strcpy(sks->token, tok);
*((PgfSymbol*) (((uint8_t*) sks)+sizeof(PgfSymbolKS)+tok_len+1)) = *psym;
*psym = new_sym;
}
PgfSymbol
pgf_collect_extern_tok(PgfParsing* ps, size_t start, size_t end)
{ {
PgfSymbol sym = gu_null_variant; PgfSymbol sym = gu_null_variant;
size_t offset = start; const uint8_t* start = (uint8_t*) ps->sentence+start_offset;
while (offset < end) { const uint8_t* end = (uint8_t*) ps->sentence+end_offset;
const uint8_t* p = start;
GuUCS ucs = gu_utf8_decode(&p);
while (start < end) {
size_t len = 0; size_t len = 0;
while (!gu_is_space(ps->sentence[offset+len])) { while (p <= end && !gu_ucs_is_space(ucs)) {
len++; len = (p - start);
ucs = gu_utf8_decode(&p);
} }
PgfToken tok = gu_malloc(ps->pool, len+1); PgfSymbol new_sym;
memcpy((char*) tok, ps->sentence+offset, len); PgfSymbolKS* sks = (PgfSymbolKS*)
((char*) tok)[len] = 0; gu_alloc_variant(PGF_SYMBOL_KS,
sizeof(PgfSymbol)+sizeof(PgfSymbolKS)+len+1,
gu_alignof(PgfSymbolKS),
&new_sym, ps->pool);
memcpy((char*) sks->token, start, len);
((char*) sks->token)[len] = 0;
*((PgfSymbol*) (((uint8_t*) sks)+sizeof(PgfSymbolKS)+len+1)) = sym;
sym = new_sym;
pgf_add_extern_tok(&sym, tok, ps->pool); start = p;
while (gu_ucs_is_space(ucs)) {
offset += len; start = p;
while (gu_is_space(ps->sentence[offset])) ucs = gu_utf8_decode(&p);
offset++; }
} }
return sym; return sym;
@@ -504,11 +502,11 @@ skip_space(GuString* psent, size_t* plen)
if (*plen == 0) if (*plen == 0)
return false; return false;
char c = **psent; const uint8_t* p = (uint8_t*) *psent;
if (!gu_is_space(c)) if (!gu_ucs_is_space(gu_utf8_decode(&p)))
return false; return false;
(*psent)++; *psent = (GuString) p;
return true; return true;
} }
@@ -2056,24 +2054,22 @@ pgf_parsing_last_token(PgfParsing* ps, GuPool* pool)
if (ps->before == NULL) if (ps->before == NULL)
return ""; return "";
size_t start = ps->before->end_offset; const uint8_t* start = (uint8_t*) ps->sentence;
while (start > 0) { const uint8_t* end = (uint8_t*) ps->sentence + ps->before->end_offset;
char c = ps->sentence[start-1];
if (gu_is_space(c)) const uint8_t* p = start;
break; while (p < end) {
start--; if (gu_ucs_is_space(gu_utf8_decode(&p))) {
start = p;
}
} }
size_t end = ps->before->end_offset; while (*p && !gu_ucs_is_space(gu_utf8_decode(&p))) {
while (ps->sentence[end] != 0) { end = p;
char c = ps->sentence[end];
if (gu_is_space(c))
break;
end++;
} }
char* tok = gu_malloc(pool, end-start+1); char* tok = gu_malloc(pool, end-start+1);
memcpy(tok, ps->sentence+start, (end-start)); memcpy(tok, start, (end-start));
tok[end-start] = 0; tok[end-start] = 0;
return tok; return tok;
} }