diff --git a/src/runtime/c/gu/utf8.h b/src/runtime/c/gu/utf8.h index 3ad28946d..be3ab0412 100644 --- a/src/runtime/c/gu/utf8.h +++ b/src/runtime/c/gu/utf8.h @@ -40,10 +40,4 @@ gu_utf8_encode(GuUCS ucs, uint8_t** buf); void gu_in_utf8_buf(uint8_t** buf, GuIn* in, GuExn* err); -static inline bool -gu_is_space(uint8_t c) { - return (c == '\t' || c == '\n' || c == '\v' || - c == '\f' || c == '\r' || c == ' '); -} - #endif // GU_UTF8_H_ diff --git a/src/runtime/c/pgf/literals.c b/src/runtime/c/pgf/literals.c index c44a3d35e..aea358be5 100644 --- a/src/runtime/c/pgf/literals.c +++ b/src/runtime/c/pgf/literals.c @@ -12,11 +12,13 @@ pgf_match_string_lit(PgfLiteralCallback* self, { gu_assert(lin_idx == 0); - size_t offset = *poffset; - while (sentence[offset] && !gu_is_space(sentence[offset])) - offset++; + const uint8_t* buf = (uint8_t*) (sentence + *poffset); + const uint8_t* p = buf; + size_t len = 0; + while (*p && !gu_ucs_is_space(gu_utf8_decode(&p))) { + len = p - buf; + } - size_t len = offset - *poffset; if (len > 0) { PgfExprProb* ep = gu_new(PgfExprProb, out_pool); ep->prob = 0; @@ -31,10 +33,10 @@ pgf_match_string_lit(PgfLiteralCallback* self, PgfLiteralStr, val, len+1, &expr_lit->lit, out_pool); - memcpy(lit_str->val, sentence+*poffset, len); + memcpy(lit_str->val, buf, len); lit_str->val[len] = 0; - *poffset = offset; + *poffset += len; return ep; } else { return NULL; @@ -71,15 +73,17 @@ pgf_match_int_lit(PgfLiteralCallback* self, { gu_assert(lin_idx == 0); - size_t offset = *poffset; - while (sentence[offset] && !gu_is_space(sentence[offset])) - offset++; + const uint8_t* buf = (uint8_t*) (sentence + *poffset); + const uint8_t* p = buf; + size_t len = 0; + while (*p && !gu_ucs_is_space(gu_utf8_decode(&p))) { + len = p - buf; + } - size_t len = offset - *poffset; if (len > 0) { GuPool* tmp_pool = gu_local_pool(); PgfToken tok = gu_malloc(tmp_pool, len+1); - memcpy((char*) tok, sentence+*poffset, len); + memcpy((char*) tok, buf, len); ((char*) tok)[len] = 0; int val; @@ -103,7 +107,7 @@ pgf_match_int_lit(PgfLiteralCallback* self, &expr_lit->lit, out_pool); lit_int->val = val; - *poffset = offset; + *poffset += len; return ep; } else { return NULL; @@ -123,15 +127,17 @@ pgf_match_float_lit(PgfLiteralCallback* self, { gu_assert(lin_idx == 0); - size_t offset = *poffset; - while (sentence[offset] && !gu_is_space(sentence[offset])) - offset++; + const uint8_t* buf = (uint8_t*) (sentence + *poffset); + const uint8_t* p = buf; + size_t len = 0; + while (*p && !gu_ucs_is_space(gu_utf8_decode(&p))) { + len = p - buf; + } - size_t len = offset - *poffset; if (len > 0) { GuPool* tmp_pool = gu_local_pool(); PgfToken tok = gu_malloc(tmp_pool, len+1); - memcpy((char*) tok, sentence+*poffset, len); + memcpy((char*) tok, buf, len); ((char*) tok)[len] = 0; double val; @@ -155,7 +161,7 @@ pgf_match_float_lit(PgfLiteralCallback* self, &expr_lit->lit, out_pool); lit_flt->val = val; - *poffset = offset; + *poffset += len; return ep; } else { return NULL; @@ -181,30 +187,27 @@ pgf_match_name_lit(PgfLiteralCallback* self, GuOut* out = gu_string_buf_out(sbuf); GuExn* err = gu_new_exn(tmp_pool); - size_t offset = *poffset; + const uint8_t* buf = (uint8_t*) (sentence + *poffset); + const uint8_t* p = buf; int i = 0; - while (iswupper(sentence[offset])) { - size_t len = 0; - while (!gu_is_space(sentence[offset+len])) { - len++; - } - - PgfToken tok = gu_malloc(tmp_pool, len+1); - memcpy((char*) tok, sentence+offset, len); - ((char*) tok)[len] = 0; - + GuUCS ucs = gu_utf8_decode(&p); + while (gu_ucs_is_upper(ucs)) { if (i > 0) gu_putc(' ', out, err); - gu_string_write(tok, out, err); - + gu_out_utf8(ucs, out, err); + ucs = gu_utf8_decode(&p); + + while (ucs != 0 && !gu_ucs_is_space(ucs)) { + gu_out_utf8(ucs, out, err); + *poffset = p - ((uint8_t*) sentence); + ucs = gu_utf8_decode(&p); + } + i++; - offset += len; - *poffset = offset; - - while (gu_is_space(sentence[offset])) - offset++; + while (gu_ucs_is_space(ucs)) + ucs = gu_utf8_decode(&p); } PgfExprProb* ep = NULL; diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c index be0b1b361..b2ad5374d 100644 --- a/src/runtime/c/pgf/parser.c +++ b/src/runtime/c/pgf/parser.c @@ -145,41 +145,39 @@ pgf_prev_extern_sym(PgfSymbol sym) } } -static void -pgf_add_extern_tok(PgfSymbol* psym, PgfToken tok, GuPool* pool) { - PgfSymbol new_sym; - size_t tok_len = strlen(tok); - PgfSymbolKS* sks = (PgfSymbolKS*) - gu_alloc_variant(PGF_SYMBOL_KS, - sizeof(PgfSymbol)+sizeof(PgfSymbolKS)+tok_len+1, - gu_alignof(PgfSymbolKS), - &new_sym, pool); - strcpy(sks->token, tok); - *((PgfSymbol*) (((uint8_t*) sks)+sizeof(PgfSymbolKS)+tok_len+1)) = *psym; - *psym = new_sym; -} - -PgfSymbol -pgf_collect_extern_tok(PgfParsing* ps, size_t start, size_t end) +static PgfSymbol +pgf_collect_extern_tok(PgfParsing* ps, size_t start_offset, size_t end_offset) { PgfSymbol sym = gu_null_variant; - size_t offset = start; - while (offset < end) { + const uint8_t* start = (uint8_t*) ps->sentence+start_offset; + const uint8_t* end = (uint8_t*) ps->sentence+end_offset; + + const uint8_t* p = start; + GuUCS ucs = gu_utf8_decode(&p); + while (start < end) { size_t len = 0; - while (!gu_is_space(ps->sentence[offset+len])) { - len++; + while (p <= end && !gu_ucs_is_space(ucs)) { + len = (p - start); + ucs = gu_utf8_decode(&p); } - PgfToken tok = gu_malloc(ps->pool, len+1); - memcpy((char*) tok, ps->sentence+offset, len); - ((char*) tok)[len] = 0; + PgfSymbol new_sym; + PgfSymbolKS* sks = (PgfSymbolKS*) + gu_alloc_variant(PGF_SYMBOL_KS, + sizeof(PgfSymbol)+sizeof(PgfSymbolKS)+len+1, + gu_alignof(PgfSymbolKS), + &new_sym, ps->pool); + memcpy((char*) sks->token, start, len); + ((char*) sks->token)[len] = 0; + *((PgfSymbol*) (((uint8_t*) sks)+sizeof(PgfSymbolKS)+len+1)) = sym; + sym = new_sym; - pgf_add_extern_tok(&sym, tok, ps->pool); - - offset += len; - while (gu_is_space(ps->sentence[offset])) - offset++; + start = p; + while (gu_ucs_is_space(ucs)) { + start = p; + ucs = gu_utf8_decode(&p); + } } return sym; @@ -504,11 +502,11 @@ skip_space(GuString* psent, size_t* plen) if (*plen == 0) return false; - char c = **psent; - if (!gu_is_space(c)) + const uint8_t* p = (uint8_t*) *psent; + if (!gu_ucs_is_space(gu_utf8_decode(&p))) return false; - (*psent)++; + *psent = (GuString) p; return true; } @@ -2056,24 +2054,22 @@ pgf_parsing_last_token(PgfParsing* ps, GuPool* pool) if (ps->before == NULL) return ""; - size_t start = ps->before->end_offset; - while (start > 0) { - char c = ps->sentence[start-1]; - if (gu_is_space(c)) - break; - start--; + const uint8_t* start = (uint8_t*) ps->sentence; + const uint8_t* end = (uint8_t*) ps->sentence + ps->before->end_offset; + + const uint8_t* p = start; + while (p < end) { + if (gu_ucs_is_space(gu_utf8_decode(&p))) { + start = p; + } } - size_t end = ps->before->end_offset; - while (ps->sentence[end] != 0) { - char c = ps->sentence[end]; - if (gu_is_space(c)) - break; - end++; + while (*p && !gu_ucs_is_space(gu_utf8_decode(&p))) { + end = p; } char* tok = gu_malloc(pool, end-start+1); - memcpy(tok, ps->sentence+start, (end-start)); + memcpy(tok, start, (end-start)); tok[end-start] = 0; return tok; }