diff --git a/src/runtime/c/gu/utf8.c b/src/runtime/c/gu/utf8.c
index 8f22e5823..cd198a83d 100644
--- a/src/runtime/c/gu/utf8.c
+++ b/src/runtime/c/gu/utf8.c
@@ -33,20 +33,21 @@ gu_in_utf8_(GuIn* in, GuExn* err)
 	if (!gu_ok(err)) {
 		return 0;
 	}
-	int len = (c < 0x80 ? 0 :
-	           c < 0xc2 ? -1 :
-	           c < 0xe0 ? 1 :
-	           c < 0xf0 ? 2 :
-	           c < 0xf5 ? 3 :
-	           -1);
-	if (len < 0) {
-		goto fail;
-	} else if (len == 0) {
+	if (c < 0x80) {
 		return c;
+	}
+	if (c < 0xc2) {
+		goto fail;
 	}
-	static const uint8_t mask[4] = { 0x7f, 0x1f, 0x0f, 0x07 };
-	uint32_t u = c & mask[len];
-	uint8_t buf[3];
+	int len = (c < 0xe0 ? 1 :
+	           c < 0xf0 ? 2 :
+	           c < 0xf8 ? 3 :
+	           c < 0xfc ? 4 :
+	                      5
+	           );
+	uint64_t mask = 0x0103070F1f7f;
+	uint32_t u = c & (mask >> (len * 8));
+	uint8_t buf[5];
 	// If reading the extra bytes causes EOF, it is an encoding
 	// error, not a legitimate end of character stream.
 	gu_in_bytes(in, buf, len, err);
@@ -78,55 +79,6 @@ fail:
 extern inline GuUCS
 gu_in_utf8(GuIn* in, GuExn* err);
 
-static size_t
-gu_advance_utf8(GuUCS ucs, uint8_t* buf)
-{
-	gu_require(gu_ucs_valid(ucs));
-	if (ucs < 0x80) {
-		buf[0] = (uint8_t) ucs;
-		return 1;
-	} else if (ucs < 0x800) {
-		buf[0] = 0xc0 | (ucs >> 6);
-		buf[1] = 0x80 | (ucs & 0x3f);
-		return 2;
-	} else if (ucs < 0x10000) {
-		buf[0] = 0xe0 | (ucs >> 12);
-		buf[1] = 0x80 | ((ucs >> 6) & 0x3f);
-		buf[2] = 0x80 | (ucs & 0x3f);
-		return 3;
-	} else {
-		buf[0] = 0xf0 | (ucs >> 18);
-		buf[1] = 0x80 | ((ucs >> 12) & 0x3f);
-		buf[2] = 0x80 | ((ucs >> 6) & 0x3f);
-		buf[3] = 0x80 | (ucs & 0x3f);
-		return 4;
-	}
-}
-
-
-void
-gu_out_utf8_(GuUCS ucs, GuOut* out, GuExn* err)
-{
-	uint8_t buf[4];
-	size_t sz = gu_advance_utf8(ucs, buf);
-	switch (sz) {
-	case 2:
-		gu_out_bytes(out, buf, 2, err);
-		break;
-	case 3:
-		gu_out_bytes(out, buf, 3, err);
-		break;
-	case 4:
-		gu_out_bytes(out, buf, 4, err);
-		break;
-	default:
-		gu_impossible();
-	}
-}
-
-extern inline void
-gu_out_utf8(GuUCS ucs, GuOut* out, GuExn* err);
-
 void
 gu_in_utf8_buf(uint8_t** buf, GuIn* in, GuExn* err)
 {
@@ -137,18 +89,21 @@ gu_in_utf8_buf(uint8_t** buf, GuIn* in, GuExn* err)
 		return;
 	}
 	*(p++) = c;
-	int len = (c < 0x80 ? 0 :
-	           c < 0xc2 ? -1 :
-	           c < 0xe0 ? 1 :
-	           c < 0xf0 ? 2 :
-	           c < 0xf5 ? 3 :
-	           -1);
-	if (len < 0) {
-		goto fail;
-	} else if (len == 0) {
+
+	if (c < 0x80) {
 		*buf = p;
 		return;
 	}
+	if (c < 0xc2) {
+		goto fail;
+	}
+
+	int len = (c < 0xe0 ? 1 :
+	           c < 0xf0 ? 2 :
+	           c < 0xf8 ? 3 :
+	           c < 0xfc ? 4 :
+	                      5
+	           );
 	// If reading the extra bytes causes EOF, it is an encoding
 	// error, not a legitimate end of character stream.
 	gu_in_bytes(in, p, len, err);
@@ -166,3 +121,56 @@ fail:
 	gu_raise(err, GuUCSExn);
 	return;
 }
+
+void
+gu_utf8_encode(GuUCS ucs, uint8_t** buf)
+{
+	gu_require(gu_ucs_valid(ucs));
+	uint8_t* p = *buf;
+	if (ucs < 0x80) {
+		p[0] = (uint8_t) ucs;
+		*buf = p+1;
+	} else if (ucs < 0x800) {
+		p[0] = 0xc0 | (ucs >> 6);
+		p[1] = 0x80 | (ucs & 0x3f);
+		*buf = p+2;
+	} else if (ucs < 0x10000) {
+		p[0] = 0xe0 | (ucs >> 12);
+		p[1] = 0x80 | ((ucs >> 6) & 0x3f);
+		p[2] = 0x80 | (ucs & 0x3f);
+		*buf = p+3;
+	} else if (ucs < 0x200000) {
+		p[0] = 0xf0 | (ucs >> 18);
+		p[1] = 0x80 | ((ucs >> 12) & 0x3f);
+		p[2] = 0x80 | ((ucs >> 6) & 0x3f);
+		p[3] = 0x80 | (ucs & 0x3f);
+		*buf = p+4;
+	} else if (ucs < 0x4000000) {
+		p[0] = 0xf8 | (ucs >> 24);
+		p[1] = 0x80 | ((ucs >> 18) & 0x3f);
+		p[2] = 0x80 | ((ucs >> 12) & 0x3f);
+		p[3] = 0x80 | ((ucs >> 6) & 0x3f);
+		p[4] = 0x80 | (ucs & 0x3f);
+		*buf = p+5;
+	} else {
+		p[0] = 0xfc | (ucs >> 30);
+		p[1] = 0x80 | ((ucs >> 24) & 0x3f);
+		p[2] = 0x80 | ((ucs >> 18) & 0x3f);
+		p[3] = 0x80 | ((ucs >> 12) & 0x3f);
+		p[4] = 0x80 | ((ucs >> 6) & 0x3f);
+		p[5] = 0x80 | (ucs & 0x3f);
+		*buf = p+6;
+	}
+}
+
+void
+gu_out_utf8_(GuUCS ucs, GuOut* out, GuExn* err)
+{
+	uint8_t buf[6];
+	uint8_t* p = buf;
+	gu_utf8_encode(ucs, &p);
+	gu_out_bytes(out, buf, p-buf, err);
+}
+
+extern inline void
+gu_out_utf8(GuUCS ucs, GuOut* out, GuExn* err);
diff --git a/src/runtime/c/gu/utf8.h b/src/runtime/c/gu/utf8.h
index db7dccaf9..3ad28946d 100644
--- a/src/runtime/c/gu/utf8.h
+++ b/src/runtime/c/gu/utf8.h
@@ -32,7 +32,10 @@ gu_out_utf8(GuUCS ucs, GuOut* out, GuExn* err)
 // Helper functions used in other modules
 
 GuUCS
-gu_utf8_decode(const uint8_t** utf8);
+gu_utf8_decode(const uint8_t** buf);
+
+void
+gu_utf8_encode(GuUCS ucs, uint8_t** buf);
 
 void
 gu_in_utf8_buf(uint8_t** buf, GuIn* in, GuExn* err);