mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-09 04:59:31 -06:00
Fix the UTF-8 implementation in libgu
This commit is contained in:
@@ -33,20 +33,21 @@ gu_in_utf8_(GuIn* in, GuExn* err)
|
||||
if (!gu_ok(err)) {
|
||||
return 0;
|
||||
}
|
||||
int len = (c < 0x80 ? 0 :
|
||||
c < 0xc2 ? -1 :
|
||||
c < 0xe0 ? 1 :
|
||||
c < 0xf0 ? 2 :
|
||||
c < 0xf5 ? 3 :
|
||||
-1);
|
||||
if (len < 0) {
|
||||
goto fail;
|
||||
} else if (len == 0) {
|
||||
if (c < 0x80) {
|
||||
return c;
|
||||
}
|
||||
if (c < 0xc2) {
|
||||
goto fail;
|
||||
}
|
||||
static const uint8_t mask[4] = { 0x7f, 0x1f, 0x0f, 0x07 };
|
||||
uint32_t u = c & mask[len];
|
||||
uint8_t buf[3];
|
||||
int len = (c < 0xe0 ? 1 :
|
||||
c < 0xf0 ? 2 :
|
||||
c < 0xf8 ? 3 :
|
||||
c < 0xfc ? 4 :
|
||||
5
|
||||
);
|
||||
uint64_t mask = 0x0103070F1f7f;
|
||||
uint32_t u = c & (mask >> (len * 8));
|
||||
uint8_t buf[5];
|
||||
// If reading the extra bytes causes EOF, it is an encoding
|
||||
// error, not a legitimate end of character stream.
|
||||
gu_in_bytes(in, buf, len, err);
|
||||
@@ -78,55 +79,6 @@ fail:
|
||||
extern inline GuUCS
|
||||
gu_in_utf8(GuIn* in, GuExn* err);
|
||||
|
||||
static size_t
|
||||
gu_advance_utf8(GuUCS ucs, uint8_t* buf)
|
||||
{
|
||||
gu_require(gu_ucs_valid(ucs));
|
||||
if (ucs < 0x80) {
|
||||
buf[0] = (uint8_t) ucs;
|
||||
return 1;
|
||||
} else if (ucs < 0x800) {
|
||||
buf[0] = 0xc0 | (ucs >> 6);
|
||||
buf[1] = 0x80 | (ucs & 0x3f);
|
||||
return 2;
|
||||
} else if (ucs < 0x10000) {
|
||||
buf[0] = 0xe0 | (ucs >> 12);
|
||||
buf[1] = 0x80 | ((ucs >> 6) & 0x3f);
|
||||
buf[2] = 0x80 | (ucs & 0x3f);
|
||||
return 3;
|
||||
} else {
|
||||
buf[0] = 0xf0 | (ucs >> 18);
|
||||
buf[1] = 0x80 | ((ucs >> 12) & 0x3f);
|
||||
buf[2] = 0x80 | ((ucs >> 6) & 0x3f);
|
||||
buf[3] = 0x80 | (ucs & 0x3f);
|
||||
return 4;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
gu_out_utf8_(GuUCS ucs, GuOut* out, GuExn* err)
|
||||
{
|
||||
uint8_t buf[4];
|
||||
size_t sz = gu_advance_utf8(ucs, buf);
|
||||
switch (sz) {
|
||||
case 2:
|
||||
gu_out_bytes(out, buf, 2, err);
|
||||
break;
|
||||
case 3:
|
||||
gu_out_bytes(out, buf, 3, err);
|
||||
break;
|
||||
case 4:
|
||||
gu_out_bytes(out, buf, 4, err);
|
||||
break;
|
||||
default:
|
||||
gu_impossible();
|
||||
}
|
||||
}
|
||||
|
||||
extern inline void
|
||||
gu_out_utf8(GuUCS ucs, GuOut* out, GuExn* err);
|
||||
|
||||
void
|
||||
gu_in_utf8_buf(uint8_t** buf, GuIn* in, GuExn* err)
|
||||
{
|
||||
@@ -137,18 +89,21 @@ gu_in_utf8_buf(uint8_t** buf, GuIn* in, GuExn* err)
|
||||
return;
|
||||
}
|
||||
*(p++) = c;
|
||||
int len = (c < 0x80 ? 0 :
|
||||
c < 0xc2 ? -1 :
|
||||
c < 0xe0 ? 1 :
|
||||
c < 0xf0 ? 2 :
|
||||
c < 0xf5 ? 3 :
|
||||
-1);
|
||||
if (len < 0) {
|
||||
goto fail;
|
||||
} else if (len == 0) {
|
||||
|
||||
if (c < 0x80) {
|
||||
*buf = p;
|
||||
return;
|
||||
}
|
||||
if (c < 0xc2) {
|
||||
goto fail;
|
||||
}
|
||||
|
||||
int len = (c < 0xe0 ? 1 :
|
||||
c < 0xf0 ? 2 :
|
||||
c < 0xf8 ? 3 :
|
||||
c < 0xfc ? 4 :
|
||||
5
|
||||
);
|
||||
// If reading the extra bytes causes EOF, it is an encoding
|
||||
// error, not a legitimate end of character stream.
|
||||
gu_in_bytes(in, p, len, err);
|
||||
@@ -166,3 +121,56 @@ fail:
|
||||
gu_raise(err, GuUCSExn);
|
||||
return;
|
||||
}
|
||||
|
||||
void
|
||||
gu_utf8_encode(GuUCS ucs, uint8_t** buf)
|
||||
{
|
||||
gu_require(gu_ucs_valid(ucs));
|
||||
uint8_t* p = *buf;
|
||||
if (ucs < 0x80) {
|
||||
p[0] = (uint8_t) ucs;
|
||||
*buf = p+1;
|
||||
} else if (ucs < 0x800) {
|
||||
p[0] = 0xc0 | (ucs >> 6);
|
||||
p[1] = 0x80 | (ucs & 0x3f);
|
||||
*buf = p+2;
|
||||
} else if (ucs < 0x10000) {
|
||||
p[0] = 0xe0 | (ucs >> 12);
|
||||
p[1] = 0x80 | ((ucs >> 6) & 0x3f);
|
||||
p[2] = 0x80 | (ucs & 0x3f);
|
||||
*buf = p+3;
|
||||
} else if (ucs < 0x200000) {
|
||||
p[0] = 0xf0 | (ucs >> 18);
|
||||
p[1] = 0x80 | ((ucs >> 12) & 0x3f);
|
||||
p[2] = 0x80 | ((ucs >> 6) & 0x3f);
|
||||
p[3] = 0x80 | (ucs & 0x3f);
|
||||
*buf = p+4;
|
||||
} else if (ucs < 0x4000000) {
|
||||
p[0] = 0xf8 | (ucs >> 24);
|
||||
p[1] = 0x80 | ((ucs >> 18) & 0x3f);
|
||||
p[2] = 0x80 | ((ucs >> 12) & 0x3f);
|
||||
p[3] = 0x80 | ((ucs >> 6) & 0x3f);
|
||||
p[4] = 0x80 | (ucs & 0x3f);
|
||||
*buf = p+5;
|
||||
} else {
|
||||
p[0] = 0xfc | (ucs >> 30);
|
||||
p[1] = 0x80 | ((ucs >> 24) & 0x3f);
|
||||
p[2] = 0x80 | ((ucs >> 18) & 0x3f);
|
||||
p[3] = 0x80 | ((ucs >> 12) & 0x3f);
|
||||
p[4] = 0x80 | ((ucs >> 6) & 0x3f);
|
||||
p[5] = 0x80 | (ucs & 0x3f);
|
||||
*buf = p+6;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
gu_out_utf8_(GuUCS ucs, GuOut* out, GuExn* err)
|
||||
{
|
||||
uint8_t buf[6];
|
||||
uint8_t* p = buf;
|
||||
gu_utf8_encode(ucs, &p);
|
||||
gu_out_bytes(out, buf, p-buf, err);
|
||||
}
|
||||
|
||||
extern inline void
|
||||
gu_out_utf8(GuUCS ucs, GuOut* out, GuExn* err);
|
||||
|
||||
@@ -32,7 +32,10 @@ gu_out_utf8(GuUCS ucs, GuOut* out, GuExn* err)
|
||||
// Helper functions used in other modules
|
||||
|
||||
GuUCS
|
||||
gu_utf8_decode(const uint8_t** utf8);
|
||||
gu_utf8_decode(const uint8_t** buf);
|
||||
|
||||
void
|
||||
gu_utf8_encode(GuUCS ucs, uint8_t** buf);
|
||||
|
||||
void
|
||||
gu_in_utf8_buf(uint8_t** buf, GuIn* in, GuExn* err);
|
||||
|
||||
Reference in New Issue
Block a user