forked from GitHub/gf-core
the parser is now fully Unicode compatible
This commit is contained in:
@@ -40,10 +40,4 @@ gu_utf8_encode(GuUCS ucs, uint8_t** buf);
|
|||||||
void
|
void
|
||||||
gu_in_utf8_buf(uint8_t** buf, GuIn* in, GuExn* err);
|
gu_in_utf8_buf(uint8_t** buf, GuIn* in, GuExn* err);
|
||||||
|
|
||||||
static inline bool
|
|
||||||
gu_is_space(uint8_t c) {
|
|
||||||
return (c == '\t' || c == '\n' || c == '\v' ||
|
|
||||||
c == '\f' || c == '\r' || c == ' ');
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif // GU_UTF8_H_
|
#endif // GU_UTF8_H_
|
||||||
|
|||||||
@@ -12,11 +12,13 @@ pgf_match_string_lit(PgfLiteralCallback* self,
|
|||||||
{
|
{
|
||||||
gu_assert(lin_idx == 0);
|
gu_assert(lin_idx == 0);
|
||||||
|
|
||||||
size_t offset = *poffset;
|
const uint8_t* buf = (uint8_t*) (sentence + *poffset);
|
||||||
while (sentence[offset] && !gu_is_space(sentence[offset]))
|
const uint8_t* p = buf;
|
||||||
offset++;
|
size_t len = 0;
|
||||||
|
while (*p && !gu_ucs_is_space(gu_utf8_decode(&p))) {
|
||||||
|
len = p - buf;
|
||||||
|
}
|
||||||
|
|
||||||
size_t len = offset - *poffset;
|
|
||||||
if (len > 0) {
|
if (len > 0) {
|
||||||
PgfExprProb* ep = gu_new(PgfExprProb, out_pool);
|
PgfExprProb* ep = gu_new(PgfExprProb, out_pool);
|
||||||
ep->prob = 0;
|
ep->prob = 0;
|
||||||
@@ -31,10 +33,10 @@ pgf_match_string_lit(PgfLiteralCallback* self,
|
|||||||
PgfLiteralStr,
|
PgfLiteralStr,
|
||||||
val, len+1,
|
val, len+1,
|
||||||
&expr_lit->lit, out_pool);
|
&expr_lit->lit, out_pool);
|
||||||
memcpy(lit_str->val, sentence+*poffset, len);
|
memcpy(lit_str->val, buf, len);
|
||||||
lit_str->val[len] = 0;
|
lit_str->val[len] = 0;
|
||||||
|
|
||||||
*poffset = offset;
|
*poffset += len;
|
||||||
return ep;
|
return ep;
|
||||||
} else {
|
} else {
|
||||||
return NULL;
|
return NULL;
|
||||||
@@ -71,15 +73,17 @@ pgf_match_int_lit(PgfLiteralCallback* self,
|
|||||||
{
|
{
|
||||||
gu_assert(lin_idx == 0);
|
gu_assert(lin_idx == 0);
|
||||||
|
|
||||||
size_t offset = *poffset;
|
const uint8_t* buf = (uint8_t*) (sentence + *poffset);
|
||||||
while (sentence[offset] && !gu_is_space(sentence[offset]))
|
const uint8_t* p = buf;
|
||||||
offset++;
|
size_t len = 0;
|
||||||
|
while (*p && !gu_ucs_is_space(gu_utf8_decode(&p))) {
|
||||||
|
len = p - buf;
|
||||||
|
}
|
||||||
|
|
||||||
size_t len = offset - *poffset;
|
|
||||||
if (len > 0) {
|
if (len > 0) {
|
||||||
GuPool* tmp_pool = gu_local_pool();
|
GuPool* tmp_pool = gu_local_pool();
|
||||||
PgfToken tok = gu_malloc(tmp_pool, len+1);
|
PgfToken tok = gu_malloc(tmp_pool, len+1);
|
||||||
memcpy((char*) tok, sentence+*poffset, len);
|
memcpy((char*) tok, buf, len);
|
||||||
((char*) tok)[len] = 0;
|
((char*) tok)[len] = 0;
|
||||||
|
|
||||||
int val;
|
int val;
|
||||||
@@ -103,7 +107,7 @@ pgf_match_int_lit(PgfLiteralCallback* self,
|
|||||||
&expr_lit->lit, out_pool);
|
&expr_lit->lit, out_pool);
|
||||||
lit_int->val = val;
|
lit_int->val = val;
|
||||||
|
|
||||||
*poffset = offset;
|
*poffset += len;
|
||||||
return ep;
|
return ep;
|
||||||
} else {
|
} else {
|
||||||
return NULL;
|
return NULL;
|
||||||
@@ -123,15 +127,17 @@ pgf_match_float_lit(PgfLiteralCallback* self,
|
|||||||
{
|
{
|
||||||
gu_assert(lin_idx == 0);
|
gu_assert(lin_idx == 0);
|
||||||
|
|
||||||
size_t offset = *poffset;
|
const uint8_t* buf = (uint8_t*) (sentence + *poffset);
|
||||||
while (sentence[offset] && !gu_is_space(sentence[offset]))
|
const uint8_t* p = buf;
|
||||||
offset++;
|
size_t len = 0;
|
||||||
|
while (*p && !gu_ucs_is_space(gu_utf8_decode(&p))) {
|
||||||
|
len = p - buf;
|
||||||
|
}
|
||||||
|
|
||||||
size_t len = offset - *poffset;
|
|
||||||
if (len > 0) {
|
if (len > 0) {
|
||||||
GuPool* tmp_pool = gu_local_pool();
|
GuPool* tmp_pool = gu_local_pool();
|
||||||
PgfToken tok = gu_malloc(tmp_pool, len+1);
|
PgfToken tok = gu_malloc(tmp_pool, len+1);
|
||||||
memcpy((char*) tok, sentence+*poffset, len);
|
memcpy((char*) tok, buf, len);
|
||||||
((char*) tok)[len] = 0;
|
((char*) tok)[len] = 0;
|
||||||
|
|
||||||
double val;
|
double val;
|
||||||
@@ -155,7 +161,7 @@ pgf_match_float_lit(PgfLiteralCallback* self,
|
|||||||
&expr_lit->lit, out_pool);
|
&expr_lit->lit, out_pool);
|
||||||
lit_flt->val = val;
|
lit_flt->val = val;
|
||||||
|
|
||||||
*poffset = offset;
|
*poffset += len;
|
||||||
return ep;
|
return ep;
|
||||||
} else {
|
} else {
|
||||||
return NULL;
|
return NULL;
|
||||||
@@ -181,30 +187,27 @@ pgf_match_name_lit(PgfLiteralCallback* self,
|
|||||||
GuOut* out = gu_string_buf_out(sbuf);
|
GuOut* out = gu_string_buf_out(sbuf);
|
||||||
GuExn* err = gu_new_exn(tmp_pool);
|
GuExn* err = gu_new_exn(tmp_pool);
|
||||||
|
|
||||||
size_t offset = *poffset;
|
const uint8_t* buf = (uint8_t*) (sentence + *poffset);
|
||||||
|
const uint8_t* p = buf;
|
||||||
|
|
||||||
int i = 0;
|
int i = 0;
|
||||||
while (iswupper(sentence[offset])) {
|
GuUCS ucs = gu_utf8_decode(&p);
|
||||||
size_t len = 0;
|
while (gu_ucs_is_upper(ucs)) {
|
||||||
while (!gu_is_space(sentence[offset+len])) {
|
|
||||||
len++;
|
|
||||||
}
|
|
||||||
|
|
||||||
PgfToken tok = gu_malloc(tmp_pool, len+1);
|
|
||||||
memcpy((char*) tok, sentence+offset, len);
|
|
||||||
((char*) tok)[len] = 0;
|
|
||||||
|
|
||||||
if (i > 0)
|
if (i > 0)
|
||||||
gu_putc(' ', out, err);
|
gu_putc(' ', out, err);
|
||||||
gu_string_write(tok, out, err);
|
gu_out_utf8(ucs, out, err);
|
||||||
|
ucs = gu_utf8_decode(&p);
|
||||||
|
|
||||||
|
while (ucs != 0 && !gu_ucs_is_space(ucs)) {
|
||||||
|
gu_out_utf8(ucs, out, err);
|
||||||
|
*poffset = p - ((uint8_t*) sentence);
|
||||||
|
ucs = gu_utf8_decode(&p);
|
||||||
|
}
|
||||||
|
|
||||||
i++;
|
i++;
|
||||||
|
|
||||||
offset += len;
|
while (gu_ucs_is_space(ucs))
|
||||||
*poffset = offset;
|
ucs = gu_utf8_decode(&p);
|
||||||
|
|
||||||
while (gu_is_space(sentence[offset]))
|
|
||||||
offset++;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
PgfExprProb* ep = NULL;
|
PgfExprProb* ep = NULL;
|
||||||
|
|||||||
@@ -145,41 +145,39 @@ pgf_prev_extern_sym(PgfSymbol sym)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static PgfSymbol
|
||||||
pgf_add_extern_tok(PgfSymbol* psym, PgfToken tok, GuPool* pool) {
|
pgf_collect_extern_tok(PgfParsing* ps, size_t start_offset, size_t end_offset)
|
||||||
PgfSymbol new_sym;
|
|
||||||
size_t tok_len = strlen(tok);
|
|
||||||
PgfSymbolKS* sks = (PgfSymbolKS*)
|
|
||||||
gu_alloc_variant(PGF_SYMBOL_KS,
|
|
||||||
sizeof(PgfSymbol)+sizeof(PgfSymbolKS)+tok_len+1,
|
|
||||||
gu_alignof(PgfSymbolKS),
|
|
||||||
&new_sym, pool);
|
|
||||||
strcpy(sks->token, tok);
|
|
||||||
*((PgfSymbol*) (((uint8_t*) sks)+sizeof(PgfSymbolKS)+tok_len+1)) = *psym;
|
|
||||||
*psym = new_sym;
|
|
||||||
}
|
|
||||||
|
|
||||||
PgfSymbol
|
|
||||||
pgf_collect_extern_tok(PgfParsing* ps, size_t start, size_t end)
|
|
||||||
{
|
{
|
||||||
PgfSymbol sym = gu_null_variant;
|
PgfSymbol sym = gu_null_variant;
|
||||||
|
|
||||||
size_t offset = start;
|
const uint8_t* start = (uint8_t*) ps->sentence+start_offset;
|
||||||
while (offset < end) {
|
const uint8_t* end = (uint8_t*) ps->sentence+end_offset;
|
||||||
|
|
||||||
|
const uint8_t* p = start;
|
||||||
|
GuUCS ucs = gu_utf8_decode(&p);
|
||||||
|
while (start < end) {
|
||||||
size_t len = 0;
|
size_t len = 0;
|
||||||
while (!gu_is_space(ps->sentence[offset+len])) {
|
while (p <= end && !gu_ucs_is_space(ucs)) {
|
||||||
len++;
|
len = (p - start);
|
||||||
|
ucs = gu_utf8_decode(&p);
|
||||||
}
|
}
|
||||||
|
|
||||||
PgfToken tok = gu_malloc(ps->pool, len+1);
|
PgfSymbol new_sym;
|
||||||
memcpy((char*) tok, ps->sentence+offset, len);
|
PgfSymbolKS* sks = (PgfSymbolKS*)
|
||||||
((char*) tok)[len] = 0;
|
gu_alloc_variant(PGF_SYMBOL_KS,
|
||||||
|
sizeof(PgfSymbol)+sizeof(PgfSymbolKS)+len+1,
|
||||||
|
gu_alignof(PgfSymbolKS),
|
||||||
|
&new_sym, ps->pool);
|
||||||
|
memcpy((char*) sks->token, start, len);
|
||||||
|
((char*) sks->token)[len] = 0;
|
||||||
|
*((PgfSymbol*) (((uint8_t*) sks)+sizeof(PgfSymbolKS)+len+1)) = sym;
|
||||||
|
sym = new_sym;
|
||||||
|
|
||||||
pgf_add_extern_tok(&sym, tok, ps->pool);
|
start = p;
|
||||||
|
while (gu_ucs_is_space(ucs)) {
|
||||||
offset += len;
|
start = p;
|
||||||
while (gu_is_space(ps->sentence[offset]))
|
ucs = gu_utf8_decode(&p);
|
||||||
offset++;
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return sym;
|
return sym;
|
||||||
@@ -504,11 +502,11 @@ skip_space(GuString* psent, size_t* plen)
|
|||||||
if (*plen == 0)
|
if (*plen == 0)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
char c = **psent;
|
const uint8_t* p = (uint8_t*) *psent;
|
||||||
if (!gu_is_space(c))
|
if (!gu_ucs_is_space(gu_utf8_decode(&p)))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
(*psent)++;
|
*psent = (GuString) p;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2056,24 +2054,22 @@ pgf_parsing_last_token(PgfParsing* ps, GuPool* pool)
|
|||||||
if (ps->before == NULL)
|
if (ps->before == NULL)
|
||||||
return "";
|
return "";
|
||||||
|
|
||||||
size_t start = ps->before->end_offset;
|
const uint8_t* start = (uint8_t*) ps->sentence;
|
||||||
while (start > 0) {
|
const uint8_t* end = (uint8_t*) ps->sentence + ps->before->end_offset;
|
||||||
char c = ps->sentence[start-1];
|
|
||||||
if (gu_is_space(c))
|
const uint8_t* p = start;
|
||||||
break;
|
while (p < end) {
|
||||||
start--;
|
if (gu_ucs_is_space(gu_utf8_decode(&p))) {
|
||||||
|
start = p;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t end = ps->before->end_offset;
|
while (*p && !gu_ucs_is_space(gu_utf8_decode(&p))) {
|
||||||
while (ps->sentence[end] != 0) {
|
end = p;
|
||||||
char c = ps->sentence[end];
|
|
||||||
if (gu_is_space(c))
|
|
||||||
break;
|
|
||||||
end++;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
char* tok = gu_malloc(pool, end-start+1);
|
char* tok = gu_malloc(pool, end-start+1);
|
||||||
memcpy(tok, ps->sentence+start, (end-start));
|
memcpy(tok, start, (end-start));
|
||||||
tok[end-start] = 0;
|
tok[end-start] = 0;
|
||||||
return tok;
|
return tok;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user