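Make the parser fully Unicode-compatible: whitespace and upper-case checks now decode UTF-8 code points (gu_utf8_decode with gu_ucs_is_space / gu_ucs_is_upper) instead of testing single bytes with gu_is_space / iswupper.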

2015-05-08 09:23:29 +00:00
parent 5b60c3a00e
commit e600eb764a
3 changed files with 79 additions and 86 deletions
--- a/src/runtime/c/gu/utf8.h
+++ b/src/runtime/c/gu/utf8.h
@@ -40,10 +40,4 @@ gu_utf8_encode(GuUCS ucs, uint8_t** buf);
 void
 gu_in_utf8_buf(uint8_t** buf, GuIn* in, GuExn* err);

-static inline bool
-gu_is_space(uint8_t c) {
-	return (c == '\t' || c == '\n' || c == '\v' ||
-	        c == '\f' || c == '\r' || c == ' ');
-}
-
 #endif // GU_UTF8_H_
--- a/src/runtime/c/pgf/literals.c
+++ b/src/runtime/c/pgf/literals.c
@@ -12,11 +12,13 @@ pgf_match_string_lit(PgfLiteralCallback* self,
 {
 	gu_assert(lin_idx == 0);

-	size_t offset = *poffset;
-	while (sentence[offset] && !gu_is_space(sentence[offset]))
-		offset++;
+	const uint8_t* buf = (uint8_t*) (sentence + *poffset);
+	const uint8_t* p   = buf;
+	size_t len = 0;
+	while (*p && !gu_ucs_is_space(gu_utf8_decode(&p))) {
+		len = p - buf;
+	}

-	size_t len = offset - *poffset;
 	if (len > 0) {
 		PgfExprProb* ep = gu_new(PgfExprProb, out_pool);
 		ep->prob = 0;
@@ -31,10 +33,10 @@ pgf_match_string_lit(PgfLiteralCallback* self,
 						        PgfLiteralStr,
 						        val, len+1,
 						        &expr_lit->lit, out_pool);
-		memcpy(lit_str->val, sentence+*poffset, len);
+		memcpy(lit_str->val, buf, len);
 		lit_str->val[len] = 0;

-		*poffset = offset;
+		*poffset += len;
 		return ep;
 	} else {
 		return NULL;
@@ -71,15 +73,17 @@ pgf_match_int_lit(PgfLiteralCallback* self,
 {
 	gu_assert(lin_idx == 0);

-	size_t offset = *poffset;
-	while (sentence[offset] && !gu_is_space(sentence[offset]))
-		offset++;
+	const uint8_t* buf = (uint8_t*) (sentence + *poffset);
+	const uint8_t* p   = buf;
+	size_t len = 0;
+	while (*p && !gu_ucs_is_space(gu_utf8_decode(&p))) {
+		len = p - buf;
+	}

-	size_t len = offset - *poffset;
 	if (len > 0) {
 		GuPool* tmp_pool = gu_local_pool();
 		PgfToken tok = gu_malloc(tmp_pool, len+1);
-		memcpy((char*) tok, sentence+*poffset, len);
+		memcpy((char*) tok, buf, len);
 		((char*) tok)[len] = 0;

 		int val;
@@ -103,7 +107,7 @@ pgf_match_int_lit(PgfLiteralCallback* self,
 						   &expr_lit->lit, out_pool);
 		lit_int->val = val;

-		*poffset = offset;
+		*poffset += len;
 		return ep;
 	} else {
 		return NULL;
@@ -123,15 +127,17 @@ pgf_match_float_lit(PgfLiteralCallback* self,
 {
 	gu_assert(lin_idx == 0);

-	size_t offset = *poffset;
-	while (sentence[offset] && !gu_is_space(sentence[offset]))
-		offset++;
+	const uint8_t* buf = (uint8_t*) (sentence + *poffset);
+	const uint8_t* p   = buf;
+	size_t len = 0;
+	while (*p && !gu_ucs_is_space(gu_utf8_decode(&p))) {
+		len = p - buf;
+	}

-	size_t len = offset - *poffset;
 	if (len > 0) {
 		GuPool* tmp_pool = gu_local_pool();
 		PgfToken tok = gu_malloc(tmp_pool, len+1);
-		memcpy((char*) tok, sentence+*poffset, len);
+		memcpy((char*) tok, buf, len);
 		((char*) tok)[len] = 0;

 		double val;
@@ -155,7 +161,7 @@ pgf_match_float_lit(PgfLiteralCallback* self,
 						   &expr_lit->lit, out_pool);
 		lit_flt->val = val;

-		*poffset = offset;
+		*poffset += len;
 		return ep;
 	} else {
 		return NULL;
@@ -181,30 +187,27 @@ pgf_match_name_lit(PgfLiteralCallback* self,
 	GuOut* out = gu_string_buf_out(sbuf);
 	GuExn* err = gu_new_exn(tmp_pool);

-	size_t offset = *poffset;
+	const uint8_t* buf = (uint8_t*) (sentence + *poffset);
+	const uint8_t* p   = buf;

 	int i = 0;
-	while (iswupper(sentence[offset])) {
-		size_t len = 0;
-		while (!gu_is_space(sentence[offset+len])) {
-			len++;
-		}
-
-		PgfToken tok = gu_malloc(tmp_pool, len+1);
-		memcpy((char*) tok, sentence+offset, len);
-		((char*) tok)[len] = 0;
-
+	GuUCS ucs = gu_utf8_decode(&p);
+	while (gu_ucs_is_upper(ucs)) {
 		if (i > 0)
 		  gu_putc(' ', out, err);
-		gu_string_write(tok, out, err);
-		
+		gu_out_utf8(ucs, out, err);
+		ucs = gu_utf8_decode(&p);
+
+		while (ucs != 0 && !gu_ucs_is_space(ucs)) {
+			gu_out_utf8(ucs, out, err);
+			*poffset = p - ((uint8_t*) sentence);
+			ucs = gu_utf8_decode(&p);
+		}
+
 		i++;

-		offset  += len;
-		*poffset = offset;
-
-		while (gu_is_space(sentence[offset]))
-			offset++;
+		while (gu_ucs_is_space(ucs))
+			ucs = gu_utf8_decode(&p);
 	}

 	PgfExprProb* ep = NULL;
--- a/src/runtime/c/pgf/parser.c
+++ b/src/runtime/c/pgf/parser.c
@@ -145,41 +145,39 @@ pgf_prev_extern_sym(PgfSymbol sym)
 	}
 }

-static void
-pgf_add_extern_tok(PgfSymbol* psym, PgfToken tok, GuPool* pool) {
-	PgfSymbol new_sym;
-	size_t tok_len = strlen(tok);
-	PgfSymbolKS* sks = (PgfSymbolKS*)
-		gu_alloc_variant(PGF_SYMBOL_KS,
-						 sizeof(PgfSymbol)+sizeof(PgfSymbolKS)+tok_len+1,
-						 gu_alignof(PgfSymbolKS),
-						 &new_sym, pool);
-	strcpy(sks->token, tok);
-	*((PgfSymbol*) (((uint8_t*) sks)+sizeof(PgfSymbolKS)+tok_len+1)) = *psym;
-	*psym = new_sym;
-}
-
-PgfSymbol
-pgf_collect_extern_tok(PgfParsing* ps, size_t start, size_t end)
+static PgfSymbol
+pgf_collect_extern_tok(PgfParsing* ps, size_t start_offset, size_t end_offset)
 {
 	PgfSymbol sym = gu_null_variant;

-	size_t offset = start;
-	while (offset < end) {
+	const uint8_t* start = (uint8_t*) ps->sentence+start_offset;
+	const uint8_t* end   = (uint8_t*) ps->sentence+end_offset;
+
+	const uint8_t* p = start;
+	GuUCS ucs = gu_utf8_decode(&p);
+	while (start < end) {
 		size_t len = 0;
-		while (!gu_is_space(ps->sentence[offset+len])) {
-			len++;
+		while (p <= end && !gu_ucs_is_space(ucs)) {
+			len = (p - start);
+			ucs = gu_utf8_decode(&p);
 		}

-		PgfToken tok = gu_malloc(ps->pool, len+1);
-		memcpy((char*) tok, ps->sentence+offset, len);
-		((char*) tok)[len] = 0;
+		PgfSymbol new_sym;
+		PgfSymbolKS* sks = (PgfSymbolKS*)
+			gu_alloc_variant(PGF_SYMBOL_KS,
+			             sizeof(PgfSymbol)+sizeof(PgfSymbolKS)+len+1,
+			             gu_alignof(PgfSymbolKS),
+			             &new_sym, ps->pool);
+		memcpy((char*) sks->token, start, len);
+		((char*) sks->token)[len] = 0;
+		*((PgfSymbol*) (((uint8_t*) sks)+sizeof(PgfSymbolKS)+len+1)) = sym;
+		sym = new_sym;

-		pgf_add_extern_tok(&sym, tok, ps->pool);
-
-		offset  += len;
-		while (gu_is_space(ps->sentence[offset]))
-			offset++;
+		start = p;
+		while (gu_ucs_is_space(ucs)) {
+			start = p;
+			ucs = gu_utf8_decode(&p);
+		}
 	}

 	return sym;
@@ -504,11 +502,11 @@ skip_space(GuString* psent, size_t* plen)
 	if (*plen == 0)
 		return false;

-	char c = **psent;
-	if (!gu_is_space(c))
+	const uint8_t* p = (uint8_t*) *psent;
+	if (!gu_ucs_is_space(gu_utf8_decode(&p)))
 		return false;

-	(*psent)++;
+	*psent = (GuString) p;
 	return true;
 }

@@ -2056,24 +2054,22 @@ pgf_parsing_last_token(PgfParsing* ps, GuPool* pool)
 	if (ps->before == NULL)
 		return "";

-	size_t start = ps->before->end_offset;
-	while (start > 0) {
-		char c = ps->sentence[start-1];
-		if (gu_is_space(c))
-			break;
-		start--;
+	const uint8_t* start = (uint8_t*) ps->sentence;
+	const uint8_t* end   = (uint8_t*) ps->sentence + ps->before->end_offset;
+
+	const uint8_t* p = start;
+	while (p < end) {
+		if (gu_ucs_is_space(gu_utf8_decode(&p))) {
+			start = p;
+		}
 	}

-	size_t end = ps->before->end_offset;
-	while (ps->sentence[end] != 0) {
-		char c = ps->sentence[end];
-		if (gu_is_space(c))
-			break;
-		end++;
+	while (*p && !gu_ucs_is_space(gu_utf8_decode(&p))) {
+		end = p;
 	}

 	char* tok = gu_malloc(pool, end-start+1);
-	memcpy(tok, ps->sentence+start, (end-start));
+	memcpy(tok, start, (end-start));
 	tok[end-start] = 0;
 	return tok;
 }