the parser is now fully Unicode compatible

2015-05-08 09:23:29 +00:00
parent b961e9a255
commit 365c7bb1d8
3 changed files with 79 additions and 86 deletions
--- a/src/runtime/c/gu/utf8.h
+++ b/src/runtime/c/gu/utf8.h
@@ -40,10 +40,4 @@ gu_utf8_encode(GuUCS ucs, uint8_t** buf);
 void
 gu_in_utf8_buf(uint8_t** buf, GuIn* in, GuExn* err);
 /* Report whether the byte `c` is one of the six ASCII whitespace
  * characters: tab, newline, vertical tab, form feed, carriage return
  * or space. Any other byte value yields false. */
 static inline bool
 gu_is_space(uint8_t c) {
 	switch (c) {
 	case '\t':
 	case '\n':
 	case '\v':
 	case '\f':
 	case '\r':
 	case ' ':
 		return true;
 	default:
 		return false;
 	}
 }
 #endif // GU_UTF8_H_
--- a/src/runtime/c/pgf/literals.c
+++ b/src/runtime/c/pgf/literals.c
@@ -12,11 +12,13 @@ pgf_match_string_lit(PgfLiteralCallback* self,
 {
 	gu_assert(lin_idx == 0);
-	size_t offset = *poffset;
+	const uint8_t* buf = (uint8_t*) (sentence + *poffset);
-	while (sentence[offset] && !gu_is_space(sentence[offset]))
+	const uint8_t* p   = buf;
-		offset++;
+	size_t len = 0;
 	while (*p && !gu_ucs_is_space(gu_utf8_decode(&p))) {
 		len = p - buf;
 	}
 	size_t len = offset - *poffset;
 	if (len > 0) {
 		PgfExprProb* ep = gu_new(PgfExprProb, out_pool);
 		ep->prob = 0;
@@ -31,10 +33,10 @@ pgf_match_string_lit(PgfLiteralCallback* self,
 						        PgfLiteralStr,
 						        val, len+1,
 						        &expr_lit->lit, out_pool);
-		memcpy(lit_str->val, sentence+*poffset, len);
+		memcpy(lit_str->val, buf, len);
 		lit_str->val[len] = 0;
-		*poffset = offset;
+		*poffset += len;
 		return ep;
 	} else {
 		return NULL;
@@ -71,15 +73,17 @@ pgf_match_int_lit(PgfLiteralCallback* self,
 {
 	gu_assert(lin_idx == 0);
-	size_t offset = *poffset;
+	const uint8_t* buf = (uint8_t*) (sentence + *poffset);
-	while (sentence[offset] && !gu_is_space(sentence[offset]))
+	const uint8_t* p   = buf;
-		offset++;
+	size_t len = 0;
 	while (*p && !gu_ucs_is_space(gu_utf8_decode(&p))) {
 		len = p - buf;
 	}
 	size_t len = offset - *poffset;
 	if (len > 0) {
 		GuPool* tmp_pool = gu_local_pool();
 		PgfToken tok = gu_malloc(tmp_pool, len+1);
-		memcpy((char*) tok, sentence+*poffset, len);
+		memcpy((char*) tok, buf, len);
 		((char*) tok)[len] = 0;
 		int val;
@@ -103,7 +107,7 @@ pgf_match_int_lit(PgfLiteralCallback* self,
 						   &expr_lit->lit, out_pool);
 		lit_int->val = val;
-		*poffset = offset;
+		*poffset += len;
 		return ep;
 	} else {
 		return NULL;
@@ -123,15 +127,17 @@ pgf_match_float_lit(PgfLiteralCallback* self,
 {
 	gu_assert(lin_idx == 0);
-	size_t offset = *poffset;
+	const uint8_t* buf = (uint8_t*) (sentence + *poffset);
-	while (sentence[offset] && !gu_is_space(sentence[offset]))
+	const uint8_t* p   = buf;
-		offset++;
+	size_t len = 0;
 	while (*p && !gu_ucs_is_space(gu_utf8_decode(&p))) {
 		len = p - buf;
 	}
 	size_t len = offset - *poffset;
 	if (len > 0) {
 		GuPool* tmp_pool = gu_local_pool();
 		PgfToken tok = gu_malloc(tmp_pool, len+1);
-		memcpy((char*) tok, sentence+*poffset, len);
+		memcpy((char*) tok, buf, len);
 		((char*) tok)[len] = 0;
 		double val;
@@ -155,7 +161,7 @@ pgf_match_float_lit(PgfLiteralCallback* self,
 						   &expr_lit->lit, out_pool);
 		lit_flt->val = val;
-		*poffset = offset;
+		*poffset += len;
 		return ep;
 	} else {
 		return NULL;
@@ -181,30 +187,27 @@ pgf_match_name_lit(PgfLiteralCallback* self,
 	GuOut* out = gu_string_buf_out(sbuf);
 	GuExn* err = gu_new_exn(tmp_pool);
-	size_t offset = *poffset;
+	const uint8_t* buf = (uint8_t*) (sentence + *poffset);
 	const uint8_t* p   = buf;
 	int i = 0;
-	while (iswupper(sentence[offset])) {
+	GuUCS ucs = gu_utf8_decode(&p);
-		size_t len = 0;
+	while (gu_ucs_is_upper(ucs)) {
 		while (!gu_is_space(sentence[offset+len])) {
 			len++;
 		}
 		PgfToken tok = gu_malloc(tmp_pool, len+1);
 		memcpy((char*) tok, sentence+offset, len);
 		((char*) tok)[len] = 0;
 		if (i > 0)
 		  gu_putc(' ', out, err);
-		gu_string_write(tok, out, err);
+		gu_out_utf8(ucs, out, err);
-		
+		ucs = gu_utf8_decode(&p);
 		while (ucs != 0 && !gu_ucs_is_space(ucs)) {
 			gu_out_utf8(ucs, out, err);
 			*poffset = p - ((uint8_t*) sentence);
 			ucs = gu_utf8_decode(&p);
 		}
 		i++;
-		offset  += len;
+		while (gu_ucs_is_space(ucs))
-		*poffset = offset;
+			ucs = gu_utf8_decode(&p);
 		while (gu_is_space(sentence[offset]))
 			offset++;
 	}
 	PgfExprProb* ep = NULL;
--- a/src/runtime/c/pgf/parser.c
+++ b/src/runtime/c/pgf/parser.c
@@ -145,41 +145,39 @@ pgf_prev_extern_sym(PgfSymbol sym)
 	}
 }
-static void
+static PgfSymbol
-pgf_add_extern_tok(PgfSymbol* psym, PgfToken tok, GuPool* pool) {
+pgf_collect_extern_tok(PgfParsing* ps, size_t start_offset, size_t end_offset)
 	PgfSymbol new_sym;
 	size_t tok_len = strlen(tok);
 	PgfSymbolKS* sks = (PgfSymbolKS*)
 		gu_alloc_variant(PGF_SYMBOL_KS,
 						 sizeof(PgfSymbol)+sizeof(PgfSymbolKS)+tok_len+1,
 						 gu_alignof(PgfSymbolKS),
 						 &new_sym, pool);
 	strcpy(sks->token, tok);
 	*((PgfSymbol*) (((uint8_t*) sks)+sizeof(PgfSymbolKS)+tok_len+1)) = *psym;
 	*psym = new_sym;
 }
 PgfSymbol
 pgf_collect_extern_tok(PgfParsing* ps, size_t start, size_t end)
 {
 	PgfSymbol sym = gu_null_variant;
-	size_t offset = start;
+	const uint8_t* start = (uint8_t*) ps->sentence+start_offset;
-	while (offset < end) {
+	const uint8_t* end   = (uint8_t*) ps->sentence+end_offset;
 	const uint8_t* p = start;
 	GuUCS ucs = gu_utf8_decode(&p);
 	while (start < end) {
 		size_t len = 0;
-		while (!gu_is_space(ps->sentence[offset+len])) {
+		while (p <= end && !gu_ucs_is_space(ucs)) {
-			len++;
+			len = (p - start);
 			ucs = gu_utf8_decode(&p);
 		}
-		PgfToken tok = gu_malloc(ps->pool, len+1);
+		PgfSymbol new_sym;
-		memcpy((char*) tok, ps->sentence+offset, len);
+		PgfSymbolKS* sks = (PgfSymbolKS*)
-		((char*) tok)[len] = 0;
+			gu_alloc_variant(PGF_SYMBOL_KS,
 			             sizeof(PgfSymbol)+sizeof(PgfSymbolKS)+len+1,
 			             gu_alignof(PgfSymbolKS),
 			             &new_sym, ps->pool);
 		memcpy((char*) sks->token, start, len);
 		((char*) sks->token)[len] = 0;
 		*((PgfSymbol*) (((uint8_t*) sks)+sizeof(PgfSymbolKS)+len+1)) = sym;
 		sym = new_sym;
-		pgf_add_extern_tok(&sym, tok, ps->pool);
+		start = p;
-
+		while (gu_ucs_is_space(ucs)) {
-		offset  += len;
+			start = p;
-		while (gu_is_space(ps->sentence[offset]))
+			ucs = gu_utf8_decode(&p);
-			offset++;
+		}
 	}
 	return sym;
@@ -504,11 +502,11 @@ skip_space(GuString* psent, size_t* plen)
 	if (*plen == 0)
 		return false;
-	char c = **psent;
+	const uint8_t* p = (uint8_t*) *psent;
-	if (!gu_is_space(c))
+	if (!gu_ucs_is_space(gu_utf8_decode(&p)))
 		return false;
-	(*psent)++;
+	*psent = (GuString) p;
 	return true;
 }
@@ -2056,24 +2054,22 @@ pgf_parsing_last_token(PgfParsing* ps, GuPool* pool)
 	if (ps->before == NULL)
 		return "";
-	size_t start = ps->before->end_offset;
+	const uint8_t* start = (uint8_t*) ps->sentence;
-	while (start > 0) {
+	const uint8_t* end   = (uint8_t*) ps->sentence + ps->before->end_offset;
-		char c = ps->sentence[start-1];
+
-		if (gu_is_space(c))
+	const uint8_t* p = start;
-			break;
+	while (p < end) {
-		start--;
+		if (gu_ucs_is_space(gu_utf8_decode(&p))) {
 			start = p;
 		}
 	}
-	size_t end = ps->before->end_offset;
+	while (*p && !gu_ucs_is_space(gu_utf8_decode(&p))) {
-	while (ps->sentence[end] != 0) {
+		end = p;
 		char c = ps->sentence[end];
 		if (gu_is_space(c))
 			break;
 		end++;
 	}
 	char* tok = gu_malloc(pool, end-start+1);
-	memcpy(tok, ps->sentence+start, (end-start));
+	memcpy(tok, start, (end-start));
 	tok[end-start] = 0;
 	return tok;
 }