the GF syntax for identifiers is exteded with quoted forms, i.e. you could write for instance 'ab.c' and then everything between the quites is identifier. This includes Unicode characters and non-ASCII symbols. This is useful for automatically generated GF grammars.

2026-04-23 19:42:50 -06:00 · 2013-11-22 13:30:18 +00:00
parent cddc19a5ad
commit 4254b101f4
8 changed files with 165 additions and 54 deletions
--- a/gf.cabal
+++ b/gf.cabal
@@ -63,6 +63,7 @@ Library
                 fst,
                 containers,
                 bytestring,
                 utf8-string,
                 random,
                 pretty,
                 mtl
--- a/src/compiler/GF/Grammar/Lexer.hs
+++ b/src/compiler/GF/Grammar/Lexer.hs
--- a/src/compiler/GF/Grammar/Parser.y
+++ b/src/compiler/GF/Grammar/Parser.y
@@ -103,7 +103,6 @@ import Data.Char(toLower)
 Integer       { (T_Integer $$) }
 Double        { (T_Double  $$) }
 String        { (T_String  $$) }
 LString       { (T_LString $$) }
 Ident         { (T_Ident   $$) }
@@ -457,7 +456,6 @@ Exp6
  | '{' ListLocDef '}'    {% mkR $2 }
  | '<' ListTupleComp '>' { R (tuple2record $2) }
  | '<' Exp ':' Exp '>'   { Typed $2 $4      }
  | LString               { K $1 }
  | '(' Exp ')'           { $2 }
 ListExp :: { [Term] }
--- a/src/compiler/GF/Grammar/lexer/Lexer.x
+++ b/src/compiler/GF/Grammar/lexer/Lexer.x
@@ -1,5 +1,3 @@
 -- -*- haskell -*-
 -- This Alex file was machine-generated by the BNF converter
 {
 module GF.Grammar.Lexer
         ( Token(..), Posn(..)
@@ -8,19 +6,18 @@ module GF.Grammar.Lexer
         ) where
 import GF.Infra.Ident
 import GF.Data.Operations
 import qualified Data.ByteString.Char8 as BS
 import qualified Data.Map as Map
 }
-$l = [a-zA-Z\192 - \255] # [\215 \247]    -- isolatin1 letter FIXME
+$l = [a-zA-Z\192 - \255] # [\215 \247]
-$c = [A-Z\192-\221] # [\215]    -- capital isolatin1 letter FIXME
+$c = [A-Z\192-\221] # [\215]
-$s = [a-z\222-\255] # [\247]    -- small isolatin1 letter FIXME
+$s = [a-z\222-\255] # [\247]
 $d = [0-9]                -- digit
 $i = [$l $d _ ']          -- identifier character
-$u = [\0-\255]          -- universal: any character
+$u = [\0-\255]            -- universal: any character
@rsyms =    -- symbols and non-identifier-like reserved words
   \; | \= | \{ | \} | \( | \) | \~ | \* \* | \: | \- \> | \, | \[ | \] | \- | \. | \| | \% | \? | \< | \> | \@ | \# | \! | \* | \+ | \+ \+ | \\ | \\\\ | \= \> | \_ | \$ | \/
@@ -31,7 +28,7 @@ $u = [\0-\255]          -- universal: any character
 $white+ ;
@rsyms                          { tok (eitherResIdent (T_Ident . identC . rawIdentC)) }
-\' ($u # \')* \'                { tok (eitherResIdent (T_LString . BS.unpack)) }
+\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (eitherResIdent (T_Ident . identC . rawIdentS . unescapeInitTail . BS.unpack)) }
 (\_ | $l)($l | $d | \_ | \')*   { tok (eitherResIdent (T_Ident . identC . rawIdentC)) }
 \" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t)))* \" { tok (T_String . unescapeInitTail . BS.unpack) }
@@ -115,7 +112,6 @@ data Token
 | T_String  String          -- string literals
 | T_Integer Int             -- integer literals
 | T_Double  Double          -- double precision float literals
 | T_LString String
 | T_Ident   Ident
 | T_EOF
@@ -207,6 +203,7 @@ unescapeInitTail = unesc . tail where
    '\\':'n':cs  -> '\n' : unesc cs
    '\\':'t':cs  -> '\t' : unesc cs
    '"':[]    -> []
    '\'':[]    -> []
    c:cs      -> c : unesc cs
    _         -> []
--- a/src/runtime/c/pgf/expr.c
+++ b/src/runtime/c/pgf/expr.c
@@ -1,5 +1,6 @@
 #include "pgf.h"
 #include <gu/assert.h>
 #include <gu/utf8.h>
 #include <ctype.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -166,6 +167,45 @@ pgf_expr_parser_getc(PgfExprParser* parser)
 	}
 }
 static bool
 pgf_is_ident_first(GuUCS ucs)
 {
 	return (ucs == '_') ||
           (ucs >= 'a' && ucs <= 'z') ||
           (ucs >= 'A' && ucs <= 'Z') ||
           (ucs >= 192 && ucs <= 255 && ucs != 247 && ucs != 215);
 }
 static bool
 pgf_is_ident_rest(GuUCS ucs)
 {
 	return (ucs == '_') ||
           (ucs == '\'') ||
           (ucs >= '0' && ucs <= '9') ||
           (ucs >= 'a' && ucs <= 'z') ||
           (ucs >= 'A' && ucs <= 'Z') ||
           (ucs >= 192 && ucs <= 255 && ucs != 247 && ucs != 215);
 }
 static bool
 pgf_is_normal_ident(PgfCId id)
 {
 	const uint8_t* p = (const uint8_t*) id;
 	GuUCS ucs = gu_utf8_decode(&p);
 	if (!pgf_is_ident_first(ucs))
 		return false;
 	for (;;) {
 		ucs = gu_utf8_decode(&p);
 		if (ucs == 0)
 			break;
 		if (!pgf_is_ident_rest(ucs))
 			return false;
 	}
 	return true;
 }
 static void
 pgf_expr_parser_token(PgfExprParser* parser)
 {
@@ -227,20 +267,32 @@ pgf_expr_parser_token(PgfExprParser* parser)
 		pgf_expr_parser_getc(parser);
 		parser->token_tag = PGF_TOKEN_COLON;
 		break;
-	case '_':
+	case '\'':
 		pgf_expr_parser_getc(parser);
-		parser->token_tag = PGF_TOKEN_WILD;
+
 		GuBuf* chars = gu_new_buf(char, parser->tmp_pool);
 		while (parser->ch != '\'' && parser->ch != EOF) {
 			if (parser->ch == '\\') {
 				pgf_expr_parser_getc(parser);
 			}
 			gu_buf_push(chars, char, parser->ch);
 			pgf_expr_parser_getc(parser);
 		}
 		if (parser->ch == '\'') {
 			pgf_expr_parser_getc(parser);
 			gu_buf_push(chars, char, 0);
 			parser->token_tag   = PGF_TOKEN_IDENT;
 			parser->token_value = chars;
 		}
 		break;
 	default: {
 		GuBuf* chars = gu_new_buf(char, parser->tmp_pool);
-		if (isalpha(parser->ch)) {
+		if (pgf_is_ident_first(parser->ch)) {
-			while (isalnum(parser->ch) || 
+			do {
 			       parser->ch == '_' || 
 			       parser->ch == '\'') {
 				gu_buf_push(chars, char, parser->ch);
 				pgf_expr_parser_getc(parser);
-			}
+			} while (pgf_is_ident_rest(parser->ch));
 			gu_buf_push(chars, char, 0);
 			parser->token_tag   = PGF_TOKEN_IDENT;
 			parser->token_value = chars;
@@ -268,7 +320,7 @@ pgf_expr_parser_token(PgfExprParser* parser)
 			}
 		} else if (parser->ch == '"') {
 			pgf_expr_parser_getc(parser);
-			
+
 			while (parser->ch != '"' && parser->ch != EOF) {
 				gu_buf_push(chars, char, parser->ch);
 				pgf_expr_parser_getc(parser);
@@ -924,6 +976,30 @@ pgf_expr_hash(GuHash h, PgfExpr e)
 	return h;
 }
 void
 pgf_print_cid(PgfCId id,
 			  GuOut* out, GuExn* err)
 {
 	if (pgf_is_normal_ident(id))
 		gu_string_write(id, out, err);
 	else {
 		gu_putc('\'', out, err);
 		const uint8_t* p = (const uint8_t*) id;
 		for (;;) {
 			GuUCS ucs = gu_utf8_decode(&p);
 			if (ucs == 0)
 				break;
 			if (ucs == '\'')
 				gu_puts("\\\'", out, err);
 			else if (ucs == '\\')
 				gu_puts("\\\\", out, err);
 			else
 				gu_out_utf8(ucs, out, err);
 		}
 		gu_putc('\'', out, err);
 	}
 }
 void
 pgf_print_literal(PgfLiteral lit,
 			      GuOut* out, GuExn* err)
@@ -973,7 +1049,7 @@ pgf_print_expr(PgfExpr expr, PgfPrintContext* ctxt, int prec,
 			if (abs->bind_type == PGF_BIND_TYPE_IMPLICIT) {
 				gu_putc('{', out, err);
 			}
-			gu_string_write(abs->id, out, err);
+			pgf_print_cid(abs->id, out, err);
 			if (abs->bind_type == PGF_BIND_TYPE_IMPLICIT) {
 				gu_putc('}', out, err);
 			}
@@ -1028,7 +1104,7 @@ pgf_print_expr(PgfExpr expr, PgfPrintContext* ctxt, int prec,
 		break;
 	case PGF_EXPR_FUN: {
 		PgfExprFun* fun = ei.data;
-		gu_string_write(fun->fun, out, err);
+		pgf_print_cid(fun->fun, out, err);
 		break;
 	}
 	case PGF_EXPR_VAR: {
@@ -1043,7 +1119,7 @@ pgf_print_expr(PgfExpr expr, PgfPrintContext* ctxt, int prec,
 		if (c == NULL) {
 			gu_printf(out, err, "#%d", evar->var);
 		} else {
-			gu_string_write(c->name, out, err);
+			pgf_print_cid(c->name, out, err);
 		}
 		break;
 	}
@@ -1074,7 +1150,7 @@ pgf_print_hypo(PgfHypo *hypo, PgfPrintContext* ctxt, int prec,
 {
    if (hypo->bind_type == PGF_BIND_TYPE_IMPLICIT) {
        gu_puts("({", out, err);
-        gu_string_write(hypo->cid, out, err);
+        pgf_print_cid(hypo->cid, out, err);
        gu_puts("} : ", out, err);
        pgf_print_type(hypo->type, ctxt, 0, out, err);
        gu_puts(")", out, err);
@@ -1083,7 +1159,7 @@ pgf_print_hypo(PgfHypo *hypo, PgfPrintContext* ctxt, int prec,
        if (strcmp(hypo->cid, "_") != 0) {
            gu_puts("(", out, err);
-            gu_string_write(hypo->cid, out, err);
+            pgf_print_cid(hypo->cid, out, err);
            gu_puts(" : ", out, err);
            pgf_print_type(hypo->type, ctxt, 0, out, err);
            gu_puts(")", out, err);
@@ -1117,7 +1193,7 @@ pgf_print_type(PgfType *type, PgfPrintContext* ctxt, int prec,
 			gu_puts(" -> ", out, err);
 		}
-		gu_string_write(type->cid, out, err);
+		pgf_print_cid(type->cid, out, err);
 		for (size_t i = 0; i < type->n_exprs; i++) {
 			gu_puts(" ", out, err);
@@ -1143,7 +1219,7 @@ pgf_print_type(PgfType *type, PgfPrintContext* ctxt, int prec,
 		if (prec > 3) gu_putc(')', out, err);
 	} else {
-		gu_string_write(type->cid, out, err);
+		pgf_print_cid(type->cid, out, err);
 	}
 }
--- a/src/runtime/c/pgf/expr.h
+++ b/src/runtime/c/pgf/expr.h
@@ -176,6 +176,9 @@ struct PgfPrintContext {
 	PgfPrintContext* next;
 };
 void
 pgf_print_cid(PgfCId id, GuOut* out, GuExn* err);
 void
 pgf_print_literal(PgfLiteral lit, GuOut* out, GuExn* err);
--- a/src/runtime/c/pgf/printer.c
+++ b/src/runtime/c/pgf/printer.c
@@ -16,7 +16,7 @@ pgf_print_flag(GuMapItor* fn, const void* key, void* value,
    GuOut *out = clo->out;
    gu_puts("  flag ", out, err);
-    gu_string_write(flag, out, err);
+    pgf_print_cid(flag, out, err);
    gu_puts(" = ", out, err);
    pgf_print_literal(lit, out, err);
    gu_puts(";\n", out, err);
@@ -32,7 +32,7 @@ pgf_print_cat(GuMapItor* fn, const void* key, void* value,
    GuOut *out = clo->out;
    gu_puts("  cat ", out, err);
-    gu_string_write(name, out, err);
+    pgf_print_cid(name, out, err);
 	PgfPrintContext* ctxt = NULL;
    size_t n_hypos = gu_seq_length(cat->context);
@@ -61,7 +61,7 @@ pgf_print_absfun(GuMapItor* fn, const void* key, void* value,
    GuOut *out = clo->out;
    gu_puts((fun->defns == NULL) ? "  data " : "  fun ", out, err);
-    gu_string_write(name, out, err);
+    pgf_print_cid(name, out, err);
    gu_puts(" : ", out, err);
    pgf_print_type(fun->type, NULL, 0, out, err);
    gu_printf(out, err, " ;   -- %f\n", fun->ep.prob);
@@ -70,7 +70,7 @@ static void
 pgf_print_abstract(PgfAbstr* abstr, GuOut* out, GuExn* err)
 {
 	gu_puts("abstract ", out, err);
-	gu_string_write(abstr->name, out, err);
+	pgf_print_cid(abstr->name, out, err);
 	gu_puts(" {\n", out, err);
 	PgfPrintFn clo1 = { { pgf_print_flag }, out };
@@ -205,7 +205,7 @@ pgf_print_cncfun(PgfCncFun *cncfun, PgfSequences* sequences,
 	if (cncfun->absfun != NULL) {
 		gu_puts(" [", out, err);
-		gu_string_write(cncfun->absfun->name, out, err);
+		pgf_print_cid(cncfun->absfun->name, out, err);
 		gu_puts("]", out, err);
 	}
@@ -311,7 +311,7 @@ pgf_print_cnccat(GuMapItor* fn, const void* key, void* value,
    GuOut *out = clo->out;
    gu_puts("    ", out, err);
-    gu_string_write(name, out, err);
+    pgf_print_cid(name, out, err);
    gu_puts(" :=\n", out, err);
    PgfCCat *start = gu_seq_get(cnccat->cats, PgfCCat*, 0);
@@ -335,7 +335,7 @@ pgf_print_concrete(PgfCId cncname, PgfConcr* concr,
                   GuOut* out, GuExn* err)
 {
 	gu_puts("concrete ", out, err);
-	gu_string_write(cncname, out, err);
+	pgf_print_cid(cncname, out, err);
 	gu_puts(" {\n", out, err);
 	PgfPrintFn clo1 = { { pgf_print_flag }, out };
--- a/src/runtime/haskell/PGF/CId.hs
+++ b/src/runtime/haskell/PGF/CId.hs
@@ -7,6 +7,7 @@ module PGF.CId (CId(..),
 import Control.Monad
 import qualified Data.ByteString.Char8 as BS
 import qualified Data.ByteString.UTF8 as UTF8
 import Data.Char
 import qualified Text.ParserCombinators.ReadP as RP
 import qualified Text.PrettyPrint as PP
@@ -21,7 +22,7 @@ wildCId = CId (BS.singleton '_')
 -- | Creates a new identifier from 'String'
 mkCId :: String -> CId
-mkCId s = CId (BS.pack s)
+mkCId s = CId (UTF8.fromString s)
 bsCId = CId
@@ -33,7 +34,18 @@ readCId s = case [x | (x,cs) <- RP.readP_to_S pCId s, all isSpace cs] of
 -- | Renders the identifier as 'String'
 showCId :: CId -> String
-showCId (CId x) = BS.unpack x
+showCId (CId x) = 
  let raw = UTF8.toString x
  in if isIdent raw
       then raw
       else "'" ++ concatMap escape raw ++ "'"
  where
    isIdent []     = False
    isIdent (c:cs) = isIdentFirst c && all isIdentRest cs
    escape '\'' = "\\\'"
    escape '\\' = "\\\\"
    escape c    = [c]
 instance Show CId where
    showsPrec _ = showString . showCId
@@ -48,10 +60,35 @@ pCId = do s <- pIdent
            else return (mkCId s)
 pIdent :: RP.ReadP String
-pIdent = liftM2 (:) (RP.satisfy isIdentFirst) (RP.munch isIdentRest)
+pIdent = 
-  where
+  liftM2 (:) (RP.satisfy isIdentFirst) (RP.munch isIdentRest)
-    isIdentFirst c = c == '_' || isLetter c
+  `mplus`
-    isIdentRest c = c == '_' || c == '\'' || isAlphaNum c
+  do RP.char '\''
     cs <- RP.many1 insideChar
     RP.char '\''
     return cs
 --  where
 insideChar = RP.readS_to_P $ \s ->
  case s of
 	[]             -> []
 	('\\':'\\':cs) -> [('\\',cs)]
 	('\\':'\'':cs) -> [('\'',cs)]
 	('\\':cs)      -> []
 	('\'':cs)      -> []
 	(c:cs)         -> [(c,cs)]
 isIdentFirst c =
  (c == '_') ||
  (c >= 'a' && c <= 'z') ||
  (c >= 'A' && c <= 'Z') ||
  (c >= '\192' && c <= '\255' && c /= '\247' && c /= '\215')
 isIdentRest c = 
  (c == '_') ||
  (c == '\'') ||
  (c >= '0' && c <= '9') ||
  (c >= 'a' && c <= 'z') ||
  (c >= 'A' && c <= 'Z') ||
  (c >= '\192' && c <= '\255' && c /= '\247' && c /= '\215')
 ppCId :: CId -> PP.Doc
 ppCId = PP.text . showCId