The GF syntax for identifiers is extended with quoted forms, i.e. you can write, for instance, 'ab.c', and everything between the quotes is then the identifier. This includes Unicode characters and non-ASCII symbols. This is useful for automatically generated GF grammars.

2026-06-21 01:06:14 -06:00 · 2013-11-22 13:30:18 +00:00
parent 1d2786f7da
commit 8bcc70eac8
8 changed files with 165 additions and 54 deletions
--- a/src/compiler/GF/Grammar/Lexer.hs
+++ b/src/compiler/GF/Grammar/Lexer.hs
--- a/src/compiler/GF/Grammar/Parser.y
+++ b/src/compiler/GF/Grammar/Parser.y
@@ -103,7 +103,6 @@ import Data.Char(toLower)
 Integer       { (T_Integer $$) }
 Double        { (T_Double  $$) }
 String        { (T_String  $$) }
-LString       { (T_LString $$) }
 Ident         { (T_Ident   $$) }


@@ -457,7 +456,6 @@ Exp6
  | '{' ListLocDef '}'    {% mkR $2 }
  | '<' ListTupleComp '>' { R (tuple2record $2) }
  | '<' Exp ':' Exp '>'   { Typed $2 $4      }
-  | LString               { K $1 }
  | '(' Exp ')'           { $2 }

 ListExp :: { [Term] }
--- a/src/compiler/GF/Grammar/lexer/Lexer.x
+++ b/src/compiler/GF/Grammar/lexer/Lexer.x
@@ -1,5 +1,3 @@
-- -*- haskell -*-
-- This Alex file was machine-generated by the BNF converter
 {
 module GF.Grammar.Lexer
         ( Token(..), Posn(..)
@@ -8,19 +6,18 @@ module GF.Grammar.Lexer
         ) where

 import GF.Infra.Ident
-import GF.Data.Operations
 import qualified Data.ByteString.Char8 as BS
 import qualified Data.Map as Map

 }


-$l = [a-zA-Z\192 - \255] # [\215 \247]    -- isolatin1 letter FIXME
-$c = [A-Z\192-\221] # [\215]    -- capital isolatin1 letter FIXME
-$s = [a-z\222-\255] # [\247]    -- small isolatin1 letter FIXME
+$l = [a-zA-Z\192 - \255] # [\215 \247]
+$c = [A-Z\192-\221] # [\215]
+$s = [a-z\222-\255] # [\247]
 $d = [0-9]                -- digit
 $i = [$l $d _ ']          -- identifier character
-$u = [\0-\255]          -- universal: any character
+$u = [\0-\255]            -- universal: any character

@rsyms =    -- symbols and non-identifier-like reserved words
   \; | \= | \{ | \} | \( | \) | \~ | \* \* | \: | \- \> | \, | \[ | \] | \- | \. | \| | \% | \? | \< | \> | \@ | \# | \! | \* | \+ | \+ \+ | \\ | \\\\ | \= \> | \_ | \$ | \/
@@ -31,7 +28,7 @@ $u = [\0-\255]          -- universal: any character

 $white+ ;
@rsyms                          { tok (eitherResIdent (T_Ident . identC . rawIdentC)) }
-\' ($u # \')* \'                { tok (eitherResIdent (T_LString . BS.unpack)) }
+\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (eitherResIdent (T_Ident . identC . rawIdentS . unescapeInitTail . BS.unpack)) }
 (\_ | $l)($l | $d | \_ | \')*   { tok (eitherResIdent (T_Ident . identC . rawIdentC)) }

 \" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t)))* \" { tok (T_String . unescapeInitTail . BS.unpack) }
@@ -115,7 +112,6 @@ data Token
 | T_String  String          -- string literals
 | T_Integer Int             -- integer literals
 | T_Double  Double          -- double precision float literals
- | T_LString String
 | T_Ident   Ident
 | T_EOF

@@ -207,6 +203,7 @@ unescapeInitTail = unesc . tail where
    '\\':'n':cs  -> '\n' : unesc cs
    '\\':'t':cs  -> '\t' : unesc cs
    '"':[]    -> []
+    '\'':[]    -> []
    c:cs      -> c : unesc cs
    _         -> []