the GF syntax for identifiers is exteded with quoted forms, i.e. you could write for instance 'ab.c' and then everything between the quites is identifier. This includes Unicode characters and non-ASCII symbols. This is useful for automatically generated GF grammars.

This commit is contained in:
kr.angelov
2013-11-22 13:30:18 +00:00
parent cddc19a5ad
commit 4254b101f4
8 changed files with 165 additions and 54 deletions

View File

@@ -63,6 +63,7 @@ Library
fst, fst,
containers, containers,
bytestring, bytestring,
utf8-string,
random, random,
pretty, pretty,
mtl mtl

File diff suppressed because one or more lines are too long

View File

@@ -103,7 +103,6 @@ import Data.Char(toLower)
Integer { (T_Integer $$) } Integer { (T_Integer $$) }
Double { (T_Double $$) } Double { (T_Double $$) }
String { (T_String $$) } String { (T_String $$) }
LString { (T_LString $$) }
Ident { (T_Ident $$) } Ident { (T_Ident $$) }
@@ -457,7 +456,6 @@ Exp6
| '{' ListLocDef '}' {% mkR $2 } | '{' ListLocDef '}' {% mkR $2 }
| '<' ListTupleComp '>' { R (tuple2record $2) } | '<' ListTupleComp '>' { R (tuple2record $2) }
| '<' Exp ':' Exp '>' { Typed $2 $4 } | '<' Exp ':' Exp '>' { Typed $2 $4 }
| LString { K $1 }
| '(' Exp ')' { $2 } | '(' Exp ')' { $2 }
ListExp :: { [Term] } ListExp :: { [Term] }

View File

@@ -1,5 +1,3 @@
-- -*- haskell -*-
-- This Alex file was machine-generated by the BNF converter
{ {
module GF.Grammar.Lexer module GF.Grammar.Lexer
( Token(..), Posn(..) ( Token(..), Posn(..)
@@ -8,19 +6,18 @@ module GF.Grammar.Lexer
) where ) where
import GF.Infra.Ident import GF.Infra.Ident
import GF.Data.Operations
import qualified Data.ByteString.Char8 as BS import qualified Data.ByteString.Char8 as BS
import qualified Data.Map as Map import qualified Data.Map as Map
} }
$l = [a-zA-Z\192 - \255] # [\215 \247] -- isolatin1 letter FIXME $l = [a-zA-Z\192 - \255] # [\215 \247]
$c = [A-Z\192-\221] # [\215] -- capital isolatin1 letter FIXME $c = [A-Z\192-\221] # [\215]
$s = [a-z\222-\255] # [\247] -- small isolatin1 letter FIXME $s = [a-z\222-\255] # [\247]
$d = [0-9] -- digit $d = [0-9] -- digit
$i = [$l $d _ '] -- identifier character $i = [$l $d _ '] -- identifier character
$u = [\0-\255] -- universal: any character $u = [\0-\255] -- universal: any character
@rsyms = -- symbols and non-identifier-like reserved words @rsyms = -- symbols and non-identifier-like reserved words
\; | \= | \{ | \} | \( | \) | \~ | \* \* | \: | \- \> | \, | \[ | \] | \- | \. | \| | \% | \? | \< | \> | \@ | \# | \! | \* | \+ | \+ \+ | \\ | \\\\ | \= \> | \_ | \$ | \/ \; | \= | \{ | \} | \( | \) | \~ | \* \* | \: | \- \> | \, | \[ | \] | \- | \. | \| | \% | \? | \< | \> | \@ | \# | \! | \* | \+ | \+ \+ | \\ | \\\\ | \= \> | \_ | \$ | \/
@@ -31,7 +28,7 @@ $u = [\0-\255] -- universal: any character
$white+ ; $white+ ;
@rsyms { tok (eitherResIdent (T_Ident . identC . rawIdentC)) } @rsyms { tok (eitherResIdent (T_Ident . identC . rawIdentC)) }
\' ($u # \')* \' { tok (eitherResIdent (T_LString . BS.unpack)) } \' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (eitherResIdent (T_Ident . identC . rawIdentS . unescapeInitTail . BS.unpack)) }
(\_ | $l)($l | $d | \_ | \')* { tok (eitherResIdent (T_Ident . identC . rawIdentC)) } (\_ | $l)($l | $d | \_ | \')* { tok (eitherResIdent (T_Ident . identC . rawIdentC)) }
\" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t)))* \" { tok (T_String . unescapeInitTail . BS.unpack) } \" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t)))* \" { tok (T_String . unescapeInitTail . BS.unpack) }
@@ -115,7 +112,6 @@ data Token
| T_String String -- string literals | T_String String -- string literals
| T_Integer Int -- integer literals | T_Integer Int -- integer literals
| T_Double Double -- double precision float literals | T_Double Double -- double precision float literals
| T_LString String
| T_Ident Ident | T_Ident Ident
| T_EOF | T_EOF
@@ -207,6 +203,7 @@ unescapeInitTail = unesc . tail where
'\\':'n':cs -> '\n' : unesc cs '\\':'n':cs -> '\n' : unesc cs
'\\':'t':cs -> '\t' : unesc cs '\\':'t':cs -> '\t' : unesc cs
'"':[] -> [] '"':[] -> []
'\'':[] -> []
c:cs -> c : unesc cs c:cs -> c : unesc cs
_ -> [] _ -> []

View File

@@ -1,5 +1,6 @@
#include "pgf.h" #include "pgf.h"
#include <gu/assert.h> #include <gu/assert.h>
#include <gu/utf8.h>
#include <ctype.h> #include <ctype.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
@@ -166,6 +167,45 @@ pgf_expr_parser_getc(PgfExprParser* parser)
} }
} }
static bool
pgf_is_ident_first(GuUCS ucs)
{
return (ucs == '_') ||
(ucs >= 'a' && ucs <= 'z') ||
(ucs >= 'A' && ucs <= 'Z') ||
(ucs >= 192 && ucs <= 255 && ucs != 247 && ucs != 215);
}
static bool
pgf_is_ident_rest(GuUCS ucs)
{
return (ucs == '_') ||
(ucs == '\'') ||
(ucs >= '0' && ucs <= '9') ||
(ucs >= 'a' && ucs <= 'z') ||
(ucs >= 'A' && ucs <= 'Z') ||
(ucs >= 192 && ucs <= 255 && ucs != 247 && ucs != 215);
}
static bool
pgf_is_normal_ident(PgfCId id)
{
const uint8_t* p = (const uint8_t*) id;
GuUCS ucs = gu_utf8_decode(&p);
if (!pgf_is_ident_first(ucs))
return false;
for (;;) {
ucs = gu_utf8_decode(&p);
if (ucs == 0)
break;
if (!pgf_is_ident_rest(ucs))
return false;
}
return true;
}
static void static void
pgf_expr_parser_token(PgfExprParser* parser) pgf_expr_parser_token(PgfExprParser* parser)
{ {
@@ -227,20 +267,32 @@ pgf_expr_parser_token(PgfExprParser* parser)
pgf_expr_parser_getc(parser); pgf_expr_parser_getc(parser);
parser->token_tag = PGF_TOKEN_COLON; parser->token_tag = PGF_TOKEN_COLON;
break; break;
case '_': case '\'':
pgf_expr_parser_getc(parser); pgf_expr_parser_getc(parser);
parser->token_tag = PGF_TOKEN_WILD;
GuBuf* chars = gu_new_buf(char, parser->tmp_pool);
while (parser->ch != '\'' && parser->ch != EOF) {
if (parser->ch == '\\') {
pgf_expr_parser_getc(parser);
}
gu_buf_push(chars, char, parser->ch);
pgf_expr_parser_getc(parser);
}
if (parser->ch == '\'') {
pgf_expr_parser_getc(parser);
gu_buf_push(chars, char, 0);
parser->token_tag = PGF_TOKEN_IDENT;
parser->token_value = chars;
}
break; break;
default: { default: {
GuBuf* chars = gu_new_buf(char, parser->tmp_pool); GuBuf* chars = gu_new_buf(char, parser->tmp_pool);
if (isalpha(parser->ch)) { if (pgf_is_ident_first(parser->ch)) {
while (isalnum(parser->ch) || do {
parser->ch == '_' ||
parser->ch == '\'') {
gu_buf_push(chars, char, parser->ch); gu_buf_push(chars, char, parser->ch);
pgf_expr_parser_getc(parser); pgf_expr_parser_getc(parser);
} } while (pgf_is_ident_rest(parser->ch));
gu_buf_push(chars, char, 0); gu_buf_push(chars, char, 0);
parser->token_tag = PGF_TOKEN_IDENT; parser->token_tag = PGF_TOKEN_IDENT;
parser->token_value = chars; parser->token_value = chars;
@@ -268,7 +320,7 @@ pgf_expr_parser_token(PgfExprParser* parser)
} }
} else if (parser->ch == '"') { } else if (parser->ch == '"') {
pgf_expr_parser_getc(parser); pgf_expr_parser_getc(parser);
while (parser->ch != '"' && parser->ch != EOF) { while (parser->ch != '"' && parser->ch != EOF) {
gu_buf_push(chars, char, parser->ch); gu_buf_push(chars, char, parser->ch);
pgf_expr_parser_getc(parser); pgf_expr_parser_getc(parser);
@@ -924,6 +976,30 @@ pgf_expr_hash(GuHash h, PgfExpr e)
return h; return h;
} }
void
pgf_print_cid(PgfCId id,
GuOut* out, GuExn* err)
{
if (pgf_is_normal_ident(id))
gu_string_write(id, out, err);
else {
gu_putc('\'', out, err);
const uint8_t* p = (const uint8_t*) id;
for (;;) {
GuUCS ucs = gu_utf8_decode(&p);
if (ucs == 0)
break;
if (ucs == '\'')
gu_puts("\\\'", out, err);
else if (ucs == '\\')
gu_puts("\\\\", out, err);
else
gu_out_utf8(ucs, out, err);
}
gu_putc('\'', out, err);
}
}
void void
pgf_print_literal(PgfLiteral lit, pgf_print_literal(PgfLiteral lit,
GuOut* out, GuExn* err) GuOut* out, GuExn* err)
@@ -973,7 +1049,7 @@ pgf_print_expr(PgfExpr expr, PgfPrintContext* ctxt, int prec,
if (abs->bind_type == PGF_BIND_TYPE_IMPLICIT) { if (abs->bind_type == PGF_BIND_TYPE_IMPLICIT) {
gu_putc('{', out, err); gu_putc('{', out, err);
} }
gu_string_write(abs->id, out, err); pgf_print_cid(abs->id, out, err);
if (abs->bind_type == PGF_BIND_TYPE_IMPLICIT) { if (abs->bind_type == PGF_BIND_TYPE_IMPLICIT) {
gu_putc('}', out, err); gu_putc('}', out, err);
} }
@@ -1028,7 +1104,7 @@ pgf_print_expr(PgfExpr expr, PgfPrintContext* ctxt, int prec,
break; break;
case PGF_EXPR_FUN: { case PGF_EXPR_FUN: {
PgfExprFun* fun = ei.data; PgfExprFun* fun = ei.data;
gu_string_write(fun->fun, out, err); pgf_print_cid(fun->fun, out, err);
break; break;
} }
case PGF_EXPR_VAR: { case PGF_EXPR_VAR: {
@@ -1043,7 +1119,7 @@ pgf_print_expr(PgfExpr expr, PgfPrintContext* ctxt, int prec,
if (c == NULL) { if (c == NULL) {
gu_printf(out, err, "#%d", evar->var); gu_printf(out, err, "#%d", evar->var);
} else { } else {
gu_string_write(c->name, out, err); pgf_print_cid(c->name, out, err);
} }
break; break;
} }
@@ -1074,7 +1150,7 @@ pgf_print_hypo(PgfHypo *hypo, PgfPrintContext* ctxt, int prec,
{ {
if (hypo->bind_type == PGF_BIND_TYPE_IMPLICIT) { if (hypo->bind_type == PGF_BIND_TYPE_IMPLICIT) {
gu_puts("({", out, err); gu_puts("({", out, err);
gu_string_write(hypo->cid, out, err); pgf_print_cid(hypo->cid, out, err);
gu_puts("} : ", out, err); gu_puts("} : ", out, err);
pgf_print_type(hypo->type, ctxt, 0, out, err); pgf_print_type(hypo->type, ctxt, 0, out, err);
gu_puts(")", out, err); gu_puts(")", out, err);
@@ -1083,7 +1159,7 @@ pgf_print_hypo(PgfHypo *hypo, PgfPrintContext* ctxt, int prec,
if (strcmp(hypo->cid, "_") != 0) { if (strcmp(hypo->cid, "_") != 0) {
gu_puts("(", out, err); gu_puts("(", out, err);
gu_string_write(hypo->cid, out, err); pgf_print_cid(hypo->cid, out, err);
gu_puts(" : ", out, err); gu_puts(" : ", out, err);
pgf_print_type(hypo->type, ctxt, 0, out, err); pgf_print_type(hypo->type, ctxt, 0, out, err);
gu_puts(")", out, err); gu_puts(")", out, err);
@@ -1117,7 +1193,7 @@ pgf_print_type(PgfType *type, PgfPrintContext* ctxt, int prec,
gu_puts(" -> ", out, err); gu_puts(" -> ", out, err);
} }
gu_string_write(type->cid, out, err); pgf_print_cid(type->cid, out, err);
for (size_t i = 0; i < type->n_exprs; i++) { for (size_t i = 0; i < type->n_exprs; i++) {
gu_puts(" ", out, err); gu_puts(" ", out, err);
@@ -1143,7 +1219,7 @@ pgf_print_type(PgfType *type, PgfPrintContext* ctxt, int prec,
if (prec > 3) gu_putc(')', out, err); if (prec > 3) gu_putc(')', out, err);
} else { } else {
gu_string_write(type->cid, out, err); pgf_print_cid(type->cid, out, err);
} }
} }

View File

@@ -176,6 +176,9 @@ struct PgfPrintContext {
PgfPrintContext* next; PgfPrintContext* next;
}; };
void
pgf_print_cid(PgfCId id, GuOut* out, GuExn* err);
void void
pgf_print_literal(PgfLiteral lit, GuOut* out, GuExn* err); pgf_print_literal(PgfLiteral lit, GuOut* out, GuExn* err);

View File

@@ -16,7 +16,7 @@ pgf_print_flag(GuMapItor* fn, const void* key, void* value,
GuOut *out = clo->out; GuOut *out = clo->out;
gu_puts(" flag ", out, err); gu_puts(" flag ", out, err);
gu_string_write(flag, out, err); pgf_print_cid(flag, out, err);
gu_puts(" = ", out, err); gu_puts(" = ", out, err);
pgf_print_literal(lit, out, err); pgf_print_literal(lit, out, err);
gu_puts(";\n", out, err); gu_puts(";\n", out, err);
@@ -32,7 +32,7 @@ pgf_print_cat(GuMapItor* fn, const void* key, void* value,
GuOut *out = clo->out; GuOut *out = clo->out;
gu_puts(" cat ", out, err); gu_puts(" cat ", out, err);
gu_string_write(name, out, err); pgf_print_cid(name, out, err);
PgfPrintContext* ctxt = NULL; PgfPrintContext* ctxt = NULL;
size_t n_hypos = gu_seq_length(cat->context); size_t n_hypos = gu_seq_length(cat->context);
@@ -61,7 +61,7 @@ pgf_print_absfun(GuMapItor* fn, const void* key, void* value,
GuOut *out = clo->out; GuOut *out = clo->out;
gu_puts((fun->defns == NULL) ? " data " : " fun ", out, err); gu_puts((fun->defns == NULL) ? " data " : " fun ", out, err);
gu_string_write(name, out, err); pgf_print_cid(name, out, err);
gu_puts(" : ", out, err); gu_puts(" : ", out, err);
pgf_print_type(fun->type, NULL, 0, out, err); pgf_print_type(fun->type, NULL, 0, out, err);
gu_printf(out, err, " ; -- %f\n", fun->ep.prob); gu_printf(out, err, " ; -- %f\n", fun->ep.prob);
@@ -70,7 +70,7 @@ static void
pgf_print_abstract(PgfAbstr* abstr, GuOut* out, GuExn* err) pgf_print_abstract(PgfAbstr* abstr, GuOut* out, GuExn* err)
{ {
gu_puts("abstract ", out, err); gu_puts("abstract ", out, err);
gu_string_write(abstr->name, out, err); pgf_print_cid(abstr->name, out, err);
gu_puts(" {\n", out, err); gu_puts(" {\n", out, err);
PgfPrintFn clo1 = { { pgf_print_flag }, out }; PgfPrintFn clo1 = { { pgf_print_flag }, out };
@@ -205,7 +205,7 @@ pgf_print_cncfun(PgfCncFun *cncfun, PgfSequences* sequences,
if (cncfun->absfun != NULL) { if (cncfun->absfun != NULL) {
gu_puts(" [", out, err); gu_puts(" [", out, err);
gu_string_write(cncfun->absfun->name, out, err); pgf_print_cid(cncfun->absfun->name, out, err);
gu_puts("]", out, err); gu_puts("]", out, err);
} }
@@ -311,7 +311,7 @@ pgf_print_cnccat(GuMapItor* fn, const void* key, void* value,
GuOut *out = clo->out; GuOut *out = clo->out;
gu_puts(" ", out, err); gu_puts(" ", out, err);
gu_string_write(name, out, err); pgf_print_cid(name, out, err);
gu_puts(" :=\n", out, err); gu_puts(" :=\n", out, err);
PgfCCat *start = gu_seq_get(cnccat->cats, PgfCCat*, 0); PgfCCat *start = gu_seq_get(cnccat->cats, PgfCCat*, 0);
@@ -335,7 +335,7 @@ pgf_print_concrete(PgfCId cncname, PgfConcr* concr,
GuOut* out, GuExn* err) GuOut* out, GuExn* err)
{ {
gu_puts("concrete ", out, err); gu_puts("concrete ", out, err);
gu_string_write(cncname, out, err); pgf_print_cid(cncname, out, err);
gu_puts(" {\n", out, err); gu_puts(" {\n", out, err);
PgfPrintFn clo1 = { { pgf_print_flag }, out }; PgfPrintFn clo1 = { { pgf_print_flag }, out };

View File

@@ -7,6 +7,7 @@ module PGF.CId (CId(..),
import Control.Monad import Control.Monad
import qualified Data.ByteString.Char8 as BS import qualified Data.ByteString.Char8 as BS
import qualified Data.ByteString.UTF8 as UTF8
import Data.Char import Data.Char
import qualified Text.ParserCombinators.ReadP as RP import qualified Text.ParserCombinators.ReadP as RP
import qualified Text.PrettyPrint as PP import qualified Text.PrettyPrint as PP
@@ -21,7 +22,7 @@ wildCId = CId (BS.singleton '_')
-- | Creates a new identifier from 'String' -- | Creates a new identifier from 'String'
mkCId :: String -> CId mkCId :: String -> CId
mkCId s = CId (BS.pack s) mkCId s = CId (UTF8.fromString s)
bsCId = CId bsCId = CId
@@ -33,7 +34,18 @@ readCId s = case [x | (x,cs) <- RP.readP_to_S pCId s, all isSpace cs] of
-- | Renders the identifier as 'String' -- | Renders the identifier as 'String'
showCId :: CId -> String showCId :: CId -> String
showCId (CId x) = BS.unpack x showCId (CId x) =
let raw = UTF8.toString x
in if isIdent raw
then raw
else "'" ++ concatMap escape raw ++ "'"
where
isIdent [] = False
isIdent (c:cs) = isIdentFirst c && all isIdentRest cs
escape '\'' = "\\\'"
escape '\\' = "\\\\"
escape c = [c]
instance Show CId where instance Show CId where
showsPrec _ = showString . showCId showsPrec _ = showString . showCId
@@ -48,10 +60,35 @@ pCId = do s <- pIdent
else return (mkCId s) else return (mkCId s)
pIdent :: RP.ReadP String pIdent :: RP.ReadP String
pIdent = liftM2 (:) (RP.satisfy isIdentFirst) (RP.munch isIdentRest) pIdent =
where liftM2 (:) (RP.satisfy isIdentFirst) (RP.munch isIdentRest)
isIdentFirst c = c == '_' || isLetter c `mplus`
isIdentRest c = c == '_' || c == '\'' || isAlphaNum c do RP.char '\''
cs <- RP.many1 insideChar
RP.char '\''
return cs
-- where
insideChar = RP.readS_to_P $ \s ->
case s of
[] -> []
('\\':'\\':cs) -> [('\\',cs)]
('\\':'\'':cs) -> [('\'',cs)]
('\\':cs) -> []
('\'':cs) -> []
(c:cs) -> [(c,cs)]
isIdentFirst c =
(c == '_') ||
(c >= 'a' && c <= 'z') ||
(c >= 'A' && c <= 'Z') ||
(c >= '\192' && c <= '\255' && c /= '\247' && c /= '\215')
isIdentRest c =
(c == '_') ||
(c == '\'') ||
(c >= '0' && c <= '9') ||
(c >= 'a' && c <= 'z') ||
(c >= 'A' && c <= 'Z') ||
(c >= '\192' && c <= '\255' && c /= '\247' && c /= '\215')
ppCId :: CId -> PP.Doc ppCId :: CId -> PP.Doc
ppCId = PP.text . showCId ppCId = PP.text . showCId