Files
gf-core/src/compiler/GF/Grammar/Lexer.x

338 lines
7.9 KiB
Plaintext

-- -*- haskell -*-
{
{-# LANGUAGE CPP #-}
module GF.Grammar.Lexer
( Token(..), Posn(..)
, P, runP, runPartial, token, lexer, getPosn, failLoc
, isReservedWord, invMap
) where
import Control.Applicative
import Control.Monad(ap)
import GF.Infra.Ident
--import GF.Data.Operations
import qualified Data.ByteString.Char8 as BS
import qualified Data.ByteString as WBS
import qualified Data.ByteString.Internal as BS(w2c)
import qualified Data.ByteString.UTF8 as UTF8
import qualified Data.Map as Map
import Data.Word(Word8)
import Data.Char(readLitChar)
--import Debug.Trace(trace)
import qualified Control.Monad.Fail as Fail
}
$l = [a-zA-Z\192 - \255] # [\215 \247]
$c = [A-Z\192-\221] # [\215]
$s = [a-z\222-\255] # [\247]
$d = [0-9] -- digit
$i = [$l $d _ '] -- identifier character
$u = [.\n] -- universal: any character
@rsyms = -- symbols and non-identifier-like reserved words
\; | \= | \{ | \} | \( | \) | \~ | \* \* | \: | \- \> | \, | \[ | \] | \- | \. | \| | \% | \? | \< | \> | \@ | \# | \! | \* | \+ | \+ \+ | \\ | \\\\ | \= \> | \_ | \$ | \/ | \: \= | \: \: \=
:-
"--" [.]* ; -- Toss single line comments
"{-" ([$u # \-] | \- [$u # \}])* ("-")+ "}" ;
$white+ ;
@rsyms { tok ident }
\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (T_Ident . identS . unescapeInitTail . unpack) }
(\_ | $l)($l | $d | \_ | \')* { tok ident }
\" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t | $d+)))* \" { tok (T_String . unescapeInitTail . unpack) }
(\-)? $d+ { tok (T_Integer . read . unpack) }
(\-)? $d+ \. $d+ (e (\-)? $d+)? { tok (T_Double . read . unpack) }
{
unpack = UTF8.toString
--unpack = id
ident = res T_Ident . identC . rawIdentC
tok f p s = f s
data Token
= T_exclmark
| T_patt
| T_int_label
| T_oparen
| T_cparen
| T_tilde
| T_star
| T_starstar
| T_plus
| T_plusplus
| T_comma
| T_minus
| T_rarrow
| T_dot
| T_alt
| T_colon
| T_semicolon
| T_less
| T_equal
| T_big_rarrow
| T_great
| T_questmark
| T_obrack
| T_lam
| T_lamlam
| T_cbrack
| T_ocurly
| T_bar
| T_ccurly
| T_underscore
| T_at
| T_cfarrow
| T_PType
| T_Str
| T_Strs
| T_Tok
| T_Type
| T_abstract
| T_case
| T_cat
| T_concrete
| T_data
| T_def
| T_flags
| T_fn
| T_fun
| T_in
| T_incomplete
| T_instance
| T_interface
| T_let
| T_lin
| T_lincat
| T_lindef
| T_linref
| T_of
| T_open
| T_oper
| T_param
| T_pattern
| T_pre
| T_printname
| T_resource
| T_strs
| T_table
| T_transfer
| T_variants
| T_where
| T_with
| T_coercions
| T_terminator
| T_separator
| T_nonempty
| T_String String -- string literals
| T_Integer Int -- integer literals
| T_Double Double -- double precision float literals
| T_Ident Ident
| T_EOF
deriving (Eq, Ord, Show) -- debug
res = eitherResIdent
eitherResIdent :: (Ident -> Token) -> Ident -> Token
eitherResIdent tv s =
case Map.lookup s resWords of
Just t -> t
Nothing -> tv s
isReservedWord :: Ident -> Bool
isReservedWord ident = Map.member ident resWords
resWords = Map.fromList
[ b "!" T_exclmark
, b "#" T_patt
, b "$" T_int_label
, b "(" T_oparen
, b ")" T_cparen
, b "~" T_tilde
, b "*" T_star
, b "**" T_starstar
, b "+" T_plus
, b "++" T_plusplus
, b "," T_comma
, b "-" T_minus
, b "->" T_rarrow
, b "." T_dot
, b "/" T_alt
, b ":" T_colon
, b ";" T_semicolon
, b "<" T_less
, b "=" T_equal
, b "=>" T_big_rarrow
, b ">" T_great
, b "?" T_questmark
, b "[" T_obrack
, b "]" T_cbrack
, b "\\" T_lam
, b "\\\\" T_lamlam
, b "{" T_ocurly
, b "}" T_ccurly
, b "|" T_bar
, b "_" T_underscore
, b "@" T_at
, b "::=" T_cfarrow
, b ":=" T_cfarrow
, b "PType" T_PType
, b "Str" T_Str
, b "Strs" T_Strs
, b "Tok" T_Tok
, b "Type" T_Type
, b "abstract" T_abstract
, b "case" T_case
, b "cat" T_cat
, b "concrete" T_concrete
, b "data" T_data
, b "def" T_def
, b "flags" T_flags
, b "fn" T_fn
, b "fun" T_fun
, b "in" T_in
, b "incomplete" T_incomplete
, b "instance" T_instance
, b "interface" T_interface
, b "let" T_let
, b "lin" T_lin
, b "lincat" T_lincat
, b "lindef" T_lindef
, b "linref" T_linref
, b "of" T_of
, b "open" T_open
, b "oper" T_oper
, b "param" T_param
, b "pattern" T_pattern
, b "pre" T_pre
, b "printname" T_printname
, b "resource" T_resource
, b "strs" T_strs
, b "table" T_table
, b "transfer" T_transfer
, b "variants" T_variants
, b "where" T_where
, b "with" T_with
, b "coercions" T_coercions
, b "terminator" T_terminator
, b "separator" T_separator
, b "nonempty" T_nonempty
]
where b s t = (identS s, t)
invMap :: Map.Map Token String
invMap = res
where
lst = Map.toList resWords
flp = map (\(k,v) -> (v,showIdent k)) lst
res = Map.fromList flp
unescapeInitTail :: String -> String
unescapeInitTail = unesc . tail where
unesc s = case s of
[] -> []
'\"':[] -> []
'\'':[] -> []
_ -> case readLitChar s of
[(c,cs)] -> c:unesc cs
-------------------------------------------------------------------
-- Alex wrapper code.
-- A modified "posn" wrapper.
-------------------------------------------------------------------
data Posn = Pn {-# UNPACK #-} !Int
{-# UNPACK #-} !Int
deriving (Eq,Show)
alexMove :: Posn -> Char -> Posn
alexMove (Pn l c) '\n' = Pn (l+1) 1
alexMove (Pn l c) _ = Pn l (c+1)
alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
alexGetByte (AI p _ s) =
case WBS.uncons s of
Nothing -> Nothing
Just (w,s) ->
let p' = alexMove p c
c = BS.w2c w
in p' `seq` Just (w, (AI p' c s))
{-
-- Not used by this lexer:
alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar (AI p c s) = c
-}
data AlexInput = AI {-# UNPACK #-} !Posn -- current position,
{-# UNPACK #-} !Char -- previous char
{-# UNPACK #-} !BS.ByteString -- current input string
type AlexInput2 = (AlexInput,AlexInput)
data ParseResult a
= POk AlexInput2 a
| PFailed Posn -- The position of the error
String -- The error message
newtype P a = P { unP :: AlexInput2 -> ParseResult a }
instance Functor P where
fmap = liftA
instance Applicative P where
pure a = a `seq` (P $ \s -> POk s a)
(<*>) = ap
instance Monad P where
return = pure
(P m) >>= k = P $ \ s -> case m s of
POk s a -> unP (k a) s
PFailed posn err -> PFailed posn err
#if !(MIN_VERSION_base(4,13,0))
-- Monad(fail) will be removed in GHC 8.8+
fail = Fail.fail
#endif
instance Fail.MonadFail P where
fail msg = P $ \(_,AI posn _ _) -> PFailed posn msg
runP :: P a -> BS.ByteString -> Either (Posn,String) a
runP p bs = snd <$> runP' p (Pn 1 0,bs)
runPartial p s = conv <$> runP' p (Pn 1 0,UTF8.fromString s)
where conv ((pos,rest),x) = (UTF8.toString rest,x)
runP' (P f) (pos,txt) =
case f (dup (AI pos ' ' txt)) of
POk (AI pos _ rest,_) x -> Right ((pos,rest),x)
PFailed pos msg -> Left (pos,msg)
dup x = (x,x)
failLoc :: Posn -> String -> P a
failLoc pos msg = P $ \_ -> PFailed pos msg
lexer :: (Token -> P a) -> P a
lexer cont = cont=<<token
token :: P Token
token = P go
where
--cont' t = trace (show t) (cont t)
go ai2@(_,inp@(AI pos _ str)) =
case alexScan inp 0 of
AlexEOF -> POk (inp,inp) T_EOF
AlexError (AI pos _ _) -> PFailed pos "lexical error"
AlexSkip inp' len -> {-trace (show len) $-} go (inp,inp')
AlexToken inp' len act -> POk (inp,inp') (act pos ({-UTF8.toString-} (UTF8.take len str)))
getPosn :: P Posn
getPosn = P $ \ai2@(_,inp@(AI pos _ _)) -> POk ai2 pos
}