379 lines
10 KiB
Plaintext
379 lines
10 KiB
Plaintext
{
|
|
{-# LANGUAGE ViewPatterns, LambdaCase #-}
|
|
{-# LANGUAGE GeneralisedNewtypeDeriving #-}
|
|
{-# LANGUAGE OverloadedStrings #-}
|
|
module Rlp.Lex
|
|
( P(..)
|
|
, RlpToken(..)
|
|
, Located(..)
|
|
, lexToken
|
|
, lexStream
|
|
, lexDebug
|
|
, lexCont
|
|
, popLexState
|
|
, programInitState
|
|
, runP'
|
|
)
|
|
where
|
|
import Codec.Binary.UTF8.String (encodeChar)
|
|
import Control.Monad
|
|
import Control.Monad.Errorful
|
|
import Core.Syntax (Name)
|
|
import Data.Functor.Identity
|
|
import Data.Char (digitToInt)
|
|
import Data.Monoid (First)
|
|
import Data.Maybe
|
|
import Data.Text (Text)
|
|
import Data.Text qualified as T
|
|
import Data.Word
|
|
import Data.Default
|
|
import Lens.Micro.Mtl
|
|
import Lens.Micro
|
|
|
|
import Debug.Trace
|
|
import Rlp.Parse.Types
|
|
}
|
|
|
|
$whitechar = [ \t\n\r\f\v]
|
|
|
|
$nl = [\n\r]
|
|
$white_no_nl = $white # $nl
|
|
|
|
$lower = [a-z \_]
|
|
$upper = [A-Z]
|
|
$alpha = [$lower $upper]
|
|
$digit = 0-9
|
|
|
|
$special = [\(\)\,\;\[\]\{\}]
|
|
$namechar = [$alpha $digit \' \#]
|
|
$asciisym = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~\:]
|
|
|
|
@decimal = $digit+
|
|
|
|
@varname = $lower $namechar*
|
|
@conname = $upper $namechar*
|
|
@consym = \: $asciisym*
|
|
@varsym = $asciisym+
|
|
|
|
@reservedname =
|
|
case|data|do|import|in|let|letrec|module|of|where
|
|
|infixr|infixl|infix
|
|
|
|
@reservedop =
|
|
"=" | \\ | "->" | "|" | "::"
|
|
|
|
rlp :-
|
|
|
|
-- everywhere: skip whitespace
|
|
$white_no_nl+ ;
|
|
|
|
-- everywhere: skip comments
|
|
-- TODO: don't treat operators like (-->) as comments
|
|
"--".* ;
|
|
|
|
-- we are indentation-sensitive! do not skip NLs!. upon encountering a newline,
|
|
-- we check indentation and potentially insert extra tokens. search this file
|
|
-- for the definition of `doBol`
|
|
<0> \n { beginPush bol }
|
|
|
|
<layout>
|
|
{
|
|
|
|
}
|
|
|
|
-- layout keywords
|
|
<0>
|
|
{
|
|
"let" { constToken TokenLet `thenBeginPush` layout_let }
|
|
"letrec" { constToken TokenLetrec `thenBeginPush` layout_let }
|
|
"of" { constToken TokenOf `thenBeginPush` layout_of }
|
|
}
|
|
|
|
-- scan various identifiers and reserved words. order is important here!
|
|
<0>
|
|
{
|
|
@reservedname { tokenWith lexReservedName }
|
|
@conname { tokenWith TokenConName }
|
|
@varname { tokenWith TokenVarName }
|
|
@reservedop { tokenWith lexReservedOp }
|
|
@consym { tokenWith TokenConSym }
|
|
@varsym { tokenWith TokenVarSym }
|
|
}
|
|
|
|
-- literals -- currently this is just unsigned integer literals
|
|
<0>
|
|
{
|
|
@decimal { tokenWith (TokenLitInt . readInt) }
|
|
}
|
|
|
|
-- control characters
|
|
<0>
|
|
{
|
|
"(" { constToken TokenLParen }
|
|
")" { constToken TokenRParen }
|
|
"{" { explicitLBrace }
|
|
"}" { explicitRBrace }
|
|
";" { constToken TokenSemicolon }
|
|
}
|
|
|
|
-- consume all whitespace leaving us at the beginning of the next non-empty
|
|
-- line. we then compare the indentation of that line to the enclosing layout
|
|
-- context and proceed accordingly
|
|
<bol>
|
|
{
|
|
$whitechar ;
|
|
\n ;
|
|
() { doBol }
|
|
}
|
|
|
|
<layout_top>
|
|
{
|
|
\n ;
|
|
"{" { explicitLBrace `thenDo` popLexState }
|
|
}
|
|
|
|
<layout, layout_let, layout_of>
|
|
{
|
|
\n { beginPush bol }
|
|
"{" { explicitLBrace `thenDo` popLexState }
|
|
}
|
|
|
|
<layout_let>
|
|
{
|
|
"in" { constToken TokenIn `thenDo` (popLexState *> popLayout) }
|
|
}
|
|
|
|
<layout, layout_top, layout_let, layout_of>
|
|
{
|
|
() { doLayout }
|
|
}
|
|
|
|
{
|
|
|
|
lexReservedName :: Text -> RlpToken
|
|
lexReservedName = \case
|
|
"data" -> TokenData
|
|
"case" -> TokenCase
|
|
"of" -> TokenOf
|
|
"let" -> TokenLet
|
|
"letrec" -> TokenLetrec
|
|
"in" -> TokenIn
|
|
"infix" -> TokenInfix
|
|
"infixl" -> TokenInfixL
|
|
"infixr" -> TokenInfixR
|
|
s -> error (show s)
|
|
|
|
lexReservedOp :: Text -> RlpToken
|
|
lexReservedOp = \case
|
|
"=" -> TokenEquals
|
|
"::" -> TokenHasType
|
|
"|" -> TokenPipe
|
|
"->" -> TokenArrow
|
|
s -> error (show s)
|
|
|
|
-- | @andBegin@, with the subtle difference that the start code is set
|
|
-- /after/ the action
|
|
thenBegin :: LexerAction a -> Int -> LexerAction a
|
|
thenBegin act c inp l = do
|
|
a <- act inp l
|
|
psLexState . _head .= c
|
|
pure a
|
|
|
|
thenBeginPush :: LexerAction a -> Int -> LexerAction a
|
|
thenBeginPush act c inp l = do
|
|
a <- act inp l
|
|
pushLexState c
|
|
pure a
|
|
|
|
andBegin :: LexerAction a -> Int -> LexerAction a
|
|
andBegin act c inp l = do
|
|
psLexState . _head .= c
|
|
act inp l
|
|
|
|
beginPush :: Int -> LexerAction (Located RlpToken)
|
|
beginPush n _ _ = pushLexState n >> lexToken
|
|
|
|
alexGetByte :: AlexInput -> Maybe (Word8, AlexInput)
|
|
alexGetByte inp = case inp ^. aiBytes of
|
|
[] -> do
|
|
(c,t) <- T.uncons (inp ^. aiSource)
|
|
let (b:bs) = encodeChar c
|
|
-- tail the source
|
|
inp' = inp & aiSource .~ t
|
|
-- record the excess bytes for successive calls
|
|
& aiBytes .~ bs
|
|
-- report the previous char
|
|
& aiPrevChar .~ c
|
|
-- update the position
|
|
& aiPos %~ \ (ln,col,a) ->
|
|
if c == '\n'
|
|
then (ln+1, 1, a+1)
|
|
else (ln, col+1, a+1)
|
|
pure (b, inp')
|
|
|
|
_ -> Just (head bs, inp')
|
|
where
|
|
(bs, inp') = inp & aiBytes <<%~ drop 1
|
|
|
|
getInput :: P AlexInput
|
|
getInput = use psInput
|
|
|
|
getLexState :: P Int
|
|
getLexState = use (psLexState . singular _head)
|
|
|
|
alexInputPrevChar :: AlexInput -> Char
|
|
alexInputPrevChar = view aiPrevChar
|
|
|
|
pushLexState :: Int -> P ()
|
|
pushLexState n = psLexState %= (n:)
|
|
|
|
readInt :: Text -> Int
|
|
readInt = T.foldl f 0 where
|
|
f n c = 10*n + digitToInt c
|
|
|
|
constToken :: RlpToken -> LexerAction (Located RlpToken)
|
|
constToken t inp l = do
|
|
pos <- use (psInput . aiPos)
|
|
pure (Located (spanFromPos pos l) t)
|
|
|
|
tokenWith :: (Text -> RlpToken) -> LexerAction (Located RlpToken)
|
|
tokenWith tf inp l = do
|
|
pos <- getPos
|
|
let t = tf (T.take l $ inp ^. aiSource)
|
|
pure (Located (spanFromPos pos l) t)
|
|
|
|
getPos :: P Position
|
|
getPos = use (psInput . aiPos)
|
|
|
|
alexEOF :: P (Located RlpToken)
|
|
alexEOF = do
|
|
inp <- getInput
|
|
pos <- getPos
|
|
pure (Located (spanFromPos pos 0) TokenEOF)
|
|
|
|
runP' :: P a -> Text -> (ParseState, [MsgEnvelope RlpParseError], Maybe a)
|
|
runP' p s = runP p st where
|
|
st = initParseState [layout_top,0] s
|
|
|
|
lexToken :: P (Located RlpToken)
|
|
lexToken = do
|
|
inp <- getInput
|
|
c <- getLexState
|
|
st <- use id
|
|
-- traceM $ "st: " <> show st
|
|
case alexScan inp c of
|
|
AlexEOF -> pure $ Located (spanFromPos (inp^.aiPos) 0) TokenEOF
|
|
AlexSkip inp' l -> do
|
|
psInput .= inp'
|
|
lexToken
|
|
AlexToken inp' l act -> do
|
|
psInput .= inp'
|
|
act inp l
|
|
AlexError inp' -> addFatalHere 1 RlpParErrLexical
|
|
|
|
lexCont :: (Located RlpToken -> P a) -> P a
|
|
lexCont = (lexToken >>=)
|
|
|
|
lexStream :: P [RlpToken]
|
|
lexStream = do
|
|
t <- lexToken
|
|
case t of
|
|
Located _ TokenEOF -> pure [TokenEOF]
|
|
Located _ t -> (t:) <$> lexStream
|
|
|
|
lexDebug :: (Located RlpToken -> P a) -> P a
|
|
lexDebug k = do
|
|
t <- lexToken
|
|
traceM $ "token: " <> show t
|
|
k t
|
|
|
|
lexTest :: Text -> Maybe [RlpToken]
|
|
lexTest s = runP' lexStream s ^. _3
|
|
|
|
indentLevel :: P Int
|
|
indentLevel = do
|
|
pos <- use (psInput . aiPos)
|
|
pure (pos ^. _2)
|
|
|
|
insertToken :: RlpToken -> P (Located RlpToken)
|
|
insertToken t = do
|
|
pos <- use (psInput . aiPos)
|
|
pure (Located (spanFromPos pos 0) t)
|
|
|
|
popLayout :: P Layout
|
|
popLayout = do
|
|
-- traceM "pop layout"
|
|
ctx <- preuse (psLayoutStack . _head)
|
|
psLayoutStack %= (drop 1)
|
|
case ctx of
|
|
Just l -> pure l
|
|
Nothing -> error "popLayout: layout stack empty! this is a bug."
|
|
|
|
pushLayout :: Layout -> P ()
|
|
pushLayout l = do
|
|
-- traceM "push layout"
|
|
psLayoutStack %= (l:)
|
|
|
|
popLexState :: P ()
|
|
popLexState = do
|
|
psLexState %= tail
|
|
|
|
insertSemicolon, insertLBrace, insertRBrace :: P (Located RlpToken)
|
|
insertSemicolon = {- traceM "inserting semi" >> -} insertToken TokenSemicolonV
|
|
insertLBrace = {- traceM "inserting lbrace" >> -} insertToken TokenLBraceV
|
|
insertRBrace = {- traceM "inserting rbrace" >> -} insertToken TokenRBraceV
|
|
|
|
cmpLayout :: P Ordering
|
|
cmpLayout = do
|
|
i <- indentLevel
|
|
ctx <- preuse (psLayoutStack . _head)
|
|
case ctx of
|
|
Just (Implicit n) -> pure (i `compare` n)
|
|
_ -> pure GT
|
|
|
|
doBol :: LexerAction (Located RlpToken)
|
|
doBol inp l = do
|
|
off <- cmpLayout
|
|
i <- indentLevel
|
|
-- traceM $ "i: " <> show i
|
|
-- important that we pop the lex state lest we find our lexer diverging
|
|
case off of
|
|
-- the line is aligned with the previous. it therefore belongs to the
|
|
-- same list
|
|
EQ -> popLexState *> insertSemicolon
|
|
-- the line is indented further than the previous, so we assume it is a
|
|
-- line continuation. ignore it and move on!
|
|
GT -> popLexState *> lexToken
|
|
-- the line is indented less than the previous, pop the layout stack and
|
|
-- insert a closing brace. make VERY good note of the fact that we do not
|
|
-- pop the lex state! this means doBol is called until indentation is EQ
|
|
-- GT. so if multiple layouts are closed at once, this catches that.
|
|
LT -> popLayout >> insertRBrace
|
|
|
|
thenDo :: LexerAction a -> P b -> LexerAction a
|
|
thenDo act p inp l = act inp l <* p
|
|
|
|
explicitLBrace :: LexerAction (Located RlpToken)
|
|
explicitLBrace inp l = do
|
|
pushLayout Explicit
|
|
constToken TokenLBrace inp l
|
|
|
|
explicitRBrace :: LexerAction (Located RlpToken)
|
|
explicitRBrace inp l = do
|
|
popLayout
|
|
constToken TokenRBrace inp l
|
|
|
|
doLayout :: LexerAction (Located RlpToken)
|
|
doLayout _ _ = do
|
|
i <- indentLevel
|
|
-- traceM $ "doLayout: i: " <> show i
|
|
pushLayout (Implicit i)
|
|
popLexState
|
|
insertLBrace
|
|
|
|
programInitState :: Text -> ParseState
|
|
programInitState = initParseState [layout_top,0]
|
|
|
|
}
|
|
|