From 17ddf3530cc114871ddea53637bf1e5f8d06bc75 Mon Sep 17 00:00:00 2001 From: crumbtoo Date: Sun, 14 Jan 2024 18:19:37 -0700 Subject: [PATCH] kitten i'll be honest mommy's about to kill herself --- src/Rlp/Lex.x | 210 +++++++++++----------------------- src/Rlp/Lex.x.old | 280 ++++++++++++++++++++++++++++++++++++++++++++++ src/Rlp/Parse.y | 44 +++++++- src/Rlp/Syntax.hs | 4 + 4 files changed, 390 insertions(+), 148 deletions(-) create mode 100644 src/Rlp/Lex.x.old diff --git a/src/Rlp/Lex.x b/src/Rlp/Lex.x index 324fb19..9d41eb4 100644 --- a/src/Rlp/Lex.x +++ b/src/Rlp/Lex.x @@ -6,18 +6,19 @@ module Rlp.Lex ( P(..) , RlpToken(..) , Located(..) - , AlexPosn , lexer , lexerCont ) where import Control.Monad import Data.Functor.Identity +import Data.Char (digitToInt) import Core.Syntax (Name) import Data.Monoid (First) import Data.Maybe import Data.Text (Text) import Data.Text qualified as T +import Data.Word import Data.Default import Lens.Micro.Mtl import Lens.Micro @@ -26,8 +27,6 @@ import Lens.Micro.TH import Debug.Trace } -%wrapper "monadUserState-strict-text" - $whitechar = [ \t\n\r\f\v] $lower = [a-z \_] @@ -42,6 +41,8 @@ $namechar = [$alpha $digit \' \#] @varname = $lower $namechar* +@digits = $digit+ + rlp :- -- skip whitespace @@ -54,14 +55,13 @@ rlp :- <0> { - \n ; - "{" { explicitLBrace `thenBegin` one } - () { doLayout `thenBegin` one } + \n { begin bol } } { @varname { tokenWith TokenVarName } + @digits { tokenWith (TokenLitInt . readInt) } "=" { constToken TokenEquals } \n { begin bol } } @@ -73,29 +73,50 @@ rlp :- { $whitechar ; \n ; - () { doBol `andBegin` one } + () { doBol } } { +begin = undefined + +type LexerAction a = AlexInput -> Int -> P a + +type AlexInput = + ( Char -- prev char + , Text -- input + ) + +alexGetByte :: AlexInput -> Maybe (Word8, AlexInput) +alexGetByte (_,s) = undefined + +getInput :: P AlexInput +getInput = undefined + +alexInputPrevChar :: AlexInput -> Char +alexInputPrevChar = (^. _1) + +readInt :: Text -> Int +readInt = T.foldr f 0 where + f c n = digitToInt c + 10*n + -- | @andBegin@, with the subtle difference that the start code is set -- /after/ the action -thenBegin :: AlexAction a -> Int -> AlexAction a +thenBegin :: LexerAction a -> Int -> LexerAction a thenBegin act c inp l = do a <- act inp l - alexSetStartCode c - pure a + undefined -constToken :: RlpToken -> AlexAction (Located RlpToken) -constToken t inp _ = pure $ Located (inp ^. _1) t +constToken :: RlpToken -> LexerAction (Located RlpToken) +constToken t inp _ = undefined -tokenWith :: (Text -> RlpToken) -> AlexAction (Located RlpToken) -tokenWith tf (p,_,_,s) l = pure $ Located p (tf $ T.take l s) +tokenWith :: (Text -> RlpToken) -> LexerAction (Located RlpToken) +tokenWith tf inp l = undefined -alexEOF :: Alex (Located RlpToken) +alexEOF :: P (Located RlpToken) alexEOF = do - inp <- alexGetInput - pure (Located (inp ^. _1) TokenEOF) + inp <- getInput + pure (Located undefined TokenEOF) data RlpToken -- literals @@ -123,156 +144,59 @@ data RlpToken | TokenEOF deriving (Show) -newtype P a = P { runP :: ParseState -> Alex (ParseState, a) } +newtype P a = P { runP :: ParseState -> (ParseState, Maybe a) } deriving (Functor) -execP :: P a -> ParseState -> Text -> Either String a -execP p st s = snd <$> runAlex s (runP p st) +execP :: P a -> ParseState -> Either String a +execP p st = undefined -data ParseState = ParseState { } +execP' :: P a -> Text -> Either String a +execP' p s = execP p st where + st = initParseState s -instance Default ParseState where - def = ParseState { } +initParseState :: Text -> ParseState +initParseState s = ParseState + { _psLayoutStack = [] + , _psLexState = [bol,0] + , _psInput = (undefined, s) + } + +data ParseState = ParseState + { _psLayoutStack :: [Layout] + , _psLexState :: [Int] + , _psInput :: AlexInput + } instance Applicative P where - pure a = P $ \st -> pure (st,a) + pure a = P $ \st -> (st,Just a) liftA2 = liftM2 instance Monad P where - p >>= k = P $ \st -> do - (st',a) <- runP p st - runP (k a) st' - -data AlexUserState = AlexUserState - -- the layout context, along with a start code to return to when the layout - -- ends - { _ausLayoutStack :: [(Layout, Int)] - } - deriving Show - -alexInitUserState :: AlexUserState -alexInitUserState = AlexUserState - { _ausLayoutStack = [] - } + p >>= k = undefined data Layout = Explicit | Implicit Int deriving (Show, Eq) -data Located a = Located AlexPosn a +data Located a = Located (Int, Int) a deriving (Show) -ausLayoutStack :: Lens' AlexUserState [(Layout, Int)] -ausLayoutStack = lens _ausLayoutStack - (\ s l -> s { _ausLayoutStack = l }) - lexer :: P (Located RlpToken) -lexer = P $ \st -> (st,) <$> lexToken +lexer = undefined lexerCont :: (Located RlpToken -> P a) -> P a -lexerCont = (lexer >>=) +lexerCont = undefined -lexStream :: Alex [RlpToken] -lexStream = do - t <- lexToken - case t of - Located _ TokenEOF -> pure [TokenEOF] - Located _ a -> (a:) <$> lexStream +lexStream :: P [RlpToken] +lexStream = undefined lexTest :: Text -> Either String [RlpToken] -lexTest = flip runAlex lexStream +lexTest = undefined -lexToken :: Alex (Located RlpToken) -lexToken = alexMonadScan +lexToken :: P (Located RlpToken) +lexToken = undefined -getsAus :: (AlexUserState -> b) -> Alex b -getsAus k = alexGetUserState <&> k - -useAus :: Getting a AlexUserState a -> Alex a -useAus l = do - aus <- alexGetUserState - pure (aus ^. l) - -preuseAus :: Getting (First a) AlexUserState a -> Alex (Maybe a) -preuseAus l = do - aus <- alexGetUserState - pure (aus ^? l) - -modifyingAus :: ASetter' AlexUserState a -> (a -> a) -> Alex () -modifyingAus l f = do - aus <- alexGetUserState - alexSetUserState (aus & l %~ f) - -indentLevel :: Alex Int -indentLevel = do - inp <- alexGetInput - let col = inp ^. _1 - & \ (AlexPn _ _ c) -> c - pure col - -cmpLayout :: Alex Ordering -cmpLayout = do - i <- indentLevel - ctx <- preuseAus (ausLayoutStack . _head) - case (ctx <&> fst) ^. non (Implicit 1) of - Implicit n -> pure (i `compare` n) - Explicit -> pure GT - -insertToken :: RlpToken -> Alex (Located RlpToken) -insertToken t = do - inp <- alexGetInput - pure (Located (inp ^. _1) t) - -insertSemicolon, insertLBrace, insertRBrace :: Alex (Located RlpToken) -insertSemicolon = traceM "inserting semi" >> insertToken TokenSemicolonV -insertLBrace = traceM "inserting lbrace" >> insertToken TokenLBraceV -insertRBrace = traceM "inserting rbrace" >> insertToken TokenRBraceV - --- pop the layout stack and jump to the popped return code -popLayout :: Alex () -popLayout = do - traceM "pop layout" - ctx <- preuseAus (ausLayoutStack . _head) - modifyingAus ausLayoutStack (drop 1) - case ctx of - Just (l,c) -> alexSetStartCode c - Nothing -> pure () - -pushLayout :: Layout -> Alex () -pushLayout l = do - traceM "push layout" - c <- alexGetStartCode - modifyingAus ausLayoutStack ((l,c):) - -doBol :: AlexAction (Located RlpToken) -doBol inp len = do - off <- cmpLayout - case off of - -- the line is aligned with the previous. it therefore belongs to the - -- same list - EQ -> insertSemicolon - -- the line is indented further than the previous, so we assume it is a - -- line continuation. ignore it and move on! - GT -> undefined -- alexSetStartCode one >> lexToken - -- the line is indented less than the previous, pop the layout stack and - -- insert a closing brace. - LT -> insertRBrace >> popLayout >> lexToken - -explicitLBrace, explicitRBrace :: AlexAction (Located RlpToken) - -explicitLBrace _ _ = do - pushLayout Explicit - insertToken TokenLBrace - -explicitRBrace _ _ = do - popLayout - insertToken TokenRBrace - -doLayout :: AlexAction (Located RlpToken) -doLayout _ _ = do - i <- indentLevel - pushLayout (Implicit i) - insertLBrace +doBol = undefined } diff --git a/src/Rlp/Lex.x.old b/src/Rlp/Lex.x.old new file mode 100644 index 0000000..533c94c --- /dev/null +++ b/src/Rlp/Lex.x.old @@ -0,0 +1,280 @@ +{ +{-# LANGUAGE GeneralisedNewtypeDeriving #-} +{-# LANGUAGE TemplateHaskell #-} +{-# LANGUAGE OverloadedStrings #-} +module Rlp.Lex + ( P(..) + , RlpToken(..) + , Located(..) + , AlexPosn + , lexer + , lexerCont + ) + where +import Control.Monad +import Data.Functor.Identity +import Data.Char (digitToInt) +import Core.Syntax (Name) +import Data.Monoid (First) +import Data.Maybe +import Data.Text (Text) +import Data.Text qualified as T +import Data.Default +import Lens.Micro.Mtl +import Lens.Micro +import Lens.Micro.TH + +import Debug.Trace +} + +$whitechar = [ \t\n\r\f\v] + +$lower = [a-z \_] +$upper = [A-Z] +$alpha = [$lower $upper] +$digit = 0-9 + +$nl = [\n\r] +$white_no_nl = $white # $nl + +$namechar = [$alpha $digit \' \#] + +@varname = $lower $namechar* + +@digits = $digit+ + +rlp :- + + -- skip whitespace + $white_no_nl+ ; + -- TODO: don't treat operators like (-->) as comments + "--".* ; + ";" { constToken TokenSemicolon } + -- "{" { explicitLBrace } + -- "}" { explicitRBrace } + +<0> +{ + \n { begin bol } +} + + +{ + @varname { tokenWith TokenVarName } + @digits { tokenWith (TokenLitInt . readInt) } + "=" { constToken TokenEquals } + \n { begin bol } +} + +-- consume all whitespace leaving us at the beginning of the next non-empty +-- line. we then compare the indentation of that line to the enclosing layout +-- context and proceed accordingly + +{ + $whitechar ; + \n ; + () { doBol `andBegin` one } +} + +{ + +readInt :: Text -> Int +readInt = T.foldr f 0 where + f c n = digitToInt c + 10*n + +-- | @andBegin@, with the subtle difference that the start code is set +-- /after/ the action +thenBegin :: AlexAction a -> Int -> AlexAction a +thenBegin act c inp l = do + a <- act inp l + alexSetStartCode c + pure a + +constToken :: RlpToken -> AlexAction (Located RlpToken) +constToken t inp _ = pure $ Located (inp ^. _1) t + +tokenWith :: (Text -> RlpToken) -> AlexAction (Located RlpToken) +tokenWith tf (p,_,_,s) l = pure $ Located p (tf $ T.take l s) + +alexEOF :: Alex (Located RlpToken) +alexEOF = do + inp <- alexGetInput + pure (Located (inp ^. _1) TokenEOF) + +data RlpToken + -- literals + = TokenLitInt Int + -- identifiers + | TokenVarName Name + | TokenConName Name + | TokenVarSym Name + | TokenConSym Name + -- keywords + | TokenData + | TokenPipe + | TokenLet + | TokenIn + -- control symbols + | TokenEquals + | TokenSemicolon + | TokenLBrace + | TokenRBrace + -- 'virtual' control symbols, inserted by the lexer without any correlation + -- to a specific symbol + | TokenSemicolonV + | TokenLBraceV + | TokenRBraceV + | TokenEOF + deriving (Show) + +newtype P a = P { runP :: ParseState -> Alex (ParseState, Maybe a) } + deriving (Functor) + +execP :: P a -> ParseState -> Text -> Either String a +execP p st s = snd <$> runAlex s (runP p st) + +execP' :: P a -> Text -> Either String a +execP' p = execP p def + +data ParseState = ParseState + { _psLayoutStack :: [Layout] + , _psLexState :: [Int] + } + +instance Default ParseState where + def = ParseState { } + +instance Applicative P where + pure a = P $ \st -> pure (st,a) + liftA2 = liftM2 + +instance Monad P where + p >>= k = P $ \st -> do + (st',a) <- runP p st + runP (k a) st' + +data Layout = Explicit + | Implicit Int + deriving (Show, Eq) + +data Located a = Located AlexPosn a + deriving (Show) + +psLayoutStack :: Lens' AlexUserState [Layout] +psLayoutStack = lens _psLayoutStack + (\ s l -> s { _psLayoutStack = l }) + +lexer :: P (Located RlpToken) +lexer = P $ \st -> (st,) <$> lexToken + +lexerCont :: (Located RlpToken -> P a) -> P a +lexerCont = (lexer >>=) + +lexStream :: Alex [RlpToken] +lexStream = do + t <- lexToken + case t of + Located _ TokenEOF -> pure [TokenEOF] + Located _ a -> (a:) <$> lexStream + +lexTest :: Text -> Either String [RlpToken] +lexTest = flip runAlex lexStream + +lexToken :: Alex (Located RlpToken) +lexToken = alexMonadScan + +getsAus :: (AlexUserState -> b) -> Alex b +getsAus k = alexGetUserState <&> k + +useAus :: Getting a AlexUserState a -> Alex a +useAus l = do + aus <- alexGetUserState + pure (aus ^. l) + +preuseAus :: Getting (First a) AlexUserState a -> Alex (Maybe a) +preuseAus l = do + aus <- alexGetUserState + pure (aus ^? l) + +modifyingAus :: ASetter' AlexUserState a -> (a -> a) -> Alex () +modifyingAus l f = do + aus <- alexGetUserState + alexSetUserState (aus & l %~ f) + +indentLevel :: Alex Int +indentLevel = do + inp <- alexGetInput + let col = inp ^. _1 + & \ (AlexPn _ _ c) -> c + pure col + +cmpLayout :: Alex Ordering +cmpLayout = do + i <- indentLevel + ctx <- preuseAus (ausLayoutStack . _head) + case ctx ^. non (Implicit 1) of + Implicit n -> pure (i `compare` n) + Explicit -> pure GT + +insertToken :: RlpToken -> Alex (Located RlpToken) +insertToken t = do + inp <- alexGetInput + pure (Located (inp ^. _1) t) + +insertSemicolon, insertLBrace, insertRBrace :: Alex (Located RlpToken) +insertSemicolon = traceM "inserting semi" >> insertToken TokenSemicolonV +insertLBrace = traceM "inserting lbrace" >> insertToken TokenLBraceV +insertRBrace = traceM "inserting rbrace" >> insertToken TokenRBraceV + +-- pop the layout stack and jump to the popped return code +popLayout :: Alex Layout +popLayout = do + traceM "pop layout" + ctx <- preuseAus (ausLayoutStack . _head) + modifyingAus ausLayoutStack (drop 1) + case ctx of + Just l -> pure l + Nothing -> error "uhh" + +pushLayout :: Layout -> Alex () +pushLayout l = do + traceM "push layout" + modifyingAus ausLayoutStack (l:) + +pushLexState :: Alex () +pushLexState = do + undefined + +doBol :: AlexAction (Located RlpToken) +doBol inp len = do + off <- cmpLayout + case off of + -- the line is aligned with the previous. it therefore belongs to the + -- same list + EQ -> insertSemicolon + -- the line is indented further than the previous, so we assume it is a + -- line continuation. ignore it and move on! + GT -> undefined -- alexSetStartCode one >> lexToken + -- the line is indented less than the previous, pop the layout stack and + -- insert a closing brace. + LT -> popLayout >> insertRBrace + +explicitLBrace, explicitRBrace :: AlexAction (Located RlpToken) + +explicitLBrace _ _ = do + pushLayout Explicit + insertToken TokenLBrace + +explicitRBrace _ _ = do + popLayout + insertToken TokenRBrace + +doLayout :: AlexAction (Located RlpToken) +doLayout _ _ = do + i <- indentLevel + pushLayout (Implicit i) + traceM $ "layout " <> show i + insertLBrace + +} + diff --git a/src/Rlp/Parse.y b/src/Rlp/Parse.y index 3016594..3205988 100644 --- a/src/Rlp/Parse.y +++ b/src/Rlp/Parse.y @@ -1,6 +1,6 @@ { module Rlp.Parse - ( + ( parseRlpProgram ) where import Rlp.Lex @@ -8,7 +8,8 @@ import Rlp.Syntax import Rlp.Parse.Types } -%name rlp +%name parseRlpProgram StandaloneProgram + %monad { P } %lexer { lexerCont } { Located _ TokenEOF } %error { parseError } @@ -16,23 +17,56 @@ import Rlp.Parse.Types %token varname { Located _ (TokenVarName $$) } + litint { Located _ (TokenLitInt $$) } '=' { Located _ TokenEquals } + ';' { Located _ TokenSemicolon } + ';?' { Located _ TokenSemicolonV } + '{' { Located _ TokenLBrace } + '}' { Located _ TokenRBrace } + '{?' { Located _ TokenLBraceV } + '?}' { Located _ TokenRBraceV } eof { Located _ TokenEOF } %% +StandaloneProgram :: { [PartialDecl'] } +StandaloneProgram : VL Decls VR eof { $2 } + +VL :: { () } +VL : '{?' { () } + +VR :: { () } +VR : '?}' { () } + | error { () } + +Decls :: { [PartialDecl'] } +Decls : Decl Semi Decls { $1 : $3 } + | Decl Semi { [$1] } + | Decl { [$1] } + +Semi :: { Located RlpToken } +Semi : ';' { $1 } + | ';?' { $1 } + Decl :: { PartialDecl' } Decl : FunDecl { undefined } FunDecl :: { PartialDecl' } FunDecl : varname '=' Expr { undefined } -Expr :: { RlpExpr' } -Expr : { undefined } +Expr :: { RlpExpr' } +Expr : Literal { LitE $1 } + | Var { VarE $1 } + +Literal :: { Lit' } +Literal : litint { IntL $1 } + +Var :: { VarId } +Var : varname { NameVar $1 } { parseError :: Located RlpToken -> P a -parseError = error "aaaaah" +parseError = error . show } diff --git a/src/Rlp/Syntax.hs b/src/Rlp/Syntax.hs index 58843b5..bf35445 100644 --- a/src/Rlp/Syntax.hs +++ b/src/Rlp/Syntax.hs @@ -5,6 +5,8 @@ {-# LANGUAGE OverloadedStrings, PatternSynonyms #-} module Rlp.Syntax ( RlpModule(..) + , RlpProgram(..) + , RlpProgram' , rlpmodName , rlpmodProgram , RlpExpr(..) @@ -54,6 +56,8 @@ data RlpModule b = RlpModule newtype RlpProgram b = RlpProgram [Decl RlpExpr b] +type RlpProgram' = RlpProgram Name + -- | The @e@ parameter is used for partial results. When parsing an input, we -- first parse all top-level declarations in order to extract infix[lr] -- declarations. This process yields a @[Decl (Const Text) Name]@, where @Const