|
|
- {
- module Frontend.Autogen.Lexer where
-
- import Control.Monad
-
- import qualified Data.ByteString.Lazy as Lbs
- import qualified Data.Text.Encoding as T
- import qualified Data.Text as T
- import qualified Data.Char
- import Data.Int (Int64)
-
- import Frontend.Lexer.Wrapper
- import Frontend.Lexer.Tokens
- import Frontend.Parser.Posn
- }
-
- -- %wrapper "monadUserState-bytestring"
-
- $alpha = [a-zA-Z]
- $digit = [0-9]
- $white_nol = $white # [\n\t]
-
- tokens :-
- $white_nol+ ;
- \t { \_ _ -> alexError "tab character in source code" }
-
- <0,import_> "--" .* \n
- { just $ pushStartCode newline }
-
- <0,import_>
- $alpha [$alpha $digit \_ \' \.]* { variableOrKeyword }
-
- <0> \= { always TokEqual }
- <0> \: \: { always TokDoubleColon }
-
- <0> \\ { \i l -> setPendingLC *> yield' (const TokLambda) i l }
- <0> "->" { always TokArrow }
- <0> "_" { always TokUnder }
-
- <0> \{ { always TokOBrace }
- <0> \[ { always TokOSquare }
-
- <0,import_> {
- \, { always TokComma }
- \( { always TokOParen }
- \) { always TokCParen }
- }
-
- <0> \} { closeBrace }
- <0> \] { always TokCSquare }
-
- <0> \;+ { always TokSemi }
-
- <0,import_> \n { just $ pushStartCode newline }
-
- <0> \" { startString }
-
- <string> {
- \\ \" { stringAppend (T.singleton '"') }
- \\ \\ { stringAppend (T.singleton '\\') }
-
- \\ a { stringAppend (T.singleton '\a') }
- \\ b { stringAppend (T.singleton '\b') }
- \\ f { stringAppend (T.singleton '\f') }
- \\ n { stringAppend (T.singleton '\n') }
- \\ \n { stringAppend (T.singleton '\n') }
- \\ r { stringAppend (T.singleton '\r') }
- \\ v { stringAppend (T.singleton '\v') }
- \\ t { stringAppend (T.singleton '\t') }
-
- \" { endString }
-
- [^\\\"]+ { stringSegment }
- }
-
- <0,newline,comment,import_>
- "{-" { just $ pushStartCode comment }
-
- <comment> {
- "-}" { \_ _ -> popStartCode *> alexMonadScan }
- . ;
- }
-
- -- newline: emit a semicolon when de-denting
- <newline> {
- \n ;
- "--" .* \n ;
-
- () { offsideRule }
- }
-
- -- layout: indentation of the next token is context for offside rule
- <layout> {
- \n ;
- "--" .* \n ;
-
- \{ { openBrace }
- () { startLayout }
- }
-
- <import_> {
- \n { just $ pushStartCode newline }
- "--" .* \n { just $ pushStartCode newline }
- }
-
- <empty_layout> () { emptyLayout }
-
- <pending> () { emitPendingToken }
-
- {
- alexEOF :: Alex Token
- alexEOF = do
- (Posn l c, _, _, _) <- alexGetInput
-
- maybePopImportSC
-
- state <- getUserState
-
- unless (T.null (stringBuffer state)) $ do
- alexError $ "Unterminated string literal at line " ++ show l ++ ", column " ++ show c
-
- case layoutColumns state of
- -- EOF is allowed to close as many layout contexts as there are
- -- pending (number of pending layout contexts is the length of the
- -- list minus one, since there's the one initial layout context.)
- _:tail -> do
- mapUserState $ \s ->
- s { pendingTokens = (Token TokLEnd l c <$ tail) ++ [Token TokEof l c]
- , layoutColumns = []
- }
- pushStartCode pending
- pure (Token TokLEnd l c)
- _ -> pure $ Token TokEof l c
-
- yield k inp i = clearPendingLC *> yield' k inp i
-
- yield' k (Posn l c, _, s, _) i = do
- pure (Token (k $! (T.decodeUtf8 (Lbs.toStrict (Lbs.take i s)))) l c)
-
- setPendingLC = mapUserState $ \s -> s { pendingLambdaCase = True }
- clearPendingLC = mapUserState $ \s -> s { pendingLambdaCase = False }
-
- always :: TokenClass -> AlexInput -> Int64 -> Alex Token
- always k x i = yield (const k) x i
-
- -- reset the string buffer and push the string start code
- startString (p, _, _, _) _ = do
- mapUserState $ \s -> s { stringBuffer = T.empty, stringStartPosn = Just p }
- pushStartCode string
- alexMonadScan
-
- -- pop the string start code, and emit the string buffer as a token.
- endString (Posn l c, _, _, _) _i = do
- state <- getUserState
-
- mapUserState $ \s -> s { stringBuffer = T.empty, stringStartPosn = Nothing }
- popStartCode
-
- let (Just (Posn l c)) = stringStartPosn state
- pure (Token (TokString (stringBuffer state)) l c)
-
- -- append a /lexed/ region to the string buffer
- stringSegment (Posn _ _, _, buf, _) i = do
- mapUserState $ \s -> s { stringBuffer = stringBuffer s <> T.decodeUtf8 (Lbs.toStrict (Lbs.take i buf)) }
- alexMonadScan
-
- -- append a constant fragment to the string buffer.
- stringAppend text _ _ = do
- mapUserState $ \s -> s { stringBuffer = stringBuffer s <> text }
- alexMonadScan
-
- emitPendingToken :: AlexAction Token
- emitPendingToken _ _ = do
- t <- getUserState
- case pendingTokens t of
- [] -> do
- popStartCode
- alexMonadScan
- (x:xs) -> do
- mapUserState $ \s -> s { pendingTokens = xs }
- pure x
-
- delayToken :: Token -> Alex ()
- delayToken t = do
- mapUserState $ \s -> s { pendingTokens = t:pendingTokens s }
- pushStartCode pending
-
- just :: Alex a -> AlexAction Token
- just k _ _ = k *> alexMonadScan
-
- getUserState :: Alex AlexUserState
- getUserState = Alex $ \s -> Right (s, alex_ust s)
-
- mapUserState :: (AlexUserState -> AlexUserState) -> Alex ()
- mapUserState k = Alex $ \s -> Right (s { alex_ust = k $! alex_ust s }, ())
-
- pushStartCode :: Int -> Alex ()
- pushStartCode c = do
- sc <- alexGetStartCode
- mapUserState $ \s -> s { startCodes = sc:startCodes s }
- alexSetStartCode c
-
- popStartCode :: Alex ()
- popStartCode = do
- sc <- startCodes <$> getUserState
- case sc of
- [] -> alexSetStartCode 0
- (x:xs) -> do
- mapUserState $ \s -> s { startCodes = xs }
- alexSetStartCode x
-
- offsideRule :: AlexInput -> Int64 -> Alex Token
- offsideRule (Posn line col, _, _, _) _ = do
- columns <- layoutColumns <$> getUserState
-
- let continue = popStartCode *> alexMonadScan
-
- -- The "offside rule" governs how to insert virtual semicolon and
- -- closing '}' tokens. It applies in the "newline" state, and, if we
- -- stay in that state, the rule keeps applying. There are a couple of
- -- cases:
- case columns of
- -- If we have no layout context (or we're in a layout context that
- -- started with a physical '{'), then the offside rule plain doesn't
- -- apply.
- [] -> continue
- ExplicitLayout:_ -> continue
- -- Otherwise, we're dealing with something like
- --
- -- do token
- -- ^ this is the layout column.
- col':ctx -> do
- case col `compare` layoutCol col' of
- -- If we have something like
- --
- -- do token
- -- token
- -- ^ this is where we are
- -- then we emit a semicolon (and possibly do some bookeeping,
- -- like leaving the newline state)
- EQ -> do
- popStartCode
- maybePopImportSC
- pure (Token TokSemi line col)
-
- -- If we have something like
- --
- -- do token
- -- token
- -- ^ this is where we are
- -- then we don't emit anything, just leave the newline state,
- -- since this token continues the previous logical line.
- GT -> continue
-
- -- If we have something like
- --
- -- C D E
- -- do token
- -- do token
- -- do token
- -- token
- -- ^ we are here
- -- then we're behind the layout context, but not just one, three!
- -- we emit a closing '}' to close context 'E', and STAY in the
- -- newline context. when we eventually end up here again
- -- (recurring interleaved with the lexer state machine), we
- -- close the D and C contexts in the same way.
- LT -> do
- mapUserState $ \s -> s { layoutColumns = ctx }
- pure (Token TokLEnd line col)
-
- -- eventually we either exhaust all the layout contexts or get
- -- to a layout context we're EQ or GT compared to. in that case
- -- one of the other rules apply.
-
- maybePopImportSC :: Alex ()
- maybePopImportSC = do
- startcode <- alexGetStartCode
- when (startcode == import_) popStartCode
-
- emptyLayout :: AlexInput -> Int64 -> Alex Token
- emptyLayout (Posn line col, _, _, _) _ = do
- popStartCode
- pushStartCode newline
- pure (Token TokLEnd line col)
-
- startLayout :: AlexInput -> Int64 -> Alex Token
- startLayout (Posn line col, _, _, _) _ = do
- state <- getUserState
- popStartCode
- let
- col' =
- case layoutColumns state of
- [] -> 0
- (x:_) -> layoutCol x
-
- layoutKind = case pendingLayoutKw state of
- Just s -> s
- Nothing -> Layout
-
- -- here's another rule. suppose we have:
- --
- -- foo = bar where
- -- spam = ham
- --
- -- if we just apply the rule that the next token after a layout
- -- keyword determines the column for the layout context, then we're
- -- starting another layout context at column 1! that's definitely not
- -- what we want.
- --
- -- so a new layout context only starts if the first token is to the right
- -- of the previous layout context. that is: a block only starts if it's
- -- on the same line as the layout context, or indented further.
- if col <= col'
- then pushStartCode empty_layout
- else mapUserState $ \s -> s { layoutColumns = layoutKind col:layoutColumns s }
- pure (Token TokLStart line col)
-
- popLayoutContext :: Alex ()
- popLayoutContext = mapUserState $ \s -> s { layoutColumns = tail (layoutColumns s) }
-
- openBrace :: AlexInput -> Int64 -> Alex Token
- openBrace (Posn line col, _, _, _) _ = do
- -- if we see a '{' token, we're probably in the layout state. in that
- -- case, we pop it! otherwise, we just pop the state anyway: if we
- -- were in <0>, then popping gets us back in <0>.
- popStartCode
-
- -- we push an ExplicitLayout state so that the offside rule stops
- -- applying (logical lines are delimited by physical semicolons) and a
- -- '}' can close it.
- mapUserState $ \s -> s { layoutColumns = ExplicitLayout:layoutColumns s }
- pure (Token TokOBrace line col)
-
- closeBrace :: AlexInput -> Int64 -> Alex Token
- closeBrace (Posn line col, _, _, _) _ = do
- -- if we're lexing a '}' token (physical) and the rightmost layout
- -- context was started by a physical '{', then we can close it.
- -- otherwise we do nothing and probably get a parse error!
- columns <- layoutColumns <$> getUserState
- case columns of
- ExplicitLayout:_ -> popLayoutContext
- _ -> pure ()
- pure (Token TokCBrace line col)
-
- variableOrKeyword :: AlexAction Token
- variableOrKeyword (Posn l c, _, s, _) size = finishVarKw l c $ T.decodeUtf8 (Lbs.toStrict (Lbs.take size s))
-
- finishVarKw :: Int -> Int -> T.Text -> Alex Token
- finishVarKw l c text
- | T.null text = undefined
- | Data.Char.isUpper (T.head text), T.singleton '.' `T.isInfixOf` text = pure $
- -- if we have a token like A.B.C, we reverse it and span at the
- -- first (last) dot, so that we have, e.g.:
- --
- -- "Aa.Bb.Cc" -> "cC.bB.aA"
- -- "Cc.Bb.Aa" -> ("Cc", ".bB.aA")
- --
- -- what we have then is the suffix and the prefix, but they've both
- -- been reversed. so we unreverse them, and also drop the first
- -- (last) dot from the prefix.
- --
- -- if the suffix starts with an uppercase letter, it's a constructor
- -- symbol (ConId).
- let
- txet = T.reverse text
- (suffix', prefix') = T.span (/= '.') txet
-
- prefix = T.reverse (T.tail prefix')
- suffix = T.reverse suffix'
- in if Data.Char.isUpper (T.head suffix)
- then Token (TokQual ConId prefix suffix) l c
- else Token (TokQual VarId prefix suffix) l c
- | Data.Char.isUpper (T.head text) = pure $ Token (TokUnqual ConId text) l c
-
- finishVarKw l c text = do
- sc <- alexGetStartCode
- state <- getUserState
-
- clearPendingLC
-
- let col = layoutCol (head (layoutColumns state))
-
- case T.unpack text of
- -- we handle the contextual 'as'/'qualified' tokens using a
- -- startcode.
- --
- -- in the import_ state, as and qualified are keywords, unless the
- -- offside rule would apply to emit a ';' or '}' token. in that
- -- case, we emit a semicolon (what the offside rule would do!), and
- -- set the "keyword" (now changed to an identifier) as pending, so
- -- that it will be emitted by the next alexMonadScan.
- "import" -> do
- pushStartCode import_
- pure (Token TokImport l c)
-
- "as"
- | sc == import_, c > col -> pure (Token TokAs l c)
- | sc == import_ -> offsideKeyword (TokUnqual VarId text) l c
- | otherwise -> pure (Token (TokUnqual VarId text) l c)
-
- "qualified"
- | sc == import_, c > col -> pure (Token TokQualified l c)
- | sc == import_ -> offsideKeyword (TokUnqual VarId text) l c
- | otherwise -> pure (Token (TokUnqual VarId text) l c)
-
- -- when starting a layout context for let expressions we make sure
- -- that it is distinguishable from layout contexts started by
- -- anything else, because let layout contexts can be terminated
- -- ahead of time by the 'in' token. for instance in:
- --
- -- let x = 1 in x
- --
- -- there is no reason for the layout context that started after
- -- 'let' to be terminated by the 'in' token, since the offside rule
- -- hasn't had a chance to apply. the token stream in that case would look like
- --
- -- 'let' '{' x '=' 1 'in' x
- --
- -- which is a parse error. we do not implement the rule which says parse errors
- -- terminate layout contexts, instead doing this approximation.
- "let" -> laidOut' (Just LetLayout) TokLet l c
- "in" -> do
- laidout <- layoutColumns <$> getUserState
- case laidout of
- LetLayout _:_ -> earlyEnd TokIn l c
- _ -> pure (Token TokIn l c)
-
- "data" -> pure (Token TokData l c)
-
- "where" -> laidOut TokWhere l c
- "module" -> pure (Token TokModule l c)
-
- -- when we lex a \ token, a flag is set in the lexer state to
- -- indicate that, if there is a 'case' token directly following,
- -- that token is to be interpreted as part of a lambda-case
- -- construct, and so must start a layout context for its branches.
- "case"
- | pendingLambdaCase state -> laidOut TokCase l c
- | otherwise -> pure (Token TokCase l c)
-
- "of" -> laidOut TokOf l c
-
- (_:_) -> pure (Token (TokUnqual VarId text) l c)
-
- [] -> error "empty keyword/identifier"
-
- earlyEnd :: TokenClass -> Int -> Int -> Alex Token
- earlyEnd tok l c = do
- popLayoutContext
- delayToken (Token tok l c)
- pure (Token TokLEnd l c)
-
- offsideKeyword :: TokenClass -> Int -> Int -> Alex Token
- offsideKeyword tok l c = do
- popStartCode
- delayToken (Token tok l c)
- pure (Token TokSemi l c)
-
- laidOut' :: Maybe (Int -> LayoutState) -> TokenClass -> Int -> Int -> Alex Token
- laidOut' n x l c = do
- pushStartCode layout
- mapUserState $ \s -> s { leastColumn = c, pendingLayoutKw = n }
- pure (Token x l c)
-
- laidOut = laidOut' Nothing
-
- alexMonadScan = do
- inp@(_,_,_,n) <- alexGetInput
-
- sc <- alexGetStartCode
- case alexScan inp sc of
- AlexEOF -> alexEOF
- AlexError error@(_,_,inp,_) ->
- alexError $ "Unexpected character: " ++ show (T.head (T.decodeUtf8 (Lbs.toStrict inp)))
- AlexSkip inp _len -> do
- alexSetInput inp
- alexMonadScan
- AlexToken inp'@(_,_,_,n') _ action -> let len = n'-n in do
- alexSetInput inp'
- action (ignorePendingBytes inp) len
- }
|