|
{
|
|
module Frontend.Autogen.Lexer where
|
|
|
|
import Control.Monad
|
|
|
|
import qualified Data.ByteString.Lazy as Lbs
|
|
import qualified Data.Text.Encoding as T
|
|
import qualified Data.Text as T
|
|
import qualified Data.Char
|
|
import Data.Int (Int64)
|
|
|
|
import Frontend.Lexer.Wrapper
|
|
import Frontend.Lexer.Tokens
|
|
import Frontend.Parser.Posn
|
|
}
|
|
|
|
-- %wrapper "monadUserState-bytestring"
|
|
|
|
$alpha = [a-zA-Z]
|
|
$digit = [0-9]
|
|
$white_nol = $white # [\n\t]
|
|
|
|
tokens :-
|
|
$white_nol+ ;
|
|
\t { \_ _ -> alexError "tab character in source code" }
|
|
|
|
<0,import_> "--" .* \n
|
|
{ just $ pushStartCode newline }
|
|
|
|
<0,import_>
|
|
$alpha [$alpha $digit \_ \' \.]* { variableOrKeyword }
|
|
|
|
<0> \= { always TokEqual }
|
|
<0> \: \: { always TokDoubleColon }
|
|
|
|
<0> \\ { \i l -> setPendingLC *> yield' (const TokLambda) i l }
|
|
<0> "->" { always TokArrow }
|
|
<0> "_" { always TokUnder }
|
|
|
|
<0> \{ { always TokOBrace }
|
|
<0> \[ { always TokOSquare }
|
|
|
|
<0,import_> {
|
|
\, { always TokComma }
|
|
\( { always TokOParen }
|
|
\) { always TokCParen }
|
|
}
|
|
|
|
<0> \} { closeBrace }
|
|
<0> \] { always TokCSquare }
|
|
|
|
<0> \;+ { always TokSemi }
|
|
|
|
<0,import_> \n { just $ pushStartCode newline }
|
|
|
|
<0> \" { startString }
|
|
|
|
<string> {
|
|
\\ \" { stringAppend (T.singleton '"') }
|
|
\\ \\ { stringAppend (T.singleton '\\') }
|
|
|
|
\\ a { stringAppend (T.singleton '\a') }
|
|
\\ b { stringAppend (T.singleton '\b') }
|
|
\\ f { stringAppend (T.singleton '\f') }
|
|
\\ n { stringAppend (T.singleton '\n') }
|
|
\\ \n { stringAppend (T.singleton '\n') }
|
|
\\ r { stringAppend (T.singleton '\r') }
|
|
\\ v { stringAppend (T.singleton '\v') }
|
|
\\ t { stringAppend (T.singleton '\t') }
|
|
|
|
\" { endString }
|
|
|
|
[^\\\"]+ { stringSegment }
|
|
}
|
|
|
|
<0,newline,comment,import_>
|
|
"{-" { just $ pushStartCode comment }
|
|
|
|
<comment> {
|
|
"-}" { \_ _ -> popStartCode *> alexMonadScan }
|
|
. ;
|
|
}
|
|
|
|
-- newline: emit a semicolon when de-denting
|
|
<newline> {
|
|
\n ;
|
|
"--" .* \n ;
|
|
|
|
() { offsideRule }
|
|
}
|
|
|
|
-- layout: indentation of the next token is context for offside rule
|
|
<layout> {
|
|
\n ;
|
|
"--" .* \n ;
|
|
|
|
\{ { openBrace }
|
|
() { startLayout }
|
|
}
|
|
|
|
<import_> {
|
|
\n { just $ pushStartCode newline }
|
|
"--" .* \n { just $ pushStartCode newline }
|
|
}
|
|
|
|
<empty_layout> () { emptyLayout }
|
|
|
|
<pending> () { emitPendingToken }
|
|
|
|
{
|
|
alexEOF :: Alex Token
|
|
alexEOF = do
|
|
(Posn l c, _, _, _) <- alexGetInput
|
|
|
|
maybePopImportSC
|
|
|
|
state <- getUserState
|
|
|
|
unless (T.null (stringBuffer state)) $ do
|
|
alexError $ "Unterminated string literal at line " ++ show l ++ ", column " ++ show c
|
|
|
|
case layoutColumns state of
|
|
-- EOF is allowed to close as many layout contexts as there are
|
|
-- pending (number of pending layout contexts is the length of the
|
|
-- list minus one, since there's the one initial layout context.)
|
|
_:tail -> do
|
|
mapUserState $ \s ->
|
|
s { pendingTokens = (Token TokLEnd l c <$ tail) ++ [Token TokEof l c]
|
|
, layoutColumns = []
|
|
}
|
|
pushStartCode pending
|
|
pure (Token TokLEnd l c)
|
|
_ -> pure $ Token TokEof l c
|
|
|
|
yield k inp i = clearPendingLC *> yield' k inp i
|
|
|
|
yield' k (Posn l c, _, s, _) i = do
|
|
pure (Token (k $! (T.decodeUtf8 (Lbs.toStrict (Lbs.take i s)))) l c)
|
|
|
|
setPendingLC = mapUserState $ \s -> s { pendingLambdaCase = True }
|
|
clearPendingLC = mapUserState $ \s -> s { pendingLambdaCase = False }
|
|
|
|
always :: TokenClass -> AlexInput -> Int64 -> Alex Token
|
|
always k x i = yield (const k) x i
|
|
|
|
-- reset the string buffer and push the string start code
|
|
startString (p, _, _, _) _ = do
|
|
mapUserState $ \s -> s { stringBuffer = T.empty, stringStartPosn = Just p }
|
|
pushStartCode string
|
|
alexMonadScan
|
|
|
|
-- pop the string start code, and emit the string buffer as a token.
|
|
endString (Posn l c, _, _, _) _i = do
|
|
state <- getUserState
|
|
|
|
mapUserState $ \s -> s { stringBuffer = T.empty, stringStartPosn = Nothing }
|
|
popStartCode
|
|
|
|
let (Just (Posn l c)) = stringStartPosn state
|
|
pure (Token (TokString (stringBuffer state)) l c)
|
|
|
|
-- append a /lexed/ region to the string buffer
|
|
stringSegment (Posn _ _, _, buf, _) i = do
|
|
mapUserState $ \s -> s { stringBuffer = stringBuffer s <> T.decodeUtf8 (Lbs.toStrict (Lbs.take i buf)) }
|
|
alexMonadScan
|
|
|
|
-- append a constant fragment to the string buffer.
|
|
stringAppend text _ _ = do
|
|
mapUserState $ \s -> s { stringBuffer = stringBuffer s <> text }
|
|
alexMonadScan
|
|
|
|
emitPendingToken :: AlexAction Token
|
|
emitPendingToken _ _ = do
|
|
t <- getUserState
|
|
case pendingTokens t of
|
|
[] -> do
|
|
popStartCode
|
|
alexMonadScan
|
|
(x:xs) -> do
|
|
mapUserState $ \s -> s { pendingTokens = xs }
|
|
pure x
|
|
|
|
delayToken :: Token -> Alex ()
|
|
delayToken t = do
|
|
mapUserState $ \s -> s { pendingTokens = t:pendingTokens s }
|
|
pushStartCode pending
|
|
|
|
just :: Alex a -> AlexAction Token
|
|
just k _ _ = k *> alexMonadScan
|
|
|
|
getUserState :: Alex AlexUserState
|
|
getUserState = Alex $ \s -> Right (s, alex_ust s)
|
|
|
|
mapUserState :: (AlexUserState -> AlexUserState) -> Alex ()
|
|
mapUserState k = Alex $ \s -> Right (s { alex_ust = k $! alex_ust s }, ())
|
|
|
|
pushStartCode :: Int -> Alex ()
|
|
pushStartCode c = do
|
|
sc <- alexGetStartCode
|
|
mapUserState $ \s -> s { startCodes = sc:startCodes s }
|
|
alexSetStartCode c
|
|
|
|
popStartCode :: Alex ()
|
|
popStartCode = do
|
|
sc <- startCodes <$> getUserState
|
|
case sc of
|
|
[] -> alexSetStartCode 0
|
|
(x:xs) -> do
|
|
mapUserState $ \s -> s { startCodes = xs }
|
|
alexSetStartCode x
|
|
|
|
offsideRule :: AlexInput -> Int64 -> Alex Token
|
|
offsideRule (Posn line col, _, _, _) _ = do
|
|
columns <- layoutColumns <$> getUserState
|
|
|
|
let continue = popStartCode *> alexMonadScan
|
|
|
|
-- The "offside rule" governs how to insert virtual semicolon and
|
|
-- closing '}' tokens. It applies in the "newline" state, and, if we
|
|
-- stay in that state, the rule keeps applying. There are a couple of
|
|
-- cases:
|
|
case columns of
|
|
-- If we have no layout context (or we're in a layout context that
|
|
-- started with a physical '{'), then the offside rule plain doesn't
|
|
-- apply.
|
|
[] -> continue
|
|
ExplicitLayout:_ -> continue
|
|
-- Otherwise, we're dealing with something like
|
|
--
|
|
-- do token
|
|
-- ^ this is the layout column.
|
|
col':ctx -> do
|
|
case col `compare` layoutCol col' of
|
|
-- If we have something like
|
|
--
|
|
-- do token
|
|
-- token
|
|
-- ^ this is where we are
|
|
-- then we emit a semicolon (and possibly do some bookeeping,
|
|
-- like leaving the newline state)
|
|
EQ -> do
|
|
popStartCode
|
|
maybePopImportSC
|
|
pure (Token TokSemi line col)
|
|
|
|
-- If we have something like
|
|
--
|
|
-- do token
|
|
-- token
|
|
-- ^ this is where we are
|
|
-- then we don't emit anything, just leave the newline state,
|
|
-- since this token continues the previous logical line.
|
|
GT -> continue
|
|
|
|
-- If we have something like
|
|
--
|
|
-- C D E
|
|
-- do token
|
|
-- do token
|
|
-- do token
|
|
-- token
|
|
-- ^ we are here
|
|
-- then we're behind the layout context, but not just one, three!
|
|
-- we emit a closing '}' to close context 'E', and STAY in the
|
|
-- newline context. when we eventually end up here again
|
|
-- (recurring interleaved with the lexer state machine), we
|
|
-- close the D and C contexts in the same way.
|
|
LT -> do
|
|
mapUserState $ \s -> s { layoutColumns = ctx }
|
|
pure (Token TokLEnd line col)
|
|
|
|
-- eventually we either exhaust all the layout contexts or get
|
|
-- to a layout context we're EQ or GT compared to. in that case
|
|
-- one of the other rules apply.
|
|
|
|
maybePopImportSC :: Alex ()
|
|
maybePopImportSC = do
|
|
startcode <- alexGetStartCode
|
|
when (startcode == import_) popStartCode
|
|
|
|
emptyLayout :: AlexInput -> Int64 -> Alex Token
|
|
emptyLayout (Posn line col, _, _, _) _ = do
|
|
popStartCode
|
|
pushStartCode newline
|
|
pure (Token TokLEnd line col)
|
|
|
|
startLayout :: AlexInput -> Int64 -> Alex Token
|
|
startLayout (Posn line col, _, _, _) _ = do
|
|
state <- getUserState
|
|
popStartCode
|
|
let
|
|
col' =
|
|
case layoutColumns state of
|
|
[] -> 0
|
|
(x:_) -> layoutCol x
|
|
|
|
layoutKind = case pendingLayoutKw state of
|
|
Just s -> s
|
|
Nothing -> Layout
|
|
|
|
-- here's another rule. suppose we have:
|
|
--
|
|
-- foo = bar where
|
|
-- spam = ham
|
|
--
|
|
-- if we just apply the rule that the next token after a layout
|
|
-- keyword determines the column for the layout context, then we're
|
|
-- starting another layout context at column 1! that's definitely not
|
|
-- what we want.
|
|
--
|
|
-- so a new layout context only starts if the first token is to the right
|
|
-- of the previous layout context. that is: a block only starts if it's
|
|
-- on the same line as the layout context, or indented further.
|
|
if col <= col'
|
|
then pushStartCode empty_layout
|
|
else mapUserState $ \s -> s { layoutColumns = layoutKind col:layoutColumns s }
|
|
pure (Token TokLStart line col)
|
|
|
|
popLayoutContext :: Alex ()
|
|
popLayoutContext = mapUserState $ \s -> s { layoutColumns = tail (layoutColumns s) }
|
|
|
|
openBrace :: AlexInput -> Int64 -> Alex Token
|
|
openBrace (Posn line col, _, _, _) _ = do
|
|
-- if we see a '{' token, we're probably in the layout state. in that
|
|
-- case, we pop it! otherwise, we just pop the state anyway: if we
|
|
-- were in <0>, then popping gets us back in <0>.
|
|
popStartCode
|
|
|
|
-- we push an ExplicitLayout state so that the offside rule stops
|
|
-- applying (logical lines are delimited by physical semicolons) and a
|
|
-- '}' can close it.
|
|
mapUserState $ \s -> s { layoutColumns = ExplicitLayout:layoutColumns s }
|
|
pure (Token TokOBrace line col)
|
|
|
|
closeBrace :: AlexInput -> Int64 -> Alex Token
|
|
closeBrace (Posn line col, _, _, _) _ = do
|
|
-- if we're lexing a '}' token (physical) and the rightmost layout
|
|
-- context was started by a physical '{', then we can close it.
|
|
-- otherwise we do nothing and probably get a parse error!
|
|
columns <- layoutColumns <$> getUserState
|
|
case columns of
|
|
ExplicitLayout:_ -> popLayoutContext
|
|
_ -> pure ()
|
|
pure (Token TokCBrace line col)
|
|
|
|
variableOrKeyword :: AlexAction Token
|
|
variableOrKeyword (Posn l c, _, s, _) size = finishVarKw l c $ T.decodeUtf8 (Lbs.toStrict (Lbs.take size s))
|
|
|
|
finishVarKw :: Int -> Int -> T.Text -> Alex Token
|
|
finishVarKw l c text
|
|
| T.null text = undefined
|
|
| Data.Char.isUpper (T.head text), T.singleton '.' `T.isInfixOf` text = pure $
|
|
-- if we have a token like A.B.C, we reverse it and span at the
|
|
-- first (last) dot, so that we have, e.g.:
|
|
--
|
|
-- "Aa.Bb.Cc" -> "cC.bB.aA"
|
|
-- "Cc.Bb.Aa" -> ("Cc", ".bB.aA")
|
|
--
|
|
-- what we have then is the suffix and the prefix, but they've both
|
|
-- been reversed. so we unreverse them, and also drop the first
|
|
-- (last) dot from the prefix.
|
|
--
|
|
-- if the suffix starts with an uppercase letter, it's a constructor
|
|
-- symbol (ConId).
|
|
let
|
|
txet = T.reverse text
|
|
(suffix', prefix') = T.span (/= '.') txet
|
|
|
|
prefix = T.reverse (T.tail prefix')
|
|
suffix = T.reverse suffix'
|
|
in if Data.Char.isUpper (T.head suffix)
|
|
then Token (TokQual ConId prefix suffix) l c
|
|
else Token (TokQual VarId prefix suffix) l c
|
|
| Data.Char.isUpper (T.head text) = pure $ Token (TokUnqual ConId text) l c
|
|
|
|
finishVarKw l c text = do
|
|
sc <- alexGetStartCode
|
|
state <- getUserState
|
|
|
|
clearPendingLC
|
|
|
|
let col = layoutCol (head (layoutColumns state))
|
|
|
|
case T.unpack text of
|
|
-- we handle the contextual 'as'/'qualified' tokens using a
|
|
-- startcode.
|
|
--
|
|
-- in the import_ state, as and qualified are keywords, unless the
|
|
-- offside rule would apply to emit a ';' or '}' token. in that
|
|
-- case, we emit a semicolon (what the offside rule would do!), and
|
|
-- set the "keyword" (now changed to an identifier) as pending, so
|
|
-- that it will be emitted by the next alexMonadScan.
|
|
"import" -> do
|
|
pushStartCode import_
|
|
pure (Token TokImport l c)
|
|
|
|
"as"
|
|
| sc == import_, c > col -> pure (Token TokAs l c)
|
|
| sc == import_ -> offsideKeyword (TokUnqual VarId text) l c
|
|
| otherwise -> pure (Token (TokUnqual VarId text) l c)
|
|
|
|
"qualified"
|
|
| sc == import_, c > col -> pure (Token TokQualified l c)
|
|
| sc == import_ -> offsideKeyword (TokUnqual VarId text) l c
|
|
| otherwise -> pure (Token (TokUnqual VarId text) l c)
|
|
|
|
-- when starting a layout context for let expressions we make sure
|
|
-- that it is distinguishable from layout contexts started by
|
|
-- anything else, because let layout contexts can be terminated
|
|
-- ahead of time by the 'in' token. for instance in:
|
|
--
|
|
-- let x = 1 in x
|
|
--
|
|
-- there is no reason for the layout context that started after
|
|
-- 'let' to be terminated by the 'in' token, since the offside rule
|
|
-- hasn't had a chance to apply. the token stream in that case would look like
|
|
--
|
|
-- 'let' '{' x '=' 1 'in' x
|
|
--
|
|
-- which is a parse error. we do not implement the rule which says parse errors
|
|
-- terminate layout contexts, instead doing this approximation.
|
|
"let" -> laidOut' (Just LetLayout) TokLet l c
|
|
"in" -> do
|
|
laidout <- layoutColumns <$> getUserState
|
|
case laidout of
|
|
LetLayout _:_ -> earlyEnd TokIn l c
|
|
_ -> pure (Token TokIn l c)
|
|
|
|
"data" -> pure (Token TokData l c)
|
|
|
|
"where" -> laidOut TokWhere l c
|
|
"module" -> pure (Token TokModule l c)
|
|
|
|
-- when we lex a \ token, a flag is set in the lexer state to
|
|
-- indicate that, if there is a 'case' token directly following,
|
|
-- that token is to be interpreted as part of a lambda-case
|
|
-- construct, and so must start a layout context for its branches.
|
|
"case"
|
|
| pendingLambdaCase state -> laidOut TokCase l c
|
|
| otherwise -> pure (Token TokCase l c)
|
|
|
|
"of" -> laidOut TokOf l c
|
|
|
|
(_:_) -> pure (Token (TokUnqual VarId text) l c)
|
|
|
|
[] -> error "empty keyword/identifier"
|
|
|
|
earlyEnd :: TokenClass -> Int -> Int -> Alex Token
|
|
earlyEnd tok l c = do
|
|
popLayoutContext
|
|
delayToken (Token tok l c)
|
|
pure (Token TokLEnd l c)
|
|
|
|
offsideKeyword :: TokenClass -> Int -> Int -> Alex Token
|
|
offsideKeyword tok l c = do
|
|
popStartCode
|
|
delayToken (Token tok l c)
|
|
pure (Token TokSemi l c)
|
|
|
|
laidOut' :: Maybe (Int -> LayoutState) -> TokenClass -> Int -> Int -> Alex Token
|
|
laidOut' n x l c = do
|
|
pushStartCode layout
|
|
mapUserState $ \s -> s { leastColumn = c, pendingLayoutKw = n }
|
|
pure (Token x l c)
|
|
|
|
laidOut = laidOut' Nothing
|
|
|
|
alexMonadScan = do
|
|
inp@(_,_,_,n) <- alexGetInput
|
|
|
|
sc <- alexGetStartCode
|
|
case alexScan inp sc of
|
|
AlexEOF -> alexEOF
|
|
AlexError error@(_,_,inp,_) ->
|
|
alexError $ "Unexpected character: " ++ show (T.head (T.decodeUtf8 (Lbs.toStrict inp)))
|
|
AlexSkip inp _len -> do
|
|
alexSetInput inp
|
|
alexMonadScan
|
|
AlexToken inp'@(_,_,_,n') _ action -> let len = n'-n in do
|
|
alexSetInput inp'
|
|
action (ignorePendingBytes inp) len
|
|
}
|