{ module Frontend.Autogen.Lexer where import Control.Monad import qualified Data.ByteString.Lazy as Lbs import qualified Data.Text.Encoding as T import qualified Data.Text as T import qualified Data.Char import Data.Int (Int64) import Frontend.Lexer.Wrapper import Frontend.Lexer.Tokens import Frontend.Parser.Posn } -- %wrapper "monadUserState-bytestring" $lower = [a-z] $upper = [A-Z] $alpha = [ $lower $upper ] $digit = [0-9] $alnum = [ $alpha $digit ] $white_nol = $white # [\n\t] $optail = [\! \# \$ \% \& \* \+ \. \/ \< \= \> \? \@ \\ \^ \| \- \~ \:] $ophead = $optail # \: @conid = $upper [$alnum \_ \']* @namespace = (@conid \.)* tokens :- $white_nol+ ; \t { \_ _ -> alexError "tab character in source code" } <0,import_,foreign_> "--" .* \n { just $ pushStartCode newline } <0> \= { always TokEqual } <0> \` { always TokTick } <0> \: \: { always TokDoubleColon } \: \: { \i l -> popStartCode *> always TokDoubleColon i l } <0> \\ { \i l -> setPendingLC *> yield' (const TokLambda) i l } <0> "->" { always TokArrow } <0> "_" { always TokUnder } <0> \{ { always TokOBrace } <0> \[ { always TokOSquare } <0,import_,foreign_> { \, { always TokComma } \( { always TokOParen } \) { always TokCParen } } <0> \} { closeBrace } <0> \] { always TokCSquare } <0> \;+ { always TokSemi } <0,import_,foreign_> \n { just $ pushStartCode newline } <0,foreign_> \" { startString } { \\ \" { stringAppend (T.singleton '"') } \\ \\ { stringAppend (T.singleton '\\') } \\ a { stringAppend (T.singleton '\a') } \\ b { stringAppend (T.singleton '\b') } \\ f { stringAppend (T.singleton '\f') } \\ n { stringAppend (T.singleton '\n') } \\ \n { stringAppend (T.singleton '\n') } \\ r { stringAppend (T.singleton '\r') } \\ v { stringAppend (T.singleton '\v') } \\ t { stringAppend (T.singleton '\t') } \" { endString } [^\\\"]+ { stringSegment } } <0,newline,comment,import_> "{-" { just $ pushStartCode comment } { "-}" { \_ _ -> popStartCode *> alexMonadScan } . ; } -- newline: emit a semicolon when de-denting { \n ; "--" .* \n ; () { offsideRule } } -- layout: indentation of the next token is context for offside rule { \n ; "--" .* \n ; \{ { openBrace } () { startLayout } } () { emptyLayout } () { emitPendingToken } -- identifiers and keywords <0,import_,foreign_> { $lower [$alpha $digit \_ \']* { variableOrKeyword } $upper [$alpha $digit \_ \']* { yield (TokUnqual ConId) } $ophead $optail* { yield (TokUnqualOp VarId) } : $optail* { yield (TokUnqualOp ConId) } @namespace $lower [$alpha $digit \_ \']* { qualifiedVariable } @namespace $upper [$alpha $digit \_ \']* { qualifiedVariable } @namespace $ophead $optail* { qualifiedOperator } @namespace : $optail* { qualifiedOperator } } { alexEOF :: Alex Token alexEOF = do (Posn l c, _, _, _) <- alexGetInput maybePopImportSC state <- getUserState unless (T.null (stringBuffer state)) $ do alexError $ "Unterminated string literal at line " ++ show l ++ ", column " ++ show c case layoutColumns state of -- EOF is allowed to close as many layout contexts as there are -- pending (number of pending layout contexts is the length of the -- list minus one, since there's the one initial layout context.) _:tail -> do mapUserState $ \s -> s { pendingTokens = (Token TokLEnd l c <$ tail) ++ [Token TokEof l c] , layoutColumns = [] } pushStartCode pending pure (Token TokLEnd l c) _ -> pure $ Token TokEof l c yield k inp i = clearPendingLC *> yield' k inp i yield' k (Posn l c, _, s, _) i = do pure (Token (k $! (T.decodeUtf8 (Lbs.toStrict (Lbs.take i s)))) l c) setPendingLC = mapUserState $ \s -> s { pendingLambdaCase = True } clearPendingLC = mapUserState $ \s -> s { pendingLambdaCase = False } always :: TokenClass -> AlexInput -> Int64 -> Alex Token always k x i = yield (const k) x i -- reset the string buffer and push the string start code startString (p, _, _, _) _ = do mapUserState $ \s -> s { stringBuffer = T.empty, stringStartPosn = Just p } pushStartCode string alexMonadScan -- pop the string start code, and emit the string buffer as a token. endString (Posn l c, _, _, _) _i = do state <- getUserState mapUserState $ \s -> s { stringBuffer = T.empty, stringStartPosn = Nothing } popStartCode let (Just (Posn l c)) = stringStartPosn state pure (Token (TokString (stringBuffer state)) l c) -- append a /lexed/ region to the string buffer stringSegment (Posn _ _, _, buf, _) i = do mapUserState $ \s -> s { stringBuffer = stringBuffer s <> T.decodeUtf8 (Lbs.toStrict (Lbs.take i buf)) } alexMonadScan -- append a constant fragment to the string buffer. stringAppend text _ _ = do mapUserState $ \s -> s { stringBuffer = stringBuffer s <> text } alexMonadScan emitPendingToken :: AlexAction Token emitPendingToken _ _ = do t <- getUserState case pendingTokens t of [] -> do popStartCode alexMonadScan (x:xs) -> do mapUserState $ \s -> s { pendingTokens = xs } pure x delayToken :: Token -> Alex () delayToken t = do mapUserState $ \s -> s { pendingTokens = t:pendingTokens s } pushStartCode pending just :: Alex a -> AlexAction Token just k _ _ = k *> alexMonadScan getUserState :: Alex AlexUserState getUserState = Alex $ \s -> Right (s, alex_ust s) mapUserState :: (AlexUserState -> AlexUserState) -> Alex () mapUserState k = Alex $ \s -> Right (s { alex_ust = k $! alex_ust s }, ()) pushStartCode :: Int -> Alex () pushStartCode c = do sc <- alexGetStartCode mapUserState $ \s -> s { startCodes = sc:startCodes s } alexSetStartCode c popStartCode :: Alex () popStartCode = do sc <- startCodes <$> getUserState case sc of [] -> alexSetStartCode 0 (x:xs) -> do mapUserState $ \s -> s { startCodes = xs } alexSetStartCode x offsideRule :: AlexInput -> Int64 -> Alex Token offsideRule (Posn line col, _, _, _) _ = do columns <- layoutColumns <$> getUserState let continue = popStartCode *> alexMonadScan -- The "offside rule" governs how to insert virtual semicolon and -- closing '}' tokens. It applies in the "newline" state, and, if we -- stay in that state, the rule keeps applying. There are a couple of -- cases: case columns of -- If we have no layout context (or we're in a layout context that -- started with a physical '{'), then the offside rule plain doesn't -- apply. [] -> continue ExplicitLayout:_ -> continue -- Otherwise, we're dealing with something like -- -- do token -- ^ this is the layout column. col':ctx -> do case col `compare` layoutCol col' of -- If we have something like -- -- do token -- token -- ^ this is where we are -- then we emit a semicolon (and possibly do some bookeeping, -- like leaving the newline state) EQ -> do popStartCode maybePopImportSC pure (Token TokSemi line col) -- If we have something like -- -- do token -- token -- ^ this is where we are -- then we don't emit anything, just leave the newline state, -- since this token continues the previous logical line. GT -> continue -- If we have something like -- -- C D E -- do token -- do token -- do token -- token -- ^ we are here -- then we're behind the layout context, but not just one, three! -- we emit a closing '}' to close context 'E', and STAY in the -- newline context. when we eventually end up here again -- (recurring interleaved with the lexer state machine), we -- close the D and C contexts in the same way. LT -> do mapUserState $ \s -> s { layoutColumns = ctx } pure (Token TokLEnd line col) -- eventually we either exhaust all the layout contexts or get -- to a layout context we're EQ or GT compared to. in that case -- one of the other rules apply. maybePopImportSC :: Alex () maybePopImportSC = do startcode <- alexGetStartCode when (startcode == import_ || startcode == foreign_) popStartCode emptyLayout :: AlexInput -> Int64 -> Alex Token emptyLayout (Posn line col, _, _, _) _ = do popStartCode pushStartCode newline pure (Token TokLEnd line col) startLayout :: AlexInput -> Int64 -> Alex Token startLayout (Posn line col, _, _, _) _ = do state <- getUserState popStartCode let col' = case layoutColumns state of [] -> 0 (x:_) -> layoutCol x layoutKind = case pendingLayoutKw state of Just s -> s Nothing -> Layout -- here's another rule. suppose we have: -- -- foo = bar where -- spam = ham -- -- if we just apply the rule that the next token after a layout -- keyword determines the column for the layout context, then we're -- starting another layout context at column 1! that's definitely not -- what we want. -- -- so a new layout context only starts if the first token is to the right -- of the previous layout context. that is: a block only starts if it's -- on the same line as the layout context, or indented further. if col <= col' then pushStartCode empty_layout else mapUserState $ \s -> s { layoutColumns = layoutKind col:layoutColumns s } pure (Token TokLStart line col) popLayoutContext :: Alex () popLayoutContext = mapUserState $ \s -> s { layoutColumns = tail (layoutColumns s) } openBrace :: AlexInput -> Int64 -> Alex Token openBrace (Posn line col, _, _, _) _ = do -- if we see a '{' token, we're probably in the layout state. in that -- case, we pop it! otherwise, we just pop the state anyway: if we -- were in <0>, then popping gets us back in <0>. popStartCode -- we push an ExplicitLayout state so that the offside rule stops -- applying (logical lines are delimited by physical semicolons) and a -- '}' can close it. mapUserState $ \s -> s { layoutColumns = ExplicitLayout:layoutColumns s } pure (Token TokOBrace line col) closeBrace :: AlexInput -> Int64 -> Alex Token closeBrace (Posn line col, _, _, _) _ = do -- if we're lexing a '}' token (physical) and the rightmost layout -- context was started by a physical '{', then we can close it. -- otherwise we do nothing and probably get a parse error! columns <- layoutColumns <$> getUserState case columns of ExplicitLayout:_ -> popLayoutContext _ -> pure () pure (Token TokCBrace line col) variableOrKeyword :: AlexAction Token variableOrKeyword (Posn l c, _, s, _) size = finishVarKw l c $ T.decodeUtf8 (Lbs.toStrict (Lbs.take size s)) qualifiedVariable :: AlexAction Token qualifiedVariable (Posn l c, _, s, _) size = finishVar TokUnqual TokQual l c $ T.decodeUtf8 (Lbs.toStrict (Lbs.take size s)) qualifiedOperator :: AlexAction Token qualifiedOperator (Posn l c, _, s, _) size = finishVar TokUnqualOp TokQualOp l c $ T.decodeUtf8 (Lbs.toStrict (Lbs.take size s)) finishVarKw :: Int -> Int -> T.Text -> Alex Token finishVarKw l c text | T.null text = undefined | Data.Char.isUpper (T.head text) = finishVar TokUnqual TokQual l c text finishVarKw l c text = do sc <- alexGetStartCode state <- getUserState clearPendingLC let col = layoutCol (head (layoutColumns state)) case T.unpack text of -- we handle the contextual 'as'/'qualified' tokens using a -- startcode. -- -- in the import_ state, as and qualified are keywords, unless the -- offside rule would apply to emit a ';' or '}' token. in that -- case, we emit a semicolon (what the offside rule would do!), and -- set the "keyword" (now changed to an identifier) as pending, so -- that it will be emitted by the next alexMonadScan. "import" -> do when ((sc /= import_) && (sc /= foreign_)) $ pushStartCode import_ pure (Token TokImport l c) "as" -> conditionalKeyword l c import_ (c > col) TokAs "qualified" -> conditionalKeyword l c import_ (c > col) TokQualified "foreign" -> do when (sc /= foreign_) $ pushStartCode foreign_ pure (Token TokForeign l c) "export" -> conditionalKeyword l c foreign_ (c > col) TokExport "safe" -> conditionalKeyword l c foreign_ (c > col) TokSafe "unsafe" -> conditionalKeyword l c foreign_ (c > col) TokUnsafe "ccall" -> conditionalKeyword l c foreign_ (c > col) TokCCall -- when starting a layout context for let expressions we make sure -- that it is distinguishable from layout contexts started by -- anything else, because let layout contexts can be terminated -- ahead of time by the 'in' token. for instance in: -- -- let x = 1 in x -- -- there is no reason for the layout context that started after -- 'let' to be terminated by the 'in' token, since the offside rule -- hasn't had a chance to apply. the token stream in that case would look like -- -- 'let' '{' x '=' 1 'in' x -- -- which is a parse error. we do not implement the rule which says parse errors -- terminate layout contexts, instead doing this approximation. "let" -> laidOut' (Just LetLayout) TokLet l c "in" -> do laidout <- layoutColumns <$> getUserState case laidout of LetLayout _:_ -> earlyEnd TokIn l c _ -> pure (Token TokIn l c) "data" -> pure (Token TokData l c) "where" -> laidOut TokWhere l c "module" -> pure (Token TokModule l c) -- when we lex a \ token, a flag is set in the lexer state to -- indicate that, if there is a 'case' token directly following, -- that token is to be interpreted as part of a lambda-case -- construct, and so must start a layout context for its branches. "case" | pendingLambdaCase state -> laidOut TokCase l c | otherwise -> pure (Token TokCase l c) "of" -> laidOut TokOf l c (_:_) -> pure (Token (TokUnqual VarId text) l c) [] -> error "empty keyword/identifier" conditionalKeyword l c import_ cond kw = do sc <- alexGetStartCode case () of () | sc == import_, cond -> pure (Token kw l c) | sc == import_ -> offsideKeyword (TokUnqual VarId text) l c | otherwise -> pure (Token (TokUnqual VarId text) l c) where text = T.pack (show kw) finishVar :: (IdClass -> T.Text -> TokenClass) -> (IdClass -> T.Text -> T.Text -> TokenClass) -> Int -> Int -> T.Text -> Alex Token finishVar tokunqual tokqual l c text | T.null text = undefined | Data.Char.isUpper (T.head text), T.singleton '.' `T.isInfixOf` text = pure $ let txet = T.reverse text (suffix', prefix') = T.span (/= '.') txet prefix = T.reverse (T.tail prefix') suffix = T.reverse suffix' in if Data.Char.isUpper (T.head suffix) then Token (tokqual ConId prefix suffix) l c else Token (tokqual VarId prefix suffix) l c | Data.Char.isUpper (T.head text) = pure $ Token (tokunqual ConId text) l c | otherwise = pure $ Token (tokunqual VarId text) l c earlyEnd :: TokenClass -> Int -> Int -> Alex Token earlyEnd tok l c = do popLayoutContext delayToken (Token tok l c) pure (Token TokLEnd l c) offsideKeyword :: TokenClass -> Int -> Int -> Alex Token offsideKeyword tok l c = do popStartCode delayToken (Token tok l c) pure (Token TokSemi l c) laidOut' :: Maybe (Int -> LayoutState) -> TokenClass -> Int -> Int -> Alex Token laidOut' n x l c = do pushStartCode layout mapUserState $ \s -> s { leastColumn = c, pendingLayoutKw = n } pure (Token x l c) laidOut = laidOut' Nothing alexMonadScan = do inp@(_,_,_,n) <- alexGetInput sc <- alexGetStartCode case alexScan inp sc of AlexEOF -> alexEOF AlexError error@(_,_,inp,_) -> alexError $ "Unexpected character: " ++ show (T.head (T.decodeUtf8 (Lbs.toStrict inp))) AlexSkip inp _len -> do alexSetInput inp alexMonadScan AlexToken inp'@(_,_,_,n') _ action -> let len = n'-n in do alexSetInput inp' action (ignorePendingBytes inp) len }