amelia
/
ahc


{

module Frontend.Autogen.Lexer where


import Control.Monad


import qualified Data.ByteString.Lazy as Lbs

import qualified Data.Text.Encoding as T

import qualified Data.Text as T

import qualified Data.Char

import Data.Int (Int64)


import Frontend.Lexer.Wrapper

import Frontend.Lexer.Tokens

import Frontend.Parser.Posn

}


-- %wrapper "monadUserState-bytestring"


$alpha = [a-zA-Z]

$digit = [0-9]

$white_nol = $white # [\n\t]


tokens :-

 $white_nol+ ;

 \t { \_ _ -> alexError "tab character in source code" }


<0,import_> "--" .* \n

 { just $ pushStartCode newline }


<0,import_>

 $alpha [$alpha $digit \_ \' \.]* { variableOrKeyword }


<0> \= { always TokEqual }

<0> \: \: { always TokDoubleColon }


<0> \\ { \i l -> setPendingLC *> yield' (const TokLambda) i l }

<0> "->" { always TokArrow }

<0> "_" { always TokUnder }


<0> \{ { always TokOBrace }

<0> \[ { always TokOSquare }


<0,import_> {

 \, { always TokComma }

 \( { always TokOParen }

 \) { always TokCParen }

}


<0> \} { closeBrace }

<0> \] { always TokCSquare }


<0> \;+ { always TokSemi }


<0,import_> \n { just $ pushStartCode newline }


<0> \" { startString }


<string> {

 \\ \" { stringAppend (T.singleton '"') }

 \\ \\ { stringAppend (T.singleton '\\') }


 \\ a { stringAppend (T.singleton '\a') }

 \\ b { stringAppend (T.singleton '\b') }

 \\ f { stringAppend (T.singleton '\f') }

 \\ n { stringAppend (T.singleton '\n') }

 \\ \n { stringAppend (T.singleton '\n') }

 \\ r { stringAppend (T.singleton '\r') }

 \\ v { stringAppend (T.singleton '\v') }

 \\ t { stringAppend (T.singleton '\t') }


 \" { endString }


 [^\\\"]+ { stringSegment }

}


<0,newline,comment,import_>

 "{-" { just $ pushStartCode comment }


<comment> {

 "-}" { \_ _ -> popStartCode *> alexMonadScan }

 . ;

}


-- newline: emit a semicolon when de-denting

<newline> {

 \n ;

 "--" .* \n ;


 () { offsideRule }

}


-- layout: indentation of the next token is context for offside rule

<layout> {

 \n ;

 "--" .* \n ;


 \{ { openBrace }

 () { startLayout }

}


<import_> {

 \n { just $ pushStartCode newline }

 "--" .* \n { just $ pushStartCode newline }

}


<empty_layout> () { emptyLayout }


<pending> () { emitPendingToken }


{

alexEOF :: Alex Token

alexEOF = do

 (Posn l c, _, _, _) <- alexGetInput


 maybePopImportSC


 state <- getUserState


 unless (T.null (stringBuffer state)) $ do

 alexError $ "Unterminated string literal at line " ++ show l ++ ", column " ++ show c


 case layoutColumns state of

 -- EOF is allowed to close as many layout contexts as there are

 -- pending (number of pending layout contexts is the length of the

 -- list minus one, since there's the one initial layout context.)

 _:tail -> do

 mapUserState $ \s ->

 s { pendingTokens = (Token TokLEnd l c <$ tail) ++ [Token TokEof l c]

 , layoutColumns = []

 }

 pushStartCode pending

 pure (Token TokLEnd l c)

 _ -> pure $ Token TokEof l c


yield k inp i = clearPendingLC *> yield' k inp i


yield' k (Posn l c, _, s, _) i = do

 pure (Token (k $! (T.decodeUtf8 (Lbs.toStrict (Lbs.take i s)))) l c)


setPendingLC = mapUserState $ \s -> s { pendingLambdaCase = True }

clearPendingLC = mapUserState $ \s -> s { pendingLambdaCase = False }


always :: TokenClass -> AlexInput -> Int64 -> Alex Token

always k x i = yield (const k) x i


-- reset the string buffer and push the string start code

startString (p, _, _, _) _ = do

 mapUserState $ \s -> s { stringBuffer = T.empty, stringStartPosn = Just p }

 pushStartCode string

 alexMonadScan


-- pop the string start code, and emit the string buffer as a token.

endString (Posn l c, _, _, _) _i = do

 state <- getUserState


 mapUserState $ \s -> s { stringBuffer = T.empty, stringStartPosn = Nothing }

 popStartCode


 let (Just (Posn l c)) = stringStartPosn state

 pure (Token (TokString (stringBuffer state)) l c)


-- append a /lexed/ region to the string buffer

stringSegment (Posn _ _, _, buf, _) i = do

 mapUserState $ \s -> s { stringBuffer = stringBuffer s <> T.decodeUtf8 (Lbs.toStrict (Lbs.take i buf)) }

 alexMonadScan


-- append a constant fragment to the string buffer.

stringAppend text _ _ = do

 mapUserState $ \s -> s { stringBuffer = stringBuffer s <> text }

 alexMonadScan


emitPendingToken :: AlexAction Token

emitPendingToken _ _ = do

 t <- getUserState

 case pendingTokens t of

 [] -> do

 popStartCode

 alexMonadScan

 (x:xs) -> do

 mapUserState $ \s -> s { pendingTokens = xs }

 pure x


delayToken :: Token -> Alex ()

delayToken t = do

 mapUserState $ \s -> s { pendingTokens = t:pendingTokens s }

 pushStartCode pending


just :: Alex a -> AlexAction Token

just k _ _ = k *> alexMonadScan


getUserState :: Alex AlexUserState

getUserState = Alex $ \s -> Right (s, alex_ust s)


mapUserState :: (AlexUserState -> AlexUserState) -> Alex ()

mapUserState k = Alex $ \s -> Right (s { alex_ust = k $! alex_ust s }, ())


pushStartCode :: Int -> Alex ()

pushStartCode c = do

 sc <- alexGetStartCode

 mapUserState $ \s -> s { startCodes = sc:startCodes s }

 alexSetStartCode c


popStartCode :: Alex ()

popStartCode = do

 sc <- startCodes <$> getUserState

 case sc of

 [] -> alexSetStartCode 0

 (x:xs) -> do

 mapUserState $ \s -> s { startCodes = xs }

 alexSetStartCode x


offsideRule :: AlexInput -> Int64 -> Alex Token

offsideRule (Posn line col, _, _, _) _ = do

 columns <- layoutColumns <$> getUserState


 let continue = popStartCode *> alexMonadScan


 -- The "offside rule" governs how to insert virtual semicolon and

 -- closing '}' tokens. It applies in the "newline" state, and, if we

 -- stay in that state, the rule keeps applying. There are a couple of

 -- cases:

 case columns of

 -- If we have no layout context (or we're in a layout context that

 -- started with a physical '{'), then the offside rule plain doesn't

 -- apply.

 [] -> continue

 ExplicitLayout:_ -> continue

 -- Otherwise, we're dealing with something like

 --

 -- do token

 -- ^ this is the layout column.

 col':ctx -> do

 case col `compare` layoutCol col' of

 -- If we have something like

 --

 -- do token

 -- token

 -- ^ this is where we are

 -- then we emit a semicolon (and possibly do some bookeeping,

 -- like leaving the newline state)

 EQ -> do

 popStartCode

 maybePopImportSC

 pure (Token TokSemi line col)


 -- If we have something like

 --

 -- do token

 -- token

 -- ^ this is where we are

 -- then we don't emit anything, just leave the newline state,

 -- since this token continues the previous logical line.

 GT -> continue


 -- If we have something like

 --

 -- C D E

 -- do token

 -- do token

 -- do token

 -- token

 -- ^ we are here

 -- then we're behind the layout context, but not just one, three!

 -- we emit a closing '}' to close context 'E', and STAY in the

 -- newline context. when we eventually end up here again

 -- (recurring interleaved with the lexer state machine), we

 -- close the D and C contexts in the same way.

 LT -> do

 mapUserState $ \s -> s { layoutColumns = ctx }

 pure (Token TokLEnd line col)


 -- eventually we either exhaust all the layout contexts or get

 -- to a layout context we're EQ or GT compared to. in that case

 -- one of the other rules apply.


maybePopImportSC :: Alex ()

maybePopImportSC = do

 startcode <- alexGetStartCode

 when (startcode == import_) popStartCode


emptyLayout :: AlexInput -> Int64 -> Alex Token

emptyLayout (Posn line col, _, _, _) _ = do

 popStartCode

 pushStartCode newline

 pure (Token TokLEnd line col)


startLayout :: AlexInput -> Int64 -> Alex Token

startLayout (Posn line col, _, _, _) _ = do

 state <- getUserState

 popStartCode

 let

 col' =

 case layoutColumns state of

 [] -> 0

 (x:_) -> layoutCol x


 layoutKind = case pendingLayoutKw state of

 Just s -> s

 Nothing -> Layout


 -- here's another rule. suppose we have:

 --

 -- foo = bar where

 -- spam = ham

 --

 -- if we just apply the rule that the next token after a layout

 -- keyword determines the column for the layout context, then we're

 -- starting another layout context at column 1! that's definitely not

 -- what we want.

 --

 -- so a new layout context only starts if the first token is to the right

 -- of the previous layout context. that is: a block only starts if it's

 -- on the same line as the layout context, or indented further.

 if col <= col'

 then pushStartCode empty_layout

 else mapUserState $ \s -> s { layoutColumns = layoutKind col:layoutColumns s }

 pure (Token TokLStart line col)


popLayoutContext :: Alex ()

popLayoutContext = mapUserState $ \s -> s { layoutColumns = tail (layoutColumns s) }


openBrace :: AlexInput -> Int64 -> Alex Token

openBrace (Posn line col, _, _, _) _ = do

 -- if we see a '{' token, we're probably in the layout state. in that

 -- case, we pop it! otherwise, we just pop the state anyway: if we

 -- were in <0>, then popping gets us back in <0>.

 popStartCode


 -- we push an ExplicitLayout state so that the offside rule stops

 -- applying (logical lines are delimited by physical semicolons) and a

 -- '}' can close it.

 mapUserState $ \s -> s { layoutColumns = ExplicitLayout:layoutColumns s }

 pure (Token TokOBrace line col)


closeBrace :: AlexInput -> Int64 -> Alex Token

closeBrace (Posn line col, _, _, _) _ = do

 -- if we're lexing a '}' token (physical) and the rightmost layout

 -- context was started by a physical '{', then we can close it.

 -- otherwise we do nothing and probably get a parse error!

 columns <- layoutColumns <$> getUserState

 case columns of

 ExplicitLayout:_ -> popLayoutContext

 _ -> pure ()

 pure (Token TokCBrace line col)


variableOrKeyword :: AlexAction Token

variableOrKeyword (Posn l c, _, s, _) size = finishVarKw l c $ T.decodeUtf8 (Lbs.toStrict (Lbs.take size s))


finishVarKw :: Int -> Int -> T.Text -> Alex Token

finishVarKw l c text

 | T.null text = undefined

 | Data.Char.isUpper (T.head text), T.singleton '.' `T.isInfixOf` text = pure $

 -- if we have a token like A.B.C, we reverse it and span at the

 -- first (last) dot, so that we have, e.g.:

 --

 -- "Aa.Bb.Cc" -> "cC.bB.aA"

 -- "Cc.Bb.Aa" -> ("Cc", ".bB.aA")

 --

 -- what we have then is the suffix and the prefix, but they've both

 -- been reversed. so we unreverse them, and also drop the first

 -- (last) dot from the prefix.

 --

 -- if the suffix starts with an uppercase letter, it's a constructor

 -- symbol (ConId).

 let

 txet = T.reverse text

 (suffix', prefix') = T.span (/= '.') txet


 prefix = T.reverse (T.tail prefix')

 suffix = T.reverse suffix'

 in if Data.Char.isUpper (T.head suffix)

 then Token (TokQual ConId prefix suffix) l c

 else Token (TokQual VarId prefix suffix) l c

 | Data.Char.isUpper (T.head text) = pure $ Token (TokUnqual ConId text) l c


finishVarKw l c text = do

 sc <- alexGetStartCode

 state <- getUserState


 clearPendingLC


 let col = layoutCol (head (layoutColumns state))


 case T.unpack text of

 -- we handle the contextual 'as'/'qualified' tokens using a

 -- startcode.

 --

 -- in the import_ state, as and qualified are keywords, unless the

 -- offside rule would apply to emit a ';' or '}' token. in that

 -- case, we emit a semicolon (what the offside rule would do!), and

 -- set the "keyword" (now changed to an identifier) as pending, so

 -- that it will be emitted by the next alexMonadScan.

 "import" -> do

 pushStartCode import_

 pure (Token TokImport l c)


 "as"

 | sc == import_, c > col -> pure (Token TokAs l c)

 | sc == import_ -> offsideKeyword (TokUnqual VarId text) l c

 | otherwise -> pure (Token (TokUnqual VarId text) l c)


 "qualified"

 | sc == import_, c > col -> pure (Token TokQualified l c)

 | sc == import_ -> offsideKeyword (TokUnqual VarId text) l c

 | otherwise -> pure (Token (TokUnqual VarId text) l c)


 -- when starting a layout context for let expressions we make sure

 -- that it is distinguishable from layout contexts started by

 -- anything else, because let layout contexts can be terminated

 -- ahead of time by the 'in' token. for instance in:

 --

 -- let x = 1 in x

 --

 -- there is no reason for the layout context that started after

 -- 'let' to be terminated by the 'in' token, since the offside rule

 -- hasn't had a chance to apply. the token stream in that case would look like

 --

 -- 'let' '{' x '=' 1 'in' x

 --

 -- which is a parse error. we do not implement the rule which says parse errors

 -- terminate layout contexts, instead doing this approximation.

 "let" -> laidOut' (Just LetLayout) TokLet l c

 "in" -> do

 laidout <- layoutColumns <$> getUserState

 case laidout of

 LetLayout _:_ -> earlyEnd TokIn l c

 _ -> pure (Token TokIn l c)


 "data" -> pure (Token TokData l c)


 "where" -> laidOut TokWhere l c

 "module" -> pure (Token TokModule l c)


 -- when we lex a \ token, a flag is set in the lexer state to

 -- indicate that, if there is a 'case' token directly following,

 -- that token is to be interpreted as part of a lambda-case

 -- construct, and so must start a layout context for its branches.

 "case"

 | pendingLambdaCase state -> laidOut TokCase l c

 | otherwise -> pure (Token TokCase l c)


 "of" -> laidOut TokOf l c


 (_:_) -> pure (Token (TokUnqual VarId text) l c)


 [] -> error "empty keyword/identifier"


earlyEnd :: TokenClass -> Int -> Int -> Alex Token

earlyEnd tok l c = do

 popLayoutContext

 delayToken (Token tok l c)

 pure (Token TokLEnd l c)


offsideKeyword :: TokenClass -> Int -> Int -> Alex Token

offsideKeyword tok l c = do

 popStartCode

 delayToken (Token tok l c)

 pure (Token TokSemi l c)


laidOut' :: Maybe (Int -> LayoutState) -> TokenClass -> Int -> Int -> Alex Token

laidOut' n x l c = do

 pushStartCode layout

 mapUserState $ \s -> s { leastColumn = c, pendingLayoutKw = n }

 pure (Token x l c)


laidOut = laidOut' Nothing


alexMonadScan = do

 inp@(_,_,_,n) <- alexGetInput


 sc <- alexGetStartCode

 case alexScan inp sc of

 AlexEOF -> alexEOF

 AlexError error@(_,_,inp,_) ->

 alexError $ "Unexpected character: " ++ show (T.head (T.decodeUtf8 (Lbs.toStrict inp)))

 AlexSkip inp _len -> do

 alexSetInput inp

 alexMonadScan

 AlexToken inp'@(_,_,_,n') _ action -> let len = n'-n in do

 alexSetInput inp'

 action (ignorePendingBytes inp) len

}