Abbie's Haskell compiler
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

481 lines
15 KiB

  1. {
  2. module Frontend.Autogen.Lexer where
  3. import Control.Monad
  4. import qualified Data.ByteString.Lazy as Lbs
  5. import qualified Data.Text.Encoding as T
  6. import qualified Data.Text as T
  7. import qualified Data.Char
  8. import Data.Int (Int64)
  9. import Frontend.Lexer.Wrapper
  10. import Frontend.Lexer.Tokens
  11. import Frontend.Parser.Posn
  12. }
  13. -- %wrapper "monadUserState-bytestring"
  14. $alpha = [a-zA-Z]
  15. $digit = [0-9]
  16. $white_nol = $white # [\n\t]
  17. tokens :-
  18. $white_nol+ ;
  19. \t { \_ _ -> alexError "tab character in source code" }
  20. <0,import_> "--" .* \n
  21. { just $ pushStartCode newline }
  22. <0,import_>
  23. $alpha [$alpha $digit \_ \' \.]* { variableOrKeyword }
  24. <0> \= { always TokEqual }
  25. <0> \: \: { always TokDoubleColon }
  26. <0> \\ { \i l -> setPendingLC *> yield' (const TokLambda) i l }
  27. <0> "->" { always TokArrow }
  28. <0> "_" { always TokUnder }
  29. <0> \{ { always TokOBrace }
  30. <0> \[ { always TokOSquare }
  31. <0,import_> {
  32. \, { always TokComma }
  33. \( { always TokOParen }
  34. \) { always TokCParen }
  35. }
  36. <0> \} { closeBrace }
  37. <0> \] { always TokCSquare }
  38. <0> \;+ { always TokSemi }
  39. <0,import_> \n { just $ pushStartCode newline }
  40. <0> \" { startString }
  41. <string> {
  42. \\ \" { stringAppend (T.singleton '"') }
  43. \\ \\ { stringAppend (T.singleton '\\') }
  44. \\ a { stringAppend (T.singleton '\a') }
  45. \\ b { stringAppend (T.singleton '\b') }
  46. \\ f { stringAppend (T.singleton '\f') }
  47. \\ n { stringAppend (T.singleton '\n') }
  48. \\ \n { stringAppend (T.singleton '\n') }
  49. \\ r { stringAppend (T.singleton '\r') }
  50. \\ v { stringAppend (T.singleton '\v') }
  51. \\ t { stringAppend (T.singleton '\t') }
  52. \" { endString }
  53. [^\\\"]+ { stringSegment }
  54. }
  55. <0,newline,comment,import_>
  56. "{-" { just $ pushStartCode comment }
  57. <comment> {
  58. "-}" { \_ _ -> popStartCode *> alexMonadScan }
  59. . ;
  60. }
  61. -- newline: emit a semicolon when de-denting
  62. <newline> {
  63. \n ;
  64. "--" .* \n ;
  65. () { offsideRule }
  66. }
  67. -- layout: indentation of the next token is context for offside rule
  68. <layout> {
  69. \n ;
  70. "--" .* \n ;
  71. \{ { openBrace }
  72. () { startLayout }
  73. }
  74. <import_> {
  75. \n { just $ pushStartCode newline }
  76. "--" .* \n { just $ pushStartCode newline }
  77. }
  78. <empty_layout> () { emptyLayout }
  79. <pending> () { emitPendingToken }
  80. {
  81. alexEOF :: Alex Token
  82. alexEOF = do
  83. (Posn l c, _, _, _) <- alexGetInput
  84. maybePopImportSC
  85. state <- getUserState
  86. unless (T.null (stringBuffer state)) $ do
  87. alexError $ "Unterminated string literal at line " ++ show l ++ ", column " ++ show c
  88. case layoutColumns state of
  89. -- EOF is allowed to close as many layout contexts as there are
  90. -- pending (number of pending layout contexts is the length of the
  91. -- list minus one, since there's the one initial layout context.)
  92. _:tail -> do
  93. mapUserState $ \s ->
  94. s { pendingTokens = (Token TokLEnd l c <$ tail) ++ [Token TokEof l c]
  95. , layoutColumns = []
  96. }
  97. pushStartCode pending
  98. pure (Token TokLEnd l c)
  99. _ -> pure $ Token TokEof l c
  100. yield k inp i = clearPendingLC *> yield' k inp i
  101. yield' k (Posn l c, _, s, _) i = do
  102. pure (Token (k $! (T.decodeUtf8 (Lbs.toStrict (Lbs.take i s)))) l c)
  103. setPendingLC = mapUserState $ \s -> s { pendingLambdaCase = True }
  104. clearPendingLC = mapUserState $ \s -> s { pendingLambdaCase = False }
  105. always :: TokenClass -> AlexInput -> Int64 -> Alex Token
  106. always k x i = yield (const k) x i
  107. -- reset the string buffer and push the string start code
  108. startString (p, _, _, _) _ = do
  109. mapUserState $ \s -> s { stringBuffer = T.empty, stringStartPosn = Just p }
  110. pushStartCode string
  111. alexMonadScan
  112. -- pop the string start code, and emit the string buffer as a token.
  113. endString (Posn l c, _, _, _) _i = do
  114. state <- getUserState
  115. mapUserState $ \s -> s { stringBuffer = T.empty, stringStartPosn = Nothing }
  116. popStartCode
  117. let (Just (Posn l c)) = stringStartPosn state
  118. pure (Token (TokString (stringBuffer state)) l c)
  119. -- append a /lexed/ region to the string buffer
  120. stringSegment (Posn _ _, _, buf, _) i = do
  121. mapUserState $ \s -> s { stringBuffer = stringBuffer s <> T.decodeUtf8 (Lbs.toStrict (Lbs.take i buf)) }
  122. alexMonadScan
  123. -- append a constant fragment to the string buffer.
  124. stringAppend text _ _ = do
  125. mapUserState $ \s -> s { stringBuffer = stringBuffer s <> text }
  126. alexMonadScan
  127. emitPendingToken :: AlexAction Token
  128. emitPendingToken _ _ = do
  129. t <- getUserState
  130. case pendingTokens t of
  131. [] -> do
  132. popStartCode
  133. alexMonadScan
  134. (x:xs) -> do
  135. mapUserState $ \s -> s { pendingTokens = xs }
  136. pure x
  137. delayToken :: Token -> Alex ()
  138. delayToken t = do
  139. mapUserState $ \s -> s { pendingTokens = t:pendingTokens s }
  140. pushStartCode pending
  141. just :: Alex a -> AlexAction Token
  142. just k _ _ = k *> alexMonadScan
  143. getUserState :: Alex AlexUserState
  144. getUserState = Alex $ \s -> Right (s, alex_ust s)
  145. mapUserState :: (AlexUserState -> AlexUserState) -> Alex ()
  146. mapUserState k = Alex $ \s -> Right (s { alex_ust = k $! alex_ust s }, ())
  147. pushStartCode :: Int -> Alex ()
  148. pushStartCode c = do
  149. sc <- alexGetStartCode
  150. mapUserState $ \s -> s { startCodes = sc:startCodes s }
  151. alexSetStartCode c
  152. popStartCode :: Alex ()
  153. popStartCode = do
  154. sc <- startCodes <$> getUserState
  155. case sc of
  156. [] -> alexSetStartCode 0
  157. (x:xs) -> do
  158. mapUserState $ \s -> s { startCodes = xs }
  159. alexSetStartCode x
  160. offsideRule :: AlexInput -> Int64 -> Alex Token
  161. offsideRule (Posn line col, _, _, _) _ = do
  162. columns <- layoutColumns <$> getUserState
  163. let continue = popStartCode *> alexMonadScan
  164. -- The "offside rule" governs how to insert virtual semicolon and
  165. -- closing '}' tokens. It applies in the "newline" state, and, if we
  166. -- stay in that state, the rule keeps applying. There are a couple of
  167. -- cases:
  168. case columns of
  169. -- If we have no layout context (or we're in a layout context that
  170. -- started with a physical '{'), then the offside rule plain doesn't
  171. -- apply.
  172. [] -> continue
  173. ExplicitLayout:_ -> continue
  174. -- Otherwise, we're dealing with something like
  175. --
  176. -- do token
  177. -- ^ this is the layout column.
  178. col':ctx -> do
  179. case col `compare` layoutCol col' of
  180. -- If we have something like
  181. --
  182. -- do token
  183. -- token
  184. -- ^ this is where we are
  185. -- then we emit a semicolon (and possibly do some bookeeping,
  186. -- like leaving the newline state)
  187. EQ -> do
  188. popStartCode
  189. maybePopImportSC
  190. pure (Token TokSemi line col)
  191. -- If we have something like
  192. --
  193. -- do token
  194. -- token
  195. -- ^ this is where we are
  196. -- then we don't emit anything, just leave the newline state,
  197. -- since this token continues the previous logical line.
  198. GT -> continue
  199. -- If we have something like
  200. --
  201. -- C D E
  202. -- do token
  203. -- do token
  204. -- do token
  205. -- token
  206. -- ^ we are here
  207. -- then we're behind the layout context, but not just one, three!
  208. -- we emit a closing '}' to close context 'E', and STAY in the
  209. -- newline context. when we eventually end up here again
  210. -- (recurring interleaved with the lexer state machine), we
  211. -- close the D and C contexts in the same way.
  212. LT -> do
  213. mapUserState $ \s -> s { layoutColumns = ctx }
  214. pure (Token TokLEnd line col)
  215. -- eventually we either exhaust all the layout contexts or get
  216. -- to a layout context we're EQ or GT compared to. in that case
  217. -- one of the other rules apply.
  218. maybePopImportSC :: Alex ()
  219. maybePopImportSC = do
  220. startcode <- alexGetStartCode
  221. when (startcode == import_) popStartCode
  222. emptyLayout :: AlexInput -> Int64 -> Alex Token
  223. emptyLayout (Posn line col, _, _, _) _ = do
  224. popStartCode
  225. pushStartCode newline
  226. pure (Token TokLEnd line col)
  227. startLayout :: AlexInput -> Int64 -> Alex Token
  228. startLayout (Posn line col, _, _, _) _ = do
  229. state <- getUserState
  230. popStartCode
  231. let
  232. col' =
  233. case layoutColumns state of
  234. [] -> 0
  235. (x:_) -> layoutCol x
  236. layoutKind = case pendingLayoutKw state of
  237. Just s -> s
  238. Nothing -> Layout
  239. -- here's another rule. suppose we have:
  240. --
  241. -- foo = bar where
  242. -- spam = ham
  243. --
  244. -- if we just apply the rule that the next token after a layout
  245. -- keyword determines the column for the layout context, then we're
  246. -- starting another layout context at column 1! that's definitely not
  247. -- what we want.
  248. --
  249. -- so a new layout context only starts if the first token is to the right
  250. -- of the previous layout context. that is: a block only starts if it's
  251. -- on the same line as the layout context, or indented further.
  252. if col <= col'
  253. then pushStartCode empty_layout
  254. else mapUserState $ \s -> s { layoutColumns = layoutKind col:layoutColumns s }
  255. pure (Token TokLStart line col)
  256. popLayoutContext :: Alex ()
  257. popLayoutContext = mapUserState $ \s -> s { layoutColumns = tail (layoutColumns s) }
  258. openBrace :: AlexInput -> Int64 -> Alex Token
  259. openBrace (Posn line col, _, _, _) _ = do
  260. -- if we see a '{' token, we're probably in the layout state. in that
  261. -- case, we pop it! otherwise, we just pop the state anyway: if we
  262. -- were in <0>, then popping gets us back in <0>.
  263. popStartCode
  264. -- we push an ExplicitLayout state so that the offside rule stops
  265. -- applying (logical lines are delimited by physical semicolons) and a
  266. -- '}' can close it.
  267. mapUserState $ \s -> s { layoutColumns = ExplicitLayout:layoutColumns s }
  268. pure (Token TokOBrace line col)
  269. closeBrace :: AlexInput -> Int64 -> Alex Token
  270. closeBrace (Posn line col, _, _, _) _ = do
  271. -- if we're lexing a '}' token (physical) and the rightmost layout
  272. -- context was started by a physical '{', then we can close it.
  273. -- otherwise we do nothing and probably get a parse error!
  274. columns <- layoutColumns <$> getUserState
  275. case columns of
  276. ExplicitLayout:_ -> popLayoutContext
  277. _ -> pure ()
  278. pure (Token TokCBrace line col)
  279. variableOrKeyword :: AlexAction Token
  280. variableOrKeyword (Posn l c, _, s, _) size = finishVarKw l c $ T.decodeUtf8 (Lbs.toStrict (Lbs.take size s))
  281. finishVarKw :: Int -> Int -> T.Text -> Alex Token
  282. finishVarKw l c text
  283. | T.null text = undefined
  284. | Data.Char.isUpper (T.head text), T.singleton '.' `T.isInfixOf` text = pure $
  285. -- if we have a token like A.B.C, we reverse it and span at the
  286. -- first (last) dot, so that we have, e.g.:
  287. --
  288. -- "Aa.Bb.Cc" -> "cC.bB.aA"
  289. -- "Cc.Bb.Aa" -> ("Cc", ".bB.aA")
  290. --
  291. -- what we have then is the suffix and the prefix, but they've both
  292. -- been reversed. so we unreverse them, and also drop the first
  293. -- (last) dot from the prefix.
  294. --
  295. -- if the suffix starts with an uppercase letter, it's a constructor
  296. -- symbol (ConId).
  297. let
  298. txet = T.reverse text
  299. (suffix', prefix') = T.span (/= '.') txet
  300. prefix = T.reverse (T.tail prefix')
  301. suffix = T.reverse suffix'
  302. in if Data.Char.isUpper (T.head suffix)
  303. then Token (TokQual ConId prefix suffix) l c
  304. else Token (TokQual VarId prefix suffix) l c
  305. | Data.Char.isUpper (T.head text) = pure $ Token (TokUnqual ConId text) l c
  306. finishVarKw l c text = do
  307. sc <- alexGetStartCode
  308. state <- getUserState
  309. clearPendingLC
  310. let col = layoutCol (head (layoutColumns state))
  311. case T.unpack text of
  312. -- we handle the contextual 'as'/'qualified' tokens using a
  313. -- startcode.
  314. --
  315. -- in the import_ state, as and qualified are keywords, unless the
  316. -- offside rule would apply to emit a ';' or '}' token. in that
  317. -- case, we emit a semicolon (what the offside rule would do!), and
  318. -- set the "keyword" (now changed to an identifier) as pending, so
  319. -- that it will be emitted by the next alexMonadScan.
  320. "import" -> do
  321. pushStartCode import_
  322. pure (Token TokImport l c)
  323. "as"
  324. | sc == import_, c > col -> pure (Token TokAs l c)
  325. | sc == import_ -> offsideKeyword (TokUnqual VarId text) l c
  326. | otherwise -> pure (Token (TokUnqual VarId text) l c)
  327. "qualified"
  328. | sc == import_, c > col -> pure (Token TokQualified l c)
  329. | sc == import_ -> offsideKeyword (TokUnqual VarId text) l c
  330. | otherwise -> pure (Token (TokUnqual VarId text) l c)
  331. -- when starting a layout context for let expressions we make sure
  332. -- that it is distinguishable from layout contexts started by
  333. -- anything else, because let layout contexts can be terminated
  334. -- ahead of time by the 'in' token. for instance in:
  335. --
  336. -- let x = 1 in x
  337. --
  338. -- there is no reason for the layout context that started after
  339. -- 'let' to be terminated by the 'in' token, since the offside rule
  340. -- hasn't had a chance to apply. the token stream in that case would look like
  341. --
  342. -- 'let' '{' x '=' 1 'in' x
  343. --
  344. -- which is a parse error. we do not implement the rule which says parse errors
  345. -- terminate layout contexts, instead doing this approximation.
  346. "let" -> laidOut' (Just LetLayout) TokLet l c
  347. "in" -> do
  348. laidout <- layoutColumns <$> getUserState
  349. case laidout of
  350. LetLayout _:_ -> earlyEnd TokIn l c
  351. _ -> pure (Token TokIn l c)
  352. "data" -> pure (Token TokData l c)
  353. "where" -> laidOut TokWhere l c
  354. "module" -> pure (Token TokModule l c)
  355. -- when we lex a \ token, a flag is set in the lexer state to
  356. -- indicate that, if there is a 'case' token directly following,
  357. -- that token is to be interpreted as part of a lambda-case
  358. -- construct, and so must start a layout context for its branches.
  359. "case"
  360. | pendingLambdaCase state -> laidOut TokCase l c
  361. | otherwise -> pure (Token TokCase l c)
  362. "of" -> laidOut TokOf l c
  363. (_:_) -> pure (Token (TokUnqual VarId text) l c)
  364. [] -> error "empty keyword/identifier"
  365. earlyEnd :: TokenClass -> Int -> Int -> Alex Token
  366. earlyEnd tok l c = do
  367. popLayoutContext
  368. delayToken (Token tok l c)
  369. pure (Token TokLEnd l c)
  370. offsideKeyword :: TokenClass -> Int -> Int -> Alex Token
  371. offsideKeyword tok l c = do
  372. popStartCode
  373. delayToken (Token tok l c)
  374. pure (Token TokSemi l c)
  375. laidOut' :: Maybe (Int -> LayoutState) -> TokenClass -> Int -> Int -> Alex Token
  376. laidOut' n x l c = do
  377. pushStartCode layout
  378. mapUserState $ \s -> s { leastColumn = c, pendingLayoutKw = n }
  379. pure (Token x l c)
  380. laidOut = laidOut' Nothing
  381. alexMonadScan = do
  382. inp@(_,_,_,n) <- alexGetInput
  383. sc <- alexGetStartCode
  384. case alexScan inp sc of
  385. AlexEOF -> alexEOF
  386. AlexError error@(_,_,inp,_) ->
  387. alexError $ "Unexpected character: " ++ show (T.head (T.decodeUtf8 (Lbs.toStrict inp)))
  388. AlexSkip inp _len -> do
  389. alexSetInput inp
  390. alexMonadScan
  391. AlexToken inp'@(_,_,_,n') _ action -> let len = n'-n in do
  392. alexSetInput inp'
  393. action (ignorePendingBytes inp) len
  394. }