Abbie's Haskell compiler
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

515 lines
17 KiB

  1. {
  2. module Frontend.Autogen.Lexer where
  3. import Control.Monad
  4. import qualified Data.ByteString.Lazy as Lbs
  5. import qualified Data.Text.Encoding as T
  6. import qualified Data.Text as T
  7. import qualified Data.Char
  8. import Data.Int (Int64)
  9. import Frontend.Lexer.Wrapper
  10. import Frontend.Lexer.Tokens
  11. import Frontend.Parser.Posn
  12. }
  13. -- %wrapper "monadUserState-bytestring"
  14. $lower = [a-z]
  15. $upper = [A-Z]
  16. $alpha = [ $lower $upper ]
  17. $digit = [0-9]
  18. $alnum = [ $alpha $digit ]
  19. $white_nol = $white # [\n\t]
  20. $optail = [\! \# \$ \% \& \* \+ \. \/ \< \= \> \? \@ \\ \^ \| \- \~ \:]
  21. $ophead = $optail # \:
  22. @conid = $upper [$alnum \_ \']*
  23. @namespace = (@conid \.)*
  24. tokens :-
  25. $white_nol+ ;
  26. \t { \_ _ -> alexError "tab character in source code" }
  27. <0,import_,foreign_> "--" .* \n
  28. { just $ pushStartCode newline }
  29. <0> \= { always TokEqual }
  30. <0> \` { always TokTick }
  31. <0> \: \: { always TokDoubleColon }
  32. <foreign_> \: \: { \i l -> popStartCode *> always TokDoubleColon i l }
  33. <0> \\ { \i l -> setPendingLC *> yield' (const TokLambda) i l }
  34. <0> "->" { always TokArrow }
  35. <0> "_" { always TokUnder }
  36. <0> \{ { always TokOBrace }
  37. <0> \[ { always TokOSquare }
  38. <0,import_,foreign_> {
  39. \, { always TokComma }
  40. \( { always TokOParen }
  41. \) { always TokCParen }
  42. }
  43. <0> \} { closeBrace }
  44. <0> \] { always TokCSquare }
  45. <0> \;+ { always TokSemi }
  46. <0,import_,foreign_> \n { just $ pushStartCode newline }
  47. <0,foreign_> \" { startString }
  48. <string> {
  49. \\ \" { stringAppend (T.singleton '"') }
  50. \\ \\ { stringAppend (T.singleton '\\') }
  51. \\ a { stringAppend (T.singleton '\a') }
  52. \\ b { stringAppend (T.singleton '\b') }
  53. \\ f { stringAppend (T.singleton '\f') }
  54. \\ n { stringAppend (T.singleton '\n') }
  55. \\ \n { stringAppend (T.singleton '\n') }
  56. \\ r { stringAppend (T.singleton '\r') }
  57. \\ v { stringAppend (T.singleton '\v') }
  58. \\ t { stringAppend (T.singleton '\t') }
  59. \" { endString }
  60. [^\\\"]+ { stringSegment }
  61. }
  62. <0,newline,comment,import_>
  63. "{-" { just $ pushStartCode comment }
  64. <comment> {
  65. "-}" { \_ _ -> popStartCode *> alexMonadScan }
  66. . ;
  67. }
  68. -- newline: emit a semicolon when de-denting
  69. <newline> {
  70. \n ;
  71. "--" .* \n ;
  72. () { offsideRule }
  73. }
  74. -- layout: indentation of the next token is context for offside rule
  75. <layout> {
  76. \n ;
  77. "--" .* \n ;
  78. \{ { openBrace }
  79. () { startLayout }
  80. }
  81. <empty_layout> () { emptyLayout }
  82. <pending> () { emitPendingToken }
  83. -- identifiers and keywords
  84. <0,import_,foreign_> {
  85. $lower [$alpha $digit \_ \']* { variableOrKeyword }
  86. $upper [$alpha $digit \_ \']* { yield (TokUnqual ConId) }
  87. $ophead $optail* { yield (TokUnqualOp VarId) }
  88. : $optail* { yield (TokUnqualOp ConId) }
  89. @namespace $lower [$alpha $digit \_ \']* { qualifiedVariable }
  90. @namespace $upper [$alpha $digit \_ \']* { qualifiedVariable }
  91. @namespace $ophead $optail* { qualifiedOperator }
  92. @namespace : $optail* { qualifiedOperator }
  93. }
  94. {
  95. alexEOF :: Alex Token
  96. alexEOF = do
  97. (Posn l c, _, _, _) <- alexGetInput
  98. maybePopImportSC
  99. state <- getUserState
  100. unless (T.null (stringBuffer state)) $ do
  101. alexError $ "Unterminated string literal at line " ++ show l ++ ", column " ++ show c
  102. case layoutColumns state of
  103. -- EOF is allowed to close as many layout contexts as there are
  104. -- pending (number of pending layout contexts is the length of the
  105. -- list minus one, since there's the one initial layout context.)
  106. _:tail -> do
  107. mapUserState $ \s ->
  108. s { pendingTokens = (Token TokLEnd l c <$ tail) ++ [Token TokEof l c]
  109. , layoutColumns = []
  110. }
  111. pushStartCode pending
  112. pure (Token TokLEnd l c)
  113. _ -> pure $ Token TokEof l c
  114. yield k inp i = clearPendingLC *> yield' k inp i
  115. yield' k (Posn l c, _, s, _) i = do
  116. pure (Token (k $! (T.decodeUtf8 (Lbs.toStrict (Lbs.take i s)))) l c)
  117. setPendingLC = mapUserState $ \s -> s { pendingLambdaCase = True }
  118. clearPendingLC = mapUserState $ \s -> s { pendingLambdaCase = False }
  119. always :: TokenClass -> AlexInput -> Int64 -> Alex Token
  120. always k x i = yield (const k) x i
  121. -- reset the string buffer and push the string start code
  122. startString (p, _, _, _) _ = do
  123. mapUserState $ \s -> s { stringBuffer = T.empty, stringStartPosn = Just p }
  124. pushStartCode string
  125. alexMonadScan
  126. -- pop the string start code, and emit the string buffer as a token.
  127. endString (Posn l c, _, _, _) _i = do
  128. state <- getUserState
  129. mapUserState $ \s -> s { stringBuffer = T.empty, stringStartPosn = Nothing }
  130. popStartCode
  131. let (Just (Posn l c)) = stringStartPosn state
  132. pure (Token (TokString (stringBuffer state)) l c)
  133. -- append a /lexed/ region to the string buffer
  134. stringSegment (Posn _ _, _, buf, _) i = do
  135. mapUserState $ \s -> s { stringBuffer = stringBuffer s <> T.decodeUtf8 (Lbs.toStrict (Lbs.take i buf)) }
  136. alexMonadScan
  137. -- append a constant fragment to the string buffer.
  138. stringAppend text _ _ = do
  139. mapUserState $ \s -> s { stringBuffer = stringBuffer s <> text }
  140. alexMonadScan
  141. emitPendingToken :: AlexAction Token
  142. emitPendingToken _ _ = do
  143. t <- getUserState
  144. case pendingTokens t of
  145. [] -> do
  146. popStartCode
  147. alexMonadScan
  148. (x:xs) -> do
  149. mapUserState $ \s -> s { pendingTokens = xs }
  150. pure x
  151. delayToken :: Token -> Alex ()
  152. delayToken t = do
  153. mapUserState $ \s -> s { pendingTokens = t:pendingTokens s }
  154. pushStartCode pending
  155. just :: Alex a -> AlexAction Token
  156. just k _ _ = k *> alexMonadScan
  157. getUserState :: Alex AlexUserState
  158. getUserState = Alex $ \s -> Right (s, alex_ust s)
  159. mapUserState :: (AlexUserState -> AlexUserState) -> Alex ()
  160. mapUserState k = Alex $ \s -> Right (s { alex_ust = k $! alex_ust s }, ())
  161. pushStartCode :: Int -> Alex ()
  162. pushStartCode c = do
  163. sc <- alexGetStartCode
  164. mapUserState $ \s -> s { startCodes = sc:startCodes s }
  165. alexSetStartCode c
  166. popStartCode :: Alex ()
  167. popStartCode = do
  168. sc <- startCodes <$> getUserState
  169. case sc of
  170. [] -> alexSetStartCode 0
  171. (x:xs) -> do
  172. mapUserState $ \s -> s { startCodes = xs }
  173. alexSetStartCode x
  174. offsideRule :: AlexInput -> Int64 -> Alex Token
  175. offsideRule (Posn line col, _, _, _) _ = do
  176. columns <- layoutColumns <$> getUserState
  177. let continue = popStartCode *> alexMonadScan
  178. -- The "offside rule" governs how to insert virtual semicolon and
  179. -- closing '}' tokens. It applies in the "newline" state, and, if we
  180. -- stay in that state, the rule keeps applying. There are a couple of
  181. -- cases:
  182. case columns of
  183. -- If we have no layout context (or we're in a layout context that
  184. -- started with a physical '{'), then the offside rule plain doesn't
  185. -- apply.
  186. [] -> continue
  187. ExplicitLayout:_ -> continue
  188. -- Otherwise, we're dealing with something like
  189. --
  190. -- do token
  191. -- ^ this is the layout column.
  192. col':ctx -> do
  193. case col `compare` layoutCol col' of
  194. -- If we have something like
  195. --
  196. -- do token
  197. -- token
  198. -- ^ this is where we are
  199. -- then we emit a semicolon (and possibly do some bookeeping,
  200. -- like leaving the newline state)
  201. EQ -> do
  202. popStartCode
  203. maybePopImportSC
  204. pure (Token TokSemi line col)
  205. -- If we have something like
  206. --
  207. -- do token
  208. -- token
  209. -- ^ this is where we are
  210. -- then we don't emit anything, just leave the newline state,
  211. -- since this token continues the previous logical line.
  212. GT -> continue
  213. -- If we have something like
  214. --
  215. -- C D E
  216. -- do token
  217. -- do token
  218. -- do token
  219. -- token
  220. -- ^ we are here
  221. -- then we're behind the layout context, but not just one, three!
  222. -- we emit a closing '}' to close context 'E', and STAY in the
  223. -- newline context. when we eventually end up here again
  224. -- (recurring interleaved with the lexer state machine), we
  225. -- close the D and C contexts in the same way.
  226. LT -> do
  227. mapUserState $ \s -> s { layoutColumns = ctx }
  228. pure (Token TokLEnd line col)
  229. -- eventually we either exhaust all the layout contexts or get
  230. -- to a layout context we're EQ or GT compared to. in that case
  231. -- one of the other rules apply.
  232. maybePopImportSC :: Alex ()
  233. maybePopImportSC = do
  234. startcode <- alexGetStartCode
  235. when (startcode == import_ || startcode == foreign_) popStartCode
  236. emptyLayout :: AlexInput -> Int64 -> Alex Token
  237. emptyLayout (Posn line col, _, _, _) _ = do
  238. popStartCode
  239. pushStartCode newline
  240. pure (Token TokLEnd line col)
  241. startLayout :: AlexInput -> Int64 -> Alex Token
  242. startLayout (Posn line col, _, _, _) _ = do
  243. state <- getUserState
  244. popStartCode
  245. let
  246. col' =
  247. case layoutColumns state of
  248. [] -> 0
  249. (x:_) -> layoutCol x
  250. layoutKind = case pendingLayoutKw state of
  251. Just s -> s
  252. Nothing -> Layout
  253. -- here's another rule. suppose we have:
  254. --
  255. -- foo = bar where
  256. -- spam = ham
  257. --
  258. -- if we just apply the rule that the next token after a layout
  259. -- keyword determines the column for the layout context, then we're
  260. -- starting another layout context at column 1! that's definitely not
  261. -- what we want.
  262. --
  263. -- so a new layout context only starts if the first token is to the right
  264. -- of the previous layout context. that is: a block only starts if it's
  265. -- on the same line as the layout context, or indented further.
  266. if col <= col'
  267. then pushStartCode empty_layout
  268. else mapUserState $ \s -> s { layoutColumns = layoutKind col:layoutColumns s }
  269. pure (Token TokLStart line col)
  270. popLayoutContext :: Alex ()
  271. popLayoutContext = mapUserState $ \s -> s { layoutColumns = tail (layoutColumns s) }
  272. openBrace :: AlexInput -> Int64 -> Alex Token
  273. openBrace (Posn line col, _, _, _) _ = do
  274. -- if we see a '{' token, we're probably in the layout state. in that
  275. -- case, we pop it! otherwise, we just pop the state anyway: if we
  276. -- were in <0>, then popping gets us back in <0>.
  277. popStartCode
  278. -- we push an ExplicitLayout state so that the offside rule stops
  279. -- applying (logical lines are delimited by physical semicolons) and a
  280. -- '}' can close it.
  281. mapUserState $ \s -> s { layoutColumns = ExplicitLayout:layoutColumns s }
  282. pure (Token TokOBrace line col)
  283. closeBrace :: AlexInput -> Int64 -> Alex Token
  284. closeBrace (Posn line col, _, _, _) _ = do
  285. -- if we're lexing a '}' token (physical) and the rightmost layout
  286. -- context was started by a physical '{', then we can close it.
  287. -- otherwise we do nothing and probably get a parse error!
  288. columns <- layoutColumns <$> getUserState
  289. case columns of
  290. ExplicitLayout:_ -> popLayoutContext
  291. _ -> pure ()
  292. pure (Token TokCBrace line col)
  293. variableOrKeyword :: AlexAction Token
  294. variableOrKeyword (Posn l c, _, s, _) size =
  295. finishVarKw l c $ T.decodeUtf8 (Lbs.toStrict (Lbs.take size s))
  296. qualifiedVariable :: AlexAction Token
  297. qualifiedVariable (Posn l c, _, s, _) size =
  298. finishVar TokUnqual TokQual l c $ T.decodeUtf8 (Lbs.toStrict (Lbs.take size s))
  299. qualifiedOperator :: AlexAction Token
  300. qualifiedOperator (Posn l c, _, s, _) size =
  301. finishVar TokUnqualOp TokQualOp l c $ T.decodeUtf8 (Lbs.toStrict (Lbs.take size s))
  302. finishVarKw :: Int -> Int -> T.Text -> Alex Token
  303. finishVarKw l c text
  304. | T.null text = undefined
  305. | Data.Char.isUpper (T.head text) = finishVar TokUnqual TokQual l c text
  306. finishVarKw l c text = do
  307. sc <- alexGetStartCode
  308. state <- getUserState
  309. clearPendingLC
  310. let col = layoutCol (head (layoutColumns state))
  311. case T.unpack text of
  312. -- we handle the contextual 'as'/'qualified' tokens using a
  313. -- startcode.
  314. --
  315. -- in the import_ state, as and qualified are keywords, unless the
  316. -- offside rule would apply to emit a ';' or '}' token. in that
  317. -- case, we emit a semicolon (what the offside rule would do!), and
  318. -- set the "keyword" (now changed to an identifier) as pending, so
  319. -- that it will be emitted by the next alexMonadScan.
  320. "import" -> do
  321. when ((sc /= import_) && (sc /= foreign_)) $ pushStartCode import_
  322. pure (Token TokImport l c)
  323. "as" -> conditionalKeyword l c import_ (c > col) TokAs
  324. "qualified" -> conditionalKeyword l c import_ (c > col) TokQualified
  325. "foreign" -> do
  326. when (sc /= foreign_) $ pushStartCode foreign_
  327. pure (Token TokForeign l c)
  328. "export" -> conditionalKeyword l c foreign_ (c > col) TokExport
  329. "safe" -> conditionalKeyword l c foreign_ (c > col) TokSafe
  330. "unsafe" -> conditionalKeyword l c foreign_ (c > col) TokUnsafe
  331. "ccall" -> conditionalKeyword l c foreign_ (c > col) TokCCall
  332. -- when starting a layout context for let expressions we make sure
  333. -- that it is distinguishable from layout contexts started by
  334. -- anything else, because let layout contexts can be terminated
  335. -- ahead of time by the 'in' token. for instance in:
  336. --
  337. -- let x = 1 in x
  338. --
  339. -- there is no reason for the layout context that started after
  340. -- 'let' to be terminated by the 'in' token, since the offside rule
  341. -- hasn't had a chance to apply. the token stream in that case would look like
  342. --
  343. -- 'let' '{' x '=' 1 'in' x
  344. --
  345. -- which is a parse error. we do not implement the rule which says parse errors
  346. -- terminate layout contexts, instead doing this approximation.
  347. "let" -> laidOut' (Just LetLayout) TokLet l c
  348. "in" -> do
  349. laidout <- layoutColumns <$> getUserState
  350. case laidout of
  351. LetLayout _:_ -> earlyEnd TokIn l c
  352. _ -> pure (Token TokIn l c)
  353. "data" -> pure (Token TokData l c)
  354. "where" -> laidOut TokWhere l c
  355. "module" -> pure (Token TokModule l c)
  356. -- when we lex a \ token, a flag is set in the lexer state to
  357. -- indicate that, if there is a 'case' token directly following,
  358. -- that token is to be interpreted as part of a lambda-case
  359. -- construct, and so must start a layout context for its branches.
  360. "case"
  361. | pendingLambdaCase state -> laidOut TokCase l c
  362. | otherwise -> pure (Token TokCase l c)
  363. "of" -> laidOut TokOf l c
  364. (_:_) -> pure (Token (TokUnqual VarId text) l c)
  365. [] -> error "empty keyword/identifier"
  366. conditionalKeyword l c import_ cond kw = do
  367. sc <- alexGetStartCode
  368. case () of
  369. () | sc == import_, cond -> pure (Token kw l c)
  370. | sc == import_ -> offsideKeyword (TokUnqual VarId text) l c
  371. | otherwise -> pure (Token (TokUnqual VarId text) l c)
  372. where text = T.pack (show kw)
  373. finishVar :: (IdClass -> T.Text -> TokenClass) -> (IdClass -> T.Text -> T.Text -> TokenClass) -> Int -> Int -> T.Text -> Alex Token
  374. finishVar tokunqual tokqual l c text
  375. | T.null text = undefined
  376. | Data.Char.isUpper (T.head text), T.singleton '.' `T.isInfixOf` text = pure $
  377. let
  378. txet = T.reverse text
  379. (suffix', prefix') = T.span (/= '.') txet
  380. prefix = T.reverse (T.tail prefix')
  381. suffix = T.reverse suffix'
  382. in if Data.Char.isUpper (T.head suffix)
  383. then Token (tokqual ConId prefix suffix) l c
  384. else Token (tokqual VarId prefix suffix) l c
  385. | Data.Char.isUpper (T.head text) = pure $ Token (tokunqual ConId text) l c
  386. | otherwise = pure $ Token (tokunqual VarId text) l c
  387. earlyEnd :: TokenClass -> Int -> Int -> Alex Token
  388. earlyEnd tok l c = do
  389. popLayoutContext
  390. delayToken (Token tok l c)
  391. pure (Token TokLEnd l c)
  392. offsideKeyword :: TokenClass -> Int -> Int -> Alex Token
  393. offsideKeyword tok l c = do
  394. popStartCode
  395. delayToken (Token tok l c)
  396. pure (Token TokSemi l c)
  397. laidOut' :: Maybe (Int -> LayoutState) -> TokenClass -> Int -> Int -> Alex Token
  398. laidOut' n x l c = do
  399. pushStartCode layout
  400. mapUserState $ \s -> s { leastColumn = c, pendingLayoutKw = n }
  401. pure (Token x l c)
  402. laidOut = laidOut' Nothing
  403. alexMonadScan = do
  404. inp@(_,_,_,n) <- alexGetInput
  405. sc <- alexGetStartCode
  406. case alexScan inp sc of
  407. AlexEOF -> alexEOF
  408. AlexError error@(_,_,inp,_) ->
  409. alexError $ "Unexpected character: " ++ show (T.head (T.decodeUtf8 (Lbs.toStrict inp)))
  410. AlexSkip inp _len -> do
  411. alexSetInput inp
  412. alexMonadScan
  413. AlexToken inp'@(_,_,_,n') _ action -> let len = n'-n in do
  414. alexSetInput inp'
  415. action (ignorePendingBytes inp) len
  416. }