From 27186d92b3dc02f91acae5be928312a3faec2011 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Abigail=20Magalh=C3=A3es?= Date: Tue, 3 Aug 2021 13:56:16 -0300 Subject: [PATCH] Refactor parser/lexer error message infra --- .gitignore | 4 +- ahc.cabal | 1 + hie.yaml | 19 +- src/Frontend/Autogen/Lexer.x | 328 ++++++++++++++++++++++------------ src/Frontend/Autogen/Parser.y | 61 +++++-- src/Frontend/Lexer/Wrapper.hs | 181 +++++++++++++++++++ src/Frontend/Syntax.hs | 41 ++++- src/Main.hs | 82 +++++++-- 8 files changed, 558 insertions(+), 159 deletions(-) create mode 100644 src/Frontend/Lexer/Wrapper.hs diff --git a/.gitignore b/.gitignore index a83453a..278e615 100644 --- a/.gitignore +++ b/.gitignore @@ -14,5 +14,5 @@ !/Setup.hs # alex/happy artefacts -src/*.o -src/*.hi \ No newline at end of file +src/**/*.o +src/**/*.hi \ No newline at end of file diff --git a/ahc.cabal b/ahc.cabal index 67f901b..8cfeb83 100644 --- a/ahc.cabal +++ b/ahc.cabal @@ -28,6 +28,7 @@ executable ahc Frontend.Autogen.Lexer, Frontend.Autogen.Parser, Frontend.Lexer.Tokens, + Frontend.Lexer.Wrapper, Frontend.Parser.Posn, Frontend.Syntax diff --git a/hie.yaml b/hie.yaml index 1961153..c9aa9e5 100644 --- a/hie.yaml +++ b/hie.yaml @@ -1,10 +1,25 @@ cradle: multi: - - path: "./src/Frontend/Autogen" + # disable HLS for the parser/lexer + - path: "./src/Frontend/Autogen/" config: cradle: none: + + # enable it for the ahc source + - path: "./src/" + config: + cradle: + stack: + + # disable it for my random testing files + # (and also Setup.hs!) - path: "./" config: cradle: - stack: \ No newline at end of file + none: + + - path: "./.stack-work/" + config: + cradle: + none: \ No newline at end of file diff --git a/src/Frontend/Autogen/Lexer.x b/src/Frontend/Autogen/Lexer.x index e255a0d..392b694 100644 --- a/src/Frontend/Autogen/Lexer.x +++ b/src/Frontend/Autogen/Lexer.x @@ -1,30 +1,33 @@ { module Frontend.Autogen.Lexer where +import Control.Monad + import qualified Data.ByteString.Lazy as Lbs import qualified Data.Text.Encoding as T import qualified Data.Text as T +import qualified Data.Char +import Data.Int (Int64) +import Frontend.Lexer.Wrapper import Frontend.Lexer.Tokens - -import Control.Monad - -import Debug.Trace +import Frontend.Parser.Posn } -%wrapper "monadUserState-bytestring" +-- %wrapper "monadUserState-bytestring" $alpha = [a-zA-Z] $digit = [0-9] -$white_nol = $white # \n +$white_nol = $white # [\n\t] tokens :- $white_nol+ ; + \t { \_ _ -> alexError "tab character in source code" } -<0,module_header> "--" .* \n +<0,import_> "--" .* \n { just $ pushStartCode newline } -<0,module_header,import_> +<0,import_> $alpha [$alpha $digit \_ \' \.]* { variableOrKeyword } <0> \= { always TokEqual } @@ -37,7 +40,7 @@ tokens :- <0> \{ { always TokOBrace } <0> \[ { always TokOSquare } -<0,module_header,import_> { +<0,import_> { \, { always TokComma } \( { always TokOParen } \) { always TokCParen } @@ -48,33 +51,33 @@ tokens :- <0> \;+ { always TokSemi } -<0> \n { just $ pushStartCode newline } +<0,import_> \n { just $ pushStartCode newline } -<0> \" { just startString } +<0> \" { startString } { - \\ \" { stringSeg (T.singleton '"') } - \\ \\ { stringSeg (T.singleton '\\') } - - \\ a { stringSeg (T.singleton '\a') } - \\ b { stringSeg (T.singleton '\b') } - \\ f { stringSeg (T.singleton '\f') } - \\ n { stringSeg (T.singleton '\n') } - \\ \n { stringSeg (T.singleton '\n') } - \\ r { stringSeg (T.singleton '\r') } - \\ v { stringSeg (T.singleton '\v') } - \\ t { stringSeg (T.singleton '\t') } + \\ \" { stringAppend (T.singleton '"') } + \\ \\ { stringAppend (T.singleton '\\') } + + \\ a { stringAppend (T.singleton '\a') } + \\ b { stringAppend (T.singleton '\b') } + \\ f { stringAppend (T.singleton '\f') } + \\ n { stringAppend (T.singleton '\n') } + \\ \n { stringAppend (T.singleton '\n') } + \\ r { stringAppend (T.singleton '\r') } + \\ v { stringAppend (T.singleton '\v') } + \\ t { stringAppend (T.singleton '\t') } \" { endString } - [^\\\"]+ { stringChar } + [^\\\"]+ { stringSegment } } -<0,newline,comment,import_,module_header> +<0,newline,comment,import_> "{-" { just $ pushStartCode comment } { - "-}" { \i l -> popStartCode *> skip i l } + "-}" { \_ _ -> popStartCode *> alexMonadScan } . ; } @@ -104,14 +107,10 @@ tokens :- () { emitPendingToken } - { - \n ; -} - { alexEOF :: Alex Token alexEOF = do - (AlexPn _ l c, _, _, _) <- alexGetInput + (Posn l c, _, _, _) <- alexGetInput maybePopImportSC @@ -135,7 +134,7 @@ alexEOF = do yield k inp i = clearPendingLC *> yield' k inp i -yield' k (AlexPn _ l c, _, s, _) i = do +yield' k (Posn l c, _, s, _) i = do pure (Token (k $! (T.decodeUtf8 (Lbs.toStrict (Lbs.take i s)))) l c) setPendingLC = mapUserState $ \s -> s { pendingLambdaCase = True } @@ -144,45 +143,32 @@ clearPendingLC = mapUserState $ \s -> s { pendingLambdaCase = False } always :: TokenClass -> AlexInput -> Int64 -> Alex Token always k x i = yield (const k) x i -startString = do - mapUserState $ \s -> s { stringBuffer = T.empty } +-- reset the string buffer and push the string start code +startString (p, _, _, _) _ = do + mapUserState $ \s -> s { stringBuffer = T.empty, stringStartPosn = Just p } pushStartCode string + alexMonadScan -endString (AlexPn _ l c, _, _, _) _i = do - text <- stringBuffer <$> getUserState - mapUserState $ \s -> s { stringBuffer = T.empty } +-- pop the string start code, and emit the string buffer as a token. +endString (Posn l c, _, _, _) _i = do + state <- getUserState + + mapUserState $ \s -> s { stringBuffer = T.empty, stringStartPosn = Nothing } popStartCode - pure (Token (TokString text) l c) -stringChar input@(AlexPn _ _ _, _, buf, _) i = do + let (Just (Posn l c)) = stringStartPosn state + pure (Token (TokString (stringBuffer state)) l c) + +-- append a /lexed/ region to the string buffer +stringSegment (Posn _ _, _, buf, _) i = do mapUserState $ \s -> s { stringBuffer = stringBuffer s <> T.decodeUtf8 (Lbs.toStrict (Lbs.take i buf)) } alexMonadScan -stringSeg text _ _ = do +-- append a constant fragment to the string buffer. +stringAppend text _ _ = do mapUserState $ \s -> s { stringBuffer = stringBuffer s <> text } alexMonadScan -data LayoutState - = LetLayout { layoutCol :: Int } - | Layout { layoutCol :: Int } - | ModLayout { layoutCol :: Int } - deriving (Show) - -data AlexUserState = - AlexUserState { layoutColumns :: ![LayoutState] - , startCodes :: ![Int] - , leastColumn :: !Int - - , pendingLayoutKw :: Maybe (Int -> LayoutState) - , pendingTokens :: ![Token] - , pendingLambdaCase :: !Bool - , haveModuleHeader :: !Bool - - , stringBuffer :: !T.Text - } - -alexInitUserState = AlexUserState [] [] 0 Nothing [] False False T.empty - emitPendingToken :: AlexAction Token emitPendingToken _ _ = do t <- getUserState @@ -224,19 +210,68 @@ popStartCode = do alexSetStartCode x offsideRule :: AlexInput -> Int64 -> Alex Token -offsideRule (AlexPn _ line col, _, s, _) _ = do - ~(col':ctx) <- layoutColumns <$> getUserState - case col `compare` layoutCol col' of - EQ -> do - popStartCode - maybePopImportSC - pure (Token TokSemi line col) - GT -> do - popStartCode - alexMonadScan - LT -> do - mapUserState $ \s -> s { layoutColumns = ctx } - pure (Token TokLEnd line col) +offsideRule (Posn line col, _, _, _) _ = do + columns <- layoutColumns <$> getUserState + + let continue = popStartCode *> alexMonadScan + + -- The "offside rule" governs how to insert virtual semicolon and + -- closing '}' tokens. It applies in the "newline" state, and, if we + -- stay in that state, the rule keeps applying. There are a couple of + -- cases: + case columns of + -- If we have no layout context (or we're in a layout context that + -- started with a physical '{'), then the offside rule plain doesn't + -- apply. + [] -> continue + ExplicitLayout:_ -> continue + -- Otherwise, we're dealing with something like + -- + -- do token + -- ^ this is the layout column. + col':ctx -> do + case col `compare` layoutCol col' of + -- If we have something like + -- + -- do token + -- token + -- ^ this is where we are + -- then we emit a semicolon (and possibly do some bookeeping, + -- like leaving the newline state) + EQ -> do + popStartCode + maybePopImportSC + pure (Token TokSemi line col) + + -- If we have something like + -- + -- do token + -- token + -- ^ this is where we are + -- then we don't emit anything, just leave the newline state, + -- since this token continues the previous logical line. + GT -> continue + + -- If we have something like + -- + -- C D E + -- do token + -- do token + -- do token + -- token + -- ^ we are here + -- then we're behind the layout context, but not just one, three! + -- we emit a closing '}' to close context 'E', and STAY in the + -- newline context. when we eventually end up here again + -- (recurring interleaved with the lexer state machine), we + -- close the D and C contexts in the same way. + LT -> do + mapUserState $ \s -> s { layoutColumns = ctx } + pure (Token TokLEnd line col) + + -- eventually we either exhaust all the layout contexts or get + -- to a layout context we're EQ or GT compared to. in that case + -- one of the other rules apply. maybePopImportSC :: Alex () maybePopImportSC = do @@ -244,13 +279,13 @@ maybePopImportSC = do when (startcode == import_) popStartCode emptyLayout :: AlexInput -> Int64 -> Alex Token -emptyLayout (AlexPn _ line col, _, _, _) _ = do +emptyLayout (Posn line col, _, _, _) _ = do popStartCode pushStartCode newline pure (Token TokLEnd line col) startLayout :: AlexInput -> Int64 -> Alex Token -startLayout (AlexPn _ line col, _, _, _) _ = do +startLayout (Posn line col, _, _, _) _ = do state <- getUserState popStartCode let @@ -263,42 +298,70 @@ startLayout (AlexPn _ line col, _, _, _) _ = do Just s -> s Nothing -> Layout - if col < col' + -- here's another rule. suppose we have: + -- + -- foo = bar where + -- spam = ham + -- + -- if we just apply the rule that the next token after a layout + -- keyword determines the column for the layout context, then we're + -- starting another layout context at column 1! that's definitely not + -- what we want. + -- + -- so a new layout context only starts if the first token is to the right + -- of the previous layout context. that is: a block only starts if it's + -- on the same line as the layout context, or indented further. + if col <= col' then pushStartCode empty_layout else mapUserState $ \s -> s { layoutColumns = layoutKind col:layoutColumns s } pure (Token TokLStart line col) -getLayout :: Alex LayoutState -getLayout = do - t <- getUserState - case layoutColumns t of - (x:_) -> pure x - _ -> error "No layout?" +popLayoutContext :: Alex () +popLayoutContext = mapUserState $ \s -> s { layoutColumns = tail (layoutColumns s) } openBrace :: AlexInput -> Int64 -> Alex Token -openBrace (AlexPn _ line col, _, _, _) _ = do +openBrace (Posn line col, _, _, _) _ = do + -- if we see a '{' token, we're probably in the layout state. in that + -- case, we pop it! otherwise, we just pop the state anyway: if we + -- were in <0>, then popping gets us back in <0>. popStartCode - mapUserState $ \s -> s { layoutColumns = Layout minBound:layoutColumns s } - pure (Token TokOBrace line col) -popLayoutContext :: Alex () -popLayoutContext = mapUserState $ \s -> s { layoutColumns = tail (layoutColumns s) } + -- we push an ExplicitLayout state so that the offside rule stops + -- applying (logical lines are delimited by physical semicolons) and a + -- '}' can close it. + mapUserState $ \s -> s { layoutColumns = ExplicitLayout:layoutColumns s } + pure (Token TokOBrace line col) closeBrace :: AlexInput -> Int64 -> Alex Token -closeBrace (AlexPn _ line col, _, _, _) _ = do - ~(col':_) <- layoutColumns <$> getUserState - if layoutCol col' < 0 - then popLayoutContext - else pure () +closeBrace (Posn line col, _, _, _) _ = do + -- if we're lexing a '}' token (physical) and the rightmost layout + -- context was started by a physical '{', then we can close it. + -- otherwise we do nothing and probably get a parse error! + columns <- layoutColumns <$> getUserState + case columns of + ExplicitLayout:_ -> popLayoutContext + _ -> pure () pure (Token TokCBrace line col) variableOrKeyword :: AlexAction Token -variableOrKeyword (AlexPn _ l c, _, s, _) size = finishVarKw l c $ T.decodeUtf8 (Lbs.toStrict (Lbs.take size s)) +variableOrKeyword (Posn l c, _, s, _) size = finishVarKw l c $ T.decodeUtf8 (Lbs.toStrict (Lbs.take size s)) finishVarKw :: Int -> Int -> T.Text -> Alex Token finishVarKw l c text | T.null text = undefined | Data.Char.isUpper (T.head text), T.singleton '.' `T.isInfixOf` text = pure $ + -- if we have a token like A.B.C, we reverse it and span at the + -- first (last) dot, so that we have, e.g.: + -- + -- "Aa.Bb.Cc" -> "cC.bB.aA" + -- "Cc.Bb.Aa" -> ("Cc", ".bB.aA") + -- + -- what we have then is the suffix and the prefix, but they've both + -- been reversed. so we unreverse them, and also drop the first + -- (last) dot from the prefix. + -- + -- if the suffix starts with an uppercase letter, it's a constructor + -- symbol (ConId). let txet = T.reverse text (suffix', prefix') = T.span (/= '.') txet @@ -319,6 +382,18 @@ finishVarKw l c text = do let col = layoutCol (head (layoutColumns state)) case T.unpack text of + -- we handle the contextual 'as'/'qualified' tokens using a + -- startcode. + -- + -- in the import_ state, as and qualified are keywords, unless the + -- offside rule would apply to emit a ';' or '}' token. in that + -- case, we emit a semicolon (what the offside rule would do!), and + -- set the "keyword" (now changed to an identifier) as pending, so + -- that it will be emitted by the next alexMonadScan. + "import" -> do + pushStartCode import_ + pure (Token TokImport l c) + "as" | sc == import_, c > col -> pure (Token TokAs l c) | sc == import_ -> offsideKeyword (TokUnqual VarId text) l c @@ -329,58 +404,79 @@ finishVarKw l c text = do | sc == import_ -> offsideKeyword (TokUnqual VarId text) l c | otherwise -> pure (Token (TokUnqual VarId text) l c) + -- when starting a layout context for let expressions we make sure + -- that it is distinguishable from layout contexts started by + -- anything else, because let layout contexts can be terminated + -- ahead of time by the 'in' token. for instance in: + -- + -- let x = 1 in x + -- + -- there is no reason for the layout context that started after + -- 'let' to be terminated by the 'in' token, since the offside rule + -- hasn't had a chance to apply. the token stream in that case would look like + -- + -- 'let' '{' x '=' 1 'in' x + -- + -- which is a parse error. we do not implement the rule which says parse errors + -- terminate layout contexts, instead doing this approximation. "let" -> laidOut' (Just LetLayout) TokLet l c "in" -> do - laidout <- getLayout + laidout <- layoutColumns <$> getUserState case laidout of - -- let .. in critical pair: - -- 'in' is allowed to close a layout context before the offside rule would apply. - LetLayout _ -> earlyEnd TokIn l c + LetLayout _:_ -> earlyEnd TokIn l c _ -> pure (Token TokIn l c) "data" -> pure (Token TokData l c) - "where" -> do - -- if this is the where in the module_header, then - -- pop the start code so that the offside rule applies again - when (sc == module_header) popStartCode - laidOut' (if sc == module_header then Just ModLayout else Nothing) TokWhere l c + "where" -> laidOut TokWhere l c + "module" -> pure (Token TokModule l c) + -- when we lex a \ token, a flag is set in the lexer state to + -- indicate that, if there is a 'case' token directly following, + -- that token is to be interpreted as part of a lambda-case + -- construct, and so must start a layout context for its branches. "case" - -- "case" is a layout token if it's immediately following a \\ | pendingLambdaCase state -> laidOut TokCase l c | otherwise -> pure (Token TokCase l c) - "import" -> do - pushStartCode import_ - pure (Token TokImport l c) - "of" -> laidOut TokOf l c - "module" -> do - unless (haveModuleHeader state) $ do - mapUserState $ \s -> s { haveModuleHeader = True } - pushStartCode module_header - pure (Token TokModule l c) - - (x:_) -> pure (Token (TokUnqual VarId text) l c) + (_:_) -> pure (Token (TokUnqual VarId text) l c) [] -> error "empty keyword/identifier" +earlyEnd :: TokenClass -> Int -> Int -> Alex Token earlyEnd tok l c = do popLayoutContext delayToken (Token tok l c) pure (Token TokLEnd l c) +offsideKeyword :: TokenClass -> Int -> Int -> Alex Token offsideKeyword tok l c = do - popLayoutContext + popStartCode delayToken (Token tok l c) pure (Token TokSemi l c) +laidOut' :: Maybe (Int -> LayoutState) -> TokenClass -> Int -> Int -> Alex Token laidOut' n x l c = do pushStartCode layout mapUserState $ \s -> s { leastColumn = c, pendingLayoutKw = n } pure (Token x l c) laidOut = laidOut' Nothing -} + +alexMonadScan = do + inp@(_,_,_,n) <- alexGetInput + + sc <- alexGetStartCode + case alexScan inp sc of + AlexEOF -> alexEOF + AlexError error@(_,_,inp,_) -> + alexError $ "Unexpected character: " ++ show (T.head (T.decodeUtf8 (Lbs.toStrict inp))) + AlexSkip inp _len -> do + alexSetInput inp + alexMonadScan + AlexToken inp'@(_,_,_,n') _ action -> let len = n'-n in do + alexSetInput inp' + action (ignorePendingBytes inp) len +} \ No newline at end of file diff --git a/src/Frontend/Autogen/Parser.y b/src/Frontend/Autogen/Parser.y index 2840f3d..b398324 100644 --- a/src/Frontend/Autogen/Parser.y +++ b/src/Frontend/Autogen/Parser.y @@ -3,16 +3,20 @@ module Frontend.Autogen.Parser where import qualified Data.Text as T +import Data.Maybe +import Frontend.Lexer.Wrapper import Frontend.Lexer.Tokens import Frontend.Parser.Posn import Frontend.Syntax import Frontend.Autogen.Lexer +import qualified Prelude import Prelude hiding (span) import Debug.Trace +import Control.Monad } %name parseExp Exp @@ -121,13 +125,13 @@ Apat :: { Pat } | '(' CommaList(Pat) ')' { span $1 $3 $ makeTuplePattern $2 } Decl :: { Decl } - : CommaList1(VAR) '::' Type { TySig (getVar `fmap` $1) $3 } - | VAR Apat List(Apat) Rhs { FunDecl (getVar $1) ($2:$3) $4 } - | Pat Rhs { PatDecl $1 $2 } + : CommaList1(VAR) '::' Type { TySig (getVar `fmap` $1) $3 (startPosn (head $1)) (endPosn $3) } + | VAR Apat List(Apat) Rhs { FunDecl (getVar $1) ($2:$3) $4 (startPosn $1) (endPosn $4) } + | Pat Rhs { PatDecl $1 $2 (startPosn $1) (endPosn $2) } Rhs :: { Rhs } - : '=' Exp { BareRhs $2 [] } - | '=' Exp 'where' LaidOutList(Decl) { BareRhs $2 (thd $4) } + : '=' Exp { BareRhs $2 [] (startPosn $1) (endPosn $2) } + | '=' Exp 'where' LaidOutList(Decl) { BareRhs $2 (thd $4) (startPosn $1) (endPosn $4) } LaidOutList(p) : START Opt(Semis) LOLContents(p, END) { (startPosn $1, lolEnd $3, lolList $3) } @@ -140,14 +144,17 @@ LOLContents(p, End) Module :: { Module } : 'module' CON ImportExportList 'where' LaidOutList(ModItem) - { Module { moduleName = toModId (getVar $2) - , moduleExports = $3 - , moduleItems = thd $5 } + {% do { (imports,items) <- spanModuleItems (thd $5) + ; pure $ Module { moduleName = toModId (getVar $2) + , moduleExports = fst $3 + , moduleImports = imports + , moduleItems = items } + } } -ImportExportList :: { Maybe [NamespacedItem ParsedVar] } - : {-empty-} { Nothing } - | '(' CommaList(NSItem) ')' { Just $2 } +ImportExportList :: { (Maybe [NamespacedItem ParsedVar], Maybe Posn) } + : {-empty-} { (Nothing, Nothing) } + | '(' CommaList(NSItem) ')' { (Just $2, Just (endPosn $3)) } NSItem :: { NamespacedItem ParsedVar } : VAR { IEVar (getVar $1) } @@ -155,21 +162,21 @@ NSItem :: { NamespacedItem ParsedVar } | 'module' CON { IEModule (getVar $2) } ModItem :: { Item } - : Decl { ModDecl $1 } - | Import { ModImport $1 } + : Decl { ModDecl $1 (startPosn $1) (endPosn $1) } + | Import { ModImport $1 (startPosn $1) (endPosn $1) } Import :: { ModuleImport ParsedVar } : 'import' modid ImportExportList - { Import $2 $3 False Nothing } + { Import $2 (fst $3) False Nothing (startPosn $1) (fromMaybe (endPosn $2) (snd $3)) } | 'import' modid ImportExportList 'as' CON - { Import $2 $3 False (Just (getVar $5)) } + { Import $2 (fst $3) False (Just (getVar $5)) (startPosn $1) (endPosn $5) } | 'import' 'qualified' modid ImportExportList - { Import $3 $4 True Nothing } + { Import $3 (fst $4) True Nothing (startPosn $1) (fromMaybe (endPosn $3) (snd $4)) } | 'import' 'qualified' modid ImportExportList 'as' CON - { Import $3 $4 True (Just (getVar $6)) } + { Import $3 (fst $4) True (Just (getVar $6)) (startPosn $1) (endPosn $6) } Opt(p) : { () } @@ -246,4 +253,24 @@ makeTuplePattern xs = TupPat xs makeTuple [x] = ParenExp x makeTuple xs = Tuple xs + +spanModuleItems xs = do + let + isImport (ModImport _ _ _) = True + isImport _ = False + + (imports, items) = Prelude.span isImport xs + + forM_ items $ \x -> case x of + ModImport _ start end -> + alexThrow $ \fname -> + ParseError { parseErrorMessage = "all import statements should be at the top of the file." + , parseErrorInlineDesc = Just "unexpected import statement" + , parseErrorFilename = fname + , parseErrorBegin = start + , parseErrorEnd = end + } + _ -> pure () + + pure (map itemImport imports, items) } diff --git a/src/Frontend/Lexer/Wrapper.hs b/src/Frontend/Lexer/Wrapper.hs new file mode 100644 index 0000000..a035a6a --- /dev/null +++ b/src/Frontend/Lexer/Wrapper.hs @@ -0,0 +1,181 @@ +module Frontend.Lexer.Wrapper where + +import Control.Applicative as App (Applicative (..)) + +import Data.Word (Word8) + +import Data.Int (Int64) +import qualified Data.Char +import qualified Data.ByteString.Lazy as ByteString +import qualified Data.ByteString.Internal as ByteString (w2c) +import Frontend.Lexer.Tokens (Token) +import qualified Data.Text as T +import Frontend.Parser.Posn + +type Byte = Word8 +type AlexInput = ( Posn, -- current position, + Char, -- previous char + ByteString.ByteString, -- current input string + Int64) -- bytes consumed so far + +ignorePendingBytes :: AlexInput -> AlexInput +ignorePendingBytes i = i -- no pending bytes when lexing bytestrings + +alexInputPrevChar :: AlexInput -> Char +alexInputPrevChar (_,c,_,_) = c + +alexGetByte :: AlexInput -> Maybe (Byte,AlexInput) +alexGetByte (p,_,cs,n) = + case ByteString.uncons cs of + Nothing -> Nothing + Just (b, cs') -> + let c = ByteString.w2c b + p' = alexMove p c + n' = n+1 + in p' `seq` cs' `seq` n' `seq` Just (b, (p', c, cs',n')) + +-- ----------------------------------------------------------------------------- +-- Token positions + +-- `Posn' records the location of a token in the input text. It has three +-- fields: the address (number of chacaters preceding the token), line number +-- and column of a token within the file. `start_pos' gives the position of the +-- start of the file and `eof_pos' a standard encoding for the end of file. +-- `move_pos' calculates the new position after traversing a given character, +-- assuming the usual eight character tab stops. + + +alexStartPos :: Posn +alexStartPos = Posn 1 1 + +alexMove :: Posn -> Char -> Posn +alexMove (Posn l c) '\t' = Posn l (c+8-((c-1) `mod` 8)) +alexMove (Posn l _) '\n' = Posn (l+1) 1 +alexMove (Posn l c) _ = Posn l (c+1) + +data AlexState = AlexState { + alex_pos :: !Posn, -- position at current input location + + alex_bpos:: !Int64, -- bytes consumed so far + alex_inp :: ByteString.ByteString, -- the current input + alex_chr :: !Char, -- the character before the input + + alex_scd :: !Int -- the current startcode + + , alex_ust :: AlexUserState -- AlexUserState will be defined in the user program + , alex_fname :: String + } + +runAlex :: String -> ByteString.ByteString -> Alex a -> Either ParseError a +runAlex fname input__ (Alex f) = + case f initState of + Left msg -> Left msg + Right ( _, a ) -> Right a + where + initState = AlexState + { alex_bpos = 0 + , alex_pos = alexStartPos + , alex_inp = input__ + , alex_chr = '\n' + + , alex_ust = alexInitUserState + + , alex_scd = 0 + , alex_fname = fname + } + +newtype Alex a = Alex { unAlex :: AlexState -> Either ParseError (AlexState, a) } + +instance Functor Alex where + fmap f a = Alex $ \s -> case unAlex a s of + Left msg -> Left msg + Right (s', a') -> Right (s', f a') + +instance Applicative Alex where + pure a = Alex $ \s -> Right (s, a) + fa <*> a = Alex $ \s -> case unAlex fa s of + Left msg -> Left msg + Right (s', f) -> case unAlex a s' of + Left msg -> Left msg + Right (s'', b) -> Right (s'', f b) + +instance Monad Alex where + m >>= k = Alex $ \s -> case unAlex m s of + Left msg -> Left msg + Right (s',a) -> unAlex (k a) s' + return = App.pure + +alexGetInput :: Alex AlexInput +alexGetInput = + Alex $ \s@AlexState{alex_pos=pos,alex_bpos=bpos,alex_chr=c,alex_inp=inp__} -> + Right (s, (pos,c,inp__,bpos)) + + +alexSetInput :: AlexInput -> Alex () +alexSetInput (pos,c,inp__,bpos) + = Alex $ \s -> Right ( s { alex_pos = pos + , alex_bpos = bpos + , alex_chr = c + , alex_inp = inp__ + } + , ()) + +alexError :: String -> Alex a +alexError message = Alex $ \s -> Left (ParseError message (alex_fname s) Nothing (alex_pos s) (alex_pos s)) + +alexErrorPosn :: Posn -> Posn -> String -> Alex a +alexErrorPosn start end message = Alex $ \s -> Left (ParseError message (alex_fname s) Nothing start end) + +alexThrow :: (String -> ParseError) -> Alex a +alexThrow err = Alex $ \s -> Left (err (alex_fname s)) + +alexGetStartCode :: Alex Int +alexGetStartCode = Alex $ \s@AlexState{alex_scd=sc} -> Right (s, sc) + +alexSetStartCode :: Int -> Alex () +alexSetStartCode sc = Alex $ \s -> Right (s{alex_scd=sc}, ()) + +-- ----------------------------------------------------------------------------- +-- Useful token actions + +type AlexAction result = AlexInput -> Int64 -> Alex result + +-- perform an action for this token, and set the start code to a new value +andBegin :: AlexAction result -> Int -> AlexAction result +(action `andBegin` code) input__ len = do + alexSetStartCode code + action input__ len + +token :: (AlexInput -> Int64 -> token) -> AlexAction token +token t input__ len = return (t input__ len) + +data LayoutState + = LetLayout { layoutCol :: Int } + | Layout { layoutCol :: Int } + | ExplicitLayout + deriving (Show) + +data AlexUserState = + AlexUserState { layoutColumns :: ![LayoutState] + , startCodes :: ![Int] + , leastColumn :: !Int + + , pendingLayoutKw :: Maybe (Int -> LayoutState) + , pendingTokens :: ![Token] + , pendingLambdaCase :: !Bool + + , stringBuffer :: !T.Text + , stringStartPosn :: Maybe Posn + } + +alexInitUserState :: AlexUserState +alexInitUserState = AlexUserState [] [] 0 Nothing [] False T.empty Nothing + +data ParseError + = ParseError { parseErrorMessage :: String + , parseErrorFilename :: String + , parseErrorInlineDesc :: Maybe String + , parseErrorBegin :: Posn + , parseErrorEnd :: Posn + } + deriving (Eq, Show) \ No newline at end of file diff --git a/src/Frontend/Syntax.hs b/src/Frontend/Syntax.hs index 8d97411..8cfb38a 100644 --- a/src/Frontend/Syntax.hs +++ b/src/Frontend/Syntax.hs @@ -72,15 +72,27 @@ instance HasPosn (FeType var) where span sp ep x = SPType x (startPosn sp) (endPosn ep) data FeDecl var - = PatDecl (FePat var) (FeRhs var) - | FunDecl var [FePat var] (FeRhs var) - | TySig [var] (FeType var) + = PatDecl { pdPat :: FePat var, declRhs :: FeRhs var, declBegin :: Posn, declEnd :: Posn } + | FunDecl { fdVar :: var, fdArgs :: [FePat var], declRhs :: FeRhs var, declBegin :: Posn, declEnd :: Posn } + | TySig { tsVars :: [var], tsType :: FeType var, declBegin :: Posn, declEnd :: Posn } deriving (Eq, Show, Ord) +instance HasPosn (FeDecl var) where + startPosn = declBegin + endPosn = declEnd + + span sp ep s = s { declBegin = startPosn sp, declEnd = endPosn ep } + data FeRhs var - = BareRhs (FeExpr var) [FeDecl var] + = BareRhs { bareRhs :: FeExpr var, rhsWhere :: [FeDecl var], rhsBegin :: Posn, rhsEnd :: Posn } deriving (Eq, Show, Ord) +instance HasPosn (FeRhs var) where + startPosn = rhsBegin + endPosn = rhsEnd + + span sp ep s = s { rhsBegin = startPosn sp, rhsEnd = endPosn ep } + data Literal = LitString T.Text | LitNumber Integer @@ -89,6 +101,7 @@ data Literal data FeModule var = Module { moduleName :: var , moduleExports :: Maybe [NamespacedItem var] + , moduleImports :: [ModuleImport var] , moduleItems :: [ModuleItem var] } deriving (Eq, Show, Ord) @@ -98,9 +111,17 @@ data ModuleImport var , importList :: Maybe [NamespacedItem var] , importQualified :: Bool , importAlias :: Maybe var + , importBegin :: Posn + , importEnd :: Posn } deriving (Eq, Show, Ord) +instance HasPosn (ModuleImport var) where + startPosn = importBegin + endPosn = importEnd + + span sp ep s = s { importBegin = startPosn sp, importEnd = endPosn ep } + data NamespacedItem var = IEVar var | IECon var @@ -108,8 +129,8 @@ data NamespacedItem var deriving (Eq, Show, Ord) data ModuleItem var - = ModDecl (FeDecl var) - | ModImport (ModuleImport var) + = ModDecl { itemDecl :: FeDecl var, itemBegin :: Posn, itemEnd :: Posn } + | ModImport { itemImport :: ModuleImport var, itemBegin :: Posn, itemEnd :: Posn } deriving (Eq, Show, Ord) data ParsedVar @@ -140,4 +161,10 @@ instance HasPosn ParsedVar where startPosn = varBegin endPosn = varEnd - span sp ep s = s { varBegin = startPosn sp, varEnd = endPosn ep } \ No newline at end of file + span sp ep s = s { varBegin = startPosn sp, varEnd = endPosn ep } + +instance HasPosn (ModuleItem var) where + startPosn = itemBegin + endPosn = itemEnd + + span sp ep s = s { itemBegin = startPosn sp, itemEnd = endPosn ep } \ No newline at end of file diff --git a/src/Main.hs b/src/Main.hs index 08056b0..ebc9214 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -1,40 +1,91 @@ -- {-# LANGUAGE BlockArguments #-} module Main where +import Control.Monad ( unless ) + import qualified Data.ByteString.Lazy as Lbs +import qualified Data.Text.Encoding as T +import qualified Data.Text as T import Data.Foldable -import Frontend.Lexer.Tokens -import Frontend.Autogen.Lexer +import Debug.Trace + import Frontend.Autogen.Parser +import Frontend.Autogen.Lexer +import Frontend.Lexer.Wrapper +import Frontend.Lexer.Tokens -import qualified Data.Text.Encoding as T -import qualified Data.Text as T -import Control.Monad ( unless ) import System.Environment (getArgs) -import Text.Show.Pretty (pPrint) -import Debug.Trace +import Text.Show.Pretty (pPrint) +import Frontend.Parser.Posn +import System.Posix.Internals +import GHC.IO.Handle.FD (stdout, handleToFd) +import GHC.IO.FD +import Data.Maybe (fromMaybe) main :: IO () main = do args <- getArgs for_ args $ \str -> do - Main.lex pPrint parseMod =<< Lbs.readFile str + Main.lex str pPrint parseMod =<< Lbs.readFile str testParse :: String -> IO () -testParse s = Main.lex print parseMod (Lbs.fromStrict (T.encodeUtf8 (T.pack s))) +testParse s = Main.lex "" print parseMod (Lbs.fromStrict (T.encodeUtf8 (T.pack s))) testLex :: String -> IO () -testLex s = Main.lex (const (pure ())) (scan []) (Lbs.fromStrict (T.encodeUtf8 (T.pack s))) - -lex :: (a -> IO ()) -> Alex a -> Lbs.ByteString -> IO () -lex show cont arg = do - let x = runAlex arg cont +testLex s = Main.lex "" (const (pure ())) (scan []) (Lbs.fromStrict (T.encodeUtf8 (T.pack s))) + +lex :: String -> (a -> IO ()) -> Alex a -> Lbs.ByteString -> IO () +lex fname show cont arg = do + let x = runAlex fname arg cont case x of - Left e -> print e + Left e -> showParseError e Right x -> show x +showParseError :: ParseError -> IO () +showParseError pe = do + code <- lines <$> readFile (parseErrorFilename pe) + color <- fmap (1 ==) . c_isatty . fdFD =<< handleToFd GHC.IO.Handle.FD.stdout + + let + linum = posnLine (parseErrorBegin pe) + startcol = posnColm (parseErrorBegin pe) + + multiline = linum /= posnLine (parseErrorEnd pe) + + width + | multiline = 0 + | otherwise = max 0 (posnColm (parseErrorEnd pe) - startcol - 1) + + linum' = show linum + + line = [ show linum ++ sep ++ code !! (linum - 1) | linum <- [ linum .. posnLine (parseErrorEnd pe) ] ] + + padding = replicate (length linum') ' ' ++ sep + padding' = replicate (length linum') ' ' ++ " " + + caret = replicate (startcol - 1) ' ' ++ red ++ "^" + squiggle = replicate width '~' + + (red, bold, reset, sep) + | color = ("\x1b[31m", "\x1b[1m", "\x1b[0m", "\x1b[1;34m \9474 \x1b[0m") + | otherwise = ("", "", "", "") + + putStr . unlines $ + [ bold + ++ parseErrorFilename pe + ++ ":" ++ show linum ++ ":" ++ show startcol ++ ": " + ++ red ++ "parse error:" ++ reset + + , padding' + , init (unlines line) + , padding' ++ caret ++ squiggle ++ reset ++ fromMaybe "" ((" " ++) <$> parseErrorInlineDesc pe) + + , "" + , padding' ++ parseErrorMessage pe + ] + scan :: [Token] -> Alex [Token] scan acc = do tok <- alexMonadScan @@ -45,6 +96,7 @@ scan acc = do , "just lexed: " ++ show tok , "sc: " ++ show sc , "sc stack: " ++ show (startCodes state) + , "layout stack: " ++ show (layoutColumns state) ] case tokenClass tok of TokEof -> pure (reverse acc)