Abbie's Haskell compiler
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

78 lines
2.3 KiB

module Frontend.Lexer.Unicode
( UnicodeClass(..)
, classify
, fudgeCharacterClass
) where
import Data.Word (Word8)
import Data.Char
-- | A less specific version of 'GeneralCategory', grouping characters
-- into a couple of key categories.
data UnicodeClass
  -- These are used to designate the beginning of symbols
  = Upper | Lower | Symbol
  -- Generic and digit can be used in identifiers, but not at the start of one
  | Generic | Digit
  | Whitespace
  -- These are guaranteed parse order. The only difference is that "graphic"
  -- is printable, while other may not be.
  -- NOTE(review): "parse order" may have been meant as "parse errors" --
  -- confirm against the lexer that consumes these classes.
  | OtherGraphic | Other
  deriving (Eq, Show)
-- | Determine the class for a given character.
--
-- The result depends only on the character's Unicode general category, as
-- reported by 'generalCategory'. See the classification descriptions in
-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table>.
classify :: Char -> UnicodeClass
classify c = case generalCategory c of
  -- Letters (L*): titlecase is treated as upper, uncased letters as lower
  UppercaseLetter -> Upper
  LowercaseLetter -> Lower
  TitlecaseLetter -> Upper
  ModifierLetter -> Generic
  OtherLetter -> Lower

  -- Marks (M*)
  NonSpacingMark -> Generic
  SpacingCombiningMark -> OtherGraphic
  EnclosingMark -> OtherGraphic

  -- Numbers (N*)
  DecimalNumber -> Digit
  LetterNumber -> Generic
  OtherNumber -> Digit

  -- Punctuation (P*): brackets and quotes are never symbol characters
  ConnectorPunctuation -> Symbol
  DashPunctuation -> Symbol
  OpenPunctuation -> OtherGraphic
  ClosePunctuation -> OtherGraphic
  InitialQuote -> OtherGraphic
  FinalQuote -> OtherGraphic
  OtherPunctuation -> Symbol

  -- Symbols (S*)
  MathSymbol -> Symbol
  CurrencySymbol -> Symbol
  ModifierSymbol -> Symbol
  -- So this _could_ be Lower or something, just so we can allow for emoji
  -- variables. Hrmrm, maybe not.
  OtherSymbol -> Symbol

  -- Separators (Z*): only plain spaces count as whitespace
  Space -> Whitespace

  -- This is all the wacky things in the C* and remaining Z* groups:
  -- LineSeparator, ParagraphSeparator, Control, Format, Surrogate,
  -- PrivateUse and NotAssigned.
  _ -> Other
-- | Translate a character class into a stand-in byte, which will be used by
-- "Parser.Lexer". The classes map onto the consecutive bytes @0xf0@ to
-- @0xf7@, one byte per constructor.
fudgeCharacterClass :: UnicodeClass -> Word8
fudgeCharacterClass cls = case cls of
  Upper        -> 0xf0
  Lower        -> 0xf1
  Symbol       -> 0xf2
  Generic      -> 0xf3
  Digit        -> 0xf4
  Whitespace   -> 0xf5
  OtherGraphic -> 0xf6
  Other        -> 0xf7