-- | Coarse Unicode character classes used by the lexer.
--
-- Every 'Char' is reduced from its Unicode 'GeneralCategory' to one of a
-- handful of 'UnicodeClass' values, and each class can in turn be mapped
-- to a single stand-in byte with 'fudgeCharacterClass'.
module Frontend.Lexer.Unicode
  ( UnicodeClass(..)
  , classify
  , fudgeCharacterClass
  ) where

import Data.Char (GeneralCategory(..), generalCategory)
import Data.Word (Word8)

-- | A less specific version of 'GeneralCategory', grouping characters
-- into a couple of key categories.
data UnicodeClass
  -- Upper, Lower and Symbol are used to designate the beginning of symbols.
  = Upper
  | Lower
  | Symbol
  -- Generic and Digit can be used in identifiers, but not at the start of
  -- one.
  | Generic
  | Digit
  | Whitespace
  -- The only difference between the last two classes is that "graphic"
  -- characters are printable, while "other" ones may not be.
  -- NOTE(review): the original comment also described these two as
  -- "guaranteed parse order"; what that means cannot be determined from
  -- this module alone - confirm against the lexer.
  | OtherGraphic
  | Other
  deriving (Eq, Show)

-- | Determine the class for a given character.
--
-- The decision is based solely on 'generalCategory'. For a description of
-- the individual categories see the General_Category values table in
-- Unicode Standard Annex #44:
-- <https://www.unicode.org/reports/tr44/#General_Category_Values>.
classify :: Char -> UnicodeClass
classify c = case generalCategory c of
  -- Letters. Title-case letters count as upper case, and letters without
  -- case ('OtherLetter') are treated like lower-case ones.
  UppercaseLetter      -> Upper
  LowercaseLetter      -> Lower
  TitlecaseLetter      -> Upper
  ModifierLetter       -> Generic
  OtherLetter          -> Lower

  -- Marks. Only non-spacing marks are allowed to continue an identifier.
  NonSpacingMark       -> Generic
  SpacingCombiningMark -> OtherGraphic
  EnclosingMark        -> OtherGraphic

  -- Numbers.
  DecimalNumber        -> Digit
  LetterNumber         -> Generic
  OtherNumber          -> Digit

  -- Punctuation. Brackets and quotes are deliberately kept out of 'Symbol'.
  ConnectorPunctuation -> Symbol
  DashPunctuation      -> Symbol
  OpenPunctuation      -> OtherGraphic
  ClosePunctuation     -> OtherGraphic
  InitialQuote         -> OtherGraphic
  FinalQuote           -> OtherGraphic
  OtherPunctuation     -> Symbol

  -- Symbols.
  MathSymbol           -> Symbol
  CurrencySymbol       -> Symbol
  ModifierSymbol       -> Symbol
  -- 'OtherSymbol' _could_ be mapped to 'Lower' (or similar) in order to
  -- allow emoji as variable names; for now it stays a plain symbol.
  OtherSymbol          -> Symbol

  -- Separators.
  Space                -> Whitespace

  -- Everything that remains: the other C* and Z* categories (control,
  -- format, surrogate, private use, unassigned, line and paragraph
  -- separators).
  _                    -> Other

-- | Convert a character class into a fake byte which will be used by
-- "Parser.Lexer".
--
-- Each class is assigned its own byte, from @0xf0@ for 'Upper' through
-- @0xf7@ for 'Other'.
fudgeCharacterClass :: UnicodeClass -> Word8
fudgeCharacterClass cls = case cls of
  Upper        -> 0xf0
  Lower        -> 0xf1
  Symbol       -> 0xf2
  Generic      -> 0xf3
  Digit        -> 0xf4
  Whitespace   -> 0xf5
  OtherGraphic -> 0xf6
  Other        -> 0xf7