|
|
-- | Coarse classification of Unicode characters, plus the placeholder byte
-- that stands in for each class.
module Frontend.Lexer.Unicode
  ( UnicodeClass (..)
  , classify
  , fudgeCharacterClass
  ) where
-
- import Data.Word (Word8)
- import Data.Char
-
-- | A coarser view of 'GeneralCategory': each character falls into one of a
-- handful of key classes.
data UnicodeClass
  = Upper
  -- ^ Designates the beginning of a symbol.
  | Lower
  -- ^ Designates the beginning of a symbol.
  | Symbol
  -- ^ Designates the beginning of a symbol.
  | Generic
  -- ^ Usable inside an identifier, but not as its first character.
  | Digit
  -- ^ Usable inside an identifier, but not as its first character.
  | Whitespace
  | OtherGraphic
  -- ^ Printable. The original note describes this and 'Other' as
  -- "guaranteed parse order" (presumably: always a parse error; TODO confirm).
  | Other
  -- ^ Same treatment as 'OtherGraphic', except that these characters may
  -- not be printable.
  deriving (Eq, Show)
-
-- | Work out which 'UnicodeClass' a character belongs to, based solely on
-- its Unicode general category.
--
-- The categories are described in
-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table>.
classify :: Char -> UnicodeClass
classify = fromCategory . generalCategory
  where
    fromCategory :: GeneralCategory -> UnicodeClass
    -- Cased letters; titlecase is treated as upper case.
    fromCategory UppercaseLetter      = Upper
    fromCategory LowercaseLetter      = Lower
    fromCategory TitlecaseLetter      = Upper
    -- Remaining letters.
    fromCategory ModifierLetter       = Generic
    fromCategory OtherLetter          = Lower
    -- Marks.
    fromCategory NonSpacingMark       = Generic
    fromCategory SpacingCombiningMark = OtherGraphic
    fromCategory EnclosingMark        = OtherGraphic
    -- Numbers.
    fromCategory DecimalNumber        = Digit
    fromCategory LetterNumber         = Generic
    fromCategory OtherNumber          = Digit
    -- Punctuation: brackets and quotes are not symbol characters.
    fromCategory ConnectorPunctuation = Symbol
    fromCategory DashPunctuation      = Symbol
    fromCategory OpenPunctuation      = OtherGraphic
    fromCategory ClosePunctuation     = OtherGraphic
    fromCategory InitialQuote         = OtherGraphic
    fromCategory FinalQuote           = OtherGraphic
    fromCategory OtherPunctuation     = Symbol
    -- Symbols.
    fromCategory MathSymbol           = Symbol
    fromCategory CurrencySymbol       = Symbol
    fromCategory ModifierSymbol       = Symbol
    -- OtherSymbol could arguably map to Lower so that emoji work as variable
    -- names; it stays a Symbol for now.
    fromCategory OtherSymbol          = Symbol
    -- Separators: only the Space category counts as whitespace.
    fromCategory Space                = Whitespace
    -- Everything else, i.e. the C* categories and the rest of Z*
    -- (line and paragraph separators).
    fromCategory _                    = Other
-
-- | Turn a character class into the fake byte that stands in for it; per the
-- original note these bytes are consumed by "Parser.Lexer".
--
-- The classes map, in declaration order, onto the bytes @0xf0@ through
-- @0xf7@.
--
-- NOTE(review): this module is named @Frontend.Lexer.Unicode@; confirm that
-- the "Parser.Lexer" reference is still current.
fudgeCharacterClass :: UnicodeClass -> Word8
fudgeCharacterClass cls = case cls of
  Upper        -> 0xf0
  Lower        -> 0xf1
  Symbol       -> 0xf2
  Generic      -> 0xf3
  Digit        -> 0xf4
  Whitespace   -> 0xf5
  OtherGraphic -> 0xf6
  Other        -> 0xf7
|