|
module Frontend.Lexer.Unicode
|
|
( UnicodeClass(..)
|
|
, classify
|
|
, fudgeCharacterClass
|
|
) where
|
|
|
|
import Data.Word (Word8)
|
|
import Data.Char
|
|
|
|
-- | A coarser-grained version of 'GeneralCategory', bucketing characters
-- into a small number of key classes.
data UnicodeClass
  -- Classes which designate the beginning of symbols.
  = Upper
  | Lower
  | Symbol
  -- Classes which can be used inside identifiers, but not at the start
  -- of one.
  | Generic
  | Digit
  -- Whitespace characters.
  | Whitespace
  -- The remaining two classes are described by the original note as
  -- "guaranteed parse order"; they differ only in that 'OtherGraphic' is
  -- printable while 'Other' may not be.
  -- NOTE(review): "parse order" may have been meant as "parse error" —
  -- confirm against the lexer.
  | OtherGraphic
  | Other
  deriving (Eq, Show)
|
|
|
|
-- | Map a character onto its 'UnicodeClass'. The result depends only on the
-- character's Unicode general category, as reported by 'generalCategory'.
--
-- See the category descriptions in
-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table>
classify :: Char -> UnicodeClass
classify c
  -- Cased letters: titlecase is grouped with uppercase.
  | oneOf [UppercaseLetter, TitlecaseLetter] = Upper
  -- Uncased ("other") letters are treated like lowercase ones.
  | oneOf [LowercaseLetter, OtherLetter] = Lower
  -- Modifier letters, non-spacing marks and letter-like numbers.
  | oneOf [ModifierLetter, NonSpacingMark, LetterNumber] = Generic
  | oneOf [DecimalNumber, OtherNumber] = Digit
  -- Punctuation (except brackets and quotes) and every symbol category.
  -- 'OtherSymbol' _could_ map to 'Lower' instead, which would allow emoji
  -- variables; for now it stays a plain symbol.
  | oneOf
      [ ConnectorPunctuation
      , DashPunctuation
      , OtherPunctuation
      , MathSymbol
      , CurrencySymbol
      , ModifierSymbol
      , OtherSymbol
      ] = Symbol
  -- Spacing/enclosing marks, brackets and quotation marks.
  | oneOf
      [ SpacingCombiningMark
      , EnclosingMark
      , OpenPunctuation
      , ClosePunctuation
      , InitialQuote
      , FinalQuote
      ] = OtherGraphic
  | oneOf [Space] = Whitespace
  -- Everything left over: the remaining C* and Z* categories.
  | otherwise = Other
  where
    category = generalCategory c
    oneOf = (category `elem`)
|
|
|
|
-- | Turn a character class into a stand-in byte for use by "Parser.Lexer".
--
-- Each class gets its own value in the range @0xf0@ to @0xf7@, assigned in
-- the order the constructors are declared.
fudgeCharacterClass :: UnicodeClass -> Word8
fudgeCharacterClass cls = case cls of
  Upper        -> 0xf0
  Lower        -> 0xf1
  Symbol       -> 0xf2
  Generic      -> 0xf3
  Digit        -> 0xf4
  Whitespace   -> 0xf5
  OtherGraphic -> 0xf6
  Other        -> 0xf7
|