module Frontend.Lexer.Unicode
  ( UnicodeClass(..)
  , classify
  , fudgeCharacterClass
  ) where

import Data.Word (Word8)
import Data.Char

-- | A less specific version of 'GeneralCategory', grouping characters
-- into a couple of key categories.
data UnicodeClass
  -- Classes used to designate the beginning of symbols.
  = Upper
  | Lower
  | Symbol

  -- Classes which can be used in identifiers, but not at the start of one.
  | Generic
  | Digit

  | Whitespace

  -- Leftover classes: 'OtherGraphic' is printable, while 'Other' may not be.
  -- NOTE(review): the previous comment described these two as "guaranteed
  -- parse order"; the intended meaning is unclear - confirm with the lexer.
  | OtherGraphic
  | Other
  deriving (Eq, Show)

-- | Determine the class for a given character.
--
-- The character's 'GeneralCategory' is looked up once and then bucketed
-- into a 'UnicodeClass'. For a description of each category, see
-- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table>.
classify :: Char -> UnicodeClass
classify c
  -- Cased letters: titlecase counts as upper, and uncased "other" letters
  -- are treated as lower.
  | category `elem` [UppercaseLetter, TitlecaseLetter] = Upper
  | category `elem` [LowercaseLetter, OtherLetter] = Lower

  -- Modifier letters, non-spacing marks and letter-like numbers.
  | category `elem` [ModifierLetter, NonSpacingMark, LetterNumber] = Generic

  | category `elem` [DecimalNumber, OtherNumber] = Digit

  -- Connector, dash and other punctuation, plus every symbol category.
  -- 'OtherSymbol' _could_ be 'Lower' instead, to allow for emoji variables,
  -- but for now it stays a symbol.
  | category `elem`
      [ ConnectorPunctuation
      , DashPunctuation
      , OtherPunctuation
      , MathSymbol
      , CurrencySymbol
      , ModifierSymbol
      , OtherSymbol
      ] = Symbol

  -- Spacing/enclosing marks, brackets and quotes.
  | category `elem`
      [ SpacingCombiningMark
      , EnclosingMark
      , OpenPunctuation
      , ClosePunctuation
      , InitialQuote
      , FinalQuote
      ] = OtherGraphic

  | category == Space = Whitespace

  -- Everything else: the wacky things left over in the C* and Z* groups.
  | otherwise = Other
  where
    category = generalCategory c

-- | Convert a character class into a fake byte which will be used by
-- "Parser.Lexer".
--
-- Each class is assigned its own byte, running consecutively from @0xf0@
-- ('Upper') up to @0xf7@ ('Other').
fudgeCharacterClass :: UnicodeClass -> Word8
fudgeCharacterClass cls = case cls of
  Upper        -> 0xf0
  Lower        -> 0xf1
  Symbol       -> 0xf2
  Generic      -> 0xf3
  Digit        -> 0xf4
  Whitespace   -> 0xf5
  OtherGraphic -> 0xf6
  Other        -> 0xf7