Abbie's Haskell compiler
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

78 lines
2.3 KiB

  1. module Frontend.Lexer.Unicode
  2. ( UnicodeClass(..)
  3. , classify
  4. , fudgeCharacterClass
  5. ) where
  6. import Data.Word (Word8)
  7. import Data.Char
  8. -- | A less specfic version of 'GeneralCategory', grouping characters
  9. -- into a couple of key categories.
  10. data UnicodeClass
  11. -- These are used to designate the beginning of symbols
  12. = Upper | Lower | Symbol
  13. -- Generic and digit can be used in identifiers, but not at the start of one
  14. | Generic | Digit
  15. | Whitespace
  16. -- These are guaranteed parse order. The only difference is that "graphic" is printable, while
  17. -- other may not be.
  18. | OtherGraphic | Other
  19. deriving (Eq, Show)
  20. -- | Determine the class for a given character.
  21. classify :: Char -> UnicodeClass
  22. classify c = case generalCategory c of
  23. -- See classification descriptions in
  24. -- <http://www.unicode.org/reports/tr44/tr44-14.html#GC_Values_Table>
  25. -- Cased letters
  26. UppercaseLetter -> Upper
  27. LowercaseLetter -> Lower
  28. TitlecaseLetter -> Upper
  29. ModifierLetter -> Generic
  30. OtherLetter -> Lower
  31. NonSpacingMark -> Generic
  32. SpacingCombiningMark -> OtherGraphic
  33. EnclosingMark -> OtherGraphic
  34. DecimalNumber -> Digit
  35. LetterNumber -> Generic
  36. OtherNumber -> Digit
  37. ConnectorPunctuation -> Symbol
  38. DashPunctuation -> Symbol
  39. OpenPunctuation -> OtherGraphic
  40. ClosePunctuation -> OtherGraphic
  41. InitialQuote -> OtherGraphic
  42. FinalQuote -> OtherGraphic
  43. OtherPunctuation -> Symbol
  44. MathSymbol -> Symbol
  45. CurrencySymbol -> Symbol
  46. ModifierSymbol -> Symbol
  47. -- So this _could_ be Lower or something, just so we can allow for emoji variables.
  48. -- Hrmrm, maybe not.
  49. OtherSymbol -> Symbol
  50. Space -> Whitespace
  51. -- This is all the wacky things in C* and Z* groups
  52. _ -> Other
  53. -- | Convert a character class into a fake byte which will be used by
  54. -- "Parser.Lexer"
  55. fudgeCharacterClass :: UnicodeClass -> Word8
  56. fudgeCharacterClass Upper = 0xf0
  57. fudgeCharacterClass Lower = 0xf1
  58. fudgeCharacterClass Symbol = 0xf2
  59. fudgeCharacterClass Generic = 0xf3
  60. fudgeCharacterClass Digit = 0xf4
  61. fudgeCharacterClass Whitespace = 0xf5
  62. fudgeCharacterClass OtherGraphic = 0xf6
  63. fudgeCharacterClass Other = 0xf7