{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE BangPatterns #-}

module Commonmark.Tokens
  ( Tok(..)
  , TokType(..)
  , SourcePos
  , tokenize
  , untokenize
  ) where

import Data.Char (isAlphaNum, isSpace)
import Data.Text (Text)
import qualified Data.Text as T
import Data.Data (Data, Typeable)
import Text.Parsec.Pos
import Data.Text.Normalize (normalize, NormalizationMode(NFC))

-- | A token: a piece of the (NFC-normalized) input, tagged with its
-- kind and with the source position at which it starts.
data Tok = Tok { tokType     :: !TokType
               , tokPos      :: !SourcePos
               , tokContents :: {-# UNPACK #-} !Text
               }
               deriving (Show, Eq, Data, Typeable)

-- | The kind of a 'Tok', as assigned by 'tokenize'.
data TokType =
       Spaces       -- ^ a run of ASCII spaces, or a single tab
     | UnicodeSpace -- ^ a single character satisfying 'isSpace' that is
                    --   not a space, tab, carriage return or newline
     | LineEnd      -- ^ one line ending: @\\r\\n@, @\\r@ or @\\n@
     | WordChars    -- ^ a run of alphanumeric characters
     | Symbol {-# UNPACK #-} !Char
                    -- ^ any other single character
     deriving (Show, Eq, Ord, Data, Typeable)

-- | Convert a 'Text' into a list of 'Tok'.  The first parameter
-- specifies the source name.  The input is NFC-normalized before
-- it is split into tokens, so token contents are slices of the
-- normalized text.
tokenize :: String -> Text -> [Tok]
tokenize name =
  {-# SCC tokenize #-} go (initialPos name) . T.groupBy f . normalize NFC
  where
    -- We group \r\n, consecutive spaces, and consecutive alphanums;
    -- everything else gets in a token by itself.  'T.groupBy' tests
    -- each character against the FIRST character of its group (as
    -- 'Data.List.groupBy' does), so a chunk that starts with \r can
    -- carry more than one trailing \n; 'go' splits those apart again.
    f '\r' '\n' = True
    f ' '  ' '  = True
    f x    y    = isAlphaNum x && isAlphaNum y

    -- Position of the first column of the following line.
    nextLine pos = incSourceLine (setSourceColumn pos 1) 1

    go _pos [] = []
    go !pos (t:ts) = -- note that every chunk t is guaranteed to be nonempty
      case T.head t of
        ' '  -> Tok Spaces pos t :
                go (incSourceColumn pos (T.length t)) ts
        -- A tab advances the column to the next multiple-of-4 tab stop.
        '\t' -> Tok Spaces pos t :
                go (incSourceColumn pos
                     (4 - (sourceColumn pos - 1) `mod` 4)) ts
        '\r' ->
          -- The chunk is \r followed by zero or more \n.  Only the
          -- first \n belongs to this line ending; each further \n is a
          -- line ending of its own, so hand them back one at a time.
          -- Otherwise "\r\n\n" would be a single LineEnd and every
          -- later token would be reported one line too early.
          let (lineEnd, extraNewlines) = T.splitAt 2 t
          in  Tok LineEnd pos lineEnd :
              go (nextLine pos) (T.chunksOf 1 extraNewlines ++ ts)
        '\n' -> Tok LineEnd pos t :
                go (nextLine pos) ts
        thead
          | isAlphaNum thead ->
              Tok WordChars pos t :
              go (incSourceColumn pos (T.length t)) ts
          | isSpace thead ->
              Tok UnicodeSpace pos t :
              go (incSourceColumn pos 1) ts
          | otherwise ->
              Tok (Symbol thead) pos t :
              go (incSourceColumn pos 1) ts

-- | Reverses 'tokenize'.  @untokenize . tokenize@ is the identity
-- on text that is already in NFC; other input comes back NFC-normalized.
untokenize :: [Tok] -> Text
untokenize toks =
  -- Concatenate the contents of every token, in order.
  {-# SCC untokenize #-} T.concat (map tokContents toks)
