1{-# LANGUAGE OverloadedStrings #-}
2{-# LANGUAGE DeriveDataTypeable #-}
3{-# LANGUAGE BangPatterns #-}
4
5module Commonmark.Tokens
6  ( Tok(..)
7  , TokType(..)
8  , SourcePos
9  , tokenize
10  , untokenize
11  ) where
12
13import           Data.Char       (isAlphaNum, isSpace)
14import           Data.Text       (Text)
15import qualified Data.Text       as T
16import           Data.Data       (Data, Typeable)
17import           Text.Parsec.Pos
18import           Data.Text.Normalize (normalize, NormalizationMode(NFC))
19
-- | A single token: its classification, where it starts in the
-- source, and the text it covers.
data Tok = Tok
  { tokType     :: !TokType
    -- ^ What kind of token this is.
  , tokPos      :: !SourcePos
    -- ^ Source position attached to the token ('tokenize' records the
    -- position of the token's first character here).
  , tokContents :: {-# UNPACK #-} !Text
    -- ^ The text covered by the token.
  }
  deriving (Show, Eq, Data, Typeable)
25
-- | Classification of a 'Tok'.
--
-- The constructor order matters: it determines the derived 'Ord'
-- instance.
data TokType
  = Spaces
    -- ^ A run of ASCII spaces, or a single tab.
  | UnicodeSpace
    -- ^ A single whitespace character (per 'Data.Char.isSpace') that is
    -- not an ASCII space, tab, carriage return or line feed.
  | LineEnd
    -- ^ A line ending: a token that begins with a carriage return or a
    -- line feed.
  | WordChars
    -- ^ A run of alphanumeric characters.
  | Symbol {-# UNPACK #-} !Char
    -- ^ Any other single character; the character itself is carried in
    -- the constructor.
  deriving (Show, Eq, Ord, Data, Typeable)
33
-- | Convert a 'Text' into a list of 'Tok'. The first parameter
-- specifies the source name recorded in every token's position.
--
-- The input is NFC-normalized before it is split, so token contents
-- are slices of the normalized text.  Tokens are produced as follows:
--
-- * a run of ASCII spaces, or a single tab, becomes 'Spaces' (a tab
--   advances the column to the next tab stop, with stops every four
--   columns);
--
-- * a carriage return, a line feed, or a carriage return immediately
--   followed by a line feed becomes exactly one 'LineEnd';
--
-- * a run of alphanumeric characters becomes 'WordChars';
--
-- * any other character satisfying 'isSpace' becomes a one-character
--   'UnicodeSpace';
--
-- * every remaining character becomes its own 'Symbol'.
tokenize :: String -> Text -> [Tok]
tokenize name =
  {-# SCC tokenize #-} go (initialPos name) . T.groupBy f . normalize NFC
  where
    -- We group \r\n, consecutive spaces, and consecutive alphanums;
    -- everything else gets in a token by itself.
    f '\r' '\n' = True
    f ' ' ' '   = True
    f x   y     = isAlphaNum x && isAlphaNum y

    -- Start of the line following the one @p@ is on.
    nextLine p = incSourceLine (setSourceColumn p 1) 1

    go _pos [] = []
    go !pos (t:ts) = -- every chunk t is nonempty, so T.head is safe
      case T.head t of
         ' ' ->  Tok Spaces pos t :
                 go (incSourceColumn pos (T.length t)) ts
         '\t' -> Tok Spaces pos t :
                 go (incSourceColumn pos
                       (4 - (sourceColumn pos - 1) `mod` 4)) ts
         '\r' ->
                 -- 'T.groupBy' tests each character against the FIRST
                 -- character of its group, so "\r\n\n" arrives here as
                 -- one chunk.  Only "\r" or "\r\n" is a single line
                 -- ending; hand any surplus "\n"s back as chunks of
                 -- their own so each one yields its own 'LineEnd' and
                 -- advances the line counter.
                 let (eol, extra) = T.splitAt 2 t
                 in  Tok LineEnd pos eol :
                     go (nextLine pos) (T.chunksOf 1 extra ++ ts)
         '\n' -> Tok LineEnd pos t :
                 go (nextLine pos) ts
         thead
           | isAlphaNum thead ->
                 Tok WordChars pos t :
                 go (incSourceColumn pos (T.length t)) ts
           | isSpace thead ->
                 Tok UnicodeSpace pos t :
                 go (incSourceColumn pos 1) ts
           | otherwise ->
                 Tok (Symbol thead) pos t :
                 go (incSourceColumn pos 1) ts
68
-- | Reassemble the text covered by a list of tokens by concatenating
-- their contents in order.
--
-- This undoes 'tokenize' up to the NFC normalization that 'tokenize'
-- applies: @untokenize . tokenize name@ returns the NFC-normalized
-- input, which equals the input whenever it was already in NFC.
untokenize :: [Tok] -> Text
untokenize toks =
  {-# SCC untokenize #-} T.concat [tokContents tok | tok <- toks]
73
74