1-- |
2-- Module      :  Text.Megaparsec.Stream
3-- Copyright   :  © 2015–present Megaparsec contributors
4-- License     :  FreeBSD
5--
6-- Maintainer  :  Mark Karpov <markkarpov92@gmail.com>
7-- Stability   :  experimental
8-- Portability :  portable
9--
10-- Megaparsec's input stream facilities.
11--
12-- You probably do not want to import this module directly because
13-- "Text.Megaparsec" re-exports it anyway.
14--
15-- @since 6.0.0
16
17{-# LANGUAGE FlexibleContexts    #-}
18{-# LANGUAGE FlexibleInstances   #-}
19{-# LANGUAGE LambdaCase          #-}
20{-# LANGUAGE MultiWayIf          #-}
21{-# LANGUAGE RankNTypes          #-}
22{-# LANGUAGE RecordWildCards     #-}
23{-# LANGUAGE ScopedTypeVariables #-}
24{-# LANGUAGE TypeFamilies        #-}
25
26module Text.Megaparsec.Stream
27  ( Stream (..) )
28where
29
30import Data.Char (chr)
31import Data.Foldable (foldl')
32import Data.Kind (Type)
33import Data.List.NonEmpty (NonEmpty (..))
34import Data.Maybe (fromMaybe)
35import Data.Proxy
36import Data.Word (Word8)
37import Text.Megaparsec.Pos
38import Text.Megaparsec.State
39import qualified Data.ByteString            as B
40import qualified Data.ByteString.Char8      as B8
41import qualified Data.ByteString.Lazy       as BL
42import qualified Data.ByteString.Lazy.Char8 as BL8
43import qualified Data.List.NonEmpty         as NE
44import qualified Data.Text                  as T
45import qualified Data.Text.Lazy             as TL
46
47-- | Type class for inputs that can be consumed by the library.
48
49class (Ord (Token s), Ord (Tokens s)) => Stream s where
50
51  -- | Type of token in the stream.
52
53  type Token s :: Type
54
55  -- | Type of “chunk” of the stream.
56
57  type Tokens s :: Type
58
59  -- | Lift a single token to chunk of the stream. The default
60  -- implementation is:
61  --
62  -- > tokenToChunk pxy = tokensToChunk pxy . pure
63  --
64  -- However for some types of stream there may be a more efficient way to
65  -- lift.
66
67  tokenToChunk  :: Proxy s -> Token s -> Tokens s
68  tokenToChunk pxy = tokensToChunk pxy . pure
69
70  -- | The first method that establishes isomorphism between list of tokens
71  -- and chunk of the stream. Valid implementation should satisfy:
72  --
73  -- > chunkToTokens pxy (tokensToChunk pxy ts) == ts
74
75  tokensToChunk :: Proxy s -> [Token s] -> Tokens s
76
77  -- | The second method that establishes isomorphism between list of tokens
78  -- and chunk of the stream. Valid implementation should satisfy:
79  --
80  -- > tokensToChunk pxy (chunkToTokens pxy chunk) == chunk
81
82  chunkToTokens :: Proxy s -> Tokens s -> [Token s]
83
84  -- | Return length of a chunk of the stream.
85
86  chunkLength :: Proxy s -> Tokens s -> Int
87
88  -- | Check if a chunk of the stream is empty. The default implementation
89  -- is in terms of the more general 'chunkLength':
90  --
91  -- > chunkEmpty pxy ts = chunkLength pxy ts <= 0
92  --
93  -- However for many streams there may be a more efficient implementation.
94
95  chunkEmpty :: Proxy s -> Tokens s -> Bool
96  chunkEmpty pxy ts = chunkLength pxy ts <= 0
97
98  -- | Extract a single token form the stream. Return 'Nothing' if the
99  -- stream is empty.
100
101  take1_ :: s -> Maybe (Token s, s)
102
103  -- | @'takeN_' n s@ should try to extract a chunk of length @n@, or if the
104  -- stream is too short, the rest of the stream. Valid implementation
105  -- should follow the rules:
106  --
107  --     * If the requested length @n@ is 0 (or less), 'Nothing' should
108  --       never be returned, instead @'Just' (\"\", s)@ should be returned,
109  --       where @\"\"@ stands for the empty chunk, and @s@ is the original
110  --       stream (second argument).
111  --     * If the requested length is greater than 0 and the stream is
112  --       empty, 'Nothing' should be returned indicating end of input.
113  --     * In other cases, take chunk of length @n@ (or shorter if the
114  --       stream is not long enough) from the input stream and return the
115  --       chunk along with the rest of the stream.
116
117  takeN_ :: Int -> s -> Maybe (Tokens s, s)
118
119  -- | Extract chunk of the stream taking tokens while the supplied
120  -- predicate returns 'True'. Return the chunk and the rest of the stream.
121  --
122  -- For many types of streams, the method allows for significant
123  -- performance improvements, although it is not strictly necessary from
124  -- conceptual point of view.
125
126  takeWhile_ :: (Token s -> Bool) -> s -> (Tokens s, s)
127
128  -- | Pretty-print non-empty stream of tokens. This function is also used
129  -- to print single tokens (represented as singleton lists).
130  --
131  -- @since 7.0.0
132
133  showTokens :: Proxy s -> NonEmpty (Token s) -> String
134
135  -- | Return the number of characters that a non-empty stream of tokens
136  -- spans. The default implementation is sufficient if every token spans
137  -- exactly 1 character.
138  --
139  -- @since 8.0.0
140
141  tokensLength :: Proxy s -> NonEmpty (Token s) -> Int
142  tokensLength Proxy = NE.length
143
144  -- | Given an offset @o@ and initial 'PosState', adjust the state in such
145  -- a way that it starts at the offset.
146  --
147  -- Return two values (in order):
148  --
149  --     * 'String' representing the line on which the given offset @o@ is
150  --       located. The line should satisfy a number of conditions that are
151  --       described below.
152  --     * The updated 'PosState' which can be in turn used to locate
153  --       another offset @o'@ given that @o' >= o@.
154  --
155  -- The 'String' representing the offending line in input stream should
156  -- satisfy the following:
157  --
158  --     * It should adequately represent location of token at the offset of
159  --       interest, that is, character at 'sourceColumn' of the returned
160  --       'SourcePos' should correspond to the token at the offset @o@.
161  --     * It should not include the newline at the end.
162  --     * It should not be empty, if the line happens to be empty, it
163  --       should be replaced with the string @\"\<empty line\>\"@.
164  --     * Tab characters should be replaced by appropriate number of
165  --       spaces, which is determined by the 'pstateTabWidth' field of
166  --       'PosState'.
167  --
168  -- __Note__: type signature of the function was changed in the version
169  -- /8.0.0/.
170  --
171  -- @since 7.0.0
172
173  reachOffset
174    :: Int             -- ^ Offset to reach
175    -> PosState s      -- ^ Initial 'PosState' to use
176    -> (String, PosState s) -- ^ See the description of the function
177
178  -- | A version of 'reachOffset' that may be faster because it doesn't need
179  -- to fetch the line at which the given offset in located.
180  --
181  -- The default implementation is this:
182  --
183  -- > reachOffsetNoLine o pst =
184  -- >   snd (reachOffset o pst)
185  --
186  -- __Note__: type signature of the function was changed in the version
187  -- /8.0.0/.
188  --
189  -- @since 7.0.0
190
191  reachOffsetNoLine
192    :: Int             -- ^ Offset to reach
193    -> PosState s      -- ^ Initial 'PosState' to use
194    -> PosState s      -- ^ Reached source position and updated state
195  reachOffsetNoLine o pst =
196    snd (reachOffset o pst)
197
198instance Stream String where
199  type Token String = Char
200  type Tokens String = String
201  tokenToChunk Proxy = pure
202  tokensToChunk Proxy = id
203  chunkToTokens Proxy = id
204  chunkLength Proxy = length
205  chunkEmpty Proxy = null
206  take1_ [] = Nothing
207  take1_ (t:ts) = Just (t, ts)
208  takeN_ n s
209    | n <= 0    = Just ("", s)
210    | null s    = Nothing
211    | otherwise = Just (splitAt n s)
212  takeWhile_ = span
213  showTokens Proxy = stringPretty
214  -- NOTE Do not eta-reduce these (breaks inlining)
215  reachOffset o pst =
216    reachOffset' splitAt foldl' id id ('\n','\t') o pst
217  reachOffsetNoLine o pst =
218    reachOffsetNoLine' splitAt foldl' ('\n', '\t') o pst
219
220instance Stream B.ByteString where
221  type Token B.ByteString = Word8
222  type Tokens B.ByteString = B.ByteString
223  tokenToChunk Proxy = B.singleton
224  tokensToChunk Proxy = B.pack
225  chunkToTokens Proxy = B.unpack
226  chunkLength Proxy = B.length
227  chunkEmpty Proxy = B.null
228  take1_ = B.uncons
229  takeN_ n s
230    | n <= 0    = Just (B.empty, s)
231    | B.null s  = Nothing
232    | otherwise = Just (B.splitAt n s)
233  takeWhile_ = B.span
234  showTokens Proxy = stringPretty . fmap (chr . fromIntegral)
235  -- NOTE Do not eta-reduce these (breaks inlining)
236  reachOffset o pst =
237    reachOffset' B.splitAt B.foldl' B8.unpack (chr . fromIntegral) (10, 9) o pst
238  reachOffsetNoLine o pst =
239    reachOffsetNoLine' B.splitAt B.foldl' (10, 9) o pst
240
241instance Stream BL.ByteString where
242  type Token BL.ByteString = Word8
243  type Tokens BL.ByteString = BL.ByteString
244  tokenToChunk Proxy = BL.singleton
245  tokensToChunk Proxy = BL.pack
246  chunkToTokens Proxy = BL.unpack
247  chunkLength Proxy = fromIntegral . BL.length
248  chunkEmpty Proxy = BL.null
249  take1_ = BL.uncons
250  takeN_ n s
251    | n <= 0    = Just (BL.empty, s)
252    | BL.null s = Nothing
253    | otherwise = Just (BL.splitAt (fromIntegral n) s)
254  takeWhile_ = BL.span
255  showTokens Proxy = stringPretty . fmap (chr . fromIntegral)
256  -- NOTE Do not eta-reduce these (breaks inlining)
257  reachOffset o pst =
258    reachOffset' splitAtBL BL.foldl' BL8.unpack (chr . fromIntegral) (10, 9) o pst
259  reachOffsetNoLine o pst =
260    reachOffsetNoLine' splitAtBL BL.foldl' (10, 9) o pst
261
262instance Stream T.Text where
263  type Token T.Text = Char
264  type Tokens T.Text = T.Text
265  tokenToChunk Proxy = T.singleton
266  tokensToChunk Proxy = T.pack
267  chunkToTokens Proxy = T.unpack
268  chunkLength Proxy = T.length
269  chunkEmpty Proxy = T.null
270  take1_ = T.uncons
271  takeN_ n s
272    | n <= 0    = Just (T.empty, s)
273    | T.null s  = Nothing
274    | otherwise = Just (T.splitAt n s)
275  takeWhile_ = T.span
276  showTokens Proxy = stringPretty
277  -- NOTE Do not eta-reduce (breaks inlining of reachOffset').
278  reachOffset o pst =
279    reachOffset' T.splitAt T.foldl' T.unpack id ('\n', '\t') o pst
280  reachOffsetNoLine o pst =
281    reachOffsetNoLine' T.splitAt T.foldl' ('\n', '\t') o pst
282
283instance Stream TL.Text where
284  type Token TL.Text  = Char
285  type Tokens TL.Text = TL.Text
286  tokenToChunk Proxy = TL.singleton
287  tokensToChunk Proxy = TL.pack
288  chunkToTokens Proxy = TL.unpack
289  chunkLength Proxy = fromIntegral . TL.length
290  chunkEmpty Proxy = TL.null
291  take1_ = TL.uncons
292  takeN_ n s
293    | n <= 0    = Just (TL.empty, s)
294    | TL.null s = Nothing
295    | otherwise = Just (TL.splitAt (fromIntegral n) s)
296  takeWhile_ = TL.span
297  showTokens Proxy = stringPretty
298  -- NOTE Do not eta-reduce (breaks inlining of reachOffset').
299  reachOffset o pst =
300    reachOffset' splitAtTL TL.foldl' TL.unpack id ('\n', '\t') o pst
301  reachOffsetNoLine o pst =
302    reachOffsetNoLine' splitAtTL TL.foldl' ('\n', '\t') o pst
303
304----------------------------------------------------------------------------
305-- Helpers
306
307-- | An internal helper state type combining a difference 'String' and an
308-- unboxed 'SourcePos'.
309
310data St = St SourcePos ShowS
311
312-- | A helper definition to facilitate defining 'reachOffset' for various
313-- stream types.
314
315reachOffset'
316  :: forall s. Stream s
317  => (Int -> s -> (Tokens s, s))
318     -- ^ How to split input stream at given offset
319  -> (forall b. (b -> Token s -> b) -> b -> Tokens s -> b)
320     -- ^ How to fold over input stream
321  -> (Tokens s -> String)
322     -- ^ How to convert chunk of input stream into a 'String'
323  -> (Token s -> Char)
324     -- ^ How to convert a token into a 'Char'
325  -> (Token s, Token s)
326     -- ^ Newline token and tab token
327  -> Int
328     -- ^ Offset to reach
329  -> PosState s
330     -- ^ Initial 'PosState' to use
331  -> (String, PosState s)
332     -- ^ Line at which 'SourcePos' is located, updated 'PosState'
333reachOffset' splitAt'
334             foldl''
335             fromToks
336             fromTok
337             (newlineTok, tabTok)
338             o
339             PosState {..} =
340  ( case expandTab pstateTabWidth
341           . addPrefix
342           . f
343           . fromToks
344           . fst
345           $ takeWhile_ (/= newlineTok) post of
346      "" -> "<empty line>"
347      xs -> xs
348  , PosState
349      { pstateInput = post
350      , pstateOffset = max pstateOffset o
351      , pstateSourcePos = spos
352      , pstateTabWidth = pstateTabWidth
353      , pstateLinePrefix =
354          if sameLine
355            -- NOTE We don't use difference lists here because it's
356            -- desirable for 'PosState' to be an instance of 'Eq' and
357            -- 'Show'. So we just do appending here. Fortunately several
358            -- parse errors on the same line should be relatively rare.
359            then pstateLinePrefix ++ f ""
360            else f ""
361      }
362  )
363  where
364    addPrefix xs =
365      if sameLine
366        then pstateLinePrefix ++ xs
367        else xs
368    sameLine = sourceLine spos == sourceLine pstateSourcePos
369    (pre, post) = splitAt' (o - pstateOffset) pstateInput
370    St spos f = foldl'' go (St pstateSourcePos id) pre
371    go (St apos g) ch =
372      let SourcePos n l c = apos
373          c' = unPos c
374          w  = unPos pstateTabWidth
375      in if | ch == newlineTok ->
376                St (SourcePos n (l <> pos1) pos1)
377                   id
378            | ch == tabTok ->
379                St (SourcePos n l (mkPos $ c' + w - ((c' - 1) `rem` w)))
380                   (g . (fromTok ch :))
381            | otherwise ->
382                St (SourcePos n l (c <> pos1))
383                   (g . (fromTok ch :))
384{-# INLINE reachOffset' #-}
385
386-- | Like 'reachOffset'' but for 'reachOffsetNoLine'.
387
388reachOffsetNoLine'
389  :: forall s. Stream s
390  => (Int -> s -> (Tokens s, s))
391     -- ^ How to split input stream at given offset
392  -> (forall b. (b -> Token s -> b) -> b -> Tokens s -> b)
393     -- ^ How to fold over input stream
394  -> (Token s, Token s)
395     -- ^ Newline token and tab token
396  -> Int
397     -- ^ Offset to reach
398  -> PosState s
399     -- ^ Initial 'PosState' to use
400  -> PosState s
401     -- ^ Updated 'PosState'
402reachOffsetNoLine' splitAt'
403                   foldl''
404                   (newlineTok, tabTok)
405                   o
406                   PosState {..} =
407  ( PosState
408      { pstateInput = post
409      , pstateOffset = max pstateOffset o
410      , pstateSourcePos = spos
411      , pstateTabWidth = pstateTabWidth
412      , pstateLinePrefix = pstateLinePrefix
413      }
414  )
415  where
416    spos = foldl'' go pstateSourcePos pre
417    (pre, post) = splitAt' (o - pstateOffset) pstateInput
418    go (SourcePos n l c) ch =
419      let c' = unPos c
420          w  = unPos pstateTabWidth
421      in if | ch == newlineTok ->
422                SourcePos n (l <> pos1) pos1
423            | ch == tabTok ->
424                SourcePos n l (mkPos $ c' + w - ((c' - 1) `rem` w))
425            | otherwise ->
426                SourcePos n l (c <> pos1)
427{-# INLINE reachOffsetNoLine' #-}
428
429-- | Like 'BL.splitAt' but accepts the index as an 'Int'.
430
431splitAtBL :: Int -> BL.ByteString -> (BL.ByteString, BL.ByteString)
432splitAtBL n = BL.splitAt (fromIntegral n)
433{-# INLINE splitAtBL #-}
434
435-- | Like 'TL.splitAt' but accepts the index as an 'Int'.
436
437splitAtTL :: Int -> TL.Text -> (TL.Text, TL.Text)
438splitAtTL n = TL.splitAt (fromIntegral n)
439{-# INLINE splitAtTL #-}
440
441-- | @stringPretty s@ returns pretty representation of string @s@. This is
442-- used when printing string tokens in error messages.
443
444stringPretty :: NonEmpty Char -> String
445stringPretty (x:|[])      = charPretty x
446stringPretty ('\r':|"\n") = "crlf newline"
447stringPretty xs           = "\"" <> concatMap f (NE.toList xs) <> "\""
448  where
449    f ch =
450      case charPretty' ch of
451        Nothing     -> [ch]
452        Just pretty -> "<" <> pretty <> ">"
453
454-- | @charPretty ch@ returns user-friendly string representation of given
455-- character @ch@, suitable for using in error messages.
456
457charPretty :: Char -> String
458charPretty ' ' = "space"
459charPretty ch = fromMaybe ("'" <> [ch] <> "'") (charPretty' ch)
460
461-- | If the given character has a pretty representation, return that,
462-- otherwise 'Nothing'. This is an internal helper.
463
464charPretty' :: Char -> Maybe String
465charPretty' = \case
466  '\NUL' -> Just "null"
467  '\SOH' -> Just "start of heading"
468  '\STX' -> Just "start of text"
469  '\ETX' -> Just "end of text"
470  '\EOT' -> Just "end of transmission"
471  '\ENQ' -> Just "enquiry"
472  '\ACK' -> Just "acknowledge"
473  '\BEL' -> Just "bell"
474  '\BS'  -> Just "backspace"
475  '\t'   -> Just "tab"
476  '\n'   -> Just "newline"
477  '\v'   -> Just "vertical tab"
478  '\f'   -> Just "form feed"
479  '\r'   -> Just "carriage return"
480  '\SO'  -> Just "shift out"
481  '\SI'  -> Just "shift in"
482  '\DLE' -> Just "data link escape"
483  '\DC1' -> Just "device control one"
484  '\DC2' -> Just "device control two"
485  '\DC3' -> Just "device control three"
486  '\DC4' -> Just "device control four"
487  '\NAK' -> Just "negative acknowledge"
488  '\SYN' -> Just "synchronous idle"
489  '\ETB' -> Just "end of transmission block"
490  '\CAN' -> Just "cancel"
491  '\EM'  -> Just "end of medium"
492  '\SUB' -> Just "substitute"
493  '\ESC' -> Just "escape"
494  '\FS'  -> Just "file separator"
495  '\GS'  -> Just "group separator"
496  '\RS'  -> Just "record separator"
497  '\US'  -> Just "unit separator"
498  '\DEL' -> Just "delete"
499  '\160' -> Just "non-breaking space"
500  _      -> Nothing
501
502-- | Replace tab characters with given number of spaces.
503
504expandTab
505  :: Pos
506  -> String
507  -> String
508expandTab w' = go 0
509  where
510    go 0 []        = []
511    go 0 ('\t':xs) = go w xs
512    go 0 (x:xs)    = x : go 0 xs
513    go n xs        = ' ' : go (n - 1) xs
514    w              = unPos w'
515