1{-# LANGUAGE FlexibleContexts #-}
2{-# LANGUAGE FlexibleInstances #-}
3{-# LANGUAGE LambdaCase #-}
4{-# LANGUAGE MultiWayIf #-}
5{-# LANGUAGE RankNTypes #-}
6{-# LANGUAGE RecordWildCards #-}
7{-# LANGUAGE Safe #-}
8{-# LANGUAGE ScopedTypeVariables #-}
9{-# LANGUAGE TypeFamilies #-}
10
11-- |
12-- Module      :  Text.Megaparsec.Stream
13-- Copyright   :  © 2015–present Megaparsec contributors
14-- License     :  FreeBSD
15--
16-- Maintainer  :  Mark Karpov <markkarpov92@gmail.com>
17-- Stability   :  experimental
18-- Portability :  portable
19--
20-- Megaparsec's input stream facilities.
21--
22-- You probably do not want to import this module directly because
23-- "Text.Megaparsec" re-exports it anyway.
24--
25-- @since 6.0.0
26module Text.Megaparsec.Stream
27  ( Stream (..),
28    VisualStream (..),
29    TraversableStream (..),
30  )
31where
32
33import qualified Data.ByteString as B
34import qualified Data.ByteString.Char8 as B8
35import qualified Data.ByteString.Lazy as BL
36import qualified Data.ByteString.Lazy.Char8 as BL8
37import Data.Char (chr)
38import Data.Foldable (foldl', toList)
39import Data.Kind (Type)
40import Data.List.NonEmpty (NonEmpty (..))
41import qualified Data.List.NonEmpty as NE
42import Data.Maybe (fromMaybe)
43import Data.Proxy
44import qualified Data.Sequence as S
45import qualified Data.Text as T
46import qualified Data.Text.Lazy as TL
47import Data.Word (Word8)
48import Text.Megaparsec.Pos
49import Text.Megaparsec.State
50
51-- | Type class for inputs that can be consumed by the library.
52--
53-- __Note__: before the version /9.0.0/ the class included the methods from
54-- 'VisualStream' and 'TraversableStream'.
55class (Ord (Token s), Ord (Tokens s)) => Stream s where
56  -- | Type of token in the stream.
57  type Token s :: Type
58
59  -- | Type of “chunk” of the stream.
60  type Tokens s :: Type
61
62  -- | Lift a single token to chunk of the stream. The default
63  -- implementation is:
64  --
65  -- > tokenToChunk pxy = tokensToChunk pxy . pure
66  --
67  -- However for some types of stream there may be a more efficient way to
68  -- lift.
69  tokenToChunk :: Proxy s -> Token s -> Tokens s
70  tokenToChunk pxy = tokensToChunk pxy . pure
71
72  -- | The first method that establishes isomorphism between list of tokens
73  -- and chunk of the stream. Valid implementation should satisfy:
74  --
75  -- > chunkToTokens pxy (tokensToChunk pxy ts) == ts
76  tokensToChunk :: Proxy s -> [Token s] -> Tokens s
77
78  -- | The second method that establishes isomorphism between list of tokens
79  -- and chunk of the stream. Valid implementation should satisfy:
80  --
81  -- > tokensToChunk pxy (chunkToTokens pxy chunk) == chunk
82  chunkToTokens :: Proxy s -> Tokens s -> [Token s]
83
84  -- | Return length of a chunk of the stream.
85  chunkLength :: Proxy s -> Tokens s -> Int
86
87  -- | Check if a chunk of the stream is empty. The default implementation
88  -- is in terms of the more general 'chunkLength':
89  --
90  -- > chunkEmpty pxy ts = chunkLength pxy ts <= 0
91  --
92  -- However for many streams there may be a more efficient implementation.
93  chunkEmpty :: Proxy s -> Tokens s -> Bool
94  chunkEmpty pxy ts = chunkLength pxy ts <= 0
95
96  -- | Extract a single token form the stream. Return 'Nothing' if the
97  -- stream is empty.
98  take1_ :: s -> Maybe (Token s, s)
99
100  -- | @'takeN_' n s@ should try to extract a chunk of length @n@, or if the
101  -- stream is too short, the rest of the stream. Valid implementation
102  -- should follow the rules:
103  --
104  --     * If the requested length @n@ is 0 (or less), 'Nothing' should
105  --       never be returned, instead @'Just' (\"\", s)@ should be returned,
106  --       where @\"\"@ stands for the empty chunk, and @s@ is the original
107  --       stream (second argument).
108  --     * If the requested length is greater than 0 and the stream is
109  --       empty, 'Nothing' should be returned indicating end of input.
110  --     * In other cases, take chunk of length @n@ (or shorter if the
111  --       stream is not long enough) from the input stream and return the
112  --       chunk along with the rest of the stream.
113  takeN_ :: Int -> s -> Maybe (Tokens s, s)
114
115  -- | Extract chunk of the stream taking tokens while the supplied
116  -- predicate returns 'True'. Return the chunk and the rest of the stream.
117  --
118  -- For many types of streams, the method allows for significant
119  -- performance improvements, although it is not strictly necessary from
120  -- conceptual point of view.
121  takeWhile_ :: (Token s -> Bool) -> s -> (Tokens s, s)
122
123-- | @since 9.0.0
124instance Ord a => Stream [a] where
125  type Token [a] = a
126  type Tokens [a] = [a]
127  tokenToChunk Proxy = pure
128  tokensToChunk Proxy = id
129  chunkToTokens Proxy = id
130  chunkLength Proxy = length
131  chunkEmpty Proxy = null
132  take1_ [] = Nothing
133  take1_ (t : ts) = Just (t, ts)
134  takeN_ n s
135    | n <= 0 = Just ([], s)
136    | null s = Nothing
137    | otherwise = Just (splitAt n s)
138  takeWhile_ = span
139
140-- | @since 9.0.0
141instance Ord a => Stream (S.Seq a) where
142  type Token (S.Seq a) = a
143  type Tokens (S.Seq a) = S.Seq a
144  tokenToChunk Proxy = pure
145  tokensToChunk Proxy = S.fromList
146  chunkToTokens Proxy = toList
147  chunkLength Proxy = length
148  chunkEmpty Proxy = null
149  take1_ S.Empty = Nothing
150  take1_ (t S.:<| ts) = Just (t, ts)
151  takeN_ n s
152    | n <= 0 = Just (S.empty, s)
153    | null s = Nothing
154    | otherwise = Just (S.splitAt n s)
155  takeWhile_ = S.spanl
156
157instance Stream B.ByteString where
158  type Token B.ByteString = Word8
159  type Tokens B.ByteString = B.ByteString
160  tokenToChunk Proxy = B.singleton
161  tokensToChunk Proxy = B.pack
162  chunkToTokens Proxy = B.unpack
163  chunkLength Proxy = B.length
164  chunkEmpty Proxy = B.null
165  take1_ = B.uncons
166  takeN_ n s
167    | n <= 0 = Just (B.empty, s)
168    | B.null s = Nothing
169    | otherwise = Just (B.splitAt n s)
170  takeWhile_ = B.span
171
172instance Stream BL.ByteString where
173  type Token BL.ByteString = Word8
174  type Tokens BL.ByteString = BL.ByteString
175  tokenToChunk Proxy = BL.singleton
176  tokensToChunk Proxy = BL.pack
177  chunkToTokens Proxy = BL.unpack
178  chunkLength Proxy = fromIntegral . BL.length
179  chunkEmpty Proxy = BL.null
180  take1_ = BL.uncons
181  takeN_ n s
182    | n <= 0 = Just (BL.empty, s)
183    | BL.null s = Nothing
184    | otherwise = Just (BL.splitAt (fromIntegral n) s)
185  takeWhile_ = BL.span
186
187instance Stream T.Text where
188  type Token T.Text = Char
189  type Tokens T.Text = T.Text
190  tokenToChunk Proxy = T.singleton
191  tokensToChunk Proxy = T.pack
192  chunkToTokens Proxy = T.unpack
193  chunkLength Proxy = T.length
194  chunkEmpty Proxy = T.null
195  take1_ = T.uncons
196  takeN_ n s
197    | n <= 0 = Just (T.empty, s)
198    | T.null s = Nothing
199    | otherwise = Just (T.splitAt n s)
200  takeWhile_ = T.span
201
202instance Stream TL.Text where
203  type Token TL.Text = Char
204  type Tokens TL.Text = TL.Text
205  tokenToChunk Proxy = TL.singleton
206  tokensToChunk Proxy = TL.pack
207  chunkToTokens Proxy = TL.unpack
208  chunkLength Proxy = fromIntegral . TL.length
209  chunkEmpty Proxy = TL.null
210  take1_ = TL.uncons
211  takeN_ n s
212    | n <= 0 = Just (TL.empty, s)
213    | TL.null s = Nothing
214    | otherwise = Just (TL.splitAt (fromIntegral n) s)
215  takeWhile_ = TL.span
216
217-- | Type class for inputs that can also be used for debugging.
218--
219-- @since 9.0.0
220class Stream s => VisualStream s where
221  -- | Pretty-print non-empty stream of tokens. This function is also used
222  -- to print single tokens (represented as singleton lists).
223  --
224  -- @since 7.0.0
225  showTokens :: Proxy s -> NonEmpty (Token s) -> String
226
227  -- | Return the number of characters that a non-empty stream of tokens
228  -- spans. The default implementation is sufficient if every token spans
229  -- exactly 1 character.
230  --
231  -- @since 8.0.0
232  tokensLength :: Proxy s -> NonEmpty (Token s) -> Int
233  tokensLength Proxy = NE.length
234
235instance VisualStream String where
236  showTokens Proxy = stringPretty
237
238instance VisualStream B.ByteString where
239  showTokens Proxy = stringPretty . fmap (chr . fromIntegral)
240
241instance VisualStream BL.ByteString where
242  showTokens Proxy = stringPretty . fmap (chr . fromIntegral)
243
244instance VisualStream T.Text where
245  showTokens Proxy = stringPretty
246
247instance VisualStream TL.Text where
248  showTokens Proxy = stringPretty
249
250-- | Type class for inputs that can also be used for error reporting.
251--
252-- @since 9.0.0
253class Stream s => TraversableStream s where
254  {-# MINIMAL reachOffset | reachOffsetNoLine #-}
255
256  -- | Given an offset @o@ and initial 'PosState', adjust the state in such
257  -- a way that it starts at the offset.
258  --
259  -- Return two values (in order):
260  --
261  --     * 'Maybe' 'String' representing the line on which the given offset
262  --       @o@ is located. It can be omitted (i.e. 'Nothing'); in that case
263  --       error reporting functions will not show offending lines. If
264  --       returned, the line should satisfy a number of conditions that are
265  --       described below.
266  --     * The updated 'PosState' which can be in turn used to locate
267  --       another offset @o'@ given that @o' >= o@.
268  --
269  -- The 'String' representing the offending line in input stream should
270  -- satisfy the following:
271  --
272  --     * It should adequately represent location of token at the offset of
273  --       interest, that is, character at 'sourceColumn' of the returned
274  --       'SourcePos' should correspond to the token at the offset @o@.
275  --     * It should not include the newline at the end.
276  --     * It should not be empty, if the line happens to be empty, it
277  --       should be replaced with the string @\"\<empty line\>\"@.
278  --     * Tab characters should be replaced by appropriate number of
279  --       spaces, which is determined by the 'pstateTabWidth' field of
280  --       'PosState'.
281  --
282  -- __Note__: type signature of the function was changed in the version
283  -- /9.0.0/.
284  --
285  -- @since 7.0.0
286  reachOffset ::
287    -- | Offset to reach
288    Int ->
289    -- | Initial 'PosState' to use
290    PosState s ->
291    -- | See the description of the function
292    (Maybe String, PosState s)
293  reachOffset o pst =
294    (Nothing, reachOffsetNoLine o pst)
295
296  -- | A version of 'reachOffset' that may be faster because it doesn't need
297  -- to fetch the line at which the given offset in located.
298  --
299  -- The default implementation is this:
300  --
301  -- > reachOffsetNoLine o pst =
302  -- >   snd (reachOffset o pst)
303  --
304  -- __Note__: type signature of the function was changed in the version
305  -- /8.0.0/.
306  --
307  -- @since 7.0.0
308  reachOffsetNoLine ::
309    -- | Offset to reach
310    Int ->
311    -- | Initial 'PosState' to use
312    PosState s ->
313    -- | Reached source position and updated state
314    PosState s
315  reachOffsetNoLine o pst =
316    snd (reachOffset o pst)
317
318instance TraversableStream String where
319  -- NOTE Do not eta-reduce these (breaks inlining)
320  reachOffset o pst =
321    reachOffset' splitAt foldl' id id ('\n', '\t') o pst
322  reachOffsetNoLine o pst =
323    reachOffsetNoLine' splitAt foldl' ('\n', '\t') o pst
324
325instance TraversableStream B.ByteString where
326  -- NOTE Do not eta-reduce these (breaks inlining)
327  reachOffset o pst =
328    reachOffset' B.splitAt B.foldl' B8.unpack (chr . fromIntegral) (10, 9) o pst
329  reachOffsetNoLine o pst =
330    reachOffsetNoLine' B.splitAt B.foldl' (10, 9) o pst
331
332instance TraversableStream BL.ByteString where
333  -- NOTE Do not eta-reduce these (breaks inlining)
334  reachOffset o pst =
335    reachOffset' splitAtBL BL.foldl' BL8.unpack (chr . fromIntegral) (10, 9) o pst
336  reachOffsetNoLine o pst =
337    reachOffsetNoLine' splitAtBL BL.foldl' (10, 9) o pst
338
339instance TraversableStream T.Text where
340  -- NOTE Do not eta-reduce (breaks inlining of reachOffset').
341  reachOffset o pst =
342    reachOffset' T.splitAt T.foldl' T.unpack id ('\n', '\t') o pst
343  reachOffsetNoLine o pst =
344    reachOffsetNoLine' T.splitAt T.foldl' ('\n', '\t') o pst
345
346instance TraversableStream TL.Text where
347  -- NOTE Do not eta-reduce (breaks inlining of reachOffset').
348  reachOffset o pst =
349    reachOffset' splitAtTL TL.foldl' TL.unpack id ('\n', '\t') o pst
350  reachOffsetNoLine o pst =
351    reachOffsetNoLine' splitAtTL TL.foldl' ('\n', '\t') o pst
352
353----------------------------------------------------------------------------
354-- Helpers
355
356-- | An internal helper state type combining a difference 'String' and an
357-- unboxed 'SourcePos'.
358data St = St SourcePos ShowS
359
360-- | A helper definition to facilitate defining 'reachOffset' for various
361-- stream types.
362reachOffset' ::
363  forall s.
364  Stream s =>
365  -- | How to split input stream at given offset
366  (Int -> s -> (Tokens s, s)) ->
367  -- | How to fold over input stream
368  (forall b. (b -> Token s -> b) -> b -> Tokens s -> b) ->
369  -- | How to convert chunk of input stream into a 'String'
370  (Tokens s -> String) ->
371  -- | How to convert a token into a 'Char'
372  (Token s -> Char) ->
373  -- | Newline token and tab token
374  (Token s, Token s) ->
375  -- | Offset to reach
376  Int ->
377  -- | Initial 'PosState' to use
378  PosState s ->
379  -- | Line at which 'SourcePos' is located, updated 'PosState'
380  (Maybe String, PosState s)
381reachOffset'
382  splitAt'
383  foldl''
384  fromToks
385  fromTok
386  (newlineTok, tabTok)
387  o
388  PosState {..} =
389    ( Just $ case expandTab pstateTabWidth
390        . addPrefix
391        . f
392        . fromToks
393        . fst
394        $ takeWhile_ (/= newlineTok) post of
395        "" -> "<empty line>"
396        xs -> xs,
397      PosState
398        { pstateInput = post,
399          pstateOffset = max pstateOffset o,
400          pstateSourcePos = spos,
401          pstateTabWidth = pstateTabWidth,
402          pstateLinePrefix =
403            if sameLine
404              then -- NOTE We don't use difference lists here because it's
405              -- desirable for 'PosState' to be an instance of 'Eq' and
406              -- 'Show'. So we just do appending here. Fortunately several
407              -- parse errors on the same line should be relatively rare.
408                pstateLinePrefix ++ f ""
409              else f ""
410        }
411    )
412    where
413      addPrefix xs =
414        if sameLine
415          then pstateLinePrefix ++ xs
416          else xs
417      sameLine = sourceLine spos == sourceLine pstateSourcePos
418      (pre, post) = splitAt' (o - pstateOffset) pstateInput
419      St spos f = foldl'' go (St pstateSourcePos id) pre
420      go (St apos g) ch =
421        let SourcePos n l c = apos
422            c' = unPos c
423            w = unPos pstateTabWidth
424         in if
425                | ch == newlineTok ->
426                  St
427                    (SourcePos n (l <> pos1) pos1)
428                    id
429                | ch == tabTok ->
430                  St
431                    (SourcePos n l (mkPos $ c' + w - ((c' - 1) `rem` w)))
432                    (g . (fromTok ch :))
433                | otherwise ->
434                  St
435                    (SourcePos n l (c <> pos1))
436                    (g . (fromTok ch :))
437{-# INLINE reachOffset' #-}
438
439-- | Like 'reachOffset'' but for 'reachOffsetNoLine'.
440reachOffsetNoLine' ::
441  forall s.
442  Stream s =>
443  -- | How to split input stream at given offset
444  (Int -> s -> (Tokens s, s)) ->
445  -- | How to fold over input stream
446  (forall b. (b -> Token s -> b) -> b -> Tokens s -> b) ->
447  -- | Newline token and tab token
448  (Token s, Token s) ->
449  -- | Offset to reach
450  Int ->
451  -- | Initial 'PosState' to use
452  PosState s ->
453  -- | Updated 'PosState'
454  PosState s
455reachOffsetNoLine'
456  splitAt'
457  foldl''
458  (newlineTok, tabTok)
459  o
460  PosState {..} =
461    ( PosState
462        { pstateInput = post,
463          pstateOffset = max pstateOffset o,
464          pstateSourcePos = spos,
465          pstateTabWidth = pstateTabWidth,
466          pstateLinePrefix = pstateLinePrefix
467        }
468    )
469    where
470      spos = foldl'' go pstateSourcePos pre
471      (pre, post) = splitAt' (o - pstateOffset) pstateInput
472      go (SourcePos n l c) ch =
473        let c' = unPos c
474            w = unPos pstateTabWidth
475         in if
476                | ch == newlineTok ->
477                  SourcePos n (l <> pos1) pos1
478                | ch == tabTok ->
479                  SourcePos n l (mkPos $ c' + w - ((c' - 1) `rem` w))
480                | otherwise ->
481                  SourcePos n l (c <> pos1)
482{-# INLINE reachOffsetNoLine' #-}
483
484-- | Like 'BL.splitAt' but accepts the index as an 'Int'.
485splitAtBL :: Int -> BL.ByteString -> (BL.ByteString, BL.ByteString)
486splitAtBL n = BL.splitAt (fromIntegral n)
487{-# INLINE splitAtBL #-}
488
489-- | Like 'TL.splitAt' but accepts the index as an 'Int'.
490splitAtTL :: Int -> TL.Text -> (TL.Text, TL.Text)
491splitAtTL n = TL.splitAt (fromIntegral n)
492{-# INLINE splitAtTL #-}
493
494-- | @stringPretty s@ returns pretty representation of string @s@. This is
495-- used when printing string tokens in error messages.
496stringPretty :: NonEmpty Char -> String
497stringPretty (x :| []) = charPretty x
498stringPretty ('\r' :| "\n") = "crlf newline"
499stringPretty xs = "\"" <> concatMap f (NE.toList xs) <> "\""
500  where
501    f ch =
502      case charPretty' ch of
503        Nothing -> [ch]
504        Just pretty -> "<" <> pretty <> ">"
505
506-- | @charPretty ch@ returns user-friendly string representation of given
507-- character @ch@, suitable for using in error messages.
508charPretty :: Char -> String
509charPretty ' ' = "space"
510charPretty ch = fromMaybe ("'" <> [ch] <> "'") (charPretty' ch)
511
512-- | If the given character has a pretty representation, return that,
513-- otherwise 'Nothing'. This is an internal helper.
514charPretty' :: Char -> Maybe String
515charPretty' = \case
516  '\NUL' -> Just "null"
517  '\SOH' -> Just "start of heading"
518  '\STX' -> Just "start of text"
519  '\ETX' -> Just "end of text"
520  '\EOT' -> Just "end of transmission"
521  '\ENQ' -> Just "enquiry"
522  '\ACK' -> Just "acknowledge"
523  '\BEL' -> Just "bell"
524  '\BS' -> Just "backspace"
525  '\t' -> Just "tab"
526  '\n' -> Just "newline"
527  '\v' -> Just "vertical tab"
528  '\f' -> Just "form feed"
529  '\r' -> Just "carriage return"
530  '\SO' -> Just "shift out"
531  '\SI' -> Just "shift in"
532  '\DLE' -> Just "data link escape"
533  '\DC1' -> Just "device control one"
534  '\DC2' -> Just "device control two"
535  '\DC3' -> Just "device control three"
536  '\DC4' -> Just "device control four"
537  '\NAK' -> Just "negative acknowledge"
538  '\SYN' -> Just "synchronous idle"
539  '\ETB' -> Just "end of transmission block"
540  '\CAN' -> Just "cancel"
541  '\EM' -> Just "end of medium"
542  '\SUB' -> Just "substitute"
543  '\ESC' -> Just "escape"
544  '\FS' -> Just "file separator"
545  '\GS' -> Just "group separator"
546  '\RS' -> Just "record separator"
547  '\US' -> Just "unit separator"
548  '\DEL' -> Just "delete"
549  '\160' -> Just "non-breaking space"
550  _ -> Nothing
551
552-- | Replace tab characters with given number of spaces.
553expandTab ::
554  Pos ->
555  String ->
556  String
557expandTab w' = go 0
558  where
559    go 0 [] = []
560    go 0 ('\t' : xs) = go w xs
561    go 0 (x : xs) = x : go 0 xs
562    go n xs = ' ' : go (n - 1) xs
563    w = unPos w'
564