1-- |
2-- Module      :  Text.Megaparsec.Stream
3-- Copyright   :  © 2015–2019 Megaparsec contributors
4-- License     :  FreeBSD
5--
6-- Maintainer  :  Mark Karpov <markkarpov92@gmail.com>
7-- Stability   :  experimental
8-- Portability :  portable
9--
10-- Megaparsec's input stream facilities.
11--
12-- You probably do not want to import this module directly because
13-- "Text.Megaparsec" re-exports it anyway.
14--
15-- @since 6.0.0
16
17{-# LANGUAGE CPP                 #-}
18{-# LANGUAGE FlexibleContexts    #-}
19{-# LANGUAGE FlexibleInstances   #-}
20{-# LANGUAGE LambdaCase          #-}
21{-# LANGUAGE MultiWayIf          #-}
22{-# LANGUAGE RankNTypes          #-}
23{-# LANGUAGE RecordWildCards     #-}
24{-# LANGUAGE ScopedTypeVariables #-}
25{-# LANGUAGE TypeFamilies        #-}
26
27module Text.Megaparsec.Stream
28  ( Stream (..) )
29where
30
31import Data.Char (chr)
32import Data.Foldable (foldl')
33import Data.List.NonEmpty (NonEmpty (..))
34import Data.Maybe (fromMaybe)
35import Data.Proxy
36import Data.Word (Word8)
37import Text.Megaparsec.Pos
38import Text.Megaparsec.State
39import qualified Data.ByteString            as B
40import qualified Data.ByteString.Char8      as B8
41import qualified Data.ByteString.Lazy       as BL
42import qualified Data.ByteString.Lazy.Char8 as BL8
43import qualified Data.List.NonEmpty         as NE
44import qualified Data.Text                  as T
45import qualified Data.Text.Lazy             as TL
46
47#if !MIN_VERSION_base(4,11,0)
48import Data.Semigroup
49#endif
50
51-- | Type class for inputs that can be consumed by the library.
52
53class (Ord (Token s), Ord (Tokens s)) => Stream s where
54
55  -- | Type of token in the stream.
56
57  type Token s :: *
58
59  -- | Type of “chunk” of the stream.
60
61  type Tokens s :: *
62
63  -- | Lift a single token to chunk of the stream. The default
64  -- implementation is:
65  --
66  -- > tokenToChunk pxy = tokensToChunk pxy . pure
67  --
68  -- However for some types of stream there may be a more efficient way to
69  -- lift.
70
71  tokenToChunk  :: Proxy s -> Token s -> Tokens s
72  tokenToChunk pxy = tokensToChunk pxy . pure
73
74  -- | The first method that establishes isomorphism between list of tokens
75  -- and chunk of the stream. Valid implementation should satisfy:
76  --
77  -- > chunkToTokens pxy (tokensToChunk pxy ts) == ts
78
79  tokensToChunk :: Proxy s -> [Token s] -> Tokens s
80
81  -- | The second method that establishes isomorphism between list of tokens
82  -- and chunk of the stream. Valid implementation should satisfy:
83  --
84  -- > tokensToChunk pxy (chunkToTokens pxy chunk) == chunk
85
86  chunkToTokens :: Proxy s -> Tokens s -> [Token s]
87
88  -- | Return length of a chunk of the stream.
89
90  chunkLength :: Proxy s -> Tokens s -> Int
91
92  -- | Check if a chunk of the stream is empty. The default implementation
93  -- is in terms of the more general 'chunkLength':
94  --
95  -- > chunkEmpty pxy ts = chunkLength pxy ts <= 0
96  --
97  -- However for many streams there may be a more efficient implementation.
98
99  chunkEmpty :: Proxy s -> Tokens s -> Bool
100  chunkEmpty pxy ts = chunkLength pxy ts <= 0
101
102  -- | Extract a single token form the stream. Return 'Nothing' if the
103  -- stream is empty.
104
105  take1_ :: s -> Maybe (Token s, s)
106
107  -- | @'takeN_' n s@ should try to extract a chunk of length @n@, or if the
108  -- stream is too short, the rest of the stream. Valid implementation
109  -- should follow the rules:
110  --
111  --     * If the requested length @n@ is 0 (or less), 'Nothing' should
112  --       never be returned, instead @'Just' (\"\", s)@ should be returned,
113  --       where @\"\"@ stands for the empty chunk, and @s@ is the original
114  --       stream (second argument).
115  --     * If the requested length is greater than 0 and the stream is
116  --       empty, 'Nothing' should be returned indicating end of input.
117  --     * In other cases, take chunk of length @n@ (or shorter if the
118  --       stream is not long enough) from the input stream and return the
119  --       chunk along with the rest of the stream.
120
121  takeN_ :: Int -> s -> Maybe (Tokens s, s)
122
123  -- | Extract chunk of the stream taking tokens while the supplied
124  -- predicate returns 'True'. Return the chunk and the rest of the stream.
125  --
126  -- For many types of streams, the method allows for significant
127  -- performance improvements, although it is not strictly necessary from
128  -- conceptual point of view.
129
130  takeWhile_ :: (Token s -> Bool) -> s -> (Tokens s, s)
131
132  -- | Pretty-print non-empty stream of tokens. This function is also used
133  -- to print single tokens (represented as singleton lists).
134  --
135  -- @since 7.0.0
136
137  showTokens :: Proxy s -> NonEmpty (Token s) -> String
138
139  -- | Given an offset @o@ and initial 'PosState', adjust the state in such
140  -- a way that it starts at the offset.
141  --
142  -- Return three values (in order):
143  --
144  --     * 'SourcePos' which the given offset @o@ points to.
145  --     * 'String' representing the line on which the given offset @o@ is
146  --       located. The line should satisfy a number of conditions that are
147  --       described below.
148  --     * The updated 'PosState' which can be in turn used to locate
149  --       another offset @o'@ given that @o' >= o@.
150  --
151  -- The 'String' representing the offending line in input stream should
152  -- satisfy the following:
153  --
154  --     * It should adequately represent location of token at the offset of
155  --       interest, that is, character at 'sourceColumn' of the returned
156  --       'SourcePos' should correspond to the token at the offset @o@.
157  --     * It should not include the newline at the end.
158  --     * It should not be empty, if the line happens to be empty, it
159  --       should be replaced with the string @\"\<empty line\>\"@.
160  --     * Tab characters should be replaced by appropriate number of
161  --       spaces, which is determined by the 'pstateTabWidth' field of
162  --       'PosState'.
163  --
164  -- @since 7.0.0
165
166  reachOffset
167    :: Int             -- ^ Offset to reach
168    -> PosState s      -- ^ Initial 'PosState' to use
169    -> (SourcePos, String, PosState s) -- ^ (See below)
170
171  -- | A version of 'reachOffset' that may be faster because it doesn't need
172  -- to fetch the line at which the given offset in located.
173  --
174  -- The default implementation is this:
175  --
176  -- > reachOffsetNoLine o pst =
177  -- >   let (spos, _, pst')=  reachOffset o pst
178  -- >   in (spos, pst')
179  --
180  -- @since 7.0.0
181
182  reachOffsetNoLine
183    :: Int             -- ^ Offset to reach
184    -> PosState s      -- ^ Initial 'PosState' to use
185    -> (SourcePos, PosState s) -- ^ Reached source position and updated state
186  reachOffsetNoLine o pst =
187    let (spos, _, pst') = reachOffset o pst
188    in (spos, pst')
189
190instance Stream String where
191  type Token String = Char
192  type Tokens String = String
193  tokenToChunk Proxy = pure
194  tokensToChunk Proxy = id
195  chunkToTokens Proxy = id
196  chunkLength Proxy = length
197  chunkEmpty Proxy = null
198  take1_ [] = Nothing
199  take1_ (t:ts) = Just (t, ts)
200  takeN_ n s
201    | n <= 0    = Just ("", s)
202    | null s    = Nothing
203    | otherwise = Just (splitAt n s)
204  takeWhile_ = span
205  showTokens Proxy = stringPretty
206  -- NOTE Do not eta-reduce these (breaks inlining)
207  reachOffset o pst =
208    reachOffset' splitAt foldl' id id ('\n','\t') o pst
209  reachOffsetNoLine o pst =
210    reachOffsetNoLine' splitAt foldl' ('\n', '\t') o pst
211
212instance Stream B.ByteString where
213  type Token B.ByteString = Word8
214  type Tokens B.ByteString = B.ByteString
215  tokenToChunk Proxy = B.singleton
216  tokensToChunk Proxy = B.pack
217  chunkToTokens Proxy = B.unpack
218  chunkLength Proxy = B.length
219  chunkEmpty Proxy = B.null
220  take1_ = B.uncons
221  takeN_ n s
222    | n <= 0    = Just (B.empty, s)
223    | B.null s  = Nothing
224    | otherwise = Just (B.splitAt n s)
225  takeWhile_ = B.span
226  showTokens Proxy = stringPretty . fmap (chr . fromIntegral)
227  -- NOTE Do not eta-reduce these (breaks inlining)
228  reachOffset o pst =
229    reachOffset' B.splitAt B.foldl' B8.unpack (chr . fromIntegral) (10, 9) o pst
230  reachOffsetNoLine o pst =
231    reachOffsetNoLine' B.splitAt B.foldl' (10, 9) o pst
232
233instance Stream BL.ByteString where
234  type Token BL.ByteString = Word8
235  type Tokens BL.ByteString = BL.ByteString
236  tokenToChunk Proxy = BL.singleton
237  tokensToChunk Proxy = BL.pack
238  chunkToTokens Proxy = BL.unpack
239  chunkLength Proxy = fromIntegral . BL.length
240  chunkEmpty Proxy = BL.null
241  take1_ = BL.uncons
242  takeN_ n s
243    | n <= 0    = Just (BL.empty, s)
244    | BL.null s = Nothing
245    | otherwise = Just (BL.splitAt (fromIntegral n) s)
246  takeWhile_ = BL.span
247  showTokens Proxy = stringPretty . fmap (chr . fromIntegral)
248  -- NOTE Do not eta-reduce these (breaks inlining)
249  reachOffset o pst =
250    reachOffset' splitAtBL BL.foldl' BL8.unpack (chr . fromIntegral) (10, 9) o pst
251  reachOffsetNoLine o pst =
252    reachOffsetNoLine' splitAtBL BL.foldl' (10, 9) o pst
253
254instance Stream T.Text where
255  type Token T.Text = Char
256  type Tokens T.Text = T.Text
257  tokenToChunk Proxy = T.singleton
258  tokensToChunk Proxy = T.pack
259  chunkToTokens Proxy = T.unpack
260  chunkLength Proxy = T.length
261  chunkEmpty Proxy = T.null
262  take1_ = T.uncons
263  takeN_ n s
264    | n <= 0    = Just (T.empty, s)
265    | T.null s  = Nothing
266    | otherwise = Just (T.splitAt n s)
267  takeWhile_ = T.span
268  showTokens Proxy = stringPretty
269  -- NOTE Do not eta-reduce (breaks inlining of reachOffset').
270  reachOffset o pst =
271    reachOffset' T.splitAt T.foldl' T.unpack id ('\n', '\t') o pst
272  reachOffsetNoLine o pst =
273    reachOffsetNoLine' T.splitAt T.foldl' ('\n', '\t') o pst
274
275instance Stream TL.Text where
276  type Token TL.Text  = Char
277  type Tokens TL.Text = TL.Text
278  tokenToChunk Proxy = TL.singleton
279  tokensToChunk Proxy = TL.pack
280  chunkToTokens Proxy = TL.unpack
281  chunkLength Proxy = fromIntegral . TL.length
282  chunkEmpty Proxy = TL.null
283  take1_ = TL.uncons
284  takeN_ n s
285    | n <= 0    = Just (TL.empty, s)
286    | TL.null s = Nothing
287    | otherwise = Just (TL.splitAt (fromIntegral n) s)
288  takeWhile_ = TL.span
289  showTokens Proxy = stringPretty
290  -- NOTE Do not eta-reduce (breaks inlining of reachOffset').
291  reachOffset o pst =
292    reachOffset' splitAtTL TL.foldl' TL.unpack id ('\n', '\t') o pst
293  reachOffsetNoLine o pst =
294    reachOffsetNoLine' splitAtTL TL.foldl' ('\n', '\t') o pst
295
296----------------------------------------------------------------------------
297-- Helpers
298
299-- | An internal helper state type combining a difference 'String' and an
300-- unboxed 'SourcePos'.
301
302data St = St SourcePos ShowS
303
304-- | A helper definition to facilitate defining 'reachOffset' for various
305-- stream types.
306
307reachOffset'
308  :: forall s. Stream s
309  => (Int -> s -> (Tokens s, s))
310     -- ^ How to split input stream at given offset
311  -> (forall b. (b -> Token s -> b) -> b -> Tokens s -> b)
312     -- ^ How to fold over input stream
313  -> (Tokens s -> String)
314     -- ^ How to convert chunk of input stream into a 'String'
315  -> (Token s -> Char)
316     -- ^ How to convert a token into a 'Char'
317  -> (Token s, Token s)
318     -- ^ Newline token and tab token
319  -> Int
320     -- ^ Offset to reach
321  -> PosState s
322     -- ^ Initial 'PosState' to use
323  -> (SourcePos, String, PosState s)
324     -- ^ Reached 'SourcePos', line at which 'SourcePos' is located, updated
325     -- 'PosState'
326reachOffset' splitAt'
327             foldl''
328             fromToks
329             fromTok
330             (newlineTok, tabTok)
331             o
332             PosState {..} =
333  ( spos
334  , case expandTab pstateTabWidth
335           . addPrefix
336           . f
337           . fromToks
338           . fst
339           $ takeWhile_ (/= newlineTok) post of
340      "" -> "<empty line>"
341      xs -> xs
342  , PosState
343      { pstateInput = post
344      , pstateOffset = max pstateOffset o
345      , pstateSourcePos = spos
346      , pstateTabWidth = pstateTabWidth
347      , pstateLinePrefix =
348          if sameLine
349            -- NOTE We don't use difference lists here because it's
350            -- desirable for 'PosState' to be an instance of 'Eq' and
351            -- 'Show'. So we just do appending here. Fortunately several
352            -- parse errors on the same line should be relatively rare.
353            then pstateLinePrefix ++ f ""
354            else f ""
355      }
356  )
357  where
358    addPrefix xs =
359      if sameLine
360        then pstateLinePrefix ++ xs
361        else xs
362    sameLine = sourceLine spos == sourceLine pstateSourcePos
363    (pre, post) = splitAt' (o - pstateOffset) pstateInput
364    St spos f = foldl'' go (St pstateSourcePos id) pre
365    go (St apos g) ch =
366      let SourcePos n l c = apos
367          c' = unPos c
368          w  = unPos pstateTabWidth
369      in if | ch == newlineTok ->
370                St (SourcePos n (l <> pos1) pos1)
371                   id
372            | ch == tabTok ->
373                St (SourcePos n l (mkPos $ c' + w - ((c' - 1) `rem` w)))
374                   (g . (fromTok ch :))
375            | otherwise ->
376                St (SourcePos n l (c <> pos1))
377                   (g . (fromTok ch :))
378{-# INLINE reachOffset' #-}
379
380-- | Like 'reachOffset'' but for 'reachOffsetNoLine'.
381
382reachOffsetNoLine'
383  :: forall s. Stream s
384  => (Int -> s -> (Tokens s, s))
385     -- ^ How to split input stream at given offset
386  -> (forall b. (b -> Token s -> b) -> b -> Tokens s -> b)
387     -- ^ How to fold over input stream
388  -> (Token s, Token s)
389     -- ^ Newline token and tab token
390  -> Int
391     -- ^ Offset to reach
392  -> PosState s
393     -- ^ Initial 'PosState' to use
394  -> (SourcePos, PosState s)
395     -- ^ Reached 'SourcePos' and updated 'PosState'
396reachOffsetNoLine' splitAt'
397                   foldl''
398                   (newlineTok, tabTok)
399                   o
400                   PosState {..} =
401  ( spos
402  , PosState
403      { pstateInput = post
404      , pstateOffset = max pstateOffset o
405      , pstateSourcePos = spos
406      , pstateTabWidth = pstateTabWidth
407      , pstateLinePrefix = pstateLinePrefix
408      }
409  )
410  where
411    spos = foldl'' go pstateSourcePos pre
412    (pre, post) = splitAt' (o - pstateOffset) pstateInput
413    go (SourcePos n l c) ch =
414      let c' = unPos c
415          w  = unPos pstateTabWidth
416      in if | ch == newlineTok ->
417                SourcePos n (l <> pos1) pos1
418            | ch == tabTok ->
419                SourcePos n l (mkPos $ c' + w - ((c' - 1) `rem` w))
420            | otherwise ->
421                SourcePos n l (c <> pos1)
422{-# INLINE reachOffsetNoLine' #-}
423
424-- | Like 'BL.splitAt' but accepts the index as an 'Int'.
425
426splitAtBL :: Int -> BL.ByteString -> (BL.ByteString, BL.ByteString)
427splitAtBL n = BL.splitAt (fromIntegral n)
428{-# INLINE splitAtBL #-}
429
430-- | Like 'TL.splitAt' but accepts the index as an 'Int'.
431
432splitAtTL :: Int -> TL.Text -> (TL.Text, TL.Text)
433splitAtTL n = TL.splitAt (fromIntegral n)
434{-# INLINE splitAtTL #-}
435
436-- | @stringPretty s@ returns pretty representation of string @s@. This is
437-- used when printing string tokens in error messages.
438
439stringPretty :: NonEmpty Char -> String
440stringPretty (x:|[])      = charPretty x
441stringPretty ('\r':|"\n") = "crlf newline"
442stringPretty xs           = "\"" <> concatMap f (NE.toList xs) <> "\""
443  where
444    f ch =
445      case charPretty' ch of
446        Nothing     -> [ch]
447        Just pretty -> "<" <> pretty <> ">"
448
449-- | @charPretty ch@ returns user-friendly string representation of given
450-- character @ch@, suitable for using in error messages.
451
452charPretty :: Char -> String
453charPretty ' ' = "space"
454charPretty ch = fromMaybe ("'" <> [ch] <> "'") (charPretty' ch)
455
456-- | If the given character has a pretty representation, return that,
457-- otherwise 'Nothing'. This is an internal helper.
458
459charPretty' :: Char -> Maybe String
460charPretty' = \case
461  '\NUL' -> Just "null"
462  '\SOH' -> Just "start of heading"
463  '\STX' -> Just "start of text"
464  '\ETX' -> Just "end of text"
465  '\EOT' -> Just "end of transmission"
466  '\ENQ' -> Just "enquiry"
467  '\ACK' -> Just "acknowledge"
468  '\BEL' -> Just "bell"
469  '\BS'  -> Just "backspace"
470  '\t'   -> Just "tab"
471  '\n'   -> Just "newline"
472  '\v'   -> Just "vertical tab"
473  '\f'   -> Just "form feed"
474  '\r'   -> Just "carriage return"
475  '\SO'  -> Just "shift out"
476  '\SI'  -> Just "shift in"
477  '\DLE' -> Just "data link escape"
478  '\DC1' -> Just "device control one"
479  '\DC2' -> Just "device control two"
480  '\DC3' -> Just "device control three"
481  '\DC4' -> Just "device control four"
482  '\NAK' -> Just "negative acknowledge"
483  '\SYN' -> Just "synchronous idle"
484  '\ETB' -> Just "end of transmission block"
485  '\CAN' -> Just "cancel"
486  '\EM'  -> Just "end of medium"
487  '\SUB' -> Just "substitute"
488  '\ESC' -> Just "escape"
489  '\FS'  -> Just "file separator"
490  '\GS'  -> Just "group separator"
491  '\RS'  -> Just "record separator"
492  '\US'  -> Just "unit separator"
493  '\DEL' -> Just "delete"
494  '\160' -> Just "non-breaking space"
495  _      -> Nothing
496
497-- | Replace tab characters with given number of spaces.
498
499expandTab
500  :: Pos
501  -> String
502  -> String
503expandTab w' = go 0
504  where
505    go 0 []        = []
506    go 0 ('\t':xs) = go w xs
507    go 0 (x:xs)    = x : go 0 xs
508    go n xs        = ' ' : go (n - 1) xs
509    w              = unPos w'
510