1-- | 2-- Module : Text.Megaparsec.Stream 3-- Copyright : © 2015–2019 Megaparsec contributors 4-- License : FreeBSD 5-- 6-- Maintainer : Mark Karpov <markkarpov92@gmail.com> 7-- Stability : experimental 8-- Portability : portable 9-- 10-- Megaparsec's input stream facilities. 11-- 12-- You probably do not want to import this module directly because 13-- "Text.Megaparsec" re-exports it anyway. 14-- 15-- @since 6.0.0 16 17{-# LANGUAGE CPP #-} 18{-# LANGUAGE FlexibleContexts #-} 19{-# LANGUAGE FlexibleInstances #-} 20{-# LANGUAGE LambdaCase #-} 21{-# LANGUAGE MultiWayIf #-} 22{-# LANGUAGE RankNTypes #-} 23{-# LANGUAGE RecordWildCards #-} 24{-# LANGUAGE ScopedTypeVariables #-} 25{-# LANGUAGE TypeFamilies #-} 26 27module Text.Megaparsec.Stream 28 ( Stream (..) ) 29where 30 31import Data.Char (chr) 32import Data.Foldable (foldl') 33import Data.List.NonEmpty (NonEmpty (..)) 34import Data.Maybe (fromMaybe) 35import Data.Proxy 36import Data.Word (Word8) 37import Text.Megaparsec.Pos 38import Text.Megaparsec.State 39import qualified Data.ByteString as B 40import qualified Data.ByteString.Char8 as B8 41import qualified Data.ByteString.Lazy as BL 42import qualified Data.ByteString.Lazy.Char8 as BL8 43import qualified Data.List.NonEmpty as NE 44import qualified Data.Text as T 45import qualified Data.Text.Lazy as TL 46 47#if !MIN_VERSION_base(4,11,0) 48import Data.Semigroup 49#endif 50 51-- | Type class for inputs that can be consumed by the library. 52 53class (Ord (Token s), Ord (Tokens s)) => Stream s where 54 55 -- | Type of token in the stream. 56 57 type Token s :: * 58 59 -- | Type of “chunk” of the stream. 60 61 type Tokens s :: * 62 63 -- | Lift a single token to chunk of the stream. The default 64 -- implementation is: 65 -- 66 -- > tokenToChunk pxy = tokensToChunk pxy . pure 67 -- 68 -- However for some types of stream there may be a more efficient way to 69 -- lift. 70 71 tokenToChunk :: Proxy s -> Token s -> Tokens s 72 tokenToChunk pxy = tokensToChunk pxy . pure 73 74 -- | The first method that establishes isomorphism between list of tokens 75 -- and chunk of the stream. Valid implementation should satisfy: 76 -- 77 -- > chunkToTokens pxy (tokensToChunk pxy ts) == ts 78 79 tokensToChunk :: Proxy s -> [Token s] -> Tokens s 80 81 -- | The second method that establishes isomorphism between list of tokens 82 -- and chunk of the stream. Valid implementation should satisfy: 83 -- 84 -- > tokensToChunk pxy (chunkToTokens pxy chunk) == chunk 85 86 chunkToTokens :: Proxy s -> Tokens s -> [Token s] 87 88 -- | Return length of a chunk of the stream. 89 90 chunkLength :: Proxy s -> Tokens s -> Int 91 92 -- | Check if a chunk of the stream is empty. The default implementation 93 -- is in terms of the more general 'chunkLength': 94 -- 95 -- > chunkEmpty pxy ts = chunkLength pxy ts <= 0 96 -- 97 -- However for many streams there may be a more efficient implementation. 98 99 chunkEmpty :: Proxy s -> Tokens s -> Bool 100 chunkEmpty pxy ts = chunkLength pxy ts <= 0 101 102 -- | Extract a single token form the stream. Return 'Nothing' if the 103 -- stream is empty. 104 105 take1_ :: s -> Maybe (Token s, s) 106 107 -- | @'takeN_' n s@ should try to extract a chunk of length @n@, or if the 108 -- stream is too short, the rest of the stream. Valid implementation 109 -- should follow the rules: 110 -- 111 -- * If the requested length @n@ is 0 (or less), 'Nothing' should 112 -- never be returned, instead @'Just' (\"\", s)@ should be returned, 113 -- where @\"\"@ stands for the empty chunk, and @s@ is the original 114 -- stream (second argument). 115 -- * If the requested length is greater than 0 and the stream is 116 -- empty, 'Nothing' should be returned indicating end of input. 117 -- * In other cases, take chunk of length @n@ (or shorter if the 118 -- stream is not long enough) from the input stream and return the 119 -- chunk along with the rest of the stream. 120 121 takeN_ :: Int -> s -> Maybe (Tokens s, s) 122 123 -- | Extract chunk of the stream taking tokens while the supplied 124 -- predicate returns 'True'. Return the chunk and the rest of the stream. 125 -- 126 -- For many types of streams, the method allows for significant 127 -- performance improvements, although it is not strictly necessary from 128 -- conceptual point of view. 129 130 takeWhile_ :: (Token s -> Bool) -> s -> (Tokens s, s) 131 132 -- | Pretty-print non-empty stream of tokens. This function is also used 133 -- to print single tokens (represented as singleton lists). 134 -- 135 -- @since 7.0.0 136 137 showTokens :: Proxy s -> NonEmpty (Token s) -> String 138 139 -- | Given an offset @o@ and initial 'PosState', adjust the state in such 140 -- a way that it starts at the offset. 141 -- 142 -- Return three values (in order): 143 -- 144 -- * 'SourcePos' which the given offset @o@ points to. 145 -- * 'String' representing the line on which the given offset @o@ is 146 -- located. The line should satisfy a number of conditions that are 147 -- described below. 148 -- * The updated 'PosState' which can be in turn used to locate 149 -- another offset @o'@ given that @o' >= o@. 150 -- 151 -- The 'String' representing the offending line in input stream should 152 -- satisfy the following: 153 -- 154 -- * It should adequately represent location of token at the offset of 155 -- interest, that is, character at 'sourceColumn' of the returned 156 -- 'SourcePos' should correspond to the token at the offset @o@. 157 -- * It should not include the newline at the end. 158 -- * It should not be empty, if the line happens to be empty, it 159 -- should be replaced with the string @\"\<empty line\>\"@. 160 -- * Tab characters should be replaced by appropriate number of 161 -- spaces, which is determined by the 'pstateTabWidth' field of 162 -- 'PosState'. 163 -- 164 -- @since 7.0.0 165 166 reachOffset 167 :: Int -- ^ Offset to reach 168 -> PosState s -- ^ Initial 'PosState' to use 169 -> (SourcePos, String, PosState s) -- ^ (See below) 170 171 -- | A version of 'reachOffset' that may be faster because it doesn't need 172 -- to fetch the line at which the given offset in located. 173 -- 174 -- The default implementation is this: 175 -- 176 -- > reachOffsetNoLine o pst = 177 -- > let (spos, _, pst')= reachOffset o pst 178 -- > in (spos, pst') 179 -- 180 -- @since 7.0.0 181 182 reachOffsetNoLine 183 :: Int -- ^ Offset to reach 184 -> PosState s -- ^ Initial 'PosState' to use 185 -> (SourcePos, PosState s) -- ^ Reached source position and updated state 186 reachOffsetNoLine o pst = 187 let (spos, _, pst') = reachOffset o pst 188 in (spos, pst') 189 190instance Stream String where 191 type Token String = Char 192 type Tokens String = String 193 tokenToChunk Proxy = pure 194 tokensToChunk Proxy = id 195 chunkToTokens Proxy = id 196 chunkLength Proxy = length 197 chunkEmpty Proxy = null 198 take1_ [] = Nothing 199 take1_ (t:ts) = Just (t, ts) 200 takeN_ n s 201 | n <= 0 = Just ("", s) 202 | null s = Nothing 203 | otherwise = Just (splitAt n s) 204 takeWhile_ = span 205 showTokens Proxy = stringPretty 206 -- NOTE Do not eta-reduce these (breaks inlining) 207 reachOffset o pst = 208 reachOffset' splitAt foldl' id id ('\n','\t') o pst 209 reachOffsetNoLine o pst = 210 reachOffsetNoLine' splitAt foldl' ('\n', '\t') o pst 211 212instance Stream B.ByteString where 213 type Token B.ByteString = Word8 214 type Tokens B.ByteString = B.ByteString 215 tokenToChunk Proxy = B.singleton 216 tokensToChunk Proxy = B.pack 217 chunkToTokens Proxy = B.unpack 218 chunkLength Proxy = B.length 219 chunkEmpty Proxy = B.null 220 take1_ = B.uncons 221 takeN_ n s 222 | n <= 0 = Just (B.empty, s) 223 | B.null s = Nothing 224 | otherwise = Just (B.splitAt n s) 225 takeWhile_ = B.span 226 showTokens Proxy = stringPretty . fmap (chr . fromIntegral) 227 -- NOTE Do not eta-reduce these (breaks inlining) 228 reachOffset o pst = 229 reachOffset' B.splitAt B.foldl' B8.unpack (chr . fromIntegral) (10, 9) o pst 230 reachOffsetNoLine o pst = 231 reachOffsetNoLine' B.splitAt B.foldl' (10, 9) o pst 232 233instance Stream BL.ByteString where 234 type Token BL.ByteString = Word8 235 type Tokens BL.ByteString = BL.ByteString 236 tokenToChunk Proxy = BL.singleton 237 tokensToChunk Proxy = BL.pack 238 chunkToTokens Proxy = BL.unpack 239 chunkLength Proxy = fromIntegral . BL.length 240 chunkEmpty Proxy = BL.null 241 take1_ = BL.uncons 242 takeN_ n s 243 | n <= 0 = Just (BL.empty, s) 244 | BL.null s = Nothing 245 | otherwise = Just (BL.splitAt (fromIntegral n) s) 246 takeWhile_ = BL.span 247 showTokens Proxy = stringPretty . fmap (chr . fromIntegral) 248 -- NOTE Do not eta-reduce these (breaks inlining) 249 reachOffset o pst = 250 reachOffset' splitAtBL BL.foldl' BL8.unpack (chr . fromIntegral) (10, 9) o pst 251 reachOffsetNoLine o pst = 252 reachOffsetNoLine' splitAtBL BL.foldl' (10, 9) o pst 253 254instance Stream T.Text where 255 type Token T.Text = Char 256 type Tokens T.Text = T.Text 257 tokenToChunk Proxy = T.singleton 258 tokensToChunk Proxy = T.pack 259 chunkToTokens Proxy = T.unpack 260 chunkLength Proxy = T.length 261 chunkEmpty Proxy = T.null 262 take1_ = T.uncons 263 takeN_ n s 264 | n <= 0 = Just (T.empty, s) 265 | T.null s = Nothing 266 | otherwise = Just (T.splitAt n s) 267 takeWhile_ = T.span 268 showTokens Proxy = stringPretty 269 -- NOTE Do not eta-reduce (breaks inlining of reachOffset'). 270 reachOffset o pst = 271 reachOffset' T.splitAt T.foldl' T.unpack id ('\n', '\t') o pst 272 reachOffsetNoLine o pst = 273 reachOffsetNoLine' T.splitAt T.foldl' ('\n', '\t') o pst 274 275instance Stream TL.Text where 276 type Token TL.Text = Char 277 type Tokens TL.Text = TL.Text 278 tokenToChunk Proxy = TL.singleton 279 tokensToChunk Proxy = TL.pack 280 chunkToTokens Proxy = TL.unpack 281 chunkLength Proxy = fromIntegral . TL.length 282 chunkEmpty Proxy = TL.null 283 take1_ = TL.uncons 284 takeN_ n s 285 | n <= 0 = Just (TL.empty, s) 286 | TL.null s = Nothing 287 | otherwise = Just (TL.splitAt (fromIntegral n) s) 288 takeWhile_ = TL.span 289 showTokens Proxy = stringPretty 290 -- NOTE Do not eta-reduce (breaks inlining of reachOffset'). 291 reachOffset o pst = 292 reachOffset' splitAtTL TL.foldl' TL.unpack id ('\n', '\t') o pst 293 reachOffsetNoLine o pst = 294 reachOffsetNoLine' splitAtTL TL.foldl' ('\n', '\t') o pst 295 296---------------------------------------------------------------------------- 297-- Helpers 298 299-- | An internal helper state type combining a difference 'String' and an 300-- unboxed 'SourcePos'. 301 302data St = St SourcePos ShowS 303 304-- | A helper definition to facilitate defining 'reachOffset' for various 305-- stream types. 306 307reachOffset' 308 :: forall s. Stream s 309 => (Int -> s -> (Tokens s, s)) 310 -- ^ How to split input stream at given offset 311 -> (forall b. (b -> Token s -> b) -> b -> Tokens s -> b) 312 -- ^ How to fold over input stream 313 -> (Tokens s -> String) 314 -- ^ How to convert chunk of input stream into a 'String' 315 -> (Token s -> Char) 316 -- ^ How to convert a token into a 'Char' 317 -> (Token s, Token s) 318 -- ^ Newline token and tab token 319 -> Int 320 -- ^ Offset to reach 321 -> PosState s 322 -- ^ Initial 'PosState' to use 323 -> (SourcePos, String, PosState s) 324 -- ^ Reached 'SourcePos', line at which 'SourcePos' is located, updated 325 -- 'PosState' 326reachOffset' splitAt' 327 foldl'' 328 fromToks 329 fromTok 330 (newlineTok, tabTok) 331 o 332 PosState {..} = 333 ( spos 334 , case expandTab pstateTabWidth 335 . addPrefix 336 . f 337 . fromToks 338 . fst 339 $ takeWhile_ (/= newlineTok) post of 340 "" -> "<empty line>" 341 xs -> xs 342 , PosState 343 { pstateInput = post 344 , pstateOffset = max pstateOffset o 345 , pstateSourcePos = spos 346 , pstateTabWidth = pstateTabWidth 347 , pstateLinePrefix = 348 if sameLine 349 -- NOTE We don't use difference lists here because it's 350 -- desirable for 'PosState' to be an instance of 'Eq' and 351 -- 'Show'. So we just do appending here. Fortunately several 352 -- parse errors on the same line should be relatively rare. 353 then pstateLinePrefix ++ f "" 354 else f "" 355 } 356 ) 357 where 358 addPrefix xs = 359 if sameLine 360 then pstateLinePrefix ++ xs 361 else xs 362 sameLine = sourceLine spos == sourceLine pstateSourcePos 363 (pre, post) = splitAt' (o - pstateOffset) pstateInput 364 St spos f = foldl'' go (St pstateSourcePos id) pre 365 go (St apos g) ch = 366 let SourcePos n l c = apos 367 c' = unPos c 368 w = unPos pstateTabWidth 369 in if | ch == newlineTok -> 370 St (SourcePos n (l <> pos1) pos1) 371 id 372 | ch == tabTok -> 373 St (SourcePos n l (mkPos $ c' + w - ((c' - 1) `rem` w))) 374 (g . (fromTok ch :)) 375 | otherwise -> 376 St (SourcePos n l (c <> pos1)) 377 (g . (fromTok ch :)) 378{-# INLINE reachOffset' #-} 379 380-- | Like 'reachOffset'' but for 'reachOffsetNoLine'. 381 382reachOffsetNoLine' 383 :: forall s. Stream s 384 => (Int -> s -> (Tokens s, s)) 385 -- ^ How to split input stream at given offset 386 -> (forall b. (b -> Token s -> b) -> b -> Tokens s -> b) 387 -- ^ How to fold over input stream 388 -> (Token s, Token s) 389 -- ^ Newline token and tab token 390 -> Int 391 -- ^ Offset to reach 392 -> PosState s 393 -- ^ Initial 'PosState' to use 394 -> (SourcePos, PosState s) 395 -- ^ Reached 'SourcePos' and updated 'PosState' 396reachOffsetNoLine' splitAt' 397 foldl'' 398 (newlineTok, tabTok) 399 o 400 PosState {..} = 401 ( spos 402 , PosState 403 { pstateInput = post 404 , pstateOffset = max pstateOffset o 405 , pstateSourcePos = spos 406 , pstateTabWidth = pstateTabWidth 407 , pstateLinePrefix = pstateLinePrefix 408 } 409 ) 410 where 411 spos = foldl'' go pstateSourcePos pre 412 (pre, post) = splitAt' (o - pstateOffset) pstateInput 413 go (SourcePos n l c) ch = 414 let c' = unPos c 415 w = unPos pstateTabWidth 416 in if | ch == newlineTok -> 417 SourcePos n (l <> pos1) pos1 418 | ch == tabTok -> 419 SourcePos n l (mkPos $ c' + w - ((c' - 1) `rem` w)) 420 | otherwise -> 421 SourcePos n l (c <> pos1) 422{-# INLINE reachOffsetNoLine' #-} 423 424-- | Like 'BL.splitAt' but accepts the index as an 'Int'. 425 426splitAtBL :: Int -> BL.ByteString -> (BL.ByteString, BL.ByteString) 427splitAtBL n = BL.splitAt (fromIntegral n) 428{-# INLINE splitAtBL #-} 429 430-- | Like 'TL.splitAt' but accepts the index as an 'Int'. 431 432splitAtTL :: Int -> TL.Text -> (TL.Text, TL.Text) 433splitAtTL n = TL.splitAt (fromIntegral n) 434{-# INLINE splitAtTL #-} 435 436-- | @stringPretty s@ returns pretty representation of string @s@. This is 437-- used when printing string tokens in error messages. 438 439stringPretty :: NonEmpty Char -> String 440stringPretty (x:|[]) = charPretty x 441stringPretty ('\r':|"\n") = "crlf newline" 442stringPretty xs = "\"" <> concatMap f (NE.toList xs) <> "\"" 443 where 444 f ch = 445 case charPretty' ch of 446 Nothing -> [ch] 447 Just pretty -> "<" <> pretty <> ">" 448 449-- | @charPretty ch@ returns user-friendly string representation of given 450-- character @ch@, suitable for using in error messages. 451 452charPretty :: Char -> String 453charPretty ' ' = "space" 454charPretty ch = fromMaybe ("'" <> [ch] <> "'") (charPretty' ch) 455 456-- | If the given character has a pretty representation, return that, 457-- otherwise 'Nothing'. This is an internal helper. 458 459charPretty' :: Char -> Maybe String 460charPretty' = \case 461 '\NUL' -> Just "null" 462 '\SOH' -> Just "start of heading" 463 '\STX' -> Just "start of text" 464 '\ETX' -> Just "end of text" 465 '\EOT' -> Just "end of transmission" 466 '\ENQ' -> Just "enquiry" 467 '\ACK' -> Just "acknowledge" 468 '\BEL' -> Just "bell" 469 '\BS' -> Just "backspace" 470 '\t' -> Just "tab" 471 '\n' -> Just "newline" 472 '\v' -> Just "vertical tab" 473 '\f' -> Just "form feed" 474 '\r' -> Just "carriage return" 475 '\SO' -> Just "shift out" 476 '\SI' -> Just "shift in" 477 '\DLE' -> Just "data link escape" 478 '\DC1' -> Just "device control one" 479 '\DC2' -> Just "device control two" 480 '\DC3' -> Just "device control three" 481 '\DC4' -> Just "device control four" 482 '\NAK' -> Just "negative acknowledge" 483 '\SYN' -> Just "synchronous idle" 484 '\ETB' -> Just "end of transmission block" 485 '\CAN' -> Just "cancel" 486 '\EM' -> Just "end of medium" 487 '\SUB' -> Just "substitute" 488 '\ESC' -> Just "escape" 489 '\FS' -> Just "file separator" 490 '\GS' -> Just "group separator" 491 '\RS' -> Just "record separator" 492 '\US' -> Just "unit separator" 493 '\DEL' -> Just "delete" 494 '\160' -> Just "non-breaking space" 495 _ -> Nothing 496 497-- | Replace tab characters with given number of spaces. 498 499expandTab 500 :: Pos 501 -> String 502 -> String 503expandTab w' = go 0 504 where 505 go 0 [] = [] 506 go 0 ('\t':xs) = go w xs 507 go 0 (x:xs) = x : go 0 xs 508 go n xs = ' ' : go (n - 1) xs 509 w = unPos w' 510