1{-# LANGUAGE FlexibleContexts #-} 2{-# LANGUAGE FlexibleInstances #-} 3{-# LANGUAGE LambdaCase #-} 4{-# LANGUAGE MultiWayIf #-} 5{-# LANGUAGE RankNTypes #-} 6{-# LANGUAGE RecordWildCards #-} 7{-# LANGUAGE Safe #-} 8{-# LANGUAGE ScopedTypeVariables #-} 9{-# LANGUAGE TypeFamilies #-} 10 11-- | 12-- Module : Text.Megaparsec.Stream 13-- Copyright : © 2015–present Megaparsec contributors 14-- License : FreeBSD 15-- 16-- Maintainer : Mark Karpov <markkarpov92@gmail.com> 17-- Stability : experimental 18-- Portability : portable 19-- 20-- Megaparsec's input stream facilities. 21-- 22-- You probably do not want to import this module directly because 23-- "Text.Megaparsec" re-exports it anyway. 24-- 25-- @since 6.0.0 26module Text.Megaparsec.Stream 27 ( Stream (..), 28 VisualStream (..), 29 TraversableStream (..), 30 ) 31where 32 33import qualified Data.ByteString as B 34import qualified Data.ByteString.Char8 as B8 35import qualified Data.ByteString.Lazy as BL 36import qualified Data.ByteString.Lazy.Char8 as BL8 37import Data.Char (chr) 38import Data.Foldable (foldl', toList) 39import Data.Kind (Type) 40import Data.List.NonEmpty (NonEmpty (..)) 41import qualified Data.List.NonEmpty as NE 42import Data.Maybe (fromMaybe) 43import Data.Proxy 44import qualified Data.Sequence as S 45import qualified Data.Text as T 46import qualified Data.Text.Lazy as TL 47import Data.Word (Word8) 48import Text.Megaparsec.Pos 49import Text.Megaparsec.State 50 51-- | Type class for inputs that can be consumed by the library. 52-- 53-- __Note__: before the version /9.0.0/ the class included the methods from 54-- 'VisualStream' and 'TraversableStream'. 55class (Ord (Token s), Ord (Tokens s)) => Stream s where 56 -- | Type of token in the stream. 57 type Token s :: Type 58 59 -- | Type of “chunk” of the stream. 60 type Tokens s :: Type 61 62 -- | Lift a single token to chunk of the stream. The default 63 -- implementation is: 64 -- 65 -- > tokenToChunk pxy = tokensToChunk pxy . pure 66 -- 67 -- However for some types of stream there may be a more efficient way to 68 -- lift. 69 tokenToChunk :: Proxy s -> Token s -> Tokens s 70 tokenToChunk pxy = tokensToChunk pxy . pure 71 72 -- | The first method that establishes isomorphism between list of tokens 73 -- and chunk of the stream. Valid implementation should satisfy: 74 -- 75 -- > chunkToTokens pxy (tokensToChunk pxy ts) == ts 76 tokensToChunk :: Proxy s -> [Token s] -> Tokens s 77 78 -- | The second method that establishes isomorphism between list of tokens 79 -- and chunk of the stream. Valid implementation should satisfy: 80 -- 81 -- > tokensToChunk pxy (chunkToTokens pxy chunk) == chunk 82 chunkToTokens :: Proxy s -> Tokens s -> [Token s] 83 84 -- | Return length of a chunk of the stream. 85 chunkLength :: Proxy s -> Tokens s -> Int 86 87 -- | Check if a chunk of the stream is empty. The default implementation 88 -- is in terms of the more general 'chunkLength': 89 -- 90 -- > chunkEmpty pxy ts = chunkLength pxy ts <= 0 91 -- 92 -- However for many streams there may be a more efficient implementation. 93 chunkEmpty :: Proxy s -> Tokens s -> Bool 94 chunkEmpty pxy ts = chunkLength pxy ts <= 0 95 96 -- | Extract a single token form the stream. Return 'Nothing' if the 97 -- stream is empty. 98 take1_ :: s -> Maybe (Token s, s) 99 100 -- | @'takeN_' n s@ should try to extract a chunk of length @n@, or if the 101 -- stream is too short, the rest of the stream. Valid implementation 102 -- should follow the rules: 103 -- 104 -- * If the requested length @n@ is 0 (or less), 'Nothing' should 105 -- never be returned, instead @'Just' (\"\", s)@ should be returned, 106 -- where @\"\"@ stands for the empty chunk, and @s@ is the original 107 -- stream (second argument). 108 -- * If the requested length is greater than 0 and the stream is 109 -- empty, 'Nothing' should be returned indicating end of input. 110 -- * In other cases, take chunk of length @n@ (or shorter if the 111 -- stream is not long enough) from the input stream and return the 112 -- chunk along with the rest of the stream. 113 takeN_ :: Int -> s -> Maybe (Tokens s, s) 114 115 -- | Extract chunk of the stream taking tokens while the supplied 116 -- predicate returns 'True'. Return the chunk and the rest of the stream. 117 -- 118 -- For many types of streams, the method allows for significant 119 -- performance improvements, although it is not strictly necessary from 120 -- conceptual point of view. 121 takeWhile_ :: (Token s -> Bool) -> s -> (Tokens s, s) 122 123-- | @since 9.0.0 124instance Ord a => Stream [a] where 125 type Token [a] = a 126 type Tokens [a] = [a] 127 tokenToChunk Proxy = pure 128 tokensToChunk Proxy = id 129 chunkToTokens Proxy = id 130 chunkLength Proxy = length 131 chunkEmpty Proxy = null 132 take1_ [] = Nothing 133 take1_ (t : ts) = Just (t, ts) 134 takeN_ n s 135 | n <= 0 = Just ([], s) 136 | null s = Nothing 137 | otherwise = Just (splitAt n s) 138 takeWhile_ = span 139 140-- | @since 9.0.0 141instance Ord a => Stream (S.Seq a) where 142 type Token (S.Seq a) = a 143 type Tokens (S.Seq a) = S.Seq a 144 tokenToChunk Proxy = pure 145 tokensToChunk Proxy = S.fromList 146 chunkToTokens Proxy = toList 147 chunkLength Proxy = length 148 chunkEmpty Proxy = null 149 take1_ S.Empty = Nothing 150 take1_ (t S.:<| ts) = Just (t, ts) 151 takeN_ n s 152 | n <= 0 = Just (S.empty, s) 153 | null s = Nothing 154 | otherwise = Just (S.splitAt n s) 155 takeWhile_ = S.spanl 156 157instance Stream B.ByteString where 158 type Token B.ByteString = Word8 159 type Tokens B.ByteString = B.ByteString 160 tokenToChunk Proxy = B.singleton 161 tokensToChunk Proxy = B.pack 162 chunkToTokens Proxy = B.unpack 163 chunkLength Proxy = B.length 164 chunkEmpty Proxy = B.null 165 take1_ = B.uncons 166 takeN_ n s 167 | n <= 0 = Just (B.empty, s) 168 | B.null s = Nothing 169 | otherwise = Just (B.splitAt n s) 170 takeWhile_ = B.span 171 172instance Stream BL.ByteString where 173 type Token BL.ByteString = Word8 174 type Tokens BL.ByteString = BL.ByteString 175 tokenToChunk Proxy = BL.singleton 176 tokensToChunk Proxy = BL.pack 177 chunkToTokens Proxy = BL.unpack 178 chunkLength Proxy = fromIntegral . BL.length 179 chunkEmpty Proxy = BL.null 180 take1_ = BL.uncons 181 takeN_ n s 182 | n <= 0 = Just (BL.empty, s) 183 | BL.null s = Nothing 184 | otherwise = Just (BL.splitAt (fromIntegral n) s) 185 takeWhile_ = BL.span 186 187instance Stream T.Text where 188 type Token T.Text = Char 189 type Tokens T.Text = T.Text 190 tokenToChunk Proxy = T.singleton 191 tokensToChunk Proxy = T.pack 192 chunkToTokens Proxy = T.unpack 193 chunkLength Proxy = T.length 194 chunkEmpty Proxy = T.null 195 take1_ = T.uncons 196 takeN_ n s 197 | n <= 0 = Just (T.empty, s) 198 | T.null s = Nothing 199 | otherwise = Just (T.splitAt n s) 200 takeWhile_ = T.span 201 202instance Stream TL.Text where 203 type Token TL.Text = Char 204 type Tokens TL.Text = TL.Text 205 tokenToChunk Proxy = TL.singleton 206 tokensToChunk Proxy = TL.pack 207 chunkToTokens Proxy = TL.unpack 208 chunkLength Proxy = fromIntegral . TL.length 209 chunkEmpty Proxy = TL.null 210 take1_ = TL.uncons 211 takeN_ n s 212 | n <= 0 = Just (TL.empty, s) 213 | TL.null s = Nothing 214 | otherwise = Just (TL.splitAt (fromIntegral n) s) 215 takeWhile_ = TL.span 216 217-- | Type class for inputs that can also be used for debugging. 218-- 219-- @since 9.0.0 220class Stream s => VisualStream s where 221 -- | Pretty-print non-empty stream of tokens. This function is also used 222 -- to print single tokens (represented as singleton lists). 223 -- 224 -- @since 7.0.0 225 showTokens :: Proxy s -> NonEmpty (Token s) -> String 226 227 -- | Return the number of characters that a non-empty stream of tokens 228 -- spans. The default implementation is sufficient if every token spans 229 -- exactly 1 character. 230 -- 231 -- @since 8.0.0 232 tokensLength :: Proxy s -> NonEmpty (Token s) -> Int 233 tokensLength Proxy = NE.length 234 235instance VisualStream String where 236 showTokens Proxy = stringPretty 237 238instance VisualStream B.ByteString where 239 showTokens Proxy = stringPretty . fmap (chr . fromIntegral) 240 241instance VisualStream BL.ByteString where 242 showTokens Proxy = stringPretty . fmap (chr . fromIntegral) 243 244instance VisualStream T.Text where 245 showTokens Proxy = stringPretty 246 247instance VisualStream TL.Text where 248 showTokens Proxy = stringPretty 249 250-- | Type class for inputs that can also be used for error reporting. 251-- 252-- @since 9.0.0 253class Stream s => TraversableStream s where 254 {-# MINIMAL reachOffset | reachOffsetNoLine #-} 255 256 -- | Given an offset @o@ and initial 'PosState', adjust the state in such 257 -- a way that it starts at the offset. 258 -- 259 -- Return two values (in order): 260 -- 261 -- * 'Maybe' 'String' representing the line on which the given offset 262 -- @o@ is located. It can be omitted (i.e. 'Nothing'); in that case 263 -- error reporting functions will not show offending lines. If 264 -- returned, the line should satisfy a number of conditions that are 265 -- described below. 266 -- * The updated 'PosState' which can be in turn used to locate 267 -- another offset @o'@ given that @o' >= o@. 268 -- 269 -- The 'String' representing the offending line in input stream should 270 -- satisfy the following: 271 -- 272 -- * It should adequately represent location of token at the offset of 273 -- interest, that is, character at 'sourceColumn' of the returned 274 -- 'SourcePos' should correspond to the token at the offset @o@. 275 -- * It should not include the newline at the end. 276 -- * It should not be empty, if the line happens to be empty, it 277 -- should be replaced with the string @\"\<empty line\>\"@. 278 -- * Tab characters should be replaced by appropriate number of 279 -- spaces, which is determined by the 'pstateTabWidth' field of 280 -- 'PosState'. 281 -- 282 -- __Note__: type signature of the function was changed in the version 283 -- /9.0.0/. 284 -- 285 -- @since 7.0.0 286 reachOffset :: 287 -- | Offset to reach 288 Int -> 289 -- | Initial 'PosState' to use 290 PosState s -> 291 -- | See the description of the function 292 (Maybe String, PosState s) 293 reachOffset o pst = 294 (Nothing, reachOffsetNoLine o pst) 295 296 -- | A version of 'reachOffset' that may be faster because it doesn't need 297 -- to fetch the line at which the given offset in located. 298 -- 299 -- The default implementation is this: 300 -- 301 -- > reachOffsetNoLine o pst = 302 -- > snd (reachOffset o pst) 303 -- 304 -- __Note__: type signature of the function was changed in the version 305 -- /8.0.0/. 306 -- 307 -- @since 7.0.0 308 reachOffsetNoLine :: 309 -- | Offset to reach 310 Int -> 311 -- | Initial 'PosState' to use 312 PosState s -> 313 -- | Reached source position and updated state 314 PosState s 315 reachOffsetNoLine o pst = 316 snd (reachOffset o pst) 317 318instance TraversableStream String where 319 -- NOTE Do not eta-reduce these (breaks inlining) 320 reachOffset o pst = 321 reachOffset' splitAt foldl' id id ('\n', '\t') o pst 322 reachOffsetNoLine o pst = 323 reachOffsetNoLine' splitAt foldl' ('\n', '\t') o pst 324 325instance TraversableStream B.ByteString where 326 -- NOTE Do not eta-reduce these (breaks inlining) 327 reachOffset o pst = 328 reachOffset' B.splitAt B.foldl' B8.unpack (chr . fromIntegral) (10, 9) o pst 329 reachOffsetNoLine o pst = 330 reachOffsetNoLine' B.splitAt B.foldl' (10, 9) o pst 331 332instance TraversableStream BL.ByteString where 333 -- NOTE Do not eta-reduce these (breaks inlining) 334 reachOffset o pst = 335 reachOffset' splitAtBL BL.foldl' BL8.unpack (chr . fromIntegral) (10, 9) o pst 336 reachOffsetNoLine o pst = 337 reachOffsetNoLine' splitAtBL BL.foldl' (10, 9) o pst 338 339instance TraversableStream T.Text where 340 -- NOTE Do not eta-reduce (breaks inlining of reachOffset'). 341 reachOffset o pst = 342 reachOffset' T.splitAt T.foldl' T.unpack id ('\n', '\t') o pst 343 reachOffsetNoLine o pst = 344 reachOffsetNoLine' T.splitAt T.foldl' ('\n', '\t') o pst 345 346instance TraversableStream TL.Text where 347 -- NOTE Do not eta-reduce (breaks inlining of reachOffset'). 348 reachOffset o pst = 349 reachOffset' splitAtTL TL.foldl' TL.unpack id ('\n', '\t') o pst 350 reachOffsetNoLine o pst = 351 reachOffsetNoLine' splitAtTL TL.foldl' ('\n', '\t') o pst 352 353---------------------------------------------------------------------------- 354-- Helpers 355 356-- | An internal helper state type combining a difference 'String' and an 357-- unboxed 'SourcePos'. 358data St = St SourcePos ShowS 359 360-- | A helper definition to facilitate defining 'reachOffset' for various 361-- stream types. 362reachOffset' :: 363 forall s. 364 Stream s => 365 -- | How to split input stream at given offset 366 (Int -> s -> (Tokens s, s)) -> 367 -- | How to fold over input stream 368 (forall b. (b -> Token s -> b) -> b -> Tokens s -> b) -> 369 -- | How to convert chunk of input stream into a 'String' 370 (Tokens s -> String) -> 371 -- | How to convert a token into a 'Char' 372 (Token s -> Char) -> 373 -- | Newline token and tab token 374 (Token s, Token s) -> 375 -- | Offset to reach 376 Int -> 377 -- | Initial 'PosState' to use 378 PosState s -> 379 -- | Line at which 'SourcePos' is located, updated 'PosState' 380 (Maybe String, PosState s) 381reachOffset' 382 splitAt' 383 foldl'' 384 fromToks 385 fromTok 386 (newlineTok, tabTok) 387 o 388 PosState {..} = 389 ( Just $ case expandTab pstateTabWidth 390 . addPrefix 391 . f 392 . fromToks 393 . fst 394 $ takeWhile_ (/= newlineTok) post of 395 "" -> "<empty line>" 396 xs -> xs, 397 PosState 398 { pstateInput = post, 399 pstateOffset = max pstateOffset o, 400 pstateSourcePos = spos, 401 pstateTabWidth = pstateTabWidth, 402 pstateLinePrefix = 403 if sameLine 404 then -- NOTE We don't use difference lists here because it's 405 -- desirable for 'PosState' to be an instance of 'Eq' and 406 -- 'Show'. So we just do appending here. Fortunately several 407 -- parse errors on the same line should be relatively rare. 408 pstateLinePrefix ++ f "" 409 else f "" 410 } 411 ) 412 where 413 addPrefix xs = 414 if sameLine 415 then pstateLinePrefix ++ xs 416 else xs 417 sameLine = sourceLine spos == sourceLine pstateSourcePos 418 (pre, post) = splitAt' (o - pstateOffset) pstateInput 419 St spos f = foldl'' go (St pstateSourcePos id) pre 420 go (St apos g) ch = 421 let SourcePos n l c = apos 422 c' = unPos c 423 w = unPos pstateTabWidth 424 in if 425 | ch == newlineTok -> 426 St 427 (SourcePos n (l <> pos1) pos1) 428 id 429 | ch == tabTok -> 430 St 431 (SourcePos n l (mkPos $ c' + w - ((c' - 1) `rem` w))) 432 (g . (fromTok ch :)) 433 | otherwise -> 434 St 435 (SourcePos n l (c <> pos1)) 436 (g . (fromTok ch :)) 437{-# INLINE reachOffset' #-} 438 439-- | Like 'reachOffset'' but for 'reachOffsetNoLine'. 440reachOffsetNoLine' :: 441 forall s. 442 Stream s => 443 -- | How to split input stream at given offset 444 (Int -> s -> (Tokens s, s)) -> 445 -- | How to fold over input stream 446 (forall b. (b -> Token s -> b) -> b -> Tokens s -> b) -> 447 -- | Newline token and tab token 448 (Token s, Token s) -> 449 -- | Offset to reach 450 Int -> 451 -- | Initial 'PosState' to use 452 PosState s -> 453 -- | Updated 'PosState' 454 PosState s 455reachOffsetNoLine' 456 splitAt' 457 foldl'' 458 (newlineTok, tabTok) 459 o 460 PosState {..} = 461 ( PosState 462 { pstateInput = post, 463 pstateOffset = max pstateOffset o, 464 pstateSourcePos = spos, 465 pstateTabWidth = pstateTabWidth, 466 pstateLinePrefix = pstateLinePrefix 467 } 468 ) 469 where 470 spos = foldl'' go pstateSourcePos pre 471 (pre, post) = splitAt' (o - pstateOffset) pstateInput 472 go (SourcePos n l c) ch = 473 let c' = unPos c 474 w = unPos pstateTabWidth 475 in if 476 | ch == newlineTok -> 477 SourcePos n (l <> pos1) pos1 478 | ch == tabTok -> 479 SourcePos n l (mkPos $ c' + w - ((c' - 1) `rem` w)) 480 | otherwise -> 481 SourcePos n l (c <> pos1) 482{-# INLINE reachOffsetNoLine' #-} 483 484-- | Like 'BL.splitAt' but accepts the index as an 'Int'. 485splitAtBL :: Int -> BL.ByteString -> (BL.ByteString, BL.ByteString) 486splitAtBL n = BL.splitAt (fromIntegral n) 487{-# INLINE splitAtBL #-} 488 489-- | Like 'TL.splitAt' but accepts the index as an 'Int'. 490splitAtTL :: Int -> TL.Text -> (TL.Text, TL.Text) 491splitAtTL n = TL.splitAt (fromIntegral n) 492{-# INLINE splitAtTL #-} 493 494-- | @stringPretty s@ returns pretty representation of string @s@. This is 495-- used when printing string tokens in error messages. 496stringPretty :: NonEmpty Char -> String 497stringPretty (x :| []) = charPretty x 498stringPretty ('\r' :| "\n") = "crlf newline" 499stringPretty xs = "\"" <> concatMap f (NE.toList xs) <> "\"" 500 where 501 f ch = 502 case charPretty' ch of 503 Nothing -> [ch] 504 Just pretty -> "<" <> pretty <> ">" 505 506-- | @charPretty ch@ returns user-friendly string representation of given 507-- character @ch@, suitable for using in error messages. 508charPretty :: Char -> String 509charPretty ' ' = "space" 510charPretty ch = fromMaybe ("'" <> [ch] <> "'") (charPretty' ch) 511 512-- | If the given character has a pretty representation, return that, 513-- otherwise 'Nothing'. This is an internal helper. 514charPretty' :: Char -> Maybe String 515charPretty' = \case 516 '\NUL' -> Just "null" 517 '\SOH' -> Just "start of heading" 518 '\STX' -> Just "start of text" 519 '\ETX' -> Just "end of text" 520 '\EOT' -> Just "end of transmission" 521 '\ENQ' -> Just "enquiry" 522 '\ACK' -> Just "acknowledge" 523 '\BEL' -> Just "bell" 524 '\BS' -> Just "backspace" 525 '\t' -> Just "tab" 526 '\n' -> Just "newline" 527 '\v' -> Just "vertical tab" 528 '\f' -> Just "form feed" 529 '\r' -> Just "carriage return" 530 '\SO' -> Just "shift out" 531 '\SI' -> Just "shift in" 532 '\DLE' -> Just "data link escape" 533 '\DC1' -> Just "device control one" 534 '\DC2' -> Just "device control two" 535 '\DC3' -> Just "device control three" 536 '\DC4' -> Just "device control four" 537 '\NAK' -> Just "negative acknowledge" 538 '\SYN' -> Just "synchronous idle" 539 '\ETB' -> Just "end of transmission block" 540 '\CAN' -> Just "cancel" 541 '\EM' -> Just "end of medium" 542 '\SUB' -> Just "substitute" 543 '\ESC' -> Just "escape" 544 '\FS' -> Just "file separator" 545 '\GS' -> Just "group separator" 546 '\RS' -> Just "record separator" 547 '\US' -> Just "unit separator" 548 '\DEL' -> Just "delete" 549 '\160' -> Just "non-breaking space" 550 _ -> Nothing 551 552-- | Replace tab characters with given number of spaces. 553expandTab :: 554 Pos -> 555 String -> 556 String 557expandTab w' = go 0 558 where 559 go 0 [] = [] 560 go 0 ('\t' : xs) = go w xs 561 go 0 (x : xs) = x : go 0 xs 562 go n xs = ' ' : go (n - 1) xs 563 w = unPos w' 564