1-- | 2-- Module : Text.Megaparsec.Stream 3-- Copyright : © 2015–present Megaparsec contributors 4-- License : FreeBSD 5-- 6-- Maintainer : Mark Karpov <markkarpov92@gmail.com> 7-- Stability : experimental 8-- Portability : portable 9-- 10-- Megaparsec's input stream facilities. 11-- 12-- You probably do not want to import this module directly because 13-- "Text.Megaparsec" re-exports it anyway. 14-- 15-- @since 6.0.0 16 17{-# LANGUAGE FlexibleContexts #-} 18{-# LANGUAGE FlexibleInstances #-} 19{-# LANGUAGE LambdaCase #-} 20{-# LANGUAGE MultiWayIf #-} 21{-# LANGUAGE RankNTypes #-} 22{-# LANGUAGE RecordWildCards #-} 23{-# LANGUAGE ScopedTypeVariables #-} 24{-# LANGUAGE TypeFamilies #-} 25 26module Text.Megaparsec.Stream 27 ( Stream (..) ) 28where 29 30import Data.Char (chr) 31import Data.Foldable (foldl') 32import Data.Kind (Type) 33import Data.List.NonEmpty (NonEmpty (..)) 34import Data.Maybe (fromMaybe) 35import Data.Proxy 36import Data.Word (Word8) 37import Text.Megaparsec.Pos 38import Text.Megaparsec.State 39import qualified Data.ByteString as B 40import qualified Data.ByteString.Char8 as B8 41import qualified Data.ByteString.Lazy as BL 42import qualified Data.ByteString.Lazy.Char8 as BL8 43import qualified Data.List.NonEmpty as NE 44import qualified Data.Text as T 45import qualified Data.Text.Lazy as TL 46 47-- | Type class for inputs that can be consumed by the library. 48 49class (Ord (Token s), Ord (Tokens s)) => Stream s where 50 51 -- | Type of token in the stream. 52 53 type Token s :: Type 54 55 -- | Type of “chunk” of the stream. 56 57 type Tokens s :: Type 58 59 -- | Lift a single token to chunk of the stream. The default 60 -- implementation is: 61 -- 62 -- > tokenToChunk pxy = tokensToChunk pxy . pure 63 -- 64 -- However for some types of stream there may be a more efficient way to 65 -- lift. 66 67 tokenToChunk :: Proxy s -> Token s -> Tokens s 68 tokenToChunk pxy = tokensToChunk pxy . pure 69 70 -- | The first method that establishes isomorphism between list of tokens 71 -- and chunk of the stream. Valid implementation should satisfy: 72 -- 73 -- > chunkToTokens pxy (tokensToChunk pxy ts) == ts 74 75 tokensToChunk :: Proxy s -> [Token s] -> Tokens s 76 77 -- | The second method that establishes isomorphism between list of tokens 78 -- and chunk of the stream. Valid implementation should satisfy: 79 -- 80 -- > tokensToChunk pxy (chunkToTokens pxy chunk) == chunk 81 82 chunkToTokens :: Proxy s -> Tokens s -> [Token s] 83 84 -- | Return length of a chunk of the stream. 85 86 chunkLength :: Proxy s -> Tokens s -> Int 87 88 -- | Check if a chunk of the stream is empty. The default implementation 89 -- is in terms of the more general 'chunkLength': 90 -- 91 -- > chunkEmpty pxy ts = chunkLength pxy ts <= 0 92 -- 93 -- However for many streams there may be a more efficient implementation. 94 95 chunkEmpty :: Proxy s -> Tokens s -> Bool 96 chunkEmpty pxy ts = chunkLength pxy ts <= 0 97 98 -- | Extract a single token form the stream. Return 'Nothing' if the 99 -- stream is empty. 100 101 take1_ :: s -> Maybe (Token s, s) 102 103 -- | @'takeN_' n s@ should try to extract a chunk of length @n@, or if the 104 -- stream is too short, the rest of the stream. Valid implementation 105 -- should follow the rules: 106 -- 107 -- * If the requested length @n@ is 0 (or less), 'Nothing' should 108 -- never be returned, instead @'Just' (\"\", s)@ should be returned, 109 -- where @\"\"@ stands for the empty chunk, and @s@ is the original 110 -- stream (second argument). 111 -- * If the requested length is greater than 0 and the stream is 112 -- empty, 'Nothing' should be returned indicating end of input. 113 -- * In other cases, take chunk of length @n@ (or shorter if the 114 -- stream is not long enough) from the input stream and return the 115 -- chunk along with the rest of the stream. 116 117 takeN_ :: Int -> s -> Maybe (Tokens s, s) 118 119 -- | Extract chunk of the stream taking tokens while the supplied 120 -- predicate returns 'True'. Return the chunk and the rest of the stream. 121 -- 122 -- For many types of streams, the method allows for significant 123 -- performance improvements, although it is not strictly necessary from 124 -- conceptual point of view. 125 126 takeWhile_ :: (Token s -> Bool) -> s -> (Tokens s, s) 127 128 -- | Pretty-print non-empty stream of tokens. This function is also used 129 -- to print single tokens (represented as singleton lists). 130 -- 131 -- @since 7.0.0 132 133 showTokens :: Proxy s -> NonEmpty (Token s) -> String 134 135 -- | Return the number of characters that a non-empty stream of tokens 136 -- spans. The default implementation is sufficient if every token spans 137 -- exactly 1 character. 138 -- 139 -- @since 8.0.0 140 141 tokensLength :: Proxy s -> NonEmpty (Token s) -> Int 142 tokensLength Proxy = NE.length 143 144 -- | Given an offset @o@ and initial 'PosState', adjust the state in such 145 -- a way that it starts at the offset. 146 -- 147 -- Return two values (in order): 148 -- 149 -- * 'String' representing the line on which the given offset @o@ is 150 -- located. The line should satisfy a number of conditions that are 151 -- described below. 152 -- * The updated 'PosState' which can be in turn used to locate 153 -- another offset @o'@ given that @o' >= o@. 154 -- 155 -- The 'String' representing the offending line in input stream should 156 -- satisfy the following: 157 -- 158 -- * It should adequately represent location of token at the offset of 159 -- interest, that is, character at 'sourceColumn' of the returned 160 -- 'SourcePos' should correspond to the token at the offset @o@. 161 -- * It should not include the newline at the end. 162 -- * It should not be empty, if the line happens to be empty, it 163 -- should be replaced with the string @\"\<empty line\>\"@. 164 -- * Tab characters should be replaced by appropriate number of 165 -- spaces, which is determined by the 'pstateTabWidth' field of 166 -- 'PosState'. 167 -- 168 -- __Note__: type signature of the function was changed in the version 169 -- /8.0.0/. 170 -- 171 -- @since 7.0.0 172 173 reachOffset 174 :: Int -- ^ Offset to reach 175 -> PosState s -- ^ Initial 'PosState' to use 176 -> (String, PosState s) -- ^ See the description of the function 177 178 -- | A version of 'reachOffset' that may be faster because it doesn't need 179 -- to fetch the line at which the given offset in located. 180 -- 181 -- The default implementation is this: 182 -- 183 -- > reachOffsetNoLine o pst = 184 -- > snd (reachOffset o pst) 185 -- 186 -- __Note__: type signature of the function was changed in the version 187 -- /8.0.0/. 188 -- 189 -- @since 7.0.0 190 191 reachOffsetNoLine 192 :: Int -- ^ Offset to reach 193 -> PosState s -- ^ Initial 'PosState' to use 194 -> PosState s -- ^ Reached source position and updated state 195 reachOffsetNoLine o pst = 196 snd (reachOffset o pst) 197 198instance Stream String where 199 type Token String = Char 200 type Tokens String = String 201 tokenToChunk Proxy = pure 202 tokensToChunk Proxy = id 203 chunkToTokens Proxy = id 204 chunkLength Proxy = length 205 chunkEmpty Proxy = null 206 take1_ [] = Nothing 207 take1_ (t:ts) = Just (t, ts) 208 takeN_ n s 209 | n <= 0 = Just ("", s) 210 | null s = Nothing 211 | otherwise = Just (splitAt n s) 212 takeWhile_ = span 213 showTokens Proxy = stringPretty 214 -- NOTE Do not eta-reduce these (breaks inlining) 215 reachOffset o pst = 216 reachOffset' splitAt foldl' id id ('\n','\t') o pst 217 reachOffsetNoLine o pst = 218 reachOffsetNoLine' splitAt foldl' ('\n', '\t') o pst 219 220instance Stream B.ByteString where 221 type Token B.ByteString = Word8 222 type Tokens B.ByteString = B.ByteString 223 tokenToChunk Proxy = B.singleton 224 tokensToChunk Proxy = B.pack 225 chunkToTokens Proxy = B.unpack 226 chunkLength Proxy = B.length 227 chunkEmpty Proxy = B.null 228 take1_ = B.uncons 229 takeN_ n s 230 | n <= 0 = Just (B.empty, s) 231 | B.null s = Nothing 232 | otherwise = Just (B.splitAt n s) 233 takeWhile_ = B.span 234 showTokens Proxy = stringPretty . fmap (chr . fromIntegral) 235 -- NOTE Do not eta-reduce these (breaks inlining) 236 reachOffset o pst = 237 reachOffset' B.splitAt B.foldl' B8.unpack (chr . fromIntegral) (10, 9) o pst 238 reachOffsetNoLine o pst = 239 reachOffsetNoLine' B.splitAt B.foldl' (10, 9) o pst 240 241instance Stream BL.ByteString where 242 type Token BL.ByteString = Word8 243 type Tokens BL.ByteString = BL.ByteString 244 tokenToChunk Proxy = BL.singleton 245 tokensToChunk Proxy = BL.pack 246 chunkToTokens Proxy = BL.unpack 247 chunkLength Proxy = fromIntegral . BL.length 248 chunkEmpty Proxy = BL.null 249 take1_ = BL.uncons 250 takeN_ n s 251 | n <= 0 = Just (BL.empty, s) 252 | BL.null s = Nothing 253 | otherwise = Just (BL.splitAt (fromIntegral n) s) 254 takeWhile_ = BL.span 255 showTokens Proxy = stringPretty . fmap (chr . fromIntegral) 256 -- NOTE Do not eta-reduce these (breaks inlining) 257 reachOffset o pst = 258 reachOffset' splitAtBL BL.foldl' BL8.unpack (chr . fromIntegral) (10, 9) o pst 259 reachOffsetNoLine o pst = 260 reachOffsetNoLine' splitAtBL BL.foldl' (10, 9) o pst 261 262instance Stream T.Text where 263 type Token T.Text = Char 264 type Tokens T.Text = T.Text 265 tokenToChunk Proxy = T.singleton 266 tokensToChunk Proxy = T.pack 267 chunkToTokens Proxy = T.unpack 268 chunkLength Proxy = T.length 269 chunkEmpty Proxy = T.null 270 take1_ = T.uncons 271 takeN_ n s 272 | n <= 0 = Just (T.empty, s) 273 | T.null s = Nothing 274 | otherwise = Just (T.splitAt n s) 275 takeWhile_ = T.span 276 showTokens Proxy = stringPretty 277 -- NOTE Do not eta-reduce (breaks inlining of reachOffset'). 278 reachOffset o pst = 279 reachOffset' T.splitAt T.foldl' T.unpack id ('\n', '\t') o pst 280 reachOffsetNoLine o pst = 281 reachOffsetNoLine' T.splitAt T.foldl' ('\n', '\t') o pst 282 283instance Stream TL.Text where 284 type Token TL.Text = Char 285 type Tokens TL.Text = TL.Text 286 tokenToChunk Proxy = TL.singleton 287 tokensToChunk Proxy = TL.pack 288 chunkToTokens Proxy = TL.unpack 289 chunkLength Proxy = fromIntegral . TL.length 290 chunkEmpty Proxy = TL.null 291 take1_ = TL.uncons 292 takeN_ n s 293 | n <= 0 = Just (TL.empty, s) 294 | TL.null s = Nothing 295 | otherwise = Just (TL.splitAt (fromIntegral n) s) 296 takeWhile_ = TL.span 297 showTokens Proxy = stringPretty 298 -- NOTE Do not eta-reduce (breaks inlining of reachOffset'). 299 reachOffset o pst = 300 reachOffset' splitAtTL TL.foldl' TL.unpack id ('\n', '\t') o pst 301 reachOffsetNoLine o pst = 302 reachOffsetNoLine' splitAtTL TL.foldl' ('\n', '\t') o pst 303 304---------------------------------------------------------------------------- 305-- Helpers 306 307-- | An internal helper state type combining a difference 'String' and an 308-- unboxed 'SourcePos'. 309 310data St = St SourcePos ShowS 311 312-- | A helper definition to facilitate defining 'reachOffset' for various 313-- stream types. 314 315reachOffset' 316 :: forall s. Stream s 317 => (Int -> s -> (Tokens s, s)) 318 -- ^ How to split input stream at given offset 319 -> (forall b. (b -> Token s -> b) -> b -> Tokens s -> b) 320 -- ^ How to fold over input stream 321 -> (Tokens s -> String) 322 -- ^ How to convert chunk of input stream into a 'String' 323 -> (Token s -> Char) 324 -- ^ How to convert a token into a 'Char' 325 -> (Token s, Token s) 326 -- ^ Newline token and tab token 327 -> Int 328 -- ^ Offset to reach 329 -> PosState s 330 -- ^ Initial 'PosState' to use 331 -> (String, PosState s) 332 -- ^ Line at which 'SourcePos' is located, updated 'PosState' 333reachOffset' splitAt' 334 foldl'' 335 fromToks 336 fromTok 337 (newlineTok, tabTok) 338 o 339 PosState {..} = 340 ( case expandTab pstateTabWidth 341 . addPrefix 342 . f 343 . fromToks 344 . fst 345 $ takeWhile_ (/= newlineTok) post of 346 "" -> "<empty line>" 347 xs -> xs 348 , PosState 349 { pstateInput = post 350 , pstateOffset = max pstateOffset o 351 , pstateSourcePos = spos 352 , pstateTabWidth = pstateTabWidth 353 , pstateLinePrefix = 354 if sameLine 355 -- NOTE We don't use difference lists here because it's 356 -- desirable for 'PosState' to be an instance of 'Eq' and 357 -- 'Show'. So we just do appending here. Fortunately several 358 -- parse errors on the same line should be relatively rare. 359 then pstateLinePrefix ++ f "" 360 else f "" 361 } 362 ) 363 where 364 addPrefix xs = 365 if sameLine 366 then pstateLinePrefix ++ xs 367 else xs 368 sameLine = sourceLine spos == sourceLine pstateSourcePos 369 (pre, post) = splitAt' (o - pstateOffset) pstateInput 370 St spos f = foldl'' go (St pstateSourcePos id) pre 371 go (St apos g) ch = 372 let SourcePos n l c = apos 373 c' = unPos c 374 w = unPos pstateTabWidth 375 in if | ch == newlineTok -> 376 St (SourcePos n (l <> pos1) pos1) 377 id 378 | ch == tabTok -> 379 St (SourcePos n l (mkPos $ c' + w - ((c' - 1) `rem` w))) 380 (g . (fromTok ch :)) 381 | otherwise -> 382 St (SourcePos n l (c <> pos1)) 383 (g . (fromTok ch :)) 384{-# INLINE reachOffset' #-} 385 386-- | Like 'reachOffset'' but for 'reachOffsetNoLine'. 387 388reachOffsetNoLine' 389 :: forall s. Stream s 390 => (Int -> s -> (Tokens s, s)) 391 -- ^ How to split input stream at given offset 392 -> (forall b. (b -> Token s -> b) -> b -> Tokens s -> b) 393 -- ^ How to fold over input stream 394 -> (Token s, Token s) 395 -- ^ Newline token and tab token 396 -> Int 397 -- ^ Offset to reach 398 -> PosState s 399 -- ^ Initial 'PosState' to use 400 -> PosState s 401 -- ^ Updated 'PosState' 402reachOffsetNoLine' splitAt' 403 foldl'' 404 (newlineTok, tabTok) 405 o 406 PosState {..} = 407 ( PosState 408 { pstateInput = post 409 , pstateOffset = max pstateOffset o 410 , pstateSourcePos = spos 411 , pstateTabWidth = pstateTabWidth 412 , pstateLinePrefix = pstateLinePrefix 413 } 414 ) 415 where 416 spos = foldl'' go pstateSourcePos pre 417 (pre, post) = splitAt' (o - pstateOffset) pstateInput 418 go (SourcePos n l c) ch = 419 let c' = unPos c 420 w = unPos pstateTabWidth 421 in if | ch == newlineTok -> 422 SourcePos n (l <> pos1) pos1 423 | ch == tabTok -> 424 SourcePos n l (mkPos $ c' + w - ((c' - 1) `rem` w)) 425 | otherwise -> 426 SourcePos n l (c <> pos1) 427{-# INLINE reachOffsetNoLine' #-} 428 429-- | Like 'BL.splitAt' but accepts the index as an 'Int'. 430 431splitAtBL :: Int -> BL.ByteString -> (BL.ByteString, BL.ByteString) 432splitAtBL n = BL.splitAt (fromIntegral n) 433{-# INLINE splitAtBL #-} 434 435-- | Like 'TL.splitAt' but accepts the index as an 'Int'. 436 437splitAtTL :: Int -> TL.Text -> (TL.Text, TL.Text) 438splitAtTL n = TL.splitAt (fromIntegral n) 439{-# INLINE splitAtTL #-} 440 441-- | @stringPretty s@ returns pretty representation of string @s@. This is 442-- used when printing string tokens in error messages. 443 444stringPretty :: NonEmpty Char -> String 445stringPretty (x:|[]) = charPretty x 446stringPretty ('\r':|"\n") = "crlf newline" 447stringPretty xs = "\"" <> concatMap f (NE.toList xs) <> "\"" 448 where 449 f ch = 450 case charPretty' ch of 451 Nothing -> [ch] 452 Just pretty -> "<" <> pretty <> ">" 453 454-- | @charPretty ch@ returns user-friendly string representation of given 455-- character @ch@, suitable for using in error messages. 456 457charPretty :: Char -> String 458charPretty ' ' = "space" 459charPretty ch = fromMaybe ("'" <> [ch] <> "'") (charPretty' ch) 460 461-- | If the given character has a pretty representation, return that, 462-- otherwise 'Nothing'. This is an internal helper. 463 464charPretty' :: Char -> Maybe String 465charPretty' = \case 466 '\NUL' -> Just "null" 467 '\SOH' -> Just "start of heading" 468 '\STX' -> Just "start of text" 469 '\ETX' -> Just "end of text" 470 '\EOT' -> Just "end of transmission" 471 '\ENQ' -> Just "enquiry" 472 '\ACK' -> Just "acknowledge" 473 '\BEL' -> Just "bell" 474 '\BS' -> Just "backspace" 475 '\t' -> Just "tab" 476 '\n' -> Just "newline" 477 '\v' -> Just "vertical tab" 478 '\f' -> Just "form feed" 479 '\r' -> Just "carriage return" 480 '\SO' -> Just "shift out" 481 '\SI' -> Just "shift in" 482 '\DLE' -> Just "data link escape" 483 '\DC1' -> Just "device control one" 484 '\DC2' -> Just "device control two" 485 '\DC3' -> Just "device control three" 486 '\DC4' -> Just "device control four" 487 '\NAK' -> Just "negative acknowledge" 488 '\SYN' -> Just "synchronous idle" 489 '\ETB' -> Just "end of transmission block" 490 '\CAN' -> Just "cancel" 491 '\EM' -> Just "end of medium" 492 '\SUB' -> Just "substitute" 493 '\ESC' -> Just "escape" 494 '\FS' -> Just "file separator" 495 '\GS' -> Just "group separator" 496 '\RS' -> Just "record separator" 497 '\US' -> Just "unit separator" 498 '\DEL' -> Just "delete" 499 '\160' -> Just "non-breaking space" 500 _ -> Nothing 501 502-- | Replace tab characters with given number of spaces. 503 504expandTab 505 :: Pos 506 -> String 507 -> String 508expandTab w' = go 0 509 where 510 go 0 [] = [] 511 go 0 ('\t':xs) = go w xs 512 go 0 (x:xs) = x : go 0 xs 513 go n xs = ' ' : go (n - 1) xs 514 w = unPos w' 515