1{-# LANGUAGE BangPatterns, CPP, GADTs, OverloadedStrings, RankNTypes, 2 RecordWildCards #-} 3-- | 4-- Module : Data.Attoparsec.ByteString.Internal 5-- Copyright : Bryan O'Sullivan 2007-2015 6-- License : BSD3 7-- 8-- Maintainer : bos@serpentine.com 9-- Stability : experimental 10-- Portability : unknown 11-- 12-- Simple, efficient parser combinators for 'ByteString' strings, 13-- loosely based on the Parsec library. 14 15module Data.Attoparsec.ByteString.Internal 16 ( 17 -- * Parser types 18 Parser 19 , Result 20 21 -- * Running parsers 22 , parse 23 , parseOnly 24 25 -- * Combinators 26 , module Data.Attoparsec.Combinator 27 28 -- * Parsing individual bytes 29 , satisfy 30 , satisfyWith 31 , anyWord8 32 , skip 33 , word8 34 , notWord8 35 36 -- ** Lookahead 37 , peekWord8 38 , peekWord8' 39 40 -- ** Byte classes 41 , inClass 42 , notInClass 43 44 -- * Parsing more complicated structures 45 , storable 46 47 -- * Efficient string handling 48 , skipWhile 49 , string 50 , stringCI 51 , take 52 , scan 53 , runScanner 54 , takeWhile 55 , takeWhile1 56 , takeTill 57 58 -- ** Consume all remaining input 59 , takeByteString 60 , takeLazyByteString 61 62 -- * Utilities 63 , endOfLine 64 , endOfInput 65 , match 66 , atEnd 67 ) where 68 69#if !MIN_VERSION_base(4,8,0) 70import Control.Applicative ((<$>)) 71#endif 72import Control.Applicative ((<|>)) 73import Control.Monad (when) 74import Data.Attoparsec.ByteString.Buffer (Buffer, buffer) 75import Data.Attoparsec.ByteString.FastSet (charClass, memberWord8) 76import Data.Attoparsec.Combinator ((<?>)) 77import Data.Attoparsec.Internal 78import Data.Attoparsec.Internal.Fhthagn (inlinePerformIO) 79import Data.Attoparsec.Internal.Types hiding (Parser, Failure, Success) 80import Data.ByteString (ByteString) 81import Data.List (intercalate) 82import Data.Word (Word8) 83import Foreign.ForeignPtr (withForeignPtr) 84import Foreign.Ptr (castPtr, minusPtr, plusPtr) 85import Foreign.Storable (Storable(peek, sizeOf)) 86import Prelude hiding (getChar, succ, take, takeWhile) 87import qualified Data.Attoparsec.ByteString.Buffer as Buf 88import qualified Data.Attoparsec.Internal.Types as T 89import qualified Data.ByteString as B8 90import qualified Data.ByteString.Char8 as B 91import qualified Data.ByteString.Internal as B 92import qualified Data.ByteString.Lazy as L 93import qualified Data.ByteString.Unsafe as B 94 95type Parser = T.Parser ByteString 96type Result = IResult ByteString 97type Failure r = T.Failure ByteString Buffer r 98type Success a r = T.Success ByteString Buffer a r 99 100-- | The parser @satisfy p@ succeeds for any byte for which the 101-- predicate @p@ returns 'True'. Returns the byte that is actually 102-- parsed. 103-- 104-- >digit = satisfy isDigit 105-- > where isDigit w = w >= 48 && w <= 57 106satisfy :: (Word8 -> Bool) -> Parser Word8 107satisfy p = do 108 h <- peekWord8' 109 if p h 110 then advance 1 >> return h 111 else fail "satisfy" 112{-# INLINE satisfy #-} 113 114-- | The parser @skip p@ succeeds for any byte for which the predicate 115-- @p@ returns 'True'. 116-- 117-- >skipDigit = skip isDigit 118-- > where isDigit w = w >= 48 && w <= 57 119skip :: (Word8 -> Bool) -> Parser () 120skip p = do 121 h <- peekWord8' 122 if p h 123 then advance 1 124 else fail "skip" 125 126-- | The parser @satisfyWith f p@ transforms a byte, and succeeds if 127-- the predicate @p@ returns 'True' on the transformed value. The 128-- parser returns the transformed byte that was parsed. 129satisfyWith :: (Word8 -> a) -> (a -> Bool) -> Parser a 130satisfyWith f p = do 131 h <- peekWord8' 132 let c = f h 133 if p c 134 then advance 1 >> return c 135 else fail "satisfyWith" 136{-# INLINE satisfyWith #-} 137 138storable :: Storable a => Parser a 139storable = hack undefined 140 where 141 hack :: Storable b => b -> Parser b 142 hack dummy = do 143 (fp,o,_) <- B.toForeignPtr `fmap` take (sizeOf dummy) 144 return . inlinePerformIO . withForeignPtr fp $ \p -> 145 peek (castPtr $ p `plusPtr` o) 146 147-- | Consume exactly @n@ bytes of input. 148take :: Int -> Parser ByteString 149take n0 = do 150 let n = max n0 0 151 s <- ensure n 152 advance n >> return s 153{-# INLINE take #-} 154 155-- | @string s@ parses a sequence of bytes that identically match 156-- @s@. Returns the parsed string (i.e. @s@). This parser consumes no 157-- input if it fails (even if a partial match). 158-- 159-- /Note/: The behaviour of this parser is different to that of the 160-- similarly-named parser in Parsec, as this one is all-or-nothing. 161-- To illustrate the difference, the following parser will fail under 162-- Parsec given an input of @\"for\"@: 163-- 164-- >string "foo" <|> string "for" 165-- 166-- The reason for its failure is that the first branch is a 167-- partial match, and will consume the letters @\'f\'@ and @\'o\'@ 168-- before failing. In attoparsec, the above parser will /succeed/ on 169-- that input, because the failed first branch will consume nothing. 170string :: ByteString -> Parser ByteString 171string s = string_ (stringSuspended id) id s 172{-# INLINE string #-} 173 174-- ASCII-specific but fast, oh yes. 175toLower :: Word8 -> Word8 176toLower w | w >= 65 && w <= 90 = w + 32 177 | otherwise = w 178 179-- | Satisfy a literal string, ignoring case. 180stringCI :: ByteString -> Parser ByteString 181stringCI s = string_ (stringSuspended lower) lower s 182 where lower = B8.map toLower 183{-# INLINE stringCI #-} 184 185string_ :: (forall r. ByteString -> ByteString -> Buffer -> Pos -> More 186 -> Failure r -> Success ByteString r -> Result r) 187 -> (ByteString -> ByteString) 188 -> ByteString -> Parser ByteString 189string_ suspended f s0 = T.Parser $ \t pos more lose succ -> 190 let n = B.length s 191 s = f s0 192 in if lengthAtLeast pos n t 193 then let t' = substring pos (Pos n) t 194 in if s == f t' 195 then succ t (pos + Pos n) more t' 196 else lose t pos more [] "string" 197 else let t' = Buf.unsafeDrop (fromPos pos) t 198 in if f t' `B.isPrefixOf` s 199 then suspended s (B.drop (B.length t') s) t pos more lose succ 200 else lose t pos more [] "string" 201{-# INLINE string_ #-} 202 203stringSuspended :: (ByteString -> ByteString) 204 -> ByteString -> ByteString -> Buffer -> Pos -> More 205 -> Failure r 206 -> Success ByteString r 207 -> Result r 208stringSuspended f s0 s t pos more lose succ = 209 runParser (demandInput_ >>= go) t pos more lose succ 210 where go s'0 = T.Parser $ \t' pos' more' lose' succ' -> 211 let m = B.length s 212 s' = f s'0 213 n = B.length s' 214 in if n >= m 215 then if B.unsafeTake m s' == s 216 then let o = Pos (B.length s0) 217 in succ' t' (pos' + o) more' 218 (substring pos' o t') 219 else lose' t' pos' more' [] "string" 220 else if s' == B.unsafeTake n s 221 then stringSuspended f s0 (B.unsafeDrop n s) 222 t' pos' more' lose' succ' 223 else lose' t' pos' more' [] "string" 224 225-- | Skip past input for as long as the predicate returns 'True'. 226skipWhile :: (Word8 -> Bool) -> Parser () 227skipWhile p = go 228 where 229 go = do 230 t <- B8.takeWhile p <$> get 231 continue <- inputSpansChunks (B.length t) 232 when continue go 233{-# INLINE skipWhile #-} 234 235-- | Consume input as long as the predicate returns 'False' 236-- (i.e. until it returns 'True'), and return the consumed input. 237-- 238-- This parser does not fail. It will return an empty string if the 239-- predicate returns 'True' on the first byte of input. 240-- 241-- /Note/: Because this parser does not fail, do not use it with 242-- combinators such as 'Control.Applicative.many', because such 243-- parsers loop until a failure occurs. Careless use will thus result 244-- in an infinite loop. 245takeTill :: (Word8 -> Bool) -> Parser ByteString 246takeTill p = takeWhile (not . p) 247{-# INLINE takeTill #-} 248 249-- | Consume input as long as the predicate returns 'True', and return 250-- the consumed input. 251-- 252-- This parser does not fail. It will return an empty string if the 253-- predicate returns 'False' on the first byte of input. 254-- 255-- /Note/: Because this parser does not fail, do not use it with 256-- combinators such as 'Control.Applicative.many', because such 257-- parsers loop until a failure occurs. Careless use will thus result 258-- in an infinite loop. 259takeWhile :: (Word8 -> Bool) -> Parser ByteString 260takeWhile p = do 261 s <- B8.takeWhile p <$> get 262 continue <- inputSpansChunks (B.length s) 263 if continue 264 then takeWhileAcc p [s] 265 else return s 266{-# INLINE takeWhile #-} 267 268takeWhileAcc :: (Word8 -> Bool) -> [ByteString] -> Parser ByteString 269takeWhileAcc p = go 270 where 271 go acc = do 272 s <- B8.takeWhile p <$> get 273 continue <- inputSpansChunks (B.length s) 274 if continue 275 then go (s:acc) 276 else return $ concatReverse (s:acc) 277{-# INLINE takeWhileAcc #-} 278 279takeRest :: Parser [ByteString] 280takeRest = go [] 281 where 282 go acc = do 283 input <- wantInput 284 if input 285 then do 286 s <- get 287 advance (B.length s) 288 go (s:acc) 289 else return (reverse acc) 290 291-- | Consume all remaining input and return it as a single string. 292takeByteString :: Parser ByteString 293takeByteString = B.concat `fmap` takeRest 294 295-- | Consume all remaining input and return it as a single string. 296takeLazyByteString :: Parser L.ByteString 297takeLazyByteString = L.fromChunks `fmap` takeRest 298 299data T s = T {-# UNPACK #-} !Int s 300 301scan_ :: (s -> [ByteString] -> Parser r) -> s -> (s -> Word8 -> Maybe s) 302 -> Parser r 303scan_ f s0 p = go [] s0 304 where 305 go acc s1 = do 306 let scanner (B.PS fp off len) = 307 withForeignPtr fp $ \ptr0 -> do 308 let start = ptr0 `plusPtr` off 309 end = start `plusPtr` len 310 inner ptr !s 311 | ptr < end = do 312 w <- peek ptr 313 case p s w of 314 Just s' -> inner (ptr `plusPtr` 1) s' 315 _ -> done (ptr `minusPtr` start) s 316 | otherwise = done (ptr `minusPtr` start) s 317 done !i !s = return (T i s) 318 inner start s1 319 bs <- get 320 let T i s' = inlinePerformIO $ scanner bs 321 !h = B.unsafeTake i bs 322 continue <- inputSpansChunks i 323 if continue 324 then go (h:acc) s' 325 else f s' (h:acc) 326{-# INLINE scan_ #-} 327 328-- | A stateful scanner. The predicate consumes and transforms a 329-- state argument, and each transformed state is passed to successive 330-- invocations of the predicate on each byte of the input until one 331-- returns 'Nothing' or the input ends. 332-- 333-- This parser does not fail. It will return an empty string if the 334-- predicate returns 'Nothing' on the first byte of input. 335-- 336-- /Note/: Because this parser does not fail, do not use it with 337-- combinators such as 'Control.Applicative.many', because such 338-- parsers loop until a failure occurs. Careless use will thus result 339-- in an infinite loop. 340scan :: s -> (s -> Word8 -> Maybe s) -> Parser ByteString 341scan = scan_ $ \_ chunks -> return $! concatReverse chunks 342{-# INLINE scan #-} 343 344-- | Like 'scan', but generalized to return the final state of the 345-- scanner. 346runScanner :: s -> (s -> Word8 -> Maybe s) -> Parser (ByteString, s) 347runScanner = scan_ $ \s xs -> let !sx = concatReverse xs in return (sx, s) 348{-# INLINE runScanner #-} 349 350-- | Consume input as long as the predicate returns 'True', and return 351-- the consumed input. 352-- 353-- This parser requires the predicate to succeed on at least one byte 354-- of input: it will fail if the predicate never returns 'True' or if 355-- there is no input left. 356takeWhile1 :: (Word8 -> Bool) -> Parser ByteString 357takeWhile1 p = do 358 (`when` demandInput) =<< endOfChunk 359 s <- B8.takeWhile p <$> get 360 let len = B.length s 361 if len == 0 362 then fail "takeWhile1" 363 else do 364 advance len 365 eoc <- endOfChunk 366 if eoc 367 then takeWhileAcc p [s] 368 else return s 369{-# INLINE takeWhile1 #-} 370 371-- | Match any byte in a set. 372-- 373-- >vowel = inClass "aeiou" 374-- 375-- Range notation is supported. 376-- 377-- >halfAlphabet = inClass "a-nA-N" 378-- 379-- To add a literal @\'-\'@ to a set, place it at the beginning or end 380-- of the string. 381inClass :: String -> Word8 -> Bool 382inClass s = (`memberWord8` mySet) 383 where mySet = charClass s 384 {-# NOINLINE mySet #-} 385{-# INLINE inClass #-} 386 387-- | Match any byte not in a set. 388notInClass :: String -> Word8 -> Bool 389notInClass s = not . inClass s 390{-# INLINE notInClass #-} 391 392-- | Match any byte. 393anyWord8 :: Parser Word8 394anyWord8 = satisfy $ const True 395{-# INLINE anyWord8 #-} 396 397-- | Match a specific byte. 398word8 :: Word8 -> Parser Word8 399word8 c = satisfy (== c) <?> show c 400{-# INLINE word8 #-} 401 402-- | Match any byte except the given one. 403notWord8 :: Word8 -> Parser Word8 404notWord8 c = satisfy (/= c) <?> "not " ++ show c 405{-# INLINE notWord8 #-} 406 407-- | Match any byte, to perform lookahead. Returns 'Nothing' if end of 408-- input has been reached. Does not consume any input. 409-- 410-- /Note/: Because this parser does not fail, do not use it with 411-- combinators such as 'Control.Applicative.many', because such 412-- parsers loop until a failure occurs. Careless use will thus result 413-- in an infinite loop. 414peekWord8 :: Parser (Maybe Word8) 415peekWord8 = T.Parser $ \t pos@(Pos pos_) more _lose succ -> 416 case () of 417 _| pos_ < Buf.length t -> 418 let !w = Buf.unsafeIndex t pos_ 419 in succ t pos more (Just w) 420 | more == Complete -> 421 succ t pos more Nothing 422 | otherwise -> 423 let succ' t' pos' more' = let !w = Buf.unsafeIndex t' pos_ 424 in succ t' pos' more' (Just w) 425 lose' t' pos' more' = succ t' pos' more' Nothing 426 in prompt t pos more lose' succ' 427{-# INLINE peekWord8 #-} 428 429-- | Match any byte, to perform lookahead. Does not consume any 430-- input, but will fail if end of input has been reached. 431peekWord8' :: Parser Word8 432peekWord8' = T.Parser $ \t pos more lose succ -> 433 if lengthAtLeast pos 1 t 434 then succ t pos more (Buf.unsafeIndex t (fromPos pos)) 435 else let succ' t' pos' more' bs' = succ t' pos' more' $! B.unsafeHead bs' 436 in ensureSuspended 1 t pos more lose succ' 437{-# INLINE peekWord8' #-} 438 439-- | Match either a single newline character @\'\\n\'@, or a carriage 440-- return followed by a newline character @\"\\r\\n\"@. 441endOfLine :: Parser () 442endOfLine = (word8 10 >> return ()) <|> (string "\r\n" >> return ()) 443 444-- | Terminal failure continuation. 445failK :: Failure a 446failK t (Pos pos) _more stack msg = Fail (Buf.unsafeDrop pos t) stack msg 447{-# INLINE failK #-} 448 449-- | Terminal success continuation. 450successK :: Success a a 451successK t (Pos pos) _more a = Done (Buf.unsafeDrop pos t) a 452{-# INLINE successK #-} 453 454-- | Run a parser. 455parse :: Parser a -> ByteString -> Result a 456parse m s = T.runParser m (buffer s) (Pos 0) Incomplete failK successK 457{-# INLINE parse #-} 458 459-- | Run a parser that cannot be resupplied via a 'Partial' result. 460-- 461-- This function does not force a parser to consume all of its input. 462-- Instead, any residual input will be discarded. To force a parser 463-- to consume all of its input, use something like this: 464-- 465-- @ 466--'parseOnly' (myParser 'Control.Applicative.<*' 'endOfInput') 467-- @ 468parseOnly :: Parser a -> ByteString -> Either String a 469parseOnly m s = case T.runParser m (buffer s) (Pos 0) Complete failK successK of 470 Fail _ [] err -> Left err 471 Fail _ ctxs err -> Left (intercalate " > " ctxs ++ ": " ++ err) 472 Done _ a -> Right a 473 _ -> error "parseOnly: impossible error!" 474{-# INLINE parseOnly #-} 475 476get :: Parser ByteString 477get = T.Parser $ \t pos more _lose succ -> 478 succ t pos more (Buf.unsafeDrop (fromPos pos) t) 479{-# INLINE get #-} 480 481endOfChunk :: Parser Bool 482endOfChunk = T.Parser $ \t pos more _lose succ -> 483 succ t pos more (fromPos pos == Buf.length t) 484{-# INLINE endOfChunk #-} 485 486inputSpansChunks :: Int -> Parser Bool 487inputSpansChunks i = T.Parser $ \t pos_ more _lose succ -> 488 let pos = pos_ + Pos i 489 in if fromPos pos < Buf.length t || more == Complete 490 then succ t pos more False 491 else let lose' t' pos' more' = succ t' pos' more' False 492 succ' t' pos' more' = succ t' pos' more' True 493 in prompt t pos more lose' succ' 494{-# INLINE inputSpansChunks #-} 495 496advance :: Int -> Parser () 497advance n = T.Parser $ \t pos more _lose succ -> 498 succ t (pos + Pos n) more () 499{-# INLINE advance #-} 500 501ensureSuspended :: Int -> Buffer -> Pos -> More 502 -> Failure r 503 -> Success ByteString r 504 -> Result r 505ensureSuspended n t pos more lose succ = 506 runParser (demandInput >> go) t pos more lose succ 507 where go = T.Parser $ \t' pos' more' lose' succ' -> 508 if lengthAtLeast pos' n t' 509 then succ' t' pos' more' (substring pos (Pos n) t') 510 else runParser (demandInput >> go) t' pos' more' lose' succ' 511 512-- | If at least @n@ elements of input are available, return the 513-- current input, otherwise fail. 514ensure :: Int -> Parser ByteString 515ensure n = T.Parser $ \t pos more lose succ -> 516 if lengthAtLeast pos n t 517 then succ t pos more (substring pos (Pos n) t) 518 -- The uncommon case is kept out-of-line to reduce code size: 519 else ensureSuspended n t pos more lose succ 520{-# INLINE ensure #-} 521 522-- | Return both the result of a parse and the portion of the input 523-- that was consumed while it was being parsed. 524match :: Parser a -> Parser (ByteString, a) 525match p = T.Parser $ \t pos more lose succ -> 526 let succ' t' pos' more' a = 527 succ t' pos' more' (substring pos (pos'-pos) t', a) 528 in runParser p t pos more lose succ' 529 530lengthAtLeast :: Pos -> Int -> Buffer -> Bool 531lengthAtLeast (Pos pos) n bs = Buf.length bs >= pos + n 532{-# INLINE lengthAtLeast #-} 533 534substring :: Pos -> Pos -> Buffer -> ByteString 535substring (Pos pos) (Pos n) = Buf.substring pos n 536{-# INLINE substring #-} 537