1{-# LANGUAGE BangPatterns, CPP, GADTs, OverloadedStrings, RankNTypes,
2    RecordWildCards #-}
3-- |
4-- Module      :  Data.Attoparsec.ByteString.Internal
5-- Copyright   :  Bryan O'Sullivan 2007-2015
6-- License     :  BSD3
7--
8-- Maintainer  :  bos@serpentine.com
9-- Stability   :  experimental
10-- Portability :  unknown
11--
12-- Simple, efficient parser combinators for 'ByteString' strings,
13-- loosely based on the Parsec library.
14
15module Data.Attoparsec.ByteString.Internal
16    (
17    -- * Parser types
18      Parser
19    , Result
20
21    -- * Running parsers
22    , parse
23    , parseOnly
24
25    -- * Combinators
26    , module Data.Attoparsec.Combinator
27
28    -- * Parsing individual bytes
29    , satisfy
30    , satisfyWith
31    , anyWord8
32    , skip
33    , word8
34    , notWord8
35
36    -- ** Lookahead
37    , peekWord8
38    , peekWord8'
39
40    -- ** Byte classes
41    , inClass
42    , notInClass
43
44    -- * Parsing more complicated structures
45    , storable
46
47    -- * Efficient string handling
48    , skipWhile
49    , string
50    , stringCI
51    , take
52    , scan
53    , runScanner
54    , takeWhile
55    , takeWhile1
56    , takeTill
57
58    -- ** Consume all remaining input
59    , takeByteString
60    , takeLazyByteString
61
62    -- * Utilities
63    , endOfLine
64    , endOfInput
65    , match
66    , atEnd
67    ) where
68
69#if !MIN_VERSION_base(4,8,0)
70import Control.Applicative ((<$>))
71#endif
72import Control.Applicative ((<|>))
73import Control.Monad (when)
74import Data.Attoparsec.ByteString.Buffer (Buffer, buffer)
75import Data.Attoparsec.ByteString.FastSet (charClass, memberWord8)
76import Data.Attoparsec.Combinator ((<?>))
77import Data.Attoparsec.Internal
78import Data.Attoparsec.Internal.Fhthagn (inlinePerformIO)
79import Data.Attoparsec.Internal.Types hiding (Parser, Failure, Success)
80import Data.ByteString (ByteString)
81import Data.List (intercalate)
82import Data.Word (Word8)
83import Foreign.ForeignPtr (withForeignPtr)
84import Foreign.Ptr (castPtr, minusPtr, plusPtr)
85import Foreign.Storable (Storable(peek, sizeOf))
86import Prelude hiding (getChar, succ, take, takeWhile)
87import qualified Data.Attoparsec.ByteString.Buffer as Buf
88import qualified Data.Attoparsec.Internal.Types as T
89import qualified Data.ByteString as B8
90import qualified Data.ByteString.Char8 as B
91import qualified Data.ByteString.Internal as B
92import qualified Data.ByteString.Lazy as L
93import qualified Data.ByteString.Unsafe as B
94
95type Parser = T.Parser ByteString
96type Result = IResult ByteString
97type Failure r = T.Failure ByteString Buffer r
98type Success a r = T.Success ByteString Buffer a r
99
100-- | The parser @satisfy p@ succeeds for any byte for which the
101-- predicate @p@ returns 'True'. Returns the byte that is actually
102-- parsed.
103--
104-- >digit = satisfy isDigit
105-- >    where isDigit w = w >= 48 && w <= 57
106satisfy :: (Word8 -> Bool) -> Parser Word8
107satisfy p = do
108  h <- peekWord8'
109  if p h
110    then advance 1 >> return h
111    else fail "satisfy"
112{-# INLINE satisfy #-}
113
114-- | The parser @skip p@ succeeds for any byte for which the predicate
115-- @p@ returns 'True'.
116--
117-- >skipDigit = skip isDigit
118-- >    where isDigit w = w >= 48 && w <= 57
119skip :: (Word8 -> Bool) -> Parser ()
120skip p = do
121  h <- peekWord8'
122  if p h
123    then advance 1
124    else fail "skip"
125
126-- | The parser @satisfyWith f p@ transforms a byte, and succeeds if
127-- the predicate @p@ returns 'True' on the transformed value. The
128-- parser returns the transformed byte that was parsed.
129satisfyWith :: (Word8 -> a) -> (a -> Bool) -> Parser a
130satisfyWith f p = do
131  h <- peekWord8'
132  let c = f h
133  if p c
134    then advance 1 >> return c
135    else fail "satisfyWith"
136{-# INLINE satisfyWith #-}
137
138storable :: Storable a => Parser a
139storable = hack undefined
140 where
141  hack :: Storable b => b -> Parser b
142  hack dummy = do
143    (fp,o,_) <- B.toForeignPtr `fmap` take (sizeOf dummy)
144    return . inlinePerformIO . withForeignPtr fp $ \p ->
145        peek (castPtr $ p `plusPtr` o)
146
147-- | Consume exactly @n@ bytes of input.
148take :: Int -> Parser ByteString
149take n0 = do
150  let n = max n0 0
151  s <- ensure n
152  advance n >> return s
153{-# INLINE take #-}
154
155-- | @string s@ parses a sequence of bytes that identically match
156-- @s@. Returns the parsed string (i.e. @s@).  This parser consumes no
157-- input if it fails (even if a partial match).
158--
159-- /Note/: The behaviour of this parser is different to that of the
160-- similarly-named parser in Parsec, as this one is all-or-nothing.
161-- To illustrate the difference, the following parser will fail under
162-- Parsec given an input of @\"for\"@:
163--
164-- >string "foo" <|> string "for"
165--
166-- The reason for its failure is that the first branch is a
167-- partial match, and will consume the letters @\'f\'@ and @\'o\'@
168-- before failing.  In attoparsec, the above parser will /succeed/ on
169-- that input, because the failed first branch will consume nothing.
170string :: ByteString -> Parser ByteString
171string s = string_ (stringSuspended id) id s
172{-# INLINE string #-}
173
174-- ASCII-specific but fast, oh yes.
175toLower :: Word8 -> Word8
176toLower w | w >= 65 && w <= 90 = w + 32
177          | otherwise          = w
178
179-- | Satisfy a literal string, ignoring case.
180stringCI :: ByteString -> Parser ByteString
181stringCI s = string_ (stringSuspended lower) lower s
182  where lower = B8.map toLower
183{-# INLINE stringCI #-}
184
185string_ :: (forall r. ByteString -> ByteString -> Buffer -> Pos -> More
186            -> Failure r -> Success ByteString r -> Result r)
187        -> (ByteString -> ByteString)
188        -> ByteString -> Parser ByteString
189string_ suspended f s0 = T.Parser $ \t pos more lose succ ->
190  let n = B.length s
191      s = f s0
192  in if lengthAtLeast pos n t
193     then let t' = substring pos (Pos n) t
194          in if s == f t'
195             then succ t (pos + Pos n) more t'
196             else lose t pos more [] "string"
197     else let t' = Buf.unsafeDrop (fromPos pos) t
198          in if f t' `B.isPrefixOf` s
199             then suspended s (B.drop (B.length t') s) t pos more lose succ
200             else lose t pos more [] "string"
201{-# INLINE string_ #-}
202
203stringSuspended :: (ByteString -> ByteString)
204                -> ByteString -> ByteString -> Buffer -> Pos -> More
205                -> Failure r
206                -> Success ByteString r
207                -> Result r
208stringSuspended f s0 s t pos more lose succ =
209    runParser (demandInput_ >>= go) t pos more lose succ
210  where go s'0   = T.Parser $ \t' pos' more' lose' succ' ->
211          let m  = B.length s
212              s' = f s'0
213              n  = B.length s'
214          in if n >= m
215             then if B.unsafeTake m s' == s
216                  then let o = Pos (B.length s0)
217                       in succ' t' (pos' + o) more'
218                          (substring pos' o t')
219                  else lose' t' pos' more' [] "string"
220             else if s' == B.unsafeTake n s
221                  then stringSuspended f s0 (B.unsafeDrop n s)
222                       t' pos' more' lose' succ'
223                  else lose' t' pos' more' [] "string"
224
225-- | Skip past input for as long as the predicate returns 'True'.
226skipWhile :: (Word8 -> Bool) -> Parser ()
227skipWhile p = go
228 where
229  go = do
230    t <- B8.takeWhile p <$> get
231    continue <- inputSpansChunks (B.length t)
232    when continue go
233{-# INLINE skipWhile #-}
234
235-- | Consume input as long as the predicate returns 'False'
236-- (i.e. until it returns 'True'), and return the consumed input.
237--
238-- This parser does not fail.  It will return an empty string if the
239-- predicate returns 'True' on the first byte of input.
240--
241-- /Note/: Because this parser does not fail, do not use it with
242-- combinators such as 'Control.Applicative.many', because such
243-- parsers loop until a failure occurs.  Careless use will thus result
244-- in an infinite loop.
245takeTill :: (Word8 -> Bool) -> Parser ByteString
246takeTill p = takeWhile (not . p)
247{-# INLINE takeTill #-}
248
249-- | Consume input as long as the predicate returns 'True', and return
250-- the consumed input.
251--
252-- This parser does not fail.  It will return an empty string if the
253-- predicate returns 'False' on the first byte of input.
254--
255-- /Note/: Because this parser does not fail, do not use it with
256-- combinators such as 'Control.Applicative.many', because such
257-- parsers loop until a failure occurs.  Careless use will thus result
258-- in an infinite loop.
259takeWhile :: (Word8 -> Bool) -> Parser ByteString
260takeWhile p = do
261    s <- B8.takeWhile p <$> get
262    continue <- inputSpansChunks (B.length s)
263    if continue
264      then takeWhileAcc p [s]
265      else return s
266{-# INLINE takeWhile #-}
267
268takeWhileAcc :: (Word8 -> Bool) -> [ByteString] -> Parser ByteString
269takeWhileAcc p = go
270 where
271  go acc = do
272    s <- B8.takeWhile p <$> get
273    continue <- inputSpansChunks (B.length s)
274    if continue
275      then go (s:acc)
276      else return $ concatReverse (s:acc)
277{-# INLINE takeWhileAcc #-}
278
279takeRest :: Parser [ByteString]
280takeRest = go []
281 where
282  go acc = do
283    input <- wantInput
284    if input
285      then do
286        s <- get
287        advance (B.length s)
288        go (s:acc)
289      else return (reverse acc)
290
291-- | Consume all remaining input and return it as a single string.
292takeByteString :: Parser ByteString
293takeByteString = B.concat `fmap` takeRest
294
295-- | Consume all remaining input and return it as a single string.
296takeLazyByteString :: Parser L.ByteString
297takeLazyByteString = L.fromChunks `fmap` takeRest
298
299data T s = T {-# UNPACK #-} !Int s
300
301scan_ :: (s -> [ByteString] -> Parser r) -> s -> (s -> Word8 -> Maybe s)
302         -> Parser r
303scan_ f s0 p = go [] s0
304 where
305  go acc s1 = do
306    let scanner (B.PS fp off len) =
307          withForeignPtr fp $ \ptr0 -> do
308            let start = ptr0 `plusPtr` off
309                end   = start `plusPtr` len
310                inner ptr !s
311                  | ptr < end = do
312                    w <- peek ptr
313                    case p s w of
314                      Just s' -> inner (ptr `plusPtr` 1) s'
315                      _       -> done (ptr `minusPtr` start) s
316                  | otherwise = done (ptr `minusPtr` start) s
317                done !i !s = return (T i s)
318            inner start s1
319    bs <- get
320    let T i s' = inlinePerformIO $ scanner bs
321        !h = B.unsafeTake i bs
322    continue <- inputSpansChunks i
323    if continue
324      then go (h:acc) s'
325      else f s' (h:acc)
326{-# INLINE scan_ #-}
327
328-- | A stateful scanner.  The predicate consumes and transforms a
329-- state argument, and each transformed state is passed to successive
330-- invocations of the predicate on each byte of the input until one
331-- returns 'Nothing' or the input ends.
332--
333-- This parser does not fail.  It will return an empty string if the
334-- predicate returns 'Nothing' on the first byte of input.
335--
336-- /Note/: Because this parser does not fail, do not use it with
337-- combinators such as 'Control.Applicative.many', because such
338-- parsers loop until a failure occurs.  Careless use will thus result
339-- in an infinite loop.
340scan :: s -> (s -> Word8 -> Maybe s) -> Parser ByteString
341scan = scan_ $ \_ chunks -> return $! concatReverse chunks
342{-# INLINE scan #-}
343
344-- | Like 'scan', but generalized to return the final state of the
345-- scanner.
346runScanner :: s -> (s -> Word8 -> Maybe s) -> Parser (ByteString, s)
347runScanner = scan_ $ \s xs -> let !sx = concatReverse xs in return (sx, s)
348{-# INLINE runScanner #-}
349
350-- | Consume input as long as the predicate returns 'True', and return
351-- the consumed input.
352--
353-- This parser requires the predicate to succeed on at least one byte
354-- of input: it will fail if the predicate never returns 'True' or if
355-- there is no input left.
356takeWhile1 :: (Word8 -> Bool) -> Parser ByteString
357takeWhile1 p = do
358  (`when` demandInput) =<< endOfChunk
359  s <- B8.takeWhile p <$> get
360  let len = B.length s
361  if len == 0
362    then fail "takeWhile1"
363    else do
364      advance len
365      eoc <- endOfChunk
366      if eoc
367        then takeWhileAcc p [s]
368        else return s
369{-# INLINE takeWhile1 #-}
370
371-- | Match any byte in a set.
372--
373-- >vowel = inClass "aeiou"
374--
375-- Range notation is supported.
376--
377-- >halfAlphabet = inClass "a-nA-N"
378--
379-- To add a literal @\'-\'@ to a set, place it at the beginning or end
380-- of the string.
381inClass :: String -> Word8 -> Bool
382inClass s = (`memberWord8` mySet)
383    where mySet = charClass s
384          {-# NOINLINE mySet #-}
385{-# INLINE inClass #-}
386
387-- | Match any byte not in a set.
388notInClass :: String -> Word8 -> Bool
389notInClass s = not . inClass s
390{-# INLINE notInClass #-}
391
392-- | Match any byte.
393anyWord8 :: Parser Word8
394anyWord8 = satisfy $ const True
395{-# INLINE anyWord8 #-}
396
397-- | Match a specific byte.
398word8 :: Word8 -> Parser Word8
399word8 c = satisfy (== c) <?> show c
400{-# INLINE word8 #-}
401
402-- | Match any byte except the given one.
403notWord8 :: Word8 -> Parser Word8
404notWord8 c = satisfy (/= c) <?> "not " ++ show c
405{-# INLINE notWord8 #-}
406
407-- | Match any byte, to perform lookahead. Returns 'Nothing' if end of
408-- input has been reached. Does not consume any input.
409--
410-- /Note/: Because this parser does not fail, do not use it with
411-- combinators such as 'Control.Applicative.many', because such
412-- parsers loop until a failure occurs.  Careless use will thus result
413-- in an infinite loop.
414peekWord8 :: Parser (Maybe Word8)
415peekWord8 = T.Parser $ \t pos@(Pos pos_) more _lose succ ->
416  case () of
417    _| pos_ < Buf.length t ->
418       let !w = Buf.unsafeIndex t pos_
419       in succ t pos more (Just w)
420     | more == Complete ->
421       succ t pos more Nothing
422     | otherwise ->
423       let succ' t' pos' more' = let !w = Buf.unsafeIndex t' pos_
424                                 in succ t' pos' more' (Just w)
425           lose' t' pos' more' = succ t' pos' more' Nothing
426       in prompt t pos more lose' succ'
427{-# INLINE peekWord8 #-}
428
429-- | Match any byte, to perform lookahead.  Does not consume any
430-- input, but will fail if end of input has been reached.
431peekWord8' :: Parser Word8
432peekWord8' = T.Parser $ \t pos more lose succ ->
433    if lengthAtLeast pos 1 t
434    then succ t pos more (Buf.unsafeIndex t (fromPos pos))
435    else let succ' t' pos' more' bs' = succ t' pos' more' $! B.unsafeHead bs'
436         in ensureSuspended 1 t pos more lose succ'
437{-# INLINE peekWord8' #-}
438
439-- | Match either a single newline character @\'\\n\'@, or a carriage
440-- return followed by a newline character @\"\\r\\n\"@.
441endOfLine :: Parser ()
442endOfLine = (word8 10 >> return ()) <|> (string "\r\n" >> return ())
443
444-- | Terminal failure continuation.
445failK :: Failure a
446failK t (Pos pos) _more stack msg = Fail (Buf.unsafeDrop pos t) stack msg
447{-# INLINE failK #-}
448
449-- | Terminal success continuation.
450successK :: Success a a
451successK t (Pos pos) _more a = Done (Buf.unsafeDrop pos t) a
452{-# INLINE successK #-}
453
454-- | Run a parser.
455parse :: Parser a -> ByteString -> Result a
456parse m s = T.runParser m (buffer s) (Pos 0) Incomplete failK successK
457{-# INLINE parse #-}
458
459-- | Run a parser that cannot be resupplied via a 'Partial' result.
460--
461-- This function does not force a parser to consume all of its input.
462-- Instead, any residual input will be discarded.  To force a parser
463-- to consume all of its input, use something like this:
464--
465-- @
466--'parseOnly' (myParser 'Control.Applicative.<*' 'endOfInput')
467-- @
468parseOnly :: Parser a -> ByteString -> Either String a
469parseOnly m s = case T.runParser m (buffer s) (Pos 0) Complete failK successK of
470                  Fail _ [] err   -> Left err
471                  Fail _ ctxs err -> Left (intercalate " > " ctxs ++ ": " ++ err)
472                  Done _ a        -> Right a
473                  _               -> error "parseOnly: impossible error!"
474{-# INLINE parseOnly #-}
475
476get :: Parser ByteString
477get = T.Parser $ \t pos more _lose succ ->
478  succ t pos more (Buf.unsafeDrop (fromPos pos) t)
479{-# INLINE get #-}
480
481endOfChunk :: Parser Bool
482endOfChunk = T.Parser $ \t pos more _lose succ ->
483  succ t pos more (fromPos pos == Buf.length t)
484{-# INLINE endOfChunk #-}
485
486inputSpansChunks :: Int -> Parser Bool
487inputSpansChunks i = T.Parser $ \t pos_ more _lose succ ->
488  let pos = pos_ + Pos i
489  in if fromPos pos < Buf.length t || more == Complete
490     then succ t pos more False
491     else let lose' t' pos' more' = succ t' pos' more' False
492              succ' t' pos' more' = succ t' pos' more' True
493          in prompt t pos more lose' succ'
494{-# INLINE inputSpansChunks #-}
495
496advance :: Int -> Parser ()
497advance n = T.Parser $ \t pos more _lose succ ->
498  succ t (pos + Pos n) more ()
499{-# INLINE advance #-}
500
501ensureSuspended :: Int -> Buffer -> Pos -> More
502                -> Failure r
503                -> Success ByteString r
504                -> Result r
505ensureSuspended n t pos more lose succ =
506    runParser (demandInput >> go) t pos more lose succ
507  where go = T.Parser $ \t' pos' more' lose' succ' ->
508          if lengthAtLeast pos' n t'
509          then succ' t' pos' more' (substring pos (Pos n) t')
510          else runParser (demandInput >> go) t' pos' more' lose' succ'
511
512-- | If at least @n@ elements of input are available, return the
513-- current input, otherwise fail.
514ensure :: Int -> Parser ByteString
515ensure n = T.Parser $ \t pos more lose succ ->
516    if lengthAtLeast pos n t
517    then succ t pos more (substring pos (Pos n) t)
518    -- The uncommon case is kept out-of-line to reduce code size:
519    else ensureSuspended n t pos more lose succ
520{-# INLINE ensure #-}
521
522-- | Return both the result of a parse and the portion of the input
523-- that was consumed while it was being parsed.
524match :: Parser a -> Parser (ByteString, a)
525match p = T.Parser $ \t pos more lose succ ->
526  let succ' t' pos' more' a =
527        succ t' pos' more' (substring pos (pos'-pos) t', a)
528  in runParser p t pos more lose succ'
529
530lengthAtLeast :: Pos -> Int -> Buffer -> Bool
531lengthAtLeast (Pos pos) n bs = Buf.length bs >= pos + n
532{-# INLINE lengthAtLeast #-}
533
534substring :: Pos -> Pos -> Buffer -> ByteString
535substring (Pos pos) (Pos n) = Buf.substring pos n
536{-# INLINE substring #-}
537