1{-# LANGUAGE BangPatterns, FlexibleInstances, GADTs, OverloadedStrings,
2    Rank2Types, RecordWildCards, TypeFamilies, TypeSynonymInstances #-}
3{-# OPTIONS_GHC -fno-warn-orphans #-}
4-- |
5-- Module      :  Data.Attoparsec.Text.Internal
6-- Copyright   :  Bryan O'Sullivan 2007-2015
7-- License     :  BSD3
8--
9-- Maintainer  :  bos@serpentine.com
10-- Stability   :  experimental
11-- Portability :  unknown
12--
13-- Simple, efficient parser combinators for 'Text' strings, loosely
14-- based on the Parsec library.
15
16module Data.Attoparsec.Text.Internal
17    (
18    -- * Parser types
19      Parser
20    , Result
21
22    -- * Running parsers
23    , parse
24    , parseOnly
25
26    -- * Combinators
27    , module Data.Attoparsec.Combinator
28
29    -- * Parsing individual characters
30    , satisfy
31    , satisfyWith
32    , anyChar
33    , skip
34    , char
35    , notChar
36
37    -- ** Lookahead
38    , peekChar
39    , peekChar'
40
41    -- ** Character classes
42    , inClass
43    , notInClass
44
45    -- * Efficient string handling
46    , skipWhile
47    , string
48    , stringCI
49    , asciiCI
50    , take
51    , scan
52    , runScanner
53    , takeWhile
54    , takeWhile1
55    , takeTill
56
57    -- ** Consume all remaining input
58    , takeText
59    , takeLazyText
60
61    -- * Utilities
62    , endOfLine
63    , endOfInput
64    , match
65    , atEnd
66    ) where
67
68import Control.Applicative ((<|>), (<$>), pure, (*>))
69import Control.Monad (when)
70import Data.Attoparsec.Combinator ((<?>))
71import Data.Attoparsec.Internal
72import Data.Attoparsec.Internal.Types hiding (Parser, Failure, Success)
73import qualified Data.Attoparsec.Text.Buffer as Buf
74import Data.Attoparsec.Text.Buffer (Buffer, buffer)
75import Data.Char (isAsciiUpper, isAsciiLower, toUpper, toLower)
76import Data.List (intercalate)
77import Data.String (IsString(..))
78import Data.Text.Internal (Text(..))
79import Prelude hiding (getChar, succ, take, takeWhile)
80import qualified Data.Attoparsec.Internal.Types as T
81import qualified Data.Attoparsec.Text.FastSet as Set
82import qualified Data.Text as T
83import qualified Data.Text.Lazy as L
84import qualified Data.Text.Unsafe as T
85
86type Parser = T.Parser Text
87type Result = IResult Text
88type Failure r = T.Failure Text Buffer r
89type Success a r = T.Success Text Buffer a r
90
91instance (a ~ Text) => IsString (Parser a) where
92    fromString = string . T.pack
93
94-- | The parser @satisfy p@ succeeds for any character for which the
95-- predicate @p@ returns 'True'. Returns the character that is
96-- actually parsed.
97--
98-- >digit = satisfy isDigit
99-- >    where isDigit c = c >= '0' && c <= '9'
100satisfy :: (Char -> Bool) -> Parser Char
101satisfy p = do
102  (k,c) <- ensure 1
103  let !h = T.unsafeHead c
104  if p h
105    then advance k >> return h
106    else fail "satisfy"
107{-# INLINE satisfy #-}
108
109-- | The parser @skip p@ succeeds for any character for which the
110-- predicate @p@ returns 'True'.
111--
112-- >skipDigit = skip isDigit
113-- >    where isDigit c = c >= '0' && c <= '9'
114skip :: (Char -> Bool) -> Parser ()
115skip p = do
116  (k,s) <- ensure 1
117  if p (T.unsafeHead s)
118    then advance k
119    else fail "skip"
120
121-- | The parser @satisfyWith f p@ transforms a character, and succeeds
122-- if the predicate @p@ returns 'True' on the transformed value. The
123-- parser returns the transformed character that was parsed.
124satisfyWith :: (Char -> a) -> (a -> Bool) -> Parser a
125satisfyWith f p = do
126  (k,s) <- ensure 1
127  let c = f $! T.unsafeHead s
128  if p c
129    then advance k >> return c
130    else fail "satisfyWith"
131{-# INLINE satisfyWith #-}
132
133-- | Consume @n@ characters of input, but succeed only if the
134-- predicate returns 'True'.
135takeWith :: Int -> (Text -> Bool) -> Parser Text
136takeWith n p = do
137  (k,s) <- ensure n
138  if p s
139    then advance k >> return s
140    else fail "takeWith"
141
142-- | Consume exactly @n@ characters of input.
143take :: Int -> Parser Text
144take n = takeWith (max n 0) (const True)
145{-# INLINE take #-}
146
147-- | @string s@ parses a sequence of characters that identically match
148-- @s@. Returns the parsed string (i.e. @s@).  This parser consumes no
149-- input if it fails (even if a partial match).
150--
151-- /Note/: The behaviour of this parser is different to that of the
152-- similarly-named parser in Parsec, as this one is all-or-nothing.
153-- To illustrate the difference, the following parser will fail under
154-- Parsec given an input of @\"for\"@:
155--
156-- >string "foo" <|> string "for"
157--
158-- The reason for its failure is that the first branch is a
159-- partial match, and will consume the letters @\'f\'@ and @\'o\'@
160-- before failing.  In attoparsec, the above parser will /succeed/ on
161-- that input, because the failed first branch will consume nothing.
162string :: Text -> Parser Text
163string s = string_ (stringSuspended id) id s
164{-# INLINE string #-}
165
166string_ :: (forall r. Text -> Text -> Buffer -> Pos -> More
167            -> Failure r -> Success Text r -> Result r)
168        -> (Text -> Text)
169        -> Text -> Parser Text
170string_ suspended f s0 = T.Parser $ \t pos more lose succ ->
171  let s  = f s0
172      ft = f (Buf.unbufferAt (fromPos pos) t)
173  in case T.commonPrefixes s ft of
174       Nothing
175         | T.null s          -> succ t pos more T.empty
176         | T.null ft         -> suspended s s t pos more lose succ
177         | otherwise         -> lose t pos more [] "string"
178       Just (pfx,ssfx,tsfx)
179         | T.null ssfx       -> let l = Pos (T.lengthWord16 pfx)
180                                in succ t (pos + l) more (substring pos l t)
181         | not (T.null tsfx) -> lose t pos more [] "string"
182         | otherwise         -> suspended s ssfx t pos more lose succ
183{-# INLINE string_ #-}
184
185stringSuspended :: (Text -> Text)
186                -> Text -> Text -> Buffer -> Pos -> More
187                -> Failure r
188                -> Success Text r
189                -> Result r
190stringSuspended f s000 s0 t0 pos0 more0 lose0 succ0 =
191    runParser (demandInput_ >>= go) t0 pos0 more0 lose0 succ0
192  where
193    go s' = T.Parser $ \t pos more lose succ ->
194      let s = f s'
195      in case T.commonPrefixes s0 s of
196        Nothing         -> lose t pos more [] "string"
197        Just (_pfx,ssfx,tsfx)
198          | T.null ssfx -> let l = Pos (T.lengthWord16 s000)
199                           in succ t (pos + l) more (substring pos l t)
200          | T.null tsfx -> stringSuspended f s000 ssfx t pos more lose succ
201          | otherwise   -> lose t pos more [] "string"
202
203-- | Satisfy a literal string, ignoring case.
204--
205-- Note: this function is currently quite inefficient. Unicode case
206-- folding can change the length of a string (\"&#223;\" becomes
207-- "ss"), which makes a simple, efficient implementation tricky.  We
208-- have (for now) chosen simplicity over efficiency.
209stringCI :: Text -> Parser Text
210stringCI s = go 0
211  where
212    go !n
213      | n > T.length fs = fail "stringCI"
214      | otherwise = do
215      (k,t) <- ensure n
216      if T.toCaseFold t == fs
217        then advance k >> return t
218        else go (n+1)
219    fs = T.toCaseFold s
220{-# INLINE stringCI #-}
221{-# DEPRECATED stringCI "this is very inefficient, use asciiCI instead" #-}
222
223-- | Satisfy a literal string, ignoring case for characters in the ASCII range.
224asciiCI :: Text -> Parser Text
225asciiCI s = fmap fst $ match $ T.foldr ((*>) . asciiCharCI) (pure ()) s
226{-# INLINE asciiCI #-}
227
228asciiCharCI :: Char -> Parser Char
229asciiCharCI c
230  | isAsciiUpper c = char c <|> char (toLower c)
231  | isAsciiLower c = char c <|> char (toUpper c)
232  | otherwise = char c
233{-# INLINE asciiCharCI #-}
234
235-- | Skip past input for as long as the predicate returns 'True'.
236skipWhile :: (Char -> Bool) -> Parser ()
237skipWhile p = go
238 where
239  go = do
240    t <- T.takeWhile p <$> get
241    continue <- inputSpansChunks (size t)
242    when continue go
243{-# INLINE skipWhile #-}
244
245-- | Consume input as long as the predicate returns 'False'
246-- (i.e. until it returns 'True'), and return the consumed input.
247--
248-- This parser does not fail.  It will return an empty string if the
249-- predicate returns 'True' on the first character of input.
250--
251-- /Note/: Because this parser does not fail, do not use it with
252-- combinators such as 'Control.Applicative.many', because such
253-- parsers loop until a failure occurs.  Careless use will thus result
254-- in an infinite loop.
255takeTill :: (Char -> Bool) -> Parser Text
256takeTill p = takeWhile (not . p)
257{-# INLINE takeTill #-}
258
259-- | Consume input as long as the predicate returns 'True', and return
260-- the consumed input.
261--
262-- This parser does not fail.  It will return an empty string if the
263-- predicate returns 'False' on the first character of input.
264--
265-- /Note/: Because this parser does not fail, do not use it with
266-- combinators such as 'Control.Applicative.many', because such
267-- parsers loop until a failure occurs.  Careless use will thus result
268-- in an infinite loop.
269takeWhile :: (Char -> Bool) -> Parser Text
270takeWhile p = do
271    h <- T.takeWhile p <$> get
272    continue <- inputSpansChunks (size h)
273    -- only use slow concat path if necessary
274    if continue
275      then takeWhileAcc p [h]
276      else return h
277{-# INLINE takeWhile #-}
278
279takeWhileAcc :: (Char -> Bool) -> [Text] -> Parser Text
280takeWhileAcc p = go
281 where
282  go acc = do
283    h <- T.takeWhile p <$> get
284    continue <- inputSpansChunks (size h)
285    if continue
286      then go (h:acc)
287      else return $ concatReverse (h:acc)
288{-# INLINE takeWhileAcc #-}
289
290takeRest :: Parser [Text]
291takeRest = go []
292 where
293  go acc = do
294    input <- wantInput
295    if input
296      then do
297        s <- get
298        advance (size s)
299        go (s:acc)
300      else return (reverse acc)
301
302-- | Consume all remaining input and return it as a single string.
303takeText :: Parser Text
304takeText = T.concat `fmap` takeRest
305
306-- | Consume all remaining input and return it as a single string.
307takeLazyText :: Parser L.Text
308takeLazyText = L.fromChunks `fmap` takeRest
309
310data Scan s = Continue s
311            | Finished s {-# UNPACK #-} !Int Text
312
313scan_ :: (s -> [Text] -> Parser r) -> s -> (s -> Char -> Maybe s) -> Parser r
314scan_ f s0 p = go [] s0
315 where
316  scanner s !n t =
317    case T.uncons t of
318      Just (c,t') -> case p s c of
319                       Just s' -> scanner s' (n+1) t'
320                       Nothing -> Finished s n t
321      Nothing     -> Continue s
322  go acc s = do
323    input <- get
324    case scanner s 0 input of
325      Continue s'  -> do continue <- inputSpansChunks (size input)
326                         if continue
327                           then go (input : acc) s'
328                           else f s' (input : acc)
329      Finished s' n t -> do advance (size input - size t)
330                            f s' (T.take n input : acc)
331{-# INLINE scan_ #-}
332
333-- | A stateful scanner.  The predicate consumes and transforms a
334-- state argument, and each transformed state is passed to successive
335-- invocations of the predicate on each character of the input until one
336-- returns 'Nothing' or the input ends.
337--
338-- This parser does not fail.  It will return an empty string if the
339-- predicate returns 'Nothing' on the first character of input.
340--
341-- /Note/: Because this parser does not fail, do not use it with
342-- combinators such as 'Control.Applicative.many', because such
343-- parsers loop until a failure occurs.  Careless use will thus result
344-- in an infinite loop.
345scan :: s -> (s -> Char -> Maybe s) -> Parser Text
346scan = scan_ $ \_ chunks -> return $! concatReverse chunks
347{-# INLINE scan #-}
348
349-- | Like 'scan', but generalized to return the final state of the
350-- scanner.
351runScanner :: s -> (s -> Char -> Maybe s) -> Parser (Text, s)
352runScanner = scan_ $ \s xs -> let !sx = concatReverse xs in return (sx, s)
353{-# INLINE runScanner #-}
354
355-- | Consume input as long as the predicate returns 'True', and return
356-- the consumed input.
357--
358-- This parser requires the predicate to succeed on at least one
359-- character of input: it will fail if the predicate never returns
360-- 'True' or if there is no input left.
361takeWhile1 :: (Char -> Bool) -> Parser Text
362takeWhile1 p = do
363  (`when` demandInput) =<< endOfChunk
364  h <- T.takeWhile p <$> get
365  let size' = size h
366  when (size' == 0) $ fail "takeWhile1"
367  advance size'
368  eoc <- endOfChunk
369  if eoc
370    then takeWhileAcc p [h]
371    else return h
372{-# INLINE takeWhile1 #-}
373
374-- | Match any character in a set.
375--
376-- >vowel = inClass "aeiou"
377--
378-- Range notation is supported.
379--
380-- >halfAlphabet = inClass "a-nA-N"
381--
382-- To add a literal @\'-\'@ to a set, place it at the beginning or end
383-- of the string.
384inClass :: String -> Char -> Bool
385inClass s = (`Set.member` mySet)
386    where mySet = Set.charClass s
387          {-# NOINLINE mySet #-}
388{-# INLINE inClass #-}
389
390-- | Match any character not in a set.
391notInClass :: String -> Char -> Bool
392notInClass s = not . inClass s
393{-# INLINE notInClass #-}
394
395-- | Match any character.
396anyChar :: Parser Char
397anyChar = satisfy $ const True
398{-# INLINE anyChar #-}
399
400-- | Match a specific character.
401char :: Char -> Parser Char
402char c = satisfy (== c) <?> show c
403{-# INLINE char #-}
404
405-- | Match any character except the given one.
406notChar :: Char -> Parser Char
407notChar c = satisfy (/= c) <?> "not " ++ show c
408{-# INLINE notChar #-}
409
410-- | Match any character, to perform lookahead. Returns 'Nothing' if
411-- end of input has been reached. Does not consume any input.
412--
413-- /Note/: Because this parser does not fail, do not use it with
414-- combinators such as 'Control.Applicative.many', because such
415-- parsers loop until a failure occurs.  Careless use will thus result
416-- in an infinite loop.
417peekChar :: Parser (Maybe Char)
418peekChar = T.Parser $ \t pos more _lose succ ->
419  case () of
420    _| pos < lengthOf t ->
421       let T.Iter !c _ = Buf.iter t (fromPos pos)
422       in succ t pos more (Just c)
423     | more == Complete ->
424       succ t pos more Nothing
425     | otherwise ->
426       let succ' t' pos' more' =
427             let T.Iter !c _ = Buf.iter t' (fromPos pos')
428             in succ t' pos' more' (Just c)
429           lose' t' pos' more' = succ t' pos' more' Nothing
430       in prompt t pos more lose' succ'
431{-# INLINE peekChar #-}
432
433-- | Match any character, to perform lookahead.  Does not consume any
434-- input, but will fail if end of input has been reached.
435peekChar' :: Parser Char
436peekChar' = do
437  (_,s) <- ensure 1
438  return $! T.unsafeHead s
439{-# INLINE peekChar' #-}
440
441-- | Match either a single newline character @\'\\n\'@, or a carriage
442-- return followed by a newline character @\"\\r\\n\"@.
443endOfLine :: Parser ()
444endOfLine = (char '\n' >> return ()) <|> (string "\r\n" >> return ())
445
446-- | Terminal failure continuation.
447failK :: Failure a
448failK t (Pos pos) _more stack msg = Fail (Buf.dropWord16 pos t) stack msg
449{-# INLINE failK #-}
450
451-- | Terminal success continuation.
452successK :: Success a a
453successK t (Pos pos) _more a = Done (Buf.dropWord16 pos t) a
454{-# INLINE successK #-}
455
456-- | Run a parser.
457parse :: Parser a -> Text -> Result a
458parse m s = runParser m (buffer s) 0 Incomplete failK successK
459{-# INLINE parse #-}
460
461-- | Run a parser that cannot be resupplied via a 'Partial' result.
462--
463-- This function does not force a parser to consume all of its input.
464-- Instead, any residual input will be discarded.  To force a parser
465-- to consume all of its input, use something like this:
466--
467-- @
468--'parseOnly' (myParser 'Control.Applicative.<*' 'endOfInput')
469-- @
470parseOnly :: Parser a -> Text -> Either String a
471parseOnly m s = case runParser m (buffer s) 0 Complete failK successK of
472                  Fail _ [] err   -> Left err
473                  Fail _ ctxs err -> Left (intercalate " > " ctxs ++ ": " ++ err)
474                  Done _ a        -> Right a
475                  _               -> error "parseOnly: impossible error!"
476{-# INLINE parseOnly #-}
477
478get :: Parser Text
479get = T.Parser $ \t pos more _lose succ ->
480  succ t pos more (Buf.dropWord16 (fromPos pos) t)
481{-# INLINE get #-}
482
483endOfChunk :: Parser Bool
484endOfChunk = T.Parser $ \t pos more _lose succ ->
485  succ t pos more (pos == lengthOf t)
486{-# INLINE endOfChunk #-}
487
488inputSpansChunks :: Pos -> Parser Bool
489inputSpansChunks i = T.Parser $ \t pos_ more _lose succ ->
490  let pos = pos_ + i
491  in if pos < lengthOf t || more == Complete
492     then succ t pos more False
493     else let lose' t' pos' more' = succ t' pos' more' False
494              succ' t' pos' more' = succ t' pos' more' True
495          in prompt t pos more lose' succ'
496{-# INLINE inputSpansChunks #-}
497
498advance :: Pos -> Parser ()
499advance n = T.Parser $ \t pos more _lose succ -> succ t (pos+n) more ()
500{-# INLINE advance #-}
501
502ensureSuspended :: Int -> Buffer -> Pos -> More
503                -> Failure r -> Success (Pos, Text) r
504                -> Result r
505ensureSuspended n t pos more lose succ =
506    runParser (demandInput >> go) t pos more lose succ
507  where go = T.Parser $ \t' pos' more' lose' succ' ->
508          case lengthAtLeast pos' n t' of
509            Just n' -> succ' t' pos' more' (n', substring pos n' t')
510            Nothing -> runParser (demandInput >> go) t' pos' more' lose' succ'
511
512-- | If at least @n@ elements of input are available, return the
513-- current input, otherwise fail.
514ensure :: Int -> Parser (Pos, Text)
515ensure n = T.Parser $ \t pos more lose succ ->
516    case lengthAtLeast pos n t of
517      Just n' -> succ t pos more (n', substring pos n' t)
518      -- The uncommon case is kept out-of-line to reduce code size:
519      Nothing -> ensureSuspended n t pos more lose succ
520{-# INLINE ensure #-}
521
522-- | Return both the result of a parse and the portion of the input
523-- that was consumed while it was being parsed.
524match :: Parser a -> Parser (Text, a)
525match p = T.Parser $ \t pos more lose succ ->
526  let succ' t' pos' more' a = succ t' pos' more'
527                              (substring pos (pos'-pos) t', a)
528  in runParser p t pos more lose succ'
529
530-- | Ensure that at least @n@ code points of input are available.
531-- Returns the number of words consumed while traversing.
532lengthAtLeast :: Pos -> Int -> Buffer -> Maybe Pos
533lengthAtLeast pos n t = go 0 (fromPos pos)
534  where go i !p
535          | i == n    = Just (Pos p - pos)
536          | p == len  = Nothing
537          | otherwise = go (i+1) (p + Buf.iter_ t p)
538        Pos len = lengthOf t
539{-# INLINE lengthAtLeast #-}
540
541substring :: Pos -> Pos -> Buffer -> Text
542substring (Pos pos) (Pos n) = Buf.substring pos n
543{-# INLINE substring #-}
544
545lengthOf :: Buffer -> Pos
546lengthOf = Pos . Buf.length
547
548size :: Text -> Pos
549size (Text _ _ l) = Pos l
550