1{-# LANGUAGE BangPatterns, FlexibleInstances, GADTs, OverloadedStrings, 2 Rank2Types, RecordWildCards, TypeFamilies, TypeSynonymInstances #-} 3{-# OPTIONS_GHC -fno-warn-orphans #-} 4-- | 5-- Module : Data.Attoparsec.Text.Internal 6-- Copyright : Bryan O'Sullivan 2007-2015 7-- License : BSD3 8-- 9-- Maintainer : bos@serpentine.com 10-- Stability : experimental 11-- Portability : unknown 12-- 13-- Simple, efficient parser combinators for 'Text' strings, loosely 14-- based on the Parsec library. 15 16module Data.Attoparsec.Text.Internal 17 ( 18 -- * Parser types 19 Parser 20 , Result 21 22 -- * Running parsers 23 , parse 24 , parseOnly 25 26 -- * Combinators 27 , module Data.Attoparsec.Combinator 28 29 -- * Parsing individual characters 30 , satisfy 31 , satisfyWith 32 , anyChar 33 , skip 34 , char 35 , notChar 36 37 -- ** Lookahead 38 , peekChar 39 , peekChar' 40 41 -- ** Character classes 42 , inClass 43 , notInClass 44 45 -- * Efficient string handling 46 , skipWhile 47 , string 48 , stringCI 49 , asciiCI 50 , take 51 , scan 52 , runScanner 53 , takeWhile 54 , takeWhile1 55 , takeTill 56 57 -- ** Consume all remaining input 58 , takeText 59 , takeLazyText 60 61 -- * Utilities 62 , endOfLine 63 , endOfInput 64 , match 65 , atEnd 66 ) where 67 68import Control.Applicative ((<|>), (<$>), pure, (*>)) 69import Control.Monad (when) 70import Data.Attoparsec.Combinator ((<?>)) 71import Data.Attoparsec.Internal 72import Data.Attoparsec.Internal.Types hiding (Parser, Failure, Success) 73import qualified Data.Attoparsec.Text.Buffer as Buf 74import Data.Attoparsec.Text.Buffer (Buffer, buffer) 75import Data.Char (isAsciiUpper, isAsciiLower, toUpper, toLower) 76import Data.List (intercalate) 77import Data.String (IsString(..)) 78import Data.Text.Internal (Text(..)) 79import Prelude hiding (getChar, succ, take, takeWhile) 80import qualified Data.Attoparsec.Internal.Types as T 81import qualified Data.Attoparsec.Text.FastSet as Set 82import qualified Data.Text as T 83import qualified Data.Text.Lazy as L 84import qualified Data.Text.Unsafe as T 85 86type Parser = T.Parser Text 87type Result = IResult Text 88type Failure r = T.Failure Text Buffer r 89type Success a r = T.Success Text Buffer a r 90 91instance (a ~ Text) => IsString (Parser a) where 92 fromString = string . T.pack 93 94-- | The parser @satisfy p@ succeeds for any character for which the 95-- predicate @p@ returns 'True'. Returns the character that is 96-- actually parsed. 97-- 98-- >digit = satisfy isDigit 99-- > where isDigit c = c >= '0' && c <= '9' 100satisfy :: (Char -> Bool) -> Parser Char 101satisfy p = do 102 (k,c) <- ensure 1 103 let !h = T.unsafeHead c 104 if p h 105 then advance k >> return h 106 else fail "satisfy" 107{-# INLINE satisfy #-} 108 109-- | The parser @skip p@ succeeds for any character for which the 110-- predicate @p@ returns 'True'. 111-- 112-- >skipDigit = skip isDigit 113-- > where isDigit c = c >= '0' && c <= '9' 114skip :: (Char -> Bool) -> Parser () 115skip p = do 116 (k,s) <- ensure 1 117 if p (T.unsafeHead s) 118 then advance k 119 else fail "skip" 120 121-- | The parser @satisfyWith f p@ transforms a character, and succeeds 122-- if the predicate @p@ returns 'True' on the transformed value. The 123-- parser returns the transformed character that was parsed. 124satisfyWith :: (Char -> a) -> (a -> Bool) -> Parser a 125satisfyWith f p = do 126 (k,s) <- ensure 1 127 let c = f $! T.unsafeHead s 128 if p c 129 then advance k >> return c 130 else fail "satisfyWith" 131{-# INLINE satisfyWith #-} 132 133-- | Consume @n@ characters of input, but succeed only if the 134-- predicate returns 'True'. 135takeWith :: Int -> (Text -> Bool) -> Parser Text 136takeWith n p = do 137 (k,s) <- ensure n 138 if p s 139 then advance k >> return s 140 else fail "takeWith" 141 142-- | Consume exactly @n@ characters of input. 143take :: Int -> Parser Text 144take n = takeWith (max n 0) (const True) 145{-# INLINE take #-} 146 147-- | @string s@ parses a sequence of characters that identically match 148-- @s@. Returns the parsed string (i.e. @s@). This parser consumes no 149-- input if it fails (even if a partial match). 150-- 151-- /Note/: The behaviour of this parser is different to that of the 152-- similarly-named parser in Parsec, as this one is all-or-nothing. 153-- To illustrate the difference, the following parser will fail under 154-- Parsec given an input of @\"for\"@: 155-- 156-- >string "foo" <|> string "for" 157-- 158-- The reason for its failure is that the first branch is a 159-- partial match, and will consume the letters @\'f\'@ and @\'o\'@ 160-- before failing. In attoparsec, the above parser will /succeed/ on 161-- that input, because the failed first branch will consume nothing. 162string :: Text -> Parser Text 163string s = string_ (stringSuspended id) id s 164{-# INLINE string #-} 165 166string_ :: (forall r. Text -> Text -> Buffer -> Pos -> More 167 -> Failure r -> Success Text r -> Result r) 168 -> (Text -> Text) 169 -> Text -> Parser Text 170string_ suspended f s0 = T.Parser $ \t pos more lose succ -> 171 let s = f s0 172 ft = f (Buf.unbufferAt (fromPos pos) t) 173 in case T.commonPrefixes s ft of 174 Nothing 175 | T.null s -> succ t pos more T.empty 176 | T.null ft -> suspended s s t pos more lose succ 177 | otherwise -> lose t pos more [] "string" 178 Just (pfx,ssfx,tsfx) 179 | T.null ssfx -> let l = Pos (T.lengthWord16 pfx) 180 in succ t (pos + l) more (substring pos l t) 181 | not (T.null tsfx) -> lose t pos more [] "string" 182 | otherwise -> suspended s ssfx t pos more lose succ 183{-# INLINE string_ #-} 184 185stringSuspended :: (Text -> Text) 186 -> Text -> Text -> Buffer -> Pos -> More 187 -> Failure r 188 -> Success Text r 189 -> Result r 190stringSuspended f s000 s0 t0 pos0 more0 lose0 succ0 = 191 runParser (demandInput_ >>= go) t0 pos0 more0 lose0 succ0 192 where 193 go s' = T.Parser $ \t pos more lose succ -> 194 let s = f s' 195 in case T.commonPrefixes s0 s of 196 Nothing -> lose t pos more [] "string" 197 Just (_pfx,ssfx,tsfx) 198 | T.null ssfx -> let l = Pos (T.lengthWord16 s000) 199 in succ t (pos + l) more (substring pos l t) 200 | T.null tsfx -> stringSuspended f s000 ssfx t pos more lose succ 201 | otherwise -> lose t pos more [] "string" 202 203-- | Satisfy a literal string, ignoring case. 204-- 205-- Note: this function is currently quite inefficient. Unicode case 206-- folding can change the length of a string (\"ß\" becomes 207-- "ss"), which makes a simple, efficient implementation tricky. We 208-- have (for now) chosen simplicity over efficiency. 209stringCI :: Text -> Parser Text 210stringCI s = go 0 211 where 212 go !n 213 | n > T.length fs = fail "stringCI" 214 | otherwise = do 215 (k,t) <- ensure n 216 if T.toCaseFold t == fs 217 then advance k >> return t 218 else go (n+1) 219 fs = T.toCaseFold s 220{-# INLINE stringCI #-} 221{-# DEPRECATED stringCI "this is very inefficient, use asciiCI instead" #-} 222 223-- | Satisfy a literal string, ignoring case for characters in the ASCII range. 224asciiCI :: Text -> Parser Text 225asciiCI s = fmap fst $ match $ T.foldr ((*>) . asciiCharCI) (pure ()) s 226{-# INLINE asciiCI #-} 227 228asciiCharCI :: Char -> Parser Char 229asciiCharCI c 230 | isAsciiUpper c = char c <|> char (toLower c) 231 | isAsciiLower c = char c <|> char (toUpper c) 232 | otherwise = char c 233{-# INLINE asciiCharCI #-} 234 235-- | Skip past input for as long as the predicate returns 'True'. 236skipWhile :: (Char -> Bool) -> Parser () 237skipWhile p = go 238 where 239 go = do 240 t <- T.takeWhile p <$> get 241 continue <- inputSpansChunks (size t) 242 when continue go 243{-# INLINE skipWhile #-} 244 245-- | Consume input as long as the predicate returns 'False' 246-- (i.e. until it returns 'True'), and return the consumed input. 247-- 248-- This parser does not fail. It will return an empty string if the 249-- predicate returns 'True' on the first character of input. 250-- 251-- /Note/: Because this parser does not fail, do not use it with 252-- combinators such as 'Control.Applicative.many', because such 253-- parsers loop until a failure occurs. Careless use will thus result 254-- in an infinite loop. 255takeTill :: (Char -> Bool) -> Parser Text 256takeTill p = takeWhile (not . p) 257{-# INLINE takeTill #-} 258 259-- | Consume input as long as the predicate returns 'True', and return 260-- the consumed input. 261-- 262-- This parser does not fail. It will return an empty string if the 263-- predicate returns 'False' on the first character of input. 264-- 265-- /Note/: Because this parser does not fail, do not use it with 266-- combinators such as 'Control.Applicative.many', because such 267-- parsers loop until a failure occurs. Careless use will thus result 268-- in an infinite loop. 269takeWhile :: (Char -> Bool) -> Parser Text 270takeWhile p = do 271 h <- T.takeWhile p <$> get 272 continue <- inputSpansChunks (size h) 273 -- only use slow concat path if necessary 274 if continue 275 then takeWhileAcc p [h] 276 else return h 277{-# INLINE takeWhile #-} 278 279takeWhileAcc :: (Char -> Bool) -> [Text] -> Parser Text 280takeWhileAcc p = go 281 where 282 go acc = do 283 h <- T.takeWhile p <$> get 284 continue <- inputSpansChunks (size h) 285 if continue 286 then go (h:acc) 287 else return $ concatReverse (h:acc) 288{-# INLINE takeWhileAcc #-} 289 290takeRest :: Parser [Text] 291takeRest = go [] 292 where 293 go acc = do 294 input <- wantInput 295 if input 296 then do 297 s <- get 298 advance (size s) 299 go (s:acc) 300 else return (reverse acc) 301 302-- | Consume all remaining input and return it as a single string. 303takeText :: Parser Text 304takeText = T.concat `fmap` takeRest 305 306-- | Consume all remaining input and return it as a single string. 307takeLazyText :: Parser L.Text 308takeLazyText = L.fromChunks `fmap` takeRest 309 310data Scan s = Continue s 311 | Finished s {-# UNPACK #-} !Int Text 312 313scan_ :: (s -> [Text] -> Parser r) -> s -> (s -> Char -> Maybe s) -> Parser r 314scan_ f s0 p = go [] s0 315 where 316 scanner s !n t = 317 case T.uncons t of 318 Just (c,t') -> case p s c of 319 Just s' -> scanner s' (n+1) t' 320 Nothing -> Finished s n t 321 Nothing -> Continue s 322 go acc s = do 323 input <- get 324 case scanner s 0 input of 325 Continue s' -> do continue <- inputSpansChunks (size input) 326 if continue 327 then go (input : acc) s' 328 else f s' (input : acc) 329 Finished s' n t -> do advance (size input - size t) 330 f s' (T.take n input : acc) 331{-# INLINE scan_ #-} 332 333-- | A stateful scanner. The predicate consumes and transforms a 334-- state argument, and each transformed state is passed to successive 335-- invocations of the predicate on each character of the input until one 336-- returns 'Nothing' or the input ends. 337-- 338-- This parser does not fail. It will return an empty string if the 339-- predicate returns 'Nothing' on the first character of input. 340-- 341-- /Note/: Because this parser does not fail, do not use it with 342-- combinators such as 'Control.Applicative.many', because such 343-- parsers loop until a failure occurs. Careless use will thus result 344-- in an infinite loop. 345scan :: s -> (s -> Char -> Maybe s) -> Parser Text 346scan = scan_ $ \_ chunks -> return $! concatReverse chunks 347{-# INLINE scan #-} 348 349-- | Like 'scan', but generalized to return the final state of the 350-- scanner. 351runScanner :: s -> (s -> Char -> Maybe s) -> Parser (Text, s) 352runScanner = scan_ $ \s xs -> let !sx = concatReverse xs in return (sx, s) 353{-# INLINE runScanner #-} 354 355-- | Consume input as long as the predicate returns 'True', and return 356-- the consumed input. 357-- 358-- This parser requires the predicate to succeed on at least one 359-- character of input: it will fail if the predicate never returns 360-- 'True' or if there is no input left. 361takeWhile1 :: (Char -> Bool) -> Parser Text 362takeWhile1 p = do 363 (`when` demandInput) =<< endOfChunk 364 h <- T.takeWhile p <$> get 365 let size' = size h 366 when (size' == 0) $ fail "takeWhile1" 367 advance size' 368 eoc <- endOfChunk 369 if eoc 370 then takeWhileAcc p [h] 371 else return h 372{-# INLINE takeWhile1 #-} 373 374-- | Match any character in a set. 375-- 376-- >vowel = inClass "aeiou" 377-- 378-- Range notation is supported. 379-- 380-- >halfAlphabet = inClass "a-nA-N" 381-- 382-- To add a literal @\'-\'@ to a set, place it at the beginning or end 383-- of the string. 384inClass :: String -> Char -> Bool 385inClass s = (`Set.member` mySet) 386 where mySet = Set.charClass s 387 {-# NOINLINE mySet #-} 388{-# INLINE inClass #-} 389 390-- | Match any character not in a set. 391notInClass :: String -> Char -> Bool 392notInClass s = not . inClass s 393{-# INLINE notInClass #-} 394 395-- | Match any character. 396anyChar :: Parser Char 397anyChar = satisfy $ const True 398{-# INLINE anyChar #-} 399 400-- | Match a specific character. 401char :: Char -> Parser Char 402char c = satisfy (== c) <?> show c 403{-# INLINE char #-} 404 405-- | Match any character except the given one. 406notChar :: Char -> Parser Char 407notChar c = satisfy (/= c) <?> "not " ++ show c 408{-# INLINE notChar #-} 409 410-- | Match any character, to perform lookahead. Returns 'Nothing' if 411-- end of input has been reached. Does not consume any input. 412-- 413-- /Note/: Because this parser does not fail, do not use it with 414-- combinators such as 'Control.Applicative.many', because such 415-- parsers loop until a failure occurs. Careless use will thus result 416-- in an infinite loop. 417peekChar :: Parser (Maybe Char) 418peekChar = T.Parser $ \t pos more _lose succ -> 419 case () of 420 _| pos < lengthOf t -> 421 let T.Iter !c _ = Buf.iter t (fromPos pos) 422 in succ t pos more (Just c) 423 | more == Complete -> 424 succ t pos more Nothing 425 | otherwise -> 426 let succ' t' pos' more' = 427 let T.Iter !c _ = Buf.iter t' (fromPos pos') 428 in succ t' pos' more' (Just c) 429 lose' t' pos' more' = succ t' pos' more' Nothing 430 in prompt t pos more lose' succ' 431{-# INLINE peekChar #-} 432 433-- | Match any character, to perform lookahead. Does not consume any 434-- input, but will fail if end of input has been reached. 435peekChar' :: Parser Char 436peekChar' = do 437 (_,s) <- ensure 1 438 return $! T.unsafeHead s 439{-# INLINE peekChar' #-} 440 441-- | Match either a single newline character @\'\\n\'@, or a carriage 442-- return followed by a newline character @\"\\r\\n\"@. 443endOfLine :: Parser () 444endOfLine = (char '\n' >> return ()) <|> (string "\r\n" >> return ()) 445 446-- | Terminal failure continuation. 447failK :: Failure a 448failK t (Pos pos) _more stack msg = Fail (Buf.dropWord16 pos t) stack msg 449{-# INLINE failK #-} 450 451-- | Terminal success continuation. 452successK :: Success a a 453successK t (Pos pos) _more a = Done (Buf.dropWord16 pos t) a 454{-# INLINE successK #-} 455 456-- | Run a parser. 457parse :: Parser a -> Text -> Result a 458parse m s = runParser m (buffer s) 0 Incomplete failK successK 459{-# INLINE parse #-} 460 461-- | Run a parser that cannot be resupplied via a 'Partial' result. 462-- 463-- This function does not force a parser to consume all of its input. 464-- Instead, any residual input will be discarded. To force a parser 465-- to consume all of its input, use something like this: 466-- 467-- @ 468--'parseOnly' (myParser 'Control.Applicative.<*' 'endOfInput') 469-- @ 470parseOnly :: Parser a -> Text -> Either String a 471parseOnly m s = case runParser m (buffer s) 0 Complete failK successK of 472 Fail _ [] err -> Left err 473 Fail _ ctxs err -> Left (intercalate " > " ctxs ++ ": " ++ err) 474 Done _ a -> Right a 475 _ -> error "parseOnly: impossible error!" 476{-# INLINE parseOnly #-} 477 478get :: Parser Text 479get = T.Parser $ \t pos more _lose succ -> 480 succ t pos more (Buf.dropWord16 (fromPos pos) t) 481{-# INLINE get #-} 482 483endOfChunk :: Parser Bool 484endOfChunk = T.Parser $ \t pos more _lose succ -> 485 succ t pos more (pos == lengthOf t) 486{-# INLINE endOfChunk #-} 487 488inputSpansChunks :: Pos -> Parser Bool 489inputSpansChunks i = T.Parser $ \t pos_ more _lose succ -> 490 let pos = pos_ + i 491 in if pos < lengthOf t || more == Complete 492 then succ t pos more False 493 else let lose' t' pos' more' = succ t' pos' more' False 494 succ' t' pos' more' = succ t' pos' more' True 495 in prompt t pos more lose' succ' 496{-# INLINE inputSpansChunks #-} 497 498advance :: Pos -> Parser () 499advance n = T.Parser $ \t pos more _lose succ -> succ t (pos+n) more () 500{-# INLINE advance #-} 501 502ensureSuspended :: Int -> Buffer -> Pos -> More 503 -> Failure r -> Success (Pos, Text) r 504 -> Result r 505ensureSuspended n t pos more lose succ = 506 runParser (demandInput >> go) t pos more lose succ 507 where go = T.Parser $ \t' pos' more' lose' succ' -> 508 case lengthAtLeast pos' n t' of 509 Just n' -> succ' t' pos' more' (n', substring pos n' t') 510 Nothing -> runParser (demandInput >> go) t' pos' more' lose' succ' 511 512-- | If at least @n@ elements of input are available, return the 513-- current input, otherwise fail. 514ensure :: Int -> Parser (Pos, Text) 515ensure n = T.Parser $ \t pos more lose succ -> 516 case lengthAtLeast pos n t of 517 Just n' -> succ t pos more (n', substring pos n' t) 518 -- The uncommon case is kept out-of-line to reduce code size: 519 Nothing -> ensureSuspended n t pos more lose succ 520{-# INLINE ensure #-} 521 522-- | Return both the result of a parse and the portion of the input 523-- that was consumed while it was being parsed. 524match :: Parser a -> Parser (Text, a) 525match p = T.Parser $ \t pos more lose succ -> 526 let succ' t' pos' more' a = succ t' pos' more' 527 (substring pos (pos'-pos) t', a) 528 in runParser p t pos more lose succ' 529 530-- | Ensure that at least @n@ code points of input are available. 531-- Returns the number of words consumed while traversing. 532lengthAtLeast :: Pos -> Int -> Buffer -> Maybe Pos 533lengthAtLeast pos n t = go 0 (fromPos pos) 534 where go i !p 535 | i == n = Just (Pos p - pos) 536 | p == len = Nothing 537 | otherwise = go (i+1) (p + Buf.iter_ t p) 538 Pos len = lengthOf t 539{-# INLINE lengthAtLeast #-} 540 541substring :: Pos -> Pos -> Buffer -> Text 542substring (Pos pos) (Pos n) = Buf.substring pos n 543{-# INLINE substring #-} 544 545lengthOf :: Buffer -> Pos 546lengthOf = Pos . Buf.length 547 548size :: Text -> Pos 549size (Text _ _ l) = Pos l 550