1-- | 2-- Module : Text.Megaparsec 3-- Copyright : © 2015–present Megaparsec contributors 4-- © 2007 Paolo Martini 5-- © 1999–2001 Daan Leijen 6-- License : FreeBSD 7-- 8-- Maintainer : Mark Karpov <markkarpov92@gmail.com> 9-- Stability : experimental 10-- Portability : portable 11-- 12-- This module includes everything you need to get started writing a parser. 13-- If you are new to Megaparsec and don't know where to begin, take a look 14-- at the tutorials 15-- <https://markkarpov.com/learn-haskell.html#megaparsec-tutorials>. 16-- 17-- In addition to the "Text.Megaparsec" module, which exports and re-exports 18-- most everything that you may need, we advise to import 19-- "Text.Megaparsec.Char" if you plan to work with a stream of 'Char' tokens 20-- or "Text.Megaparsec.Byte" if you intend to parse binary data. 21-- 22-- It is common to start working with the library by defining a type synonym 23-- like this: 24-- 25-- > type Parser = Parsec Void Text 26-- > ^ ^ 27-- > | | 28-- > Custom error component Input stream type 29-- 30-- Then you can write type signatures like @Parser 'Int'@—for a parser that 31-- returns an 'Int' for example. 32-- 33-- Similarly (since it's known to cause confusion), you should use 34-- 'ParseErrorBundle' type parametrized like this: 35-- 36-- > ParseErrorBundle Text Void 37-- > ^ ^ 38-- > | | 39-- > Input stream type Custom error component (the same you used in Parser) 40-- 41-- Megaparsec uses some type-level machinery to provide flexibility without 42-- compromising on type safety. Thus type signatures are sometimes necessary 43-- to avoid ambiguous types. If you're seeing an error message that reads 44-- like “Type variable @e0@ is ambiguous …”, you need to give an explicit 45-- signature to your parser to resolve the ambiguity. It's a good idea to 46-- provide type signatures for all top-level definitions. 47 48{-# LANGUAGE FlexibleContexts #-} 49{-# LANGUAGE FlexibleInstances #-} 50{-# LANGUAGE MultiParamTypeClasses #-} 51{-# LANGUAGE RankNTypes #-} 52{-# LANGUAGE ScopedTypeVariables #-} 53{-# LANGUAGE TypeFamilies #-} 54{-# LANGUAGE UndecidableInstances #-} 55 56module Text.Megaparsec 57 ( -- * Re-exports 58 -- $reexports 59 module Text.Megaparsec.Pos 60 , module Text.Megaparsec.Error 61 , module Text.Megaparsec.Stream 62 , module Control.Monad.Combinators 63 -- * Data types 64 , State (..) 65 , PosState (..) 66 , Parsec 67 , ParsecT 68 -- * Running parser 69 , parse 70 , parseMaybe 71 , parseTest 72 , runParser 73 , runParser' 74 , runParserT 75 , runParserT' 76 -- * Primitive combinators 77 , MonadParsec (..) 78 -- * Signaling parse errors 79 -- $parse-errors 80 , failure 81 , fancyFailure 82 , unexpected 83 , customFailure 84 , region 85 , registerParseError 86 , registerFailure 87 , registerFancyFailure 88 -- * Derivatives of primitive combinators 89 , single 90 , satisfy 91 , anySingle 92 , anySingleBut 93 , oneOf 94 , noneOf 95 , chunk 96 , (<?>) 97 , match 98 , takeRest 99 , atEnd 100 -- * Parser state combinators 101 , getInput 102 , setInput 103 , getSourcePos 104 , getOffset 105 , setOffset 106 , setParserState ) 107where 108 109import Control.Monad.Combinators 110import Control.Monad.Identity 111import Data.List.NonEmpty (NonEmpty (..)) 112import Data.Maybe (fromJust) 113import Data.Set (Set) 114import Text.Megaparsec.Class 115import Text.Megaparsec.Error 116import Text.Megaparsec.Internal 117import Text.Megaparsec.Pos 118import Text.Megaparsec.State 119import Text.Megaparsec.Stream 120import qualified Data.List.NonEmpty as NE 121import qualified Data.Set as E 122 123-- $reexports 124-- 125-- Note that we re-export monadic combinators from 126-- "Control.Monad.Combinators" because these are more efficient than 127-- 'Applicative'-based ones. Thus 'many' and 'some' may clash with the 128-- functions from "Control.Applicative". You need to hide the functions like 129-- this: 130-- 131-- > import Control.Applicative hiding (many, some) 132-- 133-- Also note that you can import "Control.Monad.Combinators.NonEmpty" if you 134-- wish that combinators like 'some' return 'NonEmpty' lists. The module 135-- lives in the @parser-combinators@ package (you need at least version 136-- /0.4.0/). 137-- 138-- This module is intended to be imported qualified: 139-- 140-- > import qualified Control.Monad.Combinators.NonEmpty as NE 141-- 142-- Other modules of interest are: 143-- 144-- * "Control.Monad.Combinators.Expr" for parsing of expressions. 145-- * "Control.Applicative.Permutations" for parsing of permutations 146-- phrases. 147 148---------------------------------------------------------------------------- 149-- Data types 150 151-- | 'Parsec' is a non-transformer variant of the more general 'ParsecT' 152-- monad transformer. 153 154type Parsec e s = ParsecT e s Identity 155 156---------------------------------------------------------------------------- 157-- Running a parser 158 159-- | @'parse' p file input@ runs parser @p@ over 'Identity' (see 160-- 'runParserT' if you're using the 'ParsecT' monad transformer; 'parse' 161-- itself is just a synonym for 'runParser'). It returns either a 162-- 'ParseErrorBundle' ('Left') or a value of type @a@ ('Right'). 163-- 'errorBundlePretty' can be used to turn 'ParseErrorBundle' into the 164-- string representation of the error message. See "Text.Megaparsec.Error" 165-- if you need to do more advanced error analysis. 166-- 167-- > main = case parse numbers "" "11,2,43" of 168-- > Left bundle -> putStr (errorBundlePretty bundle) 169-- > Right xs -> print (sum xs) 170-- > 171-- > numbers = decimal `sepBy` char ',' 172 173parse 174 :: Parsec e s a -- ^ Parser to run 175 -> String -- ^ Name of source file 176 -> s -- ^ Input for parser 177 -> Either (ParseErrorBundle s e) a 178parse = runParser 179 180-- | @'parseMaybe' p input@ runs the parser @p@ on @input@ and returns the 181-- result inside 'Just' on success and 'Nothing' on failure. This function 182-- also parses 'eof', so if the parser doesn't consume all of its input, it 183-- will fail. 184-- 185-- The function is supposed to be useful for lightweight parsing, where 186-- error messages (and thus file names) are not important and entire input 187-- should be parsed. For example, it can be used when parsing of a single 188-- number according to a specification of its format is desired. 189 190parseMaybe :: (Ord e, Stream s) => Parsec e s a -> s -> Maybe a 191parseMaybe p s = 192 case parse (p <* eof) "" s of 193 Left _ -> Nothing 194 Right x -> Just x 195 196-- | The expression @'parseTest' p input@ applies the parser @p@ against the 197-- input @input@ and prints the result to stdout. Useful for testing. 198 199parseTest :: ( ShowErrorComponent e 200 , Show a 201 , Stream s 202 ) 203 => Parsec e s a -- ^ Parser to run 204 -> s -- ^ Input for parser 205 -> IO () 206parseTest p input = 207 case parse p "" input of 208 Left e -> putStr (errorBundlePretty e) 209 Right x -> print x 210 211-- | @'runParser' p file input@ runs parser @p@ on the input stream of 212-- tokens @input@, obtained from source @file@. The @file@ is only used in 213-- error messages and may be the empty string. Returns either a 214-- 'ParseErrorBundle' ('Left') or a value of type @a@ ('Right'). 215-- 216-- > parseFromFile p file = runParser p file <$> readFile file 217 218runParser 219 :: Parsec e s a -- ^ Parser to run 220 -> String -- ^ Name of source file 221 -> s -- ^ Input for parser 222 -> Either (ParseErrorBundle s e) a 223runParser p name s = snd $ runParser' p (initialState name s) 224 225-- | The function is similar to 'runParser' with the difference that it 226-- accepts and returns parser state. This allows to specify arbitrary 227-- textual position at the beginning of parsing, for example. This is the 228-- most general way to run a parser over the 'Identity' monad. 229-- 230-- @since 4.2.0 231 232runParser' 233 :: Parsec e s a -- ^ Parser to run 234 -> State s e -- ^ Initial state 235 -> (State s e, Either (ParseErrorBundle s e) a) 236runParser' p = runIdentity . runParserT' p 237 238-- | @'runParserT' p file input@ runs parser @p@ on the input list of tokens 239-- @input@, obtained from source @file@. The @file@ is only used in error 240-- messages and may be the empty string. Returns a computation in the 241-- underlying monad @m@ that returns either a 'ParseErrorBundle' ('Left') or 242-- a value of type @a@ ('Right'). 243 244runParserT :: Monad m 245 => ParsecT e s m a -- ^ Parser to run 246 -> String -- ^ Name of source file 247 -> s -- ^ Input for parser 248 -> m (Either (ParseErrorBundle s e) a) 249runParserT p name s = snd <$> runParserT' p (initialState name s) 250 251-- | This function is similar to 'runParserT', but like 'runParser'' it 252-- accepts and returns parser state. This is thus the most general way to 253-- run a parser. 254-- 255-- @since 4.2.0 256 257runParserT' :: Monad m 258 => ParsecT e s m a -- ^ Parser to run 259 -> State s e -- ^ Initial state 260 -> m (State s e, Either (ParseErrorBundle s e) a) 261runParserT' p s = do 262 (Reply s' _ result) <- runParsecT p s 263 let toBundle es = ParseErrorBundle 264 { bundleErrors = 265 NE.sortWith errorOffset es 266 , bundlePosState = statePosState s 267 } 268 return $ case result of 269 OK x -> 270 case NE.nonEmpty (stateParseErrors s') of 271 Nothing -> (s', Right x) 272 Just de -> (s', Left (toBundle de)) 273 Error e -> 274 (s', Left (toBundle (e :| stateParseErrors s'))) 275 276-- | Given name of source file and input construct initial state for parser. 277 278initialState :: String -> s -> State s e 279initialState name s = State 280 { stateInput = s 281 , stateOffset = 0 282 , statePosState = PosState 283 { pstateInput = s 284 , pstateOffset = 0 285 , pstateSourcePos = initialPos name 286 , pstateTabWidth = defaultTabWidth 287 , pstateLinePrefix = "" 288 } 289 , stateParseErrors = [] 290 } 291 292---------------------------------------------------------------------------- 293-- Signaling parse errors 294 295-- $parse-errors 296-- 297-- The most general function to fail and end parsing is 'parseError'. These 298-- are built on top of it. The section also includes functions starting with 299-- the @register@ prefix which allow users to register “delayed” 300-- 'ParseError's. 301 302-- | Stop parsing and report a trivial 'ParseError'. 303-- 304-- @since 6.0.0 305 306failure 307 :: MonadParsec e s m 308 => Maybe (ErrorItem (Token s)) -- ^ Unexpected item (if any) 309 -> Set (ErrorItem (Token s)) -- ^ Expected items 310 -> m a 311failure us ps = do 312 o <- getOffset 313 parseError (TrivialError o us ps) 314{-# INLINE failure #-} 315 316-- | Stop parsing and report a fancy 'ParseError'. To report a single custom 317-- parse error, see 'Text.Megaparsec.customFailure'. 318-- 319-- @since 6.0.0 320 321fancyFailure 322 :: MonadParsec e s m 323 => Set (ErrorFancy e) -- ^ Fancy error components 324 -> m a 325fancyFailure xs = do 326 o <- getOffset 327 parseError (FancyError o xs) 328{-# INLINE fancyFailure #-} 329 330-- | The parser @'unexpected' item@ fails with an error message telling 331-- about unexpected item @item@ without consuming any input. 332-- 333-- > unexpected item = failure (Just item) Set.empty 334 335unexpected :: MonadParsec e s m => ErrorItem (Token s) -> m a 336unexpected item = failure (Just item) E.empty 337{-# INLINE unexpected #-} 338 339-- | Report a custom parse error. For a more general version, see 340-- 'fancyFailure'. 341-- 342-- > customFailure = fancyFailure . Set.singleton . ErrorCustom 343-- 344-- @since 6.3.0 345 346customFailure :: MonadParsec e s m => e -> m a 347customFailure = fancyFailure . E.singleton . ErrorCustom 348{-# INLINE customFailure #-} 349 350-- | Specify how to process 'ParseError's that happen inside of this 351-- wrapper. This applies to both normal and delayed 'ParseError's. 352-- 353-- As a side-effect of the implementation the inner computation will start 354-- with empty collection of delayed errors and they will be updated and 355-- “restored” on the way out of 'region'. 356-- 357-- @since 5.3.0 358 359region :: MonadParsec e s m 360 => (ParseError s e -> ParseError s e) 361 -- ^ How to process 'ParseError's 362 -> m a -- ^ The “region” that the processing applies to 363 -> m a 364region f m = do 365 deSoFar <- stateParseErrors <$> getParserState 366 updateParserState $ \s -> 367 s { stateParseErrors = [] } 368 r <- observing m 369 updateParserState $ \s -> 370 s { stateParseErrors = (f <$> stateParseErrors s) ++ deSoFar } 371 case r of 372 Left err -> parseError (f err) 373 Right x -> return x 374{-# INLINEABLE region #-} 375 376-- | Register a 'ParseError' for later reporting. This action does not end 377-- parsing and has no effect except for adding the given 'ParseError' to the 378-- collection of “delayed” 'ParseError's which will be taken into 379-- consideration at the end of parsing. Only if this collection is empty 380-- parser will succeed. This is the main way to report several parse errors 381-- at once. 382-- 383-- @since 8.0.0 384 385registerParseError :: MonadParsec e s m => ParseError s e -> m () 386registerParseError e = updateParserState $ \s -> 387 s { stateParseErrors = e : stateParseErrors s } 388{-# INLINE registerParseError #-} 389 390-- | Like 'failure', but for delayed 'ParseError's. 391-- 392-- @since 8.0.0 393 394registerFailure 395 :: MonadParsec e s m 396 => Maybe (ErrorItem (Token s)) -- ^ Unexpected item (if any) 397 -> Set (ErrorItem (Token s)) -- ^ Expected items 398 -> m () 399registerFailure us ps = do 400 o <- getOffset 401 registerParseError (TrivialError o us ps) 402{-# INLINE registerFailure #-} 403 404-- | Like 'fancyFailure', but for delayed 'ParseError's. 405-- 406-- @since 8.0.0 407 408registerFancyFailure 409 :: MonadParsec e s m 410 => Set (ErrorFancy e) -- ^ Fancy error components 411 -> m () 412registerFancyFailure xs = do 413 o <- getOffset 414 registerParseError (FancyError o xs) 415{-# INLINE registerFancyFailure #-} 416 417---------------------------------------------------------------------------- 418-- Derivatives of primitive combinators 419 420-- | @'single' t@ only matches the single token @t@. 421-- 422-- > semicolon = single ';' 423-- 424-- See also: 'token', 'anySingle', 'Text.Megaparsec.Byte.char', 425-- 'Text.Megaparsec.Char.char'. 426-- 427-- @since 7.0.0 428 429single :: MonadParsec e s m 430 => Token s -- ^ Token to match 431 -> m (Token s) 432single t = token testToken expected 433 where 434 testToken x = if x == t then Just x else Nothing 435 expected = E.singleton (Tokens (t:|[])) 436{-# INLINE single #-} 437 438-- | The parser @'satisfy' f@ succeeds for any token for which the supplied 439-- function @f@ returns 'True'. 440-- 441-- > digitChar = satisfy isDigit <?> "digit" 442-- > oneOf cs = satisfy (`elem` cs) 443-- 444-- See also: 'anySingle', 'anySingleBut', 'oneOf', 'noneOf'. 445-- 446-- @since 7.0.0 447 448satisfy :: MonadParsec e s m 449 => (Token s -> Bool) -- ^ Predicate to apply 450 -> m (Token s) 451satisfy f = token testChar E.empty 452 where 453 testChar x = if f x then Just x else Nothing 454{-# INLINE satisfy #-} 455 456-- | Parse and return a single token. It's a good idea to attach a 'label' 457-- to this parser. 458-- 459-- > anySingle = satisfy (const True) 460-- 461-- See also: 'satisfy', 'anySingleBut'. 462-- 463-- @since 7.0.0 464 465anySingle :: MonadParsec e s m => m (Token s) 466anySingle = satisfy (const True) 467{-# INLINE anySingle #-} 468 469-- | Match any token but the given one. It's a good idea to attach a 'label' 470-- to this parser. 471-- 472-- > anySingleBut t = satisfy (/= t) 473-- 474-- See also: 'single', 'anySingle', 'satisfy'. 475-- 476-- @since 7.0.0 477 478anySingleBut :: MonadParsec e s m 479 => Token s -- ^ Token we should not match 480 -> m (Token s) 481anySingleBut t = satisfy (/= t) 482{-# INLINE anySingleBut #-} 483 484-- | @'oneOf' ts@ succeeds if the current token is in the supplied 485-- collection of tokens @ts@. Returns the parsed token. Note that this 486-- parser cannot automatically generate the “expected” component of error 487-- message, so usually you should label it manually with 'label' or ('<?>'). 488-- 489-- > oneOf cs = satisfy (`elem` cs) 490-- 491-- See also: 'satisfy'. 492-- 493-- > digit = oneOf ['0'..'9'] <?> "digit" 494-- 495-- __Performance note__: prefer 'satisfy' when you can because it's faster 496-- when you have only a couple of tokens to compare to: 497-- 498-- > quoteFast = satisfy (\x -> x == '\'' || x == '\"') 499-- > quoteSlow = oneOf "'\"" 500-- 501-- @since 7.0.0 502 503oneOf :: (Foldable f, MonadParsec e s m) 504 => f (Token s) -- ^ Collection of matching tokens 505 -> m (Token s) 506oneOf cs = satisfy (`elem` cs) 507{-# INLINE oneOf #-} 508 509-- | As the dual of 'oneOf', @'noneOf' ts@ succeeds if the current token 510-- /not/ in the supplied list of tokens @ts@. Returns the parsed character. 511-- Note that this parser cannot automatically generate the “expected” 512-- component of error message, so usually you should label it manually with 513-- 'label' or ('<?>'). 514-- 515-- > noneOf cs = satisfy (`notElem` cs) 516-- 517-- See also: 'satisfy'. 518-- 519-- __Performance note__: prefer 'satisfy' and 'anySingleBut' when you can 520-- because it's faster. 521-- 522-- @since 7.0.0 523 524noneOf :: (Foldable f, MonadParsec e s m) 525 => f (Token s) -- ^ Collection of taken we should not match 526 -> m (Token s) 527noneOf cs = satisfy (`notElem` cs) 528{-# INLINE noneOf #-} 529 530-- | @'chunk' chk@ only matches the chunk @chk@. 531-- 532-- > divOrMod = chunk "div" <|> chunk "mod" 533-- 534-- See also: 'tokens', 'Text.Megaparsec.Char.string', 535-- 'Text.Megaparsec.Byte.string'. 536-- 537-- @since 7.0.0 538 539chunk :: MonadParsec e s m 540 => Tokens s -- ^ Chunk to match 541 -> m (Tokens s) 542chunk = tokens (==) 543{-# INLINE chunk #-} 544 545-- | A synonym for 'label' in the form of an operator. 546 547infix 0 <?> 548 549(<?>) :: MonadParsec e s m => m a -> String -> m a 550(<?>) = flip label 551{-# INLINE (<?>) #-} 552 553-- | Return both the result of a parse and a chunk of input that was 554-- consumed during parsing. This relies on the change of the 'stateOffset' 555-- value to evaluate how many tokens were consumed. If you mess with it 556-- manually in the argument parser, prepare for troubles. 557-- 558-- @since 5.3.0 559 560match :: MonadParsec e s m => m a -> m (Tokens s, a) 561match p = do 562 o <- getOffset 563 s <- getInput 564 r <- p 565 o' <- getOffset 566 -- NOTE The 'fromJust' call here should never fail because if the stream 567 -- is empty before 'p' (the only case when 'takeN_' can return 'Nothing' 568 -- as per its invariants), (tp' - tp) won't be greater than 0, and in that 569 -- case 'Just' is guaranteed to be returned as per another invariant of 570 -- 'takeN_'. 571 return ((fst . fromJust) (takeN_ (o' - o) s), r) 572{-# INLINEABLE match #-} 573 574-- | Consume the rest of the input and return it as a chunk. This parser 575-- never fails, but may return the empty chunk. 576-- 577-- > takeRest = takeWhileP Nothing (const True) 578-- 579-- @since 6.0.0 580 581takeRest :: MonadParsec e s m => m (Tokens s) 582takeRest = takeWhileP Nothing (const True) 583{-# INLINE takeRest #-} 584 585-- | Return 'True' when end of input has been reached. 586-- 587-- > atEnd = option False (True <$ hidden eof) 588-- 589-- @since 6.0.0 590 591atEnd :: MonadParsec e s m => m Bool 592atEnd = option False (True <$ hidden eof) 593{-# INLINE atEnd #-} 594 595---------------------------------------------------------------------------- 596-- Parser state combinators 597 598-- | Return the current input. 599 600getInput :: MonadParsec e s m => m s 601getInput = stateInput <$> getParserState 602{-# INLINE getInput #-} 603 604-- | @'setInput' input@ continues parsing with @input@. 605 606setInput :: MonadParsec e s m => s -> m () 607setInput s = updateParserState (\(State _ o pst de) -> State s o pst de) 608{-# INLINE setInput #-} 609 610-- | Return the current source position. This function /is not cheap/, do 611-- not call it e.g. on matching of every token, that's a bad idea. Still you 612-- can use it to get 'SourcePos' to attach to things that you parse. 613-- 614-- The function works under the assumption that we move in the input stream 615-- only forwards and never backwards, which is always true unless the user 616-- abuses the library. 617-- 618-- @since 7.0.0 619 620getSourcePos :: MonadParsec e s m => m SourcePos 621getSourcePos = do 622 st <- getParserState 623 let pst = reachOffsetNoLine (stateOffset st) (statePosState st) 624 setParserState st { statePosState = pst } 625 return (pstateSourcePos pst) 626{-# INLINE getSourcePos #-} 627 628-- | Get the number of tokens processed so far. 629-- 630-- See also: 'setOffset'. 631-- 632-- @since 7.0.0 633 634getOffset :: MonadParsec e s m => m Int 635getOffset = stateOffset <$> getParserState 636{-# INLINE getOffset #-} 637 638-- | Set the number of tokens processed so far. 639-- 640-- See also: 'getOffset'. 641-- 642-- @since 7.0.0 643 644setOffset :: MonadParsec e s m => Int -> m () 645setOffset o = updateParserState $ \(State s _ pst de) -> 646 State s o pst de 647{-# INLINE setOffset #-} 648 649-- | @'setParserState' st@ sets the parser state to @st@. 650-- 651-- See also: 'getParserState', 'updateParserState'. 652 653setParserState :: MonadParsec e s m => State s e -> m () 654setParserState st = updateParserState (const st) 655{-# INLINE setParserState #-} 656