1{-# LANGUAGE BangPatterns #-} 2{-# LANGUAGE CPP #-} 3{-# LANGUAGE DeriveDataTypeable #-} 4{-# LANGUAGE DeriveFunctor #-} 5{-# LANGUAGE FlexibleContexts #-} 6{-# LANGUAGE FlexibleInstances #-} 7{-# LANGUAGE GeneralizedNewtypeDeriving #-} 8{-# LANGUAGE OverloadedStrings #-} 9{-# LANGUAGE PatternGuards #-} 10{-# LANGUAGE RankNTypes #-} 11{-# LANGUAGE StandaloneDeriving #-} 12{-# LANGUAGE TypeFamilies #-} 13-- | This module provides both a native Haskell solution for parsing XML 14-- documents into a stream of events, and a set of parser combinators for 15-- dealing with a stream of events. 16-- 17-- As a simple example: 18-- 19-- >>> :set -XOverloadedStrings 20-- >>> import Conduit (runConduit, (.|)) 21-- >>> import Data.Text (Text, unpack) 22-- >>> import Data.XML.Types (Event) 23-- >>> data Person = Person Int Text Text deriving Show 24-- >>> :{ 25-- let parsePerson :: MonadThrow m => ConduitT Event o m (Maybe Person) 26-- parsePerson = tag' "person" parseAttributes $ \(age, goodAtHaskell) -> do 27-- name <- content 28-- return $ Person (read $ unpack age) name goodAtHaskell 29-- where parseAttributes = (,) <$> requireAttr "age" <*> requireAttr "goodAtHaskell" <* ignoreAttrs 30-- parsePeople :: MonadThrow m => ConduitT Event o m (Maybe [Person]) 31-- parsePeople = tagNoAttr "people" $ many parsePerson 32-- inputXml = mconcat 33-- [ "<?xml version=\"1.0\" encoding=\"utf-8\"?>" 34-- , "<people>" 35-- , " <person age=\"25\" goodAtHaskell=\"yes\">Michael</person>" 36-- , " <person age=\"2\" goodAtHaskell=\"might become\">Eliezer</person>" 37-- , "</people>" 38-- ] 39-- :} 40-- 41-- >>> runConduit $ parseLBS def inputXml .| force "people required" parsePeople 42-- [Person 25 "Michael" "yes",Person 2 "Eliezer" "might become"] 43-- 44-- 45-- This module also supports streaming results using 'yield'. 46-- This allows parser results to be processed using conduits 47-- while a particular parser (e.g. 'many') is still running. 48-- Without using streaming results, you have to wait until the parser finished 49-- before you can process the result list. Large XML files might be easier 50-- to process by using streaming results. 51-- See http://stackoverflow.com/q/21367423/2597135 for a related discussion. 52-- 53-- >>> import Data.Conduit.List as CL 54-- >>> :{ 55-- let parsePeople' :: MonadThrow m => ConduitT Event Person m (Maybe ()) 56-- parsePeople' = tagNoAttr "people" $ manyYield parsePerson 57-- :} 58-- 59-- >>> runConduit $ parseLBS def inputXml .| force "people required" parsePeople' .| CL.mapM_ print 60-- Person 25 "Michael" "yes" 61-- Person 2 "Eliezer" "might become" 62-- 63-- Previous versions of this module contained a number of more sophisticated 64-- functions written by Aristid Breitkreuz and Dmitry Olshansky. To keep this 65-- package simpler, those functions are being moved to a separate package. This 66-- note will be updated with the name of the package(s) when available. 67module Text.XML.Stream.Parse 68 ( -- * Parsing XML files 69 parseBytes 70 , parseBytesPos 71 , parseText 72 , parseTextPos 73 , detectUtf 74 , parseFile 75 , parseLBS 76 -- ** Parser settings 77 , ParseSettings 78 , def 79 , DecodeEntities 80 , DecodeIllegalCharacters 81 , psDecodeEntities 82 , psDecodeIllegalCharacters 83 , psRetainNamespaces 84 -- *** Entity decoding 85 , decodeXmlEntities 86 , decodeHtmlEntities 87 -- * Event parsing 88 , tag 89 , tag' 90 , tagNoAttr 91 , tagIgnoreAttrs 92 , content 93 , contentMaybe 94 -- * Ignoring tags/trees 95 , ignoreEmptyTag 96 , ignoreTree 97 , ignoreContent 98 , ignoreTreeContent 99 , ignoreAnyTreeContent 100 -- * Streaming events 101 , takeContent 102 , takeTree 103 , takeTreeContent 104 , takeAnyTreeContent 105 -- * Tag name matching 106 , NameMatcher(..) 107 , matching 108 , anyOf 109 , anyName 110 -- * Attribute parsing 111 , AttrParser 112 , attr 113 , requireAttr 114 , optionalAttr 115 , requireAttrRaw 116 , optionalAttrRaw 117 , ignoreAttrs 118 -- * Combinators 119 , orE 120 , choose 121 , many 122 , many_ 123 , manyIgnore 124 , many' 125 , force 126 -- * Streaming combinators 127 , manyYield 128 , manyYield' 129 , manyIgnoreYield 130 -- * Exceptions 131 , XmlException (..) 132 -- * Other types 133 , PositionRange 134 , EventPos 135 ) where 136import Conduit 137import Control.Applicative (Alternative (empty, (<|>)), 138 Applicative (..), (<$>)) 139import qualified Control.Applicative as A 140import Control.Arrow ((***)) 141import Control.Exception (Exception (..), SomeException) 142import Control.Monad (ap, liftM, void) 143import Control.Monad.Fix (fix) 144import Control.Monad.IO.Class (liftIO) 145import Control.Monad.Trans.Class (lift) 146import Control.Monad.Trans.Maybe (MaybeT (..)) 147import Control.Monad.Trans.Resource (MonadResource, MonadThrow (..), 148 throwM) 149import Data.Attoparsec.Text (Parser, anyChar, char, manyTill, 150 skipWhile, string, takeWhile, 151 takeWhile1, try) 152import qualified Data.Attoparsec.Text as AT 153import qualified Data.ByteString as S 154import qualified Data.ByteString.Lazy as L 155import Data.Char (isSpace) 156import Data.Conduit.Attoparsec (PositionRange, conduitParser) 157import qualified Data.Conduit.Text as CT 158import Data.Default.Class (Default (..)) 159import Data.List (foldl', intercalate) 160import qualified Data.Map as Map 161import Data.Maybe (fromMaybe, isNothing) 162import Data.String (IsString (..)) 163import Data.Text (Text, pack) 164import qualified Data.Text as T 165import Data.Text.Encoding (decodeUtf8With) 166import Data.Text.Encoding.Error (lenientDecode) 167import Data.Typeable (Typeable) 168import Data.XML.Types (Content (..), Event (..), 169 ExternalID (..), 170 Instruction (..), Name (..)) 171import Prelude hiding (takeWhile) 172import Text.XML.Stream.Token 173 174type Ents = [(Text, Text)] 175 176tokenToEvent :: ParseSettings -> Ents -> [NSLevel] -> Token -> (Ents, [NSLevel], [Event]) 177tokenToEvent _ es n (TokenXMLDeclaration _) = (es, n, []) 178tokenToEvent _ es n (TokenInstruction i) = (es, n, [EventInstruction i]) 179tokenToEvent ps es n (TokenBeginElement name as isClosed _) = 180 (es, n', if isClosed then [begin, end] else [begin]) 181 where 182 l0 = case n of 183 [] -> NSLevel Nothing Map.empty 184 x:_ -> x 185 (as', l') = foldl' go (id, l0) as 186 go (front, l) (TName kpref kname, val) = 187 (addNS front, l'') 188 where 189 isPrefixed = kpref == Just "xmlns" 190 isUnprefixed = isNothing kpref && kname == "xmlns" 191 192 addNS 193 | not (psRetainNamespaces ps) && (isPrefixed || isUnprefixed) = id 194 | otherwise = (((tname, map resolve val):) .) 195 where 196 tname 197 | isPrefixed = TName Nothing ("xmlns:" `T.append` kname) 198 | otherwise = TName kpref kname 199 200 l'' 201 | isPrefixed = 202 l { prefixes = Map.insert kname (contentsToText val) 203 $ prefixes l } 204 | isUnprefixed = 205 l { defaultNS = if T.null $ contentsToText val 206 then Nothing 207 else Just $ contentsToText val } 208 | otherwise = l 209 210 resolve (ContentEntity e) 211 | Just t <- lookup e es = ContentText t 212 resolve c = c 213 n' = if isClosed then n else l' : n 214 fixAttName (name', val) = (tnameToName True l' name', val) 215 elementName = tnameToName False l' name 216 begin = EventBeginElement elementName $ map fixAttName $ as' [] 217 end = EventEndElement elementName 218tokenToEvent _ es n (TokenEndElement name) = 219 (es, n', [EventEndElement $ tnameToName False l name]) 220 where 221 (l, n') = 222 case n of 223 [] -> (NSLevel Nothing Map.empty, []) 224 x:xs -> (x, xs) 225tokenToEvent _ es n (TokenContent (ContentEntity e)) 226 | Just t <- lookup e es = (es, n, [EventContent $ ContentText t]) 227tokenToEvent _ es n (TokenContent c) = (es, n, [EventContent c]) 228tokenToEvent _ es n (TokenComment c) = (es, n, [EventComment c]) 229tokenToEvent _ es n (TokenDoctype t eid es') = (es ++ es', n, [EventBeginDoctype t eid, EventEndDoctype]) 230tokenToEvent _ es n (TokenCDATA t) = (es, n, [EventCDATA t]) 231 232tnameToName :: Bool -> NSLevel -> TName -> Name 233tnameToName _ _ (TName (Just "xml") name) = 234 Name name (Just "http://www.w3.org/XML/1998/namespace") (Just "xml") 235tnameToName isAttr (NSLevel def' _) (TName Nothing name) = 236 Name name (if isAttr then Nothing else def') Nothing 237tnameToName _ (NSLevel _ m) (TName (Just pref) name) = 238 case Map.lookup pref m of 239 Just ns -> Name name (Just ns) (Just pref) 240 Nothing -> Name name Nothing (Just pref) -- FIXME is this correct? 241 242-- | Automatically determine which UTF variant is being used. This function 243-- first checks for BOMs, removing them as necessary, and then check for the 244-- equivalent of <?xml for each of UTF-8, UTF-16LE/BE, and UTF-32LE/BE. It 245-- defaults to assuming UTF-8. 246detectUtf :: MonadThrow m => ConduitT S.ByteString T.Text m () 247detectUtf = 248 conduit id 249 where 250 conduit front = await >>= maybe (return ()) (push front) 251 252 push front bss = 253 either conduit 254 (uncurry checkXMLDecl) 255 (getEncoding front bss) 256 257 getEncoding front bs' 258 | S.length bs < 4 = 259 Left (bs `S.append`) 260 | otherwise = 261 Right (bsOut, mcodec) 262 where 263 bs = front bs' 264 bsOut = S.append (S.drop toDrop x) y 265 (x, y) = S.splitAt 4 bs 266 (toDrop, mcodec) = 267 case S.unpack x of 268 [0x00, 0x00, 0xFE, 0xFF] -> (4, Just CT.utf32_be) 269 [0xFF, 0xFE, 0x00, 0x00] -> (4, Just CT.utf32_le) 270 0xFE : 0xFF: _ -> (2, Just CT.utf16_be) 271 0xFF : 0xFE: _ -> (2, Just CT.utf16_le) 272 0xEF : 0xBB: 0xBF : _ -> (3, Just CT.utf8) 273 [0x00, 0x00, 0x00, 0x3C] -> (0, Just CT.utf32_be) 274 [0x3C, 0x00, 0x00, 0x00] -> (0, Just CT.utf32_le) 275 [0x00, 0x3C, 0x00, 0x3F] -> (0, Just CT.utf16_be) 276 [0x3C, 0x00, 0x3F, 0x00] -> (0, Just CT.utf16_le) 277 _ -> (0, Nothing) -- Assuming UTF-8 278 279checkXMLDecl :: MonadThrow m 280 => S.ByteString 281 -> Maybe CT.Codec 282 -> ConduitT S.ByteString T.Text m () 283checkXMLDecl bs (Just codec) = leftover bs >> CT.decode codec 284checkXMLDecl bs0 Nothing = 285 loop [] (AT.parse (parseToken def)) bs0 286 where 287 loop chunks0 parser nextChunk = 288 case parser $ decodeUtf8With lenientDecode nextChunk of 289 AT.Fail{} -> fallback 290 AT.Partial f -> await >>= maybe fallback (loop chunks f) 291 AT.Done _ (TokenXMLDeclaration attrs) -> findEncoding attrs 292 AT.Done{} -> fallback 293 where 294 chunks = nextChunk : chunks0 295 fallback = complete CT.utf8 296 complete codec = mapM_ leftover chunks >> CT.decode codec 297 298 findEncoding [] = fallback 299 findEncoding ((TName _ "encoding", [ContentText enc]):_) = 300 case T.toLower enc of 301 "iso-8859-1" -> complete CT.iso8859_1 302 "utf-8" -> complete CT.utf8 303 _ -> complete CT.utf8 304 findEncoding (_:xs) = findEncoding xs 305 306type EventPos = (Maybe PositionRange, Event) 307 308-- | Parses a byte stream into 'Event's. This function is implemented fully in 309-- Haskell using attoparsec-text for parsing. The produced error messages do 310-- not give line/column information, so you may prefer to stick with the parser 311-- provided by libxml-enumerator. However, this has the advantage of not 312-- relying on any C libraries. 313-- 314-- This relies on 'detectUtf' to determine character encoding, and 'parseText' 315-- to do the actual parsing. 316parseBytes :: MonadThrow m 317 => ParseSettings 318 -> ConduitT S.ByteString Event m () 319parseBytes = mapOutput snd . parseBytesPos 320 321parseBytesPos :: MonadThrow m 322 => ParseSettings 323 -> ConduitT S.ByteString EventPos m () 324parseBytesPos ps = detectUtf .| parseTextPos ps 325 326dropBOM :: Monad m => ConduitT T.Text T.Text m () 327dropBOM = 328 await >>= maybe (return ()) push 329 where 330 push t = 331 case T.uncons t of 332 Nothing -> dropBOM 333 Just (c, cs) -> 334 let output 335 | c == '\xfeef' = cs 336 | otherwise = t 337 in yield output >> idConduit 338 idConduit = await >>= maybe (return ()) (\x -> yield x >> idConduit) 339 340-- | Parses a character stream into 'Event's. This function is implemented 341-- fully in Haskell using attoparsec-text for parsing. The produced error 342-- messages do not give line/column information, so you may prefer to stick 343-- with the parser provided by libxml-enumerator. However, this has the 344-- advantage of not relying on any C libraries. 345-- 346-- Since 1.2.4 347parseText :: MonadThrow m => ParseSettings -> ConduitT T.Text Event m () 348parseText = mapOutput snd . parseTextPos 349 350 351-- | Same as 'parseText', but includes the position of each event. 352-- 353-- Since 1.2.4 354parseTextPos :: MonadThrow m 355 => ParseSettings 356 -> ConduitT T.Text EventPos m () 357parseTextPos de = 358 dropBOM 359 .| tokenize 360 .| toEventC de 361 .| addBeginEnd 362 where 363 tokenize = conduitToken de 364 addBeginEnd = yield (Nothing, EventBeginDocument) >> addEnd 365 addEnd = await >>= maybe 366 (yield (Nothing, EventEndDocument)) 367 (\e -> yield e >> addEnd) 368 369toEventC :: Monad m => ParseSettings -> ConduitT (PositionRange, Token) EventPos m () 370toEventC ps = 371 go [] [] 372 where 373 go !es !levels = 374 await >>= maybe (return ()) push 375 where 376 push (position, token) = 377 mapM_ (yield . (,) (Just position)) events >> go es' levels' 378 where 379 (es', levels', events) = tokenToEvent ps es levels token 380 381 382type DecodeEntities = Text -> Content 383type DecodeIllegalCharacters = Int -> Maybe Char 384 385data ParseSettings = ParseSettings 386 { psDecodeEntities :: DecodeEntities 387 , psRetainNamespaces :: Bool 388 -- ^ Whether the original xmlns attributes should be retained in the parsed 389 -- values. For more information on motivation, see: 390 -- 391 -- <https://github.com/snoyberg/xml/issues/38> 392 -- 393 -- Default: False 394 -- 395 -- Since 1.2.1 396 , psDecodeIllegalCharacters :: DecodeIllegalCharacters 397 -- ^ How to decode illegal character references (@&#[0-9]+;@ or @&#x[0-9a-fA-F]+;@). 398 -- 399 -- Character references within the legal ranges defined by <https://www.w3.org/TR/REC-xml/#NT-Char the standard> are automatically parsed. 400 -- Others are passed to this function. 401 -- 402 -- Default: @const Nothing@ 403 -- 404 -- Since 1.7.1 405 } 406 407instance Default ParseSettings where 408 def = ParseSettings 409 { psDecodeEntities = decodeXmlEntities 410 , psRetainNamespaces = False 411 , psDecodeIllegalCharacters = const Nothing 412 } 413 414conduitToken :: MonadThrow m => ParseSettings -> ConduitT T.Text (PositionRange, Token) m () 415conduitToken = conduitParser . parseToken 416 417parseToken :: ParseSettings -> Parser Token 418parseToken settings = (char '<' >> parseLt) <|> TokenContent <$> parseContent settings False False 419 where 420 parseLt = 421 (char '?' >> parseInstr) <|> 422 (char '!' >> (parseComment <|> parseCdata <|> parseDoctype)) <|> 423 parseBegin <|> 424 (char '/' >> parseEnd) 425 parseInstr = do 426 name <- parseIdent 427 if name == "xml" 428 then do 429 as <- A.many $ parseAttribute settings 430 skipSpace 431 char' '?' 432 char' '>' 433 newline <|> return () 434 return $ TokenXMLDeclaration as 435 else do 436 skipSpace 437 x <- T.pack <$> manyTill anyChar (try $ string "?>") 438 return $ TokenInstruction $ Instruction name x 439 parseComment = do 440 char' '-' 441 char' '-' 442 c <- T.pack <$> manyTill anyChar (string "-->") -- FIXME use takeWhile instead 443 return $ TokenComment c 444 parseCdata = do 445 _ <- string "[CDATA[" 446 t <- T.pack <$> manyTill anyChar (string "]]>") -- FIXME use takeWhile instead 447 return $ TokenCDATA t 448 parseDoctype = do 449 _ <- string "DOCTYPE" 450 skipSpace 451 name <- parseName 452 let i = 453 case name of 454 TName Nothing x -> x 455 TName (Just x) y -> T.concat [x, ":", y] 456 skipSpace 457 eid <- fmap Just parsePublicID <|> 458 fmap Just parseSystemID <|> 459 return Nothing 460 skipSpace 461 ents <- (do 462 char' '[' 463 ents <- parseEntities id 464 skipSpace 465 return ents) <|> return [] 466 char' '>' 467 newline <|> return () 468 return $ TokenDoctype i eid ents 469 parseEntities front = 470 (char ']' >> return (front [])) <|> 471 (parseEntity >>= \e -> parseEntities (front . (e:))) <|> 472 (char '<' >> parseEntities front) <|> 473 (skipWhile (\t -> t /= ']' && t /= '<') >> parseEntities front) 474 parseEntity = try $ do 475 _ <- string "<!ENTITY" 476 skipSpace 477 i <- parseIdent 478 t <- quotedText 479 skipSpace 480 char' '>' 481 return (i, t) 482 parsePublicID = do 483 _ <- string "PUBLIC" 484 x <- quotedText 485 y <- quotedText 486 return $ PublicID x y 487 parseSystemID = do 488 _ <- string "SYSTEM" 489 x <- quotedText 490 return $ SystemID x 491 quotedText = do 492 skipSpace 493 between '"' <|> between '\'' 494 between c = do 495 char' c 496 x <- takeWhile (/=c) 497 char' c 498 return x 499 parseEnd = do 500 skipSpace 501 n <- parseName 502 skipSpace 503 char' '>' 504 return $ TokenEndElement n 505 parseBegin = do 506 skipSpace 507 n <- parseName 508 as <- A.many $ parseAttribute settings 509 skipSpace 510 isClose <- (char '/' >> skipSpace >> return True) <|> return False 511 char' '>' 512 return $ TokenBeginElement n as isClose 0 513 514parseAttribute :: ParseSettings -> Parser TAttribute 515parseAttribute settings = do 516 skipSpace 517 key <- parseName 518 skipSpace 519 char' '=' 520 skipSpace 521 val <- squoted <|> dquoted 522 return (key, val) 523 where 524 squoted = char '\'' *> manyTill (parseContent settings False True) (char '\'') 525 dquoted = char '"' *> manyTill (parseContent settings True False) (char '"') 526 527parseName :: Parser TName 528parseName = 529 name <$> parseIdent <*> A.optional (char ':' >> parseIdent) 530 where 531 name i1 Nothing = TName Nothing i1 532 name i1 (Just i2) = TName (Just i1) i2 533 534parseIdent :: Parser Text 535parseIdent = 536 takeWhile1 valid 537 where 538 valid '&' = False 539 valid '<' = False 540 valid '>' = False 541 valid ':' = False 542 valid '?' = False 543 valid '=' = False 544 valid '"' = False 545 valid '\'' = False 546 valid '/' = False 547 valid ';' = False 548 valid '#' = False 549 valid c = not $ isXMLSpace c 550 551parseContent :: ParseSettings 552 -> Bool -- break on double quote 553 -> Bool -- break on single quote 554 -> Parser Content 555parseContent (ParseSettings decodeEntities _ decodeIllegalCharacters) breakDouble breakSingle = parseReference <|> parseTextContent where 556 parseReference = do 557 char' '&' 558 t <- parseEntityRef <|> parseHexCharRef <|> parseDecCharRef 559 char' ';' 560 return t 561 parseEntityRef = do 562 TName ma b <- parseName 563 let name = maybe "" (`T.append` ":") ma `T.append` b 564 return $ case name of 565 "lt" -> ContentText "<" 566 "gt" -> ContentText ">" 567 "amp" -> ContentText "&" 568 "quot" -> ContentText "\"" 569 "apos" -> ContentText "'" 570 _ -> decodeEntities name 571 parseHexCharRef = do 572 void $ string "#x" 573 n <- AT.hexadecimal 574 case toValidXmlChar n <|> decodeIllegalCharacters n of 575 Nothing -> fail "Invalid character from hexadecimal character reference." 576 Just c -> return $ ContentText $ T.singleton c 577 parseDecCharRef = do 578 void $ string "#" 579 n <- AT.decimal 580 case toValidXmlChar n <|> decodeIllegalCharacters n of 581 Nothing -> fail "Invalid character from decimal character reference." 582 Just c -> return $ ContentText $ T.singleton c 583 parseTextContent = ContentText <$> takeWhile1 valid 584 valid '"' = not breakDouble 585 valid '\'' = not breakSingle 586 valid '&' = False -- amp 587 valid '<' = False -- lt 588 valid _ = True 589 590-- | Is this codepoint a valid XML character? See 591-- <https://www.w3.org/TR/xml/#charsets>. This is proudly XML 1.0 only. 592toValidXmlChar :: Int -> Maybe Char 593toValidXmlChar n 594 | any checkRange ranges = Just (toEnum n) 595 | otherwise = Nothing 596 where 597 --Inclusive lower bound, inclusive upper bound. 598 ranges :: [(Int, Int)] 599 ranges = 600 [ (0x9, 0xA) 601 , (0xD, 0xD) 602 , (0x20, 0xD7FF) 603 , (0xE000, 0xFFFD) 604 , (0x10000, 0x10FFFF) 605 ] 606 checkRange (lb, ub) = lb <= n && n <= ub 607 608skipSpace :: Parser () 609skipSpace = skipWhile isXMLSpace 610 611-- | Determines whether a character is an XML white space. The list of 612-- white spaces is given by 613-- 614-- > S ::= (#x20 | #x9 | #xD | #xA)+ 615-- 616-- in <http://www.w3.org/TR/2008/REC-xml-20081126/#sec-common-syn>. 617isXMLSpace :: Char -> Bool 618isXMLSpace ' ' = True 619isXMLSpace '\t' = True 620isXMLSpace '\r' = True 621isXMLSpace '\n' = True 622isXMLSpace _ = False 623 624newline :: Parser () 625newline = void $ (char '\r' >> char '\n') <|> char '\n' 626 627char' :: Char -> Parser () 628char' = void . char 629 630data ContentType = Ignore | IsContent Text | IsError String | NotContent 631 632-- | Grabs the next piece of content if available. This function skips over any 633-- comments, instructions or entities, and concatenates all content until the next start 634-- or end tag. 635contentMaybe :: MonadThrow m => ConduitT Event o m (Maybe Text) 636contentMaybe = do 637 x <- peekC 638 case pc' x of 639 Ignore -> dropC 1 >> contentMaybe 640 IsContent t -> dropC 1 >> fmap Just (takeContents (t:)) 641 IsError e -> lift $ throwM $ InvalidEntity e x 642 NotContent -> return Nothing 643 where 644 pc' Nothing = NotContent 645 pc' (Just x) = pc x 646 pc (EventContent (ContentText t)) = IsContent t 647 pc (EventContent (ContentEntity e)) = IsError $ "Unknown entity: " ++ show e 648 pc (EventCDATA t) = IsContent t 649 pc EventBeginElement{} = NotContent 650 pc EventEndElement{} = NotContent 651 pc EventBeginDocument{} = Ignore 652 pc EventEndDocument = Ignore 653 pc EventBeginDoctype{} = Ignore 654 pc EventEndDoctype = Ignore 655 pc EventInstruction{} = Ignore 656 pc EventComment{} = Ignore 657 takeContents front = do 658 x <- peekC 659 case pc' x of 660 Ignore -> dropC 1 >> takeContents front 661 IsContent t -> dropC 1 >> takeContents (front . (:) t) 662 IsError e -> lift $ throwM $ InvalidEntity e x 663 NotContent -> return $ T.concat $ front [] 664 665-- | Grabs the next piece of content. If none if available, returns 'T.empty'. 666-- This is simply a wrapper around 'contentMaybe'. 667content :: MonadThrow m => ConduitT Event o m Text 668content = fromMaybe T.empty <$> contentMaybe 669 670 671isWhitespace :: Event -> Bool 672isWhitespace EventBeginDocument = True 673isWhitespace EventEndDocument = True 674isWhitespace EventBeginDoctype{} = True 675isWhitespace EventEndDoctype = True 676isWhitespace EventInstruction{} = True 677isWhitespace (EventContent (ContentText t)) = T.all isSpace t 678isWhitespace EventComment{} = True 679isWhitespace (EventCDATA t) = T.all isSpace t 680isWhitespace _ = False 681 682 683-- | The most generic way to parse a tag. It takes a 'NameMatcher' to check whether 684-- this is a correct tag name, an 'AttrParser' to handle attributes, and 685-- then a parser to deal with content. 686-- 687-- 'Events' are consumed if and only if the tag name and its attributes match. 688-- 689-- This function automatically absorbs its balancing closing tag, and will 690-- throw an exception if not all of the attributes or child elements are 691-- consumed. If you want to allow extra attributes, see 'ignoreAttrs'. 692-- 693-- This function automatically ignores comments, instructions and whitespace. 694tag :: MonadThrow m 695 => NameMatcher a -- ^ Check if this is a correct tag name 696 -- and return a value that can be used to get an @AttrParser@. 697 -- If this fails, the function will return @Nothing@ 698 -> (a -> AttrParser b) -- ^ Given the value returned by the name checker, this function will 699 -- be used to get an @AttrParser@ appropriate for the specific tag. 700 -- If the @AttrParser@ fails, the function will also return @Nothing@ 701 -> (b -> ConduitT Event o m c) -- ^ Handler function to handle the attributes and children 702 -- of a tag, given the value return from the @AttrParser@ 703 -> ConduitT Event o m (Maybe c) 704tag nameMatcher attrParser f = do 705 (x, leftovers) <- dropWS [] 706 res <- case x of 707 Just (EventBeginElement name as) -> case runNameMatcher nameMatcher name of 708 Just y -> case runAttrParser' (attrParser y) as of 709 Left _ -> return Nothing 710 Right z -> do 711 z' <- f z 712 (a, _leftovers') <- dropWS [] 713 case a of 714 Just (EventEndElement name') 715 | name == name' -> return (Just z') 716 _ -> lift $ throwM $ InvalidEndElement name a 717 Nothing -> return Nothing 718 _ -> return Nothing 719 720 case res of 721 -- Did not parse, put back all of the leading whitespace events and the 722 -- final observed event generated by dropWS 723 Nothing -> mapM_ leftover leftovers 724 -- Parse succeeded, discard all of those whitespace events and the 725 -- first parsed event 726 Just _ -> return () 727 728 return res 729 where 730 -- Drop Events until we encounter a non-whitespace element. Return all of 731 -- the events consumed here (including the first non-whitespace event) so 732 -- that the calling function can treat them as leftovers if the parse fails 733 dropWS leftovers = do 734 x <- await 735 let leftovers' = maybe id (:) x leftovers 736 737 case isWhitespace <$> x of 738 Just True -> dropWS leftovers' 739 _ -> return (x, leftovers') 740 runAttrParser' p as = 741 case runAttrParser p as of 742 Left e -> Left e 743 Right ([], x) -> Right x 744 Right (attr', _) -> Left $ toException $ UnparsedAttributes attr' 745 746-- | A simplified version of 'tag' where the 'NameMatcher' result isn't forwarded to the attributes parser. 747-- 748-- Since 1.5.0 749tag' :: MonadThrow m 750 => NameMatcher a -> AttrParser b -> (b -> ConduitT Event o m c) 751 -> ConduitT Event o m (Maybe c) 752tag' a b = tag a (const b) 753 754-- | A further simplified tag parser, which requires that no attributes exist. 755tagNoAttr :: MonadThrow m 756 => NameMatcher a -- ^ Check if this is a correct tag name 757 -> ConduitT Event o m b -- ^ Handler function to handle the children of the matched tag 758 -> ConduitT Event o m (Maybe b) 759tagNoAttr name f = tag' name (return ()) $ const f 760 761 762-- | A further simplified tag parser, which ignores all attributes, if any exist 763tagIgnoreAttrs :: MonadThrow m 764 => NameMatcher a -- ^ Check if this is a correct tag name 765 -> ConduitT Event o m b -- ^ Handler function to handle the children of the matched tag 766 -> ConduitT Event o m (Maybe b) 767tagIgnoreAttrs name f = tag' name ignoreAttrs $ const f 768 769 770-- | Ignore an empty tag and all of its attributes. 771-- This does not ignore the tag recursively 772-- (i.e. it assumes there are no child elements). 773-- This function returns @Just ()@ if the tag matched. 774-- 775-- Since 1.5.0 776ignoreEmptyTag :: MonadThrow m 777 => NameMatcher a -- ^ Check if this is a correct tag name 778 -> ConduitT Event o m (Maybe ()) 779ignoreEmptyTag nameMatcher = tagIgnoreAttrs nameMatcher (return ()) 780 781 782ignored :: Monad m => ConduitT i o m () 783ignored = fix $ \recurse -> do 784 event <- await 785 case event of 786 Just _ -> recurse 787 _ -> return () 788 789 790-- | Same as `takeTree`, without yielding `Event`s. 791-- 792-- >>> :set -XOverloadedStrings 793-- >>> import Conduit 794-- 795-- >>> runConduit $ parseLBS def "<a>content</a><b></b>" .| (ignoreTree "a" ignoreAttrs >> sinkList) 796-- [EventBeginElement (Name {nameLocalName = "b", ...}) [],EventEndElement (Name {nameLocalName = "b", ...}),EventEndDocument] 797-- 798-- >>> runConduit $ parseLBS def "<a>content</a>" .| (ignoreTree "b" ignoreAttrs >> sinkList) 799-- [EventBeginElement (Name {nameLocalName = "a", ...}) [],EventContent (ContentText "content"),EventEndElement (Name {nameLocalName = "a", ...}),EventEndDocument] 800-- 801-- >>> runConduit $ parseLBS def "content<a></a>" .| (ignoreTree anyName ignoreAttrs >> sinkList) 802-- [EventContent (ContentText "content"),EventBeginElement (Name {nameLocalName = "a", ...}) [],EventEndElement (Name {nameLocalName = "a", ...}),EventEndDocument] 803-- 804-- Since 1.9.0 805ignoreTree :: MonadThrow m => NameMatcher a -> AttrParser b -> ConduitT Event o m (Maybe ()) 806ignoreTree nameMatcher attrParser = fuseUpstream (takeTree nameMatcher attrParser) ignored 807 808-- | Same as `takeContent`, without yielding `Event`s. 809-- 810-- >>> :set -XOverloadedStrings 811-- >>> import Conduit 812-- 813-- >>> runConduit $ parseLBS def "<a>content</a>" .| (ignoreContent >> sinkList) 814-- [EventBeginElement (Name {nameLocalName = "a", ...}) [],EventContent (ContentText "content"),EventEndElement (Name {nameLocalName = "a", ...}),EventEndDocument] 815-- 816-- >>> runConduit $ parseLBS def "content<a></a>" .| (ignoreContent >> sinkList) 817-- [EventBeginElement (Name {nameLocalName = "a", ...}) [],EventEndElement (Name {nameLocalName = "a", ...}),EventEndDocument] 818-- 819-- >>> runConduit $ parseLBS def "content<a></a>" .| (ignoreContent >> sinkList) 820-- [EventBeginElement (Name {nameLocalName = "a", ...}) [],EventEndElement (Name {nameLocalName = "a", ...}),EventEndDocument] 821-- 822-- Since 1.9.0 823ignoreContent :: MonadThrow m => ConduitT Event o m (Maybe ()) 824ignoreContent = fuseUpstream takeContent ignored 825 826 827-- | Same as `takeTreeContent`, without yielding `Event`s. 828-- 829-- >>> :set -XOverloadedStrings 830-- >>> import Conduit 831-- 832-- >>> runConduit $ parseLBS def "<a>content</a><b></b>" .| (ignoreTreeContent "a" ignoreAttrs >> sinkList) 833-- [EventBeginElement (Name {nameLocalName = "b", ...}) [],EventEndElement (Name {nameLocalName = "b", ...}),EventEndDocument] 834-- 835-- >>> runConduit $ parseLBS def "<a>content</a>" .| (ignoreTreeContent "b" ignoreAttrs >> sinkList) 836-- [EventBeginElement (Name {nameLocalName = "a", ...}) [],EventContent (ContentText "content"),EventEndElement (Name {nameLocalName = "a", ...}),EventEndDocument] 837-- 838-- >>> runConduit $ parseLBS def "content<a></a>" .| (ignoreTreeContent anyName ignoreAttrs >> sinkList) 839-- [EventBeginElement (Name {nameLocalName = "a", ...}) [],EventEndElement (Name {nameLocalName = "a", ...}),EventEndDocument] 840-- 841-- Since 1.5.0 842ignoreTreeContent :: MonadThrow m => NameMatcher a -> AttrParser b -> ConduitT Event o m (Maybe ()) 843ignoreTreeContent namePred attrParser = fuseUpstream (takeTreeContent namePred attrParser) ignored 844 845 846-- | Same as `takeAnyTreeContent`, without yielding `Event`s. 847-- 848-- >>> :set -XOverloadedStrings 849-- >>> import Conduit 850-- 851-- >>> runConduit $ parseLBS def "<a>content</a><b></b>" .| (ignoreAnyTreeContent >> sinkList) 852-- [EventBeginElement (Name {nameLocalName = "b", ...}) [],EventEndElement (Name {nameLocalName = "b", ...}),EventEndDocument] 853-- 854-- >>> runConduit $ parseLBS def "text<b></b>" .| (ignoreAnyTreeContent >> sinkList) 855-- [EventBeginElement (Name {nameLocalName = "b", ...}) [],EventEndElement (Name {nameLocalName = "b", ...}),EventEndDocument] 856-- 857-- Since 1.5.0 858ignoreAnyTreeContent :: MonadThrow m => ConduitT Event o m (Maybe ()) 859ignoreAnyTreeContent = fuseUpstream takeAnyTreeContent ignored 860 861 862-- | Get the value of the first parser which returns 'Just'. If no parsers 863-- succeed (i.e., return @Just@), this function returns 'Nothing'. 864-- 865-- > orE a b = choose [a, b] 866orE :: Monad m 867 => ConduitT Event o m (Maybe a) -- ^ The first (preferred) parser 868 -> ConduitT Event o m (Maybe a) -- ^ The second parser, only executed if the first parser fails 869 -> ConduitT Event o m (Maybe a) 870orE a b = a >>= \x -> maybe b (const $ return x) x 871 872-- | Get the value of the first parser which returns 'Just'. If no parsers 873-- succeed (i.e., return 'Just'), this function returns 'Nothing'. 874choose :: Monad m 875 => [ConduitT Event o m (Maybe a)] -- ^ List of parsers that will be tried in order. 876 -> ConduitT Event o m (Maybe a) -- ^ Result of the first parser to succeed, or @Nothing@ 877 -- if no parser succeeded 878choose [] = return Nothing 879choose (i:is) = i >>= maybe (choose is) (return . Just) 880 881-- | Force an optional parser into a required parser. All of the 'tag' 882-- functions, 'attr', 'choose' and 'many' deal with 'Maybe' parsers. Use this when you 883-- want to finally force something to happen. 884force :: MonadThrow m 885 => String -- ^ Error message 886 -> m (Maybe a) -- ^ Optional parser to be forced 887 -> m a 888force msg i = i >>= maybe (throwM $ XmlException msg Nothing) return 889 890-- | A helper function which reads a file from disk using 'enumFile', detects 891-- character encoding using 'detectUtf', parses the XML using 'parseBytes', and 892-- then hands off control to your supplied parser. 893parseFile :: MonadResource m 894 => ParseSettings 895 -> FilePath 896 -> ConduitT i Event m () 897parseFile ps fp = sourceFile fp .| transPipe liftIO (parseBytes ps) 898 899-- | Parse an event stream from a lazy 'L.ByteString'. 900parseLBS :: MonadThrow m 901 => ParseSettings 902 -> L.ByteString 903 -> ConduitT i Event m () 904parseLBS ps lbs = sourceLazy lbs .| parseBytes ps 905 906data XmlException = XmlException 907 { xmlErrorMessage :: String 908 , xmlBadInput :: Maybe Event 909 } 910 | InvalidEndElement Name (Maybe Event) 911 | InvalidEntity String (Maybe Event) 912 | MissingAttribute String 913 | UnparsedAttributes [(Name, [Content])] 914 deriving (Show, Typeable) 915 916instance Exception XmlException where 917#if MIN_VERSION_base(4, 8, 0) 918 displayException (XmlException msg (Just event)) = "Error while parsing XML event " ++ show event ++ ": " ++ msg 919 displayException (XmlException msg _) = "Error while parsing XML: " ++ msg 920 displayException (InvalidEndElement name (Just event)) = "Error while parsing XML event: expected </" ++ T.unpack (nameLocalName name) ++ ">, got " ++ show event 921 displayException (InvalidEndElement name _) = "Error while parsing XML event: expected </" ++ show name ++ ">, got nothing" 922 displayException (InvalidEntity msg (Just event)) = "Error while parsing XML entity " ++ show event ++ ": " ++ msg 923 displayException (InvalidEntity msg _) = "Error while parsing XML entity: " ++ msg 924 displayException (MissingAttribute msg) = "Missing required attribute: " ++ msg 925 displayException (UnparsedAttributes attrs) = show (length attrs) ++ " remaining unparsed attributes: \n" ++ intercalate "\n" (show <$> attrs) 926#endif 927 928 929-- | A @NameMatcher@ describes which names a tag parser is allowed to match. 930-- 931-- Since 1.5.0 932newtype NameMatcher a = NameMatcher { runNameMatcher :: Name -> Maybe a } 933 934deriving instance Functor NameMatcher 935 936instance Applicative NameMatcher where 937 pure a = NameMatcher $ const $ pure a 938 NameMatcher f <*> NameMatcher a = NameMatcher $ \name -> f name <*> a name 939 940-- | 'NameMatcher's can be combined with @\<|\>@ 941instance Alternative NameMatcher where 942 empty = NameMatcher $ const Nothing 943 NameMatcher f <|> NameMatcher g = NameMatcher (\a -> f a <|> g a) 944 945-- | Match a single 'Name' in a concise way. 946-- Note that 'Name' is namespace sensitive: when using the 'IsString' instance, 947-- use @"{http:\/\/a\/b}c"@ to match the tag @c@ in the XML namespace @http://a/b@ 948instance (a ~ Name) => IsString (NameMatcher a) where 949 fromString s = matching (== fromString s) 950 951-- | @matching f@ matches @name@ iff @f name@ is true. Returns the matched 'Name'. 952-- 953-- Since 1.5.0 954matching :: (Name -> Bool) -> NameMatcher Name 955matching f = NameMatcher $ \name -> if f name then Just name else Nothing 956 957-- | Matches any 'Name'. Returns the matched 'Name'. 958-- 959-- Since 1.5.0 960anyName :: NameMatcher Name 961anyName = matching (const True) 962 963-- | Matches any 'Name' from the given list. Returns the matched 'Name'. 964-- 965-- Since 1.5.0 966anyOf :: [Name] -> NameMatcher Name 967anyOf values = matching (`elem` values) 968 969 970-- | A monad for parsing attributes. By default, it requires you to deal with 971-- all attributes present on an element, and will throw an exception if there 972-- are unhandled attributes. Use the 'requireAttr', 'attr' et al 973-- functions for handling an attribute, and 'ignoreAttrs' if you would like to 974-- skip the rest of the attributes on an element. 975-- 976-- 'Alternative' instance behaves like 'First' monoid: it chooses first 977-- parser which doesn't fail. 978newtype AttrParser a = AttrParser { runAttrParser :: [(Name, [Content])] -> Either SomeException ([(Name, [Content])], a) } 979 980instance Monad AttrParser where 981 return a = AttrParser $ \as -> Right (as, a) 982 (AttrParser f) >>= g = AttrParser $ \as -> 983 either Left (\(as', f') -> runAttrParser (g f') as') (f as) 984instance Functor AttrParser where 985 fmap = liftM 986instance Applicative AttrParser where 987 pure = return 988 (<*>) = ap 989instance Alternative AttrParser where 990 empty = AttrParser $ const $ Left $ toException $ XmlException "AttrParser.empty" Nothing 991 AttrParser f <|> AttrParser g = AttrParser $ \x -> 992 either (const $ g x) Right (f x) 993instance MonadThrow AttrParser where 994 throwM = AttrParser . const . throwM 995 996optionalAttrRaw :: ((Name, [Content]) -> Maybe b) -> AttrParser (Maybe b) 997optionalAttrRaw f = 998 AttrParser $ go id 999 where 1000 go front [] = Right (front [], Nothing) 1001 go front (a:as) = 1002 maybe (go (front . (:) a) as) 1003 (\b -> Right (front as, Just b)) 1004 (f a) 1005 1006requireAttrRaw :: String -> ((Name, [Content]) -> Maybe b) -> AttrParser b 1007requireAttrRaw msg f = optionalAttrRaw f >>= 1008 maybe (AttrParser $ const $ Left $ toException $ MissingAttribute msg) 1009 return 1010 1011-- | Return the value for an attribute if present. 1012attr :: Name -> AttrParser (Maybe Text) 1013attr n = optionalAttrRaw 1014 (\(x, y) -> if x == n then Just (contentsToText y) else Nothing) 1015 1016-- | Shortcut composition of 'force' and 'attr'. 1017requireAttr :: Name -> AttrParser Text 1018requireAttr n = force ("Missing attribute: " ++ show n) $ attr n 1019 1020 1021{-# DEPRECATED optionalAttr "Please use 'attr'." #-} 1022optionalAttr :: Name -> AttrParser (Maybe Text) 1023optionalAttr = attr 1024 1025contentsToText :: [Content] -> Text 1026contentsToText = T.concat . map toText where 1027 toText (ContentText t) = t 1028 toText (ContentEntity e) = T.concat ["&", e, ";"] 1029 1030-- | Skip the remaining attributes on an element. Since this will clear the 1031-- list of attributes, you must call this /after/ any calls to 'requireAttr', 1032-- 'optionalAttr', etc. 1033ignoreAttrs :: AttrParser () 1034ignoreAttrs = AttrParser $ const $ Right ([], ()) 1035 1036-- | Keep parsing elements as long as the parser returns 'Just'. 1037many :: Monad m 1038 => ConduitT Event o m (Maybe a) 1039 -> ConduitT Event o m [a] 1040many i = manyIgnore i $ return Nothing 1041 1042-- | Like 'many' but discards the results without building an intermediate list. 1043-- 1044-- Since 1.5.0 1045many_ :: MonadThrow m 1046 => ConduitT Event o m (Maybe a) 1047 -> ConduitT Event o m () 1048many_ consumer = manyIgnoreYield (return Nothing) (void <$> consumer) 1049 1050-- | Keep parsing elements as long as the parser returns 'Just' 1051-- or the ignore parser returns 'Just'. 1052manyIgnore :: Monad m 1053 => ConduitT Event o m (Maybe a) 1054 -> ConduitT Event o m (Maybe b) 1055 -> ConduitT Event o m [a] 1056manyIgnore i ignored = go id where 1057 go front = i >>= maybe (onFail front) (\y -> go $ front . (:) y) 1058 -- onFail is called if the main parser fails 1059 onFail front = ignored >>= maybe (return $ front []) (const $ go front) 1060 1061-- | Like @many@, but any tags and content the consumer doesn't match on 1062-- are silently ignored. 1063many' :: MonadThrow m 1064 => ConduitT Event o m (Maybe a) 1065 -> ConduitT Event o m [a] 1066many' consumer = manyIgnore consumer ignoreAnyTreeContent 1067 1068 1069-- | Like 'many', but uses 'yield' so the result list can be streamed 1070-- to downstream conduits without waiting for 'manyYield' to finish 1071manyYield :: Monad m 1072 => ConduitT a b m (Maybe b) 1073 -> ConduitT a b m () 1074manyYield consumer = fix $ \loop -> 1075 consumer >>= maybe (return ()) (\x -> yield x >> loop) 1076 1077-- | Like 'manyIgnore', but uses 'yield' so the result list can be streamed 1078-- to downstream conduits without waiting for 'manyIgnoreYield' to finish 1079manyIgnoreYield :: MonadThrow m 1080 => ConduitT Event b m (Maybe b) -- ^ Consuming parser that generates the result stream 1081 -> ConduitT Event b m (Maybe ()) -- ^ Ignore parser that consumes elements to be ignored 1082 -> ConduitT Event b m () 1083manyIgnoreYield consumer ignoreParser = fix $ \loop -> 1084 consumer >>= maybe (onFail loop) (\x -> yield x >> loop) 1085 where onFail loop = ignoreParser >>= maybe (return ()) (const loop) 1086 1087-- | Like 'many'', but uses 'yield' so the result list can be streamed 1088-- to downstream conduits without waiting for 'manyYield'' to finish 1089manyYield' :: MonadThrow m 1090 => ConduitT Event b m (Maybe b) 1091 -> ConduitT Event b m () 1092manyYield' consumer = manyIgnoreYield consumer ignoreAnyTreeContent 1093 1094 1095-- | Stream a single content 'Event'. 1096-- 1097-- Returns @Just ()@ if a content 'Event' was consumed, @Nothing@ otherwise. 1098-- 1099-- >>> :set -XOverloadedStrings 1100-- >>> import Control.Monad (void) 1101-- >>> import Conduit 1102-- 1103-- >>> runConduit $ parseLBS def "content<a></a>" .| void takeContent .| sinkList 1104-- [EventBeginDocument,EventContent (ContentText "content")] 1105-- 1106-- If next event isn't a content, nothing is consumed. 1107-- 1108-- >>> runConduit $ parseLBS def "<a>content</a>" .| void takeContent .| sinkList 1109-- [EventBeginDocument] 1110-- 1111-- Since 1.5.0 1112takeContent :: MonadThrow m => ConduitT Event Event m (Maybe ()) 1113takeContent = do 1114 event <- await 1115 case event of 1116 Just e@EventContent{} -> yield e >> return (Just ()) 1117 Just e@EventCDATA{} -> yield e >> return (Just ()) 1118 Just e -> if isWhitespace e then yield e >> takeContent else leftover e >> return Nothing 1119 _ -> return Nothing 1120 1121-- | Stream 'Event's corresponding to a single XML element that matches given 'NameMatcher' and 'AttrParser', from the opening- to the closing-tag. 1122-- 1123-- >>> :set -XOverloadedStrings 1124-- >>> import Control.Monad (void) 1125-- >>> import Conduit 1126-- 1127-- >>> runConduit $ parseLBS def "<a>content</a><b></b>" .| void (takeTree "a" ignoreAttrs) .| sinkList 1128-- [EventBeginDocument,EventBeginElement (Name {nameLocalName = "a", ...}) [],EventContent (ContentText "content"),EventEndElement (Name {nameLocalName = "a", ...})] 1129-- 1130-- >>> runConduit $ parseLBS def "<a>content</a><b></b>" .| void (takeTree "b" ignoreAttrs) .| sinkList 1131-- [EventBeginDocument] 1132-- 1133-- If next 'Event' isn't an element, nothing is consumed. 1134-- 1135-- >>> runConduit $ parseLBS def "text<a></a>" .| void (takeTree "a" ignoreAttrs) .| sinkList 1136-- [EventBeginDocument] 1137-- 1138-- If an opening-tag is consumed but no matching closing-tag is found, an 'XmlException' is thrown. 1139-- 1140-- >>> runConduit $ parseLBS def "<a><b></b>" .| void (takeTree "a" ignoreAttrs) .| sinkList 1141-- *** Exception: InvalidEndElement (Name {nameLocalName = "a", nameNamespace = Nothing, namePrefix = Nothing}) Nothing 1142-- 1143-- This function automatically ignores comments, instructions and whitespace. 1144-- 1145-- Returns @Just ()@ if an element was consumed, 'Nothing' otherwise. 1146-- 1147-- Since 1.5.0 1148takeTree :: MonadThrow m => NameMatcher a -> AttrParser b -> ConduitT Event Event m (Maybe ()) 1149takeTree nameMatcher attrParser = do 1150 event <- await 1151 case event of 1152 Just e@(EventBeginElement name as) -> case runNameMatcher nameMatcher name of 1153 Just _ -> case runAttrParser attrParser as of 1154 Right _ -> do 1155 yield e 1156 whileJust takeAnyTreeContent 1157 endEvent <- await 1158 case endEvent of 1159 Just e'@(EventEndElement name') | name == name' -> yield e' >> return (Just ()) 1160 _ -> lift $ throwM $ InvalidEndElement name endEvent 1161 _ -> leftover e >> return Nothing 1162 _ -> leftover e >> return Nothing 1163 1164 Just e -> if isWhitespace e then yield e >> takeTree nameMatcher attrParser else leftover e >> return Nothing 1165 _ -> return Nothing 1166 where 1167 whileJust f = fix $ \loop -> f >>= maybe (return ()) (const loop) 1168 1169-- | Like 'takeTree', but can also stream a content 'Event'. 1170-- 1171-- >>> :set -XOverloadedStrings 1172-- >>> import Control.Monad (void) 1173-- >>> import Conduit 1174-- 1175-- >>> runConduit $ parseLBS def "<a>content</a><b></b>" .| void (takeTreeContent "a" ignoreAttrs) .| sinkList 1176-- [EventBeginDocument,EventBeginElement (Name {nameLocalName = "a", ...}) [],EventContent (ContentText "content"),EventEndElement (Name {nameLocalName = "a", ...})] 1177-- 1178-- >>> runConduit $ parseLBS def "<a>content</a><b></b>" .| void (takeTreeContent "b" ignoreAttrs) .| sinkList 1179-- [EventBeginDocument] 1180-- 1181-- >>> runConduit $ parseLBS def "content<a></a><b></b>" .| void (takeTreeContent "a" ignoreAttrs) .| sinkList 1182-- [EventBeginDocument,EventContent (ContentText "content")] 1183-- 1184-- Since 1.5.0 1185takeTreeContent :: MonadThrow m => NameMatcher a -> AttrParser b -> ConduitT Event Event m (Maybe ()) 1186takeTreeContent nameMatcher attrParser = runMaybeT $ MaybeT (takeTree nameMatcher attrParser) <|> MaybeT takeContent 1187 1188-- | Like 'takeTreeContent', without checking for tag name or attributes. 1189-- 1190-- >>> :set -XOverloadedStrings 1191-- >>> import Control.Monad (void) 1192-- >>> import Conduit 1193-- 1194-- >>> runConduit $ parseLBS def "text<a></a>" .| void takeAnyTreeContent .| sinkList 1195-- [EventBeginDocument,EventContent (ContentText "text")] 1196-- 1197-- >>> runConduit $ parseLBS def "</a><b></b>" .| void takeAnyTreeContent .| sinkList 1198-- [EventBeginDocument] 1199-- 1200-- >>> runConduit $ parseLBS def "<b><c></c></b></a>text" .| void takeAnyTreeContent .| sinkList 1201-- [EventBeginDocument,EventBeginElement (Name {nameLocalName = "b", ...}) [],EventBeginElement (Name {nameLocalName = "c", ...}) [],EventEndElement (Name {nameLocalName = "c", ...}),EventEndElement (Name {nameLocalName = "b", ...})] 1202-- 1203-- Since 1.5.0 1204takeAnyTreeContent :: MonadThrow m 1205 => ConduitT Event Event m (Maybe ()) 1206takeAnyTreeContent = takeTreeContent anyName ignoreAttrs 1207 1208 1209-- | Default implementation of 'DecodeEntities', which leaves the 1210-- entity as-is. Numeric character references and the five standard 1211-- entities (lt, gt, amp, quot, pos) are handled internally by the 1212-- parser. 1213decodeXmlEntities :: DecodeEntities 1214decodeXmlEntities = ContentEntity 1215 1216-- | HTML4-compliant entity decoder. Handles the additional 248 1217-- entities defined by HTML 4 and XHTML 1. 1218-- 1219-- Note that HTML 5 introduces a drastically larger number of entities, and 1220-- this code does not recognize most of them. 1221decodeHtmlEntities :: DecodeEntities 1222decodeHtmlEntities t = 1223 maybe (ContentEntity t) ContentText $ Map.lookup t htmlEntities 1224 1225htmlEntities :: Map.Map T.Text T.Text 1226htmlEntities = Map.fromList 1227 $ map (pack *** pack) -- Work around the long-compile-time bug 1228 [ ("nbsp", "\160") 1229 , ("iexcl", "\161") 1230 , ("cent", "\162") 1231 , ("pound", "\163") 1232 , ("curren", "\164") 1233 , ("yen", "\165") 1234 , ("brvbar", "\166") 1235 , ("sect", "\167") 1236 , ("uml", "\168") 1237 , ("copy", "\169") 1238 , ("ordf", "\170") 1239 , ("laquo", "\171") 1240 , ("not", "\172") 1241 , ("shy", "\173") 1242 , ("reg", "\174") 1243 , ("macr", "\175") 1244 , ("deg", "\176") 1245 , ("plusmn", "\177") 1246 , ("sup2", "\178") 1247 , ("sup3", "\179") 1248 , ("acute", "\180") 1249 , ("micro", "\181") 1250 , ("para", "\182") 1251 , ("middot", "\183") 1252 , ("cedil", "\184") 1253 , ("sup1", "\185") 1254 , ("ordm", "\186") 1255 , ("raquo", "\187") 1256 , ("frac14", "\188") 1257 , ("frac12", "\189") 1258 , ("frac34", "\190") 1259 , ("iquest", "\191") 1260 , ("Agrave", "\192") 1261 , ("Aacute", "\193") 1262 , ("Acirc", "\194") 1263 , ("Atilde", "\195") 1264 , ("Auml", "\196") 1265 , ("Aring", "\197") 1266 , ("AElig", "\198") 1267 , ("Ccedil", "\199") 1268 , ("Egrave", "\200") 1269 , ("Eacute", "\201") 1270 , ("Ecirc", "\202") 1271 , ("Euml", "\203") 1272 , ("Igrave", "\204") 1273 , ("Iacute", "\205") 1274 , ("Icirc", "\206") 1275 , ("Iuml", "\207") 1276 , ("ETH", "\208") 1277 , ("Ntilde", "\209") 1278 , ("Ograve", "\210") 1279 , ("Oacute", "\211") 1280 , ("Ocirc", "\212") 1281 , ("Otilde", "\213") 1282 , ("Ouml", "\214") 1283 , ("times", "\215") 1284 , ("Oslash", "\216") 1285 , ("Ugrave", "\217") 1286 , ("Uacute", "\218") 1287 , ("Ucirc", "\219") 1288 , ("Uuml", "\220") 1289 , ("Yacute", "\221") 1290 , ("THORN", "\222") 1291 , ("szlig", "\223") 1292 , ("agrave", "\224") 1293 , ("aacute", "\225") 1294 , ("acirc", "\226") 1295 , ("atilde", "\227") 1296 , ("auml", "\228") 1297 , ("aring", "\229") 1298 , ("aelig", "\230") 1299 , ("ccedil", "\231") 1300 , ("egrave", "\232") 1301 , ("eacute", "\233") 1302 , ("ecirc", "\234") 1303 , ("euml", "\235") 1304 , ("igrave", "\236") 1305 , ("iacute", "\237") 1306 , ("icirc", "\238") 1307 , ("iuml", "\239") 1308 , ("eth", "\240") 1309 , ("ntilde", "\241") 1310 , ("ograve", "\242") 1311 , ("oacute", "\243") 1312 , ("ocirc", "\244") 1313 , ("otilde", "\245") 1314 , ("ouml", "\246") 1315 , ("divide", "\247") 1316 , ("oslash", "\248") 1317 , ("ugrave", "\249") 1318 , ("uacute", "\250") 1319 , ("ucirc", "\251") 1320 , ("uuml", "\252") 1321 , ("yacute", "\253") 1322 , ("thorn", "\254") 1323 , ("yuml", "\255") 1324 , ("OElig", "\338") 1325 , ("oelig", "\339") 1326 , ("Scaron", "\352") 1327 , ("scaron", "\353") 1328 , ("Yuml", "\376") 1329 , ("fnof", "\402") 1330 , ("circ", "\710") 1331 , ("tilde", "\732") 1332 , ("Alpha", "\913") 1333 , ("Beta", "\914") 1334 , ("Gamma", "\915") 1335 , ("Delta", "\916") 1336 , ("Epsilon", "\917") 1337 , ("Zeta", "\918") 1338 , ("Eta", "\919") 1339 , ("Theta", "\920") 1340 , ("Iota", "\921") 1341 , ("Kappa", "\922") 1342 , ("Lambda", "\923") 1343 , ("Mu", "\924") 1344 , ("Nu", "\925") 1345 , ("Xi", "\926") 1346 , ("Omicron", "\927") 1347 , ("Pi", "\928") 1348 , ("Rho", "\929") 1349 , ("Sigma", "\931") 1350 , ("Tau", "\932") 1351 , ("Upsilon", "\933") 1352 , ("Phi", "\934") 1353 , ("Chi", "\935") 1354 , ("Psi", "\936") 1355 , ("Omega", "\937") 1356 , ("alpha", "\945") 1357 , ("beta", "\946") 1358 , ("gamma", "\947") 1359 , ("delta", "\948") 1360 , ("epsilon", "\949") 1361 , ("zeta", "\950") 1362 , ("eta", "\951") 1363 , ("theta", "\952") 1364 , ("iota", "\953") 1365 , ("kappa", "\954") 1366 , ("lambda", "\955") 1367 , ("mu", "\956") 1368 , ("nu", "\957") 1369 , ("xi", "\958") 1370 , ("omicron", "\959") 1371 , ("pi", "\960") 1372 , ("rho", "\961") 1373 , ("sigmaf", "\962") 1374 , ("sigma", "\963") 1375 , ("tau", "\964") 1376 , ("upsilon", "\965") 1377 , ("phi", "\966") 1378 , ("chi", "\967") 1379 , ("psi", "\968") 1380 , ("omega", "\969") 1381 , ("thetasym", "\977") 1382 , ("upsih", "\978") 1383 , ("piv", "\982") 1384 , ("ensp", "\8194") 1385 , ("emsp", "\8195") 1386 , ("thinsp", "\8201") 1387 , ("zwnj", "\8204") 1388 , ("zwj", "\8205") 1389 , ("lrm", "\8206") 1390 , ("rlm", "\8207") 1391 , ("ndash", "\8211") 1392 , ("mdash", "\8212") 1393 , ("lsquo", "\8216") 1394 , ("rsquo", "\8217") 1395 , ("sbquo", "\8218") 1396 , ("ldquo", "\8220") 1397 , ("rdquo", "\8221") 1398 , ("bdquo", "\8222") 1399 , ("dagger", "\8224") 1400 , ("Dagger", "\8225") 1401 , ("bull", "\8226") 1402 , ("hellip", "\8230") 1403 , ("permil", "\8240") 1404 , ("prime", "\8242") 1405 , ("Prime", "\8243") 1406 , ("lsaquo", "\8249") 1407 , ("rsaquo", "\8250") 1408 , ("oline", "\8254") 1409 , ("frasl", "\8260") 1410 , ("euro", "\8364") 1411 , ("image", "\8465") 1412 , ("weierp", "\8472") 1413 , ("real", "\8476") 1414 , ("trade", "\8482") 1415 , ("alefsym", "\8501") 1416 , ("larr", "\8592") 1417 , ("uarr", "\8593") 1418 , ("rarr", "\8594") 1419 , ("darr", "\8595") 1420 , ("harr", "\8596") 1421 , ("crarr", "\8629") 1422 , ("lArr", "\8656") 1423 , ("uArr", "\8657") 1424 , ("rArr", "\8658") 1425 , ("dArr", "\8659") 1426 , ("hArr", "\8660") 1427 , ("forall", "\8704") 1428 , ("part", "\8706") 1429 , ("exist", "\8707") 1430 , ("empty", "\8709") 1431 , ("nabla", "\8711") 1432 , ("isin", "\8712") 1433 , ("notin", "\8713") 1434 , ("ni", "\8715") 1435 , ("prod", "\8719") 1436 , ("sum", "\8721") 1437 , ("minus", "\8722") 1438 , ("lowast", "\8727") 1439 , ("radic", "\8730") 1440 , ("prop", "\8733") 1441 , ("infin", "\8734") 1442 , ("ang", "\8736") 1443 , ("and", "\8743") 1444 , ("or", "\8744") 1445 , ("cap", "\8745") 1446 , ("cup", "\8746") 1447 , ("int", "\8747") 1448 , ("there4", "\8756") 1449 , ("sim", "\8764") 1450 , ("cong", "\8773") 1451 , ("asymp", "\8776") 1452 , ("ne", "\8800") 1453 , ("equiv", "\8801") 1454 , ("le", "\8804") 1455 , ("ge", "\8805") 1456 , ("sub", "\8834") 1457 , ("sup", "\8835") 1458 , ("nsub", "\8836") 1459 , ("sube", "\8838") 1460 , ("supe", "\8839") 1461 , ("oplus", "\8853") 1462 , ("otimes", "\8855") 1463 , ("perp", "\8869") 1464 , ("sdot", "\8901") 1465 , ("lceil", "\8968") 1466 , ("rceil", "\8969") 1467 , ("lfloor", "\8970") 1468 , ("rfloor", "\8971") 1469 , ("lang", "\9001") 1470 , ("rang", "\9002") 1471 , ("loz", "\9674") 1472 , ("spades", "\9824") 1473 , ("clubs", "\9827") 1474 , ("hearts", "\9829") 1475 , ("diams", "\9830") 1476 ] 1477