1{-# LANGUAGE BangPatterns               #-}
2{-# LANGUAGE CPP                        #-}
3{-# LANGUAGE DeriveDataTypeable         #-}
4{-# LANGUAGE DeriveFunctor              #-}
5{-# LANGUAGE FlexibleContexts           #-}
6{-# LANGUAGE FlexibleInstances          #-}
7{-# LANGUAGE GeneralizedNewtypeDeriving #-}
8{-# LANGUAGE OverloadedStrings          #-}
9{-# LANGUAGE PatternGuards              #-}
10{-# LANGUAGE RankNTypes                 #-}
11{-# LANGUAGE StandaloneDeriving         #-}
12{-# LANGUAGE TypeFamilies               #-}
13-- | This module provides both a native Haskell solution for parsing XML
14-- documents into a stream of events, and a set of parser combinators for
15-- dealing with a stream of events.
16--
17-- As a simple example:
18--
19-- >>> :set -XOverloadedStrings
20-- >>> import Conduit (runConduit, (.|))
21-- >>> import Data.Text (Text, unpack)
22-- >>> import Data.XML.Types (Event)
23-- >>> data Person = Person Int Text Text deriving Show
24-- >>> :{
25-- let parsePerson :: MonadThrow m => ConduitT Event o m (Maybe Person)
26--     parsePerson = tag' "person" parseAttributes $ \(age, goodAtHaskell) -> do
27--       name <- content
28--       return $ Person (read $ unpack age) name goodAtHaskell
29--       where parseAttributes = (,) <$> requireAttr "age" <*> requireAttr "goodAtHaskell" <* ignoreAttrs
30--     parsePeople :: MonadThrow m => ConduitT Event o m (Maybe [Person])
31--     parsePeople = tagNoAttr "people" $ many parsePerson
32--     inputXml = mconcat
33--       [ "<?xml version=\"1.0\" encoding=\"utf-8\"?>"
34--       , "<people>"
35--       , "  <person age=\"25\" goodAtHaskell=\"yes\">Michael</person>"
36--       , "  <person age=\"2\" goodAtHaskell=\"might become\">Eliezer</person>"
37--       , "</people>"
38--       ]
39-- :}
40--
41-- >>> runConduit $ parseLBS def inputXml .| force "people required" parsePeople
42-- [Person 25 "Michael" "yes",Person 2 "Eliezer" "might become"]
43--
44--
45-- This module also supports streaming results using 'yield'.
46-- This allows parser results to be processed using conduits
47-- while a particular parser (e.g. 'many') is still running.
48-- Without using streaming results, you have to wait until the parser finished
49-- before you can process the result list. Large XML files might be easier
50-- to process by using streaming results.
51-- See http://stackoverflow.com/q/21367423/2597135 for a related discussion.
52--
53-- >>> import Data.Conduit.List as CL
54-- >>> :{
55-- let parsePeople' :: MonadThrow m => ConduitT Event Person m (Maybe ())
56--     parsePeople' = tagNoAttr "people" $ manyYield parsePerson
57-- :}
58--
59-- >>> runConduit $ parseLBS def inputXml .| force "people required" parsePeople' .| CL.mapM_ print
60-- Person 25 "Michael" "yes"
61-- Person 2 "Eliezer" "might become"
62--
63-- Previous versions of this module contained a number of more sophisticated
64-- functions written by Aristid Breitkreuz and Dmitry Olshansky. To keep this
65-- package simpler, those functions are being moved to a separate package. This
66-- note will be updated with the name of the package(s) when available.
67module Text.XML.Stream.Parse
68    ( -- * Parsing XML files
69      parseBytes
70    , parseBytesPos
71    , parseText
72    , parseTextPos
73    , detectUtf
74    , parseFile
75    , parseLBS
76      -- ** Parser settings
77    , ParseSettings
78    , def
79    , DecodeEntities
80    , DecodeIllegalCharacters
81    , psDecodeEntities
82    , psDecodeIllegalCharacters
83    , psRetainNamespaces
84      -- *** Entity decoding
85    , decodeXmlEntities
86    , decodeHtmlEntities
87      -- * Event parsing
88    , tag
89    , tag'
90    , tagNoAttr
91    , tagIgnoreAttrs
92    , content
93    , contentMaybe
94      -- * Ignoring tags/trees
95    , ignoreEmptyTag
96    , ignoreTree
97    , ignoreContent
98    , ignoreTreeContent
99    , ignoreAnyTreeContent
100      -- * Streaming events
101    , takeContent
102    , takeTree
103    , takeTreeContent
104    , takeAnyTreeContent
105      -- * Tag name matching
106    , NameMatcher(..)
107    , matching
108    , anyOf
109    , anyName
110      -- * Attribute parsing
111    , AttrParser
112    , attr
113    , requireAttr
114    , optionalAttr
115    , requireAttrRaw
116    , optionalAttrRaw
117    , ignoreAttrs
118      -- * Combinators
119    , orE
120    , choose
121    , many
122    , many_
123    , manyIgnore
124    , many'
125    , force
126      -- * Streaming combinators
127    , manyYield
128    , manyYield'
129    , manyIgnoreYield
130      -- * Exceptions
131    , XmlException (..)
132      -- * Other types
133    , PositionRange
134    , EventPos
135    ) where
136import           Conduit
137import           Control.Applicative          (Alternative (empty, (<|>)),
138                                               Applicative (..), (<$>))
139import qualified Control.Applicative          as A
140import           Control.Arrow                ((***))
141import           Control.Exception            (Exception (..), SomeException)
142import           Control.Monad                (ap, liftM, void)
143import           Control.Monad.Fix            (fix)
144import           Control.Monad.IO.Class       (liftIO)
145import           Control.Monad.Trans.Class    (lift)
146import           Control.Monad.Trans.Maybe    (MaybeT (..))
147import           Control.Monad.Trans.Resource (MonadResource, MonadThrow (..),
148                                               throwM)
149import           Data.Attoparsec.Text         (Parser, anyChar, char, manyTill,
150                                               skipWhile, string, takeWhile,
151                                               takeWhile1, try)
152import qualified Data.Attoparsec.Text         as AT
153import qualified Data.ByteString              as S
154import qualified Data.ByteString.Lazy         as L
155import           Data.Char                    (isSpace)
156import           Data.Conduit.Attoparsec      (PositionRange, conduitParser)
157import qualified Data.Conduit.Text            as CT
158import           Data.Default.Class           (Default (..))
159import           Data.List                    (foldl', intercalate)
160import qualified Data.Map                     as Map
161import           Data.Maybe                   (fromMaybe, isNothing)
162import           Data.String                  (IsString (..))
163import           Data.Text                    (Text, pack)
164import qualified Data.Text                    as T
165import           Data.Text.Encoding           (decodeUtf8With)
166import           Data.Text.Encoding.Error     (lenientDecode)
167import           Data.Typeable                (Typeable)
168import           Data.XML.Types               (Content (..), Event (..),
169                                               ExternalID (..),
170                                               Instruction (..), Name (..))
171import           Prelude                      hiding (takeWhile)
172import           Text.XML.Stream.Token
173
174type Ents = [(Text, Text)]
175
176tokenToEvent :: ParseSettings -> Ents -> [NSLevel] -> Token -> (Ents, [NSLevel], [Event])
177tokenToEvent _ es n (TokenXMLDeclaration _) = (es, n, [])
178tokenToEvent _ es n (TokenInstruction i) = (es, n, [EventInstruction i])
179tokenToEvent ps es n (TokenBeginElement name as isClosed _) =
180    (es, n', if isClosed then [begin, end] else [begin])
181  where
182    l0 = case n of
183            []  -> NSLevel Nothing Map.empty
184            x:_ -> x
185    (as', l') = foldl' go (id, l0) as
186    go (front, l) (TName kpref kname, val) =
187        (addNS front, l'')
188      where
189        isPrefixed = kpref == Just "xmlns"
190        isUnprefixed = isNothing kpref && kname == "xmlns"
191
192        addNS
193            | not (psRetainNamespaces ps) && (isPrefixed || isUnprefixed) = id
194            | otherwise = (((tname, map resolve val):) .)
195          where
196            tname
197                | isPrefixed = TName Nothing ("xmlns:" `T.append` kname)
198                | otherwise = TName kpref kname
199
200        l''
201            | isPrefixed =
202                l { prefixes = Map.insert kname (contentsToText val)
203                                     $ prefixes l }
204            | isUnprefixed =
205                l { defaultNS = if T.null $ contentsToText val
206                                            then Nothing
207                                            else Just $ contentsToText val }
208            | otherwise = l
209
210    resolve (ContentEntity e)
211        | Just t <- lookup e es = ContentText t
212    resolve c = c
213    n' = if isClosed then n else l' : n
214    fixAttName (name', val) = (tnameToName True l' name', val)
215    elementName = tnameToName False l' name
216    begin = EventBeginElement elementName $ map fixAttName $ as' []
217    end = EventEndElement elementName
218tokenToEvent _ es n (TokenEndElement name) =
219    (es, n', [EventEndElement $ tnameToName False l name])
220  where
221    (l, n') =
222        case n of
223            []   -> (NSLevel Nothing Map.empty, [])
224            x:xs -> (x, xs)
225tokenToEvent _ es n (TokenContent (ContentEntity e))
226    | Just t <- lookup e es = (es, n, [EventContent $ ContentText t])
227tokenToEvent _ es n (TokenContent c) = (es, n, [EventContent c])
228tokenToEvent _ es n (TokenComment c) = (es, n, [EventComment c])
229tokenToEvent _ es n (TokenDoctype t eid es') = (es ++ es', n, [EventBeginDoctype t eid, EventEndDoctype])
230tokenToEvent _ es n (TokenCDATA t) = (es, n, [EventCDATA t])
231
232tnameToName :: Bool -> NSLevel -> TName -> Name
233tnameToName _ _ (TName (Just "xml") name) =
234    Name name (Just "http://www.w3.org/XML/1998/namespace") (Just "xml")
235tnameToName isAttr (NSLevel def' _) (TName Nothing name) =
236    Name name (if isAttr then Nothing else def') Nothing
237tnameToName _ (NSLevel _ m) (TName (Just pref) name) =
238    case Map.lookup pref m of
239        Just ns -> Name name (Just ns) (Just pref)
240        Nothing -> Name name Nothing (Just pref) -- FIXME is this correct?
241
242-- | Automatically determine which UTF variant is being used. This function
243-- first checks for BOMs, removing them as necessary, and then check for the
244-- equivalent of <?xml for each of UTF-8, UTF-16LE/BE, and UTF-32LE/BE. It
245-- defaults to assuming UTF-8.
246detectUtf :: MonadThrow m => ConduitT S.ByteString T.Text m ()
247detectUtf =
248    conduit id
249  where
250    conduit front = await >>= maybe (return ()) (push front)
251
252    push front bss =
253        either conduit
254               (uncurry checkXMLDecl)
255               (getEncoding front bss)
256
257    getEncoding front bs'
258        | S.length bs < 4 =
259            Left (bs `S.append`)
260        | otherwise =
261            Right (bsOut, mcodec)
262      where
263        bs = front bs'
264        bsOut = S.append (S.drop toDrop x) y
265        (x, y) = S.splitAt 4 bs
266        (toDrop, mcodec) =
267            case S.unpack x of
268                [0x00, 0x00, 0xFE, 0xFF] -> (4, Just CT.utf32_be)
269                [0xFF, 0xFE, 0x00, 0x00] -> (4, Just CT.utf32_le)
270                0xFE : 0xFF: _           -> (2, Just CT.utf16_be)
271                0xFF : 0xFE: _           -> (2, Just CT.utf16_le)
272                0xEF : 0xBB: 0xBF : _    -> (3, Just CT.utf8)
273                [0x00, 0x00, 0x00, 0x3C] -> (0, Just CT.utf32_be)
274                [0x3C, 0x00, 0x00, 0x00] -> (0, Just CT.utf32_le)
275                [0x00, 0x3C, 0x00, 0x3F] -> (0, Just CT.utf16_be)
276                [0x3C, 0x00, 0x3F, 0x00] -> (0, Just CT.utf16_le)
277                _                        -> (0, Nothing) -- Assuming UTF-8
278
279checkXMLDecl :: MonadThrow m
280             => S.ByteString
281             -> Maybe CT.Codec
282             -> ConduitT S.ByteString T.Text m ()
283checkXMLDecl bs (Just codec) = leftover bs >> CT.decode codec
284checkXMLDecl bs0 Nothing =
285    loop [] (AT.parse (parseToken def)) bs0
286  where
287    loop chunks0 parser nextChunk =
288        case parser $ decodeUtf8With lenientDecode nextChunk of
289            AT.Fail{} -> fallback
290            AT.Partial f -> await >>= maybe fallback (loop chunks f)
291            AT.Done _ (TokenXMLDeclaration attrs) -> findEncoding attrs
292            AT.Done{} -> fallback
293      where
294        chunks = nextChunk : chunks0
295        fallback = complete CT.utf8
296        complete codec = mapM_ leftover chunks >> CT.decode codec
297
298        findEncoding [] = fallback
299        findEncoding ((TName _ "encoding", [ContentText enc]):_) =
300            case T.toLower enc of
301                "iso-8859-1" -> complete CT.iso8859_1
302                "utf-8"      -> complete CT.utf8
303                _            -> complete CT.utf8
304        findEncoding (_:xs) = findEncoding xs
305
306type EventPos = (Maybe PositionRange, Event)
307
308-- | Parses a byte stream into 'Event's. This function is implemented fully in
309-- Haskell using attoparsec-text for parsing. The produced error messages do
310-- not give line/column information, so you may prefer to stick with the parser
311-- provided by libxml-enumerator. However, this has the advantage of not
312-- relying on any C libraries.
313--
314-- This relies on 'detectUtf' to determine character encoding, and 'parseText'
315-- to do the actual parsing.
316parseBytes :: MonadThrow m
317           => ParseSettings
318           -> ConduitT S.ByteString Event m ()
319parseBytes = mapOutput snd . parseBytesPos
320
321parseBytesPos :: MonadThrow m
322              => ParseSettings
323              -> ConduitT S.ByteString EventPos m ()
324parseBytesPos ps = detectUtf .| parseTextPos ps
325
326dropBOM :: Monad m => ConduitT T.Text T.Text m ()
327dropBOM =
328    await >>= maybe (return ()) push
329  where
330    push t =
331        case T.uncons t of
332            Nothing -> dropBOM
333            Just (c, cs) ->
334                let output
335                        | c == '\xfeef' = cs
336                        | otherwise = t
337                 in yield output >> idConduit
338    idConduit = await >>= maybe (return ()) (\x -> yield x >> idConduit)
339
340-- | Parses a character stream into 'Event's. This function is implemented
341-- fully in Haskell using attoparsec-text for parsing. The produced error
342-- messages do not give line/column information, so you may prefer to stick
343-- with the parser provided by libxml-enumerator. However, this has the
344-- advantage of not relying on any C libraries.
345--
346-- Since 1.2.4
347parseText :: MonadThrow m => ParseSettings -> ConduitT T.Text Event m ()
348parseText = mapOutput snd . parseTextPos
349
350
351-- | Same as 'parseText', but includes the position of each event.
352--
353-- Since 1.2.4
354parseTextPos :: MonadThrow m
355          => ParseSettings
356          -> ConduitT T.Text EventPos m ()
357parseTextPos de =
358    dropBOM
359        .| tokenize
360        .| toEventC de
361        .| addBeginEnd
362  where
363    tokenize = conduitToken de
364    addBeginEnd = yield (Nothing, EventBeginDocument) >> addEnd
365    addEnd = await >>= maybe
366        (yield (Nothing, EventEndDocument))
367        (\e -> yield e >> addEnd)
368
369toEventC :: Monad m => ParseSettings -> ConduitT (PositionRange, Token) EventPos m ()
370toEventC ps =
371    go [] []
372  where
373    go !es !levels =
374        await >>= maybe (return ()) push
375      where
376        push (position, token) =
377            mapM_ (yield . (,) (Just position)) events >> go es' levels'
378          where
379            (es', levels', events) = tokenToEvent ps es levels token
380
381
382type DecodeEntities = Text -> Content
383type DecodeIllegalCharacters = Int -> Maybe Char
384
385data ParseSettings = ParseSettings
386    { psDecodeEntities          :: DecodeEntities
387    , psRetainNamespaces        :: Bool
388    -- ^ Whether the original xmlns attributes should be retained in the parsed
389    -- values. For more information on motivation, see:
390    --
391    -- <https://github.com/snoyberg/xml/issues/38>
392    --
393    -- Default: False
394    --
395    -- Since 1.2.1
396    , psDecodeIllegalCharacters :: DecodeIllegalCharacters
397    -- ^ How to decode illegal character references (@&#[0-9]+;@ or @&#x[0-9a-fA-F]+;@).
398    --
399    -- Character references within the legal ranges defined by <https://www.w3.org/TR/REC-xml/#NT-Char the standard> are automatically parsed.
400    -- Others are passed to this function.
401    --
402    -- Default: @const Nothing@
403    --
404    -- Since 1.7.1
405    }
406
407instance Default ParseSettings where
408    def = ParseSettings
409        { psDecodeEntities = decodeXmlEntities
410        , psRetainNamespaces = False
411        , psDecodeIllegalCharacters = const Nothing
412        }
413
414conduitToken :: MonadThrow m => ParseSettings -> ConduitT T.Text (PositionRange, Token) m ()
415conduitToken = conduitParser . parseToken
416
417parseToken :: ParseSettings -> Parser Token
418parseToken settings = (char '<' >> parseLt) <|> TokenContent <$> parseContent settings False False
419  where
420    parseLt =
421        (char '?' >> parseInstr) <|>
422        (char '!' >> (parseComment <|> parseCdata <|> parseDoctype)) <|>
423        parseBegin <|>
424        (char '/' >> parseEnd)
425    parseInstr = do
426        name <- parseIdent
427        if name == "xml"
428            then do
429                as <- A.many $ parseAttribute settings
430                skipSpace
431                char' '?'
432                char' '>'
433                newline <|> return ()
434                return $ TokenXMLDeclaration as
435            else do
436                skipSpace
437                x <- T.pack <$> manyTill anyChar (try $ string "?>")
438                return $ TokenInstruction $ Instruction name x
439    parseComment = do
440        char' '-'
441        char' '-'
442        c <- T.pack <$> manyTill anyChar (string "-->") -- FIXME use takeWhile instead
443        return $ TokenComment c
444    parseCdata = do
445        _ <- string "[CDATA["
446        t <- T.pack <$> manyTill anyChar (string "]]>") -- FIXME use takeWhile instead
447        return $ TokenCDATA t
448    parseDoctype = do
449        _ <- string "DOCTYPE"
450        skipSpace
451        name <- parseName
452        let i =
453                case name of
454                    TName Nothing x  -> x
455                    TName (Just x) y -> T.concat [x, ":", y]
456        skipSpace
457        eid <- fmap Just parsePublicID <|>
458               fmap Just parseSystemID <|>
459               return Nothing
460        skipSpace
461        ents <- (do
462            char' '['
463            ents <- parseEntities id
464            skipSpace
465            return ents) <|> return []
466        char' '>'
467        newline <|> return ()
468        return $ TokenDoctype i eid ents
469    parseEntities front =
470        (char ']' >> return (front [])) <|>
471        (parseEntity >>= \e -> parseEntities (front . (e:))) <|>
472        (char '<' >> parseEntities front) <|>
473        (skipWhile (\t -> t /= ']' && t /= '<') >> parseEntities front)
474    parseEntity = try $ do
475        _ <- string "<!ENTITY"
476        skipSpace
477        i <- parseIdent
478        t <- quotedText
479        skipSpace
480        char' '>'
481        return (i, t)
482    parsePublicID = do
483        _ <- string "PUBLIC"
484        x <- quotedText
485        y <- quotedText
486        return $ PublicID x y
487    parseSystemID = do
488        _ <- string "SYSTEM"
489        x <- quotedText
490        return $ SystemID x
491    quotedText = do
492        skipSpace
493        between '"' <|> between '\''
494    between c = do
495        char' c
496        x <- takeWhile (/=c)
497        char' c
498        return x
499    parseEnd = do
500        skipSpace
501        n <- parseName
502        skipSpace
503        char' '>'
504        return $ TokenEndElement n
505    parseBegin = do
506        skipSpace
507        n <- parseName
508        as <- A.many $ parseAttribute settings
509        skipSpace
510        isClose <- (char '/' >> skipSpace >> return True) <|> return False
511        char' '>'
512        return $ TokenBeginElement n as isClose 0
513
514parseAttribute :: ParseSettings -> Parser TAttribute
515parseAttribute settings = do
516    skipSpace
517    key <- parseName
518    skipSpace
519    char' '='
520    skipSpace
521    val <- squoted <|> dquoted
522    return (key, val)
523  where
524    squoted = char '\'' *> manyTill (parseContent settings False True) (char '\'')
525    dquoted = char  '"' *> manyTill (parseContent settings True False) (char  '"')
526
527parseName :: Parser TName
528parseName =
529  name <$> parseIdent <*> A.optional (char ':' >> parseIdent)
530  where
531    name i1 Nothing   = TName Nothing i1
532    name i1 (Just i2) = TName (Just i1) i2
533
534parseIdent :: Parser Text
535parseIdent =
536    takeWhile1 valid
537  where
538    valid '&'  = False
539    valid '<'  = False
540    valid '>'  = False
541    valid ':'  = False
542    valid '?'  = False
543    valid '='  = False
544    valid '"'  = False
545    valid '\'' = False
546    valid '/'  = False
547    valid ';'  = False
548    valid '#'  = False
549    valid c    = not $ isXMLSpace c
550
551parseContent :: ParseSettings
552             -> Bool -- break on double quote
553             -> Bool -- break on single quote
554             -> Parser Content
555parseContent (ParseSettings decodeEntities _ decodeIllegalCharacters) breakDouble breakSingle = parseReference <|> parseTextContent where
556  parseReference = do
557    char' '&'
558    t <- parseEntityRef <|> parseHexCharRef <|> parseDecCharRef
559    char' ';'
560    return t
561  parseEntityRef = do
562    TName ma b <- parseName
563    let name = maybe "" (`T.append` ":") ma `T.append` b
564    return $ case name of
565      "lt"   -> ContentText "<"
566      "gt"   -> ContentText ">"
567      "amp"  -> ContentText "&"
568      "quot" -> ContentText "\""
569      "apos" -> ContentText "'"
570      _      -> decodeEntities name
571  parseHexCharRef = do
572    void $ string "#x"
573    n <- AT.hexadecimal
574    case toValidXmlChar n <|> decodeIllegalCharacters n of
575      Nothing -> fail "Invalid character from hexadecimal character reference."
576      Just c -> return $ ContentText $ T.singleton c
577  parseDecCharRef = do
578    void $ string "#"
579    n <- AT.decimal
580    case toValidXmlChar n <|> decodeIllegalCharacters n of
581      Nothing -> fail "Invalid character from decimal character reference."
582      Just c  -> return $ ContentText $ T.singleton c
583  parseTextContent = ContentText <$> takeWhile1 valid
584  valid '"'  = not breakDouble
585  valid '\'' = not breakSingle
586  valid '&'  = False -- amp
587  valid '<'  = False -- lt
588  valid _    = True
589
590-- | Is this codepoint a valid XML character? See
591-- <https://www.w3.org/TR/xml/#charsets>. This is proudly XML 1.0 only.
592toValidXmlChar :: Int -> Maybe Char
593toValidXmlChar n
594  | any checkRange ranges = Just (toEnum n)
595  | otherwise = Nothing
596  where
597    --Inclusive lower bound, inclusive upper bound.
598    ranges :: [(Int, Int)]
599    ranges =
600      [ (0x9, 0xA)
601      , (0xD, 0xD)
602      , (0x20, 0xD7FF)
603      , (0xE000, 0xFFFD)
604      , (0x10000, 0x10FFFF)
605      ]
606    checkRange (lb, ub) = lb <= n && n <= ub
607
608skipSpace :: Parser ()
609skipSpace = skipWhile isXMLSpace
610
611-- | Determines whether a character is an XML white space. The list of
612-- white spaces is given by
613--
614-- >  S ::= (#x20 | #x9 | #xD | #xA)+
615--
616-- in <http://www.w3.org/TR/2008/REC-xml-20081126/#sec-common-syn>.
617isXMLSpace :: Char -> Bool
618isXMLSpace ' '  = True
619isXMLSpace '\t' = True
620isXMLSpace '\r' = True
621isXMLSpace '\n' = True
622isXMLSpace _    = False
623
624newline :: Parser ()
625newline = void $ (char '\r' >> char '\n') <|> char '\n'
626
627char' :: Char -> Parser ()
628char' = void . char
629
630data ContentType = Ignore | IsContent Text | IsError String | NotContent
631
632-- | Grabs the next piece of content if available. This function skips over any
633-- comments, instructions or entities, and concatenates all content until the next start
634-- or end tag.
635contentMaybe :: MonadThrow m => ConduitT Event o m (Maybe Text)
636contentMaybe = do
637    x <- peekC
638    case pc' x of
639        Ignore      -> dropC 1 >> contentMaybe
640        IsContent t -> dropC 1 >> fmap Just (takeContents (t:))
641        IsError e   -> lift $ throwM $ InvalidEntity e x
642        NotContent  -> return Nothing
643  where
644    pc' Nothing  = NotContent
645    pc' (Just x) = pc x
646    pc (EventContent (ContentText t)) = IsContent t
647    pc (EventContent (ContentEntity e)) = IsError $ "Unknown entity: " ++ show e
648    pc (EventCDATA t) = IsContent t
649    pc EventBeginElement{} = NotContent
650    pc EventEndElement{} = NotContent
651    pc EventBeginDocument{} = Ignore
652    pc EventEndDocument = Ignore
653    pc EventBeginDoctype{} = Ignore
654    pc EventEndDoctype = Ignore
655    pc EventInstruction{} = Ignore
656    pc EventComment{} = Ignore
657    takeContents front = do
658        x <- peekC
659        case pc' x of
660            Ignore      -> dropC 1 >> takeContents front
661            IsContent t -> dropC 1 >> takeContents (front . (:) t)
662            IsError e   -> lift $ throwM $ InvalidEntity e x
663            NotContent  -> return $ T.concat $ front []
664
665-- | Grabs the next piece of content. If none if available, returns 'T.empty'.
666-- This is simply a wrapper around 'contentMaybe'.
667content :: MonadThrow m => ConduitT Event o m Text
668content = fromMaybe T.empty <$> contentMaybe
669
670
671isWhitespace :: Event -> Bool
672isWhitespace EventBeginDocument             = True
673isWhitespace EventEndDocument               = True
674isWhitespace EventBeginDoctype{}            = True
675isWhitespace EventEndDoctype                = True
676isWhitespace EventInstruction{}             = True
677isWhitespace (EventContent (ContentText t)) = T.all isSpace t
678isWhitespace EventComment{}                 = True
679isWhitespace (EventCDATA t)                 = T.all isSpace t
680isWhitespace _                              = False
681
682
683-- | The most generic way to parse a tag. It takes a 'NameMatcher' to check whether
684-- this is a correct tag name, an 'AttrParser' to handle attributes, and
685-- then a parser to deal with content.
686--
687-- 'Events' are consumed if and only if the tag name and its attributes match.
688--
689-- This function automatically absorbs its balancing closing tag, and will
690-- throw an exception if not all of the attributes or child elements are
691-- consumed. If you want to allow extra attributes, see 'ignoreAttrs'.
692--
693-- This function automatically ignores comments, instructions and whitespace.
694tag :: MonadThrow m
695    => NameMatcher a -- ^ Check if this is a correct tag name
696                     --   and return a value that can be used to get an @AttrParser@.
697                     --   If this fails, the function will return @Nothing@
698    -> (a -> AttrParser b) -- ^ Given the value returned by the name checker, this function will
699                           --   be used to get an @AttrParser@ appropriate for the specific tag.
700                           --   If the @AttrParser@ fails, the function will also return @Nothing@
701    -> (b -> ConduitT Event o m c) -- ^ Handler function to handle the attributes and children
702                                   --   of a tag, given the value return from the @AttrParser@
703    -> ConduitT Event o m (Maybe c)
704tag nameMatcher attrParser f = do
705  (x, leftovers) <- dropWS []
706  res <- case x of
707    Just (EventBeginElement name as) -> case runNameMatcher nameMatcher name of
708      Just y -> case runAttrParser' (attrParser y) as of
709        Left _ -> return Nothing
710        Right z -> do
711          z' <- f z
712          (a, _leftovers') <- dropWS []
713          case a of
714            Just (EventEndElement name')
715              | name == name' -> return (Just z')
716            _ -> lift $ throwM $ InvalidEndElement name a
717      Nothing -> return Nothing
718    _ -> return Nothing
719
720  case res of
721    -- Did not parse, put back all of the leading whitespace events and the
722    -- final observed event generated by dropWS
723    Nothing -> mapM_ leftover leftovers
724    -- Parse succeeded, discard all of those whitespace events and the
725    -- first parsed event
726    Just _  -> return ()
727
728  return res
729  where
730    -- Drop Events until we encounter a non-whitespace element. Return all of
731    -- the events consumed here (including the first non-whitespace event) so
732    -- that the calling function can treat them as leftovers if the parse fails
733    dropWS leftovers = do
734        x <- await
735        let leftovers' = maybe id (:) x leftovers
736
737        case isWhitespace <$> x of
738          Just True -> dropWS leftovers'
739          _         -> return (x, leftovers')
740    runAttrParser' p as =
741        case runAttrParser p as of
742            Left e           -> Left e
743            Right ([], x)    -> Right x
744            Right (attr', _) -> Left $ toException $ UnparsedAttributes attr'
745
746-- | A simplified version of 'tag' where the 'NameMatcher' result isn't forwarded to the attributes parser.
747--
748-- Since 1.5.0
749tag' :: MonadThrow m
750     => NameMatcher a -> AttrParser b -> (b -> ConduitT Event o m c)
751     -> ConduitT Event o m (Maybe c)
752tag' a b = tag a (const b)
753
754-- | A further simplified tag parser, which requires that no attributes exist.
755tagNoAttr :: MonadThrow m
756          => NameMatcher a -- ^ Check if this is a correct tag name
757          -> ConduitT Event o m b -- ^ Handler function to handle the children of the matched tag
758          -> ConduitT Event o m (Maybe b)
759tagNoAttr name f = tag' name (return ()) $ const f
760
761
762-- | A further simplified tag parser, which ignores all attributes, if any exist
763tagIgnoreAttrs :: MonadThrow m
764               => NameMatcher a -- ^ Check if this is a correct tag name
765               -> ConduitT Event o m b -- ^ Handler function to handle the children of the matched tag
766               -> ConduitT Event o m (Maybe b)
767tagIgnoreAttrs name f = tag' name ignoreAttrs $ const f
768
769
770-- | Ignore an empty tag and all of its attributes.
771--   This does not ignore the tag recursively
772--   (i.e. it assumes there are no child elements).
773--   This function returns @Just ()@ if the tag matched.
774--
775-- Since 1.5.0
776ignoreEmptyTag :: MonadThrow m
777          => NameMatcher a -- ^ Check if this is a correct tag name
778          -> ConduitT Event o m (Maybe ())
779ignoreEmptyTag nameMatcher = tagIgnoreAttrs nameMatcher (return ())
780
781
782ignored :: Monad m => ConduitT i o m ()
783ignored = fix $ \recurse -> do
784  event <- await
785  case event of
786    Just _ -> recurse
787    _      -> return ()
788
789
790-- | Same as `takeTree`, without yielding `Event`s.
791--
792-- >>> :set -XOverloadedStrings
793-- >>> import Conduit
794--
795-- >>> runConduit $ parseLBS def "<a>content</a><b></b>" .| (ignoreTree "a" ignoreAttrs >> sinkList)
796-- [EventBeginElement (Name {nameLocalName = "b", ...}) [],EventEndElement (Name {nameLocalName = "b", ...}),EventEndDocument]
797--
798-- >>> runConduit $ parseLBS def "<a>content</a>" .| (ignoreTree "b" ignoreAttrs >> sinkList)
799-- [EventBeginElement (Name {nameLocalName = "a", ...}) [],EventContent (ContentText "content"),EventEndElement (Name {nameLocalName = "a", ...}),EventEndDocument]
800--
801-- >>> runConduit $ parseLBS def "content<a></a>" .| (ignoreTree anyName ignoreAttrs >> sinkList)
802-- [EventContent (ContentText "content"),EventBeginElement (Name {nameLocalName = "a", ...}) [],EventEndElement (Name {nameLocalName = "a", ...}),EventEndDocument]
803--
804-- Since 1.9.0
805ignoreTree :: MonadThrow m => NameMatcher a -> AttrParser b -> ConduitT Event o m (Maybe ())
806ignoreTree nameMatcher attrParser = fuseUpstream (takeTree nameMatcher attrParser) ignored
807
808-- | Same as `takeContent`, without yielding `Event`s.
809--
810-- >>> :set -XOverloadedStrings
811-- >>> import Conduit
812--
813-- >>> runConduit $ parseLBS def "<a>content</a>" .| (ignoreContent >> sinkList)
814-- [EventBeginElement (Name {nameLocalName = "a", ...}) [],EventContent (ContentText "content"),EventEndElement (Name {nameLocalName = "a", ...}),EventEndDocument]
815--
816-- >>> runConduit $ parseLBS def "content<a></a>" .| (ignoreContent >> sinkList)
817-- [EventBeginElement (Name {nameLocalName = "a", ...}) [],EventEndElement (Name {nameLocalName = "a", ...}),EventEndDocument]
818--
819-- >>> runConduit $ parseLBS def "content<a></a>" .| (ignoreContent >> sinkList)
820-- [EventBeginElement (Name {nameLocalName = "a", ...}) [],EventEndElement (Name {nameLocalName = "a", ...}),EventEndDocument]
821--
822-- Since 1.9.0
823ignoreContent :: MonadThrow m => ConduitT Event o m (Maybe ())
824ignoreContent = fuseUpstream takeContent ignored
825
826
827-- | Same as `takeTreeContent`, without yielding `Event`s.
828--
829-- >>> :set -XOverloadedStrings
830-- >>> import Conduit
831--
832-- >>> runConduit $ parseLBS def "<a>content</a><b></b>" .| (ignoreTreeContent "a" ignoreAttrs >> sinkList)
833-- [EventBeginElement (Name {nameLocalName = "b", ...}) [],EventEndElement (Name {nameLocalName = "b", ...}),EventEndDocument]
834--
835-- >>> runConduit $ parseLBS def "<a>content</a>" .| (ignoreTreeContent "b" ignoreAttrs >> sinkList)
836-- [EventBeginElement (Name {nameLocalName = "a", ...}) [],EventContent (ContentText "content"),EventEndElement (Name {nameLocalName = "a", ...}),EventEndDocument]
837--
838-- >>> runConduit $ parseLBS def "content<a></a>" .| (ignoreTreeContent anyName ignoreAttrs >> sinkList)
839-- [EventBeginElement (Name {nameLocalName = "a", ...}) [],EventEndElement (Name {nameLocalName = "a", ...}),EventEndDocument]
840--
841-- Since 1.5.0
842ignoreTreeContent :: MonadThrow m => NameMatcher a -> AttrParser b -> ConduitT Event o m (Maybe ())
843ignoreTreeContent namePred attrParser = fuseUpstream (takeTreeContent namePred attrParser) ignored
844
845
846-- | Same as `takeAnyTreeContent`, without yielding `Event`s.
847--
848-- >>> :set -XOverloadedStrings
849-- >>> import Conduit
850--
851-- >>> runConduit $ parseLBS def "<a>content</a><b></b>" .| (ignoreAnyTreeContent >> sinkList)
852-- [EventBeginElement (Name {nameLocalName = "b", ...}) [],EventEndElement (Name {nameLocalName = "b", ...}),EventEndDocument]
853--
854-- >>> runConduit $ parseLBS def "text<b></b>" .| (ignoreAnyTreeContent >> sinkList)
855-- [EventBeginElement (Name {nameLocalName = "b", ...}) [],EventEndElement (Name {nameLocalName = "b", ...}),EventEndDocument]
856--
857-- Since 1.5.0
858ignoreAnyTreeContent :: MonadThrow m => ConduitT Event o m (Maybe ())
859ignoreAnyTreeContent = fuseUpstream takeAnyTreeContent ignored
860
861
862-- | Get the value of the first parser which returns 'Just'. If no parsers
863-- succeed (i.e., return @Just@), this function returns 'Nothing'.
864--
865-- > orE a b = choose [a, b]
866orE :: Monad m
867    => ConduitT Event o m (Maybe a) -- ^ The first (preferred) parser
868    -> ConduitT Event o m (Maybe a) -- ^ The second parser, only executed if the first parser fails
869    -> ConduitT Event o m (Maybe a)
870orE a b = a >>= \x -> maybe b (const $ return x) x
871
872-- | Get the value of the first parser which returns 'Just'. If no parsers
873-- succeed (i.e., return 'Just'), this function returns 'Nothing'.
874choose :: Monad m
875       => [ConduitT Event o m (Maybe a)] -- ^ List of parsers that will be tried in order.
876       -> ConduitT Event o m (Maybe a)   -- ^ Result of the first parser to succeed, or @Nothing@
877                                         --   if no parser succeeded
878choose []     = return Nothing
879choose (i:is) = i >>= maybe (choose is) (return . Just)
880
881-- | Force an optional parser into a required parser. All of the 'tag'
882-- functions, 'attr', 'choose' and 'many' deal with 'Maybe' parsers. Use this when you
883-- want to finally force something to happen.
884force :: MonadThrow m
885      => String -- ^ Error message
886      -> m (Maybe a) -- ^ Optional parser to be forced
887      -> m a
888force msg i = i >>= maybe (throwM $ XmlException msg Nothing) return
889
890-- | A helper function which reads a file from disk using 'enumFile', detects
891-- character encoding using 'detectUtf', parses the XML using 'parseBytes', and
892-- then hands off control to your supplied parser.
893parseFile :: MonadResource m
894          => ParseSettings
895          -> FilePath
896          -> ConduitT i Event m ()
897parseFile ps fp = sourceFile fp .| transPipe liftIO (parseBytes ps)
898
899-- | Parse an event stream from a lazy 'L.ByteString'.
900parseLBS :: MonadThrow m
901         => ParseSettings
902         -> L.ByteString
903         -> ConduitT i Event m ()
904parseLBS ps lbs = sourceLazy lbs .| parseBytes ps
905
906data XmlException = XmlException
907    { xmlErrorMessage :: String
908    , xmlBadInput     :: Maybe Event
909    }
910                  | InvalidEndElement Name (Maybe Event)
911                  | InvalidEntity String (Maybe Event)
912                  | MissingAttribute String
913                  | UnparsedAttributes [(Name, [Content])]
914    deriving (Show, Typeable)
915
916instance Exception XmlException where
917#if MIN_VERSION_base(4, 8, 0)
918  displayException (XmlException msg (Just event)) = "Error while parsing XML event " ++ show event ++ ": " ++ msg
919  displayException (XmlException msg _) = "Error while parsing XML: " ++ msg
920  displayException (InvalidEndElement name (Just event)) = "Error while parsing XML event: expected </" ++ T.unpack (nameLocalName name) ++ ">, got " ++ show event
921  displayException (InvalidEndElement name _) = "Error while parsing XML event: expected </" ++ show name ++ ">, got nothing"
922  displayException (InvalidEntity msg (Just event)) = "Error while parsing XML entity " ++ show event ++ ": " ++ msg
923  displayException (InvalidEntity msg _) = "Error while parsing XML entity: " ++ msg
924  displayException (MissingAttribute msg) = "Missing required attribute: " ++ msg
925  displayException (UnparsedAttributes attrs) = show (length attrs) ++ " remaining unparsed attributes: \n" ++ intercalate "\n" (show <$> attrs)
926#endif
927
928
929-- | A @NameMatcher@ describes which names a tag parser is allowed to match.
930--
931-- Since 1.5.0
932newtype NameMatcher a = NameMatcher { runNameMatcher :: Name -> Maybe a }
933
934deriving instance Functor NameMatcher
935
936instance Applicative NameMatcher where
937  pure a = NameMatcher $ const $ pure a
938  NameMatcher f <*> NameMatcher a = NameMatcher $ \name -> f name <*> a name
939
940-- | 'NameMatcher's can be combined with @\<|\>@
941instance Alternative NameMatcher where
942  empty = NameMatcher $ const Nothing
943  NameMatcher f <|> NameMatcher g = NameMatcher (\a -> f a <|> g a)
944
945-- | Match a single 'Name' in a concise way.
946-- Note that 'Name' is namespace sensitive: when using the 'IsString' instance,
947-- use @"{http:\/\/a\/b}c"@ to match the tag @c@ in the XML namespace @http://a/b@
948instance (a ~ Name) => IsString (NameMatcher a) where
949  fromString s = matching (== fromString s)
950
951-- | @matching f@ matches @name@ iff @f name@ is true. Returns the matched 'Name'.
952--
953-- Since 1.5.0
954matching :: (Name -> Bool) -> NameMatcher Name
955matching f = NameMatcher $ \name -> if f name then Just name else Nothing
956
957-- | Matches any 'Name'. Returns the matched 'Name'.
958--
959-- Since 1.5.0
960anyName :: NameMatcher Name
961anyName = matching (const True)
962
963-- | Matches any 'Name' from the given list. Returns the matched 'Name'.
964--
965-- Since 1.5.0
966anyOf :: [Name] -> NameMatcher Name
967anyOf values = matching (`elem` values)
968
969
970-- | A monad for parsing attributes. By default, it requires you to deal with
971-- all attributes present on an element, and will throw an exception if there
972-- are unhandled attributes. Use the 'requireAttr', 'attr' et al
973-- functions for handling an attribute, and 'ignoreAttrs' if you would like to
974-- skip the rest of the attributes on an element.
975--
976-- 'Alternative' instance behaves like 'First' monoid: it chooses first
977-- parser which doesn't fail.
978newtype AttrParser a = AttrParser { runAttrParser :: [(Name, [Content])] -> Either SomeException ([(Name, [Content])], a) }
979
980instance Monad AttrParser where
981    return a = AttrParser $ \as -> Right (as, a)
982    (AttrParser f) >>= g = AttrParser $ \as ->
983        either Left (\(as', f') -> runAttrParser (g f') as') (f as)
984instance Functor AttrParser where
985    fmap = liftM
986instance Applicative AttrParser where
987    pure = return
988    (<*>) = ap
989instance Alternative AttrParser where
990    empty = AttrParser $ const $ Left $ toException $ XmlException "AttrParser.empty" Nothing
991    AttrParser f <|> AttrParser g = AttrParser $ \x ->
992        either (const $ g x) Right (f x)
993instance MonadThrow AttrParser where
994    throwM = AttrParser . const . throwM
995
996optionalAttrRaw :: ((Name, [Content]) -> Maybe b) -> AttrParser (Maybe b)
997optionalAttrRaw f =
998    AttrParser $ go id
999  where
1000    go front [] = Right (front [], Nothing)
1001    go front (a:as) =
1002        maybe (go (front . (:) a) as)
1003              (\b -> Right (front as, Just b))
1004              (f a)
1005
1006requireAttrRaw :: String -> ((Name, [Content]) -> Maybe b) -> AttrParser b
1007requireAttrRaw msg f = optionalAttrRaw f >>=
1008    maybe (AttrParser $ const $ Left $ toException $ MissingAttribute msg)
1009          return
1010
1011-- | Return the value for an attribute if present.
1012attr :: Name -> AttrParser (Maybe Text)
1013attr n = optionalAttrRaw
1014    (\(x, y) -> if x == n then Just (contentsToText y) else Nothing)
1015
1016-- | Shortcut composition of 'force' and 'attr'.
1017requireAttr :: Name -> AttrParser Text
1018requireAttr n = force ("Missing attribute: " ++ show n) $ attr n
1019
1020
1021{-# DEPRECATED optionalAttr "Please use 'attr'." #-}
1022optionalAttr :: Name -> AttrParser (Maybe Text)
1023optionalAttr = attr
1024
1025contentsToText :: [Content] -> Text
1026contentsToText = T.concat . map toText where
1027  toText (ContentText t)   = t
1028  toText (ContentEntity e) = T.concat ["&", e, ";"]
1029
1030-- | Skip the remaining attributes on an element. Since this will clear the
1031-- list of attributes, you must call this /after/ any calls to 'requireAttr',
1032-- 'optionalAttr', etc.
1033ignoreAttrs :: AttrParser ()
1034ignoreAttrs = AttrParser $ const $ Right ([], ())
1035
1036-- | Keep parsing elements as long as the parser returns 'Just'.
1037many :: Monad m
1038     => ConduitT Event o m (Maybe a)
1039     -> ConduitT Event o m [a]
1040many i = manyIgnore i $ return Nothing
1041
1042-- | Like 'many' but discards the results without building an intermediate list.
1043--
1044-- Since 1.5.0
1045many_ :: MonadThrow m
1046      => ConduitT Event o m (Maybe a)
1047      -> ConduitT Event o m ()
1048many_ consumer = manyIgnoreYield (return Nothing) (void <$> consumer)
1049
1050-- | Keep parsing elements as long as the parser returns 'Just'
1051--   or the ignore parser returns 'Just'.
1052manyIgnore :: Monad m
1053           => ConduitT Event o m (Maybe a)
1054           -> ConduitT Event o m (Maybe b)
1055           -> ConduitT Event o m [a]
1056manyIgnore i ignored = go id where
1057  go front = i >>= maybe (onFail front) (\y -> go $ front . (:) y)
1058  -- onFail is called if the main parser fails
1059  onFail front = ignored >>= maybe (return $ front []) (const $ go front)
1060
1061-- | Like @many@, but any tags and content the consumer doesn't match on
1062--   are silently ignored.
1063many' :: MonadThrow m
1064      => ConduitT Event o m (Maybe a)
1065      -> ConduitT Event o m [a]
1066many' consumer = manyIgnore consumer ignoreAnyTreeContent
1067
1068
1069-- | Like 'many', but uses 'yield' so the result list can be streamed
1070--   to downstream conduits without waiting for 'manyYield' to finish
1071manyYield :: Monad m
1072          => ConduitT a b m (Maybe b)
1073          -> ConduitT a b m ()
1074manyYield consumer = fix $ \loop ->
1075  consumer >>= maybe (return ()) (\x -> yield x >> loop)
1076
1077-- | Like 'manyIgnore', but uses 'yield' so the result list can be streamed
1078--   to downstream conduits without waiting for 'manyIgnoreYield' to finish
1079manyIgnoreYield :: MonadThrow m
1080                => ConduitT Event b m (Maybe b) -- ^ Consuming parser that generates the result stream
1081                -> ConduitT Event b m (Maybe ()) -- ^ Ignore parser that consumes elements to be ignored
1082                -> ConduitT Event b m ()
1083manyIgnoreYield consumer ignoreParser = fix $ \loop ->
1084  consumer >>= maybe (onFail loop) (\x -> yield x >> loop)
1085  where onFail loop = ignoreParser >>= maybe (return ()) (const loop)
1086
1087-- | Like 'many'', but uses 'yield' so the result list can be streamed
1088--   to downstream conduits without waiting for 'manyYield'' to finish
1089manyYield' :: MonadThrow m
1090           => ConduitT Event b m (Maybe b)
1091           -> ConduitT Event b m ()
1092manyYield' consumer = manyIgnoreYield consumer ignoreAnyTreeContent
1093
1094
1095-- | Stream a single content 'Event'.
1096--
1097-- Returns @Just ()@ if a content 'Event' was consumed, @Nothing@ otherwise.
1098--
1099-- >>> :set -XOverloadedStrings
1100-- >>> import Control.Monad (void)
1101-- >>> import Conduit
1102--
1103-- >>> runConduit $ parseLBS def "content<a></a>" .| void takeContent .| sinkList
1104-- [EventBeginDocument,EventContent (ContentText "content")]
1105--
1106-- If next event isn't a content, nothing is consumed.
1107--
1108-- >>> runConduit $ parseLBS def "<a>content</a>" .| void takeContent .| sinkList
1109-- [EventBeginDocument]
1110--
1111-- Since 1.5.0
1112takeContent :: MonadThrow m => ConduitT Event Event m (Maybe ())
1113takeContent = do
1114  event <- await
1115  case event of
1116    Just e@EventContent{} -> yield e >> return (Just ())
1117    Just e@EventCDATA{}   -> yield e >> return (Just ())
1118    Just e -> if isWhitespace e then yield e >> takeContent else leftover e >> return Nothing
1119    _ -> return Nothing
1120
1121-- | Stream 'Event's corresponding to a single XML element that matches given 'NameMatcher' and 'AttrParser', from the opening- to the closing-tag.
1122--
1123-- >>> :set -XOverloadedStrings
1124-- >>> import Control.Monad (void)
1125-- >>> import Conduit
1126--
1127-- >>> runConduit $ parseLBS def "<a>content</a><b></b>" .| void (takeTree "a" ignoreAttrs) .| sinkList
1128-- [EventBeginDocument,EventBeginElement (Name {nameLocalName = "a", ...}) [],EventContent (ContentText "content"),EventEndElement (Name {nameLocalName = "a", ...})]
1129--
1130-- >>> runConduit $ parseLBS def "<a>content</a><b></b>" .| void (takeTree "b" ignoreAttrs) .| sinkList
1131-- [EventBeginDocument]
1132--
1133-- If next 'Event' isn't an element, nothing is consumed.
1134--
1135-- >>> runConduit $ parseLBS def "text<a></a>" .| void (takeTree "a" ignoreAttrs) .| sinkList
1136-- [EventBeginDocument]
1137--
1138-- If an opening-tag is consumed but no matching closing-tag is found, an 'XmlException' is thrown.
1139--
1140-- >>> runConduit $ parseLBS def "<a><b></b>" .| void (takeTree "a" ignoreAttrs) .| sinkList
1141-- *** Exception: InvalidEndElement (Name {nameLocalName = "a", nameNamespace = Nothing, namePrefix = Nothing}) Nothing
1142--
1143-- This function automatically ignores comments, instructions and whitespace.
1144--
1145-- Returns @Just ()@ if an element was consumed, 'Nothing' otherwise.
1146--
1147-- Since 1.5.0
1148takeTree :: MonadThrow m => NameMatcher a -> AttrParser b -> ConduitT Event Event m (Maybe ())
1149takeTree nameMatcher attrParser = do
1150  event <- await
1151  case event of
1152    Just e@(EventBeginElement name as) -> case runNameMatcher nameMatcher name of
1153      Just _ -> case runAttrParser attrParser as of
1154        Right _ -> do
1155          yield e
1156          whileJust takeAnyTreeContent
1157          endEvent <- await
1158          case endEvent of
1159            Just e'@(EventEndElement name') | name == name' -> yield e' >> return (Just ())
1160            _ -> lift $ throwM $ InvalidEndElement name endEvent
1161        _ -> leftover e >> return Nothing
1162      _ -> leftover e >> return Nothing
1163
1164    Just e -> if isWhitespace e then yield e >> takeTree nameMatcher attrParser else leftover e >> return Nothing
1165    _ -> return Nothing
1166  where
1167    whileJust f = fix $ \loop -> f >>= maybe (return ()) (const loop)
1168
1169-- | Like 'takeTree', but can also stream a content 'Event'.
1170--
1171-- >>> :set -XOverloadedStrings
1172-- >>> import Control.Monad (void)
1173-- >>> import Conduit
1174--
1175-- >>> runConduit $ parseLBS def "<a>content</a><b></b>" .| void (takeTreeContent "a" ignoreAttrs) .| sinkList
1176-- [EventBeginDocument,EventBeginElement (Name {nameLocalName = "a", ...}) [],EventContent (ContentText "content"),EventEndElement (Name {nameLocalName = "a", ...})]
1177--
1178-- >>> runConduit $ parseLBS def "<a>content</a><b></b>" .| void (takeTreeContent "b" ignoreAttrs) .| sinkList
1179-- [EventBeginDocument]
1180--
1181-- >>> runConduit $ parseLBS def "content<a></a><b></b>" .| void (takeTreeContent "a" ignoreAttrs) .| sinkList
1182-- [EventBeginDocument,EventContent (ContentText "content")]
1183--
1184-- Since 1.5.0
1185takeTreeContent :: MonadThrow m => NameMatcher a -> AttrParser b -> ConduitT Event Event m (Maybe ())
1186takeTreeContent nameMatcher attrParser = runMaybeT $ MaybeT (takeTree nameMatcher attrParser) <|> MaybeT takeContent
1187
1188-- | Like 'takeTreeContent', without checking for tag name or attributes.
1189--
1190-- >>> :set -XOverloadedStrings
1191-- >>> import Control.Monad (void)
1192-- >>> import Conduit
1193--
1194-- >>> runConduit $ parseLBS def "text<a></a>" .| void takeAnyTreeContent .| sinkList
1195-- [EventBeginDocument,EventContent (ContentText "text")]
1196--
1197-- >>> runConduit $ parseLBS def "</a><b></b>" .| void takeAnyTreeContent .| sinkList
1198-- [EventBeginDocument]
1199--
1200-- >>> runConduit $ parseLBS def "<b><c></c></b></a>text" .| void takeAnyTreeContent .| sinkList
1201-- [EventBeginDocument,EventBeginElement (Name {nameLocalName = "b", ...}) [],EventBeginElement (Name {nameLocalName = "c", ...}) [],EventEndElement (Name {nameLocalName = "c", ...}),EventEndElement (Name {nameLocalName = "b", ...})]
1202--
1203-- Since 1.5.0
1204takeAnyTreeContent :: MonadThrow m
1205                => ConduitT Event Event m (Maybe ())
1206takeAnyTreeContent = takeTreeContent anyName ignoreAttrs
1207
1208
1209-- | Default implementation of 'DecodeEntities', which leaves the
1210-- entity as-is. Numeric character references and the five standard
1211-- entities (lt, gt, amp, quot, pos) are handled internally by the
1212-- parser.
1213decodeXmlEntities :: DecodeEntities
1214decodeXmlEntities = ContentEntity
1215
1216-- | HTML4-compliant entity decoder. Handles the additional 248
1217-- entities defined by HTML 4 and XHTML 1.
1218--
1219-- Note that HTML 5 introduces a drastically larger number of entities, and
1220-- this code does not recognize most of them.
1221decodeHtmlEntities :: DecodeEntities
1222decodeHtmlEntities t =
1223  maybe (ContentEntity t) ContentText $ Map.lookup t htmlEntities
1224
1225htmlEntities :: Map.Map T.Text T.Text
1226htmlEntities = Map.fromList
1227    $ map (pack *** pack) -- Work around the long-compile-time bug
1228    [ ("nbsp", "\160")
1229    , ("iexcl", "\161")
1230    , ("cent", "\162")
1231    , ("pound", "\163")
1232    , ("curren", "\164")
1233    , ("yen", "\165")
1234    , ("brvbar", "\166")
1235    , ("sect", "\167")
1236    , ("uml", "\168")
1237    , ("copy", "\169")
1238    , ("ordf", "\170")
1239    , ("laquo", "\171")
1240    , ("not", "\172")
1241    , ("shy", "\173")
1242    , ("reg", "\174")
1243    , ("macr", "\175")
1244    , ("deg", "\176")
1245    , ("plusmn", "\177")
1246    , ("sup2", "\178")
1247    , ("sup3", "\179")
1248    , ("acute", "\180")
1249    , ("micro", "\181")
1250    , ("para", "\182")
1251    , ("middot", "\183")
1252    , ("cedil", "\184")
1253    , ("sup1", "\185")
1254    , ("ordm", "\186")
1255    , ("raquo", "\187")
1256    , ("frac14", "\188")
1257    , ("frac12", "\189")
1258    , ("frac34", "\190")
1259    , ("iquest", "\191")
1260    , ("Agrave", "\192")
1261    , ("Aacute", "\193")
1262    , ("Acirc", "\194")
1263    , ("Atilde", "\195")
1264    , ("Auml", "\196")
1265    , ("Aring", "\197")
1266    , ("AElig", "\198")
1267    , ("Ccedil", "\199")
1268    , ("Egrave", "\200")
1269    , ("Eacute", "\201")
1270    , ("Ecirc", "\202")
1271    , ("Euml", "\203")
1272    , ("Igrave", "\204")
1273    , ("Iacute", "\205")
1274    , ("Icirc", "\206")
1275    , ("Iuml", "\207")
1276    , ("ETH", "\208")
1277    , ("Ntilde", "\209")
1278    , ("Ograve", "\210")
1279    , ("Oacute", "\211")
1280    , ("Ocirc", "\212")
1281    , ("Otilde", "\213")
1282    , ("Ouml", "\214")
1283    , ("times", "\215")
1284    , ("Oslash", "\216")
1285    , ("Ugrave", "\217")
1286    , ("Uacute", "\218")
1287    , ("Ucirc", "\219")
1288    , ("Uuml", "\220")
1289    , ("Yacute", "\221")
1290    , ("THORN", "\222")
1291    , ("szlig", "\223")
1292    , ("agrave", "\224")
1293    , ("aacute", "\225")
1294    , ("acirc", "\226")
1295    , ("atilde", "\227")
1296    , ("auml", "\228")
1297    , ("aring", "\229")
1298    , ("aelig", "\230")
1299    , ("ccedil", "\231")
1300    , ("egrave", "\232")
1301    , ("eacute", "\233")
1302    , ("ecirc", "\234")
1303    , ("euml", "\235")
1304    , ("igrave", "\236")
1305    , ("iacute", "\237")
1306    , ("icirc", "\238")
1307    , ("iuml", "\239")
1308    , ("eth", "\240")
1309    , ("ntilde", "\241")
1310    , ("ograve", "\242")
1311    , ("oacute", "\243")
1312    , ("ocirc", "\244")
1313    , ("otilde", "\245")
1314    , ("ouml", "\246")
1315    , ("divide", "\247")
1316    , ("oslash", "\248")
1317    , ("ugrave", "\249")
1318    , ("uacute", "\250")
1319    , ("ucirc", "\251")
1320    , ("uuml", "\252")
1321    , ("yacute", "\253")
1322    , ("thorn", "\254")
1323    , ("yuml", "\255")
1324    , ("OElig", "\338")
1325    , ("oelig", "\339")
1326    , ("Scaron", "\352")
1327    , ("scaron", "\353")
1328    , ("Yuml", "\376")
1329    , ("fnof", "\402")
1330    , ("circ", "\710")
1331    , ("tilde", "\732")
1332    , ("Alpha", "\913")
1333    , ("Beta", "\914")
1334    , ("Gamma", "\915")
1335    , ("Delta", "\916")
1336    , ("Epsilon", "\917")
1337    , ("Zeta", "\918")
1338    , ("Eta", "\919")
1339    , ("Theta", "\920")
1340    , ("Iota", "\921")
1341    , ("Kappa", "\922")
1342    , ("Lambda", "\923")
1343    , ("Mu", "\924")
1344    , ("Nu", "\925")
1345    , ("Xi", "\926")
1346    , ("Omicron", "\927")
1347    , ("Pi", "\928")
1348    , ("Rho", "\929")
1349    , ("Sigma", "\931")
1350    , ("Tau", "\932")
1351    , ("Upsilon", "\933")
1352    , ("Phi", "\934")
1353    , ("Chi", "\935")
1354    , ("Psi", "\936")
1355    , ("Omega", "\937")
1356    , ("alpha", "\945")
1357    , ("beta", "\946")
1358    , ("gamma", "\947")
1359    , ("delta", "\948")
1360    , ("epsilon", "\949")
1361    , ("zeta", "\950")
1362    , ("eta", "\951")
1363    , ("theta", "\952")
1364    , ("iota", "\953")
1365    , ("kappa", "\954")
1366    , ("lambda", "\955")
1367    , ("mu", "\956")
1368    , ("nu", "\957")
1369    , ("xi", "\958")
1370    , ("omicron", "\959")
1371    , ("pi", "\960")
1372    , ("rho", "\961")
1373    , ("sigmaf", "\962")
1374    , ("sigma", "\963")
1375    , ("tau", "\964")
1376    , ("upsilon", "\965")
1377    , ("phi", "\966")
1378    , ("chi", "\967")
1379    , ("psi", "\968")
1380    , ("omega", "\969")
1381    , ("thetasym", "\977")
1382    , ("upsih", "\978")
1383    , ("piv", "\982")
1384    , ("ensp", "\8194")
1385    , ("emsp", "\8195")
1386    , ("thinsp", "\8201")
1387    , ("zwnj", "\8204")
1388    , ("zwj", "\8205")
1389    , ("lrm", "\8206")
1390    , ("rlm", "\8207")
1391    , ("ndash", "\8211")
1392    , ("mdash", "\8212")
1393    , ("lsquo", "\8216")
1394    , ("rsquo", "\8217")
1395    , ("sbquo", "\8218")
1396    , ("ldquo", "\8220")
1397    , ("rdquo", "\8221")
1398    , ("bdquo", "\8222")
1399    , ("dagger", "\8224")
1400    , ("Dagger", "\8225")
1401    , ("bull", "\8226")
1402    , ("hellip", "\8230")
1403    , ("permil", "\8240")
1404    , ("prime", "\8242")
1405    , ("Prime", "\8243")
1406    , ("lsaquo", "\8249")
1407    , ("rsaquo", "\8250")
1408    , ("oline", "\8254")
1409    , ("frasl", "\8260")
1410    , ("euro", "\8364")
1411    , ("image", "\8465")
1412    , ("weierp", "\8472")
1413    , ("real", "\8476")
1414    , ("trade", "\8482")
1415    , ("alefsym", "\8501")
1416    , ("larr", "\8592")
1417    , ("uarr", "\8593")
1418    , ("rarr", "\8594")
1419    , ("darr", "\8595")
1420    , ("harr", "\8596")
1421    , ("crarr", "\8629")
1422    , ("lArr", "\8656")
1423    , ("uArr", "\8657")
1424    , ("rArr", "\8658")
1425    , ("dArr", "\8659")
1426    , ("hArr", "\8660")
1427    , ("forall", "\8704")
1428    , ("part", "\8706")
1429    , ("exist", "\8707")
1430    , ("empty", "\8709")
1431    , ("nabla", "\8711")
1432    , ("isin", "\8712")
1433    , ("notin", "\8713")
1434    , ("ni", "\8715")
1435    , ("prod", "\8719")
1436    , ("sum", "\8721")
1437    , ("minus", "\8722")
1438    , ("lowast", "\8727")
1439    , ("radic", "\8730")
1440    , ("prop", "\8733")
1441    , ("infin", "\8734")
1442    , ("ang", "\8736")
1443    , ("and", "\8743")
1444    , ("or", "\8744")
1445    , ("cap", "\8745")
1446    , ("cup", "\8746")
1447    , ("int", "\8747")
1448    , ("there4", "\8756")
1449    , ("sim", "\8764")
1450    , ("cong", "\8773")
1451    , ("asymp", "\8776")
1452    , ("ne", "\8800")
1453    , ("equiv", "\8801")
1454    , ("le", "\8804")
1455    , ("ge", "\8805")
1456    , ("sub", "\8834")
1457    , ("sup", "\8835")
1458    , ("nsub", "\8836")
1459    , ("sube", "\8838")
1460    , ("supe", "\8839")
1461    , ("oplus", "\8853")
1462    , ("otimes", "\8855")
1463    , ("perp", "\8869")
1464    , ("sdot", "\8901")
1465    , ("lceil", "\8968")
1466    , ("rceil", "\8969")
1467    , ("lfloor", "\8970")
1468    , ("rfloor", "\8971")
1469    , ("lang", "\9001")
1470    , ("rang", "\9002")
1471    , ("loz", "\9674")
1472    , ("spades", "\9824")
1473    , ("clubs", "\9827")
1474    , ("hearts", "\9829")
1475    , ("diams", "\9830")
1476    ]
1477