1{-# LANGUAGE DeriveDataTypeable #-} 2{-# LANGUAGE FlexibleContexts #-} 3{-# LANGUAGE RankNTypes #-} 4-- | DOM-based XML parsing and rendering. 5-- 6-- In this module, attribute values and content nodes can contain either raw 7-- text or entities. In most cases, these can be fully resolved at parsing. If 8-- that is the case for your documents, the "Text.XML" module provides 9-- simplified datatypes that only contain raw text. 10module Text.XML.Unresolved 11 ( -- * Non-streaming functions 12 writeFile 13 , readFile 14 -- * Lazy bytestrings 15 , renderLBS 16 , parseLBS 17 , parseLBS_ 18 -- * Text 19 , parseText 20 , parseText_ 21 , sinkTextDoc 22 -- * Byte streams 23 , sinkDoc 24 -- * Streaming functions 25 , toEvents 26 , elementToEvents 27 , fromEvents 28 , elementFromEvents 29 , renderBuilder 30 , renderBytes 31 , renderText 32 -- * Exceptions 33 , InvalidEventStream (..) 34 -- * Settings 35 , P.def 36 -- ** Parse 37 , P.ParseSettings 38 , P.psDecodeEntities 39 , P.psRetainNamespaces 40 -- ** Render 41 , R.RenderSettings 42 , R.rsPretty 43 , R.rsNamespaces 44 ) where 45 46import Conduit 47import Control.Applicative ((<$>), (<*>)) 48import Control.Exception (Exception, SomeException, throw) 49import Control.Monad (when) 50import Control.Monad.Trans.Class (lift) 51import Data.ByteString (ByteString) 52import Data.ByteString.Builder (Builder) 53import qualified Data.ByteString.Lazy as L 54import Data.Char (isSpace) 55import qualified Data.Conduit.Binary as CB 56import Data.Conduit.Lazy (lazyConsume) 57import qualified Data.Conduit.List as CL 58import Data.Maybe (isJust, mapMaybe) 59import Data.Monoid (mconcat) 60import Data.Text (Text) 61import qualified Data.Text as T 62import qualified Data.Text.Lazy as TL 63import Data.Typeable (Typeable) 64import Data.XML.Types 65import Prelude hiding (readFile, writeFile) 66import System.IO.Unsafe (unsafePerformIO) 67import Text.XML.Stream.Parse (ParseSettings) 68import qualified Text.XML.Stream.Parse as P 69import qualified Text.XML.Stream.Render as R 70 71readFile :: P.ParseSettings -> FilePath -> IO Document 72readFile ps fp = runConduitRes $ CB.sourceFile fp .| sinkDoc ps 73 74sinkDoc :: MonadThrow m 75 => P.ParseSettings 76 -> ConduitT ByteString o m Document 77sinkDoc ps = P.parseBytesPos ps .| fromEvents 78 79writeFile :: R.RenderSettings -> FilePath -> Document -> IO () 80writeFile rs fp doc = 81 runConduitRes $ renderBytes rs doc .| CB.sinkFile fp 82 83renderLBS :: R.RenderSettings -> Document -> L.ByteString 84renderLBS rs doc = 85 L.fromChunks $ unsafePerformIO 86 -- not generally safe, but we know that runResourceT 87 -- will not deallocate any of the resources being used 88 -- by the process 89 $ lazyConsume 90 $ renderBytes rs doc 91 92parseLBS :: P.ParseSettings -> L.ByteString -> Either SomeException Document 93parseLBS ps lbs = runConduit $ CL.sourceList (L.toChunks lbs) .| sinkDoc ps 94 95parseLBS_ :: P.ParseSettings -> L.ByteString -> Document 96parseLBS_ ps lbs = either throw id $ parseLBS ps lbs 97 98data InvalidEventStream = ContentAfterRoot P.EventPos 99 | MissingRootElement 100 | InvalidInlineDoctype P.EventPos 101 | MissingEndElement Name (Maybe P.EventPos) 102 | UnterminatedInlineDoctype 103 deriving Typeable 104instance Exception InvalidEventStream 105instance Show InvalidEventStream where 106 show (ContentAfterRoot (pos, e)) = mShowPos pos ++ "Found content after root element: " ++ prettyShowE e 107 show MissingRootElement = "Missing root element" 108 show (InvalidInlineDoctype (pos, e)) = mShowPos pos ++ "Invalid content inside doctype: " ++ prettyShowE e 109 show (MissingEndElement name Nothing) = "Documented ended while expected end element for: " ++ prettyShowName name 110 show (MissingEndElement name (Just (pos, e))) = mShowPos pos ++ "Expected end element for: " ++ prettyShowName name ++ ", but received: " ++ prettyShowE e 111 show UnterminatedInlineDoctype = "Unterminated doctype declaration" 112 113mShowPos :: Maybe P.PositionRange -> String 114mShowPos Nothing = "" 115mShowPos (Just pos) = show pos ++ ": " 116 117prettyShowE :: Event -> String 118prettyShowE = show -- FIXME 119 120prettyShowName :: Name -> String 121prettyShowName = show -- FIXME 122 123renderBuilder :: Monad m => R.RenderSettings -> Document -> ConduitT i Builder m () 124renderBuilder rs doc = CL.sourceList (toEvents doc) .| R.renderBuilder rs 125 126renderBytes :: PrimMonad m => R.RenderSettings -> Document -> ConduitT i ByteString m () 127renderBytes rs doc = CL.sourceList (toEvents doc) .| R.renderBytes rs 128 129renderText :: (MonadThrow m, PrimMonad m) => R.RenderSettings -> Document -> ConduitT i Text m () 130renderText rs doc = CL.sourceList (toEvents doc) .| R.renderText rs 131 132manyTries :: Monad m => m (Maybe a) -> m [a] 133manyTries f = 134 go id 135 where 136 go front = do 137 x <- f 138 case x of 139 Nothing -> return $ front [] 140 Just y -> go (front . (:) y) 141 142dropReturn :: Monad m => a -> ConduitM i o m a 143dropReturn x = CL.drop 1 >> return x 144 145-- | Parse a document from a stream of events. 146fromEvents :: MonadThrow m => ConduitT P.EventPos o m Document 147fromEvents = do 148 skip EventBeginDocument 149 d <- Document <$> goP <*> require elementFromEvents <*> goM 150 skip EventEndDocument 151 y <- CL.head 152 case y of 153 Nothing -> return d 154 Just (_, EventEndDocument) -> lift $ throwM MissingRootElement 155 Just z -> 156 lift $ throwM $ ContentAfterRoot z 157 where 158 skip e = do 159 x <- CL.peek 160 when (fmap snd x == Just e) (CL.drop 1) 161 require f = do 162 x <- f 163 case x of 164 Just y -> return y 165 Nothing -> do 166 my <- CL.head 167 case my of 168 Nothing -> error "Text.XML.Unresolved:impossible" 169 Just (_, EventEndDocument) -> lift $ throwM MissingRootElement 170 Just y -> lift $ throwM $ ContentAfterRoot y 171 goP = Prologue <$> goM <*> goD <*> goM 172 goM = manyTries goM' 173 goM' = do 174 x <- CL.peek 175 case x of 176 Just (_, EventInstruction i) -> dropReturn $ Just $ MiscInstruction i 177 Just (_, EventComment t) -> dropReturn $ Just $ MiscComment t 178 Just (_, EventContent (ContentText t)) 179 | T.all isSpace t -> CL.drop 1 >> goM' 180 _ -> return Nothing 181 goD = do 182 x <- CL.peek 183 case x of 184 Just (_, EventBeginDoctype name meid) -> do 185 CL.drop 1 186 dropTillDoctype 187 return (Just $ Doctype name meid) 188 _ -> return Nothing 189 dropTillDoctype = do 190 x <- CL.head 191 case x of 192 -- Leaving the following line commented so that the intention of 193 -- this function stays clear. I figure in the future xml-types will 194 -- be expanded again to support some form of EventDeclaration 195 -- 196 -- Just (EventDeclaration _) -> dropTillDoctype 197 Just (_, EventEndDoctype) -> return () 198 Just epos -> lift $ throwM $ InvalidInlineDoctype epos 199 Nothing -> lift $ throwM UnterminatedInlineDoctype 200 201-- | Try to parse a document element (as defined in XML) from a stream of events. 202-- 203-- @since 1.3.5 204elementFromEvents :: MonadThrow m => ConduitT P.EventPos o m (Maybe Element) 205elementFromEvents = goE 206 where 207 goE = do 208 x <- CL.peek 209 case x of 210 Just (_, EventBeginElement n as) -> Just <$> goE' n as 211 _ -> return Nothing 212 goE' n as = do 213 CL.drop 1 214 ns <- manyTries goN 215 y <- CL.head 216 if fmap snd y == Just (EventEndElement n) 217 then return $ Element n as $ compressNodes ns 218 else lift $ throwM $ MissingEndElement n y 219 goN = do 220 x <- CL.peek 221 case x of 222 Just (_, EventBeginElement n as) -> (Just . NodeElement) <$> goE' n as 223 Just (_, EventInstruction i) -> dropReturn $ Just $ NodeInstruction i 224 Just (_, EventContent c) -> dropReturn $ Just $ NodeContent c 225 Just (_, EventComment t) -> dropReturn $ Just $ NodeComment t 226 Just (_, EventCDATA t) -> dropReturn $ Just $ NodeContent $ ContentText t 227 _ -> return Nothing 228 229-- | Render a document into events. 230toEvents :: Document -> [Event] 231toEvents (Document prol root epi) = 232 (EventBeginDocument :) 233 . goP prol . elementToEvents' root . goM epi $ [EventEndDocument] 234 where 235 goP (Prologue before doctype after) = 236 goM before . maybe id goD doctype . goM after 237 goM [] = id 238 goM [x] = (goM' x :) 239 goM (x:xs) = (goM' x :) . goM xs 240 goM' (MiscInstruction i) = EventInstruction i 241 goM' (MiscComment t) = EventComment t 242 goD (Doctype name meid) = 243 (:) (EventBeginDoctype name meid) 244 . (:) EventEndDoctype 245 246-- | Render a document element into events. 247-- 248-- @since 1.3.5 249elementToEvents :: Element -> [Event] 250elementToEvents e = elementToEvents' e [] 251 252elementToEvents' :: Element -> [Event] -> [Event] 253elementToEvents' = goE 254 where 255 goE (Element name as ns) = 256 (EventBeginElement name as :) 257 . goN ns 258 . (EventEndElement name :) 259 goN [] = id 260 goN [x] = goN' x 261 goN (x:xs) = goN' x . goN xs 262 goN' (NodeElement e) = goE e 263 goN' (NodeInstruction i) = (EventInstruction i :) 264 goN' (NodeContent c) = (EventContent c :) 265 goN' (NodeComment t) = (EventComment t :) 266 267compressNodes :: [Node] -> [Node] 268compressNodes [] = [] 269compressNodes [x] = [x] 270compressNodes (x@(NodeContent (ContentText _)) : y@(NodeContent (ContentText _)) : z) = 271 let (textNodes, remainder) = span (isJust . unContent) (x:y:z) 272 texts = mapMaybe unContent textNodes 273 in 274 compressNodes $ NodeContent (ContentText $ mconcat texts) : remainder 275 where 276 unContent (NodeContent (ContentText text)) = Just text 277 unContent _ = Nothing 278compressNodes (x:xs) = x : compressNodes xs 279 280parseText :: ParseSettings -> TL.Text -> Either SomeException Document 281parseText ps tl = 282 runConduit 283 $ CL.sourceList (TL.toChunks tl) 284 .| sinkTextDoc ps 285 286parseText_ :: ParseSettings -> TL.Text -> Document 287parseText_ ps = either throw id . parseText ps 288 289sinkTextDoc :: MonadThrow m 290 => ParseSettings 291 -> ConduitT Text o m Document 292sinkTextDoc ps = P.parseTextPos ps .| fromEvents 293