1{-# LANGUAGE DeriveDataTypeable #-}
2{-# LANGUAGE FlexibleContexts   #-}
3{-# LANGUAGE RankNTypes         #-}
4-- | DOM-based XML parsing and rendering.
5--
6-- In this module, attribute values and content nodes can contain either raw
7-- text or entities. In most cases, these can be fully resolved at parsing. If
8-- that is the case for your documents, the "Text.XML" module provides
9-- simplified datatypes that only contain raw text.
10module Text.XML.Unresolved
11    ( -- * Non-streaming functions
12      writeFile
13    , readFile
14      -- * Lazy bytestrings
15    , renderLBS
16    , parseLBS
17    , parseLBS_
18      -- * Text
19    , parseText
20    , parseText_
21    , sinkTextDoc
22      -- * Byte streams
23    , sinkDoc
24      -- * Streaming functions
25    , toEvents
26    , elementToEvents
27    , fromEvents
28    , elementFromEvents
29    , renderBuilder
30    , renderBytes
31    , renderText
32      -- * Exceptions
33    , InvalidEventStream (..)
34      -- * Settings
35    , P.def
36      -- ** Parse
37    , P.ParseSettings
38    , P.psDecodeEntities
39    , P.psRetainNamespaces
40      -- ** Render
41    , R.RenderSettings
42    , R.rsPretty
43    , R.rsNamespaces
44    ) where
45
46import           Conduit
47import           Control.Applicative          ((<$>), (<*>))
48import           Control.Exception            (Exception, SomeException, throw)
49import           Control.Monad                (when)
50import           Control.Monad.Trans.Class    (lift)
51import           Data.ByteString              (ByteString)
52import           Data.ByteString.Builder      (Builder)
53import qualified Data.ByteString.Lazy         as L
54import           Data.Char                    (isSpace)
55import qualified Data.Conduit.Binary          as CB
56import           Data.Conduit.Lazy            (lazyConsume)
57import qualified Data.Conduit.List            as CL
58import           Data.Maybe                   (isJust, mapMaybe)
59import           Data.Monoid                  (mconcat)
60import           Data.Text                    (Text)
61import qualified Data.Text                    as T
62import qualified Data.Text.Lazy               as TL
63import           Data.Typeable                (Typeable)
64import           Data.XML.Types
65import           Prelude                      hiding (readFile, writeFile)
66import           System.IO.Unsafe             (unsafePerformIO)
67import           Text.XML.Stream.Parse        (ParseSettings)
68import qualified Text.XML.Stream.Parse        as P
69import qualified Text.XML.Stream.Render       as R
70
71readFile :: P.ParseSettings -> FilePath -> IO Document
72readFile ps fp = runConduitRes $ CB.sourceFile fp .| sinkDoc ps
73
74sinkDoc :: MonadThrow m
75        => P.ParseSettings
76        -> ConduitT ByteString o m Document
77sinkDoc ps = P.parseBytesPos ps .| fromEvents
78
79writeFile :: R.RenderSettings -> FilePath -> Document -> IO ()
80writeFile rs fp doc =
81    runConduitRes $ renderBytes rs doc .| CB.sinkFile fp
82
83renderLBS :: R.RenderSettings -> Document -> L.ByteString
84renderLBS rs doc =
85    L.fromChunks $ unsafePerformIO
86                 -- not generally safe, but we know that runResourceT
87                 -- will not deallocate any of the resources being used
88                 -- by the process
89                 $ lazyConsume
90                 $ renderBytes rs doc
91
92parseLBS :: P.ParseSettings -> L.ByteString -> Either SomeException Document
93parseLBS ps lbs = runConduit $ CL.sourceList (L.toChunks lbs) .| sinkDoc ps
94
95parseLBS_ :: P.ParseSettings -> L.ByteString -> Document
96parseLBS_ ps lbs = either throw id $ parseLBS ps lbs
97
98data InvalidEventStream = ContentAfterRoot P.EventPos
99                        | MissingRootElement
100                        | InvalidInlineDoctype P.EventPos
101                        | MissingEndElement Name (Maybe P.EventPos)
102                        | UnterminatedInlineDoctype
103    deriving Typeable
104instance Exception InvalidEventStream
105instance Show InvalidEventStream where
106    show (ContentAfterRoot (pos, e)) = mShowPos pos ++ "Found content after root element: " ++ prettyShowE e
107    show MissingRootElement = "Missing root element"
108    show (InvalidInlineDoctype (pos, e)) = mShowPos pos ++ "Invalid content inside doctype: " ++ prettyShowE e
109    show (MissingEndElement name Nothing) = "Documented ended while expected end element for: " ++ prettyShowName name
110    show (MissingEndElement name (Just (pos, e))) = mShowPos pos ++ "Expected end element for: " ++ prettyShowName name ++ ", but received: " ++ prettyShowE e
111    show UnterminatedInlineDoctype = "Unterminated doctype declaration"
112
113mShowPos :: Maybe P.PositionRange -> String
114mShowPos Nothing    = ""
115mShowPos (Just pos) = show pos ++ ": "
116
117prettyShowE :: Event -> String
118prettyShowE = show -- FIXME
119
120prettyShowName :: Name -> String
121prettyShowName = show -- FIXME
122
123renderBuilder :: Monad m => R.RenderSettings -> Document -> ConduitT i Builder m ()
124renderBuilder rs doc = CL.sourceList (toEvents doc) .| R.renderBuilder rs
125
126renderBytes :: PrimMonad m => R.RenderSettings -> Document -> ConduitT i ByteString m ()
127renderBytes rs doc = CL.sourceList (toEvents doc) .| R.renderBytes rs
128
129renderText :: (MonadThrow m, PrimMonad m) => R.RenderSettings -> Document -> ConduitT i Text m ()
130renderText rs doc = CL.sourceList (toEvents doc) .| R.renderText rs
131
132manyTries :: Monad m => m (Maybe a) -> m [a]
133manyTries f =
134    go id
135  where
136    go front = do
137        x <- f
138        case x of
139            Nothing -> return $ front []
140            Just y  -> go (front . (:) y)
141
142dropReturn :: Monad m => a -> ConduitM i o m a
143dropReturn x = CL.drop 1 >> return x
144
145-- | Parse a document from a stream of events.
146fromEvents :: MonadThrow m => ConduitT P.EventPos o m Document
147fromEvents = do
148    skip EventBeginDocument
149    d <- Document <$> goP <*> require elementFromEvents <*> goM
150    skip EventEndDocument
151    y <- CL.head
152    case y of
153        Nothing -> return d
154        Just (_, EventEndDocument) -> lift $ throwM MissingRootElement
155        Just z ->
156            lift $ throwM $ ContentAfterRoot z
157  where
158    skip e = do
159        x <- CL.peek
160        when (fmap snd x == Just e) (CL.drop 1)
161    require f = do
162        x <- f
163        case x of
164            Just y -> return y
165            Nothing -> do
166                my <- CL.head
167                case my of
168                    Nothing -> error "Text.XML.Unresolved:impossible"
169                    Just (_, EventEndDocument) -> lift $ throwM MissingRootElement
170                    Just y -> lift $ throwM $ ContentAfterRoot y
171    goP = Prologue <$> goM <*> goD <*> goM
172    goM = manyTries goM'
173    goM' = do
174        x <- CL.peek
175        case x of
176            Just (_, EventInstruction i) -> dropReturn $ Just $ MiscInstruction i
177            Just (_, EventComment t) -> dropReturn $ Just $ MiscComment t
178            Just (_, EventContent (ContentText t))
179                | T.all isSpace t -> CL.drop 1 >> goM'
180            _ -> return Nothing
181    goD = do
182        x <- CL.peek
183        case x of
184            Just (_, EventBeginDoctype name meid) -> do
185                CL.drop 1
186                dropTillDoctype
187                return (Just $ Doctype name meid)
188            _ -> return Nothing
189    dropTillDoctype = do
190        x <- CL.head
191        case x of
192            -- Leaving the following line commented so that the intention of
193            -- this function stays clear. I figure in the future xml-types will
194            -- be expanded again to support some form of EventDeclaration
195            --
196            -- Just (EventDeclaration _) -> dropTillDoctype
197            Just (_, EventEndDoctype) -> return ()
198            Just epos -> lift $ throwM $ InvalidInlineDoctype epos
199            Nothing -> lift $ throwM UnterminatedInlineDoctype
200
201-- | Try to parse a document element (as defined in XML) from a stream of events.
202--
203-- @since 1.3.5
204elementFromEvents :: MonadThrow m => ConduitT P.EventPos o m (Maybe Element)
205elementFromEvents = goE
206  where
207    goE = do
208        x <- CL.peek
209        case x of
210            Just (_, EventBeginElement n as) -> Just <$> goE' n as
211            _                                -> return Nothing
212    goE' n as = do
213        CL.drop 1
214        ns <- manyTries goN
215        y <- CL.head
216        if fmap snd y == Just (EventEndElement n)
217            then return $ Element n as $ compressNodes ns
218            else lift $ throwM $ MissingEndElement n y
219    goN = do
220        x <- CL.peek
221        case x of
222            Just (_, EventBeginElement n as) -> (Just . NodeElement) <$> goE' n as
223            Just (_, EventInstruction i) -> dropReturn $ Just $ NodeInstruction i
224            Just (_, EventContent c) -> dropReturn $ Just $ NodeContent c
225            Just (_, EventComment t) -> dropReturn $ Just $ NodeComment t
226            Just (_, EventCDATA t) -> dropReturn $ Just $ NodeContent $ ContentText t
227            _ -> return Nothing
228
229-- | Render a document into events.
230toEvents :: Document -> [Event]
231toEvents (Document prol root epi) =
232      (EventBeginDocument :)
233    . goP prol . elementToEvents' root . goM epi $ [EventEndDocument]
234  where
235    goP (Prologue before doctype after) =
236        goM before . maybe id goD doctype . goM after
237    goM []     = id
238    goM [x]    = (goM' x :)
239    goM (x:xs) = (goM' x :) . goM xs
240    goM' (MiscInstruction i) = EventInstruction i
241    goM' (MiscComment t)     = EventComment t
242    goD (Doctype name meid) =
243        (:) (EventBeginDoctype name meid)
244      . (:) EventEndDoctype
245
246-- | Render a document element into events.
247--
248-- @since 1.3.5
249elementToEvents :: Element -> [Event]
250elementToEvents e = elementToEvents' e []
251
252elementToEvents' :: Element -> [Event] -> [Event]
253elementToEvents' = goE
254  where
255    goE (Element name as ns) =
256          (EventBeginElement name as :)
257        . goN ns
258        . (EventEndElement name :)
259    goN []     = id
260    goN [x]    = goN' x
261    goN (x:xs) = goN' x . goN xs
262    goN' (NodeElement e)     = goE e
263    goN' (NodeInstruction i) = (EventInstruction i :)
264    goN' (NodeContent c)     = (EventContent c :)
265    goN' (NodeComment t)     = (EventComment t :)
266
267compressNodes :: [Node] -> [Node]
268compressNodes []     = []
269compressNodes [x]    = [x]
270compressNodes (x@(NodeContent (ContentText _)) : y@(NodeContent (ContentText _)) : z) =
271    let (textNodes, remainder) = span (isJust . unContent) (x:y:z)
272        texts = mapMaybe unContent textNodes
273    in
274        compressNodes $ NodeContent (ContentText $ mconcat texts) : remainder
275    where
276        unContent (NodeContent (ContentText text)) = Just text
277        unContent _                                = Nothing
278compressNodes (x:xs) = x : compressNodes xs
279
280parseText :: ParseSettings -> TL.Text -> Either SomeException Document
281parseText ps tl =
282    runConduit
283  $ CL.sourceList (TL.toChunks tl)
284 .| sinkTextDoc ps
285
286parseText_ :: ParseSettings -> TL.Text -> Document
287parseText_ ps = either throw id . parseText ps
288
289sinkTextDoc :: MonadThrow m
290            => ParseSettings
291            -> ConduitT Text o m Document
292sinkTextDoc ps = P.parseTextPos ps .| fromEvents
293