1{-# LANGUAGE DeriveDataTypeable, RankNTypes #-}
2-- | /NOTE/ It is recommended to start using "Data.Conduit.Combinators" instead
3-- of this module.
4--
5-- Copyright: 2011 Michael Snoyman, 2010-2011 John Millikin
6-- License: MIT
7--
8-- Handle streams of text.
9--
10-- Parts of this code were taken from enumerator and adapted for conduits.
11--
12-- For many purposes, it's recommended to use the conduit-combinators library,
13-- which provides a more complete set of functions.
14module Data.Conduit.Text
15    (
16
17    -- * Text codecs
18      Codec
19    , encode
20    , decode
21    , utf8
22    , utf16_le
23    , utf16_be
24    , utf32_le
25    , utf32_be
26    , ascii
27    , iso8859_1
28    , lines
29    , linesBounded
30    , TextException (..)
31    , takeWhile
32    , dropWhile
33    , take
34    , drop
35    , foldLines
36    , withLine
37    , CC.decodeUtf8
38    , CC.decodeUtf8Lenient
39    , CC.encodeUtf8
40    , detectUtf
41    ) where
42
43import           Prelude hiding (head, drop, takeWhile, lines, zip, zip3, zipWith, zipWith3, take, dropWhile)
44
45import qualified Control.Exception as Exc
46import qualified Data.ByteString as B
47import qualified Data.ByteString.Char8 as B8
48import           Data.Char (ord)
49import qualified Data.Text as T
50import qualified Data.Text.Encoding as TE
51import           Data.Word (Word8)
52import           Data.Typeable (Typeable)
53
54import Data.Conduit
55import qualified Data.Conduit.List as CL
56import qualified Data.Conduit.Combinators as CC
57import Control.Monad.Trans.Class (lift)
58import Control.Monad.Trans.Resource (MonadThrow, throwM)
59import Control.Monad (unless)
60import Data.Streaming.Text
61
62-- | A specific character encoding.
63--
64-- Since 0.3.0
65data Codec = Codec
66    { _codecName :: T.Text
67    , codecEncode
68        :: T.Text
69        -> (B.ByteString, Maybe (TextException, T.Text))
70    , codecDecode
71        :: B.ByteString
72        -> (T.Text, Either
73            (TextException, B.ByteString)
74            B.ByteString)
75    }
76    | NewCodec T.Text (T.Text -> B.ByteString) (B.ByteString -> DecodeResult)
77
78instance Show Codec where
79    showsPrec d c =
80        let (cnst, name) = case c of
81                Codec t _ _    -> ("Codec ", t)
82                NewCodec t _ _ -> ("NewCodec ", t)
83        in showParen (d > 10) $ showString cnst . shows name
84
85
86
87-- | Emit each line separately
88--
89-- Since 0.4.1
90lines :: Monad m => ConduitT T.Text T.Text m ()
91lines =
92    awaitText id
93  where
94    awaitText front = await >>= maybe (finish front) (process front)
95
96    finish front =
97      let t = T.concat $ front []
98       in unless (T.null t) (yield t)
99
100    process front text =
101      let (line, rest) = T.break (== '\n') text
102      in  case T.uncons rest of
103            Just (_, rest') -> do
104              yield (T.concat $ front [line])
105              process id rest'
106            Nothing -> Exc.assert (line == text) $ awaitText $ front . (line:)
107
108
109
110-- | Variant of the lines function with an integer parameter.
111-- The text length of any emitted line
112-- never exceeds the value of the parameter. Whenever
113-- this is about to happen a LengthExceeded exception
114-- is thrown. This function should be used instead
115-- of the lines function whenever we are dealing with
116-- user input (e.g. a file upload) because we can't be sure that
117-- user input won't have extraordinarily large lines which would
118-- require large amounts of memory if consumed.
119linesBounded :: MonadThrow m => Int -> ConduitT T.Text T.Text m ()
120linesBounded maxLineLen =
121    awaitText 0 T.empty
122  where
123    awaitText len buf = await >>= maybe (finish buf) (process len buf)
124
125    finish buf = unless (T.null buf) (yield buf)
126
127    process len buf text =
128      let (line, rest) = T.break (== '\n') text
129          len' = len + T.length line
130      in  if len' > maxLineLen
131            then lift $ throwM (LengthExceeded maxLineLen)
132            else case T.uncons rest of
133                   Just (_, rest') ->
134                     yield (buf `T.append` line) >> process 0 T.empty rest'
135                   _ ->
136                     awaitText len' $ buf `T.append` text
137
138
139
140-- | Convert text into bytes, using the provided codec. If the codec is
141-- not capable of representing an input character, an exception will be thrown.
142--
143-- Since 0.3.0
144encode :: MonadThrow m => Codec -> ConduitT T.Text B.ByteString m ()
145encode (NewCodec _ enc _) = CL.map enc
146encode codec = CL.mapM $ \t -> do
147    let (bs, mexc) = codecEncode codec t
148    maybe (return bs) (throwM . fst) mexc
149
150decodeNew
151    :: Monad m
152    => (Int -> B.ByteString -> T.Text -> B.ByteString -> ConduitT B.ByteString T.Text m ())
153    -> t
154    -> Int
155    -> (B.ByteString -> DecodeResult)
156    -> ConduitT B.ByteString T.Text m ()
157decodeNew onFailure _name =
158    loop
159  where
160    loop consumed dec =
161        await >>= maybe finish go
162      where
163        finish =
164            case dec B.empty of
165                DecodeResultSuccess _ _ -> return ()
166                DecodeResultFailure t rest -> onFailure consumed B.empty t rest
167        {-# INLINE finish #-}
168
169        go bs | B.null bs = loop consumed dec
170        go bs =
171            case dec bs of
172                DecodeResultSuccess t dec' -> do
173                    let consumed' = consumed + B.length bs
174                        next = do
175                            unless (T.null t) (yield t)
176                            loop consumed' dec'
177                     in consumed' `seq` next
178                DecodeResultFailure t rest -> onFailure consumed bs t rest
179
180-- | Convert bytes into text, using the provided codec. If the codec is
181-- not capable of decoding an input byte sequence, an exception will be thrown.
182--
183-- Since 0.3.0
184decode :: MonadThrow m => Codec -> ConduitT B.ByteString T.Text m ()
185decode (NewCodec name _ start) =
186    decodeNew onFailure name 0 start
187  where
188    onFailure consumed bs t rest = do
189        unless (T.null t) (yield t)
190        leftover rest -- rest will never be null, no need to check
191        let consumed' = consumed + B.length bs - B.length rest
192        throwM $ NewDecodeException name consumed' (B.take 4 rest)
193    {-# INLINE onFailure #-}
194decode codec =
195    loop id
196  where
197    loop front = await >>= maybe (finish front) (go front)
198
199    finish front =
200        case B.uncons $ front B.empty of
201            Nothing -> return ()
202            Just (w, _) -> lift $ throwM $ DecodeException codec w
203
204    go front bs' =
205        case extra of
206            Left (exc, _) -> lift $ throwM exc
207            Right bs'' -> yield text >> loop (B.append bs'')
208      where
209        (text, extra) = codecDecode codec bs
210        bs = front bs'
211
212-- |
213-- Since 0.3.0
214data TextException = DecodeException Codec Word8
215                   | EncodeException Codec Char
216                   | LengthExceeded Int
217                   | TextException Exc.SomeException
218                   | NewDecodeException !T.Text !Int !B.ByteString
219    deriving Typeable
220instance Show TextException where
221    show (DecodeException codec w) = concat
222        [ "Error decoding legacy Data.Conduit.Text codec "
223        , show codec
224        , " when parsing byte: "
225        , show w
226        ]
227    show (EncodeException codec c) = concat
228        [ "Error encoding legacy Data.Conduit.Text codec "
229        , show codec
230        , " when parsing char: "
231        , show c
232        ]
233    show (LengthExceeded i) = "Data.Conduit.Text.linesBounded: line too long: " ++ show i
234    show (TextException se) = "Data.Conduit.Text.TextException: " ++ show se
235    show (NewDecodeException codec consumed next) = concat
236        [ "Data.Conduit.Text.decode: Error decoding stream of "
237        , T.unpack codec
238        , " bytes. Error encountered in stream at offset "
239        , show consumed
240        , ". Encountered at byte sequence "
241        , show next
242        ]
243instance Exc.Exception TextException
244
245-- |
246-- Since 0.3.0
247utf8 :: Codec
248utf8 = NewCodec (T.pack "UTF-8") TE.encodeUtf8 Data.Streaming.Text.decodeUtf8
249
250-- |
251-- Since 0.3.0
252utf16_le :: Codec
253utf16_le = NewCodec (T.pack "UTF-16-LE") TE.encodeUtf16LE decodeUtf16LE
254
255-- |
256-- Since 0.3.0
257utf16_be :: Codec
258utf16_be = NewCodec (T.pack "UTF-16-BE") TE.encodeUtf16BE decodeUtf16BE
259
260-- |
261-- Since 0.3.0
262utf32_le :: Codec
263utf32_le = NewCodec (T.pack "UTF-32-LE") TE.encodeUtf32LE decodeUtf32LE
264
265-- |
266-- Since 0.3.0
267utf32_be :: Codec
268utf32_be = NewCodec (T.pack "UTF-32-BE") TE.encodeUtf32BE decodeUtf32BE
269
270-- |
271-- Since 0.3.0
272ascii :: Codec
273ascii = Codec name enc dec where
274    name = T.pack "ASCII"
275    enc text = (bytes, extra) where
276        (safe, unsafe) = T.span (\c -> ord c <= 0x7F) text
277        bytes = B8.pack (T.unpack safe)
278        extra = if T.null unsafe
279            then Nothing
280            else Just (EncodeException ascii (T.head unsafe), unsafe)
281
282    dec bytes = (text, extra) where
283        (safe, unsafe) = B.span (<= 0x7F) bytes
284        text = T.pack (B8.unpack safe)
285        extra = if B.null unsafe
286            then Right B.empty
287            else Left (DecodeException ascii (B.head unsafe), unsafe)
288
289-- |
290-- Since 0.3.0
291iso8859_1 :: Codec
292iso8859_1 = Codec name enc dec where
293    name = T.pack "ISO-8859-1"
294    enc text = (bytes, extra) where
295        (safe, unsafe) = T.span (\c -> ord c <= 0xFF) text
296        bytes = B8.pack (T.unpack safe)
297        extra = if T.null unsafe
298            then Nothing
299            else Just (EncodeException iso8859_1 (T.head unsafe), unsafe)
300
301    dec bytes = (T.pack (B8.unpack bytes), Right B.empty)
302
303-- |
304--
305-- Since 1.0.8
306takeWhile :: Monad m
307          => (Char -> Bool)
308          -> ConduitT T.Text T.Text m ()
309takeWhile p =
310    loop
311  where
312    loop = await >>= maybe (return ()) go
313    go t =
314        case T.span p t of
315            (x, y)
316                | T.null y -> yield x >> loop
317                | otherwise -> yield x >> leftover y
318
319-- |
320--
321-- Since 1.0.8
322dropWhile :: Monad m
323          => (Char -> Bool)
324          -> ConduitT T.Text o m ()
325dropWhile p =
326    loop
327  where
328    loop = await >>= maybe (return ()) go
329    go t
330        | T.null x = loop
331        | otherwise = leftover x
332      where
333        x = T.dropWhile p t
334
335-- |
336--
337-- Since 1.0.8
338take :: Monad m => Int -> ConduitT T.Text T.Text m ()
339take =
340    loop
341  where
342    loop i = await >>= maybe (return ()) (go i)
343    go i t
344        | diff == 0 = yield t
345        | diff < 0 =
346            let (x, y) = T.splitAt i t
347             in yield x >> leftover y
348        | otherwise = yield t >> loop diff
349      where
350        diff = i - T.length t
351
352-- |
353--
354-- Since 1.0.8
355drop :: Monad m => Int -> ConduitT T.Text o m ()
356drop =
357    loop
358  where
359    loop i = await >>= maybe (return ()) (go i)
360    go i t
361        | diff == 0 = return ()
362        | diff < 0 = leftover $ T.drop i t
363        | otherwise = loop diff
364      where
365        diff = i - T.length t
366
367-- |
368--
369-- Since 1.0.8
370foldLines :: Monad m
371          => (a -> ConduitM T.Text o m a)
372          -> a
373          -> ConduitT T.Text o m a
374foldLines f =
375    start
376  where
377    start a = CL.peek >>= maybe (return a) (const $ loop $ f a)
378
379    loop consumer = do
380        a <- takeWhile (/= '\n') .| do
381            a <- CL.map (T.filter (/= '\r')) .| consumer
382            CL.sinkNull
383            return a
384        drop 1
385        start a
386
387-- |
388--
389-- Since 1.0.8
390withLine :: Monad m
391         => ConduitT T.Text Void m a
392         -> ConduitT T.Text o m (Maybe a)
393withLine consumer = toConsumer $ do
394    mx <- CL.peek
395    case mx of
396        Nothing -> return Nothing
397        Just _ -> do
398            x <- takeWhile (/= '\n') .| do
399                x <- CL.map (T.filter (/= '\r')) .| consumer
400                CL.sinkNull
401                return x
402            drop 1
403            return $ Just x
404
405-- | Automatically determine which UTF variant is being used. This function
406-- checks for BOMs, removing them as necessary. It defaults to assuming UTF-8.
407--
408-- Since 1.1.9
409detectUtf :: MonadThrow m => ConduitT B.ByteString T.Text m ()
410detectUtf =
411    go id
412  where
413    go front = await >>= maybe (close front) (push front)
414
415    push front bs'
416        | B.length bs < 4 = go $ B.append bs
417        | otherwise       = leftDecode bs
418      where bs = front bs'
419
420    close front = leftDecode $ front B.empty
421
422    leftDecode bs = leftover bsOut >> decode codec
423      where
424        bsOut = B.append (B.drop toDrop x) y
425        (x, y) = B.splitAt 4 bs
426        (toDrop, codec) =
427            case B.unpack x of
428                [0x00, 0x00, 0xFE, 0xFF] -> (4, utf32_be)
429                [0xFF, 0xFE, 0x00, 0x00] -> (4, utf32_le)
430                0xFE : 0xFF: _           -> (2, utf16_be)
431                0xFF : 0xFE: _           -> (2, utf16_le)
432                0xEF : 0xBB: 0xBF : _    -> (3, utf8)
433                _                        -> (0, utf8) -- Assuming UTF-8
434{-# INLINE detectUtf #-}
435