1{-# LANGUAGE DeriveDataTypeable, RankNTypes #-} 2-- | /NOTE/ It is recommended to start using "Data.Conduit.Combinators" instead 3-- of this module. 4-- 5-- Copyright: 2011 Michael Snoyman, 2010-2011 John Millikin 6-- License: MIT 7-- 8-- Handle streams of text. 9-- 10-- Parts of this code were taken from enumerator and adapted for conduits. 11-- 12-- For many purposes, it's recommended to use the conduit-combinators library, 13-- which provides a more complete set of functions. 14module Data.Conduit.Text 15 ( 16 17 -- * Text codecs 18 Codec 19 , encode 20 , decode 21 , utf8 22 , utf16_le 23 , utf16_be 24 , utf32_le 25 , utf32_be 26 , ascii 27 , iso8859_1 28 , lines 29 , linesBounded 30 , TextException (..) 31 , takeWhile 32 , dropWhile 33 , take 34 , drop 35 , foldLines 36 , withLine 37 , CC.decodeUtf8 38 , CC.decodeUtf8Lenient 39 , CC.encodeUtf8 40 , detectUtf 41 ) where 42 43import Prelude hiding (head, drop, takeWhile, lines, zip, zip3, zipWith, zipWith3, take, dropWhile) 44 45import qualified Control.Exception as Exc 46import qualified Data.ByteString as B 47import qualified Data.ByteString.Char8 as B8 48import Data.Char (ord) 49import qualified Data.Text as T 50import qualified Data.Text.Encoding as TE 51import Data.Word (Word8) 52import Data.Typeable (Typeable) 53 54import Data.Conduit 55import qualified Data.Conduit.List as CL 56import qualified Data.Conduit.Combinators as CC 57import Control.Monad.Trans.Class (lift) 58import Control.Monad.Trans.Resource (MonadThrow, throwM) 59import Control.Monad (unless) 60import Data.Streaming.Text 61 62-- | A specific character encoding. 63-- 64-- Since 0.3.0 65data Codec = Codec 66 { _codecName :: T.Text 67 , codecEncode 68 :: T.Text 69 -> (B.ByteString, Maybe (TextException, T.Text)) 70 , codecDecode 71 :: B.ByteString 72 -> (T.Text, Either 73 (TextException, B.ByteString) 74 B.ByteString) 75 } 76 | NewCodec T.Text (T.Text -> B.ByteString) (B.ByteString -> DecodeResult) 77 78instance Show Codec where 79 showsPrec d c = 80 let (cnst, name) = case c of 81 Codec t _ _ -> ("Codec ", t) 82 NewCodec t _ _ -> ("NewCodec ", t) 83 in showParen (d > 10) $ showString cnst . shows name 84 85 86 87-- | Emit each line separately 88-- 89-- Since 0.4.1 90lines :: Monad m => ConduitT T.Text T.Text m () 91lines = 92 awaitText id 93 where 94 awaitText front = await >>= maybe (finish front) (process front) 95 96 finish front = 97 let t = T.concat $ front [] 98 in unless (T.null t) (yield t) 99 100 process front text = 101 let (line, rest) = T.break (== '\n') text 102 in case T.uncons rest of 103 Just (_, rest') -> do 104 yield (T.concat $ front [line]) 105 process id rest' 106 Nothing -> Exc.assert (line == text) $ awaitText $ front . (line:) 107 108 109 110-- | Variant of the lines function with an integer parameter. 111-- The text length of any emitted line 112-- never exceeds the value of the parameter. Whenever 113-- this is about to happen a LengthExceeded exception 114-- is thrown. This function should be used instead 115-- of the lines function whenever we are dealing with 116-- user input (e.g. a file upload) because we can't be sure that 117-- user input won't have extraordinarily large lines which would 118-- require large amounts of memory if consumed. 119linesBounded :: MonadThrow m => Int -> ConduitT T.Text T.Text m () 120linesBounded maxLineLen = 121 awaitText 0 T.empty 122 where 123 awaitText len buf = await >>= maybe (finish buf) (process len buf) 124 125 finish buf = unless (T.null buf) (yield buf) 126 127 process len buf text = 128 let (line, rest) = T.break (== '\n') text 129 len' = len + T.length line 130 in if len' > maxLineLen 131 then lift $ throwM (LengthExceeded maxLineLen) 132 else case T.uncons rest of 133 Just (_, rest') -> 134 yield (buf `T.append` line) >> process 0 T.empty rest' 135 _ -> 136 awaitText len' $ buf `T.append` text 137 138 139 140-- | Convert text into bytes, using the provided codec. If the codec is 141-- not capable of representing an input character, an exception will be thrown. 142-- 143-- Since 0.3.0 144encode :: MonadThrow m => Codec -> ConduitT T.Text B.ByteString m () 145encode (NewCodec _ enc _) = CL.map enc 146encode codec = CL.mapM $ \t -> do 147 let (bs, mexc) = codecEncode codec t 148 maybe (return bs) (throwM . fst) mexc 149 150decodeNew 151 :: Monad m 152 => (Int -> B.ByteString -> T.Text -> B.ByteString -> ConduitT B.ByteString T.Text m ()) 153 -> t 154 -> Int 155 -> (B.ByteString -> DecodeResult) 156 -> ConduitT B.ByteString T.Text m () 157decodeNew onFailure _name = 158 loop 159 where 160 loop consumed dec = 161 await >>= maybe finish go 162 where 163 finish = 164 case dec B.empty of 165 DecodeResultSuccess _ _ -> return () 166 DecodeResultFailure t rest -> onFailure consumed B.empty t rest 167 {-# INLINE finish #-} 168 169 go bs | B.null bs = loop consumed dec 170 go bs = 171 case dec bs of 172 DecodeResultSuccess t dec' -> do 173 let consumed' = consumed + B.length bs 174 next = do 175 unless (T.null t) (yield t) 176 loop consumed' dec' 177 in consumed' `seq` next 178 DecodeResultFailure t rest -> onFailure consumed bs t rest 179 180-- | Convert bytes into text, using the provided codec. If the codec is 181-- not capable of decoding an input byte sequence, an exception will be thrown. 182-- 183-- Since 0.3.0 184decode :: MonadThrow m => Codec -> ConduitT B.ByteString T.Text m () 185decode (NewCodec name _ start) = 186 decodeNew onFailure name 0 start 187 where 188 onFailure consumed bs t rest = do 189 unless (T.null t) (yield t) 190 leftover rest -- rest will never be null, no need to check 191 let consumed' = consumed + B.length bs - B.length rest 192 throwM $ NewDecodeException name consumed' (B.take 4 rest) 193 {-# INLINE onFailure #-} 194decode codec = 195 loop id 196 where 197 loop front = await >>= maybe (finish front) (go front) 198 199 finish front = 200 case B.uncons $ front B.empty of 201 Nothing -> return () 202 Just (w, _) -> lift $ throwM $ DecodeException codec w 203 204 go front bs' = 205 case extra of 206 Left (exc, _) -> lift $ throwM exc 207 Right bs'' -> yield text >> loop (B.append bs'') 208 where 209 (text, extra) = codecDecode codec bs 210 bs = front bs' 211 212-- | 213-- Since 0.3.0 214data TextException = DecodeException Codec Word8 215 | EncodeException Codec Char 216 | LengthExceeded Int 217 | TextException Exc.SomeException 218 | NewDecodeException !T.Text !Int !B.ByteString 219 deriving Typeable 220instance Show TextException where 221 show (DecodeException codec w) = concat 222 [ "Error decoding legacy Data.Conduit.Text codec " 223 , show codec 224 , " when parsing byte: " 225 , show w 226 ] 227 show (EncodeException codec c) = concat 228 [ "Error encoding legacy Data.Conduit.Text codec " 229 , show codec 230 , " when parsing char: " 231 , show c 232 ] 233 show (LengthExceeded i) = "Data.Conduit.Text.linesBounded: line too long: " ++ show i 234 show (TextException se) = "Data.Conduit.Text.TextException: " ++ show se 235 show (NewDecodeException codec consumed next) = concat 236 [ "Data.Conduit.Text.decode: Error decoding stream of " 237 , T.unpack codec 238 , " bytes. Error encountered in stream at offset " 239 , show consumed 240 , ". Encountered at byte sequence " 241 , show next 242 ] 243instance Exc.Exception TextException 244 245-- | 246-- Since 0.3.0 247utf8 :: Codec 248utf8 = NewCodec (T.pack "UTF-8") TE.encodeUtf8 Data.Streaming.Text.decodeUtf8 249 250-- | 251-- Since 0.3.0 252utf16_le :: Codec 253utf16_le = NewCodec (T.pack "UTF-16-LE") TE.encodeUtf16LE decodeUtf16LE 254 255-- | 256-- Since 0.3.0 257utf16_be :: Codec 258utf16_be = NewCodec (T.pack "UTF-16-BE") TE.encodeUtf16BE decodeUtf16BE 259 260-- | 261-- Since 0.3.0 262utf32_le :: Codec 263utf32_le = NewCodec (T.pack "UTF-32-LE") TE.encodeUtf32LE decodeUtf32LE 264 265-- | 266-- Since 0.3.0 267utf32_be :: Codec 268utf32_be = NewCodec (T.pack "UTF-32-BE") TE.encodeUtf32BE decodeUtf32BE 269 270-- | 271-- Since 0.3.0 272ascii :: Codec 273ascii = Codec name enc dec where 274 name = T.pack "ASCII" 275 enc text = (bytes, extra) where 276 (safe, unsafe) = T.span (\c -> ord c <= 0x7F) text 277 bytes = B8.pack (T.unpack safe) 278 extra = if T.null unsafe 279 then Nothing 280 else Just (EncodeException ascii (T.head unsafe), unsafe) 281 282 dec bytes = (text, extra) where 283 (safe, unsafe) = B.span (<= 0x7F) bytes 284 text = T.pack (B8.unpack safe) 285 extra = if B.null unsafe 286 then Right B.empty 287 else Left (DecodeException ascii (B.head unsafe), unsafe) 288 289-- | 290-- Since 0.3.0 291iso8859_1 :: Codec 292iso8859_1 = Codec name enc dec where 293 name = T.pack "ISO-8859-1" 294 enc text = (bytes, extra) where 295 (safe, unsafe) = T.span (\c -> ord c <= 0xFF) text 296 bytes = B8.pack (T.unpack safe) 297 extra = if T.null unsafe 298 then Nothing 299 else Just (EncodeException iso8859_1 (T.head unsafe), unsafe) 300 301 dec bytes = (T.pack (B8.unpack bytes), Right B.empty) 302 303-- | 304-- 305-- Since 1.0.8 306takeWhile :: Monad m 307 => (Char -> Bool) 308 -> ConduitT T.Text T.Text m () 309takeWhile p = 310 loop 311 where 312 loop = await >>= maybe (return ()) go 313 go t = 314 case T.span p t of 315 (x, y) 316 | T.null y -> yield x >> loop 317 | otherwise -> yield x >> leftover y 318 319-- | 320-- 321-- Since 1.0.8 322dropWhile :: Monad m 323 => (Char -> Bool) 324 -> ConduitT T.Text o m () 325dropWhile p = 326 loop 327 where 328 loop = await >>= maybe (return ()) go 329 go t 330 | T.null x = loop 331 | otherwise = leftover x 332 where 333 x = T.dropWhile p t 334 335-- | 336-- 337-- Since 1.0.8 338take :: Monad m => Int -> ConduitT T.Text T.Text m () 339take = 340 loop 341 where 342 loop i = await >>= maybe (return ()) (go i) 343 go i t 344 | diff == 0 = yield t 345 | diff < 0 = 346 let (x, y) = T.splitAt i t 347 in yield x >> leftover y 348 | otherwise = yield t >> loop diff 349 where 350 diff = i - T.length t 351 352-- | 353-- 354-- Since 1.0.8 355drop :: Monad m => Int -> ConduitT T.Text o m () 356drop = 357 loop 358 where 359 loop i = await >>= maybe (return ()) (go i) 360 go i t 361 | diff == 0 = return () 362 | diff < 0 = leftover $ T.drop i t 363 | otherwise = loop diff 364 where 365 diff = i - T.length t 366 367-- | 368-- 369-- Since 1.0.8 370foldLines :: Monad m 371 => (a -> ConduitM T.Text o m a) 372 -> a 373 -> ConduitT T.Text o m a 374foldLines f = 375 start 376 where 377 start a = CL.peek >>= maybe (return a) (const $ loop $ f a) 378 379 loop consumer = do 380 a <- takeWhile (/= '\n') .| do 381 a <- CL.map (T.filter (/= '\r')) .| consumer 382 CL.sinkNull 383 return a 384 drop 1 385 start a 386 387-- | 388-- 389-- Since 1.0.8 390withLine :: Monad m 391 => ConduitT T.Text Void m a 392 -> ConduitT T.Text o m (Maybe a) 393withLine consumer = toConsumer $ do 394 mx <- CL.peek 395 case mx of 396 Nothing -> return Nothing 397 Just _ -> do 398 x <- takeWhile (/= '\n') .| do 399 x <- CL.map (T.filter (/= '\r')) .| consumer 400 CL.sinkNull 401 return x 402 drop 1 403 return $ Just x 404 405-- | Automatically determine which UTF variant is being used. This function 406-- checks for BOMs, removing them as necessary. It defaults to assuming UTF-8. 407-- 408-- Since 1.1.9 409detectUtf :: MonadThrow m => ConduitT B.ByteString T.Text m () 410detectUtf = 411 go id 412 where 413 go front = await >>= maybe (close front) (push front) 414 415 push front bs' 416 | B.length bs < 4 = go $ B.append bs 417 | otherwise = leftDecode bs 418 where bs = front bs' 419 420 close front = leftDecode $ front B.empty 421 422 leftDecode bs = leftover bsOut >> decode codec 423 where 424 bsOut = B.append (B.drop toDrop x) y 425 (x, y) = B.splitAt 4 bs 426 (toDrop, codec) = 427 case B.unpack x of 428 [0x00, 0x00, 0xFE, 0xFF] -> (4, utf32_be) 429 [0xFF, 0xFE, 0x00, 0x00] -> (4, utf32_le) 430 0xFE : 0xFF: _ -> (2, utf16_be) 431 0xFF : 0xFE: _ -> (2, utf16_le) 432 0xEF : 0xBB: 0xBF : _ -> (3, utf8) 433 _ -> (0, utf8) -- Assuming UTF-8 434{-# INLINE detectUtf #-} 435