1{-# LANGUAGE BangPatterns #-} 2{-# LANGUAGE CPP #-} 3 4-- | Stream primitives for decoding and encoding 'Text' values in UTF-8 format. 5module System.IO.Streams.Text 6 ( -- * Decoders and Encoders 7 decodeUtf8 8 , decodeUtf8With 9 , encodeUtf8 10 ) where 11 12------------------------------------------------------------------------------ 13import Control.Monad (when) 14import Control.Monad.IO.Class (MonadIO (..)) 15import Data.ByteString (ByteString) 16import qualified Data.ByteString as S 17import qualified Data.ByteString.Unsafe as S 18#if !MIN_VERSION_base(4,8,0) 19import Data.Monoid (mappend) 20#endif 21import Data.Text (Text) 22import qualified Data.Text.Encoding as T 23import Data.Text.Encoding.Error (OnDecodeError) 24import Data.Word (Word8) 25------------------------------------------------------------------------------ 26import qualified System.IO.Streams.Combinators as Streams 27import System.IO.Streams.Internal (InputStream, OutputStream) 28import qualified System.IO.Streams.Internal as Streams 29 30 31------------------------------------------------------------------------------ 32-- | Convert an 'OutputStream' taking 'ByteString's to an 'OutputStream' that 33-- takes 'Text', encoding the data as UTF-8. See 34-- @Data.Text.Encoding.'T.encodeUtf8'@. 35encodeUtf8 :: OutputStream ByteString -> IO (OutputStream Text) 36encodeUtf8 = Streams.contramap T.encodeUtf8 37 38 39------------------------------------------------------------------------------ 40-- | Decode an 'InputStream' of 'ByteString's in UTF-8 format into an 41-- 'InputStream' of 'Text' values. If decoding fails, will throw an exception. 42-- See @Data.Text.Encoding.'T.decodeUtf8'@. 43decodeUtf8 :: InputStream ByteString -> IO (InputStream Text) 44decodeUtf8 = decode T.decodeUtf8 45{-# INLINE decodeUtf8 #-} 46 47 48------------------------------------------------------------------------------ 49-- | Decode an 'InputStream' of 'ByteString's in UTF-8 format into an 50-- 'InputStream' of 'Text' values. If decoding fails, invokes the given 51-- 'OnDecodeError' function to decide what to do. See 52-- @Data.Text.Encoding.'T.decodeUtf8With'@. 53decodeUtf8With :: OnDecodeError 54 -> InputStream ByteString 55 -> IO (InputStream Text) 56decodeUtf8With e = decode (T.decodeUtf8With e) 57{-# INLINE decodeUtf8With #-} 58 59 60------------------------------------------------------------------------------ 61decode :: (ByteString -> Text) 62 -> InputStream ByteString 63 -> IO (InputStream Text) 64decode decodeFunc input = Streams.fromGenerator $ go Nothing 65 where 66 go !soFar = liftIO (Streams.read input) >>= 67 maybe (finish soFar) (chunk soFar) 68 69 finish Nothing = return $! () 70 finish (Just x) = Streams.yield $! decodeFunc x 71 72 chunk Nothing s = process s 73 chunk (Just a) b = process $ a `mappend` b 74 75 process !s = 76 case findLastFullCode s of 77 LastCodeIsComplete x -> (Streams.yield $! decodeFunc x) >> go Nothing 78 Split a b -> do 79 when (not $ S.null a) $ 80 Streams.yield $! decodeFunc a 81 go (Just b) 82 NoCodesAreComplete x -> go (Just x) 83 84 85------------------------------------------------------------------------------ 86data ByteType = Regular 87 | Continuation 88 | Start !Int 89 90 91------------------------------------------------------------------------------ 92between :: Word8 -> Word8 -> Word8 -> Bool 93between x y z = x >= y && x <= z 94{-# INLINE between #-} 95 96 97------------------------------------------------------------------------------ 98characterizeByte :: Word8 -> ByteType 99characterizeByte c | between c 0 0x7F = Regular 100 | between c 0x80 0xBF = Continuation 101 | between c 0xC0 0xDF = Start 1 102 | between c 0xE0 0xEF = Start 2 103 -- Technically utf-8 ends after 0xf4, but those sequences 104 -- won't decode anyways. 105 | otherwise = Start 3 106 107 108------------------------------------------------------------------------------ 109data FindOutput = LastCodeIsComplete !ByteString 110 | Split !ByteString !ByteString 111 | NoCodesAreComplete !ByteString -- should be impossibly rare 112 -- in real data 113 114 115------------------------------------------------------------------------------ 116findLastFullCode :: ByteString -> FindOutput 117findLastFullCode b | len == 0 = LastCodeIsComplete b 118 | otherwise = go 119 where 120 len = S.length b 121 122 go = let !idx = len - 1 123 !c = S.unsafeIndex b idx 124 in case characterizeByte c of 125 Regular -> LastCodeIsComplete b 126 Continuation -> cont (len - 2) 127 _ -> Split (S.unsafeTake idx b) (S.unsafeDrop idx b) 128 129 cont !idx | idx < 0 = NoCodesAreComplete b 130 | otherwise = 131 let !c = S.unsafeIndex b idx 132 in case characterizeByte c of 133 -- what do we do with this? decoding will fail. give up 134 -- and lie, the text decoder will deal with it.. 135 Regular -> LastCodeIsComplete b 136 Continuation -> cont (idx - 1) 137 Start n -> if n + idx == len - 1 138 then LastCodeIsComplete b 139 else Split (S.unsafeTake idx b) 140 (S.unsafeDrop idx b) 141{-# INLINE findLastFullCode #-} 142