1{-# LANGUAGE BangPatterns #-}
2{-# LANGUAGE CPP          #-}
3
4-- | Stream primitives for decoding and encoding 'Text' values in UTF-8 format.
5module System.IO.Streams.Text
6  ( -- * Decoders and Encoders
7    decodeUtf8
8  , decodeUtf8With
9  , encodeUtf8
10  ) where
11
12------------------------------------------------------------------------------
13import           Control.Monad                 (when)
14import           Control.Monad.IO.Class        (MonadIO (..))
15import           Data.ByteString               (ByteString)
16import qualified Data.ByteString               as S
17import qualified Data.ByteString.Unsafe        as S
18#if !MIN_VERSION_base(4,8,0)
19import           Data.Monoid                   (mappend)
20#endif
21import           Data.Text                     (Text)
22import qualified Data.Text.Encoding            as T
23import           Data.Text.Encoding.Error      (OnDecodeError)
24import           Data.Word                     (Word8)
25------------------------------------------------------------------------------
26import qualified System.IO.Streams.Combinators as Streams
27import           System.IO.Streams.Internal    (InputStream, OutputStream)
28import qualified System.IO.Streams.Internal    as Streams
29
30
31------------------------------------------------------------------------------
32-- | Convert an 'OutputStream' taking 'ByteString's to an 'OutputStream' that
33-- takes 'Text', encoding the data as UTF-8. See
34-- @Data.Text.Encoding.'T.encodeUtf8'@.
35encodeUtf8 :: OutputStream ByteString -> IO (OutputStream Text)
36encodeUtf8 = Streams.contramap T.encodeUtf8
37
38
39------------------------------------------------------------------------------
40-- | Decode an 'InputStream' of 'ByteString's in UTF-8 format into an
41-- 'InputStream' of 'Text' values. If decoding fails, will throw an exception.
42-- See @Data.Text.Encoding.'T.decodeUtf8'@.
43decodeUtf8 :: InputStream ByteString -> IO (InputStream Text)
44decodeUtf8 = decode T.decodeUtf8
45{-# INLINE decodeUtf8 #-}
46
47
48------------------------------------------------------------------------------
49-- | Decode an 'InputStream' of 'ByteString's in UTF-8 format into an
50-- 'InputStream' of 'Text' values. If decoding fails, invokes the given
51-- 'OnDecodeError' function to decide what to do. See
52-- @Data.Text.Encoding.'T.decodeUtf8With'@.
53decodeUtf8With :: OnDecodeError
54               -> InputStream ByteString
55               -> IO (InputStream Text)
56decodeUtf8With e = decode (T.decodeUtf8With e)
57{-# INLINE decodeUtf8With #-}
58
59
60------------------------------------------------------------------------------
61decode :: (ByteString -> Text)
62       -> InputStream ByteString
63       -> IO (InputStream Text)
64decode decodeFunc input = Streams.fromGenerator $ go Nothing
65  where
66    go !soFar = liftIO (Streams.read input) >>=
67                maybe (finish soFar) (chunk soFar)
68
69    finish Nothing  = return $! ()
70    finish (Just x) = Streams.yield $! decodeFunc x
71
72    chunk Nothing  s = process s
73    chunk (Just a) b = process $ a `mappend` b
74
75    process !s =
76      case findLastFullCode s of
77        LastCodeIsComplete x -> (Streams.yield $! decodeFunc x) >> go Nothing
78        Split a b            -> do
79                                  when (not $ S.null a) $
80                                      Streams.yield $! decodeFunc a
81                                  go (Just b)
82        NoCodesAreComplete x -> go (Just x)
83
84
85------------------------------------------------------------------------------
86data ByteType = Regular
87              | Continuation
88              | Start !Int
89
90
91------------------------------------------------------------------------------
92between :: Word8 -> Word8 -> Word8 -> Bool
93between x y z = x >= y && x <= z
94{-# INLINE between #-}
95
96
97------------------------------------------------------------------------------
98characterizeByte :: Word8 -> ByteType
99characterizeByte c | between c 0 0x7F    = Regular
100                   | between c 0x80 0xBF = Continuation
101                   | between c 0xC0 0xDF = Start 1
102                   | between c 0xE0 0xEF = Start 2
103                   -- Technically utf-8 ends after 0xf4, but those sequences
104                   -- won't decode anyways.
105                   | otherwise           = Start 3
106
107
108------------------------------------------------------------------------------
109data FindOutput = LastCodeIsComplete !ByteString
110                | Split !ByteString !ByteString
111                | NoCodesAreComplete !ByteString   -- should be impossibly rare
112                                                   -- in real data
113
114
115------------------------------------------------------------------------------
116findLastFullCode :: ByteString -> FindOutput
117findLastFullCode b | len == 0  = LastCodeIsComplete b
118                   | otherwise = go
119  where
120    len = S.length b
121
122    go = let !idx = len - 1
123             !c   = S.unsafeIndex b idx
124         in case characterizeByte c of
125              Regular      -> LastCodeIsComplete b
126              Continuation -> cont (len - 2)
127              _            -> Split (S.unsafeTake idx b) (S.unsafeDrop idx b)
128
129    cont !idx | idx < 0 = NoCodesAreComplete b
130              | otherwise =
131                  let !c = S.unsafeIndex b idx
132                  in case characterizeByte c of
133                       -- what do we do with this? decoding will fail. give up
134                       -- and lie, the text decoder will deal with it..
135                       Regular      -> LastCodeIsComplete b
136                       Continuation -> cont (idx - 1)
137                       Start n      -> if n + idx == len - 1
138                                         then LastCodeIsComplete b
139                                         else Split (S.unsafeTake idx b)
140                                                    (S.unsafeDrop idx b)
141{-# INLINE findLastFullCode #-}
142