1{-# LANGUAGE CPP, BangPatterns #-}
2-- |
3-- Module      : BoundedWrite
4-- Copyright   : (c) 2010 Simon Meier
5-- License     : BSD3-style (see LICENSE)
6--
7-- Maintainer  : Leon P Smith <leon@melding-monads.com>
8-- Stability   : experimental
9-- Portability : tested on GHC only
10--
11-- A more general/efficient write type.
12--
13module BoundedWrite (main) where
14
15import Foreign
16import Data.Monoid
17import Data.Char
18
19import Foreign.UPtr
20
21import qualified Data.ByteString.Internal as S
22import qualified Data.ByteString.Lazy as L
23
24import Blaze.ByteString.Builder.Internal
25import Blaze.ByteString.Builder.Write
26import Blaze.ByteString.Builder.Word
27
28import Criterion.Main
29
30------------------------------------------------------------------------------
31-- Benchmarks
32------------------------------------------------------------------------------
33
34main :: IO ()
35main = defaultMain $ concat
36    {-
37    [ benchmark "mconcat . map (fromWriteSingleton writeChar)"
38        bfrom3Chars
39        from3Chars
40        chars3
41    ]
42    -}
43    [ benchmark "mconcat . map fromWord8"
44        (mconcat . map bfromWord8)
45        (mconcat . map fromWord8)
46        word8s
47    ]
48  where
49    benchmark name boundedF staticF x =
50        [ bench (name ++ " <- bounded write") $
51            whnf (L.length . toLazyByteString . boundedF) x
52        , bench (name ++ " <- static write") $
53            whnf (L.length . toLazyByteString . staticF) x
54        ]
55
56word8s :: [Word8]
57word8s = take 100000 $ cycle [0..]
58{-# NOINLINE word8s #-}
59
60chars :: [Char]
61chars = take 100000 $ ['\0'..]
62{-# NOINLINE chars #-}
63
64chars2 :: [(Char,Char)]
65chars2 = zip chars chars
66{-# NOINLINE chars2 #-}
67
68chars3 :: [(Char, Char, Char)]
69chars3 = zip3 chars (reverse chars) (reverse chars)
70{-# NOINLINE chars3 #-}
71
72bfromChars = (mconcat . map (fromBWriteSingleton bwriteChar))
73{-# NOINLINE bfromChars #-}
74
75fromChars = (mconcat . map (fromWriteSingleton writeChar))
76{-# NOINLINE fromChars #-}
77
78bfrom2Chars = (mconcat . map (fromBWriteSingleton (\(c1, c2) -> bwriteChar c1 `mappend` bwriteChar c2)))
79{-# NOINLINE bfrom2Chars #-}
80
81from2Chars = (mconcat . map (fromWriteSingleton (\(c1, c2) -> writeChar c1 `mappend` writeChar c2)))
82{-# NOINLINE from2Chars #-}
83
84bfrom3Chars = (mconcat . map (fromBWriteSingleton (\(c1, c2, c3) -> bwriteChar c1 `mappend` bwriteChar c2 `mappend` bwriteChar c3)))
85{-# NOINLINE bfrom3Chars #-}
86
87from3Chars = (mconcat . map (fromWriteSingleton (\(c1, c2, c3) -> writeChar c1 `mappend` writeChar c2 `mappend` writeChar c3)))
88{-# NOINLINE from3Chars #-}
89
90------------------------------------------------------------------------------
91-- The Bounded Write Type
92------------------------------------------------------------------------------
93
94-- * GRRR* GHC is too 'clever'... code where we branch and each branch should
95-- execute a few IO actions and then return a value cannot be taught to GHC.
96-- At least not such that it returns the value of the branches unpacked.
97--
98-- Hmm.. at least he behaves much better for the Monoid instance of BWrite
99-- than the one for Write. Serializing UTF-8 chars gets a slowdown of a
100-- factor 2 when 2 chars are composed. Perhaps I should try out the writeList
101-- instances also, as they may be more sensitive to to much work per Char.
102--
103data BWrite = BWrite {-# UNPACK #-} !Int (UPtr -> UPtr)
104
105newtype UWrite = UWrite { runUWrite :: UPtr -> UPtr }
106
107instance Monoid UWrite where
108  mempty = UWrite $ \x -> x
109  {-# INLINE mempty #-}
110  (UWrite uw1) `mappend` (UWrite uw2) = UWrite (\up -> uw2 (uw1 up))
111  {-# INLINE mappend #-}
112
113instance Monoid BWrite where
114  mempty = BWrite 0 (\x -> x)
115  {-# INLINE mempty #-}
116  (BWrite b1 io1) `mappend` (BWrite b2 io2) =
117    BWrite (b1 + b2) (\op -> io2 (io1 op))
118  {-# INLINE mappend #-}
119
120execWrite :: IO () -> UPtr -> UPtr
121execWrite io op' = S.inlinePerformIO io `seq` op'
122{-# INLINE execWrite #-}
123
124execWriteSize :: (Ptr Word8 -> IO ()) -> Int -> UPtr -> UPtr
125execWriteSize io size op = execWrite (io (uptrToPtr op)) (op `plusUPtr` size)
126{-# INLINE execWriteSize #-}
127
128staticBWrite :: Int -> (Ptr Word8 -> IO ()) -> BWrite
129staticBWrite size io = BWrite size (execWriteSize io size)
130{-# INLINE staticBWrite #-}
131
132bwriteWord8 :: Word8 -> BWrite
133bwriteWord8 x = staticBWrite 1 (`poke` x)
134{-# INLINE bwriteWord8 #-}
135
136fromBWrite :: BWrite -> Builder
137fromBWrite (BWrite size io) =
138    Builder step
139  where
140    step k !pf !pe
141      | pf `plusPtr` size <= pe = do
142          let !pf' = io (ptrToUPtr pf)
143          k (uptrToPtr pf') pe
144      | otherwise = return $ BufferFull size pf (step k)
145{-# INLINE fromBWrite #-}
146
147fromBWriteSingleton :: (a -> BWrite) -> a -> Builder
148fromBWriteSingleton write =
149    mkPut
150  where
151    mkPut x = Builder step
152      where
153        step k !pf !pe
154          | pf `plusPtr` size <= pe = do
155              let !pf' = io (ptrToUPtr pf)
156              k (uptrToPtr pf') pe
157          | otherwise               = return $ BufferFull size pf (step k)
158          where
159            BWrite size io = write x
160{-# INLINE fromBWriteSingleton #-}
161
162bfromWord8 :: Word8 -> Builder
163bfromWord8 = fromBWriteSingleton bwriteWord8
164
165-- Utf-8 encoding
166-----------------
167
168bwriteChar :: Char -> BWrite
169bwriteChar c = BWrite 4 (encodeCharUtf8 f1 f2 f3 f4 c)
170  where
171    f1 x =  \uptr -> execWrite (do let !ptr = uptrToPtr uptr
172                                   poke ptr x )
173                               (uptr `plusUPtr` 1)
174
175    f2 x1 x2 = \uptr -> execWrite (do let !ptr = uptrToPtr uptr
176                                      poke ptr x1
177                                      poke (ptr `plusPtr` 1) x2 )
178                                  (uptr `plusUPtr` 2)
179
180    f3 x1 x2 x3 = \uptr -> execWrite (do let !ptr = uptrToPtr uptr
181                                         poke ptr x1
182                                         poke (ptr `plusPtr` 1) x2
183                                         poke (ptr `plusPtr` 2) x3 )
184                                     (uptr `plusUPtr` 3)
185
186    f4 x1 x2 x3 x4 = \uptr -> execWrite (do let !ptr = uptrToPtr uptr
187                                            poke ptr x1
188                                            poke (ptr `plusPtr` 1) x2
189                                            poke (ptr `plusPtr` 2) x3
190                                            poke (ptr `plusPtr` 3) x4 )
191                                        (uptr `plusUPtr` 4)
192{-# INLINE bwriteChar #-}
193
194writeChar :: Char -> Write
195writeChar = encodeCharUtf8 f1 f2 f3 f4
196  where
197    f1 x = Write 1 $ \ptr -> poke ptr x
198
199    f2 x1 x2 = Write 2 $ \ptr -> do poke ptr x1
200                                    poke (ptr `plusPtr` 1) x2
201
202    f3 x1 x2 x3 = Write 3 $ \ptr -> do poke ptr x1
203                                       poke (ptr `plusPtr` 1) x2
204                                       poke (ptr `plusPtr` 2) x3
205
206    f4 x1 x2 x3 x4 = Write 4 $ \ptr -> do poke ptr x1
207                                          poke (ptr `plusPtr` 1) x2
208                                          poke (ptr `plusPtr` 2) x3
209                                          poke (ptr `plusPtr` 3) x4
210{-# INLINE writeChar #-}
211
212-- | Encode a Unicode character to another datatype, using UTF-8. This function
213-- acts as an abstract way of encoding characters, as it is unaware of what
214-- needs to happen with the resulting bytes: you have to specify functions to
215-- deal with those.
216--
217encodeCharUtf8 :: (Word8 -> a)                             -- ^ 1-byte UTF-8
218               -> (Word8 -> Word8 -> a)                    -- ^ 2-byte UTF-8
219               -> (Word8 -> Word8 -> Word8 -> a)           -- ^ 3-byte UTF-8
220               -> (Word8 -> Word8 -> Word8 -> Word8 -> a)  -- ^ 4-byte UTF-8
221               -> Char                                     -- ^ Input 'Char'
222               -> a                                        -- ^ Result
223encodeCharUtf8 f1 f2 f3 f4 c = case ord c of
224    x | x <= 0x7F -> f1 $ fromIntegral x
225      | x <= 0x07FF ->
226           let x1 = fromIntegral $ (x `shiftR` 6) + 0xC0
227               x2 = fromIntegral $ (x .&. 0x3F)   + 0x80
228           in f2 x1 x2
229      | x <= 0xFFFF ->
230           let x1 = fromIntegral $ (x `shiftR` 12) + 0xE0
231               x2 = fromIntegral $ ((x `shiftR` 6) .&. 0x3F) + 0x80
232               x3 = fromIntegral $ (x .&. 0x3F) + 0x80
233           in f3 x1 x2 x3
234      | otherwise ->
235           let x1 = fromIntegral $ (x `shiftR` 18) + 0xF0
236               x2 = fromIntegral $ ((x `shiftR` 12) .&. 0x3F) + 0x80
237               x3 = fromIntegral $ ((x `shiftR` 6) .&. 0x3F) + 0x80
238               x4 = fromIntegral $ (x .&. 0x3F) + 0x80
239           in f4 x1 x2 x3 x4
240{-# INLINE encodeCharUtf8 #-}
241
242