1{-# LANGUAGE BangPatterns, ForeignFunctionInterface, CPP #-}
2
3-----------------------------------------------------------------------------
4-- |
5-- Module      :  ByteStringUtils
6-- Copyright   :  (c) The University of Glasgow 2001,
7--                    David Roundy 2003-2005
8-- License : GPL (I'm happy to also license this file BSD style but don't
9--           want to bother distributing two license files with darcs.
10--
11-- Maintainer  :  droundy@abridgegame.org
12-- Stability   :  experimental
13-- Portability :  portable
14--
15-- GZIp and MMap IO for ByteStrings, and miscellaneous functions for Data.ByteString
16--
17
18module ByteStringUtils (
19
20        unsafeWithInternals,
21        unpackPSfromUTF8,
22
23        -- IO with mmap or gzip
24        gzReadFilePS,
25        mmapFilePS,
26        gzWriteFilePS,
27        gzWriteFilePSs,
28
29        -- list utilities
30        ifHeadThenTail,
31        dropSpace,
32        breakSpace,
33        linesPS,
34        unlinesPS,
35        hashPS,
36        breakFirstPS,
37        breakLastPS,
38        substrPS,
39        readIntPS,
40        is_funky,
41        fromHex2PS,
42        fromPS2Hex,
43        betweenLinesPS,
44        break_after_nth_newline,
45        break_before_nth_newline,
46        intercalate
47    ) where
48
49import Prelude hiding ( catch )
50import qualified Data.ByteString            as B
51import qualified Data.ByteString.Char8      as BC
52import qualified Data.ByteString.Internal   as BI
53import Data.ByteString (intercalate, uncons)
54import Data.ByteString.Internal (fromForeignPtr)
55
56-- #if defined (HAVE_MMAP) || ! defined (HAVE_HASKELL_ZLIB)
57import Control.Exception        ( catch )
58-- #endif
59import System.IO
60import System.IO.Unsafe         ( unsafePerformIO )
61
62import Foreign.Storable         ( peekElemOff, peek )
63import Foreign.Marshal.Alloc    ( free )
64import Foreign.Marshal.Array    ( mallocArray, peekArray, advancePtr )
65import Foreign.C.Types          ( CInt )
66
67import Data.Bits                ( rotateL )
68import Data.Char                ( chr, ord, isSpace )
69import Data.Word                ( Word8 )
70import Data.Int                 ( Int32 )
71import Control.Monad            ( when )
72
73-- #ifndef HAVE_HASKELL_ZLIB
74import Foreign.Ptr              ( nullPtr )
75import Foreign.ForeignPtr       ( ForeignPtr )
76-- #endif
77import Foreign.Ptr              ( plusPtr, Ptr )
78import Foreign.ForeignPtr       ( withForeignPtr )
79
80-- #ifdef DEBUG_PS
81import Foreign.ForeignPtr       ( addForeignPtrFinalizer )
82import Foreign.Ptr              ( FunPtr )
83-- #endif
84
85-- #if HAVE_HASKELL_ZLIB
86import qualified Data.ByteString.Lazy as BL
87import qualified Codec.Compression.GZip as GZ
88-- #else
89import Foreign.C.String ( CString, withCString )
90-- #endif
91
92-- #ifdef HAVE_MMAP
93import System.IO.MMap( mmapFileByteString )
94import System.Mem( performGC )
95import System.Posix.Files( fileSize, getSymbolicLinkStatus )
96-- #endif
97
98-- -----------------------------------------------------------------------------
99-- obsolete debugging code
100
101-- # ifndef HAVE_HASKELL_ZLIB
102debugForeignPtr :: ForeignPtr a -> String -> IO ()
103-- #ifdef DEBUG_PS
104foreign import ccall unsafe "static fpstring.h debug_alloc" debug_alloc
105    :: Ptr a -> CString -> IO ()
106foreign import ccall unsafe "static fpstring.h & debug_free" debug_free
107    :: FunPtr (Ptr a -> IO ())
108debugForeignPtr fp n =
109    withCString n $ \cname-> withForeignPtr fp $ \p->
110    do debug_alloc p cname
111       addForeignPtrFinalizer debug_free fp
112-- #else
113debugForeignPtr _ _ = return ()
114-- #endif
115-- #endif
116
117-- -----------------------------------------------------------------------------
118-- unsafeWithInternals
119
120-- | Do something with the internals of a PackedString. Beware of
121-- altering the contents!
122unsafeWithInternals :: B.ByteString -> (Ptr Word8 -> Int -> IO a) -> IO a
123unsafeWithInternals ps f
124 = case BI.toForeignPtr ps of
125   (fp,s,l) -> withForeignPtr fp $ \p -> f (p `plusPtr` s) l
126
127-- | readIntPS skips any whitespace at the beginning of its argument, and
128-- reads an Int from the beginning of the PackedString.  If there is no
129-- integer at the beginning of the string, it returns Nothing, otherwise it
130-- just returns the int read, along with a B.ByteString containing the
131-- remainder of its input.
132
133readIntPS :: B.ByteString -> Maybe (Int, B.ByteString)
134readIntPS = BC.readInt . BC.dropWhile isSpace
135
136-- -----------------------------------------------------------------------------
137-- Destructor functions (taking PackedStrings apart)
138
139unpackPSfromUTF8 :: B.ByteString -> String
140unpackPSfromUTF8 ps =
141 case BI.toForeignPtr ps of
142   (_,_, 0) -> ""
143   (x,s,l)  ->
144    unsafePerformIO $ withForeignPtr x $ \p->
145    do outbuf <- mallocArray l
146       lout <- fromIntegral `fmap`
147               utf8_to_ints outbuf (p `plusPtr` s) (fromIntegral l)
148       when (lout < 0) $ error "Bad UTF8!"
149       str <- (map (chr . fromIntegral)) `fmap` peekArray lout outbuf
150       free outbuf
151       return str
152
153foreign import ccall unsafe "static fpstring.h utf8_to_ints" utf8_to_ints
154    :: Ptr Int -> Ptr Word8 -> CInt -> IO CInt
155
156-- -----------------------------------------------------------------------------
157-- List-mimicking functions for PackedStrings
158
159{-# INLINE ifHeadThenTail #-}
160ifHeadThenTail :: Word8 -> B.ByteString -> Maybe B.ByteString
161ifHeadThenTail c s = case uncons s of
162    Just (w, t) | w == c    -> Just t
163    _                       -> Nothing
164
165------------------------------------------------------------------------
166-- A reimplementation of Data.ByteString.Char8.dropSpace, but
167-- specialised to darcs' need for a 4 way isspace.
168--
169-- TODO: if it is safe to use the expanded definition of isSpaceWord8
170-- provided by Data.ByteString.Char8, then all this can go.
171
172-- A locale-independent isspace(3) so patches are interpreted the same everywhere.
173-- ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) == '\r')
174isSpaceWord8 :: Word8 -> Bool
175isSpaceWord8 w =
176    w == 0x20 ||    -- ' '
177    w == 0x09 ||    -- '\t'
178    w == 0x0A ||    -- '\n'
179    w == 0x0D       -- '\r'
180{-# INLINE isSpaceWord8 #-}
181
182firstnonspace :: Ptr Word8 -> Int -> Int -> IO Int
183firstnonspace !ptr !n !m
184    | n >= m    = return n
185    | otherwise = do w <- peekElemOff ptr n
186                     if isSpaceWord8 w then firstnonspace ptr (n+1) m else return n
187
188firstspace :: Ptr Word8 -> Int -> Int -> IO Int
189firstspace !ptr !n !m
190    | n >= m    = return n
191    | otherwise = do w <- peekElemOff ptr n
192                     if (not . isSpaceWord8) w then firstspace ptr (n+1) m else return n
193
194-- | 'dropSpace' efficiently returns the 'ByteString' argument with
195-- white space Chars removed from the front. It is more efficient than
196-- calling dropWhile for removing whitespace. I.e.
197--
198-- > dropWhile isSpace == dropSpace
199--
200dropSpace :: B.ByteString -> B.ByteString
201dropSpace (BI.PS x s l) = BI.inlinePerformIO $ withForeignPtr x $ \p -> do
202    i <- firstnonspace (p `plusPtr` s) 0 l
203    return $! if i == l then B.empty else BI.PS x (s+i) (l-i)
204{-# INLINE dropSpace #-}
205
206-- | 'breakSpace' returns the pair of ByteStrings when the argument is
207-- broken at the first whitespace byte. I.e.
208--
209-- > break isSpace == breakSpace
210--
211breakSpace :: B.ByteString -> (B.ByteString,B.ByteString)
212breakSpace (BI.PS x s l) = BI.inlinePerformIO $ withForeignPtr x $ \p -> do
213    i <- firstspace (p `plusPtr` s) 0 l
214    return $! case () of {_
215        | i == 0    -> (B.empty, BI.PS x s l)
216        | i == l    -> (BI.PS x s l, B.empty)
217        | otherwise -> (BI.PS x s i, BI.PS x (s+i) (l-i))
218    }
219{-# INLINE breakSpace #-}
220
221------------------------------------------------------------------------
222
223{-# INLINE is_funky #-}
224is_funky :: B.ByteString -> Bool
225is_funky ps = case BI.toForeignPtr ps of
226   (x,s,l) ->
227    unsafePerformIO $ withForeignPtr x $ \p->
228    (/=0) `fmap` has_funky_char (p `plusPtr` s) (fromIntegral l)
229
230foreign import ccall unsafe "fpstring.h has_funky_char" has_funky_char
231    :: Ptr Word8 -> CInt -> IO CInt
232
233------------------------------------------------------------------------
234
235-- ByteString rewrites break (=='x') to breakByte 'x'
236--  break ((==) x) = breakChar x
237--  break (==x) = breakChar x
238--
239
240{-
241{-# INLINE breakOnPS #-}
242breakOnPS :: Char -> B.ByteString -> (B.ByteString, B.ByteString)
243breakOnPS c p = case BC.elemIndex c p of
244                Nothing -> (p, BC.empty)
245                Just n  -> (B.take n p, B.drop n p)
246-}
247
248{-# INLINE hashPS #-}
249hashPS :: B.ByteString -> Int32
250hashPS ps =
251   case BI.toForeignPtr ps of
252   (x,s,l) ->
253    unsafePerformIO $ withForeignPtr x $ \p->
254    do hash (p `plusPtr` s) l
255
256hash :: Ptr Word8 -> Int -> IO Int32
257hash ptr len = f (0 :: Int32) ptr len
258 where f h _ 0 = return h
259       f h p n = do x <- peek p
260                    let !h' =  (fromIntegral x) + (rotateL h 8)
261                    f h' (p `advancePtr` 1) (n-1)
262
263{-# INLINE substrPS #-}
264substrPS :: B.ByteString -> B.ByteString -> Maybe Int
265substrPS tok str
266    | B.null tok = Just 0
267    | B.length tok > B.length str = Nothing
268    | otherwise = do n <- BC.elemIndex (BC.head tok) str
269                     let ttok = B.tail tok
270                         reststr = B.drop (n+1) str
271                     if ttok == B.take (B.length ttok) reststr
272                        then Just n
273                        else ((n+1)+) `fmap` substrPS tok reststr
274
275------------------------------------------------------------------------
276
277-- TODO: replace breakFirstPS and breakLastPS with definitions based on
278-- ByteString's break/breakEnd
279{-# INLINE breakFirstPS #-}
280breakFirstPS :: Char -> B.ByteString -> Maybe (B.ByteString,B.ByteString)
281breakFirstPS c p = case BC.elemIndex c p of
282                   Nothing -> Nothing
283                   Just n -> Just (B.take n p, B.drop (n+1) p)
284
285{-# INLINE breakLastPS #-}
286breakLastPS :: Char -> B.ByteString -> Maybe (B.ByteString,B.ByteString)
287breakLastPS c p = case BC.elemIndexEnd c p of
288                  Nothing -> Nothing
289                  Just n -> Just (B.take n p, B.drop (n+1) p)
290
291-- TODO: rename
292{-# INLINE linesPS #-}
293linesPS :: B.ByteString -> [B.ByteString]
294linesPS ps
295     | B.null ps = [B.empty]
296     | otherwise = BC.split '\n' ps
297
298{- QuickCheck property:
299
300import Test.QuickCheck
301import qualified Data.ByteString.Char8 as BC
302import Data.Char
303instance Arbitrary BC.ByteString where
304    arbitrary = fmap BC.pack arbitrary
305instance Arbitrary Char where
306  arbitrary = chr `fmap` choose (32,127)
307deepCheck = check (defaultConfig { configMaxTest = 10000})
308testLines =  deepCheck (\x -> (linesPS x == linesPSOld x))
309linesPSOld ps = case  BC.elemIndex '\n' ps of
310             Nothing -> [ps]
311             Just n -> B.take n ps : linesPS (B.drop (n+1) ps) -}
312
313{-| This function acts exactly like the "Prelude" unlines function, or like
314"Data.ByteString.Char8" 'unlines', but with one important difference: it will
315produce a string which may not end with a newline! That is:
316
317> unlinesPS ["foo", "bar"]
318
319evaluates to \"foo\\nbar\", not \"foo\\nbar\\n\"! This point should hold true for
320'linesPS' as well.
321
322TODO: rename this function. -}
323unlinesPS :: [B.ByteString] -> B.ByteString
324unlinesPS [] = BC.empty
325unlinesPS x  = BC.init $ BC.unlines x
326{-# INLINE unlinesPS #-}
327{- QuickCheck property:
328
329testUnlines = deepCheck (\x -> (unlinesPS x == unlinesPSOld x))
330unlinesPSOld ss = BC.concat $ intersperse_newlines ss
331    where intersperse_newlines (a:b:s) = a : newline : intersperse_newlines (b:s)
332          intersperse_newlines s = s
333          newline = BC.pack "\n" -}
334
335-- -----------------------------------------------------------------------------
336-- gzReadFilePS
337
338-- | Read an entire file, which may or may not be gzip compressed, directly
339-- into a 'B.ByteString'.
340
341-- #ifndef HAVE_HASKELL_ZLIB
342foreign import ccall unsafe "static zlib.h gzopen" c_gzopen
343    :: CString -> CString -> IO (Ptr ())
344foreign import ccall unsafe "static zlib.h gzclose" c_gzclose
345    :: Ptr () -> IO ()
346foreign import ccall unsafe "static zlib.h gzread" c_gzread
347    :: Ptr () -> Ptr Word8 -> CInt -> IO CInt
348foreign import ccall unsafe "static zlib.h gzwrite" c_gzwrite
349    :: Ptr () -> Ptr Word8 -> CInt -> IO CInt
350-- #endif
351
352gzReadFilePS :: FilePath -> IO B.ByteString
353gzReadFilePS f = do
354    h <- openBinaryFile f ReadMode
355    header <- B.hGet h 2
356    if header /= BC.pack "\31\139"
357       then do hClose h
358               mmapFilePS f
359       else do hSeek h SeekFromEnd (-4)
360               len <- hGetLittleEndInt h
361               hClose h
362-- #ifdef HAVE_HASKELL_ZLIB
363               -- Passing the length to GZ.decompressWith means
364               -- that BL.toChunks only produces one chunk, which in turn
365               -- means that B.concat won't need to copy data.
366               -- If the length is wrong this will just affect efficiency, not correctness
367               let decompress = GZ.decompressWith GZ.defaultDecompressParams {
368                                  GZ.decompressBufferSize = len
369                                }
370               fmap (B.concat . BL.toChunks . decompress) $
371-- #ifdef HAVE_OLD_BYTESTRING
372                        -- bytestring < 0.9.1 had a bug where it did not know to close handles upon EOF
373                        -- performance would be better with a newer bytestring and lazy
374                        -- readFile below -- ratify readFile: comment
375                        fmap (BL.fromChunks . (:[])) $
376                        B.readFile f  -- ratify readFile: immediately consumed
377-- #else
378                        BL.readFile f -- ratify readFile: immediately consumed by the conversion to a strict bytestring
379-- #endif
380-- #else
381               withCString f $ \fstr-> withCString "rb" $ \rb-> do
382                 gzf <- c_gzopen fstr rb
383                 when (gzf == nullPtr) $ fail $ "problem opening file "++f
384                 fp <- BI.mallocByteString len
385                 debugForeignPtr fp $ "gzReadFilePS "++f
386                 lread <- withForeignPtr fp $ \p ->
387                          c_gzread gzf p (fromIntegral len)
388                 c_gzclose gzf
389                 when (fromIntegral lread /= len) $
390                      fail $ "problem gzreading file "++f
391                 return $ fromForeignPtr fp 0 len
392-- #endif
393
394hGetLittleEndInt :: Handle -> IO Int
395hGetLittleEndInt h = do
396    b1 <- ord `fmap` hGetChar h
397    b2 <- ord `fmap` hGetChar h
398    b3 <- ord `fmap` hGetChar h
399    b4 <- ord `fmap` hGetChar h
400    return $ b1 + 256*b2 + 65536*b3 + 16777216*b4
401
402gzWriteFilePS :: FilePath -> B.ByteString -> IO ()
403gzWriteFilePS f ps = gzWriteFilePSs f [ps]
404
405gzWriteFilePSs :: FilePath -> [B.ByteString] -> IO ()
406gzWriteFilePSs f pss  =
407-- #ifdef HAVE_HASKELL_ZLIB
408    BL.writeFile f $ GZ.compress $ BL.fromChunks pss
409-- #else
410    withCString f $ \fstr -> withCString "wb" $ \wb -> do
411    gzf <- c_gzopen fstr wb
412    when (gzf == nullPtr) $ fail $ "problem gzopening file for write: "++f
413    mapM_ (gzWriteToGzf gzf) pss `catch`
414              \_ -> fail $ "problem gzwriting file: "++f
415    c_gzclose gzf
416
417gzWriteToGzf :: Ptr () -> B.ByteString -> IO ()
418gzWriteToGzf gzf ps = case BI.toForeignPtr ps of
419 (_,_,0) -> return () -- avoid calling gzwrite with 0 length this would
420                      -- trouble on some versions of zlib, and is always
421                      -- unnecessary.
422 (x,s,l) -> do
423    lw <- withForeignPtr x $ \p -> c_gzwrite gzf (p `plusPtr` s)
424                                                 (fromIntegral l)
425    when (fromIntegral lw /= l) $ fail $ "problem in gzWriteToGzf"
426-- #endif
427
428-- -----------------------------------------------------------------------------
429-- mmapFilePS
430
431-- | Like readFilePS, this reads an entire file directly into a
432-- 'B.ByteString', but it is even more efficient.  It involves directly
433-- mapping the file to memory.  This has the advantage that the contents of
434-- the file never need to be copied.  Also, under memory pressure the page
435-- may simply be discarded, wile in the case of readFilePS it would need to
436-- be written to swap.  If you read many small files, mmapFilePS will be
437-- less memory-efficient than readFilePS, since each mmapFilePS takes up a
438-- separate page of memory.  Also, you can run into bus errors if the file
439-- is modified.  NOTE: as with 'readFilePS', the string representation in
440-- the file is assumed to be ISO-8859-1.
441
442mmapFilePS :: FilePath -> IO B.ByteString
443-- #ifdef HAVE_MMAP
444mmapFilePS f = do
445  x <- mmapFileByteString f Nothing
446   `catch` (\_ -> do
447                     size <- fileSize `fmap` getSymbolicLinkStatus f
448                     if size == 0
449                        then return B.empty
450                        else performGC >> mmapFileByteString f Nothing)
451  return x
452-- #else
453mmapFilePS = B.readFile
454-- #endif
455
456-- -------------------------------------------------------------------------
457-- fromPS2Hex
458
459foreign import ccall unsafe "static fpstring.h conv_to_hex" conv_to_hex
460    :: Ptr Word8 -> Ptr Word8 -> CInt -> IO ()
461
462fromPS2Hex :: B.ByteString -> B.ByteString
463fromPS2Hex ps = case BI.toForeignPtr ps of
464          (x,s,l) ->
465           BI.unsafeCreate (2*l) $ \p -> withForeignPtr x $ \f ->
466           conv_to_hex p (f `plusPtr` s) $ fromIntegral l
467
468-- -------------------------------------------------------------------------
469-- fromHex2PS
470
471foreign import ccall unsafe "static fpstring.h conv_from_hex" conv_from_hex
472    :: Ptr Word8 -> Ptr Word8 -> CInt -> IO ()
473
474fromHex2PS :: B.ByteString -> B.ByteString
475fromHex2PS ps = case BI.toForeignPtr ps of
476          (x,s,l) ->
477           BI.unsafeCreate (l `div` 2) $ \p -> withForeignPtr x $ \f ->
478           conv_from_hex p (f `plusPtr` s) (fromIntegral $ l `div` 2)
479
480-- -------------------------------------------------------------------------
481-- betweenLinesPS
482
483-- | betweenLinesPS returns the B.ByteString between the two lines given,
484-- or Nothing if they do not appear.
485
486betweenLinesPS :: B.ByteString -> B.ByteString -> B.ByteString
487               -> Maybe (B.ByteString)
488betweenLinesPS start end ps
489 = case break (start ==) (linesPS ps) of
490       (_, _:rest@(bs1:_)) ->
491           case BI.toForeignPtr bs1 of
492            (ps1,s1,_) ->
493             case break (end ==) rest of
494               (_, bs2:_) -> case BI.toForeignPtr bs2 of (_,s2,_) -> Just $ fromForeignPtr ps1 s1 (s2 - s1)
495               _ -> Nothing
496       _ -> Nothing
497
498-- -------------------------------------------------------------------------
499-- break_after_nth_newline
500
501break_after_nth_newline :: Int -> B.ByteString
502                        -> Maybe (B.ByteString, B.ByteString)
503break_after_nth_newline 0 the_ps | B.null the_ps = Just (B.empty, B.empty)
504break_after_nth_newline n the_ps =
505  case BI.toForeignPtr the_ps of
506  (fp,the_s,l) ->
507   unsafePerformIO $ withForeignPtr fp $ \p ->
508   do let findit 0 s | s == end = return $ Just (the_ps, B.empty)
509          findit _ s | s == end = return Nothing
510          findit 0 s = let left_l = s - the_s
511                       in return $ Just (fromForeignPtr fp the_s left_l,
512                                         fromForeignPtr fp s (l - left_l))
513          findit i s = do w <- peekElemOff p s
514                          if w == nl then findit (i-1) (s+1)
515                                     else findit i (s+1)
516          nl = BI.c2w '\n'
517          end = the_s + l
518      findit n the_s
519
520-- -------------------------------------------------------------------------
521-- break_before_nth_newline
522
523break_before_nth_newline :: Int -> B.ByteString -> (B.ByteString, B.ByteString)
524break_before_nth_newline 0 the_ps
525 | B.null the_ps = (B.empty, B.empty)
526break_before_nth_newline n the_ps =
527 case BI.toForeignPtr the_ps of
528 (fp,the_s,l) ->
529   unsafePerformIO $ withForeignPtr fp $ \p ->
530   do let findit _ s | s == end = return (the_ps, B.empty)
531          findit i s = do w <- peekElemOff p s
532                          if w == nl
533                            then if i == 0
534                                 then let left_l = s - the_s
535                                      in return (fromForeignPtr fp the_s left_l,
536                                                 fromForeignPtr fp s (l - left_l))
537                                 else findit (i-1) (s+1)
538                            else findit i (s+1)
539          nl = BI.c2w '\n'
540          end = the_s + l
541      findit n the_s
542