1{-# LANGUAGE BangPatterns, ForeignFunctionInterface, CPP #-} 2 3----------------------------------------------------------------------------- 4-- | 5-- Module : ByteStringUtils 6-- Copyright : (c) The University of Glasgow 2001, 7-- David Roundy 2003-2005 8-- License : GPL (I'm happy to also license this file BSD style but don't 9-- want to bother distributing two license files with darcs. 10-- 11-- Maintainer : droundy@abridgegame.org 12-- Stability : experimental 13-- Portability : portable 14-- 15-- GZIp and MMap IO for ByteStrings, and miscellaneous functions for Data.ByteString 16-- 17 18module ByteStringUtils ( 19 20 unsafeWithInternals, 21 unpackPSfromUTF8, 22 23 -- IO with mmap or gzip 24 gzReadFilePS, 25 mmapFilePS, 26 gzWriteFilePS, 27 gzWriteFilePSs, 28 29 -- list utilities 30 ifHeadThenTail, 31 dropSpace, 32 breakSpace, 33 linesPS, 34 unlinesPS, 35 hashPS, 36 breakFirstPS, 37 breakLastPS, 38 substrPS, 39 readIntPS, 40 is_funky, 41 fromHex2PS, 42 fromPS2Hex, 43 betweenLinesPS, 44 break_after_nth_newline, 45 break_before_nth_newline, 46 intercalate 47 ) where 48 49import Prelude hiding ( catch ) 50import qualified Data.ByteString as B 51import qualified Data.ByteString.Char8 as BC 52import qualified Data.ByteString.Internal as BI 53import Data.ByteString (intercalate, uncons) 54import Data.ByteString.Internal (fromForeignPtr) 55 56-- #if defined (HAVE_MMAP) || ! defined (HAVE_HASKELL_ZLIB) 57import Control.Exception ( catch ) 58-- #endif 59import System.IO 60import System.IO.Unsafe ( unsafePerformIO ) 61 62import Foreign.Storable ( peekElemOff, peek ) 63import Foreign.Marshal.Alloc ( free ) 64import Foreign.Marshal.Array ( mallocArray, peekArray, advancePtr ) 65import Foreign.C.Types ( CInt ) 66 67import Data.Bits ( rotateL ) 68import Data.Char ( chr, ord, isSpace ) 69import Data.Word ( Word8 ) 70import Data.Int ( Int32 ) 71import Control.Monad ( when ) 72 73-- #ifndef HAVE_HASKELL_ZLIB 74import Foreign.Ptr ( nullPtr ) 75import Foreign.ForeignPtr ( ForeignPtr ) 76-- #endif 77import Foreign.Ptr ( plusPtr, Ptr ) 78import Foreign.ForeignPtr ( withForeignPtr ) 79 80-- #ifdef DEBUG_PS 81import Foreign.ForeignPtr ( addForeignPtrFinalizer ) 82import Foreign.Ptr ( FunPtr ) 83-- #endif 84 85-- #if HAVE_HASKELL_ZLIB 86import qualified Data.ByteString.Lazy as BL 87import qualified Codec.Compression.GZip as GZ 88-- #else 89import Foreign.C.String ( CString, withCString ) 90-- #endif 91 92-- #ifdef HAVE_MMAP 93import System.IO.MMap( mmapFileByteString ) 94import System.Mem( performGC ) 95import System.Posix.Files( fileSize, getSymbolicLinkStatus ) 96-- #endif 97 98-- ----------------------------------------------------------------------------- 99-- obsolete debugging code 100 101-- # ifndef HAVE_HASKELL_ZLIB 102debugForeignPtr :: ForeignPtr a -> String -> IO () 103-- #ifdef DEBUG_PS 104foreign import ccall unsafe "static fpstring.h debug_alloc" debug_alloc 105 :: Ptr a -> CString -> IO () 106foreign import ccall unsafe "static fpstring.h & debug_free" debug_free 107 :: FunPtr (Ptr a -> IO ()) 108debugForeignPtr fp n = 109 withCString n $ \cname-> withForeignPtr fp $ \p-> 110 do debug_alloc p cname 111 addForeignPtrFinalizer debug_free fp 112-- #else 113debugForeignPtr _ _ = return () 114-- #endif 115-- #endif 116 117-- ----------------------------------------------------------------------------- 118-- unsafeWithInternals 119 120-- | Do something with the internals of a PackedString. Beware of 121-- altering the contents! 122unsafeWithInternals :: B.ByteString -> (Ptr Word8 -> Int -> IO a) -> IO a 123unsafeWithInternals ps f 124 = case BI.toForeignPtr ps of 125 (fp,s,l) -> withForeignPtr fp $ \p -> f (p `plusPtr` s) l 126 127-- | readIntPS skips any whitespace at the beginning of its argument, and 128-- reads an Int from the beginning of the PackedString. If there is no 129-- integer at the beginning of the string, it returns Nothing, otherwise it 130-- just returns the int read, along with a B.ByteString containing the 131-- remainder of its input. 132 133readIntPS :: B.ByteString -> Maybe (Int, B.ByteString) 134readIntPS = BC.readInt . BC.dropWhile isSpace 135 136-- ----------------------------------------------------------------------------- 137-- Destructor functions (taking PackedStrings apart) 138 139unpackPSfromUTF8 :: B.ByteString -> String 140unpackPSfromUTF8 ps = 141 case BI.toForeignPtr ps of 142 (_,_, 0) -> "" 143 (x,s,l) -> 144 unsafePerformIO $ withForeignPtr x $ \p-> 145 do outbuf <- mallocArray l 146 lout <- fromIntegral `fmap` 147 utf8_to_ints outbuf (p `plusPtr` s) (fromIntegral l) 148 when (lout < 0) $ error "Bad UTF8!" 149 str <- (map (chr . fromIntegral)) `fmap` peekArray lout outbuf 150 free outbuf 151 return str 152 153foreign import ccall unsafe "static fpstring.h utf8_to_ints" utf8_to_ints 154 :: Ptr Int -> Ptr Word8 -> CInt -> IO CInt 155 156-- ----------------------------------------------------------------------------- 157-- List-mimicking functions for PackedStrings 158 159{-# INLINE ifHeadThenTail #-} 160ifHeadThenTail :: Word8 -> B.ByteString -> Maybe B.ByteString 161ifHeadThenTail c s = case uncons s of 162 Just (w, t) | w == c -> Just t 163 _ -> Nothing 164 165------------------------------------------------------------------------ 166-- A reimplementation of Data.ByteString.Char8.dropSpace, but 167-- specialised to darcs' need for a 4 way isspace. 168-- 169-- TODO: if it is safe to use the expanded definition of isSpaceWord8 170-- provided by Data.ByteString.Char8, then all this can go. 171 172-- A locale-independent isspace(3) so patches are interpreted the same everywhere. 173-- ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) == '\r') 174isSpaceWord8 :: Word8 -> Bool 175isSpaceWord8 w = 176 w == 0x20 || -- ' ' 177 w == 0x09 || -- '\t' 178 w == 0x0A || -- '\n' 179 w == 0x0D -- '\r' 180{-# INLINE isSpaceWord8 #-} 181 182firstnonspace :: Ptr Word8 -> Int -> Int -> IO Int 183firstnonspace !ptr !n !m 184 | n >= m = return n 185 | otherwise = do w <- peekElemOff ptr n 186 if isSpaceWord8 w then firstnonspace ptr (n+1) m else return n 187 188firstspace :: Ptr Word8 -> Int -> Int -> IO Int 189firstspace !ptr !n !m 190 | n >= m = return n 191 | otherwise = do w <- peekElemOff ptr n 192 if (not . isSpaceWord8) w then firstspace ptr (n+1) m else return n 193 194-- | 'dropSpace' efficiently returns the 'ByteString' argument with 195-- white space Chars removed from the front. It is more efficient than 196-- calling dropWhile for removing whitespace. I.e. 197-- 198-- > dropWhile isSpace == dropSpace 199-- 200dropSpace :: B.ByteString -> B.ByteString 201dropSpace (BI.PS x s l) = BI.inlinePerformIO $ withForeignPtr x $ \p -> do 202 i <- firstnonspace (p `plusPtr` s) 0 l 203 return $! if i == l then B.empty else BI.PS x (s+i) (l-i) 204{-# INLINE dropSpace #-} 205 206-- | 'breakSpace' returns the pair of ByteStrings when the argument is 207-- broken at the first whitespace byte. I.e. 208-- 209-- > break isSpace == breakSpace 210-- 211breakSpace :: B.ByteString -> (B.ByteString,B.ByteString) 212breakSpace (BI.PS x s l) = BI.inlinePerformIO $ withForeignPtr x $ \p -> do 213 i <- firstspace (p `plusPtr` s) 0 l 214 return $! case () of {_ 215 | i == 0 -> (B.empty, BI.PS x s l) 216 | i == l -> (BI.PS x s l, B.empty) 217 | otherwise -> (BI.PS x s i, BI.PS x (s+i) (l-i)) 218 } 219{-# INLINE breakSpace #-} 220 221------------------------------------------------------------------------ 222 223{-# INLINE is_funky #-} 224is_funky :: B.ByteString -> Bool 225is_funky ps = case BI.toForeignPtr ps of 226 (x,s,l) -> 227 unsafePerformIO $ withForeignPtr x $ \p-> 228 (/=0) `fmap` has_funky_char (p `plusPtr` s) (fromIntegral l) 229 230foreign import ccall unsafe "fpstring.h has_funky_char" has_funky_char 231 :: Ptr Word8 -> CInt -> IO CInt 232 233------------------------------------------------------------------------ 234 235-- ByteString rewrites break (=='x') to breakByte 'x' 236-- break ((==) x) = breakChar x 237-- break (==x) = breakChar x 238-- 239 240{- 241{-# INLINE breakOnPS #-} 242breakOnPS :: Char -> B.ByteString -> (B.ByteString, B.ByteString) 243breakOnPS c p = case BC.elemIndex c p of 244 Nothing -> (p, BC.empty) 245 Just n -> (B.take n p, B.drop n p) 246-} 247 248{-# INLINE hashPS #-} 249hashPS :: B.ByteString -> Int32 250hashPS ps = 251 case BI.toForeignPtr ps of 252 (x,s,l) -> 253 unsafePerformIO $ withForeignPtr x $ \p-> 254 do hash (p `plusPtr` s) l 255 256hash :: Ptr Word8 -> Int -> IO Int32 257hash ptr len = f (0 :: Int32) ptr len 258 where f h _ 0 = return h 259 f h p n = do x <- peek p 260 let !h' = (fromIntegral x) + (rotateL h 8) 261 f h' (p `advancePtr` 1) (n-1) 262 263{-# INLINE substrPS #-} 264substrPS :: B.ByteString -> B.ByteString -> Maybe Int 265substrPS tok str 266 | B.null tok = Just 0 267 | B.length tok > B.length str = Nothing 268 | otherwise = do n <- BC.elemIndex (BC.head tok) str 269 let ttok = B.tail tok 270 reststr = B.drop (n+1) str 271 if ttok == B.take (B.length ttok) reststr 272 then Just n 273 else ((n+1)+) `fmap` substrPS tok reststr 274 275------------------------------------------------------------------------ 276 277-- TODO: replace breakFirstPS and breakLastPS with definitions based on 278-- ByteString's break/breakEnd 279{-# INLINE breakFirstPS #-} 280breakFirstPS :: Char -> B.ByteString -> Maybe (B.ByteString,B.ByteString) 281breakFirstPS c p = case BC.elemIndex c p of 282 Nothing -> Nothing 283 Just n -> Just (B.take n p, B.drop (n+1) p) 284 285{-# INLINE breakLastPS #-} 286breakLastPS :: Char -> B.ByteString -> Maybe (B.ByteString,B.ByteString) 287breakLastPS c p = case BC.elemIndexEnd c p of 288 Nothing -> Nothing 289 Just n -> Just (B.take n p, B.drop (n+1) p) 290 291-- TODO: rename 292{-# INLINE linesPS #-} 293linesPS :: B.ByteString -> [B.ByteString] 294linesPS ps 295 | B.null ps = [B.empty] 296 | otherwise = BC.split '\n' ps 297 298{- QuickCheck property: 299 300import Test.QuickCheck 301import qualified Data.ByteString.Char8 as BC 302import Data.Char 303instance Arbitrary BC.ByteString where 304 arbitrary = fmap BC.pack arbitrary 305instance Arbitrary Char where 306 arbitrary = chr `fmap` choose (32,127) 307deepCheck = check (defaultConfig { configMaxTest = 10000}) 308testLines = deepCheck (\x -> (linesPS x == linesPSOld x)) 309linesPSOld ps = case BC.elemIndex '\n' ps of 310 Nothing -> [ps] 311 Just n -> B.take n ps : linesPS (B.drop (n+1) ps) -} 312 313{-| This function acts exactly like the "Prelude" unlines function, or like 314"Data.ByteString.Char8" 'unlines', but with one important difference: it will 315produce a string which may not end with a newline! That is: 316 317> unlinesPS ["foo", "bar"] 318 319evaluates to \"foo\\nbar\", not \"foo\\nbar\\n\"! This point should hold true for 320'linesPS' as well. 321 322TODO: rename this function. -} 323unlinesPS :: [B.ByteString] -> B.ByteString 324unlinesPS [] = BC.empty 325unlinesPS x = BC.init $ BC.unlines x 326{-# INLINE unlinesPS #-} 327{- QuickCheck property: 328 329testUnlines = deepCheck (\x -> (unlinesPS x == unlinesPSOld x)) 330unlinesPSOld ss = BC.concat $ intersperse_newlines ss 331 where intersperse_newlines (a:b:s) = a : newline : intersperse_newlines (b:s) 332 intersperse_newlines s = s 333 newline = BC.pack "\n" -} 334 335-- ----------------------------------------------------------------------------- 336-- gzReadFilePS 337 338-- | Read an entire file, which may or may not be gzip compressed, directly 339-- into a 'B.ByteString'. 340 341-- #ifndef HAVE_HASKELL_ZLIB 342foreign import ccall unsafe "static zlib.h gzopen" c_gzopen 343 :: CString -> CString -> IO (Ptr ()) 344foreign import ccall unsafe "static zlib.h gzclose" c_gzclose 345 :: Ptr () -> IO () 346foreign import ccall unsafe "static zlib.h gzread" c_gzread 347 :: Ptr () -> Ptr Word8 -> CInt -> IO CInt 348foreign import ccall unsafe "static zlib.h gzwrite" c_gzwrite 349 :: Ptr () -> Ptr Word8 -> CInt -> IO CInt 350-- #endif 351 352gzReadFilePS :: FilePath -> IO B.ByteString 353gzReadFilePS f = do 354 h <- openBinaryFile f ReadMode 355 header <- B.hGet h 2 356 if header /= BC.pack "\31\139" 357 then do hClose h 358 mmapFilePS f 359 else do hSeek h SeekFromEnd (-4) 360 len <- hGetLittleEndInt h 361 hClose h 362-- #ifdef HAVE_HASKELL_ZLIB 363 -- Passing the length to GZ.decompressWith means 364 -- that BL.toChunks only produces one chunk, which in turn 365 -- means that B.concat won't need to copy data. 366 -- If the length is wrong this will just affect efficiency, not correctness 367 let decompress = GZ.decompressWith GZ.defaultDecompressParams { 368 GZ.decompressBufferSize = len 369 } 370 fmap (B.concat . BL.toChunks . decompress) $ 371-- #ifdef HAVE_OLD_BYTESTRING 372 -- bytestring < 0.9.1 had a bug where it did not know to close handles upon EOF 373 -- performance would be better with a newer bytestring and lazy 374 -- readFile below -- ratify readFile: comment 375 fmap (BL.fromChunks . (:[])) $ 376 B.readFile f -- ratify readFile: immediately consumed 377-- #else 378 BL.readFile f -- ratify readFile: immediately consumed by the conversion to a strict bytestring 379-- #endif 380-- #else 381 withCString f $ \fstr-> withCString "rb" $ \rb-> do 382 gzf <- c_gzopen fstr rb 383 when (gzf == nullPtr) $ fail $ "problem opening file "++f 384 fp <- BI.mallocByteString len 385 debugForeignPtr fp $ "gzReadFilePS "++f 386 lread <- withForeignPtr fp $ \p -> 387 c_gzread gzf p (fromIntegral len) 388 c_gzclose gzf 389 when (fromIntegral lread /= len) $ 390 fail $ "problem gzreading file "++f 391 return $ fromForeignPtr fp 0 len 392-- #endif 393 394hGetLittleEndInt :: Handle -> IO Int 395hGetLittleEndInt h = do 396 b1 <- ord `fmap` hGetChar h 397 b2 <- ord `fmap` hGetChar h 398 b3 <- ord `fmap` hGetChar h 399 b4 <- ord `fmap` hGetChar h 400 return $ b1 + 256*b2 + 65536*b3 + 16777216*b4 401 402gzWriteFilePS :: FilePath -> B.ByteString -> IO () 403gzWriteFilePS f ps = gzWriteFilePSs f [ps] 404 405gzWriteFilePSs :: FilePath -> [B.ByteString] -> IO () 406gzWriteFilePSs f pss = 407-- #ifdef HAVE_HASKELL_ZLIB 408 BL.writeFile f $ GZ.compress $ BL.fromChunks pss 409-- #else 410 withCString f $ \fstr -> withCString "wb" $ \wb -> do 411 gzf <- c_gzopen fstr wb 412 when (gzf == nullPtr) $ fail $ "problem gzopening file for write: "++f 413 mapM_ (gzWriteToGzf gzf) pss `catch` 414 \_ -> fail $ "problem gzwriting file: "++f 415 c_gzclose gzf 416 417gzWriteToGzf :: Ptr () -> B.ByteString -> IO () 418gzWriteToGzf gzf ps = case BI.toForeignPtr ps of 419 (_,_,0) -> return () -- avoid calling gzwrite with 0 length this would 420 -- trouble on some versions of zlib, and is always 421 -- unnecessary. 422 (x,s,l) -> do 423 lw <- withForeignPtr x $ \p -> c_gzwrite gzf (p `plusPtr` s) 424 (fromIntegral l) 425 when (fromIntegral lw /= l) $ fail $ "problem in gzWriteToGzf" 426-- #endif 427 428-- ----------------------------------------------------------------------------- 429-- mmapFilePS 430 431-- | Like readFilePS, this reads an entire file directly into a 432-- 'B.ByteString', but it is even more efficient. It involves directly 433-- mapping the file to memory. This has the advantage that the contents of 434-- the file never need to be copied. Also, under memory pressure the page 435-- may simply be discarded, wile in the case of readFilePS it would need to 436-- be written to swap. If you read many small files, mmapFilePS will be 437-- less memory-efficient than readFilePS, since each mmapFilePS takes up a 438-- separate page of memory. Also, you can run into bus errors if the file 439-- is modified. NOTE: as with 'readFilePS', the string representation in 440-- the file is assumed to be ISO-8859-1. 441 442mmapFilePS :: FilePath -> IO B.ByteString 443-- #ifdef HAVE_MMAP 444mmapFilePS f = do 445 x <- mmapFileByteString f Nothing 446 `catch` (\_ -> do 447 size <- fileSize `fmap` getSymbolicLinkStatus f 448 if size == 0 449 then return B.empty 450 else performGC >> mmapFileByteString f Nothing) 451 return x 452-- #else 453mmapFilePS = B.readFile 454-- #endif 455 456-- ------------------------------------------------------------------------- 457-- fromPS2Hex 458 459foreign import ccall unsafe "static fpstring.h conv_to_hex" conv_to_hex 460 :: Ptr Word8 -> Ptr Word8 -> CInt -> IO () 461 462fromPS2Hex :: B.ByteString -> B.ByteString 463fromPS2Hex ps = case BI.toForeignPtr ps of 464 (x,s,l) -> 465 BI.unsafeCreate (2*l) $ \p -> withForeignPtr x $ \f -> 466 conv_to_hex p (f `plusPtr` s) $ fromIntegral l 467 468-- ------------------------------------------------------------------------- 469-- fromHex2PS 470 471foreign import ccall unsafe "static fpstring.h conv_from_hex" conv_from_hex 472 :: Ptr Word8 -> Ptr Word8 -> CInt -> IO () 473 474fromHex2PS :: B.ByteString -> B.ByteString 475fromHex2PS ps = case BI.toForeignPtr ps of 476 (x,s,l) -> 477 BI.unsafeCreate (l `div` 2) $ \p -> withForeignPtr x $ \f -> 478 conv_from_hex p (f `plusPtr` s) (fromIntegral $ l `div` 2) 479 480-- ------------------------------------------------------------------------- 481-- betweenLinesPS 482 483-- | betweenLinesPS returns the B.ByteString between the two lines given, 484-- or Nothing if they do not appear. 485 486betweenLinesPS :: B.ByteString -> B.ByteString -> B.ByteString 487 -> Maybe (B.ByteString) 488betweenLinesPS start end ps 489 = case break (start ==) (linesPS ps) of 490 (_, _:rest@(bs1:_)) -> 491 case BI.toForeignPtr bs1 of 492 (ps1,s1,_) -> 493 case break (end ==) rest of 494 (_, bs2:_) -> case BI.toForeignPtr bs2 of (_,s2,_) -> Just $ fromForeignPtr ps1 s1 (s2 - s1) 495 _ -> Nothing 496 _ -> Nothing 497 498-- ------------------------------------------------------------------------- 499-- break_after_nth_newline 500 501break_after_nth_newline :: Int -> B.ByteString 502 -> Maybe (B.ByteString, B.ByteString) 503break_after_nth_newline 0 the_ps | B.null the_ps = Just (B.empty, B.empty) 504break_after_nth_newline n the_ps = 505 case BI.toForeignPtr the_ps of 506 (fp,the_s,l) -> 507 unsafePerformIO $ withForeignPtr fp $ \p -> 508 do let findit 0 s | s == end = return $ Just (the_ps, B.empty) 509 findit _ s | s == end = return Nothing 510 findit 0 s = let left_l = s - the_s 511 in return $ Just (fromForeignPtr fp the_s left_l, 512 fromForeignPtr fp s (l - left_l)) 513 findit i s = do w <- peekElemOff p s 514 if w == nl then findit (i-1) (s+1) 515 else findit i (s+1) 516 nl = BI.c2w '\n' 517 end = the_s + l 518 findit n the_s 519 520-- ------------------------------------------------------------------------- 521-- break_before_nth_newline 522 523break_before_nth_newline :: Int -> B.ByteString -> (B.ByteString, B.ByteString) 524break_before_nth_newline 0 the_ps 525 | B.null the_ps = (B.empty, B.empty) 526break_before_nth_newline n the_ps = 527 case BI.toForeignPtr the_ps of 528 (fp,the_s,l) -> 529 unsafePerformIO $ withForeignPtr fp $ \p -> 530 do let findit _ s | s == end = return (the_ps, B.empty) 531 findit i s = do w <- peekElemOff p s 532 if w == nl 533 then if i == 0 534 then let left_l = s - the_s 535 in return (fromForeignPtr fp the_s left_l, 536 fromForeignPtr fp s (l - left_l)) 537 else findit (i-1) (s+1) 538 else findit i (s+1) 539 nl = BI.c2w '\n' 540 end = the_s + l 541 findit n the_s 542