1{- 2(c) The University of Glasgow 2006 3(c) The University of Glasgow, 1997-2006 4 5 6Buffers for scanning string input stored in external arrays. 7-} 8 9{-# LANGUAGE BangPatterns, CPP, MagicHash, UnboxedTuples #-} 10{-# OPTIONS_GHC -O2 #-} 11-- We always optimise this, otherwise performance of a non-optimised 12-- compiler is severely affected 13 14module StringBuffer 15 ( 16 StringBuffer(..), 17 -- non-abstract for vs\/HaskellService 18 19 -- * Creation\/destruction 20 hGetStringBuffer, 21 hGetStringBufferBlock, 22 hPutStringBuffer, 23 appendStringBuffers, 24 stringToStringBuffer, 25 26 -- * Inspection 27 nextChar, 28 currentChar, 29 prevChar, 30 atEnd, 31 32 -- * Moving and comparison 33 stepOn, 34 offsetBytes, 35 byteDiff, 36 atLine, 37 38 -- * Conversion 39 lexemeToString, 40 lexemeToFastString, 41 decodePrevNChars, 42 43 -- * Parsing integers 44 parseUnsignedInteger, 45 ) where 46 47#include "GhclibHsVersions.h" 48 49import GhcPrelude 50 51import Encoding 52import FastString 53import FastFunctions 54import PlainPanic 55import Util 56 57import Data.Maybe 58import Control.Exception 59import System.IO 60import System.IO.Unsafe ( unsafePerformIO ) 61import GHC.IO.Encoding.UTF8 ( mkUTF8 ) 62import GHC.IO.Encoding.Failure ( CodingFailureMode(IgnoreCodingFailure) ) 63 64import GHC.Exts 65 66import Foreign 67 68-- ----------------------------------------------------------------------------- 69-- The StringBuffer type 70 71-- |A StringBuffer is an internal pointer to a sized chunk of bytes. 72-- The bytes are intended to be *immutable*. There are pure 73-- operations to read the contents of a StringBuffer. 74-- 75-- A StringBuffer may have a finalizer, depending on how it was 76-- obtained. 77-- 78data StringBuffer 79 = StringBuffer { 80 buf :: {-# UNPACK #-} !(ForeignPtr Word8), 81 len :: {-# UNPACK #-} !Int, -- length 82 cur :: {-# UNPACK #-} !Int -- current pos 83 } 84 -- The buffer is assumed to be UTF-8 encoded, and furthermore 85 -- we add three @\'\\0\'@ bytes to the end as sentinels so that the 86 -- decoder doesn't have to check for overflow at every single byte 87 -- of a multibyte sequence. 88 89instance Show StringBuffer where 90 showsPrec _ s = showString "<stringbuffer(" 91 . shows (len s) . showString "," . shows (cur s) 92 . showString ")>" 93 94-- ----------------------------------------------------------------------------- 95-- Creation / Destruction 96 97-- | Read a file into a 'StringBuffer'. The resulting buffer is automatically 98-- managed by the garbage collector. 99hGetStringBuffer :: FilePath -> IO StringBuffer 100hGetStringBuffer fname = do 101 h <- openBinaryFile fname ReadMode 102 size_i <- hFileSize h 103 offset_i <- skipBOM h size_i 0 -- offset is 0 initially 104 let size = fromIntegral $ size_i - offset_i 105 buf <- mallocForeignPtrArray (size+3) 106 withForeignPtr buf $ \ptr -> do 107 r <- if size == 0 then return 0 else hGetBuf h ptr size 108 hClose h 109 if (r /= size) 110 then ioError (userError "short read of file") 111 else newUTF8StringBuffer buf ptr size 112 113hGetStringBufferBlock :: Handle -> Int -> IO StringBuffer 114hGetStringBufferBlock handle wanted 115 = do size_i <- hFileSize handle 116 offset_i <- hTell handle >>= skipBOM handle size_i 117 let size = min wanted (fromIntegral $ size_i-offset_i) 118 buf <- mallocForeignPtrArray (size+3) 119 withForeignPtr buf $ \ptr -> 120 do r <- if size == 0 then return 0 else hGetBuf handle ptr size 121 if r /= size 122 then ioError (userError $ "short read of file: "++show(r,size,size_i,handle)) 123 else newUTF8StringBuffer buf ptr size 124 125hPutStringBuffer :: Handle -> StringBuffer -> IO () 126hPutStringBuffer hdl (StringBuffer buf len cur) 127 = do withForeignPtr (plusForeignPtr buf cur) $ \ptr -> 128 hPutBuf hdl ptr len 129 130-- | Skip the byte-order mark if there is one (see #1744 and #6016), 131-- and return the new position of the handle in bytes. 132-- 133-- This is better than treating #FEFF as whitespace, 134-- because that would mess up layout. We don't have a concept 135-- of zero-width whitespace in Haskell: all whitespace codepoints 136-- have a width of one column. 137skipBOM :: Handle -> Integer -> Integer -> IO Integer 138skipBOM h size offset = 139 -- Only skip BOM at the beginning of a file. 140 if size > 0 && offset == 0 141 then do 142 -- Validate assumption that handle is in binary mode. 143 ASSERTM( hGetEncoding h >>= return . isNothing ) 144 -- Temporarily select utf8 encoding with error ignoring, 145 -- to make `hLookAhead` and `hGetChar` return full Unicode characters. 146 bracket_ (hSetEncoding h safeEncoding) (hSetBinaryMode h True) $ do 147 c <- hLookAhead h 148 if c == '\xfeff' 149 then hGetChar h >> hTell h 150 else return offset 151 else return offset 152 where 153 safeEncoding = mkUTF8 IgnoreCodingFailure 154 155newUTF8StringBuffer :: ForeignPtr Word8 -> Ptr Word8 -> Int -> IO StringBuffer 156newUTF8StringBuffer buf ptr size = do 157 pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0] 158 -- sentinels for UTF-8 decoding 159 return $ StringBuffer buf size 0 160 161appendStringBuffers :: StringBuffer -> StringBuffer -> IO StringBuffer 162appendStringBuffers sb1 sb2 163 = do newBuf <- mallocForeignPtrArray (size+3) 164 withForeignPtr newBuf $ \ptr -> 165 withForeignPtr (buf sb1) $ \sb1Ptr -> 166 withForeignPtr (buf sb2) $ \sb2Ptr -> 167 do copyArray ptr (sb1Ptr `advancePtr` cur sb1) sb1_len 168 copyArray (ptr `advancePtr` sb1_len) (sb2Ptr `advancePtr` cur sb2) sb2_len 169 pokeArray (ptr `advancePtr` size) [0,0,0] 170 return (StringBuffer newBuf size 0) 171 where sb1_len = calcLen sb1 172 sb2_len = calcLen sb2 173 calcLen sb = len sb - cur sb 174 size = sb1_len + sb2_len 175 176-- | Encode a 'String' into a 'StringBuffer' as UTF-8. The resulting buffer 177-- is automatically managed by the garbage collector. 178stringToStringBuffer :: String -> StringBuffer 179stringToStringBuffer str = 180 unsafePerformIO $ do 181 let size = utf8EncodedLength str 182 buf <- mallocForeignPtrArray (size+3) 183 withForeignPtr buf $ \ptr -> do 184 utf8EncodeString ptr str 185 pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0] 186 -- sentinels for UTF-8 decoding 187 return (StringBuffer buf size 0) 188 189-- ----------------------------------------------------------------------------- 190-- Grab a character 191 192-- | Return the first UTF-8 character of a nonempty 'StringBuffer' and as well 193-- the remaining portion (analogous to 'Data.List.uncons'). __Warning:__ The 194-- behavior is undefined if the 'StringBuffer' is empty. The result shares 195-- the same buffer as the original. Similar to 'utf8DecodeChar', if the 196-- character cannot be decoded as UTF-8, @\'\\0\'@ is returned. 197{-# INLINE nextChar #-} 198nextChar :: StringBuffer -> (Char,StringBuffer) 199nextChar (StringBuffer buf len (I# cur#)) = 200 -- Getting our fingers dirty a little here, but this is performance-critical 201 inlinePerformIO $ do 202 withForeignPtr buf $ \(Ptr a#) -> do 203 case utf8DecodeChar# (a# `plusAddr#` cur#) of 204 (# c#, nBytes# #) -> 205 let cur' = I# (cur# +# nBytes#) in 206 return (C# c#, StringBuffer buf len cur') 207 208-- | Return the first UTF-8 character of a nonempty 'StringBuffer' (analogous 209-- to 'Data.List.head'). __Warning:__ The behavior is undefined if the 210-- 'StringBuffer' is empty. Similar to 'utf8DecodeChar', if the character 211-- cannot be decoded as UTF-8, @\'\\0\'@ is returned. 212currentChar :: StringBuffer -> Char 213currentChar = fst . nextChar 214 215prevChar :: StringBuffer -> Char -> Char 216prevChar (StringBuffer _ _ 0) deflt = deflt 217prevChar (StringBuffer buf _ cur) _ = 218 inlinePerformIO $ do 219 withForeignPtr buf $ \p -> do 220 p' <- utf8PrevChar (p `plusPtr` cur) 221 return (fst (utf8DecodeChar p')) 222 223-- ----------------------------------------------------------------------------- 224-- Moving 225 226-- | Return a 'StringBuffer' with the first UTF-8 character removed (analogous 227-- to 'Data.List.tail'). __Warning:__ The behavior is undefined if the 228-- 'StringBuffer' is empty. The result shares the same buffer as the 229-- original. 230stepOn :: StringBuffer -> StringBuffer 231stepOn s = snd (nextChar s) 232 233-- | Return a 'StringBuffer' with the first @n@ bytes removed. __Warning:__ 234-- If there aren't enough characters, the returned 'StringBuffer' will be 235-- invalid and any use of it may lead to undefined behavior. The result 236-- shares the same buffer as the original. 237offsetBytes :: Int -- ^ @n@, the number of bytes 238 -> StringBuffer 239 -> StringBuffer 240offsetBytes i s = s { cur = cur s + i } 241 242-- | Compute the difference in offset between two 'StringBuffer's that share 243-- the same buffer. __Warning:__ The behavior is undefined if the 244-- 'StringBuffer's use separate buffers. 245byteDiff :: StringBuffer -> StringBuffer -> Int 246byteDiff s1 s2 = cur s2 - cur s1 247 248-- | Check whether a 'StringBuffer' is empty (analogous to 'Data.List.null'). 249atEnd :: StringBuffer -> Bool 250atEnd (StringBuffer _ l c) = l == c 251 252-- | Computes a 'StringBuffer' which points to the first character of the 253-- wanted line. Lines begin at 1. 254atLine :: Int -> StringBuffer -> Maybe StringBuffer 255atLine line sb@(StringBuffer buf len _) = 256 inlinePerformIO $ 257 withForeignPtr buf $ \p -> do 258 p' <- skipToLine line len p 259 if p' == nullPtr 260 then return Nothing 261 else 262 let 263 delta = p' `minusPtr` p 264 in return $ Just (sb { cur = delta 265 , len = len - delta 266 }) 267 268skipToLine :: Int -> Int -> Ptr Word8 -> IO (Ptr Word8) 269skipToLine !line !len !op0 = go 1 op0 270 where 271 !opend = op0 `plusPtr` len 272 273 go !i_line !op 274 | op >= opend = pure nullPtr 275 | i_line == line = pure op 276 | otherwise = do 277 w <- peek op :: IO Word8 278 case w of 279 10 -> go (i_line + 1) (plusPtr op 1) 280 13 -> do 281 -- this is safe because a 'StringBuffer' is 282 -- guaranteed to have 3 bytes sentinel values. 283 w' <- peek (plusPtr op 1) :: IO Word8 284 case w' of 285 10 -> go (i_line + 1) (plusPtr op 2) 286 _ -> go (i_line + 1) (plusPtr op 1) 287 _ -> go i_line (plusPtr op 1) 288 289-- ----------------------------------------------------------------------------- 290-- Conversion 291 292-- | Decode the first @n@ bytes of a 'StringBuffer' as UTF-8 into a 'String'. 293-- Similar to 'utf8DecodeChar', if the character cannot be decoded as UTF-8, 294-- they will be replaced with @\'\\0\'@. 295lexemeToString :: StringBuffer 296 -> Int -- ^ @n@, the number of bytes 297 -> String 298lexemeToString _ 0 = "" 299lexemeToString (StringBuffer buf _ cur) bytes = 300 utf8DecodeStringLazy buf cur bytes 301 302lexemeToFastString :: StringBuffer 303 -> Int -- ^ @n@, the number of bytes 304 -> FastString 305lexemeToFastString _ 0 = nilFS 306lexemeToFastString (StringBuffer buf _ cur) len = 307 inlinePerformIO $ 308 withForeignPtr buf $ \ptr -> 309 return $! mkFastStringBytes (ptr `plusPtr` cur) len 310 311-- | Return the previous @n@ characters (or fewer if we are less than @n@ 312-- characters into the buffer. 313decodePrevNChars :: Int -> StringBuffer -> String 314decodePrevNChars n (StringBuffer buf _ cur) = 315 inlinePerformIO $ withForeignPtr buf $ \p0 -> 316 go p0 n "" (p0 `plusPtr` (cur - 1)) 317 where 318 go :: Ptr Word8 -> Int -> String -> Ptr Word8 -> IO String 319 go buf0 n acc p | n == 0 || buf0 >= p = return acc 320 go buf0 n acc p = do 321 p' <- utf8PrevChar p 322 let (c,_) = utf8DecodeChar p' 323 go buf0 (n - 1) (c:acc) p' 324 325-- ----------------------------------------------------------------------------- 326-- Parsing integer strings in various bases 327parseUnsignedInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer 328parseUnsignedInteger (StringBuffer buf _ cur) len radix char_to_int 329 = inlinePerformIO $ withForeignPtr buf $ \ptr -> return $! let 330 go i x | i == len = x 331 | otherwise = case fst (utf8DecodeChar (ptr `plusPtr` (cur + i))) of 332 '_' -> go (i + 1) x -- skip "_" (#14473) 333 char -> go (i + 1) (x * radix + toInteger (char_to_int char)) 334 in go 0 0 335