1{-
2(c) The University of Glasgow 2006
3(c) The University of Glasgow, 1997-2006
4
5
6Buffers for scanning string input stored in external arrays.
7-}
8
9{-# LANGUAGE BangPatterns, CPP, MagicHash, UnboxedTuples #-}
10{-# OPTIONS_GHC -O2 #-}
11-- We always optimise this, otherwise performance of a non-optimised
12-- compiler is severely affected
13
14module StringBuffer
15       (
16        StringBuffer(..),
17        -- non-abstract for vs\/HaskellService
18
19         -- * Creation\/destruction
20        hGetStringBuffer,
21        hGetStringBufferBlock,
22        hPutStringBuffer,
23        appendStringBuffers,
24        stringToStringBuffer,
25
26        -- * Inspection
27        nextChar,
28        currentChar,
29        prevChar,
30        atEnd,
31
32        -- * Moving and comparison
33        stepOn,
34        offsetBytes,
35        byteDiff,
36        atLine,
37
38        -- * Conversion
39        lexemeToString,
40        lexemeToFastString,
41        decodePrevNChars,
42
43         -- * Parsing integers
44        parseUnsignedInteger,
45       ) where
46
47#include "GhclibHsVersions.h"
48
49import GhcPrelude
50
51import Encoding
52import FastString
53import FastFunctions
54import PlainPanic
55import Util
56
57import Data.Maybe
58import Control.Exception
59import System.IO
60import System.IO.Unsafe         ( unsafePerformIO )
61import GHC.IO.Encoding.UTF8     ( mkUTF8 )
62import GHC.IO.Encoding.Failure  ( CodingFailureMode(IgnoreCodingFailure) )
63
64import GHC.Exts
65
66import Foreign
67
68-- -----------------------------------------------------------------------------
69-- The StringBuffer type
70
71-- |A StringBuffer is an internal pointer to a sized chunk of bytes.
72-- The bytes are intended to be *immutable*.  There are pure
73-- operations to read the contents of a StringBuffer.
74--
75-- A StringBuffer may have a finalizer, depending on how it was
76-- obtained.
77--
78data StringBuffer
79 = StringBuffer {
80     buf :: {-# UNPACK #-} !(ForeignPtr Word8),
81     len :: {-# UNPACK #-} !Int,        -- length
82     cur :: {-# UNPACK #-} !Int         -- current pos
83  }
84  -- The buffer is assumed to be UTF-8 encoded, and furthermore
85  -- we add three @\'\\0\'@ bytes to the end as sentinels so that the
86  -- decoder doesn't have to check for overflow at every single byte
87  -- of a multibyte sequence.
88
89instance Show StringBuffer where
90        showsPrec _ s = showString "<stringbuffer("
91                      . shows (len s) . showString "," . shows (cur s)
92                      . showString ")>"
93
94-- -----------------------------------------------------------------------------
95-- Creation / Destruction
96
97-- | Read a file into a 'StringBuffer'.  The resulting buffer is automatically
98-- managed by the garbage collector.
99hGetStringBuffer :: FilePath -> IO StringBuffer
100hGetStringBuffer fname = do
101   h <- openBinaryFile fname ReadMode
102   size_i <- hFileSize h
103   offset_i <- skipBOM h size_i 0  -- offset is 0 initially
104   let size = fromIntegral $ size_i - offset_i
105   buf <- mallocForeignPtrArray (size+3)
106   withForeignPtr buf $ \ptr -> do
107     r <- if size == 0 then return 0 else hGetBuf h ptr size
108     hClose h
109     if (r /= size)
110        then ioError (userError "short read of file")
111        else newUTF8StringBuffer buf ptr size
112
113hGetStringBufferBlock :: Handle -> Int -> IO StringBuffer
114hGetStringBufferBlock handle wanted
115    = do size_i <- hFileSize handle
116         offset_i <- hTell handle >>= skipBOM handle size_i
117         let size = min wanted (fromIntegral $ size_i-offset_i)
118         buf <- mallocForeignPtrArray (size+3)
119         withForeignPtr buf $ \ptr ->
120             do r <- if size == 0 then return 0 else hGetBuf handle ptr size
121                if r /= size
122                   then ioError (userError $ "short read of file: "++show(r,size,size_i,handle))
123                   else newUTF8StringBuffer buf ptr size
124
125hPutStringBuffer :: Handle -> StringBuffer -> IO ()
126hPutStringBuffer hdl (StringBuffer buf len cur)
127    = do withForeignPtr (plusForeignPtr buf cur) $ \ptr ->
128             hPutBuf hdl ptr len
129
130-- | Skip the byte-order mark if there is one (see #1744 and #6016),
131-- and return the new position of the handle in bytes.
132--
133-- This is better than treating #FEFF as whitespace,
134-- because that would mess up layout.  We don't have a concept
135-- of zero-width whitespace in Haskell: all whitespace codepoints
136-- have a width of one column.
137skipBOM :: Handle -> Integer -> Integer -> IO Integer
138skipBOM h size offset =
139  -- Only skip BOM at the beginning of a file.
140  if size > 0 && offset == 0
141    then do
142      -- Validate assumption that handle is in binary mode.
143      ASSERTM( hGetEncoding h >>= return . isNothing )
144      -- Temporarily select utf8 encoding with error ignoring,
145      -- to make `hLookAhead` and `hGetChar` return full Unicode characters.
146      bracket_ (hSetEncoding h safeEncoding) (hSetBinaryMode h True) $ do
147        c <- hLookAhead h
148        if c == '\xfeff'
149          then hGetChar h >> hTell h
150          else return offset
151    else return offset
152  where
153    safeEncoding = mkUTF8 IgnoreCodingFailure
154
155newUTF8StringBuffer :: ForeignPtr Word8 -> Ptr Word8 -> Int -> IO StringBuffer
156newUTF8StringBuffer buf ptr size = do
157  pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
158  -- sentinels for UTF-8 decoding
159  return $ StringBuffer buf size 0
160
161appendStringBuffers :: StringBuffer -> StringBuffer -> IO StringBuffer
162appendStringBuffers sb1 sb2
163    = do newBuf <- mallocForeignPtrArray (size+3)
164         withForeignPtr newBuf $ \ptr ->
165          withForeignPtr (buf sb1) $ \sb1Ptr ->
166           withForeignPtr (buf sb2) $ \sb2Ptr ->
167             do copyArray ptr (sb1Ptr `advancePtr` cur sb1) sb1_len
168                copyArray (ptr `advancePtr` sb1_len) (sb2Ptr `advancePtr` cur sb2) sb2_len
169                pokeArray (ptr `advancePtr` size) [0,0,0]
170                return (StringBuffer newBuf size 0)
171    where sb1_len = calcLen sb1
172          sb2_len = calcLen sb2
173          calcLen sb = len sb - cur sb
174          size =  sb1_len + sb2_len
175
176-- | Encode a 'String' into a 'StringBuffer' as UTF-8.  The resulting buffer
177-- is automatically managed by the garbage collector.
178stringToStringBuffer :: String -> StringBuffer
179stringToStringBuffer str =
180 unsafePerformIO $ do
181  let size = utf8EncodedLength str
182  buf <- mallocForeignPtrArray (size+3)
183  withForeignPtr buf $ \ptr -> do
184    utf8EncodeString ptr str
185    pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
186    -- sentinels for UTF-8 decoding
187  return (StringBuffer buf size 0)
188
189-- -----------------------------------------------------------------------------
190-- Grab a character
191
192-- | Return the first UTF-8 character of a nonempty 'StringBuffer' and as well
193-- the remaining portion (analogous to 'Data.List.uncons').  __Warning:__ The
194-- behavior is undefined if the 'StringBuffer' is empty.  The result shares
195-- the same buffer as the original.  Similar to 'utf8DecodeChar', if the
196-- character cannot be decoded as UTF-8, @\'\\0\'@ is returned.
197{-# INLINE nextChar #-}
198nextChar :: StringBuffer -> (Char,StringBuffer)
199nextChar (StringBuffer buf len (I# cur#)) =
200  -- Getting our fingers dirty a little here, but this is performance-critical
201  inlinePerformIO $ do
202    withForeignPtr buf $ \(Ptr a#) -> do
203        case utf8DecodeChar# (a# `plusAddr#` cur#) of
204          (# c#, nBytes# #) ->
205             let cur' = I# (cur# +# nBytes#) in
206             return (C# c#, StringBuffer buf len cur')
207
208-- | Return the first UTF-8 character of a nonempty 'StringBuffer' (analogous
209-- to 'Data.List.head').  __Warning:__ The behavior is undefined if the
210-- 'StringBuffer' is empty.  Similar to 'utf8DecodeChar', if the character
211-- cannot be decoded as UTF-8, @\'\\0\'@ is returned.
212currentChar :: StringBuffer -> Char
213currentChar = fst . nextChar
214
215prevChar :: StringBuffer -> Char -> Char
216prevChar (StringBuffer _   _   0)   deflt = deflt
217prevChar (StringBuffer buf _   cur) _     =
218  inlinePerformIO $ do
219    withForeignPtr buf $ \p -> do
220      p' <- utf8PrevChar (p `plusPtr` cur)
221      return (fst (utf8DecodeChar p'))
222
223-- -----------------------------------------------------------------------------
224-- Moving
225
226-- | Return a 'StringBuffer' with the first UTF-8 character removed (analogous
227-- to 'Data.List.tail').  __Warning:__ The behavior is undefined if the
228-- 'StringBuffer' is empty.  The result shares the same buffer as the
229-- original.
230stepOn :: StringBuffer -> StringBuffer
231stepOn s = snd (nextChar s)
232
233-- | Return a 'StringBuffer' with the first @n@ bytes removed.  __Warning:__
234-- If there aren't enough characters, the returned 'StringBuffer' will be
235-- invalid and any use of it may lead to undefined behavior.  The result
236-- shares the same buffer as the original.
237offsetBytes :: Int                      -- ^ @n@, the number of bytes
238            -> StringBuffer
239            -> StringBuffer
240offsetBytes i s = s { cur = cur s + i }
241
242-- | Compute the difference in offset between two 'StringBuffer's that share
243-- the same buffer.  __Warning:__ The behavior is undefined if the
244-- 'StringBuffer's use separate buffers.
245byteDiff :: StringBuffer -> StringBuffer -> Int
246byteDiff s1 s2 = cur s2 - cur s1
247
248-- | Check whether a 'StringBuffer' is empty (analogous to 'Data.List.null').
249atEnd :: StringBuffer -> Bool
250atEnd (StringBuffer _ l c) = l == c
251
252-- | Computes a 'StringBuffer' which points to the first character of the
253-- wanted line. Lines begin at 1.
254atLine :: Int -> StringBuffer -> Maybe StringBuffer
255atLine line sb@(StringBuffer buf len _) =
256  inlinePerformIO $
257    withForeignPtr buf $ \p -> do
258      p' <- skipToLine line len p
259      if p' == nullPtr
260        then return Nothing
261        else
262          let
263            delta = p' `minusPtr` p
264          in return $ Just (sb { cur = delta
265                               , len = len - delta
266                               })
267
268skipToLine :: Int -> Int -> Ptr Word8 -> IO (Ptr Word8)
269skipToLine !line !len !op0 = go 1 op0
270  where
271    !opend = op0 `plusPtr` len
272
273    go !i_line !op
274      | op >= opend    = pure nullPtr
275      | i_line == line = pure op
276      | otherwise      = do
277          w <- peek op :: IO Word8
278          case w of
279            10 -> go (i_line + 1) (plusPtr op 1)
280            13 -> do
281              -- this is safe because a 'StringBuffer' is
282              -- guaranteed to have 3 bytes sentinel values.
283              w' <- peek (plusPtr op 1) :: IO Word8
284              case w' of
285                10 -> go (i_line + 1) (plusPtr op 2)
286                _  -> go (i_line + 1) (plusPtr op 1)
287            _  -> go i_line (plusPtr op 1)
288
289-- -----------------------------------------------------------------------------
290-- Conversion
291
292-- | Decode the first @n@ bytes of a 'StringBuffer' as UTF-8 into a 'String'.
293-- Similar to 'utf8DecodeChar', if the character cannot be decoded as UTF-8,
294-- they will be replaced with @\'\\0\'@.
295lexemeToString :: StringBuffer
296               -> Int                   -- ^ @n@, the number of bytes
297               -> String
298lexemeToString _ 0 = ""
299lexemeToString (StringBuffer buf _ cur) bytes =
300  utf8DecodeStringLazy buf cur bytes
301
302lexemeToFastString :: StringBuffer
303                   -> Int               -- ^ @n@, the number of bytes
304                   -> FastString
305lexemeToFastString _ 0 = nilFS
306lexemeToFastString (StringBuffer buf _ cur) len =
307   inlinePerformIO $
308     withForeignPtr buf $ \ptr ->
309       return $! mkFastStringBytes (ptr `plusPtr` cur) len
310
311-- | Return the previous @n@ characters (or fewer if we are less than @n@
312-- characters into the buffer.
313decodePrevNChars :: Int -> StringBuffer -> String
314decodePrevNChars n (StringBuffer buf _ cur) =
315    inlinePerformIO $ withForeignPtr buf $ \p0 ->
316      go p0 n "" (p0 `plusPtr` (cur - 1))
317  where
318    go :: Ptr Word8 -> Int -> String -> Ptr Word8 -> IO String
319    go buf0 n acc p | n == 0 || buf0 >= p = return acc
320    go buf0 n acc p = do
321        p' <- utf8PrevChar p
322        let (c,_) = utf8DecodeChar p'
323        go buf0 (n - 1) (c:acc) p'
324
325-- -----------------------------------------------------------------------------
326-- Parsing integer strings in various bases
327parseUnsignedInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer
328parseUnsignedInteger (StringBuffer buf _ cur) len radix char_to_int
329  = inlinePerformIO $ withForeignPtr buf $ \ptr -> return $! let
330    go i x | i == len  = x
331           | otherwise = case fst (utf8DecodeChar (ptr `plusPtr` (cur + i))) of
332               '_'  -> go (i + 1) x    -- skip "_" (#14473)
333               char -> go (i + 1) (x * radix + toInteger (char_to_int char))
334  in go 0 0
335