1-- The exported symbols are the same whether HAVE_PCRE_H is defined,
2-- but when if it is not defined then 'getVersion == Nothing' and all
3-- other exported values will call error or fail.
4
5-- | This will fail or error only if allocation fails or a nullPtr is passed in.
6
7-- TODO :: Consider wrapMatchAll using list of start/end offsets and not MatchArray
8--
9
10{- Copyright   :  (c) Chris Kuklewicz 2007 -}
11module Text.Regex.PCRE.Wrap(
12  -- ** High-level interface
13  Regex,
14  CompOption(CompOption),
15  ExecOption(ExecOption),
16  (=~),
17  (=~~),
18
19  -- ** Low-level interface
20  StartOffset,
21  EndOffset,
22  ReturnCode(ReturnCode),
23  WrapError,
24  wrapCompile,
25  wrapTest,
26  wrapMatch,
27  wrapMatchAll,
28  wrapCount,
29
30  -- ** Miscellaneous
31  getVersion,
32  configUTF8,
33  getNumSubs,
34  unusedOffset,
35
36  -- ** CompOption values
37  compBlank,
38  compAnchored,
39  compAutoCallout,
40  compCaseless,
41  compDollarEndOnly,
42  compDotAll,
43  compExtended,
44  compExtra,
45  compFirstLine,
46  compMultiline,
47  compNoAutoCapture,
48  compUngreedy,
49  compUTF8,
50  compNoUTF8Check,
51
52  -- ** ExecOption values
53  execBlank,
54  execAnchored,
55  execNotBOL,
56  execNotEOL,
57  execNotEmpty,
58  execNoUTF8Check,
59  execPartial,
60
61  -- ** ReturnCode values
62  retOk,
63  retNoMatch,
64  retNull,
65  retBadOption,
66  retBadMagic,
67  retUnknownNode,
68  retNoMemory,
69  retNoSubstring
70  ) where
71
72import Prelude hiding (fail)
73import Control.Monad.Fail (MonadFail(fail))
74
75import Control.Monad(when)
76import Data.Array(Array,accumArray)
77import Data.Bits(Bits((.|.))) -- ((.&.),(.|.),complement))
78import System.IO.Unsafe(unsafePerformIO)
79import Foreign(Ptr,ForeignPtr,FinalizerPtr -- ,FunPtr
80              ,alloca,allocaBytes,nullPtr
81              ,peek,peekElemOff
82              ,newForeignPtr,withForeignPtr)
83import Foreign.C(CChar)
84#if __GLASGOW_HASKELL__ >= 703
85import Foreign.C(CInt(CInt))
86#else
87import Foreign.C(CInt)
88#endif
89import Foreign.C.String(CString,CStringLen,peekCString)
90import Text.Regex.Base.RegexLike(RegexOptions(..),RegexMaker(..),RegexContext(..),MatchArray,MatchOffset)
91
92-- | Version string of PCRE library
93--
94-- __NOTE__: The 'Maybe' type is used for historic reasons; practically, 'getVersion' is never 'Nothing'.
95{-# NOINLINE getVersion #-}
96getVersion :: Maybe String
97
98type PCRE = ()
99type StartOffset = MatchOffset
100type EndOffset = MatchOffset
101type WrapError = (ReturnCode,String)
102
103newtype CompOption = CompOption CInt deriving (Eq,Show,Num,Bits)
104newtype ExecOption = ExecOption CInt deriving (Eq,Show,Num,Bits)
105newtype ReturnCode = ReturnCode CInt deriving (Eq,Show)
106
107-- | A compiled regular expression
108data Regex = Regex (ForeignPtr PCRE) CompOption ExecOption
109
110compBlank :: CompOption
111execBlank :: ExecOption
112unusedOffset :: MatchOffset
113retOk :: ReturnCode
114
115wrapCompile :: CompOption -- ^ Flags (summed together)
116            -> ExecOption -- ^ Flags (summed together)
117            -> CString  -- ^ The regular expression to compile
118            -> IO (Either (MatchOffset,String) Regex) -- ^ Returns: an error offset and string or the compiled regular expression
119wrapTest :: StartOffset -- ^ Starting index in CStringLen
120         -> Regex       -- ^ Compiled regular expression
121         -> CStringLen  -- ^ String to match against and length in bytes
122         -> IO (Either WrapError Bool)
123wrapMatch :: StartOffset -- ^ Starting index in CStringLen
124          -> Regex       -- ^ Compiled regular expression
125          -> CStringLen  -- ^ String to match against and length in bytes
126          -> IO (Either WrapError (Maybe [(StartOffset,EndOffset)]))
127                -- ^ Returns: 'Right Nothing' if the regex did not match the
128                -- string, or:
129                --   'Right Just' an array of (offset,length) pairs where index 0 is whole match, and the rest are the captured subexpressions, or:
130                --   'Left ReturnCode' if there is some strange error
131wrapMatchAll :: Regex -> CStringLen -> IO (Either WrapError [ MatchArray ])
132wrapCount :: Regex -> CStringLen -> IO (Either WrapError Int)
133
134getNumSubs :: Regex -> Int
135
136{-# NOINLINE configUTF8 #-}
137configUTF8 :: Bool
138
139(=~)  :: (RegexMaker Regex CompOption ExecOption source,RegexContext Regex source1 target)
140      => source1 -> source -> target
141(=~~) :: (RegexMaker Regex CompOption ExecOption source,RegexContext Regex source1 target,MonadFail m)
142      => source1 -> source -> m target
143
144#include <sys/types.h>
145#include <pcre.h>
146
147instance RegexOptions Regex CompOption ExecOption where
148  blankCompOpt = compBlank
149  blankExecOpt = execBlank
150  defaultCompOpt = compMultiline
151  defaultExecOpt = execBlank
152  setExecOpts e' (Regex r c _) = Regex r c e'
153  getExecOpts (Regex _ _ e) = e
154
155-- (=~) :: (RegexMaker Regex CompOption ExecOption source,RegexContext Regex source1 target) => source1 -> source -> target
156(=~) x r = let q :: Regex
157               q = makeRegex r
158           in match q x
159
160-- (=~~) ::(RegexMaker Regex CompOption ExecOption source,RegexContext Regex source1 target,MonadFail m) => source1 -> source -> m target
161(=~~) x r = do (q :: Regex) <-  makeRegexM r
162               matchM q x
163
164type PCRE_Extra = ()
165
166fi :: (Integral i,Num n ) => i -> n
167fi x = fromIntegral x
168
169compBlank = CompOption 0
170execBlank = ExecOption 0
171unusedOffset = (-1)
172retOk = ReturnCode 0
173
174retNeededMoreSpace :: ReturnCode
175retNeededMoreSpace = ReturnCode 0
176
177newtype InfoWhat = InfoWhat CInt deriving (Eq,Show)
178newtype ConfigWhat = ConfigWhat CInt deriving (Eq,Show)
179
180nullTest' :: Ptr a -> String -> IO (Either (MatchOffset,String) b) -> IO (Either (MatchOffset,String) b)
181{-# INLINE nullTest' #-}
182nullTest' ptr msg io = do
183  if nullPtr == ptr
184    then return (Left (0,"Ptr parameter was nullPtr in Text.Regex.PCRE.Wrap."++msg))
185    else io
186
187nullTest :: Ptr a -> String -> IO (Either WrapError b) -> IO (Either WrapError b)
188{-# INLINE nullTest #-}
189nullTest ptr msg io = do
190  if nullPtr == ptr
191    then return (Left (retOk,"Ptr parameter was nullPtr in Text.Regex.PCRE.Wrap."++msg))
192    else io
193
194wrapRC :: ReturnCode -> IO (Either WrapError b)
195{-# INLINE wrapRC #-}
196wrapRC r = return (Left (r,"Error in Text.Regex.PCRE.Wrap: "++show r))
197
198-- | Compiles a regular expression
199wrapCompile flags e pattern = do
200 nullTest' pattern "wrapCompile pattern" $ do
201  alloca $ \errOffset -> alloca $ \errPtr -> do
202   nullTest' errPtr "wrapCompile errPtr" $ do
203    pcre_ptr <- c_pcre_compile pattern flags errPtr errOffset nullPtr
204    if pcre_ptr == nullPtr
205      then do
206        -- No need to use c_ptr_free in the error case (e.g. pcredemo.c)
207        offset <- peek errOffset
208        string <- peekCString =<< peek errPtr
209        return (Left (fi offset,string))
210      else do regex <- newForeignPtr c_ptr_free pcre_ptr
211              return . Right $ Regex regex flags e
212
213getNumSubs (Regex pcre_fptr _ _) = fi . unsafePerformIO $ withForeignPtr pcre_fptr getNumSubs'
214
215getNumSubs' :: Ptr PCRE -> IO CInt
216{-# INLINE getNumSubs' #-}
217getNumSubs' pcre_ptr =
218  alloca $ \st -> do -- (st :: Ptr CInt)
219    when (st == nullPtr) (fail "Text.Regex.PCRE.Wrap.getNumSubs' could not allocate a CInt!!!")
220    ok0 <- c_pcre_fullinfo pcre_ptr nullPtr pcreInfoCapturecount st
221    when (ok0 /= 0) (fail $ "Impossible/fatal: Haskell package regex-pcre error in Text.Posix.PCRE.Wrap.getNumSubs' of ok0 /= 0.  ok0 is from pcre_fullinfo c-function which returned  "++show ok0)
222    peek st
223
224wrapTest startOffset (Regex pcre_fptr _ flags) (cstr,len) = do
225 nullTest cstr "wrapTest cstr" $ do
226  withForeignPtr pcre_fptr $ \pcre_ptr -> do
227    r@(ReturnCode r') <- c_pcre_exec pcre_ptr nullPtr cstr (fi len) (fi startOffset) flags nullPtr 0
228    if r == retNoMatch
229      then return (Right False)
230      else if r' < 0
231             then wrapRC r
232             else return (Right True)
233
234-- | Matches a regular expression against a string
235--
236-- Should never return (Right (Just []))
237wrapMatch startOffset (Regex pcre_fptr _ flags) (cstr,len) = do
238 nullTest cstr "wrapMatch cstr" $ do
239  withForeignPtr pcre_fptr $ \pcre_ptr -> do
240    nsub <- getNumSubs' pcre_ptr
241    let nsub_int :: Int
242        nsub_int = fi nsub
243        ovec_size :: CInt
244        ovec_size = ((nsub + 1) * 3) -- "man pcreapi" for explanation
245        ovec_bytes :: Int
246        ovec_bytes = (fi ovec_size) * (#const sizeof(int))
247    allocaBytes ovec_bytes $ \ovec -> do
248     nullTest ovec "wrapMatch ovec" $ do
249      r@(ReturnCode r') <- c_pcre_exec pcre_ptr nullPtr cstr (fi len) (fi startOffset) flags ovec ovec_size
250      if r == retNoMatch
251        then return (Right Nothing)
252        else if r' < 0
253          then wrapRC r
254          else do
255            let pairsSet :: Int
256                pairsSet = if r == retNeededMoreSpace -- if r == ReturnCode 0
257                             then nsub_int + 1 -- should not happen
258                             else fi r' -- implies pairsSet > 0
259                extraPairs :: [(Int,Int)]
260                extraPairs = replicate (nsub_int + 1 - pairsSet)
261                                       (unusedOffset,unusedOffset)
262            pairs <- return . toPairs =<< mapM (peekElemOff ovec) [0 .. ((pairsSet*2)-1)]
263            return . Right . Just $ (pairs ++ extraPairs)
264
265-- | wrapMatchAll is an improvement over wrapMatch since it only
266-- allocates memory with allocaBytes once at the start.
267--
268--
269wrapMatchAll (Regex pcre_fptr _ flags) (cstr,len) = do
270 nullTest cstr "wrapMatchAll cstr" $ do
271  withForeignPtr pcre_fptr $ \regex -> do
272    nsub <- getNumSubs' regex
273    let nsub_int :: Int
274        nsub_int = fi nsub
275        ovec_size :: CInt
276        ovec_size = ((nsub + 1) * 3) -- "man pcreapi" for explanation
277        ovec_bytes :: Int
278        ovec_bytes = (fi ovec_size) * (#const sizeof(int))
279        clen = fi len
280        flags' = (execNotEmpty .|. execAnchored .|. flags)
281    allocaBytes ovec_bytes $ \ovec ->
282     nullTest ovec "wrapMatchAll ovec" $
283      let loop acc flags_in_use pos = do
284            r@(ReturnCode r') <- c_pcre_exec regex nullPtr cstr clen (fi pos) flags_in_use ovec ovec_size
285            if r == retNoMatch
286              then return (Right (acc []))
287              else if r' < 0
288                     then wrapRC r
289                     else do
290                       let pairsSet = if r == retNeededMoreSpace then nsub_int+1 else fi r'
291                       pairs <- return . toPairs =<< mapM (peekElemOff ovec) [0 .. ((pairsSet*2)-1)]
292                       let acc' = acc . (toMatchArray nsub_int pairs:)
293                       case pairs of
294                         [] -> return (Right (acc' []))
295                         ((s,e):_) | s==e -> if s == len
296                                               then return (Right (acc' []))
297                                               else loop acc' flags' e
298                                   | otherwise -> loop acc' flags e
299      in loop id flags 0
300toMatchArray :: Int -> [(Int,Int)] -> Array Int (Int,Int)
301toMatchArray n pairs = accumArray (\_ (s,e) -> (s,(e-s))) (-1,0) (0,n) (zip [0..] pairs)
302
303toPairs :: [CInt] -> [(Int,Int)]
304toPairs [] = []
305toPairs (a:b:rest) = (fi a,fi b):toPairs rest
306toPairs [_] = error "Should not have just one element in WrapPCRE.wrapMatchAll.toPairs"
307
308wrapCount (Regex pcre_fptr _ flags) (cstr,len) = do
309 nullTest cstr "wrapCount cstr" $ do
310  withForeignPtr pcre_fptr $ \pcre_ptr -> do
311    nsub <- getNumSubs' pcre_ptr
312    let ovec_size :: CInt
313        ovec_size = ((nsub + 1) * 3) -- "man pcreapi" for explanation
314        ovec_bytes :: Int
315        ovec_bytes = (fi ovec_size) * (#const sizeof(int))
316        clen = fi len
317    allocaBytes ovec_bytes $ \ovec ->
318     nullTest ovec "wrapCount ovec" $
319      let act pos = c_pcre_exec pcre_ptr nullPtr cstr clen (fi pos) flags ovec ovec_size
320          loop acc pos | acc `seq` pos `seq` False = undefined
321                       | otherwise  = do
322            r@(ReturnCode r') <- act pos
323            if r == retNoMatch
324              then return (Right acc)
325              else if r' < 0
326                then wrapRC r
327                else do
328                  pairs <- return . toPairs =<< mapM (peekElemOff ovec) [0,1]
329                  case pairs of
330                    [] -> return (Right (succ acc))
331                    ((s,e):_) | s==e -> return (Right (succ acc))
332                              | otherwise -> loop (succ acc) e
333      in loop 0 0
334
335getVersion = unsafePerformIO $ do
336  version <- c_pcre_version
337  if version == nullPtr
338    then return (Just "pcre_version was null")
339    else return . Just =<< peekCString version
340
341configUTF8 = unsafePerformIO $
342  alloca $ \ptrVal -> do -- (ptrVal :: Ptr CInt)
343    when (ptrVal == nullPtr) (fail "Text.Regex.PCRE.Wrap.configUTF8 could not alloca CInt!!!")
344    _unicodeSupported <- c_pcre_config pcreConfigUtf8 ptrVal
345    {- pcre_config: The  output  is  an  integer that is set to one if UTF-8 support is available; otherwise it is set to zero. -}
346    val <- peek ptrVal
347    case val of
348      (1 :: CInt) -> return True
349      0 -> return False
350      _ -> return False -- should not happen
351
352foreign import ccall unsafe "pcre.h pcre_compile"
353  c_pcre_compile :: CString -> CompOption -> Ptr CString -> Ptr CInt -> CString -> IO (Ptr PCRE)
354foreign import ccall unsafe "&free"
355  c_ptr_free :: FinalizerPtr a -- FunPtr (Ptr a -> IO ())
356foreign import ccall unsafe "pcre.h pcre_exec"
357  c_pcre_exec :: Ptr PCRE -> Ptr PCRE_Extra -> CString -> CInt -> CInt -> ExecOption -> Ptr CInt -> CInt -> IO ReturnCode
358foreign import ccall unsafe "pcre.h pcre_fullinfo"
359  c_pcre_fullinfo :: Ptr PCRE -> Ptr PCRE_Extra -> InfoWhat -> Ptr a -> IO CInt
360foreign import ccall unsafe "pcre.h pcre_version"
361  c_pcre_version :: IO (Ptr CChar)
362foreign import ccall unsafe "pcre.h pcre_config"
363  c_pcre_config :: ConfigWhat -> Ptr a -> IO CInt
364
365
366#enum CompOption,CompOption, \
367  compAnchored = PCRE_ANCHORED, \
368  compAutoCallout = PCRE_AUTO_CALLOUT, \
369  compCaseless = PCRE_CASELESS, \
370  compDollarEndOnly = PCRE_DOLLAR_ENDONLY, \
371  compDotAll = PCRE_DOTALL, \
372  compExtended = PCRE_EXTENDED, \
373  compExtra = PCRE_EXTRA, \
374  compFirstLine = PCRE_FIRSTLINE, \
375  compMultiline = PCRE_MULTILINE, \
376  compNoAutoCapture = PCRE_NO_AUTO_CAPTURE, \
377  compUngreedy = PCRE_UNGREEDY, \
378  compUTF8 = PCRE_UTF8, \
379  compNoUTF8Check = PCRE_NO_UTF8_CHECK
380
381#enum ExecOption,ExecOption, \
382  execAnchored = PCRE_ANCHORED, \
383  execNotBOL = PCRE_NOTBOL, \
384  execNotEOL = PCRE_NOTEOL, \
385  execNotEmpty = PCRE_NOTEMPTY, \
386  execNoUTF8Check = PCRE_NO_UTF8_CHECK, \
387  execPartial = PCRE_PARTIAL
388
389#enum ReturnCode,ReturnCode, \
390  retNoMatch = PCRE_ERROR_NOMATCH, \
391  retNull = PCRE_ERROR_NULL, \
392  retBadOption = PCRE_ERROR_BADOPTION, \
393  retBadMagic = PCRE_ERROR_BADMAGIC, \
394  retUnknownNode = PCRE_ERROR_UNKNOWN_NODE, \
395  retNoMemory = PCRE_ERROR_NOMEMORY, \
396  retNoSubstring = PCRE_ERROR_NOSUBSTRING
397
398-- Comment out most of these to avoid unused binding warnings
399
400-- PCRE_INFO_FIRSTCHAR is deprecated, use PCRE_INFO_FIRSTBYTE instead.
401#enum InfoWhat,InfoWhat, \
402  PCRE_INFO_CAPTURECOUNT
403{-
404  PCRE_INFO_BACKREFMAX, \
405  PCRE_INFO_DEFAULT_TABLES, \
406  PCRE_INFO_FIRSTBYTE, \
407  PCRE_INFO_FIRSTCHAR, \
408  PCRE_INFO_FIRSTTABLE, \
409  PCRE_INFO_LASTLITERAL, \
410  PCRE_INFO_NAMECOUNT, \
411  PCRE_INFO_NAMEENTRYSIZE, \
412  PCRE_INFO_NAMETABLE, \
413  PCRE_INFO_OPTIONS, \
414  PCRE_INFO_SIZE, \
415  PCRE_INFO_STUDYSIZE
416-}
417#enum ConfigWhat,ConfigWhat, \
418  PCRE_CONFIG_UTF8
419{-
420  PCRE_CONFIG_UNICODE_PROPERTIES, \
421  PCRE_CONFIG_NEWLINE, \
422  PCRE_CONFIG_LINK_SIZE, \
423  PCRE_CONFIG_POSIX_MALLOC_THRESHOLD, \
424  PCRE_CONFIG_MATCH_LIMIT, \
425  PCRE_CONFIG_MATCH_LIMIT_RECURSION, \
426  PCRE_CONFIG_STACKRECURSE
427-}
428
429