1-- The exported symbols are the same whether HAVE_PCRE_H is defined, 2-- but when if it is not defined then 'getVersion == Nothing' and all 3-- other exported values will call error or fail. 4 5-- | This will fail or error only if allocation fails or a nullPtr is passed in. 6 7-- TODO :: Consider wrapMatchAll using list of start/end offsets and not MatchArray 8-- 9 10{- Copyright : (c) Chris Kuklewicz 2007 -} 11module Text.Regex.PCRE.Wrap( 12 -- ** High-level interface 13 Regex, 14 CompOption(CompOption), 15 ExecOption(ExecOption), 16 (=~), 17 (=~~), 18 19 -- ** Low-level interface 20 StartOffset, 21 EndOffset, 22 ReturnCode(ReturnCode), 23 WrapError, 24 wrapCompile, 25 wrapTest, 26 wrapMatch, 27 wrapMatchAll, 28 wrapCount, 29 30 -- ** Miscellaneous 31 getVersion, 32 configUTF8, 33 getNumSubs, 34 unusedOffset, 35 36 -- ** CompOption values 37 compBlank, 38 compAnchored, 39 compAutoCallout, 40 compCaseless, 41 compDollarEndOnly, 42 compDotAll, 43 compExtended, 44 compExtra, 45 compFirstLine, 46 compMultiline, 47 compNoAutoCapture, 48 compUngreedy, 49 compUTF8, 50 compNoUTF8Check, 51 52 -- ** ExecOption values 53 execBlank, 54 execAnchored, 55 execNotBOL, 56 execNotEOL, 57 execNotEmpty, 58 execNoUTF8Check, 59 execPartial, 60 61 -- ** ReturnCode values 62 retOk, 63 retNoMatch, 64 retNull, 65 retBadOption, 66 retBadMagic, 67 retUnknownNode, 68 retNoMemory, 69 retNoSubstring 70 ) where 71 72import Prelude hiding (fail) 73import Control.Monad.Fail (MonadFail(fail)) 74 75import Control.Monad(when) 76import Data.Array(Array,accumArray) 77import Data.Bits(Bits((.|.))) -- ((.&.),(.|.),complement)) 78import System.IO.Unsafe(unsafePerformIO) 79import Foreign(Ptr,ForeignPtr,FinalizerPtr -- ,FunPtr 80 ,alloca,allocaBytes,nullPtr 81 ,peek,peekElemOff 82 ,newForeignPtr,withForeignPtr) 83import Foreign.C(CChar) 84#if __GLASGOW_HASKELL__ >= 703 85import Foreign.C(CInt(CInt)) 86#else 87import Foreign.C(CInt) 88#endif 89import Foreign.C.String(CString,CStringLen,peekCString) 90import Text.Regex.Base.RegexLike(RegexOptions(..),RegexMaker(..),RegexContext(..),MatchArray,MatchOffset) 91 92-- | Version string of PCRE library 93-- 94-- __NOTE__: The 'Maybe' type is used for historic reasons; practically, 'getVersion' is never 'Nothing'. 95{-# NOINLINE getVersion #-} 96getVersion :: Maybe String 97 98type PCRE = () 99type StartOffset = MatchOffset 100type EndOffset = MatchOffset 101type WrapError = (ReturnCode,String) 102 103newtype CompOption = CompOption CInt deriving (Eq,Show,Num,Bits) 104newtype ExecOption = ExecOption CInt deriving (Eq,Show,Num,Bits) 105newtype ReturnCode = ReturnCode CInt deriving (Eq,Show) 106 107-- | A compiled regular expression 108data Regex = Regex (ForeignPtr PCRE) CompOption ExecOption 109 110compBlank :: CompOption 111execBlank :: ExecOption 112unusedOffset :: MatchOffset 113retOk :: ReturnCode 114 115wrapCompile :: CompOption -- ^ Flags (summed together) 116 -> ExecOption -- ^ Flags (summed together) 117 -> CString -- ^ The regular expression to compile 118 -> IO (Either (MatchOffset,String) Regex) -- ^ Returns: an error offset and string or the compiled regular expression 119wrapTest :: StartOffset -- ^ Starting index in CStringLen 120 -> Regex -- ^ Compiled regular expression 121 -> CStringLen -- ^ String to match against and length in bytes 122 -> IO (Either WrapError Bool) 123wrapMatch :: StartOffset -- ^ Starting index in CStringLen 124 -> Regex -- ^ Compiled regular expression 125 -> CStringLen -- ^ String to match against and length in bytes 126 -> IO (Either WrapError (Maybe [(StartOffset,EndOffset)])) 127 -- ^ Returns: 'Right Nothing' if the regex did not match the 128 -- string, or: 129 -- 'Right Just' an array of (offset,length) pairs where index 0 is whole match, and the rest are the captured subexpressions, or: 130 -- 'Left ReturnCode' if there is some strange error 131wrapMatchAll :: Regex -> CStringLen -> IO (Either WrapError [ MatchArray ]) 132wrapCount :: Regex -> CStringLen -> IO (Either WrapError Int) 133 134getNumSubs :: Regex -> Int 135 136{-# NOINLINE configUTF8 #-} 137configUTF8 :: Bool 138 139(=~) :: (RegexMaker Regex CompOption ExecOption source,RegexContext Regex source1 target) 140 => source1 -> source -> target 141(=~~) :: (RegexMaker Regex CompOption ExecOption source,RegexContext Regex source1 target,MonadFail m) 142 => source1 -> source -> m target 143 144#include <sys/types.h> 145#include <pcre.h> 146 147instance RegexOptions Regex CompOption ExecOption where 148 blankCompOpt = compBlank 149 blankExecOpt = execBlank 150 defaultCompOpt = compMultiline 151 defaultExecOpt = execBlank 152 setExecOpts e' (Regex r c _) = Regex r c e' 153 getExecOpts (Regex _ _ e) = e 154 155-- (=~) :: (RegexMaker Regex CompOption ExecOption source,RegexContext Regex source1 target) => source1 -> source -> target 156(=~) x r = let q :: Regex 157 q = makeRegex r 158 in match q x 159 160-- (=~~) ::(RegexMaker Regex CompOption ExecOption source,RegexContext Regex source1 target,MonadFail m) => source1 -> source -> m target 161(=~~) x r = do (q :: Regex) <- makeRegexM r 162 matchM q x 163 164type PCRE_Extra = () 165 166fi :: (Integral i,Num n ) => i -> n 167fi x = fromIntegral x 168 169compBlank = CompOption 0 170execBlank = ExecOption 0 171unusedOffset = (-1) 172retOk = ReturnCode 0 173 174retNeededMoreSpace :: ReturnCode 175retNeededMoreSpace = ReturnCode 0 176 177newtype InfoWhat = InfoWhat CInt deriving (Eq,Show) 178newtype ConfigWhat = ConfigWhat CInt deriving (Eq,Show) 179 180nullTest' :: Ptr a -> String -> IO (Either (MatchOffset,String) b) -> IO (Either (MatchOffset,String) b) 181{-# INLINE nullTest' #-} 182nullTest' ptr msg io = do 183 if nullPtr == ptr 184 then return (Left (0,"Ptr parameter was nullPtr in Text.Regex.PCRE.Wrap."++msg)) 185 else io 186 187nullTest :: Ptr a -> String -> IO (Either WrapError b) -> IO (Either WrapError b) 188{-# INLINE nullTest #-} 189nullTest ptr msg io = do 190 if nullPtr == ptr 191 then return (Left (retOk,"Ptr parameter was nullPtr in Text.Regex.PCRE.Wrap."++msg)) 192 else io 193 194wrapRC :: ReturnCode -> IO (Either WrapError b) 195{-# INLINE wrapRC #-} 196wrapRC r = return (Left (r,"Error in Text.Regex.PCRE.Wrap: "++show r)) 197 198-- | Compiles a regular expression 199wrapCompile flags e pattern = do 200 nullTest' pattern "wrapCompile pattern" $ do 201 alloca $ \errOffset -> alloca $ \errPtr -> do 202 nullTest' errPtr "wrapCompile errPtr" $ do 203 pcre_ptr <- c_pcre_compile pattern flags errPtr errOffset nullPtr 204 if pcre_ptr == nullPtr 205 then do 206 -- No need to use c_ptr_free in the error case (e.g. pcredemo.c) 207 offset <- peek errOffset 208 string <- peekCString =<< peek errPtr 209 return (Left (fi offset,string)) 210 else do regex <- newForeignPtr c_ptr_free pcre_ptr 211 return . Right $ Regex regex flags e 212 213getNumSubs (Regex pcre_fptr _ _) = fi . unsafePerformIO $ withForeignPtr pcre_fptr getNumSubs' 214 215getNumSubs' :: Ptr PCRE -> IO CInt 216{-# INLINE getNumSubs' #-} 217getNumSubs' pcre_ptr = 218 alloca $ \st -> do -- (st :: Ptr CInt) 219 when (st == nullPtr) (fail "Text.Regex.PCRE.Wrap.getNumSubs' could not allocate a CInt!!!") 220 ok0 <- c_pcre_fullinfo pcre_ptr nullPtr pcreInfoCapturecount st 221 when (ok0 /= 0) (fail $ "Impossible/fatal: Haskell package regex-pcre error in Text.Posix.PCRE.Wrap.getNumSubs' of ok0 /= 0. ok0 is from pcre_fullinfo c-function which returned "++show ok0) 222 peek st 223 224wrapTest startOffset (Regex pcre_fptr _ flags) (cstr,len) = do 225 nullTest cstr "wrapTest cstr" $ do 226 withForeignPtr pcre_fptr $ \pcre_ptr -> do 227 r@(ReturnCode r') <- c_pcre_exec pcre_ptr nullPtr cstr (fi len) (fi startOffset) flags nullPtr 0 228 if r == retNoMatch 229 then return (Right False) 230 else if r' < 0 231 then wrapRC r 232 else return (Right True) 233 234-- | Matches a regular expression against a string 235-- 236-- Should never return (Right (Just [])) 237wrapMatch startOffset (Regex pcre_fptr _ flags) (cstr,len) = do 238 nullTest cstr "wrapMatch cstr" $ do 239 withForeignPtr pcre_fptr $ \pcre_ptr -> do 240 nsub <- getNumSubs' pcre_ptr 241 let nsub_int :: Int 242 nsub_int = fi nsub 243 ovec_size :: CInt 244 ovec_size = ((nsub + 1) * 3) -- "man pcreapi" for explanation 245 ovec_bytes :: Int 246 ovec_bytes = (fi ovec_size) * (#const sizeof(int)) 247 allocaBytes ovec_bytes $ \ovec -> do 248 nullTest ovec "wrapMatch ovec" $ do 249 r@(ReturnCode r') <- c_pcre_exec pcre_ptr nullPtr cstr (fi len) (fi startOffset) flags ovec ovec_size 250 if r == retNoMatch 251 then return (Right Nothing) 252 else if r' < 0 253 then wrapRC r 254 else do 255 let pairsSet :: Int 256 pairsSet = if r == retNeededMoreSpace -- if r == ReturnCode 0 257 then nsub_int + 1 -- should not happen 258 else fi r' -- implies pairsSet > 0 259 extraPairs :: [(Int,Int)] 260 extraPairs = replicate (nsub_int + 1 - pairsSet) 261 (unusedOffset,unusedOffset) 262 pairs <- return . toPairs =<< mapM (peekElemOff ovec) [0 .. ((pairsSet*2)-1)] 263 return . Right . Just $ (pairs ++ extraPairs) 264 265-- | wrapMatchAll is an improvement over wrapMatch since it only 266-- allocates memory with allocaBytes once at the start. 267-- 268-- 269wrapMatchAll (Regex pcre_fptr _ flags) (cstr,len) = do 270 nullTest cstr "wrapMatchAll cstr" $ do 271 withForeignPtr pcre_fptr $ \regex -> do 272 nsub <- getNumSubs' regex 273 let nsub_int :: Int 274 nsub_int = fi nsub 275 ovec_size :: CInt 276 ovec_size = ((nsub + 1) * 3) -- "man pcreapi" for explanation 277 ovec_bytes :: Int 278 ovec_bytes = (fi ovec_size) * (#const sizeof(int)) 279 clen = fi len 280 flags' = (execNotEmpty .|. execAnchored .|. flags) 281 allocaBytes ovec_bytes $ \ovec -> 282 nullTest ovec "wrapMatchAll ovec" $ 283 let loop acc flags_in_use pos = do 284 r@(ReturnCode r') <- c_pcre_exec regex nullPtr cstr clen (fi pos) flags_in_use ovec ovec_size 285 if r == retNoMatch 286 then return (Right (acc [])) 287 else if r' < 0 288 then wrapRC r 289 else do 290 let pairsSet = if r == retNeededMoreSpace then nsub_int+1 else fi r' 291 pairs <- return . toPairs =<< mapM (peekElemOff ovec) [0 .. ((pairsSet*2)-1)] 292 let acc' = acc . (toMatchArray nsub_int pairs:) 293 case pairs of 294 [] -> return (Right (acc' [])) 295 ((s,e):_) | s==e -> if s == len 296 then return (Right (acc' [])) 297 else loop acc' flags' e 298 | otherwise -> loop acc' flags e 299 in loop id flags 0 300toMatchArray :: Int -> [(Int,Int)] -> Array Int (Int,Int) 301toMatchArray n pairs = accumArray (\_ (s,e) -> (s,(e-s))) (-1,0) (0,n) (zip [0..] pairs) 302 303toPairs :: [CInt] -> [(Int,Int)] 304toPairs [] = [] 305toPairs (a:b:rest) = (fi a,fi b):toPairs rest 306toPairs [_] = error "Should not have just one element in WrapPCRE.wrapMatchAll.toPairs" 307 308wrapCount (Regex pcre_fptr _ flags) (cstr,len) = do 309 nullTest cstr "wrapCount cstr" $ do 310 withForeignPtr pcre_fptr $ \pcre_ptr -> do 311 nsub <- getNumSubs' pcre_ptr 312 let ovec_size :: CInt 313 ovec_size = ((nsub + 1) * 3) -- "man pcreapi" for explanation 314 ovec_bytes :: Int 315 ovec_bytes = (fi ovec_size) * (#const sizeof(int)) 316 clen = fi len 317 allocaBytes ovec_bytes $ \ovec -> 318 nullTest ovec "wrapCount ovec" $ 319 let act pos = c_pcre_exec pcre_ptr nullPtr cstr clen (fi pos) flags ovec ovec_size 320 loop acc pos | acc `seq` pos `seq` False = undefined 321 | otherwise = do 322 r@(ReturnCode r') <- act pos 323 if r == retNoMatch 324 then return (Right acc) 325 else if r' < 0 326 then wrapRC r 327 else do 328 pairs <- return . toPairs =<< mapM (peekElemOff ovec) [0,1] 329 case pairs of 330 [] -> return (Right (succ acc)) 331 ((s,e):_) | s==e -> return (Right (succ acc)) 332 | otherwise -> loop (succ acc) e 333 in loop 0 0 334 335getVersion = unsafePerformIO $ do 336 version <- c_pcre_version 337 if version == nullPtr 338 then return (Just "pcre_version was null") 339 else return . Just =<< peekCString version 340 341configUTF8 = unsafePerformIO $ 342 alloca $ \ptrVal -> do -- (ptrVal :: Ptr CInt) 343 when (ptrVal == nullPtr) (fail "Text.Regex.PCRE.Wrap.configUTF8 could not alloca CInt!!!") 344 _unicodeSupported <- c_pcre_config pcreConfigUtf8 ptrVal 345 {- pcre_config: The output is an integer that is set to one if UTF-8 support is available; otherwise it is set to zero. -} 346 val <- peek ptrVal 347 case val of 348 (1 :: CInt) -> return True 349 0 -> return False 350 _ -> return False -- should not happen 351 352foreign import ccall unsafe "pcre.h pcre_compile" 353 c_pcre_compile :: CString -> CompOption -> Ptr CString -> Ptr CInt -> CString -> IO (Ptr PCRE) 354foreign import ccall unsafe "&free" 355 c_ptr_free :: FinalizerPtr a -- FunPtr (Ptr a -> IO ()) 356foreign import ccall unsafe "pcre.h pcre_exec" 357 c_pcre_exec :: Ptr PCRE -> Ptr PCRE_Extra -> CString -> CInt -> CInt -> ExecOption -> Ptr CInt -> CInt -> IO ReturnCode 358foreign import ccall unsafe "pcre.h pcre_fullinfo" 359 c_pcre_fullinfo :: Ptr PCRE -> Ptr PCRE_Extra -> InfoWhat -> Ptr a -> IO CInt 360foreign import ccall unsafe "pcre.h pcre_version" 361 c_pcre_version :: IO (Ptr CChar) 362foreign import ccall unsafe "pcre.h pcre_config" 363 c_pcre_config :: ConfigWhat -> Ptr a -> IO CInt 364 365 366#enum CompOption,CompOption, \ 367 compAnchored = PCRE_ANCHORED, \ 368 compAutoCallout = PCRE_AUTO_CALLOUT, \ 369 compCaseless = PCRE_CASELESS, \ 370 compDollarEndOnly = PCRE_DOLLAR_ENDONLY, \ 371 compDotAll = PCRE_DOTALL, \ 372 compExtended = PCRE_EXTENDED, \ 373 compExtra = PCRE_EXTRA, \ 374 compFirstLine = PCRE_FIRSTLINE, \ 375 compMultiline = PCRE_MULTILINE, \ 376 compNoAutoCapture = PCRE_NO_AUTO_CAPTURE, \ 377 compUngreedy = PCRE_UNGREEDY, \ 378 compUTF8 = PCRE_UTF8, \ 379 compNoUTF8Check = PCRE_NO_UTF8_CHECK 380 381#enum ExecOption,ExecOption, \ 382 execAnchored = PCRE_ANCHORED, \ 383 execNotBOL = PCRE_NOTBOL, \ 384 execNotEOL = PCRE_NOTEOL, \ 385 execNotEmpty = PCRE_NOTEMPTY, \ 386 execNoUTF8Check = PCRE_NO_UTF8_CHECK, \ 387 execPartial = PCRE_PARTIAL 388 389#enum ReturnCode,ReturnCode, \ 390 retNoMatch = PCRE_ERROR_NOMATCH, \ 391 retNull = PCRE_ERROR_NULL, \ 392 retBadOption = PCRE_ERROR_BADOPTION, \ 393 retBadMagic = PCRE_ERROR_BADMAGIC, \ 394 retUnknownNode = PCRE_ERROR_UNKNOWN_NODE, \ 395 retNoMemory = PCRE_ERROR_NOMEMORY, \ 396 retNoSubstring = PCRE_ERROR_NOSUBSTRING 397 398-- Comment out most of these to avoid unused binding warnings 399 400-- PCRE_INFO_FIRSTCHAR is deprecated, use PCRE_INFO_FIRSTBYTE instead. 401#enum InfoWhat,InfoWhat, \ 402 PCRE_INFO_CAPTURECOUNT 403{- 404 PCRE_INFO_BACKREFMAX, \ 405 PCRE_INFO_DEFAULT_TABLES, \ 406 PCRE_INFO_FIRSTBYTE, \ 407 PCRE_INFO_FIRSTCHAR, \ 408 PCRE_INFO_FIRSTTABLE, \ 409 PCRE_INFO_LASTLITERAL, \ 410 PCRE_INFO_NAMECOUNT, \ 411 PCRE_INFO_NAMEENTRYSIZE, \ 412 PCRE_INFO_NAMETABLE, \ 413 PCRE_INFO_OPTIONS, \ 414 PCRE_INFO_SIZE, \ 415 PCRE_INFO_STUDYSIZE 416-} 417#enum ConfigWhat,ConfigWhat, \ 418 PCRE_CONFIG_UTF8 419{- 420 PCRE_CONFIG_UNICODE_PROPERTIES, \ 421 PCRE_CONFIG_NEWLINE, \ 422 PCRE_CONFIG_LINK_SIZE, \ 423 PCRE_CONFIG_POSIX_MALLOC_THRESHOLD, \ 424 PCRE_CONFIG_MATCH_LIMIT, \ 425 PCRE_CONFIG_MATCH_LIMIT_RECURSION, \ 426 PCRE_CONFIG_STACKRECURSE 427-} 428 429