1{-# LANGUAGE CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving #-} 2-------------------------------------------------------------------- 3-- | 4-- Module : Text.Regex.PCRE.Light.Base 5-- Copyright: Copyright (c) 2007-2008, Don Stewart 6-- 7-- Documentation based on /man pcreapi/, written by Philip Hazel, 2007. 8-- 9-- License : BSD3 10-- Maintainer: Don Stewart <dons@galois.com> 11-- Stability : experimental 12-- Portability: CPP, FFI 13-- Tested with: GHC 6.8.2 14-- 15-- Raw FFI bindings to PCRE functions and constants. 16-- 17 18module Text.Regex.PCRE.Light.Base ( 19 20 -- * A PCRE structure 21 PCRE 22 , Regex(..) 23 24 -- * C exports 25 , c_pcre_compile 26 , c_pcre_exec 27 , c_pcre_fullinfo 28 29 ------------------------------------------------------------------------ 30 31 -- * PCRE Options, an abstract newtyped Num wrapper over a CInt 32 , PCREOption 33 , combineOptions 34 35 , anchored , auto_callout {-, bsr_anycrlf-} 36 {-, bsr_unicode-} , caseless , dollar_endonly 37 , dotall , dupnames , extended 38 , extra , firstline , multiline 39 {-, newline_any-} {-, newline_anycrlf-} , newline_cr 40 , newline_crlf , newline_lf , no_auto_capture 41 , ungreedy , utf8 , no_utf8_check 42 43 -- * PCRE exec-time options, an abstract, newtyped Num wrapper over CInt 44 , PCREExecOption 45 , combineExecOptions 46 47 , exec_anchored {-, exec_newline_any , exec_newline_anycrlf-} 48 , exec_newline_cr , exec_newline_crlf , exec_newline_lf 49 , exec_notbol , exec_noteol , exec_notempty 50 , exec_no_utf8_check , exec_partial 51 52 ------------------------------------------------------------------------ 53 -- * PCRE Errors 54 , PCREError 55 56 , error_nomatch , error_null , error_badoption 57 , error_badmagic {-, error_unknown_opcode-} , error_unknown_node 58 , error_nomemory , error_nosubstring , error_matchlimit 59 , error_callout , error_badutf8 , error_badutf8_offset 60 , error_partial , error_badpartial , error_internal 61 , error_badcount , error_dfa_uitem , error_dfa_ucond 62 , error_dfa_umlimit , error_dfa_wssize , error_dfa_recurse 63 , error_recursionlimit {-, error_nullwslimit-} {-, error_badnewline-} 64 65 -- * PCRE Info 66 , PCREInfo 67 68 , info_options , info_size , info_capturecount 69 , info_backrefmax , info_firstbyte , info_firstchar 70 , info_firsttable , info_lastliteral , info_nameentrysize 71 , info_namecount , info_nametable , info_studysize 72 , info_default_tables {-, info_okpartial-} {-, info_jchanged-} 73 {-, info_hascrorlf-} 74 75 -- * PCRE Configuration 76 , PCREConfig 77 78 , config_utf8 , config_newline , config_link_size 79 , config_posix_malloc_threshold , config_match_limit 80 , config_stackrecurse , config_unicode_properties 81 , config_match_limit_recursion {-, config_bsr-} 82 83 -- * PCRE Extra 84 , PCREExtraFlags 85 86 , extra_study_data , extra_match_limit , extra_callout_data 87 , extra_tables , extra_match_limit_recursion, 88 89 ------------------------------------------------------------------------ 90 91 size_of_cint 92 93 ) where 94 95-- Foreigns 96import Foreign 97import Foreign.Ptr 98import Foreign.C.Types 99import Foreign.C.String 100 101import qualified Data.ByteString.Char8 as S 102 103#include <pcre.h> 104 105-- | Get sizeof CInt from hsc2hs 106size_of_cint :: Int 107size_of_cint = #const (sizeof(int)) 108 109------------------------------------------------------------------------ 110-- Types 111 112-- | An abstract pointer to a compiled PCRE Regex structure 113-- The structure allocated by the PCRE library will be deallocated 114-- automatically by the Haskell storage manager. 115-- 116data Regex = Regex {-# UNPACK #-} !(ForeignPtr PCRE) 117 {-# UNPACK #-} !S.ByteString 118 deriving (Eq, Ord, Show) 119 120type PCRE = () 121 122------------------------------------------------------------------------ 123 124-- | A type for PCRE compile-time options. These are newtyped CInts, 125-- which can be bitwise-or'd together, using '(Data.Bits..|.)' 126-- 127newtype PCREOption = PCREOption { unPCREOption :: PCREOption_ } 128#if __GLASGOW_HASKELL__ 129 deriving (Eq,Ord,Show,Read) 130#endif 131 132-- | Combine a list of options into a single option, using bitwise (.|.) 133combineOptions :: [PCREOption] -> PCREOption 134combineOptions = PCREOption . foldr ((.|.) . unPCREOption) 0 135 136-- Now follows the user-visible options to _exec and _compile. 137-- To avoid type errors, we newtype the underlying CInts, and 138-- statically differentiate PCREOptions from non-PCREOptions 139-- 140-- The safety can still be defeated using numeric literals though, 141-- and other Num operations. We could do more to protect against this. 142-- (a smart constructor for .|.) 143 144-- | 'anchored' 145-- 146-- If this bit is set, the pattern is forced to be /anchored/, that is, 147-- it is constrained to match only at the first matching point in the 148-- string that is being searched (the /subject string/). This effect can 149-- also be achieved by appropriate constructs in the pattern itself, which 150-- is the only way to do it in Perl. 151-- 152anchored :: PCREOption 153anchored = PCREOption anchored_cint 154 155-- | 'auto_callout' 156-- 157-- If this bit is set, "compile" automatically inserts callout 158-- items, all with number 255, before each pattern item. For discussion 159-- of the callout facility, see the man pcrecallout documentation 160-- 161auto_callout :: PCREOption 162auto_callout = PCREOption auto_callout_cint 163 164-- | 'bsr_anycrlf' and 'bsr_unicode' 165-- 166-- These options (which are mutually exclusive) control what the \\R escape 167-- sequence matches. The choice is either to match only CR, LF, or CRLF, or to 168-- match any Unicode new- line sequence. The default is specified when PCRE is 169-- built. It can be overridden from within the pattern, or by setting an option 170-- when a compiled pattern is matched. 171-- 172-- bsr_anycrlf :: PCREOption 173-- bsr_anycrlf = PCREOption bsr_anycrlf_cint 174 175-- | 'bsr_unicode'. See 'bse_anycrlf' 176-- 177-- bsr_unicode :: PCREOption 178-- bsr_unicode = PCREOption bsr_unicode_cint 179 180-- | 'caseless' 181-- 182-- If this bit is set, letters in the pattern match both upper and lower case 183-- letters. It is equivalent to Perl's \/i option, and it can be changed within a 184-- pattern by a (?i) option setting. In UTF-8 mode, PCRE always understands the 185-- concept of case for characters whose values are less than 128, so caseless 186-- matching is always possible. For characters with higher values, the concept of 187-- case is supported if PCRE is compiled with Unicode property sup- port, but not 188-- otherwise. If you want to use caseless matching for characters 128 and above, 189-- you must ensure that PCRE is compiled with Unicode property support as well as 190-- with UTF-8 support. 191-- 192caseless :: PCREOption 193caseless = PCREOption caseless_cint 194 195-- | 'dollar_endonly' 196-- 197-- If this bit is set, a dollar metacharacter in the pattern matches only at 198-- the end of the subject string. Without this option, a dollar also matches 199-- immediately before a newline at the end of the string (but not before any other 200-- newlines). The 'dollar_endonly' option is ignored if 'multiline' 201-- is set. There is no equivalent to this option in Perl, and no way to set it 202-- within a pattern. 203-- 204dollar_endonly :: PCREOption 205dollar_endonly = PCREOption dollar_endonly_cint 206 207-- | 'dotall' 208-- 209-- If this bit is set, a dot metacharater in the pattern matches all 210-- characters, including those that indicate newline. Without it, a dot does 211-- not match when the current position is at a newline. This option is 212-- equivalent to Perl's \/s option, and it can be changed within a pattern by a 213-- (?s) option setting. A negative class such as [^a] always matches newline 214-- characters, independent of the setting of this option. 215-- 216dotall :: PCREOption 217dotall = PCREOption dotall_cint 218 219-- | 'dupnames' 220-- 221-- If this bit is set, names used to identify capturing subpatterns need not be 222-- unique. This can be helpful for certain types of pattern when it is known 223-- that only one instance of the named subpattern can ever be matched. There are 224-- more details of named subpatterns in the /man pcreapi/ documentation. 225-- 226dupnames :: PCREOption 227dupnames = PCREOption dupnames_cint 228 229-- | 'extended' 230-- 231-- If this bit is set, whitespace data characters in the pattern are totally 232-- ignored except when escaped or inside a character class. Whitespace does not 233-- include the VT character (code 11). In addition, characters between an 234-- unescaped \# outside a character class and the next newline, inclusive, are 235-- also ignored. This is equivalent to Perl's \/x option, and it can be changed 236--within a pattern by a (?x) option setting. 237-- 238-- This option makes it possible to include comments inside complicated 239-- patterns. Note, however, that this applies only to data characters. Whitespace 240-- characters may never appear within special character sequences in a pattern, 241-- for example within the sequence (?( which introduces a conditional subpattern. 242-- 243extended :: PCREOption 244extended = PCREOption extended_cint 245 246-- | 'extra' 247-- 248-- This option was invented in order to turn on additional functionality of 249-- PCRE that is incompatible with Perl, but it is currently of very little use. 250-- When set, any backslash in a pattern that is followed by a letter that has no 251-- special meaning causes an error, thus reserving these combinations for future 252-- expansion. By default, as in Perl, a backslash followed by a letter with no 253-- special meaning is treated as a literal. (Perl can, however, be persuaded to 254-- give a warning for this.) There are at present no other features controlled by 255-- this option. It can also be set by a (?X) option setting within a pattern. 256-- 257extra :: PCREOption 258extra = PCREOption extra_cint 259 260-- | 'firstline' 261-- 262-- If this option is set, an unanchored pattern is required to match before or 263-- at the first newline in the subject string, though the matched text may 264--continue over the newline. 265-- 266firstline :: PCREOption 267firstline = PCREOption firstline_cint 268 269-- | 'multiline' 270-- 271-- By default, PCRE treats the subject string as consisting of a single line 272-- of characters (even if it actually contains newlines). The /start of line/ 273-- metacharacter (^) matches only at the start of the string, while the /end of line/ 274-- metacharacter ($) matches only at the end of the string, or before a 275-- terminating newline (unless 'dollar_endonly' is set). This is the same 276-- as Perl. 277-- 278-- When 'multiline' it is set, the /start of line/ and /end of line/ 279-- constructs match immediately following or immediately before internal newlines 280-- in the subject string, respectively, as well as at the very start and end. This 281-- is equivalent to Perl's \/m option, and it can be changed within a pattern by a 282-- (?m) option setting. If there are no newlines in a subject string, or no occur- 283-- rences of ^ or $ in a pattern, setting PCRE_MULTILINE has no effect. 284-- 285multiline :: PCREOption 286multiline = PCREOption multiline_cint 287 288-- | newline_cr', 'newline_lf', 'newline_crlf', 289-- 'newline_anycrlf', 'newline_any' 290-- 291-- These options override the default newline definition that 292-- was chosen when PCRE was built. Setting the first or the 293-- second specifies that a newline is indicated by a single 294-- character (CR or LF, respectively). Setting 'newline_crlf' specifies 295-- that a newline is indicated by the two-character CRLF sequence. 296-- Setting 'newline_anycrlf' 297-- specifies that any of the three preceding sequences should 298-- be recognized. Setting 'newline_any' specifies that any 299-- Unicode newline sequence should be recognized. The Unicode 300-- newline sequences are the three just mentioned, plus the 301-- single characters VT (vertical tab, U+000B), FF (formfeed, 302-- U+000C), NEL (next line, U+0085), LS (line separator, 303-- U+2028), and PS (paragraph separator, U+2029). The last 304-- two are recognized only in UTF-8 mode. 305-- 306-- The newline setting in the options word uses three bits 307-- that are treated as a number, giving eight possibilities. 308-- Currently only six are used (default plus the five values 309-- above). This means that if you set more than one newline 310-- option, the combination may or may not be sensible. For 311-- example, 'newline_cr' with 'newline_lf' is equivalent to 312-- 'newline_crlf', but other combinations may yield unused numbers and 313-- cause an error. 314-- 315-- The only time that a line break is specially recognized 316-- when compiling a pattern is if 'extended' is set, and 317-- an unescaped \# outside a character class is encountered. 318-- This indicates a comment that lasts until after the next 319-- line break sequence. In other circumstances, line break 320-- sequences are treated as literal data, except that in 321-- 'extended' mode, both CR and LF are treated as whitespace characters 322-- and are therefore ignored. -- 323-- 324-- The newline option that is set at compile time becomes the 325-- default that is used for 'exec' but it can be overridden. 326-- 327-- newline_any :: PCREOption 328-- newline_any = PCREOption newline_any_cint 329 330-- | 'newline_anycrlf', see 'newline_any' 331-- newline_anycrlf :: PCREOption 332-- newline_anycrlf = PCREOption newline_anycrlf_cint 333 334-- | 'newline_cr', see 'newline_any' 335newline_cr :: PCREOption 336newline_cr = PCREOption newline_cr_cint 337 338-- | 'newline_crlf', see 'newline_any' 339newline_crlf :: PCREOption 340newline_crlf = PCREOption newline_crlf_cint 341 342-- | 'newline_lf', see 'newline_any' 343newline_lf :: PCREOption 344newline_lf = PCREOption newline_lf_cint 345 346-- | 'no_auto_capture' 347-- 348-- If this option is set, it disables the use of numbered 349-- capturing parentheses in the pattern. Any opening paren- 350-- thesis that is not followed by ? behaves as if it were 351-- followed by ?: but named parentheses can still be used for 352-- capturing (and they acquire numbers in the usual way). 353-- There is no equivalent of this option in Perl. 354-- 355no_auto_capture :: PCREOption 356no_auto_capture = PCREOption no_auto_capture_cint 357 358-- | 'ungreedy' 359-- 360-- This option inverts the /greediness/ of the quantifiers so 361-- that they are not greedy by default, but become greedy if 362-- followed by /?/. It is not compatible with Perl. It can 363-- also be set by a (?U) option setting within the pattern. 364-- 365ungreedy :: PCREOption 366ungreedy = PCREOption ungreedy_cint 367 368-- | 'utf8' 369-- 370-- This option causes PCRE to regard both the pattern and the 371-- subject as strings of UTF-8 characters instead of single-byte character 372-- strings. However, it is available only when 373-- PCRE is built to include UTF-8 support. If not, the use of 374-- this option provokes an error. Details of how this option 375-- changes the behaviour of PCRE are given in the section on 376-- UTF-8 support in the main pcre page. 377-- 378utf8 :: PCREOption 379utf8 = PCREOption utf8_cint 380 381-- | 'no_utf8_check' 382-- 383-- When PCRE_UTF8 is set, the validity of the pattern as a 384-- UTF-8 string is automatically checked. There is a discussion 385-- about the validity of UTF-8 strings in the main pcre 386-- page. If an invalid UTF-8 sequence of bytes is found, 387-- compile() returns an error. If you already know that 388-- your pattern is valid, and you want to skip this check for 389-- performance reasons, you can set the 'no_utf8_check' 390-- option. When it is set, the effect of passing an invalid 391-- UTF-8 string as a pattern is undefined. It may cause your 392-- program to crash. Note that this option can also be passed 393-- to 'exec', to suppress the UTF-8 validity checking of subject strings. 394-- 395no_utf8_check :: PCREOption 396no_utf8_check = PCREOption no_utf8_check_cint 397 398-- Internal name for hsc2hs to bind to. 399type PCREOption_ = CInt 400 401-- PCRE compile options, as CInts 402#{enum PCREOption_, 403 , anchored_cint = PCRE_ANCHORED 404 , auto_callout_cint = PCRE_AUTO_CALLOUT 405 , caseless_cint = PCRE_CASELESS 406 , dollar_endonly_cint = PCRE_DOLLAR_ENDONLY 407 , dotall_cint = PCRE_DOTALL 408 , dupnames_cint = PCRE_DUPNAMES 409 , extended_cint = PCRE_EXTENDED 410 , extra_cint = PCRE_EXTRA 411 , firstline_cint = PCRE_FIRSTLINE 412 , multiline_cint = PCRE_MULTILINE 413 , newline_cr_cint = PCRE_NEWLINE_CR 414 , newline_crlf_cint = PCRE_NEWLINE_CRLF 415 , newline_lf_cint = PCRE_NEWLINE_LF 416 , no_auto_capture_cint = PCRE_NO_AUTO_CAPTURE 417 , ungreedy_cint = PCRE_UNGREEDY 418 , utf8_cint = PCRE_UTF8 419 , no_utf8_check_cint = PCRE_NO_UTF8_CHECK 420 } 421 422-- , bsr_anycrlf_cint = PCRE_BSR_ANYCRLF 423-- , bsr_unicode_cint = PCRE_BSR_UNICODE 424-- , newline_any_cint = PCRE_NEWLINE_ANY 425-- , newline_anycrlf_cint = PCRE_NEWLINE_ANYCRLF 426 427------------------------------------------------------------------------ 428 429-- | PCRE exec options, to be passed to exec 430newtype PCREExecOption = PCREExecOption { unPCREExecOption :: PCREExecOption_ } 431#if __GLASGOW_HASKELL__ 432 deriving (Eq,Ord,Show,Read) 433#endif 434 435-- | Combine a list of exec options into a single option, using bitwise (.|.) 436combineExecOptions :: [PCREExecOption] -> PCREExecOption 437combineExecOptions = PCREExecOption . foldr ((.|.) . unPCREExecOption) 0 438 439-- | 'anchored'. 440-- 441-- The 'anchored' option limits 'exec' to matching at 442-- the first matching position. If a pattern was compiled 443-- with 'anchored', or turned out to be anchored by virtue 444-- of its contents, it cannot be made unachored at matching 445-- time. 446exec_anchored :: PCREExecOption 447exec_anchored = PCREExecOption exec_anchored_cint 448 449-- | 'newline_cr', 'newline_lf', 450-- 'newline_crlf', 'newline_anycrlf', 'newline_any' 451-- 452-- These options override the newline definition that was 453-- chosen or defaulted when the pattern was compiled. For 454-- details, see the description of 'compile' above. Dur- 455-- ing matching, the newline choice affects the behaviour of 456-- the dot, circumflex, and dollar metacharacters. It may 457-- also alter the way the match position is advanced after a 458-- match failure for an unanchored pattern. 459-- 460-- When 'newline_crlf', 'newline_anycrlf', or 'newline_any' 461-- is set, and a match attempt for an unanchored 462-- pattern fails when the current position is at a CRLF 463-- sequence, and the pattern contains no explicit matches for 464-- CR or LF characters, the match position is advanced by two 465-- characters instead of one, in other words, to after the 466-- CRLF. 467-- 468-- The above rule is a compromise that makes the most common 469-- cases work as expected. For example, if the pattern is .+A 470-- (and the 'dotall' option is not set), it does not match 471-- the string /\\r\\nA/ because, after failing at the start, it 472-- skips both the CR and the LF before retrying. However, the 473-- pattern /[\\r\\n]A/ does match that string, because it contains 474-- an explicit CR or LF reference, and so advances only 475-- by one character after the first failure. 476-- 477-- An explicit match for CR of LF is either a literal appear- 478-- ance of one of those characters, or one of the \\r or \\n 479-- escape sequences. Implicit matches such as [^X] do not 480-- count, nor does \\s (which includes CR and LF in the char- 481-- acters that it matches). 482-- 483-- Notwithstanding the above, anomalous effects may still 484-- occur when CRLF is a valid newline sequence and explicit 485-- \\r or \\n escapes appear in the pattern. 486-- 487-- exec_newline_any :: PCREExecOption 488-- exec_newline_any = PCREExecOption exec_newline_any_cint 489 490-- | 'exec_newline_anycrlf', see 'exec_newline_any' 491-- exec_newline_anycrlf :: PCREExecOption 492-- exec_newline_anycrlf = PCREExecOption exec_newline_anycrlf_cint 493 494-- | 'exec_newline_cr', see 'exec_newline_any' 495exec_newline_cr :: PCREExecOption 496exec_newline_cr = PCREExecOption exec_newline_cr_cint 497 498-- | 'exec_newline_crlf', see 'exec_newline_any' 499exec_newline_crlf :: PCREExecOption 500exec_newline_crlf = PCREExecOption exec_newline_crlf_cint 501 502-- | 'exec_newline_lf', see 'exec_newline_any' 503exec_newline_lf :: PCREExecOption 504exec_newline_lf = PCREExecOption exec_newline_lf_cint 505 506-- | 'PCRE_NOTBOL' 507-- 508-- This option specifies that first character of the subject 509-- string is not the beginning of a line, so the circumflex 510-- metacharacter should not match before it. Setting this 511-- without 'multiline' (at compile time) causes circumflex 512-- never to match. This option affects only the behaviour of 513-- the circumflex metacharacter. It does not affect \\A. 514-- 515exec_notbol :: PCREExecOption 516exec_notbol = PCREExecOption exec_notbol_cint 517 518-- | 'noteol' 519-- 520-- This option specifies that the end of the subject string 521-- is not the end of a line, so the dollar metacharacter 522-- should not match it nor (except in multiline mode) a newline 523-- immediately before it. Setting this without 'multiline' 524-- (at compile time) causes dollar never to match. 525-- This option affects only the behaviour of the dollar 526-- metacharacter. It does not affect \\Z or \\z. 527-- 528exec_noteol :: PCREExecOption 529exec_noteol = PCREExecOption exec_noteol_cint 530 531-- | PCRE_NOTEMPTY 532-- 533-- An empty string is not considered to be a valid match if 534-- this option is set. If there are alternatives in the pattern, 535-- they are tried. If all the alternatives match the 536-- empty string, the entire match fails. For example, if the 537-- pattern 538-- 539-- > a?b? 540-- 541-- is applied to a string not beginning with /a/ or /b/, it 542-- matches the empty string at the start of the subject. With 543-- 'notempty' set, this match is not valid, so 'PCRE 544-- searches further into the string for occurrences of /a/ or 545-- /b/. 546-- 547-- Perl has no direct equivalent of 'notempty', but it 548-- does make a special case of a pattern match of the empty 549-- string within its split() function, and when using the \/g 550-- modifier. It is possible to emulate Perl's behaviour after 551-- matching a null string by first trying the match again at 552-- the same offset with PCRE_NOTEMPTY and PCRE_ANCHORED, and 553-- then if that fails by advancing the starting offset (see 554-- below) and trying an ordinary match again. There is some 555-- code that demonstrates how to do this in the pcredemo.c 556-- sample program. 557-- 558exec_notempty :: PCREExecOption 559exec_notempty = PCREExecOption exec_notempty_cint 560 561-- | 'no_utf8_check' 562-- 563-- When 'utf8' is set at compile time, the validity of the 564-- subject as a UTF-8 string is automatically checked when 565-- exec() is subsequently called. The value of 566-- startoffset is also checked to ensure that it points to 567-- the start of a UTF-8 character. There is a discussion 568-- about the validity of UTF-8 strings in the section on 569-- UTF-8 support in the main pcre page. If an invalid UTF-8 570-- sequence of bytes is found, exec() returns the error 571-- 'error_badutf8'. If startoffset contains an invalid 572-- value, 'error_badutf8_offset' is returned. 573-- 574-- If you already know that your subject is valid, and you 575-- want to skip these checks for performance reasons, you can 576-- set the 'no_utf8_check' option when calling 577-- 'exec'. You might want to do this for the second and 578-- subsequent calls to exec() if you are making repeated 579-- calls to find all the matches in a single subject string. 580-- However, you should be sure that the value of startoffset 581-- points to the start of a UTF-8 character. When 582-- 'no_utf8_check' is set, the effect of passing an 583-- invalid UTF-8 string as a subject, or a value of startoff- 584-- set that does not point to the start of a UTF-8 character, 585-- is undefined. Your program may crash. 586-- 587exec_no_utf8_check :: PCREExecOption 588exec_no_utf8_check = PCREExecOption exec_no_utf8_check_cint 589 590-- | 'partial' 591-- 592-- This option turns on the partial matching feature. If the 593-- subject string fails to match the pattern, but at some 594-- point during the matching process the end of the subject 595-- was reached (that is, the subject partially matches the 596-- pattern and the failure to match occurred only because 597-- there were not enough subject characters), 'exec' 598-- returns 'error_partial' instead of 'error_nomatch'. 599-- When 'partial' is used, there are restrictions on what 600-- may appear in the pattern. These are discussed in the 601-- pcrepartial documentation. 602-- 603exec_partial :: PCREExecOption 604exec_partial = PCREExecOption exec_partial_cint 605 606-- Internal name for hsc2hs to bind to. 607type PCREExecOption_ = CInt 608 609-- PCRE exec options 610#{enum PCREExecOption_, 611 , exec_anchored_cint = PCRE_ANCHORED 612 , exec_newline_cr_cint = PCRE_NEWLINE_CR 613 , exec_newline_crlf_cint = PCRE_NEWLINE_CRLF 614 , exec_newline_lf_cint = PCRE_NEWLINE_LF 615 , exec_notbol_cint = PCRE_NOTBOL 616 , exec_noteol_cint = PCRE_NOTEOL 617 , exec_notempty_cint = PCRE_NOTEMPTY 618 , exec_no_utf8_check_cint = PCRE_NO_UTF8_CHECK 619 , exec_partial_cint = PCRE_PARTIAL 620 } 621 622-- , exec_newline_any_cint = PCRE_NEWLINE_ANY 623-- , exec_newline_anycrlf_cint = PCRE_NEWLINE_ANYCRLF 624-- , dfa_shortest = PCRE_DFA_SHORTEST 625-- , dfa_restart = PCRE_DFA_RESTART 626 627------------------------------------------------------------------------ 628 629-- | A type for PCRE Errors: exec-time error codes. 630type PCREError = CInt 631 632#{enum PCREError, 633 , error_nomatch = PCRE_ERROR_NOMATCH 634 , error_null = PCRE_ERROR_NULL 635 , error_badoption = PCRE_ERROR_BADOPTION 636 , error_badmagic = PCRE_ERROR_BADMAGIC 637 , error_unknown_node = PCRE_ERROR_UNKNOWN_NODE 638 , error_nomemory = PCRE_ERROR_NOMEMORY 639 , error_nosubstring = PCRE_ERROR_NOSUBSTRING 640 , error_matchlimit = PCRE_ERROR_MATCHLIMIT 641 , error_callout = PCRE_ERROR_CALLOUT 642 , error_badutf8 = PCRE_ERROR_BADUTF8 643 , error_badutf8_offset = PCRE_ERROR_BADUTF8_OFFSET 644 , error_partial = PCRE_ERROR_PARTIAL 645 , error_badpartial = PCRE_ERROR_BADPARTIAL 646 , error_internal = PCRE_ERROR_INTERNAL 647 , error_badcount = PCRE_ERROR_BADCOUNT 648 , error_dfa_uitem = PCRE_ERROR_DFA_UITEM 649 , error_dfa_ucond = PCRE_ERROR_DFA_UCOND 650 , error_dfa_umlimit = PCRE_ERROR_DFA_UMLIMIT 651 , error_dfa_wssize = PCRE_ERROR_DFA_WSSIZE 652 , error_dfa_recurse = PCRE_ERROR_DFA_RECURSE 653 , error_recursionlimit = PCRE_ERROR_RECURSIONLIMIT 654 } 655 656-- , error_unknown_opcode = PCRE_ERROR_UNKNOWN_OPCODE 657-- , error_nullwslimit = PCRE_ERROR_NULLWSLIMIT 658-- , error_badnewline = PCRE_ERROR_BADNEWLINE 659 660------------------------------------------------------------------------ 661-- Request types for fullinfo() */ 662 663-- | PCRE Info requests -- provides information about the compiled pattern. 664type PCREInfo = CInt 665 666#{enum PCREInfo, 667 , info_options = PCRE_INFO_OPTIONS 668 , info_size = PCRE_INFO_SIZE 669 , info_capturecount = PCRE_INFO_CAPTURECOUNT 670 , info_backrefmax = PCRE_INFO_BACKREFMAX 671 , info_firstbyte = PCRE_INFO_FIRSTBYTE 672 , info_firstchar = PCRE_INFO_FIRSTCHAR 673 , info_firsttable = PCRE_INFO_FIRSTTABLE 674 , info_lastliteral = PCRE_INFO_LASTLITERAL 675 , info_nameentrysize = PCRE_INFO_NAMEENTRYSIZE 676 , info_namecount = PCRE_INFO_NAMECOUNT 677 , info_nametable = PCRE_INFO_NAMETABLE 678 , info_studysize = PCRE_INFO_STUDYSIZE 679 , info_default_tables = PCRE_INFO_DEFAULT_TABLES 680 } 681 682-- , info_okpartial = PCRE_INFO_OKPARTIAL 683-- , info_jchanged = PCRE_INFO_JCHANGED 684-- , info_hascrorlf = PCRE_INFO_HASCRORLF 685 686------------------------------------------------------------------------ 687 688-- | Request types for config() 689type PCREConfig = CInt 690 691#{enum PCREConfig, 692 , config_utf8 = PCRE_CONFIG_UTF8 693 , config_newline = PCRE_CONFIG_NEWLINE 694 , config_link_size = PCRE_CONFIG_LINK_SIZE 695 , config_posix_malloc_threshold = PCRE_CONFIG_POSIX_MALLOC_THRESHOLD 696 , config_match_limit = PCRE_CONFIG_MATCH_LIMIT 697 , config_stackrecurse = PCRE_CONFIG_STACKRECURSE 698 , config_unicode_properties = PCRE_CONFIG_UNICODE_PROPERTIES 699 , config_match_limit_recursion = PCRE_CONFIG_MATCH_LIMIT_RECURSION 700 } 701 702-- Not portable 703-- , config_bsr = PCRE_CONFIG_BSR 704 705 706------------------------------------------------------------------------ 707 708-- | PCREExtra. 709-- A extra structure contains the following fields: 710-- 711-- * flags Bits indicating which fields are set 712-- * study_data Opaque data from study() 713-- * match_limit Limit on internal resource use 714-- * match_limit_recursion Limit on internal recursion depth 715-- * callout_data Opaque data passed back to callouts 716-- * tables Points to character tables or is NULL 717-- 718type PCREExtra = () 719 720-- | PCREExtraFlags. bit flags for extra structure. 721type PCREExtraFlags = CInt 722 723-- Bit flags for the extra structure. Do not re-arrange or redefine 724-- these bits, just add new ones on the end, in order to remain compatible. */ 725#{enum PCREExtraFlags, 726 , extra_study_data = PCRE_EXTRA_STUDY_DATA 727 , extra_match_limit = PCRE_EXTRA_MATCH_LIMIT 728 , extra_callout_data = PCRE_EXTRA_CALLOUT_DATA 729 , extra_tables = PCRE_EXTRA_TABLES 730 , extra_match_limit_recursion = PCRE_EXTRA_MATCH_LIMIT_RECURSION 731 } 732 733-- PCRE_EXP_DECL pcre *compile(const char *, int, const char **, int *, const unsigned char *); 734-- PCRE_EXP_DECL int config(int, void *); 735-- PCRE_EXP_DECL int exec(const pcre *, const extra *, PCRE_SPTR, int, int, int, int *, int); 736 737------------------------------------------------------------------------ 738-- C api 739 740{- 741 pcre *pcre_compile(const char *pattern, int options, 742 const char **errptr, int *erroffset, 743 const unsigned char *tableptr); 744-} 745 746-- | Compile a pattern to an internal form. The pattern is a C string 747-- terminated by a binary zero. A pointer to a single block of memory that is 748-- obtained via pcre_malloc is returned. It is up to the caller to free 749-- the memory (via pcre_free) when it is no longer required 750-- 751-- The options argument contains various bit settings that affect the 752-- compilation. It should be zero if no options are required. 753-- 754-- If errptr is NULL, pcre_compile() returns NULL immediately. 755-- Otherwise, if compilation of a pattern fails, pcre_compile() returns NULL, 756-- and sets the variable pointed to by errptr to point to a textual error 757-- message. 758-- 759-- The offset from the start of the pattern to the character where the error 760-- was discovered is placed in the variable pointed to by erroffset, which must 761-- not be NULL. 762-- 763foreign import ccall unsafe "pcre.h pcre_compile" 764 c_pcre_compile :: CString 765 -> PCREOption 766 -> Ptr CString 767 -> Ptr CInt 768 -> Ptr Word8 769 -> IO (Ptr PCRE) 770 771-- Additional fields to c_pcre_compile: 772-- 773-- errptr Where to put an error message 774-- erroffset Offset in pattern where error was found 775-- tableptr Pointer to character tables, or NULL to to use built in 776 777{- 778 int pcre_exec(const pcre *code, const pcre_extra *extra, 779 const char *subject, int length, int startoffset, 780 int options, int *ovector, int ovecsize); 781-} 782 783-- | This function matches a compiled regular expression 784-- against a given subject string, using a matching algorithm 785-- that is similar to Perl's. It returns offsets to captured 786-- substrings. 787-- 788-- Its arguments are, in order: 789-- 790-- * 'code' Points to the compiled pattern (result of pcre_compile) 791-- 792-- * 'extra' Points to an associated pcre_extra structure (result of pcre_study), or is NULL 793-- 794-- * 'subject' Points to the subject string 795-- 796-- * 'length' Length of the subject string, in bytes 797-- 798-- * 'startoffset' Offset in bytes in the subject at which to start matching 799-- 800-- * 'options' Option bits 801-- 802-- * 'ovector' Points to a vector of ints for result substrings 803-- 804-- * 'ovecsize' Number of elements in the vector (a multiple of 3) 805-- 806-- Note, subject not required to be null terminated. 807-- 808foreign import ccall unsafe "pcre.h pcre_exec" 809 c_pcre_exec :: Ptr PCRE 810 -> Ptr PCREExtra 811 -> Ptr Word8 812 -> CInt 813 -> CInt 814 -> PCREExecOption 815 -> Ptr CInt 816 -> CInt 817 -> IO CInt 818 819-- | Return information about a compiled pattern 820foreign import ccall unsafe "pcre.h pcre_fullinfo" 821 c_pcre_fullinfo :: Ptr PCRE 822 -> Ptr PCREExtra 823 -> PCREInfo 824 -> Ptr a 825 -> IO CInt 826 827