1{-# LANGUAGE CPP, ForeignFunctionInterface, GeneralizedNewtypeDeriving #-}
2--------------------------------------------------------------------
3-- |
4-- Module   : Text.Regex.PCRE.Light.Base
5-- Copyright: Copyright (c) 2007-2008, Don Stewart
6--
7-- Documentation based on /man pcreapi/, written by Philip Hazel, 2007.
8--
9-- License   : BSD3
10-- Maintainer:  Don Stewart <dons@galois.com>
11-- Stability :  experimental
12-- Portability: CPP, FFI
13-- Tested with: GHC 6.8.2
14--
15-- Raw FFI bindings to PCRE functions and constants.
16--
17
18module Text.Regex.PCRE.Light.Base (
19
20        -- * A PCRE structure
21          PCRE
22        , Regex(..)
23
24        -- * C exports
25        , c_pcre_compile
26        , c_pcre_exec
27        , c_pcre_fullinfo
28
29        ------------------------------------------------------------------------
30
31        -- * PCRE Options, an abstract newtyped Num wrapper over a CInt
32        , PCREOption
33        , combineOptions
34
35        , anchored , auto_callout {-, bsr_anycrlf-}
36        {-, bsr_unicode-} , caseless , dollar_endonly
37        , dotall , dupnames , extended
38        , extra , firstline , multiline
39        {-, newline_any-} {-, newline_anycrlf-} , newline_cr
40        , newline_crlf , newline_lf , no_auto_capture
41        , ungreedy , utf8 , no_utf8_check
42
43        -- * PCRE exec-time options, an abstract, newtyped Num wrapper over CInt
44        , PCREExecOption
45        , combineExecOptions
46
47        , exec_anchored {-, exec_newline_any , exec_newline_anycrlf-}
48        , exec_newline_cr , exec_newline_crlf , exec_newline_lf
49        , exec_notbol , exec_noteol , exec_notempty
50        , exec_no_utf8_check , exec_partial
51
52        ------------------------------------------------------------------------
53        -- * PCRE Errors
54        , PCREError
55
56        , error_nomatch , error_null , error_badoption
57        , error_badmagic {-, error_unknown_opcode-} , error_unknown_node
58        , error_nomemory , error_nosubstring , error_matchlimit
59        , error_callout , error_badutf8 , error_badutf8_offset
60        , error_partial , error_badpartial , error_internal
61        , error_badcount , error_dfa_uitem , error_dfa_ucond
62        , error_dfa_umlimit , error_dfa_wssize , error_dfa_recurse
63        , error_recursionlimit {-, error_nullwslimit-} {-, error_badnewline-}
64
65        -- * PCRE Info
66        , PCREInfo
67
68        , info_options , info_size , info_capturecount
69        , info_backrefmax , info_firstbyte , info_firstchar
70        , info_firsttable , info_lastliteral , info_nameentrysize
71        , info_namecount , info_nametable , info_studysize
72        , info_default_tables {-, info_okpartial-} {-, info_jchanged-}
73        {-, info_hascrorlf-}
74
75        -- * PCRE Configuration
76        , PCREConfig
77
78        , config_utf8 , config_newline , config_link_size
79        , config_posix_malloc_threshold , config_match_limit
80        , config_stackrecurse , config_unicode_properties
81        , config_match_limit_recursion {-, config_bsr-}
82
83        -- * PCRE Extra
84        , PCREExtraFlags
85
86        , extra_study_data , extra_match_limit , extra_callout_data
87        , extra_tables , extra_match_limit_recursion,
88
89        ------------------------------------------------------------------------
90
91        size_of_cint
92
93    ) where
94
95-- Foreigns
96import Foreign
97import Foreign.Ptr
98import Foreign.C.Types
99import Foreign.C.String
100
101import qualified Data.ByteString.Char8 as S
102
103#include <pcre.h>
104
105-- | Get sizeof CInt from hsc2hs
106size_of_cint :: Int
107size_of_cint = #const (sizeof(int))
108
109------------------------------------------------------------------------
110-- Types
111
112-- | An abstract pointer to a compiled PCRE Regex structure
113-- The structure allocated by the PCRE library will be deallocated
114-- automatically by the Haskell storage manager.
115--
116data Regex = Regex {-# UNPACK #-} !(ForeignPtr PCRE)
117                   {-# UNPACK #-} !S.ByteString
118        deriving (Eq, Ord, Show)
119
120type PCRE = ()
121
122------------------------------------------------------------------------
123
124-- | A type for PCRE compile-time options. These are newtyped CInts,
125-- which can be bitwise-or'd together, using '(Data.Bits..|.)'
126--
127newtype PCREOption = PCREOption { unPCREOption :: PCREOption_ }
128#if __GLASGOW_HASKELL__
129    deriving (Eq,Ord,Show,Read)
130#endif
131
132-- | Combine a list of options into a single option, using bitwise (.|.)
133combineOptions :: [PCREOption] -> PCREOption
134combineOptions = PCREOption . foldr ((.|.) . unPCREOption) 0
135
136-- Now follows the user-visible options to _exec and _compile.
137-- To avoid type errors, we newtype the underlying CInts, and
138-- statically differentiate PCREOptions from non-PCREOptions
139--
140-- The safety can still be defeated using numeric literals though,
141-- and other Num operations. We could do more to protect against this.
142-- (a smart constructor for .|.)
143
144-- | 'anchored'
145--
146-- If this bit is set, the pattern is forced to be /anchored/, that is,
147-- it is constrained to match only at the first matching point in the
148-- string that is being searched (the /subject string/). This effect can
149-- also be achieved by appropriate constructs in the pattern itself, which
150-- is the only way to do it in Perl.
151--
152anchored           :: PCREOption
153anchored           = PCREOption anchored_cint
154
155-- | 'auto_callout'
156--
157-- If this bit is set, "compile" automatically inserts callout
158-- items, all with number 255, before each pattern item. For discussion
159-- of the callout facility, see the man pcrecallout documentation
160--
161auto_callout       :: PCREOption
162auto_callout       = PCREOption auto_callout_cint
163
164-- | 'bsr_anycrlf' and 'bsr_unicode'
165--
166-- These options (which are mutually exclusive) control what the \\R escape
167-- sequence matches. The choice is either to match only CR, LF, or CRLF, or to
168-- match any Unicode new- line sequence. The default is specified when PCRE is
169-- built. It can be overridden from within the pattern, or by setting an option
170-- when a compiled pattern is matched.
171--
172-- bsr_anycrlf        :: PCREOption
173-- bsr_anycrlf        = PCREOption bsr_anycrlf_cint
174
175-- | 'bsr_unicode'. See 'bse_anycrlf'
176--
177-- bsr_unicode        :: PCREOption
178-- bsr_unicode        = PCREOption bsr_unicode_cint
179
180-- | 'caseless'
181--
182-- If this bit is set, letters in the pattern match both upper and lower case
183-- letters. It is equivalent to Perl's \/i option, and it can be changed within a
184-- pattern by a (?i) option setting. In UTF-8 mode, PCRE always understands the
185-- concept of case for characters whose values are less than 128, so caseless
186-- matching is always possible. For characters with higher values, the concept of
187-- case is supported if PCRE is compiled with Unicode property sup- port, but not
188-- otherwise. If you want to use caseless matching for characters 128 and above,
189-- you must ensure that PCRE is compiled with Unicode property support as well as
190-- with UTF-8 support.
191--
192caseless           :: PCREOption
193caseless           = PCREOption caseless_cint
194
195-- | 'dollar_endonly'
196--
197-- If this bit is set, a dollar metacharacter in the pattern matches only at
198-- the end of the subject string. Without this option, a dollar also matches
199-- immediately before a newline at the end of the string (but not before any other
200-- newlines). The 'dollar_endonly' option is ignored if 'multiline'
201-- is set.  There is no equivalent to this option in Perl, and no way to set it
202-- within a pattern.
203--
204dollar_endonly     :: PCREOption
205dollar_endonly     = PCREOption dollar_endonly_cint
206
207-- | 'dotall'
208--
209-- If this bit is set, a dot metacharater in the pattern matches all
210-- characters, including those that indicate newline. Without it, a dot does
211-- not match when the current position is at a newline. This option is
212-- equivalent to Perl's \/s option, and it can be changed within a pattern by a
213-- (?s) option setting. A negative class such as [^a] always matches newline
214-- characters, independent of the setting of this option.
215--
216dotall             :: PCREOption
217dotall             = PCREOption dotall_cint
218
219-- | 'dupnames'
220--
221-- If this bit is set, names used to identify capturing subpatterns need not be
222-- unique. This can be helpful for certain types of pattern when it is known
223-- that only one instance of the named subpattern can ever be matched. There are
224-- more details of named subpatterns in the /man pcreapi/ documentation.
225--
226dupnames           :: PCREOption
227dupnames           = PCREOption dupnames_cint
228
229-- | 'extended'
230--
231-- If this bit is set, whitespace data characters in the pattern are totally
232-- ignored except when escaped or inside a character class. Whitespace does not
233-- include the VT character (code 11). In addition, characters between an
234-- unescaped \# outside a character class and the next newline, inclusive, are
235-- also ignored. This is equivalent to Perl's \/x option, and it can be changed
236--within a pattern by a (?x) option setting.
237--
238-- This option makes it possible to include comments inside complicated
239-- patterns. Note, however, that this applies only to data characters. Whitespace
240-- characters may never appear within special character sequences in a pattern,
241-- for example within the sequence (?( which introduces a conditional subpattern.
242--
243extended           :: PCREOption
244extended           = PCREOption extended_cint
245
246-- | 'extra'
247--
248-- This option was invented in order to turn on additional functionality of
249-- PCRE that is incompatible with Perl, but it is currently of very little use.
250-- When set, any backslash in a pattern that is followed by a letter that has no
251-- special meaning causes an error, thus reserving these combinations for future
252-- expansion. By default, as in Perl, a backslash followed by a letter with no
253-- special meaning is treated as a literal. (Perl can, however, be persuaded to
254-- give a warning for this.) There are at present no other features controlled by
255-- this option. It can also be set by a (?X) option setting within a pattern.
256--
257extra              :: PCREOption
258extra              = PCREOption extra_cint
259
260-- | 'firstline'
261--
262-- If this option is set, an unanchored pattern is required to match before or
263-- at the first newline in the subject string, though the matched text may
264--continue over the newline.
265--
266firstline          :: PCREOption
267firstline          = PCREOption firstline_cint
268
269-- | 'multiline'
270--
271--  By default, PCRE treats the subject string as consisting of a single line
272-- of characters (even if it actually contains newlines). The /start of line/
273-- metacharacter (^) matches only at the start of the string, while the /end of line/
274--  metacharacter ($) matches only at the end of the string, or before a
275-- terminating newline (unless 'dollar_endonly' is set). This is the same
276-- as Perl.
277--
278-- When 'multiline' it is set, the /start of line/ and /end of line/
279-- constructs match immediately following or immediately before internal newlines
280-- in the subject string, respectively, as well as at the very start and end. This
281-- is equivalent to Perl's \/m option, and it can be changed within a pattern by a
282-- (?m) option setting. If there are no newlines in a subject string, or no occur-
283-- rences of ^ or $ in a pattern, setting PCRE_MULTILINE has no effect.
284--
285multiline          :: PCREOption
286multiline          = PCREOption multiline_cint
287
288-- | newline_cr', 'newline_lf', 'newline_crlf',
289-- 'newline_anycrlf', 'newline_any'
290--
291-- These options override the default newline definition that
292-- was chosen when PCRE was built. Setting the first or the
293-- second specifies that a newline is indicated by a single
294-- character (CR or LF, respectively). Setting 'newline_crlf' specifies
295-- that a newline is indicated by the two-character CRLF sequence.
296-- Setting 'newline_anycrlf'
297-- specifies that any of the three preceding sequences should
298-- be recognized. Setting 'newline_any' specifies that any
299-- Unicode newline sequence should be recognized. The Unicode
300-- newline sequences are the three just mentioned, plus the
301-- single characters VT (vertical tab, U+000B), FF (formfeed,
302-- U+000C), NEL (next line, U+0085), LS (line separator,
303-- U+2028), and PS (paragraph separator, U+2029). The last
304-- two are recognized only in UTF-8 mode.
305--
306-- The newline setting in the options word uses three bits
307-- that are treated as a number, giving eight possibilities.
308-- Currently only six are used (default plus the five values
309-- above). This means that if you set more than one newline
310-- option, the combination may or may not be sensible. For
311-- example, 'newline_cr' with 'newline_lf' is equivalent to
312-- 'newline_crlf', but other combinations may yield unused numbers and
313-- cause an error.
314--
315-- The only time that a line break is specially recognized
316-- when compiling a pattern is if 'extended' is set, and
317-- an unescaped \# outside a character class is encountered.
318-- This indicates a comment that lasts until after the next
319-- line break sequence. In other circumstances, line break
320-- sequences are treated as literal data, except that in
321-- 'extended' mode, both CR and LF are treated as whitespace characters
322-- and are therefore ignored.  --
323--
324-- The newline option that is set at compile time becomes the
325-- default that is used for 'exec' but it can be overridden.
326--
327-- newline_any        :: PCREOption
328-- newline_any        = PCREOption newline_any_cint
329
330-- | 'newline_anycrlf', see 'newline_any'
331-- newline_anycrlf    :: PCREOption
332-- newline_anycrlf    = PCREOption newline_anycrlf_cint
333
334-- | 'newline_cr', see 'newline_any'
335newline_cr         :: PCREOption
336newline_cr         = PCREOption newline_cr_cint
337
338-- | 'newline_crlf', see 'newline_any'
339newline_crlf       :: PCREOption
340newline_crlf       = PCREOption newline_crlf_cint
341
342-- | 'newline_lf', see 'newline_any'
343newline_lf         :: PCREOption
344newline_lf         = PCREOption newline_lf_cint
345
346-- | 'no_auto_capture'
347--
348-- If this option is set, it disables the use of numbered
349-- capturing parentheses in the pattern. Any opening paren-
350-- thesis that is not followed by ? behaves as if it were
351-- followed by ?: but named parentheses can still be used for
352-- capturing (and they acquire numbers in the usual way).
353-- There is no equivalent of this option in Perl.
354--
355no_auto_capture    :: PCREOption
356no_auto_capture    = PCREOption no_auto_capture_cint
357
358-- | 'ungreedy'
359--
360-- This option inverts the /greediness/ of the quantifiers so
361-- that they are not greedy by default, but become greedy if
362-- followed by /?/. It is not compatible with Perl. It can
363-- also be set by a (?U) option setting within the pattern.
364--
365ungreedy           :: PCREOption
366ungreedy           = PCREOption ungreedy_cint
367
368-- | 'utf8'
369--
370-- This option causes PCRE to regard both the pattern and the
371-- subject as strings of UTF-8 characters instead of single-byte character
372-- strings. However, it is available only when
373-- PCRE is built to include UTF-8 support. If not, the use of
374-- this option provokes an error. Details of how this option
375-- changes the behaviour of PCRE are given in the section on
376-- UTF-8 support in the main pcre page.
377--
378utf8               :: PCREOption
379utf8               = PCREOption utf8_cint
380
381-- | 'no_utf8_check'
382--
383-- When PCRE_UTF8 is set, the validity of the pattern as a
384-- UTF-8 string is automatically checked. There is a discussion
385-- about the validity of UTF-8 strings in the main pcre
386-- page. If an invalid UTF-8 sequence of bytes is found,
387-- compile() returns an error. If you already know that
388-- your pattern is valid, and you want to skip this check for
389-- performance reasons, you can set the 'no_utf8_check'
390-- option. When it is set, the effect of passing an invalid
391-- UTF-8 string as a pattern is undefined. It may cause your
392-- program to crash. Note that this option can also be passed
393-- to 'exec', to suppress the UTF-8 validity checking of subject strings.
394--
395no_utf8_check      :: PCREOption
396no_utf8_check      = PCREOption no_utf8_check_cint
397
398-- Internal name for hsc2hs to bind to.
399type PCREOption_ = CInt
400
401-- PCRE compile options, as CInts
402#{enum PCREOption_,
403  , anchored_cint        = PCRE_ANCHORED
404  , auto_callout_cint    = PCRE_AUTO_CALLOUT
405  , caseless_cint        = PCRE_CASELESS
406  , dollar_endonly_cint  = PCRE_DOLLAR_ENDONLY
407  , dotall_cint          = PCRE_DOTALL
408  , dupnames_cint        = PCRE_DUPNAMES
409  , extended_cint        = PCRE_EXTENDED
410  , extra_cint           = PCRE_EXTRA
411  , firstline_cint       = PCRE_FIRSTLINE
412  , multiline_cint       = PCRE_MULTILINE
413  , newline_cr_cint      = PCRE_NEWLINE_CR
414  , newline_crlf_cint    = PCRE_NEWLINE_CRLF
415  , newline_lf_cint      = PCRE_NEWLINE_LF
416  , no_auto_capture_cint = PCRE_NO_AUTO_CAPTURE
417  , ungreedy_cint        = PCRE_UNGREEDY
418  , utf8_cint            = PCRE_UTF8
419  , no_utf8_check_cint   = PCRE_NO_UTF8_CHECK
420  }
421
422--  , bsr_anycrlf_cint     = PCRE_BSR_ANYCRLF
423--  , bsr_unicode_cint     = PCRE_BSR_UNICODE
424--  , newline_any_cint     = PCRE_NEWLINE_ANY
425--  , newline_anycrlf_cint = PCRE_NEWLINE_ANYCRLF
426
427------------------------------------------------------------------------
428
429-- | PCRE exec options, to be passed to exec
430newtype PCREExecOption = PCREExecOption { unPCREExecOption :: PCREExecOption_ }
431#if __GLASGOW_HASKELL__
432    deriving (Eq,Ord,Show,Read)
433#endif
434
435-- | Combine a list of exec options into a single option, using bitwise (.|.)
436combineExecOptions :: [PCREExecOption] -> PCREExecOption
437combineExecOptions = PCREExecOption . foldr ((.|.) . unPCREExecOption) 0
438
439-- | 'anchored'.
440--
441-- The 'anchored' option limits 'exec' to matching at
442-- the first matching position. If a pattern was compiled
443-- with 'anchored', or turned out to be anchored by virtue
444-- of its contents, it cannot be made unachored at matching
445-- time.
446exec_anchored              :: PCREExecOption
447exec_anchored              = PCREExecOption exec_anchored_cint
448
449-- | 'newline_cr', 'newline_lf',
450-- 'newline_crlf', 'newline_anycrlf', 'newline_any'
451--
452-- These options override the newline definition that was
453-- chosen or defaulted when the pattern was compiled. For
454-- details, see the description of 'compile' above. Dur-
455-- ing matching, the newline choice affects the behaviour of
456-- the dot, circumflex, and dollar metacharacters. It may
457-- also alter the way the match position is advanced after a
458-- match failure for an unanchored pattern.
459--
460-- When 'newline_crlf', 'newline_anycrlf', or 'newline_any'
461-- is set, and a match attempt for an unanchored
462-- pattern fails when the current position is at a CRLF
463-- sequence, and the pattern contains no explicit matches for
464-- CR or LF characters, the match position is advanced by two
465-- characters instead of one, in other words, to after the
466-- CRLF.
467--
468-- The above rule is a compromise that makes the most common
469-- cases work as expected. For example, if the pattern is .+A
470-- (and the 'dotall' option is not set), it does not match
471-- the string /\\r\\nA/ because, after failing at the start, it
472-- skips both the CR and the LF before retrying. However, the
473-- pattern /[\\r\\n]A/ does match that string, because it contains
474-- an explicit CR or LF reference, and so advances only
475-- by one character after the first failure.
476--
477-- An explicit match for CR of LF is either a literal appear-
478-- ance of one of those characters, or one of the \\r or \\n
479-- escape sequences. Implicit matches such as [^X] do not
480-- count, nor does \\s (which includes CR and LF in the char-
481-- acters that it matches).
482--
483-- Notwithstanding the above, anomalous effects may still
484-- occur when CRLF is a valid newline sequence and explicit
485-- \\r or \\n escapes appear in the pattern.
486--
487-- exec_newline_any           :: PCREExecOption
488-- exec_newline_any           = PCREExecOption exec_newline_any_cint
489
490-- | 'exec_newline_anycrlf', see 'exec_newline_any'
491-- exec_newline_anycrlf       :: PCREExecOption
492-- exec_newline_anycrlf       = PCREExecOption exec_newline_anycrlf_cint
493
494-- | 'exec_newline_cr', see 'exec_newline_any'
495exec_newline_cr            :: PCREExecOption
496exec_newline_cr            = PCREExecOption exec_newline_cr_cint
497
498-- | 'exec_newline_crlf', see 'exec_newline_any'
499exec_newline_crlf          :: PCREExecOption
500exec_newline_crlf          = PCREExecOption exec_newline_crlf_cint
501
502-- | 'exec_newline_lf', see 'exec_newline_any'
503exec_newline_lf            :: PCREExecOption
504exec_newline_lf            = PCREExecOption exec_newline_lf_cint
505
506-- | 'PCRE_NOTBOL'
507--
508-- This option specifies that first character of the subject
509-- string is not the beginning of a line, so the circumflex
510-- metacharacter should not match before it. Setting this
511-- without 'multiline' (at compile time) causes circumflex
512-- never to match. This option affects only the behaviour of
513-- the circumflex metacharacter. It does not affect \\A.
514--
515exec_notbol                :: PCREExecOption
516exec_notbol                = PCREExecOption exec_notbol_cint
517
518-- | 'noteol'
519--
520-- This option specifies that the end of the subject string
521-- is not the end of a line, so the dollar metacharacter
522-- should not match it nor (except in multiline mode) a newline
523-- immediately before it. Setting this without 'multiline'
524-- (at compile time) causes dollar never to match.
525-- This option affects only the behaviour of the dollar
526-- metacharacter. It does not affect \\Z or \\z.
527--
528exec_noteol                :: PCREExecOption
529exec_noteol                = PCREExecOption exec_noteol_cint
530
531-- | PCRE_NOTEMPTY
532--
533-- An empty string is not considered to be a valid match if
534-- this option is set. If there are alternatives in the pattern,
535-- they are tried. If all the alternatives match the
536-- empty string, the entire match fails. For example, if the
537-- pattern
538--
539-- > a?b?
540--
541-- is applied to a string not beginning with /a/ or /b/, it
542-- matches the empty string at the start of the subject. With
543-- 'notempty' set, this match is not valid, so 'PCRE
544-- searches further into the string for occurrences of /a/ or
545-- /b/.
546--
547-- Perl has no direct equivalent of 'notempty', but it
548-- does make a special case of a pattern match of the empty
549-- string within its split() function, and when using the \/g
550-- modifier. It is possible to emulate Perl's behaviour after
551-- matching a null string by first trying the match again at
552-- the same offset with PCRE_NOTEMPTY and PCRE_ANCHORED, and
553-- then if that fails by advancing the starting offset (see
554-- below) and trying an ordinary match again. There is some
555-- code that demonstrates how to do this in the pcredemo.c
556-- sample program.
557--
558exec_notempty              :: PCREExecOption
559exec_notempty              = PCREExecOption exec_notempty_cint
560
561-- | 'no_utf8_check'
562--
563-- When 'utf8' is set at compile time, the validity of the
564-- subject as a UTF-8 string is automatically checked when
565-- exec() is subsequently called. The value of
566-- startoffset is also checked to ensure that it points to
567-- the start of a UTF-8 character. There is a discussion
568-- about the validity of UTF-8 strings in the section on
569-- UTF-8 support in the main pcre page. If an invalid UTF-8
570-- sequence of bytes is found, exec() returns the error
571-- 'error_badutf8'. If startoffset contains an invalid
572-- value, 'error_badutf8_offset' is returned.
573--
574-- If you already know that your subject is valid, and you
575-- want to skip these checks for performance reasons, you can
576-- set the 'no_utf8_check' option when calling
577-- 'exec'. You might want to do this for the second and
578-- subsequent calls to exec() if you are making repeated
579-- calls to find all the matches in a single subject string.
580-- However, you should be sure that the value of startoffset
581-- points to the start of a UTF-8 character. When
582-- 'no_utf8_check' is set, the effect of passing an
583-- invalid UTF-8 string as a subject, or a value of startoff-
584-- set that does not point to the start of a UTF-8 character,
585-- is undefined. Your program may crash.
586--
587exec_no_utf8_check         :: PCREExecOption
588exec_no_utf8_check         = PCREExecOption exec_no_utf8_check_cint
589
590-- | 'partial'
591--
592-- This option turns on the partial matching feature. If the
593-- subject string fails to match the pattern, but at some
594-- point during the matching process the end of the subject
595-- was reached (that is, the subject partially matches the
596-- pattern and the failure to match occurred only because
597-- there were not enough subject characters), 'exec'
598-- returns 'error_partial' instead of 'error_nomatch'.
599-- When 'partial' is used, there are restrictions on what
600-- may appear in the pattern. These are discussed in the
601-- pcrepartial documentation.
602--
603exec_partial               :: PCREExecOption
604exec_partial               = PCREExecOption exec_partial_cint
605
606-- Internal name for hsc2hs to bind to.
607type PCREExecOption_ = CInt
608
609-- PCRE exec options
610#{enum PCREExecOption_,
611  , exec_anchored_cint        = PCRE_ANCHORED
612  , exec_newline_cr_cint      = PCRE_NEWLINE_CR
613  , exec_newline_crlf_cint    = PCRE_NEWLINE_CRLF
614  , exec_newline_lf_cint      = PCRE_NEWLINE_LF
615  , exec_notbol_cint          = PCRE_NOTBOL
616  , exec_noteol_cint          = PCRE_NOTEOL
617  , exec_notempty_cint        = PCRE_NOTEMPTY
618  , exec_no_utf8_check_cint   = PCRE_NO_UTF8_CHECK
619  , exec_partial_cint         = PCRE_PARTIAL
620  }
621
622--  , exec_newline_any_cint     = PCRE_NEWLINE_ANY
623--  , exec_newline_anycrlf_cint = PCRE_NEWLINE_ANYCRLF
624--  , dfa_shortest   = PCRE_DFA_SHORTEST
625--  , dfa_restart    = PCRE_DFA_RESTART
626
627------------------------------------------------------------------------
628
629-- | A type for PCRE Errors: exec-time error codes.
630type PCREError = CInt
631
632#{enum PCREError,
633    , error_nomatch        = PCRE_ERROR_NOMATCH
634    , error_null           = PCRE_ERROR_NULL
635    , error_badoption      = PCRE_ERROR_BADOPTION
636    , error_badmagic       = PCRE_ERROR_BADMAGIC
637    , error_unknown_node   = PCRE_ERROR_UNKNOWN_NODE
638    , error_nomemory       = PCRE_ERROR_NOMEMORY
639    , error_nosubstring    = PCRE_ERROR_NOSUBSTRING
640    , error_matchlimit     = PCRE_ERROR_MATCHLIMIT
641    , error_callout        = PCRE_ERROR_CALLOUT
642    , error_badutf8        = PCRE_ERROR_BADUTF8
643    , error_badutf8_offset = PCRE_ERROR_BADUTF8_OFFSET
644    , error_partial        = PCRE_ERROR_PARTIAL
645    , error_badpartial     = PCRE_ERROR_BADPARTIAL
646    , error_internal       = PCRE_ERROR_INTERNAL
647    , error_badcount       = PCRE_ERROR_BADCOUNT
648    , error_dfa_uitem      = PCRE_ERROR_DFA_UITEM
649    , error_dfa_ucond      = PCRE_ERROR_DFA_UCOND
650    , error_dfa_umlimit    = PCRE_ERROR_DFA_UMLIMIT
651    , error_dfa_wssize     = PCRE_ERROR_DFA_WSSIZE
652    , error_dfa_recurse    = PCRE_ERROR_DFA_RECURSE
653    , error_recursionlimit = PCRE_ERROR_RECURSIONLIMIT
654    }
655
656--  , error_unknown_opcode = PCRE_ERROR_UNKNOWN_OPCODE
657--  , error_nullwslimit    = PCRE_ERROR_NULLWSLIMIT
658--  , error_badnewline     = PCRE_ERROR_BADNEWLINE
659
660------------------------------------------------------------------------
661-- Request types for fullinfo() */
662
663-- | PCRE Info requests -- provides information about the compiled pattern.
664type PCREInfo = CInt
665
666#{enum PCREInfo,
667    , info_options         = PCRE_INFO_OPTIONS
668    , info_size            = PCRE_INFO_SIZE
669    , info_capturecount    = PCRE_INFO_CAPTURECOUNT
670    , info_backrefmax      = PCRE_INFO_BACKREFMAX
671    , info_firstbyte       = PCRE_INFO_FIRSTBYTE
672    , info_firstchar       = PCRE_INFO_FIRSTCHAR
673    , info_firsttable      = PCRE_INFO_FIRSTTABLE
674    , info_lastliteral     = PCRE_INFO_LASTLITERAL
675    , info_nameentrysize   = PCRE_INFO_NAMEENTRYSIZE
676    , info_namecount       = PCRE_INFO_NAMECOUNT
677    , info_nametable       = PCRE_INFO_NAMETABLE
678    , info_studysize       = PCRE_INFO_STUDYSIZE
679    , info_default_tables  = PCRE_INFO_DEFAULT_TABLES
680    }
681
682--  , info_okpartial       = PCRE_INFO_OKPARTIAL
683--  , info_jchanged        = PCRE_INFO_JCHANGED
684--  , info_hascrorlf       = PCRE_INFO_HASCRORLF
685
686------------------------------------------------------------------------
687
688-- | Request types for config()
689type PCREConfig = CInt
690
691#{enum PCREConfig,
692    , config_utf8                   = PCRE_CONFIG_UTF8
693    , config_newline                = PCRE_CONFIG_NEWLINE
694    , config_link_size              = PCRE_CONFIG_LINK_SIZE
695    , config_posix_malloc_threshold = PCRE_CONFIG_POSIX_MALLOC_THRESHOLD
696    , config_match_limit            = PCRE_CONFIG_MATCH_LIMIT
697    , config_stackrecurse           = PCRE_CONFIG_STACKRECURSE
698    , config_unicode_properties     = PCRE_CONFIG_UNICODE_PROPERTIES
699    , config_match_limit_recursion  = PCRE_CONFIG_MATCH_LIMIT_RECURSION
700    }
701
702-- Not portable
703--  , config_bsr                    = PCRE_CONFIG_BSR
704
705
706------------------------------------------------------------------------
707
708-- | PCREExtra.
709-- A extra structure contains the following fields:
710--
711-- * flags        Bits indicating which fields are set
712-- * study_data   Opaque data from study()
713-- * match_limit  Limit on internal resource use
714-- * match_limit_recursion  Limit on internal recursion depth
715-- * callout_data Opaque data passed back to callouts
716-- * tables       Points to character tables or is NULL
717--
718type PCREExtra = ()
719
720-- | PCREExtraFlags. bit flags for extra structure.
721type PCREExtraFlags = CInt
722
723-- Bit flags for the extra structure. Do not re-arrange or redefine
724-- these bits, just add new ones on the end, in order to remain compatible. */
725#{enum PCREExtraFlags,
726    , extra_study_data            = PCRE_EXTRA_STUDY_DATA
727    , extra_match_limit           = PCRE_EXTRA_MATCH_LIMIT
728    , extra_callout_data          = PCRE_EXTRA_CALLOUT_DATA
729    , extra_tables                = PCRE_EXTRA_TABLES
730    , extra_match_limit_recursion = PCRE_EXTRA_MATCH_LIMIT_RECURSION
731    }
732
733-- PCRE_EXP_DECL pcre *compile(const char *, int, const char **, int *, const unsigned char *);
734-- PCRE_EXP_DECL int  config(int, void *);
735-- PCRE_EXP_DECL int  exec(const pcre *, const extra *, PCRE_SPTR, int, int, int, int *, int);
736
737------------------------------------------------------------------------
738-- C api
739
740{-
741   pcre *pcre_compile(const char *pattern, int options,
742               const char **errptr, int *erroffset,
743               const unsigned char *tableptr);
744-}
745
746-- | Compile a pattern to an internal form. The pattern is a C string
747-- terminated by a binary zero. A pointer to a single block of memory that is
748-- obtained via pcre_malloc is returned. It is up to the caller to free
749-- the memory (via pcre_free) when it is no longer required
750--
751-- The options argument contains various bit settings that affect the
752-- compilation. It should be zero if no options are required.
753--
754-- If errptr is NULL, pcre_compile() returns NULL immediately.
755-- Otherwise, if compilation of a pattern fails, pcre_compile() returns NULL,
756-- and sets the variable pointed to by errptr to point to a textual error
757-- message.
758--
759-- The offset from the start of the pattern to the character where the error
760-- was discovered is placed in the variable pointed to by erroffset, which must
761-- not be NULL.
762--
763foreign import ccall unsafe "pcre.h pcre_compile"
764    c_pcre_compile  :: CString
765                    -> PCREOption
766                    -> Ptr CString
767                    -> Ptr CInt
768                    -> Ptr Word8
769                    -> IO (Ptr PCRE)
770
771-- Additional fields to c_pcre_compile:
772--
773--    errptr        Where to put an error message
774--    erroffset     Offset in pattern where error was found
775--    tableptr      Pointer to character tables, or NULL to to use built in
776
777{-
778       int pcre_exec(const pcre *code, const pcre_extra *extra,
779            const char *subject, int length, int startoffset,
780            int options, int *ovector, int ovecsize);
781-}
782
783-- | This function matches a compiled regular expression
784-- against a given subject string, using a matching algorithm
785-- that is similar to Perl's. It returns offsets to captured
786-- substrings.
787--
788-- Its arguments are, in order:
789--
790-- * 'code'        Points to the compiled pattern (result of pcre_compile)
791--
792-- * 'extra'       Points to an associated pcre_extra structure (result of pcre_study), or is NULL
793--
794-- * 'subject'      Points to the subject string
795--
796-- * 'length'       Length of the subject string, in bytes
797--
798-- * 'startoffset'  Offset in bytes in the subject at which to start matching
799--
800-- * 'options'      Option bits
801--
802-- * 'ovector'      Points to a vector of ints for result substrings
803--
804-- * 'ovecsize'     Number of elements in the vector (a  multiple of 3)
805--
806-- Note, subject not required to be null terminated.
807--
808foreign import ccall unsafe "pcre.h pcre_exec"
809    c_pcre_exec     :: Ptr PCRE
810                    -> Ptr PCREExtra
811                    -> Ptr Word8
812                    -> CInt
813                    -> CInt
814                    -> PCREExecOption
815                    -> Ptr CInt
816                    -> CInt
817                    -> IO CInt
818
819-- | Return information about a compiled pattern
820foreign import ccall unsafe "pcre.h pcre_fullinfo"
821    c_pcre_fullinfo :: Ptr PCRE
822                    -> Ptr PCREExtra
823                    -> PCREInfo
824                    -> Ptr a
825                    -> IO CInt
826
827