/* Definitions for data structures and routines for the regular
   expression library, version 0.12.

   Copyright (C) 1985-2018 Free Software Foundation, Inc.

   This file is part of the GNU C Library.  Its master source is NOT part of
   the C library, however.  The master source lives in /gd/gnu/lib.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301 USA.  */

#ifndef _REGEX_H
#define _REGEX_H 1

/* Allow the use in C++ code.  */
#ifdef __cplusplus
extern "C" {
#endif

/* POSIX says that <sys/types.h> must be included (by the caller) before
   <regex.h>.  This header nevertheless uses `size_t' in its own
   declarations, so <stddef.h> is included unconditionally: that keeps
   the header self-contained and also covers systems (such as VMS) whose
   <sys/types.h> does not provide `size_t'.  */
#include <stddef.h>

/* The following two types have to be signed and unsigned integer type
   wide enough to hold a value of a pointer.  For most ANSI compilers
   ptrdiff_t and size_t should be likely OK.  Still size of these two
   types is 2 for Microsoft C.  Ugh... */
typedef long int s_reg_t;
typedef unsigned long int active_reg_t;

/* The following bits are used to determine the regexp syntax we
   recognize.  The set/not-set meanings are chosen so that Emacs syntax
   remains the value 0.  The bits are given in alphabetical order, and
   the definitions shifted by one from the previous bit; thus, when we
   add or remove a bit, only one other definition need change.  */
typedef unsigned long int reg_syntax_t;

/* If this bit is not set, then \ inside a bracket expression is literal.
   If set, then such a \ quotes the following character.  */
#define RE_BACKSLASH_ESCAPE_IN_LISTS ((unsigned long int) 1)

/* If this bit is not set, then + and ? are operators, and \+ and \? are
     literals.
   If set, then \+ and \? are operators and + and ? are literals.  */
#define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1)

/* If this bit is set, then character classes are supported.  They are:
     [:alpha:], [:upper:], [:lower:],  [:digit:], [:alnum:], [:xdigit:],
     [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:].
   If not set, then character classes are not supported.  */
#define RE_CHAR_CLASSES (RE_BK_PLUS_QM << 1)

/* If this bit is set, then ^ and $ are always anchors (outside bracket
     expressions, of course).
   If this bit is not set, then it depends:
        ^  is an anchor if it is at the beginning of a regular
           expression or after an open-group or an alternation operator;
        $  is an anchor if it is at the end of a regular expression, or
           before a close-group or an alternation operator.

   This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because
   POSIX draft 11.2 says that * etc. in leading positions is undefined.
   We already implemented a previous draft which made those constructs
   invalid, though, so we haven't changed the code back.  */
#define RE_CONTEXT_INDEP_ANCHORS (RE_CHAR_CLASSES << 1)

/* If this bit is set, then special characters are always special
     regardless of where they are in the pattern.
   If this bit is not set, then special characters are special only in
     some contexts; otherwise they are ordinary.  Specifically,
     * + ? and intervals are only special when not after the beginning,
     open-group, or alternation operator.  */
#define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1)

/* If this bit is set, then *, +, ?, and { cannot be first in an re or
     immediately after an alternation or begin-group operator.  */
#define RE_CONTEXT_INVALID_OPS (RE_CONTEXT_INDEP_OPS << 1)

/* If this bit is set, then . matches newline.
   If not set, then it doesn't.  */
#define RE_DOT_NEWLINE (RE_CONTEXT_INVALID_OPS << 1)

/* If this bit is set, then . doesn't match NUL.
   If not set, then it does.  */
#define RE_DOT_NOT_NULL (RE_DOT_NEWLINE << 1)

/* If this bit is set, nonmatching lists [^...] do not match newline.
   If not set, they do.  */
#define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1)

/* If this bit is set, either \{...\} or {...} defines an
     interval, depending on RE_NO_BK_BRACES.
   If not set, \{, \}, {, and } are literals.  */
#define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1)

/* If this bit is set, +, ? and | aren't recognized as operators.
   If not set, they are.  */
#define RE_LIMITED_OPS (RE_INTERVALS << 1)

/* If this bit is set, newline is an alternation operator.
   If not set, newline is literal.  */
#define RE_NEWLINE_ALT (RE_LIMITED_OPS << 1)

/* If this bit is set, then `{...}' defines an interval, and \{ and \}
     are literals.
  If not set, then `\{...\}' defines an interval.  */
#define RE_NO_BK_BRACES (RE_NEWLINE_ALT << 1)

/* If this bit is set, (...) defines a group, and \( and \) are literals.
   If not set, \(...\) defines a group, and ( and ) are literals.  */
#define RE_NO_BK_PARENS (RE_NO_BK_BRACES << 1)

/* If this bit is set, then \<digit> matches <digit>.
   If not set, then \<digit> is a back-reference.  */
#define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1)

/* If this bit is set, then | is an alternation operator, and \| is literal.
   If not set, then \| is an alternation operator, and | is literal.  */
#define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1)

/* If this bit is set, then an ending range point collating lower
     than the starting range point, as in [z-a], is invalid.
   If not set, then when the ending range point collates lower than the
     starting range point, the range is ignored.  */
#define RE_NO_EMPTY_RANGES (RE_NO_BK_VBAR << 1)

/* If this bit is set, then an unmatched ) is ordinary.
   If not set, then an unmatched ) is invalid.  */
#define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_EMPTY_RANGES << 1)

/* If this bit is set, succeed as soon as we match the whole pattern,
   without further backtracking.  */
#define RE_NO_POSIX_BACKTRACKING (RE_UNMATCHED_RIGHT_PAREN_ORD << 1)

/* If this bit is set, do not process the GNU regex operators.
   If not set, then the GNU regex operators are recognized.  */
#define RE_NO_GNU_OPS (RE_NO_POSIX_BACKTRACKING << 1)

/* If this bit is set, turn on internal regex debugging.
   If not set, and debugging was on, turn it off.
   This only works if regex.c is compiled -DDEBUG.
   We define this bit always, so that all that's needed to turn on
   debugging is to recompile regex.c; the calling code can always have
   this bit set, and it won't affect anything in the normal case.  */
#define RE_DEBUG (RE_NO_GNU_OPS << 1)

/* If this bit is set, a syntactically invalid interval is treated as
   a string of ordinary characters.  For example, the ERE 'a{1' is
   treated as 'a\{1'.  */
#define RE_INVALID_INTERVAL_ORD (RE_DEBUG << 1)

/* This global variable defines the particular regexp syntax to use (for
   some interfaces).  When a regexp is compiled, the syntax used is
   stored in the pattern buffer, so changing this does not affect
   already-compiled regexps.  */
extern reg_syntax_t re_syntax_options;

/* Define combinations of the above bits for the standard possibilities.
   (The [[[ comments delimit what gets put into the Texinfo file, so
   don't delete them!)  */
/* [[[begin syntaxes]]] */
#define RE_SYNTAX_EMACS 0

#define RE_SYNTAX_AWK							\
  (RE_BACKSLASH_ESCAPE_IN_LISTS   | RE_DOT_NOT_NULL			\
   | RE_NO_BK_PARENS              | RE_NO_BK_REFS			\
   | RE_NO_BK_VBAR                | RE_NO_EMPTY_RANGES			\
   | RE_DOT_NEWLINE		  | RE_CONTEXT_INDEP_ANCHORS		\
   | RE_UNMATCHED_RIGHT_PAREN_ORD | RE_NO_GNU_OPS)

#define RE_SYNTAX_GNU_AWK						\
  ((RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DEBUG)	\
   & ~(RE_DOT_NOT_NULL | RE_INTERVALS | RE_CONTEXT_INDEP_OPS))

#define RE_SYNTAX_POSIX_AWK						\
  (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS		\
   | RE_INTERVALS	    | RE_NO_GNU_OPS)

#define RE_SYNTAX_GREP							\
  (RE_BK_PLUS_QM              | RE_CHAR_CLASSES				\
   | RE_HAT_LISTS_NOT_NEWLINE | RE_INTERVALS				\
   | RE_NEWLINE_ALT)

#define RE_SYNTAX_EGREP							\
  (RE_CHAR_CLASSES        | RE_CONTEXT_INDEP_ANCHORS			\
   | RE_CONTEXT_INDEP_OPS | RE_HAT_LISTS_NOT_NEWLINE			\
   | RE_NEWLINE_ALT       | RE_NO_BK_PARENS				\
   | RE_NO_BK_VBAR)

#define RE_SYNTAX_POSIX_EGREP						\
  (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES			\
   | RE_INVALID_INTERVAL_ORD)

/* P1003.2/D11.2, section 4.20.7.1, lines 5078ff.  */
#define RE_SYNTAX_ED RE_SYNTAX_POSIX_BASIC

#define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC

/* Syntax bits common to both basic and extended POSIX regex syntax.  */
#define _RE_SYNTAX_POSIX_COMMON						\
  (RE_CHAR_CLASSES | RE_DOT_NEWLINE      | RE_DOT_NOT_NULL		\
   | RE_INTERVALS  | RE_NO_EMPTY_RANGES)

#define RE_SYNTAX_POSIX_BASIC						\
  (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM)

/* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes
   RE_LIMITED_OPS, i.e., \? \+ \| are not recognized.  Actually, this
   isn't minimal, since other operators, such as \`, aren't disabled.  */
#define RE_SYNTAX_POSIX_MINIMAL_BASIC					\
  (_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS)

#define RE_SYNTAX_POSIX_EXTENDED					\
  (_RE_SYNTAX_POSIX_COMMON  | RE_CONTEXT_INDEP_ANCHORS			\
   | RE_CONTEXT_INDEP_OPS   | RE_NO_BK_BRACES				\
   | RE_NO_BK_PARENS        | RE_NO_BK_VBAR				\
   | RE_CONTEXT_INVALID_OPS | RE_UNMATCHED_RIGHT_PAREN_ORD)

/* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INDEP_OPS is
   removed and RE_NO_BK_REFS is added.  */
#define RE_SYNTAX_POSIX_MINIMAL_EXTENDED				\
  (_RE_SYNTAX_POSIX_COMMON  | RE_CONTEXT_INDEP_ANCHORS			\
   | RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES				\
   | RE_NO_BK_PARENS        | RE_NO_BK_REFS				\
   | RE_NO_BK_VBAR	    | RE_UNMATCHED_RIGHT_PAREN_ORD)
/* [[[end syntaxes]]] */

/* Maximum number of duplicates an interval can allow.  Some systems
   (erroneously) define this in other header files, but we want our
   value, so remove any previous define.  */
#ifdef RE_DUP_MAX
# undef RE_DUP_MAX
#endif
/* If sizeof(int) == 2, then ((1 << 15) - 1) overflows.  */
#define RE_DUP_MAX (0x7fff)


/* POSIX `cflags' bits (i.e., information for `regcomp').  */

/* If this bit is set, then use extended regular expression syntax.
   If not set, then use basic regular expression syntax.  */
#define REG_EXTENDED 1

/* If this bit is set, then ignore case when matching.
   If not set, then case is significant.  */
#define REG_ICASE (REG_EXTENDED << 1)

/* If this bit is set, then newlines in the string are treated
     specially: anchors match at newline characters, and neither `.'
     nor a nonmatching list matches a newline.
   If not set, then anchors match only at the ends of the string and
     newline is an ordinary character.  */
#define REG_NEWLINE (REG_ICASE << 1)

/* If this bit is set, then report only success or fail in regexec.
   If not set, then regexec also reports the offsets of the matched
     substrings through its `pmatch' argument.  */
#define REG_NOSUB (REG_NEWLINE << 1)


/* POSIX `eflags' bits (i.e., information for regexec).  */

/* If this bit is set, then the beginning-of-line operator doesn't match
     the beginning of the string (presumably because it's not the
     beginning of a line).
   If not set, then the beginning-of-line operator does match the
     beginning of the string.  */
#define REG_NOTBOL 1

/* Like REG_NOTBOL, except for the end-of-line.  */
#define REG_NOTEOL (1 << 1)


/* If any error codes are removed, changed, or added, update the
   `re_error_msg' table in regex.c.  */
typedef enum
{
#ifdef _XOPEN_SOURCE
  REG_ENOSYS = -1,	/* This will never happen for this implementation.  */
#endif

  REG_NOERROR = 0,	/* Success.  */
  REG_NOMATCH,		/* Didn't find a match (for regexec).  */

  /* POSIX regcomp return error codes.  (In the order listed in the
     standard.)  */
  REG_BADPAT,		/* Invalid pattern.  */
  REG_ECOLLATE,		/* Not implemented.  */
  REG_ECTYPE,		/* Invalid character class name.  */
  REG_EESCAPE,		/* Trailing backslash.  */
  REG_ESUBREG,		/* Invalid back reference.  */
  REG_EBRACK,		/* Unmatched left bracket.  */
  REG_EPAREN,		/* Parenthesis imbalance.  */
  REG_EBRACE,		/* Unmatched \{.  */
  REG_BADBR,		/* Invalid contents of \{\}.  */
  REG_ERANGE,		/* Invalid range end.  */
  REG_ESPACE,		/* Ran out of memory.  */
  REG_BADRPT,		/* No preceding re for repetition op.  */

  /* Error codes we've added.  */
  REG_EEND,		/* Premature end.  */
  REG_ESIZE,		/* Compiled pattern bigger than 2^16 bytes.  */
  REG_ERPAREN		/* Unmatched ) or \); not returned from regcomp.  */
} reg_errcode_t;

/* This data structure represents a compiled pattern.  Before calling
   the pattern compiler, the fields `buffer', `allocated', `fastmap',
   `translate', and `no_sub' can be set.  After the pattern has been
   compiled, the `re_nsub' field is available.  All other fields are
   private to the regex routines.  */

#ifndef RE_TRANSLATE_TYPE
# define RE_TRANSLATE_TYPE char *
#endif

struct re_pattern_buffer
{
/* [[[begin pattern_buffer]]] */
	/* Space that holds the compiled pattern.  It is declared as
          `unsigned char *' because its elements are
           sometimes used as array indexes.  */
  unsigned char *buffer;

	/* Number of bytes to which `buffer' points.  */
  unsigned long int allocated;

	/* Number of bytes actually used in `buffer'.  */
  unsigned long int used;

        /* Syntax setting with which the pattern was compiled.  */
  reg_syntax_t syntax;

        /* Pointer to a fastmap, if any, otherwise zero.  re_search uses
           the fastmap, if there is one, to skip over impossible
           starting points for matches.  */
  char *fastmap;

        /* Either a translate table to apply to all characters before
           comparing them, or zero for no translation.  The translation
           is applied to a pattern when it is compiled and to a string
           when it is matched.  */
  RE_TRANSLATE_TYPE translate;

	/* Number of subexpressions found by the compiler.  */
  size_t re_nsub;

        /* Zero if this pattern cannot match the empty string, one else.
           Well, in truth it's used only in `re_search_2', to see
           whether or not we should use the fastmap, so we don't set
           this absolutely perfectly; see `re_compile_fastmap' (the
           `duplicate' case).  */
  unsigned can_be_null : 1;

        /* If REGS_UNALLOCATED, allocate space in the `regs' structure
             for `max (RE_NREGS, re_nsub + 1)' groups.
           If REGS_REALLOCATE, reallocate space if necessary.
           If REGS_FIXED, use what's there.  */
#define REGS_UNALLOCATED 0
#define REGS_REALLOCATE 1
#define REGS_FIXED 2
  unsigned regs_allocated : 2;

        /* Set to zero when `regex_compile' compiles a pattern; set to one
           by `re_compile_fastmap' if it updates the fastmap.  */
  unsigned fastmap_accurate : 1;

        /* If set, `re_match_2' does not return information about
           subexpressions.  */
  unsigned no_sub : 1;

        /* If set, a beginning-of-line anchor doesn't match at the
           beginning of the string.  */
  unsigned not_bol : 1;

        /* Similarly for an end-of-line anchor.  */
  unsigned not_eol : 1;

        /* If true, an anchor at a newline matches.  */
  unsigned newline_anchor : 1;

/* [[[end pattern_buffer]]] */
};

typedef struct re_pattern_buffer regex_t;

/* Type for byte offsets within the string.  POSIX mandates this.  */
typedef int regoff_t;


/* This is the structure we store register match data in.  See
   regex.texinfo for a full description of what registers match.  */
struct re_registers
{
  unsigned num_regs;
  regoff_t *start;
  regoff_t *end;
};


/* If `regs_allocated' is REGS_UNALLOCATED in the pattern buffer,
   `re_match_2' returns information about at least this many registers
   the first time a `regs' structure is passed.  */
#ifndef RE_NREGS
# define RE_NREGS 30
#endif


/* POSIX specification for registers.  Aside from the different names than
   `re_registers', POSIX uses an array of structures, instead of a
   structure of arrays.  */
typedef struct
{
  regoff_t rm_so;  /* Byte offset from string's start to substring's start.  */
  regoff_t rm_eo;  /* Byte offset from string's start to substring's end.  */
} regmatch_t;

/* Declarations for routines.  */

/* The declarations below are plain ANSI C prototypes; no
   argument-declaration macro is used.  */

/* Sets the current default syntax to SYNTAX, and return the old syntax.
   You can also simply assign to the `re_syntax_options' variable.  */
extern reg_syntax_t re_set_syntax (reg_syntax_t syntax);

/* Compile the regular expression PATTERN, with length LENGTH
   and syntax given by the global `re_syntax_options', into the buffer
   BUFFER.  Return NULL if successful, and an error string if not.  */
extern const char *re_compile_pattern (const char *pattern, size_t length,
                                       struct re_pattern_buffer *buffer);


/* Compile a fastmap for the compiled pattern in BUFFER; used to
   accelerate searches.  Return 0 if successful and -2 if there was an
   internal error.  */
extern int re_compile_fastmap (struct re_pattern_buffer *buffer);


/* Search in the string STRING (with length LENGTH) for the pattern
   compiled into BUFFER.  Start searching at position START, for RANGE
   characters.  Return the starting position of the match, -1 for no
   match, or -2 for an internal error.  Also return register
   information in REGS (if REGS is nonzero and BUFFER->no_sub is
   zero).  */
extern int re_search (struct re_pattern_buffer *buffer, const char *string,
                      int length, int start, int range,
                      struct re_registers *regs);


/* Like `re_search', but search in the concatenation of STRING1 and
   STRING2.  Also, stop searching at index START + STOP.  */
extern int re_search_2 (struct re_pattern_buffer *buffer, const char *string1,
                        int length1, const char *string2, int length2,
                        int start, int range, struct re_registers *regs,
                        int stop);


/* Like `re_search', but return how many characters in STRING the regexp
   in BUFFER matched, starting at position START.  */
extern int re_match (struct re_pattern_buffer *buffer, const char *string,
                     int length, int start, struct re_registers *regs);


/* Relates to `re_match' as `re_search_2' relates to `re_search'.  */
extern int re_match_2 (struct re_pattern_buffer *buffer, const char *string1,
                       int length1, const char *string2, int length2,
                       int start, struct re_registers *regs, int stop);


/* Set REGS to hold NUM_REGS registers, storing them in STARTS and
   ENDS.  Subsequent matches using BUFFER and REGS will use this memory
   for recording register information.  STARTS and ENDS must be
   allocated with malloc, and must each be at least `NUM_REGS * sizeof
   (regoff_t)' bytes long.

   If NUM_REGS == 0, then subsequent matches should allocate their own
   register data.

   Unless this function is called, the first search or match using
   BUFFER will allocate its own register data, without
   freeing the old data.  */
extern void re_set_registers (struct re_pattern_buffer *buffer,
                              struct re_registers *regs,
                              unsigned num_regs, regoff_t *starts,
                              regoff_t *ends);

#if defined _REGEX_RE_COMP || defined _LIBC
# ifndef _CRAY
/* 4.2 bsd compatibility.  */
extern char *re_comp (const char *);
extern int re_exec (const char *);
# endif
#endif

/* GCC 2.95 and later have "__restrict"; C99 compilers have
   "restrict", and "configure" may have defined "restrict".  */
#ifndef __restrict
# if ! (2 < __GNUC__ || (2 == __GNUC__ && 95 <= __GNUC_MINOR__))
#  if defined restrict || 199901L <= __STDC_VERSION__
#   define __restrict restrict
#  else
#   define __restrict
#  endif
# endif
#endif

/* GCC 3.1 and later support declaring arrays as non-overlapping
   using the syntax array_name[restrict]  */
#ifndef __restrict_arr
# if ! (3 < __GNUC__ || (3 == __GNUC__ && 1 <= __GNUC_MINOR__)) || defined (__GNUG__)
#  define __restrict_arr
# else
#  define __restrict_arr __restrict
# endif
#endif

/* POSIX compatibility.  */
extern int regcomp (regex_t *__restrict __preg,
                    const char *__restrict __pattern,
                    int __cflags);

#if (__GNUC__)
__extension__
#endif
extern int regexec (const regex_t *__restrict __preg,
                    const char *__restrict __string, size_t __nmatch,
                    regmatch_t __pmatch[__restrict_arr],
                    int __eflags);

extern size_t regerror (int __errcode, const regex_t *__preg,
                        char *__errbuf, size_t __errbuf_size);

extern void regfree (regex_t *__preg);


#ifdef __cplusplus
}
#endif	/* C++ */

#endif /* regex.h */

/*
Local variables:
make-backup-files: t
version-control: t
trim-versions-without-asking: nil
End:
*/