1 /******************************************************************************** 2 * * 3 * R e g u l a r E x p r e s s i o n C l a s s * 4 * * 5 ********************************************************************************* 6 * Copyright (C) 1999,2020 by Jeroen van der Zijp. All Rights Reserved. * 7 ********************************************************************************* 8 * This library is free software; you can redistribute it and/or modify * 9 * it under the terms of the GNU Lesser General Public License as published by * 10 * the Free Software Foundation; either version 3 of the License, or * 11 * (at your option) any later version. * 12 * * 13 * This library is distributed in the hope that it will be useful, * 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 16 * GNU Lesser General Public License for more details. * 17 * * 18 * You should have received a copy of the GNU Lesser General Public License * 19 * along with this program. If not, see <http://www.gnu.org/licenses/> * 20 ********************************************************************************/ 21 #ifndef FXREX_H 22 #define FXREX_H 23 24 namespace FX { 25 26 27 /** 28 * FXRex is a regular expression class implementing a NFA matcher. 29 * It supports capturing parentheses, non-capturing parentheses, positive or negative 30 * lookahead, backreferences, case-insensitive matching, counted repetitions, greedy, lazy and 31 * possessive matches, and PERL-like matching operators. 32 * The subject string may be searched forwards or backwards, and may contain any of 33 * 256 possible byte values. 34 * 35 * When parsing a regular expression pattern, the mode parameter is the bitwise OR of 36 * a set of flags and affects the match algorithm. Passing the flag Capture enables 37 * capturing parentheses and back references, and allows the matcher engine to return 38 * the locations of the string matching these sub-patterns. The flag IgnoreCase enables 39 * case-insensitive matching. 40 * 41 * When the flag Newline is passed, newlines are treated like normal characters, and 42 * not line-separators. If Newline flag is not passed, character classes such as '.', 43 * '\D', '\s', [^a-z] etc. will NOT match newlines. The flag Verbatim disables all 44 * special character interpretation, making the entire pattern a literal string to be 45 * matched against a string. 46 * 47 * When the Exact flag is passed, a match succeeds only if the entire string is matched, 48 * i.e. the entire input presented to FXRex must match against the pattern; otherwise, 49 * only a (possibly empty) substring of the input is matched against the pattern. 50 * If the NotEmpty flag is passed, the pattern must match at least one character in order 51 * to succeed, and empty matches are considered non-matching. 52 * 53 * If the flag Syntax will check the pattern for correct syntax only, and not generate a 54 * matching engine; it will just reset the engine to the empty pattern; use this flag to 55 * verify the syntax of the pattern without compiling it. 56 * 57 * When matching a compiled pattern, the mode parameter is the bitwise OR of a set of 58 * flags that affects how the match is performed. Passing the flags NotBol and/or NotEol 59 * causes the begin and end of the subject string NOT to be considered a line start or 60 * line end. 61 * 62 * Patterns which cause inordinate amounts of recursion may cause FXRex to fail where 63 * otherwise it would succeed to match. 64 * FXRex uses no global variables, and thus multiple threads may simultaneously use it; 65 * moreover, multiple threads may use the same instance to perform a match. 66 */ 67 class FXAPI FXRex { 68 private: 69 FXuchar *code; 70 private: 71 static const FXchar *const errors[]; 72 static const FXuchar fallback[]; 73 public: 74 75 /// Regular expression flags 76 enum { 77 78 /// Flags for both parse and match mode 79 Normal = 0, /// Normal mode (default) 80 Unicode = 1, /// Unicode mode 81 82 /// Regular expression parse flags 83 Syntax = 2, /// Perform syntax check only 84 Verbatim = 4, /// Literal pattern mode with no magic characters 85 Capture = 8, /// Perform capturing parentheses 86 IgnoreCase = 16, /// Ignore case differences 87 Newline = 32, /// Match-any operators match newline too 88 Exact = 64, /// Exact match to entire string (\A..\Z) 89 NotEmpty = 128, /// A successful match must not be empty 90 Reverse = 256, /// Reverse expression mode 91 92 /// Regular expression match flags 93 NotBol = 512, /// Start of string is NOT begin of line 94 NotEol = 1024 /// End of string is NOT end of line 95 }; 96 97 /// Regular expression error codes 98 enum Error { 99 ErrOK = 0, /// No errors 100 ErrEmpty = 1, /// Empty pattern 101 ErrParent = 2, /// Unmatched parenthesis 102 ErrBracket = 3, /// Unmatched bracket 103 ErrBrace = 4, /// Unmatched brace 104 ErrRange = 5, /// Bad character range 105 ErrEscape = 6, /// Bad escape sequence 106 ErrCount = 7, /// Bad counted repeat 107 ErrNoAtom = 8, /// No atom preceding repetition 108 ErrRepeat = 9, /// Repeat following repeat 109 ErrBackRef = 10, /// Bad backward reference 110 ErrClass = 11, /// Bad character class 111 ErrComplex = 12, /// Expression too complex 112 ErrMemory = 13, /// Out of memory 113 ErrToken = 14, /// Illegal token 114 ErrBehind = 15, /// Bad look-behind pattern 115 ErrSupport = 16 /// Unsupported 116 }; 117 118 public: 119 120 /** 121 * Construct empty regular expression object, with the 122 * fallback program installed. 123 */ 124 FXRex(); 125 126 /** 127 * Copy regular expression object from another. 128 */ 129 FXRex(const FXRex& orig); 130 131 /// Compile expression from pattern; if error is not NULL, error code is returned 132 FXRex(const FXchar* pattern,FXint mode=Normal,Error* error=NULL); 133 134 /// Compile expression from pattern; if error is not NULL, error code is returned 135 FXRex(const FXString& pattern,FXint mode=Normal,Error* error=NULL); 136 137 /** 138 * See if regular expression is empty; the regular expression 139 * will be empty when it is unable to parse a pattern due to 140 * a syntax error. 141 */ empty()142 FXbool empty() const { return (code==fallback); } 143 144 /** 145 * Parse pattern, return error code if syntax error is found. 146 * The parse-mode flags control the compile options, and affect how 147 * the generated matcher behaves. 148 * If a parse fails, an error code is returned; in this case, the 149 * expression matcher will be set up to a fallback program. 150 */ 151 Error parse(const FXchar* pattern,FXint mode=Normal); 152 Error parse(const FXString& pattern,FXint mode=Normal); 153 154 /** 155 * Perform anchored match of subject string of length len at position pos, returning true 156 * if the pattern matches at this point. 157 * If there is a match, the pattern and subpatterns are captured in the arrays beg[] and end[] 158 * which must both be at least npar entries long. 159 */ 160 FXbool amatch(const FXchar* string,FXint len,FXint pos=0,FXint mode=Normal,FXint* beg=NULL,FXint* end=NULL,FXint npar=0) const; 161 FXbool amatch(const FXString& string,FXint pos=0,FXint mode=Normal,FXint* beg=NULL,FXint* end=NULL,FXint npar=0) const; 162 163 /** 164 * Search subject string of length len for a pattern, returning the location where the pattern 165 * is found relative from the start of the string, or -1 if there is no match. 166 * In case of a successful match, the pattern and subpatterns are captured in the arrays beg[] and end[] 167 * which must be at least npar entries long. 168 * The string is searched forwards (or backwards) starting from position fm toward to, both of which 169 * must lie inside the string. 170 */ 171 FXint search(const FXchar* string,FXint len,FXint fm,FXint to,FXint mode=Normal,FXint* beg=NULL,FXint* end=NULL,FXint npar=0) const; 172 FXint search(const FXString& string,FXint fm,FXint to,FXint mode=Normal,FXint* beg=NULL,FXint* end=NULL,FXint npar=0) const; 173 174 /** 175 * After performing a regular expression match with capturing parentheses, 176 * a substitution string is build from the replace string, where where "&" 177 * is replaced by the entire matched pattern, and "\1" through "\9" are 178 * replaced by captured expressions. The original source string and its 179 * length, and the match arrays beg and end must be passed. 180 * The replace string may also contain regular escape sequences to embed special 181 * characters. 182 */ 183 static FXString substitute(const FXchar* string,FXint len,FXint* beg,FXint* end,const FXchar* replace,FXint npar=1); 184 static FXString substitute(const FXchar* string,FXint len,FXint* beg,FXint* end,const FXString& replace,FXint npar=1); 185 static FXString substitute(const FXString& string,FXint* beg,FXint* end,const FXchar* replace,FXint npar=1); 186 static FXString substitute(const FXString& string,FXint* beg,FXint* end,const FXString& replace,FXint npar=1); 187 188 /// Returns error message text for a given error code getError(Error err)189 static const FXchar* getError(Error err){ return errors[err]; } 190 191 /// Assign another regular expression to this one 192 FXRex& operator=(const FXRex& orig); 193 194 /// Comparison operators 195 FXbool operator==(const FXRex& rex) const; 196 FXbool operator!=(const FXRex& rex) const; 197 198 /// Saving and loading 199 friend FXAPI FXStream& operator<<(FXStream& store,const FXRex& s); 200 friend FXAPI FXStream& operator>>(FXStream& store,FXRex& s); 201 202 /** 203 * Clear the expression object and reinstate the fallback program. 204 */ 205 void clear(); 206 207 /// Delete 208 ~FXRex(); 209 }; 210 211 212 // Serialization 213 extern FXAPI FXStream& operator<<(FXStream& store,const FXRex& s); 214 extern FXAPI FXStream& operator>>(FXStream& store,FXRex& s); 215 216 } 217 218 #endif 219