/********************************************************************************
*                                                                               *
*                 R e g u l a r   E x p r e s s i o n   C l a s s               *
*                                                                               *
*********************************************************************************
* Copyright (C) 1999,2020 by Jeroen van der Zijp.   All Rights Reserved.        *
*********************************************************************************
* This library is free software; you can redistribute it and/or modify          *
* it under the terms of the GNU Lesser General Public License as published by   *
* the Free Software Foundation; either version 3 of the License, or             *
* (at your option) any later version.                                           *
*                                                                               *
* This library is distributed in the hope that it will be useful,               *
* but WITHOUT ANY WARRANTY; without even the implied warranty of                *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the                 *
* GNU Lesser General Public License for more details.                           *
*                                                                               *
* You should have received a copy of the GNU Lesser General Public License      *
* along with this program.  If not, see <http://www.gnu.org/licenses/>          *
********************************************************************************/
21 #ifndef FXREX_H
22 #define FXREX_H
23 
24 namespace FX {
25 
26 
27 /**
28 * FXRex is a regular expression class implementing a NFA matcher.
29 * It supports capturing parentheses, non-capturing parentheses, positive or negative
30 * lookahead, backreferences, case-insensitive matching, counted repetitions, greedy, lazy and
31 * possessive matches, and PERL-like matching operators.
32 * The subject string may be searched forwards or backwards, and may contain any of
33 * 256 possible byte values.
34 *
35 * When parsing a regular expression pattern, the mode parameter is the bitwise OR of
36 * a set of flags and affects the match algorithm.  Passing the flag Capture enables
37 * capturing parentheses and back references, and allows the matcher engine to return
38 * the locations of the string matching these sub-patterns. The flag IgnoreCase enables
39 * case-insensitive matching.
40 *
41 * When the flag Newline is passed, newlines are treated like normal characters, and
42 * not line-separators.  If Newline flag is not passed, character classes such as '.',
43 * '\D', '\s', [^a-z] etc. will NOT match newlines.  The flag Verbatim disables all
44 * special character interpretation, making the entire pattern a literal string to be
45 * matched against a string.
46 *
47 * When the Exact flag is passed, a match succeeds only if the entire string is matched,
48 * i.e. the entire input presented to FXRex must match against the pattern; otherwise,
49 * only a (possibly empty) substring of the input is matched against the pattern.
50 * If the NotEmpty flag is passed, the pattern must match at least one character in order
51 * to succeed, and empty matches are considered non-matching.
52 *
53 * If the flag Syntax will check the pattern for correct syntax only, and not generate a
54 * matching engine; it will just reset the engine to the empty pattern; use this flag to
55 * verify the syntax of the pattern without compiling it.
56 *
57 * When matching a compiled pattern, the mode parameter is the bitwise OR of a set of
58 * flags that affects how the match is performed.  Passing the flags NotBol and/or NotEol
59 * causes the begin and end of the subject string NOT to be considered a line start or
60 * line end.
61 *
62 * Patterns which cause inordinate amounts of recursion may cause FXRex to fail where
63 * otherwise it would succeed to match.
64 * FXRex uses no global variables, and thus multiple threads may simultaneously use it;
65 * moreover, multiple threads may use the same instance to perform a match.
66 */
67 class FXAPI FXRex {
68 private:
69   FXuchar *code;
70 private:
71   static const FXchar *const errors[];
72   static const FXuchar fallback[];
73 public:
74 
75   /// Regular expression flags
76   enum {
77 
78     /// Flags for both parse and match mode
79     Normal     = 0,     /// Normal mode (default)
80     Unicode    = 1,     /// Unicode mode
81 
82     /// Regular expression parse flags
83     Syntax     = 2,     /// Perform syntax check only
84     Verbatim   = 4,     /// Literal pattern mode with no magic characters
85     Capture    = 8,     /// Perform capturing parentheses
86     IgnoreCase = 16,    /// Ignore case differences
87     Newline    = 32,    /// Match-any operators match newline too
88     Exact      = 64,    /// Exact match to entire string (\A..\Z)
89     NotEmpty   = 128,   /// A successful match must not be empty
90     Reverse    = 256,   /// Reverse expression mode
91 
92     /// Regular expression match flags
93     NotBol     = 512,   /// Start of string is NOT begin of line
94     NotEol     = 1024   /// End of string is NOT end of line
95     };
96 
97   /// Regular expression error codes
98   enum Error {
99     ErrOK      = 0,     /// No errors
100     ErrEmpty   = 1,     /// Empty pattern
101     ErrParent  = 2,     /// Unmatched parenthesis
102     ErrBracket = 3,     /// Unmatched bracket
103     ErrBrace   = 4,     /// Unmatched brace
104     ErrRange   = 5,     /// Bad character range
105     ErrEscape  = 6,     /// Bad escape sequence
106     ErrCount   = 7,     /// Bad counted repeat
107     ErrNoAtom  = 8,     /// No atom preceding repetition
108     ErrRepeat  = 9,     /// Repeat following repeat
109     ErrBackRef = 10,    /// Bad backward reference
110     ErrClass   = 11,    /// Bad character class
111     ErrComplex = 12,    /// Expression too complex
112     ErrMemory  = 13,    /// Out of memory
113     ErrToken   = 14,    /// Illegal token
114     ErrBehind  = 15,    /// Bad look-behind pattern
115     ErrSupport = 16     /// Unsupported
116     };
117 
118 public:
119 
120   /**
121   * Construct empty regular expression object, with the
122   * fallback program installed.
123   */
124   FXRex();
125 
126   /**
127   * Copy regular expression object  from another.
128   */
129   FXRex(const FXRex& orig);
130 
131   /// Compile expression from pattern; if error is not NULL, error code is returned
132   FXRex(const FXchar* pattern,FXint mode=Normal,Error* error=NULL);
133 
134   /// Compile expression from pattern; if error is not NULL, error code is returned
135   FXRex(const FXString& pattern,FXint mode=Normal,Error* error=NULL);
136 
137   /**
138   * See if regular expression is empty; the regular expression
139   * will be empty when it is unable to parse a pattern due to
140   * a syntax error.
141   */
empty()142   FXbool empty() const { return (code==fallback); }
143 
144   /**
145   * Parse pattern, return error code if syntax error is found.
146   * The parse-mode flags control the compile options, and affect how
147   * the generated matcher behaves.
148   * If a parse fails, an error code is returned; in this case, the
149   * expression matcher will be set up to a fallback program.
150   */
151   Error parse(const FXchar* pattern,FXint mode=Normal);
152   Error parse(const FXString& pattern,FXint mode=Normal);
153 
154   /**
155   * Perform anchored match of subject string of length len at position pos, returning true
156   * if the pattern matches at this point.
157   * If there is a match, the pattern and subpatterns are captured in the arrays beg[] and end[]
158   * which must both be at least npar entries long.
159   */
160   FXbool amatch(const FXchar* string,FXint len,FXint pos=0,FXint mode=Normal,FXint* beg=NULL,FXint* end=NULL,FXint npar=0) const;
161   FXbool amatch(const FXString& string,FXint pos=0,FXint mode=Normal,FXint* beg=NULL,FXint* end=NULL,FXint npar=0) const;
162 
163   /**
164   * Search subject string of length len for a pattern, returning the location where the pattern
165   * is found relative from the start of the string, or -1 if there is no match.
166   * In case of a successful match, the pattern and subpatterns are captured in the arrays beg[] and end[]
167   * which must be at least npar entries long.
168   * The string is searched forwards (or backwards) starting from position fm toward to, both of which
169   * must lie inside the string.
170   */
171   FXint search(const FXchar* string,FXint len,FXint fm,FXint to,FXint mode=Normal,FXint* beg=NULL,FXint* end=NULL,FXint npar=0) const;
172   FXint search(const FXString& string,FXint fm,FXint to,FXint mode=Normal,FXint* beg=NULL,FXint* end=NULL,FXint npar=0) const;
173 
174   /**
175   * After performing a regular expression match with capturing parentheses,
176   * a substitution string is build from the replace string, where where "&"
177   * is replaced by the entire matched pattern, and "\1" through "\9" are
178   * replaced by captured expressions.  The original source string and its
179   * length, and the match arrays beg and end must be passed.
180   * The replace string may also contain regular escape sequences to embed special
181   * characters.
182   */
183   static FXString substitute(const FXchar* string,FXint len,FXint* beg,FXint* end,const FXchar* replace,FXint npar=1);
184   static FXString substitute(const FXchar* string,FXint len,FXint* beg,FXint* end,const FXString& replace,FXint npar=1);
185   static FXString substitute(const FXString& string,FXint* beg,FXint* end,const FXchar* replace,FXint npar=1);
186   static FXString substitute(const FXString& string,FXint* beg,FXint* end,const FXString& replace,FXint npar=1);
187 
188   /// Returns error message text for a given error code
getError(Error err)189   static const FXchar* getError(Error err){ return errors[err]; }
190 
191   /// Assign another regular expression to this one
192   FXRex& operator=(const FXRex& orig);
193 
194   /// Comparison operators
195   FXbool operator==(const FXRex& rex) const;
196   FXbool operator!=(const FXRex& rex) const;
197 
198   /// Saving and loading
199   friend FXAPI FXStream& operator<<(FXStream& store,const FXRex& s);
200   friend FXAPI FXStream& operator>>(FXStream& store,FXRex& s);
201 
202   /**
203   * Clear the expression object and reinstate the fallback program.
204   */
205   void clear();
206 
207   /// Delete
208  ~FXRex();
209   };
210 
211 
212 // Serialization
213 extern FXAPI FXStream& operator<<(FXStream& store,const FXRex& s);
214 extern FXAPI FXStream& operator>>(FXStream& store,FXRex& s);
215 
216 }
217 
218 #endif
219