1 /********************************************************************************
2 *                                                                               *
3 *                 R e g u l a r   E x p r e s s i o n   C l a s s               *
4 *                                                                               *
5 *********************************************************************************
6 * Copyright (C) 1999,2006 by Jeroen van der Zijp.   All Rights Reserved.        *
7 *********************************************************************************
8 * This library is free software; you can redistribute it and/or                 *
9 * modify it under the terms of the GNU Lesser General Public                    *
10 * License as published by the Free Software Foundation; either                  *
11 * version 2.1 of the License, or (at your option) any later version.            *
12 *                                                                               *
13 * This library is distributed in the hope that it will be useful,               *
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of                *
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU             *
16 * Lesser General Public License for more details.                               *
17 *                                                                               *
18 * You should have received a copy of the GNU Lesser General Public              *
19 * License along with this library; if not, write to the Free Software           *
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA.    *
21 *********************************************************************************
22 * $Id: FXRex.cpp,v 1.98 2006/03/18 01:05:58 fox Exp $                           *
23 ********************************************************************************/
24 #include "xincs.h"
25 #include "fxver.h"
26 #include "fxdefs.h"
27 #include "fxascii.h"
28 #include "fxunicode.h"
29 #include "FXHash.h"
30 #include "FXStream.h"
31 #include "FXString.h"
32 #include "FXRex.h"
33 
34 
35 /*
36   The Story:
37   ==========
38 
39   As with most regex implementations, this one is inspired by Henry Spencer's
40   original implementation.
41 
42   This is however an ab-initio implementation, with the following goals:
43 
44         o Full C++ implementation, no simple C++ wrapper.
45         o Trade speed and memory in favor of speed, but keep it compact where
46           possible.
47         o Thread-safe:
48             - No global variables used during parsing or execution.
49             - Multiple threads could use same single FXRex at the same time.
50         o Perl-like syntax for character classes.
51         o Additional features such as lazy/greedy/possessive closures, counting
52           repeats, back references, trailing context.
53         o Forward and backward subject string scanning mode.
54         o Single line/multi line matching modes.
55         o 8-bit safe (you can use it to grep binary data!).
56         o No arbitrary limitations.  Patterns can be as long as you need them
57           to be, memory allowing.  This is possible due to representing the
58           progam as 32-bit integers.
59         o When parsing fails, or when created with default constructor, FXRex
60           is initialized to a "fallback" program; its thus safe to invoke match
61           at any time.
62         o The default fallback program will fail to match anything.
63         o Observe current locale if your "ctype.h" functions support it, by calling
64           all locale-sensitive functions during the match.
65         o Convenient feature: disallow empty string matches; this is nice as
66           it prevents a common problem, for example searching for "a*" in "bbba";
67           without the REX_NOT_EMPTY option, this matches "" and not the "a".
68         o Another convenient feature is the ability to compile verbatim strings.
69           This is practical as it allows FXRex to be populated with a simple
70           string with no interpretation of special characters ^*?+{}()\$.[].
71         o Note that FXRex's implementation would naturally move to wide
72           character support...
73 
74   Because this is completely new implementation of regular expressions, and
75   not merely an extension of a previous implementation, some people may want to
76   adapt it for use outside of FOX.  This is perfectly OK with me.
77 
78   However:
79 
80         o The Author is not responsible for the consequences of using this
81           software.
82 
83         o Recipient should be clearly informed of the origin of the software;
84           and if alterations have been made, this must be stated clearly.
85 
86         o Software will continue to fall under Lesser GPL, unless specific
87           permission from the Author has been obtained.
88 
89 
90   Implementation notes:
91   =====================
92 
93   This implementation does not use "nodes" with "next" pointers; instead, the "next"
94   opcode is located implicitly by simple sequentiality.  This has beneficial effect
95   on speed, as one can simply add to the program counter instead of performing a
96   memory reference.
97 
98   Sometimes one needs to jump out of sequence; this is implemented by an
99   explicit jump instruction.  Because it works with 32-bit offsets, there
100   is no need to distinguish between forward and backward jumps.
101 
102   Henry Spencer implemented a trick to "prefix" simple single-character matching
103   opcodes with a closure operator, by shifting down already generated code and
104   inserting the closure operator in front of it.
105 
106   FXRex uses the same trick of shifting down code; but this same trick is also
107   useful for branches!
108 
109   FXRex only generates a branch node when an alternative has in fact been seen;
110   if no alternative is there, we've saved ourselves both a branch node and a
111   jump node!
112 
113   This has interesting side-effects:
114 
115         o The common case of 1 single branch now no longer needs a branch opcode
116           and corresponding jump opcode at all!
117 
118         o When an alternative is found, we insert a branch node in front and a jump
119           node behind the already generated code.  This can be done easily as
120           branches and jumps within the shifted block of code are relative, and have
121           already been resolved!
122 
123         o The matching algorithm for a branch opcode simplifies as well:- either
124           it matches the first branch, or it continues after the jump.
125 
126         o Its easier to dig out some info from the program, and in fact so easy
127           that this digging can be moved to the execute phase.
128 
129   When a repeat is prefixed in front of a simple single-character match, counted
130   repeats are simplified: {1}, {1,1} is eliminated, {}, {0,} becomes *, {1,} becomes
131   +, and {0,1} becomes ?.
132 
133   Because single-character repeats are prefixed with a repeat instruction, there is
134   no recursion needed; single character repeats are therefore very fast.
135 
136   Complex repeats are implemented using branch loop constructs and may involve
137   recursions (in fact only the fixed repeat is non-recursive!).  Hence complex repeats
138   should be avoided when single-character repeats will suffice.
139 
140   OP_BRANCH and OP_BRANCHREV implement alternatives. For OP_BRANCH, first the inline
141   code immediately following the offset is executed; if the inline code fails, OP_BRANCH
142   takes a jump to the new location and tries the alternative.
143 
144   For OP_BRANCHREV, it works the opposite way: OP_BRANCHREV takes the alternative
145   first, before trying the inline code.
146 
147   Having both OP_BRANCH and OP_BRANCHREV substantially simplifies the design of
148   complex, greedy or lazy matches:- the greedy and lazy match turn out to have the
149   same code structure, except we're using OP_BRANCHREV for the lazy matches and
150   OP_BRANCH for the greedy ones.
151 
152   OP_JUMP is an unconditional jump to a new location. OP_JUMPLT and OP_JUMPGT jump
153   when the loop counter is less than or greater than the given value, respectively.
154 
155 
156   Atomic Matching Groups
157   ======================
158 
159   For example, trying to match pattern "\d+foo" against subject string "123456bar",
160   the matcher will eat all digits up till "6", and then backtrack by trying digits
161   up till 5, and so on.  A atomic subgroup match will simply fail if the sub-pattern
162   fails to match at the end.  This can be written as: "(?>\d+)foo".  Atomic groups
163   are thus more efficient since no repeated tries are being made.
164 
165 
166   Greedy, Lazy, and Possessive Matches
167   ====================================
168 
169   Greedy: the "a*" in pattern "a*ardvark" matching subject string "aardvark" will match
170   "aa", then backtrack and match "a", after which the match succeeds.
171 
172   Lazy: the "a*?" in pattern "a*?ardvark" will first match "", then try match "a" after
173   which the match succeeds.
174 
175   Possessive: the "a*+" in pattern "a*+ardvark" will match "aa", then fail without
176   backing off.
177 
178   Possessive matches and Atomic matching groups are closely related in terms of
179   controlling the recursion depth of the matcher.
180 
181 
182   Syntax:
183   =======
184 
185       Special Constructs
186 
187       (?i X )   Match sub pattern case insensitive
188       (?I X )   Match sub pattern case sensitive
189       (?n X )   Match sub pattern with newlines
190       (?N X )   Match sub pattern with no newlines
191       (?: X )   Non-capturing parentheses
192       (?= X )   Zero width positive lookahead
193       (?! X )   Zero width negative lookahead
194       (?<= X )  Zero width positive lookbehind
195       (?<! X )  Zero width negative lookbehind
196       (?> X )   Atomic grouping (possessive match)
197 
198       Logical Operators
199 
200       X Y       X followed by Y
201       X | Y     Either X or Y
202       ( X )     Sub pattern (capturing if REX_CAPTURE)
203 
204       Greedy Quantifiers
205 
206       X *       Match 0 or more
207       X +       Match 1 or more
208       X ?       Match 0 or 1
209       X {}      Match 0 or more
210       X {n}     Match n times
211       X {,m}    Match no more than m times
212       X {n,}    Match n or more
213       X {n,m}   Match at least n but no more than m times
214 
215       Lazy Quantifiers
216 
217       X *?      Match 0 or more
218       X +?      Match 1 or more
219       X ??      Match 0 or 1
220       X {}?     Match 0 or more times
221       X {n}?    Match n times
222       X {,m}?   Match no more than m times
223       X {n,}?   Match n or more
224       X {n,m}?  Match at least n but no more than m times
225 
226       Possessive (non-backtracking) Quantifiers
227 
228       X *+      Match 0 or more
229       X ++      Match 1 or more
230       X ?+      Match 0 or 1
231       X {}+     Match 0 or more times
232       X {n}+    Match n times
233       X {,m}+   Match no more than m times
234       X {n,}+   Match n or more
235       X {n,m}+  Match at least n but no more than m times
236 
237       Boundary Matching
238       ^         Match begin of line [if at begin of pattern]
239       $         Match end of line [if at end of pattern]
240       \<        Begin of word
241       \>        End of word
242       \b        Word boundary
243       \B        Word interior
244       \A        Match only beginning of string
245       \Z        Match only and end of string
246 
247       Character Classes
248 
249       [abc]     Match a, b, or c
250       [^abc]    Match any but a, b, or c
251       [a-zA-Z]  Match upper- or lower-case a through z
252       []]       Matches ]
253       [-]       Matches -
254 
255       Predefined character classes
256 
257       .         Match any character
258       \d        Digit [0-9]
259       \D        Non-digit
260       \s        Space
261       \S        Non-space
262       \w        Word character [a-zA-Z_0-9]
263       \W        Non-word character
264       \l        Letter [a-zA-Z]
265       \L        Non-letter
266       \h        Hex digit [0-9a-fA-F]
267       \H        Non-hex digit
268       \u        Single uppercase character
269       \U        Single lowercase character
270       \p        Punctuation (not including '_')
271       \P        Non punctuation
272 
273       Characters
274 
275       x           Any character
276       \\          Back slash character
277       \033        Octal
278       \x1b        Hex
279       \u1FD1      Unicode U+1FD1 (GREEK SMALL LETTER IOTA WITH MACRON))
280       \U00010450  Wide unicode U+10450 (SHAVIAN LETTER PEEP)
281       \a          Alarm, bell
282       \e          Escape character
283       \t          Tab
284       \f          Form feed
285       \n          Newline
286       \r          Return
287       \v          Vertical tab
288       \cx         Control character
289 
290       Back References
291 
292       \1        Reference to 1st capturing group
293       \2        Reference to 2nd capturing group
294       ...
295       \9        Reference to 9th capturing group
296 
297       POSIX character classes (US-ASCII only)
298 
299       \P{name}    Matches anything BUT what \p{name} matches
300 
301       \p{Lower}   A lower-case alphabetic character: [a-z]
302       \p{Upper}   An upper-case alphabetic character: [A-Z]
303       \p{ASCII}   All ASCII: [\x00-\x7F]
304       \p{Alpha}   An alphabetic character:[\p{Lower}\p{Upper}]
305       \p{Digit}   A decimal digit: [0-9]
306       \p{Alnum}   An alphanumeric character: [\p{Alpha}\p{Digit}]
307       \p{Punct}   Punctuation: One of !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
308       \p{Graph}   A visible character: [\p{Alnum}\p{Punct}]
309       \p{Print}   A printable character: [\p{Graph}]
310       \p{Blank}   A space or a tab: [ \t]
311       \p{Cntrl}   A control character: [\x00-\x1F\x7F]
312       \p{XDigit}  A hexadecimal digit: [0-9a-fA-F]
313       \p{Space}   A whitespace character: [ \t\n\x0B\f\r]
314 
315 
316       Unicode general character categories
317 
318       \p{C}     Other (Cc | Cf | Cn | Co | Cs)
319       \p{Cc}    Control
320       \p{Cf}    Format
321       \p{Cn}    Unassigned
322       \p{Co}    Private use
323       \p{Cs}    Surrogate
324 
325       \p{L}     Letter (Ll | Lm | Lo | Lt | Lu)
326       \p{Ll}    Lower case letter
327       \p{Lm}    Modifier letter
328       \p{Lo}    Other letter
329       \p{Lt}    Title case letter
330       \p{Lu}    Upper case letter
331 
332       \p{M}     Mark (Mc | Me | Mn)
333       \p{Mc}    Spacing mark
334       \p{Me }   Enclosing mark
335       \p{Mn}    Non-spacing mark
336 
337       \p{N}     Number (Nd | Nl | No)
338       \p{Nd}    Decimal number
339       \p{Nl}    Letter number
340       \p{No}    Other number
341 
342       \p{P}     Punctuation (Pc | Pd | Pe | Pf | Pi | Po | Ps)
343       \p{Pc}    Connector punctuation
344       \p{Pd}    Dash punctuation
345       \p{Pe}    Close punctuation
346       \p{Pf}    Final punctuation
347       \p{Pi}    Initial punctuation
348       \p{Po}    Other punctuation
349       \p{Ps}    Open punctuation
350 
351       \p{S}     Symbol (Sc | Sk | Sm | So)
352       \p{Sc}    Currency symbol
353       \p{Sk}    Modifier symbol
354       \p{Sm}    Mathematical symbol
355       \p{So}    Other symbol
356 
357       \p{Z}     Separator (Zl | Zp | Zs)
358       \p{Zl}    Line separator
359       \p{Zp}    Paragraph separator
360       \p{Zs}    Space separator
361 
362 
363       Unicode script categories
364 
365       \p{Arab}  Arabic
366       \p{Armn}  Armenian
367       \p{Beng}  Bengali
368       \p{Bopo}  Bopomofo
369       \p{Brai}  Braille
370       \p{Bugi}  Buginese
371       \p{Buhd}  Buhid
372       \p{Cans}  Canadian_Aboriginal
373       \p{Cher}  Cherokee
374       \p{Copt}  Coptic (Qaac)
375       \p{Cprt}  Cypriot
376       \p{Cyrl}  Cyrillic
377       \p{Deva}  Devanagari
378       \p{Dsrt}  Deseret
379       \p{Ethi}  Ethiopic
380       \p{Geor}  Georgian
381       \p{Glag}  Glagolitic
382       \p{Goth}  Gothic
383       \p{Grek}  Greek
384       \p{Gujr}  Gujarati
385       \p{Guru}  Gurmukhi
386       \p{Hang}  Hangul
387       \p{Hani}  Han
388       \p{Hano}  Hanunoo
389       \p{Hebr}  Hebrew
390       \p{Hira}  Hiragana
391       \p{Hrkt}  Katakana_Or_Hiragana
392       \p{Ital}  Old_Italic
393       \p{Kana}  Katakana
394       \p{Khar}  Kharoshthi
395       \p{Khmr}  Khmer
396       \p{Knda}  Kannada
397       \p{Laoo}  Lao
398       \p{Latn}  Latin
399       \p{Limb}  Limbu
400       \p{Linb}  Linear_B
401       \p{Mlym}  Malayalam
402       \p{Mong}  Mongolian
403       \p{Mymr}  Myanmar
404       \p{Ogam}  Ogham
405       \p{Orya}  Oriya
406       \p{Osma}  Osmanya
407       \p{Qaai}  Inherited
408       \p{Runr}  Runic
409       \p{Shaw}  Shavian
410       \p{Sinh}  Sinhala
411       \p{Sylo}  Syloti_Nagri
412       \p{Syrc}  Syriac
413       \p{Tagb}  Tagbanwa
414       \p{Tale}  Tai_Le
415       \p{Talu}  New_Tai_Lue
416       \p{Taml}  Tamil
417       \p{Telu}  Telugu
418       \p{Tfng}  Tifinagh
419       \p{Tglg}  Tagalog
420       \p{Thaa}  Thaana
421       \p{Thai}  Thai
422       \p{Tibt}  Tibetan
423       \p{Ugar}  Ugaritic
424       \p{Xpeo}  Old_Persian
425       \p{Yiii}  Yi
426       \p{Zyyy}  Common
427 
428 
429   Grammar:
430   ========
431 
432       exp        ::= branch { "|" branch }*
433       branch     ::= { piece }*
434       piece      ::= atom [ rep ]
435       rep        ::= ( "*" | "+" | "?" | counts ) [ "?" ]
436       counts     ::= "{" digits ["," [ digits] ] "}"
437       atom       ::= "(" exp ")" | "[" [^] range "]" | characters
438       range      ::= { character | character "-" character } +
439       characters ::= { character }*
440       digits     ::= { digit }*
441 
442   To do:
443   ======
444 
445   - Look into optimizing character class when possible (e.g.
446     collapse [0-9] to OP_DIGIT and [^A] into OP_NOT_CHAR).
447   - Should \D, \W, \L match newline?
448   - Look behind would be nice...
449   - Repeating back references, only possible if capturing parentheses
450     are known NOT to match "".
451   - Note the \uXXXX is going to be used for UNICODE perhaps:
452     See: http://www.unicode.org/unicode/reports/tr18.
453   - Implement possessive and atomic groups
454 */
455 
456 
457 // As close to infinity as we're going to get; this seems big
458 // enough.  We can not make it MAX_INT as this may wrap around when
459 // added to something else!
460 #define ONEINDIG 1000000
461 
462 // Number of capturing sub-expressions allowed
463 #define NSUBEXP  10
464 
465 // Size of string buffer
466 #define MAXCHARS 512
467 
468 // Set operations
469 #define EXCL(set,ch) (set[((FXuchar)(ch))>>5]&=~(1<<(((FXuchar)(ch))&31)))
470 #define INCL(set,ch) (set[((FXuchar)(ch))>>5]|=(1<<(((FXuchar)(ch))&31)))
471 #define ISIN(set,ch) (set[((FXuchar)(ch))>>5]&(1<<(((FXuchar)(ch))&31)))
472 #define CLEAR(set)   (set[0]=set[1]=set[2]=set[3]=set[4]=set[5]=set[6]=set[7]=0)
473 
474 using namespace FX;
475 
476 /*******************************************************************************/
477 
478 namespace {
479 
480 // Opcodes of the engine
481 enum {
482   OP_END           =   0,           // End of pattern reached
483   OP_FAIL          =   1,           // Always fail
484   OP_SUCCEED       =   2,           // Always succeed
485   OP_LINE_BEG      =   3,           // Beginning of line
486   OP_LINE_END      =   4,           // End of line
487   OP_WORD_BEG      =   5,           // Beginning of word
488   OP_WORD_END      =   6,           // End of word
489   OP_WORD_BND      =   7,           // Word boundary
490   OP_WORD_INT      =   8,           // Word interior
491   OP_STR_BEG       =   9,           // Beginning of string
492   OP_STR_END       =  10,           // End of string
493   OP_ANY_OF        =  11,           // Any character in set
494   OP_ANY_BUT       =  12,           // Any character not in set
495   OP_ANY           =  13,           // Any character but no newline
496   OP_ANY_NL        =  14,           // Any character including newline
497   OP_SPACE         =  15,           // White space
498   OP_SPACE_NL      =  16,           // White space including newline
499   OP_NOT_SPACE     =  17,           // Non-white space
500   OP_DIGIT         =  18,           // Digit
501   OP_NOT_DIGIT     =  19,           // Non-digit
502   OP_NOT_DIGIT_NL  =  20,           // Non-digit including newline
503   OP_LETTER        =  21,           // Letter
504   OP_NOT_LETTER    =  22,           // Non-letter
505   OP_NOT_LETTER_NL =  23,           // Non-letter including newline
506   OP_WORD          =  24,           // Word character
507   OP_NOT_WORD      =  25,           // Non-word character
508   OP_NOT_WORD_NL   =  26,           // Non-word character including newline
509   OP_HEX           =  27,           // Hex digit
510   OP_NOT_HEX       =  28,           // Non hex digit
511   OP_NOT_HEX_NL    =  29,           // Non hex digit including newline
512   OP_PUNCT         =  30,           // Punctuation
513   OP_NOT_PUNCT     =  31,           // Non punctuation
514   OP_NOT_PUNCT_NL  =  32,           // Non punctuation including newline
515   OP_CHARS         =  33,           // Match literal string
516   OP_CHARS_CI      =  34,           // Match literal string, case insensitive
517   OP_CHAR          =  35,           // Single character
518   OP_CHAR_CI       =  36,           // Single character, case insensitive
519   OP_JUMP          =  37,           // Jump to another location
520   OP_BRANCH        =  38,           // Branch: jump after trying following code
521   OP_BRANCHREV     =  39,           // Branch: jump before trying following code
522   OP_STAR          =  40,           // Greedy * (simple)
523   OP_MIN_STAR      =  41,           // Lazy * (simple)
524   OP_POS_STAR      =  42,           // Possessive * (simple)
525   OP_PLUS          =  43,           // Greedy + (simple)
526   OP_MIN_PLUS      =  44,           // Lazy + (simple)
527   OP_POS_PLUS      =  45,           // Possessive + (simple)
528   OP_QUEST         =  46,           // Greedy ? (simple)
529   OP_MIN_QUEST     =  47,           // Lazy ? (simple)
530   OP_POS_QUEST     =  48,           // Possessive ? (simple)
531   OP_REP           =  49,           // Greedy counted repeat (simple)
532   OP_MIN_REP       =  50,           // Lazy counted repeat (simple)
533   OP_POS_REP       =  51,           // Possessive counted repeat (simple)
534   OP_LOOK_NEG      =  52,           // Negative look ahead
535   OP_LOOK_POS      =  53,           // Positive look ahead
536   OP_UPPER         =  54,           // Match upper case
537   OP_LOWER         =  55,           // Match lower case
538   OP_SUB_BEG       =  56,           // Start of substring i
539   OP_SUB_END       =  66,           // End of substring i
540   OP_REF           =  76,           // Back reference to substring i
541   OP_REF_CI        =  86,           // Match substring i case insensitive
542   OP_ZERO          =  96,           // Zero count i
543   OP_INCR          = 106,           // Increment count i
544   OP_JUMPLT        = 116,           // Jump if count i less than value
545   OP_JUMPGT        = 126            // JUmp if count i greater than value
546   };
547 
548 
549 // Flags
550 enum {
551   FLG_WORST  = 0,           // Worst case
552   FLG_WIDTH  = 1,           // Matches >=1 character
553   FLG_SIMPLE = 2            // Simple
554   };
555 
556 
557 // Structure used during matching
558 struct FXExecute {
559   const FXchar  *str;               // String
560   const FXchar  *str_beg;           // Begin of string
561   const FXchar  *str_end;           // End of string
562   FXint         *sub_beg;           // Begin of substring i
563   FXint         *sub_end;           // End of substring i
564   const FXint   *code;              // Program code
565   FXint          npar;              // Number of capturing parentheses
566   FXint          count[10];         // Counters for counted repeats
567   FXint          mode;              // Match mode
568 
569   // Attempt to match
570   bool attempt(const FXchar* string);
571 
572   // Match at current string position
573   bool match(const FXint* prog);
574 
575   // Execute
576   bool execute(const FXchar* fm,const FXchar* to);
577   };
578 
579 
580 // Structure used during compiling
581 struct FXCompile {
582   const FXchar  *pat;               // Pattern string pointer
583   FXint         *code;              // Program code
584   FXint         *pc;                // Program counter
585   FXint          mode;              // Compile mode
586   FXint          nbra;              // Number of counting braces
587   FXint          npar;              // Number of capturing parentheses
588 
589   // Code generation
590   FXint* append(FXint op);
591   FXint* append(FXint op,FXint arg);
592   FXint* append(FXint op,FXint arg1,FXint arg2);
593   FXint* append(FXint op,FXint set[]);
594   FXint* append(FXint op,FXint len,FXint *data);
595   FXint* insert(FXint *ptr,FXint op);
596   FXint* insert(FXint *ptr,FXint op,FXint arg);
597   FXint* insert(FXint *ptr,FXint op,FXint arg1,FXint arg2);
598 
599   // Patch branches
600   void patch(FXint *fm,FXint *to);
601 
602   // Parsing
603   FXRexError compile(FXint& flags);
604   FXRexError expression(FXint& flags);
605   FXRexError verbatim(FXint& flags);
606   FXRexError alternative(FXint& flags);
607   FXRexError piece(FXint& flags);
608   FXRexError atom(FXint& flags);
609   FXRexError charset();
610   };
611 
612 
613 /*******************************************************************************/
614 
615 #ifndef NDEBUG
616 
617 // Dump program
dump(FXint * prog)618 void dump(FXint *prog){
619   FXint op,min,max,no,val;
620   fxmessage("\n");
621   fxmessage("Program:\n");
622   fxmessage("%-10p SIZE %d\n",prog,*prog);
623   prog++;
624   while(1){
625     fxmessage("%-10p ",prog);
626     op=*prog++;
627     switch(op){
628       case OP_END:
629         fxmessage("OP_END\n");
630         goto x;
631       case OP_FAIL:
632         fxmessage("OP_FAIL\n");
633         break;
634       case OP_SUCCEED:
635         fxmessage("OP_SUCCEED\n");
636         break;
637       case OP_LINE_BEG:
638         fxmessage("OP_LINE_BEG\n");
639         break;
640       case OP_LINE_END:
641         fxmessage("OP_LINE_END\n");
642         break;
643       case OP_WORD_BEG:
644         fxmessage("OP_WORD_BEG\n");
645         break;
646       case OP_WORD_END:
647         fxmessage("OP_WORD_END\n");
648         break;
649       case OP_WORD_BND:
650         fxmessage("OP_WORD_BND\n");
651         break;
652       case OP_WORD_INT:
653         fxmessage("OP_WORD_INT\n");
654         break;
655       case OP_STR_BEG:
656         fxmessage("OP_STR_BEG\n");
657         break;
658       case OP_STR_END:
659         fxmessage("OP_STR_END\n");
660         break;
661       case OP_ANY_OF:
662         fxmessage("OP_ANY_OF   \"");
663         for(no=0; no<256; no++){
664           if(ISIN(prog,no)){
665             if(' '<=no){ fxmessage("%c",no); } else { fxmessage("\\x%02x",no); }
666             }
667           }
668         fxmessage("\"\n");
669         prog+=8;
670         break;
671       case OP_ANY_BUT:
672         fxmessage("OP_ANY_BUT  \"");
673         for(no=0; no<256; no++){
674           if(ISIN(prog,no)){
675             if(' '<=no){ fxmessage("%c",no); } else { fxmessage("\\x%02x",no); }
676             }
677           }
678         fxmessage("\"\n");
679         prog+=8;
680         break;
681       case OP_ANY:
682         fxmessage("OP_ANY\n");
683         break;
684       case OP_ANY_NL:
685         fxmessage("OP_ANY_NL\n");
686         break;
687       case OP_SPACE:
688         fxmessage("OP_SPACE\n");
689         break;
690       case OP_SPACE_NL:
691         fxmessage("OP_SPACE_NL\n");
692         break;
693       case OP_NOT_SPACE:
694         fxmessage("OP_NOT_SPACE\n");
695         break;
696       case OP_DIGIT:
697         fxmessage("OP_DIGIT\n");
698         break;
699       case OP_NOT_DIGIT:
700         fxmessage("OP_NOT_DIGIT\n");
701         break;
702       case OP_NOT_DIGIT_NL:
703         fxmessage("OP_NOT_DIGIT_NL\n");
704         break;
705       case OP_LETTER:
706         fxmessage("OP_LETTER\n");
707         break;
708       case OP_NOT_LETTER:
709         fxmessage("OP_NOT_LETTER\n");
710         break;
711       case OP_NOT_LETTER_NL:
712         fxmessage("OP_NOT_LETTER_NL\n");
713         break;
714       case OP_WORD:
715         fxmessage("OP_WORD\n");
716         break;
717       case OP_NOT_WORD:
718         fxmessage("OP_NOT_WORD\n");
719         break;
720       case OP_NOT_WORD_NL:
721         fxmessage("OP_NOT_WORD_NL\n");
722         break;
723       case OP_HEX:
724         fxmessage("OP_HEX\n");
725         break;
726       case OP_NOT_HEX:
727         fxmessage("OP_NOT_HEX\n");
728         break;
729       case OP_NOT_HEX_NL:
730         fxmessage("OP_NOT_HEX_NL\n");
731         break;
732       case OP_PUNCT:
733         fxmessage("OP_PUNCT\n");
734         break;
735       case OP_NOT_PUNCT:
736         fxmessage("OP_NOT_PUNCT\n");
737         break;
738       case OP_NOT_PUNCT_NL:
739         fxmessage("OP_NOT_PUNCT_NL\n");
740         break;
741       case OP_UPPER:
742         fxmessage("OP_UPPER\n");
743         break;
744       case OP_LOWER:
745         fxmessage("OP_LOWER\n");
746         break;
747       case OP_CHARS:
748         no=*prog++;
749         fxmessage("OP_CHARS     %d,\"",no);
750         while(no>0){
751           if(' '<=*prog){ fxmessage("%c",*prog); } else { fxmessage("\\x%02x",*prog); }
752           prog++;
753           no--;
754           }
755         fxmessage("\"\n");
756         break;
757       case OP_CHARS_CI:
758         no=*prog++;
759         fxmessage("OP_CHARS_CI  %d,\"",no);
760         while(no>0){
761           if(' '<=*prog){ fxmessage("%c",*prog); } else { fxmessage("\\x%02x",*prog); }
762           prog++;
763           no--;
764           }
765         fxmessage("\"\n");
766         break;
767       case OP_CHAR:
768         fxmessage("OP_CHAR      \"");
769         if(' '<=*prog){ fxmessage("%c",*prog); } else { fxmessage("\\x%02x",*prog); }
770         fxmessage("\"\n");
771         prog++;
772         break;
773       case OP_CHAR_CI:
774         fxmessage("OP_CHAR_CI   \"");
775         if(' '<=*prog){ fxmessage("%c",*prog); } else { fxmessage("\\x%02x",*prog); }
776         fxmessage("\"\n");
777         prog++;
778         break;
779       case OP_JUMP:
780         fxmessage("OP_JUMP      %-10p\n",*prog ? prog+*prog : 0);
781         prog++;
782         break;
783       case OP_BRANCH:
784         fxmessage("OP_BRANCH    %-10p\n",*prog ? prog+*prog : 0);
785         prog++;
786         break;
787       case OP_BRANCHREV:
788         fxmessage("OP_BRANCHREV %-10p\n",*prog ? prog+*prog : 0);
789         prog++;
790         break;
791       case OP_STAR:
792         fxmessage("OP_STAR\n");
793         break;
794       case OP_MIN_STAR:
795         fxmessage("OP_MIN_STAR\n");
796         break;
797       case OP_POS_STAR:
798         fxmessage("OP_POS_STAR\n");
799         break;
800       case OP_PLUS:
801         fxmessage("OP_PLUS\n");
802         break;
803       case OP_MIN_PLUS:
804         fxmessage("OP_MIN_PLUS\n");
805         break;
806       case OP_POS_PLUS:
807         fxmessage("OP_POS_PLUS\n");
808         break;
809       case OP_QUEST:
810         fxmessage("OP_QUEST\n");
811         break;
812       case OP_MIN_QUEST:
813         fxmessage("OP_MIN_QUEST\n");
814         break;
815       case OP_POS_QUEST:
816         fxmessage("OP_POS_QUEST\n");
817         break;
818       case OP_REP:
819         min=*prog++;
820         max=*prog++;
821         fxmessage("OP_REP       {%d,%d}\n",min,max);
822         break;
823       case OP_MIN_REP:
824         min=*prog++;
825         max=*prog++;
826         fxmessage("OP_MIN_REP   {%d,%d}\n",min,max);
827         break;
828       case OP_POS_REP:
829         min=*prog++;
830         max=*prog++;
831         fxmessage("OP_POS_REP   {%d,%d}\n",min,max);
832         break;
833       case OP_LOOK_NEG:
834         fxmessage("OP_LOOK_NEG  %-10p\n",*prog ? prog+*prog : 0);
835         prog++;
836         break;
837       case OP_LOOK_POS:
838         fxmessage("OP_LOOK_POS  %-10p\n",*prog ? prog+*prog : 0);
839         prog++;
840         break;
841       case OP_SUB_BEG+0:
842       case OP_SUB_BEG+1:
843       case OP_SUB_BEG+2:
844       case OP_SUB_BEG+3:
845       case OP_SUB_BEG+4:
846       case OP_SUB_BEG+5:
847       case OP_SUB_BEG+6:
848       case OP_SUB_BEG+7:
849       case OP_SUB_BEG+8:
850       case OP_SUB_BEG+9:
851         fxmessage("OP_SUB_BEG%d\n",op-OP_SUB_BEG);
852         break;
853       case OP_SUB_END+0:
854       case OP_SUB_END+1:
855       case OP_SUB_END+2:
856       case OP_SUB_END+3:
857       case OP_SUB_END+4:
858       case OP_SUB_END+5:
859       case OP_SUB_END+6:
860       case OP_SUB_END+7:
861       case OP_SUB_END+8:
862       case OP_SUB_END+9:
863         fxmessage("OP_SUB_END%d\n",op-OP_SUB_END);
864         break;
865       case OP_REF+0:
866       case OP_REF+1:
867       case OP_REF+2:
868       case OP_REF+3:
869       case OP_REF+4:
870       case OP_REF+5:
871       case OP_REF+6:
872       case OP_REF+7:
873       case OP_REF+8:
874       case OP_REF+9:
875         fxmessage("OP_REF%d\n",op-OP_REF);
876         break;
877       case OP_REF_CI+0:
878       case OP_REF_CI+1:
879       case OP_REF_CI+2:
880       case OP_REF_CI+3:
881       case OP_REF_CI+4:
882       case OP_REF_CI+5:
883       case OP_REF_CI+6:
884       case OP_REF_CI+7:
885       case OP_REF_CI+8:
886       case OP_REF_CI+9:
887         fxmessage("OP_REF_CI%d\n",op-OP_REF_CI);
888         break;
889       case OP_ZERO+0:
890       case OP_ZERO+1:
891       case OP_ZERO+2:
892       case OP_ZERO+3:
893       case OP_ZERO+4:
894       case OP_ZERO+5:
895       case OP_ZERO+6:
896       case OP_ZERO+7:
897       case OP_ZERO+8:
898       case OP_ZERO+9:
899         fxmessage("OP_ZERO%d\n",op-OP_ZERO);
900         break;
901       case OP_INCR+0:
902       case OP_INCR+1:
903       case OP_INCR+2:
904       case OP_INCR+3:
905       case OP_INCR+4:
906       case OP_INCR+5:
907       case OP_INCR+6:
908       case OP_INCR+7:
909       case OP_INCR+8:
910       case OP_INCR+9:
911         fxmessage("OP_INCR%d\n",op-OP_INCR);
912         break;
913       case OP_JUMPLT+0:
914       case OP_JUMPLT+1:
915       case OP_JUMPLT+2:
916       case OP_JUMPLT+3:
917       case OP_JUMPLT+4:
918       case OP_JUMPLT+5:
919       case OP_JUMPLT+6:
920       case OP_JUMPLT+7:
921       case OP_JUMPLT+8:
922       case OP_JUMPLT+9:
923         val=*prog++;
924         fxmessage("OP_JUMPLT%d   %d,%-10p\n",op-OP_JUMPLT,val,*prog ? prog+*prog : 0);
925         prog++;
926         break;
927       case OP_JUMPGT+0:
928       case OP_JUMPGT+1:
929       case OP_JUMPGT+2:
930       case OP_JUMPGT+3:
931       case OP_JUMPGT+4:
932       case OP_JUMPGT+5:
933       case OP_JUMPGT+6:
934       case OP_JUMPGT+7:
935       case OP_JUMPGT+8:
936       case OP_JUMPGT+9:
937         val=*prog++;
938         fxmessage("OP_JUMPGT%d   %d,%-10p\n",op-OP_JUMPGT,val,*prog ? prog+*prog : 0);
939         prog++;
940         break;
941       default:
942         fxmessage("OP_%d: error\n",op);
943         goto x;
944       }
945     }
946 x:fxmessage("end\n");
947   }
948 
949 #endif
950 
951 /*******************************************************************************/
952 
953 // FXCompile members
954 
955 // Parse hex escape code
hex(const FXchar * & pat)956 FXint hex(const FXchar*& pat){
957   register FXint ch,n;
958   for(ch=0,n=2; Ascii::isHexDigit(*pat) && n; n--){
959     ch=(ch<<4)+Ascii::digitValue(*pat++);
960     }
961   return ch;
962   }
963 
964 
965 // Parse octal escape code
oct(const FXchar * & pat)966 FXint oct(const FXchar*& pat){
967   register FXint ch,n;
968   for(ch=0,n=3; '0'<=*pat && *pat<='7' && n; n--){
969     ch=(ch<<3)+(*pat++-'0');
970     }
971   return ch;
972   }
973 
974 
975 // Compiler main
compile(FXint & flags)976 FXRexError FXCompile::compile(FXint& flags){
977   FXRexError err;
978   if(*pat=='\0') return REGERR_EMPTY;
979   if(mode&REX_VERBATIM)
980     err=verbatim(flags);
981   else
982     err=expression(flags);
983   if(err!=REGERR_OK) return err;
984   if(*pat!='\0') return REGERR_PAREN;
985   append(OP_END);
986   return REGERR_OK;
987   }
988 
989 
990 // Parse without interpretation of magic characters
verbatim(FXint & flags)991 FXRexError FXCompile::verbatim(FXint& flags){
992   FXint buf[MAXCHARS],ch,len;
993   flags=FLG_WIDTH;
994   while(*pat!='\0'){
995     len=0;
996     do{
997       ch=*pat++;
998       if(mode&REX_ICASE) ch=Ascii::toLower(ch);
999       buf[len++]=(FXuchar)ch;
1000       }
1001     while(*pat!='\0' && len<MAXCHARS);
1002     if(len==1){
1003       flags|=FLG_SIMPLE;
1004       append((mode&REX_ICASE)?OP_CHAR_CI:OP_CHAR,buf[0]);
1005       }
1006     else{
1007       append((mode&REX_ICASE)?OP_CHARS_CI:OP_CHARS,len,buf);
1008       }
1009     }
1010   return REGERR_OK;
1011   }
1012 
1013 
1014 // Parse expression
expression(FXint & flags)1015 FXRexError FXCompile::expression(FXint& flags){
1016   FXRexError err;
1017   FXint *at,*jp,flg;
1018   flags=FLG_WIDTH;
1019   at=pc;
1020   jp=NULL;
1021   err=alternative(flg);
1022   if(err!=REGERR_OK) return err;
1023   if(!(flg&FLG_WIDTH)) flags&=~FLG_WIDTH;
1024   while(*pat=='|'){
1025     pat++;
1026     insert(at,OP_BRANCH,pc-at+3);
1027     append(OP_JUMP,jp?jp-pc-1:0);
1028     jp=pc-1;
1029     at=pc;
1030     err=alternative(flg);
1031     if(err!=REGERR_OK) return err;
1032     if(!(flg&FLG_WIDTH)) flags&=~FLG_WIDTH;
1033     }
1034   patch(jp,pc);
1035   return REGERR_OK;
1036   }
1037 
1038 
1039 // Parse branch
alternative(FXint & flags)1040 FXRexError FXCompile::alternative(FXint& flags){
1041   FXRexError err;
1042   FXint flg;
1043   flags=FLG_WORST;
1044   while(*pat!='\0' && *pat!='|' && *pat!=')'){
1045     err=piece(flg);
1046     if(err!=REGERR_OK) return err;
1047     flags|=flg;
1048     }
1049   return REGERR_OK;
1050   }
1051 
1052 
1053 // Parse piece
piece(FXint & flags)1054 FXRexError FXCompile::piece(FXint& flags){
1055   FXint ch,rep_min,rep_max,lazy,flg,*ptr;
1056   FXRexError err;
1057   ptr=pc;
1058 
1059   // Process atom
1060   err=atom(flg);
1061 
1062   // Error in atom
1063   if(err!=REGERR_OK) return err;
1064 
1065   // Followed by repetition
1066   if((ch=*pat)=='*' || ch=='+' || ch=='?' || ch=='{'){
1067 
1068     // Repeats may not match empty
1069     if(!(flg&FLG_WIDTH)) return REGERR_NOATOM;
1070 
1071     pat++;
1072     rep_min=1;
1073     rep_max=1;
1074 
1075     // Handle repetition type
1076     switch(ch){
1077       case '*':                                           // Repeat 0-INF
1078         rep_min=0;
1079         rep_max=ONEINDIG;
1080         break;
1081       case '+':                                           // Repeat 1-INF
1082         rep_min=1;
1083         rep_max=ONEINDIG;
1084         break;
1085       case '?':                                           // Repeat 0-1
1086         rep_min=0;
1087         rep_max=1;
1088         break;
1089       case '{':                                           // Repeat n-m
1090         rep_min=0;
1091         rep_max=ONEINDIG;
1092         if(*pat!='}'){
1093           while(Ascii::isDigit(*pat)){
1094             rep_min=10*rep_min+(*pat-'0');
1095             pat++;
1096             }
1097           rep_max=rep_min;
1098           if(*pat==','){
1099             pat++;
1100             rep_max=ONEINDIG;
1101             if(*pat!='}'){
1102               rep_max=0;
1103               while(Ascii::isDigit(*pat)){
1104                 rep_max=10*rep_max+(*pat-'0');
1105                 pat++;
1106                 }
1107               }
1108             }
1109           if(rep_min>rep_max) return REGERR_RANGE;              // Illegal range
1110           if(rep_min==0 && rep_max==0) return REGERR_COUNT;     // Bad count
1111           }
1112         if(*pat!='}') return REGERR_BRACE;                      // Unmatched brace
1113         pat++;
1114         break;
1115       default:
1116         return REGERR_TOKEN;
1117       }
1118 
1119     // Handle greedy, lazy, or possessive forms
1120     if(*pat=='?'){      // Lazy
1121       lazy=1; pat++;
1122       }
1123     else if(*pat=='+'){ // Possessive
1124       lazy=2; pat++;
1125       }
1126     else{               // Greedy
1127       lazy=0;
1128       }
1129 
1130     // If zero repetitions are allowed, then may have no width
1131     if(rep_min==0) flg&=~FLG_WIDTH;
1132 
1133     // Handle only non-trivial cases
1134     if(!(rep_min==1 && rep_max==1)){
1135 
1136       // For simple repeats we prefix the last operation
1137       if(flg&FLG_SIMPLE){
1138         if(rep_min==0 && rep_max==ONEINDIG){
1139           insert(ptr,OP_STAR+lazy);
1140           }
1141         else if(rep_min==1 && rep_max==ONEINDIG){
1142           insert(ptr,OP_PLUS+lazy);
1143           }
1144         else if(rep_min==0 && rep_max==1){
1145           insert(ptr,OP_QUEST+lazy);
1146           }
1147         else{
1148           insert(ptr,OP_REP+lazy,rep_min,rep_max);
1149           }
1150         }
1151 
1152       // For complex repeats we build loop constructs
1153       else{
1154         FXASSERT(lazy!=2);                              // FIXME not yet implemented
1155         if(rep_min==0 && rep_max==ONEINDIG){
1156           /*    ________
1157           **   |        \
1158           ** --B--(...)--J--+--                 (...){0,ONEINDIG}
1159           **    \___________|
1160           */
1161           insert(ptr,lazy?OP_BRANCHREV:OP_BRANCH,pc-ptr+3);
1162           append(OP_JUMP,ptr-pc-1);
1163           }
1164         else if(rep_min==1 && rep_max==ONEINDIG){
1165           /*    ________
1166           **   |        \
1167           ** --+--(...)--B--                    (...){1,ONEINDIG}
1168           **
1169           */
1170           append(lazy?OP_BRANCH:OP_BRANCHREV,ptr-pc-1);
1171           }
1172         else if(rep_min==0 && rep_max==1){
1173           /*
1174           **
1175           ** --B--(...)--+--                    (...){0,1}
1176           **    \________|
1177           */
1178           insert(ptr,lazy?OP_BRANCHREV:OP_BRANCH,pc-ptr+1);
1179           }
1180         else if(0<rep_min && rep_min==rep_max){
1181           /*       ___________
1182           **      |           \
1183           ** --Z--+--(...)--I--L--              (...){n,n}
1184           **
1185           */
1186           if(nbra>=NSUBEXP) return REGERR_COMPLEX;
1187           insert(ptr,OP_ZERO+nbra);
1188           append(OP_INCR+nbra);
1189           append(OP_JUMPLT+nbra,rep_min,ptr-pc-1);
1190           nbra++;
1191           }
1192         else if(rep_min==0 && rep_max<ONEINDIG){
1193           /*       ___________
1194           **      |           \
1195           ** --Z--B--(...)--I--L--+--           (...){0,n}
1196           **       \______________|
1197           */
1198           if(nbra>=NSUBEXP) return REGERR_COMPLEX;
1199           insert(ptr,OP_ZERO+nbra);
1200           insert(ptr+1,lazy?OP_BRANCHREV:OP_BRANCH,pc-ptr+4);
1201           append(OP_INCR+nbra);
1202           append(OP_JUMPLT+nbra,rep_max,ptr-pc-1);
1203           nbra++;
1204           }
1205         else if(0<rep_min && rep_max==ONEINDIG){
1206           /*       ________________
1207           **      |   ___________  \
1208           **      |  |           \  \
1209           ** --Z--+--+--(...)--I--L--B--        (...){n,ONEINDIG}
1210           */
1211           if(nbra>=NSUBEXP) return REGERR_COMPLEX;
1212           insert(ptr,OP_ZERO+nbra);
1213           append(OP_INCR+nbra);
1214           append(OP_JUMPLT+nbra,rep_min,ptr-pc-1);
1215           append(lazy?OP_BRANCH:OP_BRANCHREV,ptr-pc);
1216           nbra++;
1217           }
1218         else{
1219           /*       ___________________
1220           **      |   ___________     \
1221           **      |  |           \     \
1222           ** --Z--+--+--(...)--I--L--G--B--+--  (...){n,m}
1223           **                          \____|
1224           */
1225           if(nbra>=NSUBEXP) return REGERR_COMPLEX;
1226           insert(ptr,OP_ZERO+nbra);
1227           append(OP_INCR+nbra);
1228           append(OP_JUMPLT+nbra,rep_min,ptr-pc-1);
1229           append(OP_JUMPGT+nbra,rep_max,3);
1230           append(lazy?OP_BRANCH:OP_BRANCHREV,ptr-pc);
1231           nbra++;
1232           }
1233         }
1234       }
1235     }
1236   flags=flg&FLG_WIDTH;
1237   return REGERR_OK;
1238   }
1239 
1240 
1241 // Parse atom
atom(FXint & flags)1242 FXRexError FXCompile::atom(FXint& flags){
1243   FXint buf[MAXCHARS],level,save,ch,len,flg,*ptr;
1244   const FXchar *p;
1245   FXRexError err;
1246   flags=FLG_WORST;                                // Assume the worst
1247   switch(*pat){
1248     case '(':                                     // Subexpression grouping
1249       pat++;
1250       if(*pat=='?'){
1251         pat++;
1252         ch=*pat++;
1253         if(ch==':'){                              // Non capturing parentheses
1254           err=expression(flg);
1255           if(err!=REGERR_OK) return err;          // Propagate error
1256           }
1257         else if(ch=='=' || ch=='!'){              // Positive or negative look ahead
1258           append((ch=='=')?OP_LOOK_POS:OP_LOOK_NEG);
1259           ptr=append(0);
1260           err=expression(flg);
1261           if(err!=REGERR_OK) return err;          // Propagate error
1262           append(OP_SUCCEED);
1263           patch(ptr,pc);                          // If trailing context matches (fails), go here!
1264           flg=FLG_WORST;                          // Look ahead has no width!
1265           }
1266         else if(ch=='>'){                         // Atomic group
1267           // FIXME
1268           }
1269         else if(ch=='i' || ch=='I' || ch=='n' || ch=='N'){
1270           save=mode;                              // Save flags
1271           if(ch=='i') mode|=REX_ICASE;
1272           if(ch=='I') mode&=~REX_ICASE;
1273           if(ch=='n') mode|=REX_NEWLINE;
1274           if(ch=='N') mode&=~REX_NEWLINE;
1275           err=expression(flg);
1276           if(err!=REGERR_OK) return err;          // Propagate error
1277           mode=save;                              // Restore flags
1278           }
1279         else{
1280           return REGERR_TOKEN;
1281           }
1282         }
1283       else if(mode&REX_CAPTURE){                  // Capturing
1284         level=++npar;
1285         if(level>=NSUBEXP) return REGERR_COMPLEX; // Expression too complex
1286         append(OP_SUB_BEG+level);
1287         err=expression(flg);
1288         if(err!=REGERR_OK) return err;            // Propagate error
1289         append(OP_SUB_END+level);
1290         }
1291       else{                                       // Normal
1292         err=expression(flg);
1293         if(err!=REGERR_OK) return err;            // Propagate error
1294         }
1295       if(*pat!=')') return REGERR_PAREN;          // Unmatched parenthesis
1296       pat++;
1297       flags=flg&~FLG_SIMPLE;
1298       break;
1299     case '.':                                     // Any character
1300       pat++;
1301       append((mode&REX_NEWLINE)?OP_ANY_NL:OP_ANY);
1302       flags=FLG_WIDTH|FLG_SIMPLE;
1303       break;
1304     case '^':                                     // Begin of line
1305       pat++;
1306       append(OP_LINE_BEG);
1307       break;
1308     case '$':                                     // End of line
1309       pat++;
1310       append(OP_LINE_END);
1311       break;
1312     case '*':                                     // No preceding atom
1313     case '+':
1314     case '?':
1315     case '{':
1316       return REGERR_NOATOM;
1317     case '\0':                                    // Technically, this can not happen!
1318     case '|':
1319     case ')':
1320       return REGERR_NOATOM;
1321     case '}':                                     // Unmatched brace
1322       return REGERR_BRACE;
1323     case '[':
1324       pat++;
1325       err=charset();
1326       if(err!=REGERR_OK) return err;              // Bad character class
1327       if(*pat!=']') return REGERR_BRACK;          // Unmatched bracket
1328       pat++;
1329       flags=FLG_WIDTH|FLG_SIMPLE;
1330       break;
1331     case ']':                                     // Unmatched bracket
1332       return REGERR_BRACK;
1333     case '\\':                                    // Escape sequences which are NOT part of simple character-run
1334       ch=*(pat+1);
1335       switch(ch){
1336         case '\0':                                // Unexpected pattern end
1337           return REGERR_NOATOM;
1338         case 'w':                                 // Word character
1339           append(OP_WORD);
1340           pat+=2;
1341           flags=FLG_WIDTH|FLG_SIMPLE;
1342           return REGERR_OK;
1343         case 'W':                                 // Non-word character
1344           append((mode&REX_NEWLINE)?OP_NOT_WORD_NL:OP_NOT_WORD);
1345           pat+=2;
1346           flags=FLG_WIDTH|FLG_SIMPLE;
1347           return REGERR_OK;
1348         case 's':                                 // Space
1349           append((mode&REX_NEWLINE)?OP_SPACE_NL:OP_SPACE);
1350           pat+=2;
1351           flags=FLG_WIDTH|FLG_SIMPLE;
1352           return REGERR_OK;
1353         case 'S':                                 // Non-space
1354           append(OP_NOT_SPACE);
1355           pat+=2;
1356           flags=FLG_WIDTH|FLG_SIMPLE;
1357           return REGERR_OK;
1358         case 'd':                                 // Digit
1359           append(OP_DIGIT);
1360           pat+=2;
1361           flags=FLG_WIDTH|FLG_SIMPLE;
1362           return REGERR_OK;
1363         case 'D':                                 // Non-digit
1364           append((mode&REX_NEWLINE)?OP_NOT_DIGIT_NL:OP_NOT_DIGIT);
1365           pat+=2;
1366           flags=FLG_WIDTH|FLG_SIMPLE;
1367           return REGERR_OK;
1368         case 'h':                                 // Hex digit
1369           append(OP_HEX);
1370           pat+=2;
1371           flags=FLG_WIDTH|FLG_SIMPLE;
1372           return REGERR_OK;
1373         case 'H':                                 // Non-hex digit
1374           append((mode&REX_NEWLINE)?OP_NOT_HEX_NL:OP_NOT_HEX);
1375           pat+=2;
1376           flags=FLG_WIDTH|FLG_SIMPLE;
1377           return REGERR_OK;
1378         case 'p':                                 // Punctuation
1379           append(OP_PUNCT);
1380           pat+=2;
1381           flags=FLG_WIDTH|FLG_SIMPLE;
1382           return REGERR_OK;
1383         case 'P':                                 // Non-punctuation
1384           append((mode&REX_NEWLINE)?OP_NOT_PUNCT_NL:OP_NOT_PUNCT);
1385           pat+=2;
1386           flags=FLG_WIDTH|FLG_SIMPLE;
1387           return REGERR_OK;
1388         case 'l':                                 // Letter
1389           append(OP_LETTER);
1390           pat+=2;
1391           flags=FLG_WIDTH|FLG_SIMPLE;
1392           return REGERR_OK;
1393         case 'L':                                 // Non-letter
1394           append((mode&REX_NEWLINE)?OP_NOT_LETTER_NL:OP_NOT_LETTER);
1395           pat+=2;
1396           flags=FLG_WIDTH|FLG_SIMPLE;
1397           return REGERR_OK;
1398         case 'u':                                 // Upper case
1399           append(OP_UPPER);
1400           pat+=2;
1401           flags=FLG_WIDTH|FLG_SIMPLE;
1402           return REGERR_OK;
1403         case 'U':                                 // Lower case
1404           append(OP_LOWER);
1405           pat+=2;
1406           flags=FLG_WIDTH|FLG_SIMPLE;
1407           return REGERR_OK;
1408         case 'b':                                 // Word boundary
1409           append(OP_WORD_BND);
1410           pat+=2;
1411           return REGERR_OK;
1412         case 'B':                                 // Word interior
1413           append(OP_WORD_INT);
1414           pat+=2;
1415           return REGERR_OK;
1416         case 'A':                                 // Match only beginning of string
1417           append(OP_STR_BEG);
1418           pat+=2;
1419           return REGERR_OK;
1420         case 'Z':                                 // Match only and end of string
1421           append(OP_STR_END);
1422           pat+=2;
1423           return REGERR_OK;
1424         case '<':                                 // Begin of word
1425           append(OP_WORD_BEG);
1426           pat+=2;
1427           return REGERR_OK;
1428         case '>':                                 // End of word
1429           append(OP_WORD_END);
1430           pat+=2;
1431           return REGERR_OK;
1432         case '1':                                 // Back reference to previously matched subexpression
1433         case '2':
1434         case '3':
1435         case '4':
1436         case '5':
1437         case '6':
1438         case '7':
1439         case '8':
1440         case '9':
1441           if(!(mode&REX_CAPTURE)) return REGERR_BACKREF;  // Can't do backreferences
1442           level=ch-'0';
1443           if(level>npar) return REGERR_BACKREF;           // Back reference out of range
1444           append((mode&REX_ICASE)?(OP_REF_CI+level):(OP_REF+level));
1445           pat+=2;
1446           return REGERR_OK;
1447         }
1448       /*fall*/
1449     default:
1450       len=0;
1451       do{
1452         p=pat;                                    // In case we need to back up...
1453         ch=*pat;
1454         switch(ch){
1455           case '^':                               // Bail out on magic characters
1456           case '$':
1457           case '.':
1458           case '(':
1459           case ')':
1460           case '[':
1461           case ']':
1462           case '|':
1463             goto x;
1464           case '\\':
1465             ch=*(pat+1);
1466             switch(ch){
1467               case 'w':                           // Bail out on special matching constructs
1468               case 'W':
1469               case 's':
1470               case 'S':
1471               case 'd':
1472               case 'D':
1473               case 'h':
1474               case 'H':
1475               case 'p':
1476               case 'P':
1477               case 'l':
1478               case 'L':
1479               case 'u':
1480               case 'U':
1481               case 'b':
1482               case 'B':
1483               case 'A':
1484               case 'Z':
1485               case '<':
1486               case '>':
1487               case '1':
1488               case '2':
1489               case '3':
1490               case '4':
1491               case '5':
1492               case '6':
1493               case '7':
1494               case '8':
1495               case '9':
1496                 goto x;
1497               case 'a':                           // Bell
1498                 pat+=2;
1499                 ch='\a';
1500                 break;
1501               case 'e':                           // Escape
1502                 pat+=2;
1503                 ch='\033';
1504                 break;
1505               case 'f':                           // Form feed
1506                 pat+=2;
1507                 ch='\f';
1508                 break;
1509               case 'n':                           // Newline
1510                 pat+=2;
1511                 ch='\n';
1512                 break;
1513               case 'r':                           // Return
1514                 pat+=2;
1515                 ch='\r';
1516                 break;
1517               case 't':                           // Tab
1518                 pat+=2;
1519                 ch='\t';
1520                 break;
1521               case 'v':                           // Vertical tab
1522                 pat+=2;
1523                 ch='\v';
1524                 break;
1525               case 'c':                           // Control character
1526                 pat+=2;
1527                 ch=*pat++;
1528                 if(ch=='\0') return REGERR_NOATOM;// Unexpected pattern end
1529                 ch=Ascii::toUpper(ch)-'@';
1530                 break;
1531               case '0':                           // Octal digit
1532                 pat+=2;
1533                 ch=oct(pat);
1534                 if(ch>256) return REGERR_TOKEN;   // Characters should be 0..255
1535                 break;
1536               case 'x':                           // Hex digit
1537                 pat+=2;
1538                 ch=hex(pat);
1539                 if(ch>256) return REGERR_TOKEN;   // Characters should be 0..255
1540                 break;
1541               case '\0':                          // Unexpected pattern end
1542                 return REGERR_NOATOM;
1543               default:
1544                 pat+=2;
1545                 break;
1546               }
1547             break;
1548           case '\0':                              // Unexpected pattern end
1549             return REGERR_NOATOM;
1550           default:
1551             pat++;
1552             break;
1553           }
1554 
1555         // Make lower case?
1556         if(mode&REX_ICASE) ch=Ascii::toLower(ch);
1557 
1558         // Add to buffer
1559         buf[len++]=(FXuchar)ch;
1560         }
1561       while(*pat!='\0' && *pat!='*' && *pat!='+' && *pat!='?' && *pat!='{' && len<MAXCHARS);
1562 
1563       // Back up one character if followed by a repetition: aaa* is interpreted as (aa)a*
1564 x:    if(1<len && (*pat=='*' || *pat=='+' || *pat=='?' || *pat=='{')){
1565         pat=p;
1566         len--;
1567         }
1568 
1569       FXASSERT(1<=len);
1570 
1571       // Had at least 1 character
1572       flags=FLG_WIDTH;
1573 
1574       // Simple only if length is 1
1575       if(len==1){
1576         flags|=FLG_SIMPLE;
1577         append((mode&REX_ICASE)?OP_CHAR_CI:OP_CHAR,buf[0]);
1578         }
1579 
1580       // Array of characters
1581       else{
1582         append((mode&REX_ICASE)?OP_CHARS_CI:OP_CHARS,len,buf);
1583         }
1584       break;
1585     }
1586   return REGERR_OK;
1587   }
1588 
1589 
1590 // True if character is a word character
isword(int ch)1591 inline int isword(int ch){
1592   return Ascii::isAlphaNumeric(ch) || ch=='_';
1593   }
1594 
1595 
1596 // True if character is punctuation (delimiter) character
isdelim(int ch)1597 inline int isdelim(int ch){
1598   return Ascii::isPunct(ch) && ch!='_';
1599   }
1600 
1601 
1602 // The new character class structure:
1603 //
1604 //          <N>
1605 //   ( <lo_1> <hi_1> )
1606 //   ( <lo_2> <hi_2> )
1607 //      ...    ...
1608 //   ( <lo_N> <hi_N> )
1609 //
1610 // Parse character class
charset()1611 FXRexError FXCompile::charset(){
1612   register FXint first,last,op,i;
1613   FXint set[8];
1614   CLEAR(set);
1615   first=-1;
1616   if(*pat=='^'){                                  // Negated character class
1617     op=OP_ANY_BUT;
1618     pat++;
1619     }
1620   else{
1621     op=OP_ANY_OF;
1622     }
1623   if(*pat=='-' || *pat==']') goto in;             // '-' and ']' are literal at begin
1624   while(*pat!='\0' && *pat!=']'){
1625 in: last=*pat++;
1626     if(last=='\\'){
1627       last=*pat++;
1628       switch(last){
1629         case 'w':
1630           for(i=0; i<256; i++) {if(isword(i)) INCL(set,i); }
1631           first=-1;
1632           continue;
1633         case 'W':
1634           for(i=0; i<256; i++){ if(!isword(i)) INCL(set,i); }
1635           first=-1;
1636           continue;
1637         case 's':
1638           for(i=0; i<256; i++){ if(Ascii::isSpace(i)) INCL(set,i); }
1639           first=-1;
1640           continue;
1641         case 'S':
1642           for(i=0; i<256; i++){ if(!Ascii::isSpace(i)) INCL(set,i); }
1643           first=-1;
1644           continue;
1645         case 'd':
1646           for(i=0; i<256; i++){ if(Ascii::isDigit(i)) INCL(set,i); }
1647           first=-1;
1648           continue;
1649         case 'D':
1650           for(i=0; i<256; i++){ if(!Ascii::isDigit(i)) INCL(set,i); }
1651           first=-1;
1652           continue;
1653         case 'h':
1654           for(i=0; i<256; i++){ if(Ascii::isHexDigit(i)) INCL(set,i); }
1655           first=-1;
1656           continue;
1657         case 'H':
1658           for(i=0; i<256; i++){ if(!Ascii::isHexDigit(i)) INCL(set,i); }
1659           first=-1;
1660           continue;
1661         case 'p':
1662           for(i=0; i<256; i++){ if(isdelim(i)) INCL(set,i); }
1663           first=-1;
1664           continue;
1665         case 'P':
1666           for(i=0; i<256; i++){ if(!isdelim(i)) INCL(set,i); }
1667           first=-1;
1668           continue;
1669         case 'l':
1670           for(i=0; i<256; i++){ if(Ascii::isLetter(i)) INCL(set,i); }
1671           first=-1;
1672           continue;
1673         case 'L':
1674           for(i=0; i<256; i++){ if(!Ascii::isLetter(i)) INCL(set,i); }
1675           first=-1;
1676           continue;
1677         case 'u':
1678           for(i=0; i<256; i++){ if(Ascii::isUpper(i)) INCL(set,i); }
1679           first=-1;
1680           continue;
1681         case 'U':
1682           for(i=0; i<256; i++){ if(Ascii::isLower(i)) INCL(set,i); }
1683           first=-1;
1684           continue;
1685         case 'a':                             // Bell
1686           last='\a';
1687           break;
1688         case 'e':                             // Escape
1689           last='\033';
1690           break;
1691         case 'b':                             // Backspace
1692           last='\b';
1693           break;
1694         case 'f':                             // Form feed
1695           last='\f';
1696           break;
1697         case 'n':                             // Newline
1698           last='\n';
1699           break;
1700         case 'r':                             // Return
1701           last='\r';
1702           break;
1703         case 't':                             // Tab
1704           last='\t';
1705           break;
1706         case 'v':                             // Vertical tab
1707           last='\v';
1708           break;
1709         case 'c':                             // Control character
1710           last=*pat++;
1711           if(last=='\0') return REGERR_NOATOM;// Unexpected pattern end
1712           last=Ascii::toUpper(last)-'@';
1713           break;
1714         case '0':                             // Octal digit
1715           last=oct(pat);
1716           break;
1717         case 'x':                             // Hex digit
1718           last=hex(pat);
1719           break;
1720         case '\0':
1721           return REGERR_NOATOM;               // Unexpected pattern end
1722         }
1723       }
1724     if(first==-1){
1725       if(mode&REX_ICASE){
1726         INCL(set,Ascii::toLower(last));
1727         INCL(set,Ascii::toUpper(last));
1728         }
1729       else{
1730         INCL(set,last);
1731         }
1732       if(*pat=='-' && *(pat+1)!='\0' && *(pat+1)!=']'){
1733         first=last;
1734         pat++;
1735         }
1736       }
1737     else{
1738       if(first>=last) return REGERR_RANGE;    // Bad range
1739       if(mode&REX_ICASE){
1740         for(i=first; i<=last; i++){
1741           INCL(set,Ascii::toLower(i));
1742           INCL(set,Ascii::toUpper(i));
1743           }
1744         }
1745       else{
1746         for(i=first; i<=last; i++){
1747           INCL(set,i);
1748           }
1749         }
1750       first=-1;
1751       }
1752     }
1753 
1754   // Are we matching newlines
1755   if((op==OP_ANY_BUT) && !(mode&REX_NEWLINE) && !ISIN(set,'\n')){
1756     INCL(set,'\n');
1757     }
1758 
1759   // Emit opcode
1760   append(op,set);
1761   return REGERR_OK;
1762   }
1763 
1764 
1765 // Append opcode
append(FXint op)1766 FXint* FXCompile::append(FXint op){
1767   register FXint *val=pc;
1768   if(code){
1769     pc[0]=op;
1770     }
1771   pc++;
1772   return val;
1773   }
1774 
1775 
1776 // Append one-argument opcode
append(FXint op,FXint arg)1777 FXint* FXCompile::append(FXint op,FXint arg){
1778   register FXint *val=pc;
1779   if(code){
1780     pc[0]=op;
1781     pc[1]=arg;
1782     }
1783   pc+=2;
1784   return val;
1785   }
1786 
1787 
1788 // Append two-argument opcode
append(FXint op,FXint arg1,FXint arg2)1789 FXint* FXCompile::append(FXint op,FXint arg1,FXint arg2){
1790   register FXint *val=pc;
1791   if(code){
1792     pc[0]=op;
1793     pc[1]=arg1;
1794     pc[2]=arg2;
1795     }
1796   pc+=3;
1797   return val;
1798   }
1799 
1800 
1801 // Append character class opcode
append(FXint op,FXint set[])1802 FXint* FXCompile::append(FXint op,FXint set[]){
1803   register FXint *val=pc;
1804   if(code){
1805     pc[0]=op;
1806     pc[1]=set[0];
1807     pc[2]=set[1];
1808     pc[3]=set[2];
1809     pc[4]=set[3];
1810     pc[5]=set[4];
1811     pc[6]=set[5];
1812     pc[7]=set[6];
1813     pc[8]=set[7];
1814     }
1815   pc+=9;
1816   return val;
1817   }
1818 
1819 
1820 // Append character array
append(FXint op,FXint len,FXint * data)1821 FXint* FXCompile::append(FXint op,FXint len,FXint *data){
1822   register FXint *val=pc;
1823   if(code){
1824     pc[0]=op;
1825     pc[1]=len;
1826     memcpy(pc+2,data,sizeof(FXint)*len);
1827     }
1828   pc+=len+2;
1829   return val;
1830   }
1831 
1832 
1833 // Insert opcode at ptr
insert(FXint * ptr,FXint op)1834 FXint* FXCompile::insert(FXint *ptr,FXint op){
1835   if(code){
1836     memmove(ptr+1,ptr,sizeof(FXint)*(pc-ptr));
1837     ptr[0]=op;
1838     }
1839   pc+=1;
1840   return ptr;
1841   }
1842 
1843 
1844 // Insert one-argument opcode at ptr
insert(FXint * ptr,FXint op,FXint arg)1845 FXint* FXCompile::insert(FXint *ptr,FXint op,FXint arg){
1846   if(code){
1847     memmove(ptr+2,ptr,sizeof(FXint)*(pc-ptr));
1848     ptr[0]=op;
1849     ptr[1]=arg;
1850     }
1851   pc+=2;
1852   return ptr;
1853   }
1854 
1855 
1856 // Insert two-argument opcode at ptr
insert(FXint * ptr,FXint op,FXint arg1,FXint arg2)1857 FXint* FXCompile::insert(FXint *ptr,FXint op,FXint arg1,FXint arg2){
1858   if(code){
1859     memmove(ptr+3,ptr,sizeof(FXint)*(pc-ptr));
1860     ptr[0]=op;
1861     ptr[1]=arg1;
1862     ptr[2]=arg2;
1863     }
1864   pc+=3;
1865   return ptr;
1866   }
1867 
1868 
1869 
1870 // Patch linked set of branches or jumps
1871 // Example:
1872 //
1873 //      Before:        After:
1874 //      ==========================
1875 //      0:  OP_JUMP    0:  OP_JUMP
1876 //      1:  0          1:  9
1877 //      2:  ....       2:  ....
1878 //      3:  OP_JUMP    3:  OP_JUMP
1879 //      4:  -3         4:  6
1880 //      5:  ....       5:  ....
1881 //      6:  ....       6:  ....
1882 //      7:  OP_JUMP    7:  OP_JUMP
1883 // fm-> 8:  -4         8:  2
1884 //      9:  ....       9:  ....
1885 // to->10:  ....      10:  ....
1886 //
patch(FXint * fm,FXint * to)1887 void FXCompile::patch(FXint *fm,FXint *to){
1888   register FXint delta;
1889   if(code && fm){
1890     do{
1891       delta=*fm;
1892       *fm=to-fm;
1893       fm+=delta;
1894       }
1895     while(delta);
1896     }
1897   }
1898 
1899 
1900 /*******************************************************************************/
1901 
1902 // FXExecute members
1903 
1904 // The workhorse
match(const FXint * prog)1905 bool FXExecute::match(const FXint* prog){
1906   register FXint no,keep,rep_min,rep_max,greed,op;
1907   register const FXchar *save,*beg,*end;
1908   register FXchar ch;
1909   for(;;){
1910     op=*prog++;
1911     switch(op){
1912       case OP_END:
1913         return true;
1914       case OP_FAIL:           // Fail (sub) pattern
1915         return false;
1916       case OP_SUCCEED:        // Succeed (sub) pattern
1917         return true;
1918       case OP_JUMP:
1919         prog+=*prog;
1920         break;
1921       case OP_BRANCH:         // Jump after trying following code
1922         save=str;
1923         if(match(prog+1)) return true;
1924         str=save;
1925         prog+=*prog;
1926         break;
1927       case OP_BRANCHREV:      // Jump before trying following code
1928         save=str;
1929         if(match(prog+*prog)) return true;
1930         str=save;
1931         prog++;
1932         break;
1933       case OP_LINE_BEG:       // Must be at begin of line
1934         if((str==str_beg && (mode&REX_NOT_BOL)) || (str_beg<str && *(str-1)!='\n')) return false;
1935         break;
1936       case OP_LINE_END:       // Must be at end of line
1937         if((str==str_end && (mode&REX_NOT_EOL)) || (str<str_end && *str!='\n')) return false;
1938         break;
1939       case OP_WORD_BEG:       // Must be at begin of word
1940         if(str_beg<str && isword((FXuchar) *(str-1))) return false;
1941         if(str_end<=str || !isword((FXuchar) *str)) return false;
1942         break;
1943       case OP_WORD_END:       // Must be at end of word
1944         if(str<str_end && isword((FXuchar) *str)) return false;
1945         if(str<=str_beg || !isword((FXuchar) *(str-1))) return false;
1946         break;
1947       case OP_WORD_BND:       // Must be at word boundary
1948         if(!(((str==str_beg || !isword((FXuchar) *(str-1))) && (str<str_end && isword((FXuchar) *str))) || ((str==str_end || !isword((FXuchar) *str)) && (str_beg<str && isword((FXuchar) *(str-1)))))) return false;
1949         break;
1950       case OP_WORD_INT:       // Must be inside a word
1951         if(str==str_beg || !isword((FXuchar) *(str-1))) return false;
1952         if(str==str_end || !isword((FXuchar) *str)) return false;
1953         break;
1954       case OP_STR_BEG:        // Must be at begin of entire string
1955         if(str!=str_beg) return false;
1956         break;
1957       case OP_STR_END:        // Must be at end of entire string
1958         if(str!=str_end) return false;
1959         break;
1960       case OP_ANY_OF:         // Match a character in a set
1961         if(str==str_end || !ISIN(prog,*str)) return false;
1962         prog+=8;
1963         str++;
1964         break;
1965       case OP_ANY_BUT:        // Match a character NOT in a set
1966         if(str==str_end || ISIN(prog,*str)) return false;
1967         prog+=8;
1968         str++;
1969         break;
1970       case OP_CHAR:           // Match single character
1971         if(str==str_end || *prog != *str) return false;
1972         prog++;
1973         str++;
1974         break;
1975       case OP_CHAR_CI:        // Match single character, disregard case
1976         if(str==str_end || *prog != Ascii::toLower(*str)) return false;
1977         prog++;
1978         str++;
1979         break;
1980       case OP_CHARS:          // Match a run of 1 or more characters
1981         no=*prog++;
1982         if(str+no>str_end) return false;
1983         do{
1984           if(*prog++ != (FXuchar)*str++) return false;
1985           }
1986         while(--no);
1987         break;
1988       case OP_CHARS_CI:       // Match a run of 1 or more characters, disregard case
1989         no=*prog++;
1990         if(str+no>str_end) return false;
1991         do{
1992           if(*prog++ != Ascii::toLower(*str++)) return false;
1993           }
1994         while(--no);
1995         break;
1996       case OP_SPACE:          // Match space
1997         if(str==str_end || *str=='\n' || !Ascii::isSpace(*str)) return false;
1998         str++;
1999         break;
2000       case OP_SPACE_NL:       // Match space including newline
2001         if(str==str_end || !Ascii::isSpace(*str)) return false;
2002         str++;
2003         break;
2004       case OP_NOT_SPACE:      // Match non-space
2005         if(str==str_end || Ascii::isSpace(*str)) return false;
2006         str++;
2007         break;
2008       case OP_DIGIT:          // Match a digit 0..9
2009         if(str==str_end || !Ascii::isDigit(*str)) return false;
2010         str++;
2011         break;
2012       case OP_NOT_DIGIT:      // Match a non-digit
2013         if(str==str_end || *str=='\n' || Ascii::isDigit(*str)) return false;
2014         str++;
2015         break;
2016       case OP_NOT_DIGIT_NL:   // Match a non-digit including newline
2017         if(str==str_end || Ascii::isDigit(*str)) return false;
2018         str++;
2019         break;
2020       case OP_HEX:            // Match a hex digit 0..9A-Fa-f
2021         if(str==str_end || !Ascii::isHexDigit(*str)) return false;
2022         str++;
2023         break;
2024       case OP_NOT_HEX:        // Match a non-hex digit
2025         if(str==str_end || *str=='\n' || Ascii::isHexDigit(*str)) return false;
2026         str++;
2027         break;
2028       case OP_NOT_HEX_NL:     // Match a non-hex digit including newline
2029         if(str==str_end || Ascii::isHexDigit(*str)) return false;
2030         str++;
2031         break;
2032       case OP_PUNCT:          // Match a punctuation
2033         if(str==str_end || !isdelim((FXuchar) *str)) return false;
2034         str++;
2035         break;
2036       case OP_NOT_PUNCT:      // Match a non-punctuation
2037         if(str==str_end || *str=='\n' || isdelim((FXuchar) *str)) return false;
2038         str++;
2039         break;
2040       case OP_NOT_PUNCT_NL:   // Match a non-punctuation including newline
2041         if(str==str_end || isdelim((FXuchar) *str)) return false;
2042         str++;
2043         break;
2044       case OP_LETTER:         // Match a letter a..z, A..Z
2045         if(str==str_end || !Ascii::isLetter(*str)) return false;
2046         str++;
2047         break;
2048       case OP_NOT_LETTER:     // Match a non-letter
2049         if(str==str_end || *str=='\n' || Ascii::isLetter(*str)) return false;
2050         str++;
2051         break;
2052       case OP_NOT_LETTER_NL:  // Match a non-letter including newline
2053         if(str==str_end || Ascii::isLetter(*str)) return false;
2054         str++;
2055         break;
2056       case OP_WORD:           // Match a word character a..z,A..Z,0..9,_
2057         if(str==str_end || !isword((FXuchar) *str)) return false;
2058         str++;
2059         break;
2060       case OP_NOT_WORD:       // Match a non-word character
2061         if(str==str_end || *str=='\n' || isword((FXuchar) *str)) return false;
2062         str++;
2063         break;
2064       case OP_NOT_WORD_NL:    // Match a non-word character including newline
2065         if(str==str_end || isword((FXuchar) *str)) return false;
2066         str++;
2067         break;
2068       case OP_UPPER:          // Match if uppercase
2069         if(str==str_end || !Ascii::isUpper(*str)) return false;
2070         str++;
2071         break;
2072       case OP_LOWER:          // Match if lowercase
2073         if(str==str_end || !Ascii::isLower(*str)) return false;
2074         str++;
2075         break;
2076       case OP_ANY:            // Match any character
2077         if(str==str_end || *str=='\n') return false;
2078         str++;
2079         break;
2080       case OP_ANY_NL:         // Matches any character including newline
2081         if(str==str_end) return false;
2082         str++;
2083         break;
2084       case OP_MIN_PLUS:       // Lazy one or more repetitions
2085         rep_min=1;
2086         rep_max=ONEINDIG;
2087         greed=0;
2088         goto rep;
2089       case OP_POS_PLUS:       // Possessive one or more repetitions
2090         rep_min=1;
2091         rep_max=ONEINDIG;
2092         greed=2;
2093         goto rep;
2094       case OP_PLUS:           // Greedy one or more repetitions
2095         rep_min=1;
2096         rep_max=ONEINDIG;
2097         greed=1;
2098         goto rep;
2099       case OP_MIN_QUEST:      // Lazy zero or one
2100         rep_min=0;
2101         rep_max=1;
2102         greed=0;
2103         goto rep;
2104       case OP_POS_QUEST:      // Possessive zero or one
2105         rep_min=0;
2106         rep_max=1;
2107         greed=2;
2108         goto rep;
2109       case OP_QUEST:          // Greedy zero or one
2110         rep_min=0;
2111         rep_max=1;
2112         greed=1;
2113         goto rep;
2114       case OP_MIN_REP:        // Lazy bounded repeat
2115         rep_min=*prog++;
2116         rep_max=*prog++;
2117         greed=0;
2118         goto rep;
2119       case OP_POS_REP:        // Possessive bounded repeat
2120         rep_min=*prog++;
2121         rep_max=*prog++;
2122         greed=2;
2123         goto rep;
2124       case OP_REP:            // Greedy bounded repeat
2125         rep_min=*prog++;
2126         rep_max=*prog++;
2127         greed=1;
2128         goto rep;
2129       case OP_MIN_STAR:       // Lazy zero or more repetitions
2130         rep_min=0;
2131         rep_max=ONEINDIG;
2132         greed=0;
2133         goto rep;
2134       case OP_POS_STAR:       // Possessive zero or more repetitions
2135         rep_min=0;
2136         rep_max=ONEINDIG;
2137         greed=2;
2138         goto rep;
2139       case OP_STAR:           // Greedy zero or more repetitions
2140         rep_min=0;
2141         rep_max=ONEINDIG;
2142         greed=1;
2143 
2144         // We need to match more characters than are available
2145 rep:    if(str+rep_min>str_end) return false;
2146         beg=str;
2147         end=beg+rep_max;
2148         if(end>str_end) end=str_end;
2149         save=beg;
2150 
2151         // Find out how much could be matched
2152         op=*prog++;
2153         switch(op){
2154           case OP_CHAR:         // For UTF8 we should have OP_CHAR2, OP_CHAR3, ... OP_CHAR6, for  possible UTF8 lengths
2155             ch=*prog++;
2156             while(save<end && *save==ch) save++;
2157             break;
2158           case OP_CHAR_CI:
2159             ch=*prog++;
2160             while(save<end && Ascii::toLower(*save)==ch && *save!='\n') save++;
2161             break;
2162           case OP_CHARS:
2163             ch=*++prog;
2164             while(save<end && *save==ch) save++;
2165             prog+=3;
2166             break;
2167           case OP_CHARS_CI:
2168             ch=*++prog;
2169             while(save<end && Ascii::toLower(*save)==ch) save++;
2170             prog+=3;
2171             break;
2172           case OP_ANY_OF:
2173             while(save<end && ISIN(prog,*save)) save++;
2174             prog+=8;
2175             break;
2176           case OP_ANY_BUT:
2177             while(save<end && !ISIN(prog,*save)) save++;
2178             prog+=8;
2179             break;
2180           case OP_SPACE:
2181             while(save<end && *save!='\n' && Ascii::isSpace(*save)) save++;
2182             break;
2183           case OP_SPACE_NL:
2184             while(save<end && Ascii::isSpace(*save)) save++;
2185             break;
2186           case OP_NOT_SPACE:
2187             while(save<end && !Ascii::isSpace(*save)) save++;
2188             break;
2189           case OP_DIGIT:
2190             while(save<end && Ascii::isDigit(*save)) save++;
2191             break;
2192           case OP_NOT_DIGIT:
2193             while(save<end && *save!='\n' && !Ascii::isDigit(*save)) save++;
2194             break;
2195           case OP_NOT_DIGIT_NL:
2196             while(save<end && !Ascii::isDigit(*save)) save++;
2197             break;
2198           case OP_HEX:
2199             while(save<end && Ascii::isHexDigit(*save)) save++;
2200             break;
2201           case OP_NOT_HEX:
2202             while(save<end && *save!='\n' && !Ascii::isHexDigit(*save)) save++;
2203             break;
2204           case OP_NOT_HEX_NL:
2205             while(save<end && !Ascii::isHexDigit(*save)) save++;
2206             break;
2207           case OP_PUNCT:
2208             while(save<end && isdelim((FXuchar) *save)) save++;
2209             break;
2210           case OP_NOT_PUNCT:
2211             while(save<end && *save!='\n' && !isdelim((FXuchar) *save)) save++;
2212             break;
2213           case OP_NOT_PUNCT_NL:
2214             while(save<end && !isdelim((FXuchar) *save)) save++;
2215             break;
2216           case OP_LETTER:
2217             while(save<end && Ascii::isLetter(*save)) save++;
2218             break;
2219           case OP_NOT_LETTER:
2220             while(save<end && *save!='\n' && !Ascii::isLetter(*save)) save++;
2221             break;
2222           case OP_NOT_LETTER_NL:
2223             while(save<end && !Ascii::isLetter(*save)) save++;
2224             break;
2225           case OP_WORD:
2226             while(save<end && isword((FXuchar) *save)) save++;
2227             break;
2228           case OP_NOT_WORD:
2229             while(save<end && *save!='\n' && !isword((FXuchar) *save)) save++;
2230             break;
2231           case OP_NOT_WORD_NL:
2232             while(save<end && !isword((FXuchar) *save)) save++;
2233             break;
2234           case OP_UPPER:
2235             while(save<end && Ascii::isUpper(*save)) save++;
2236             break;
2237           case OP_LOWER:
2238             while(save<end && Ascii::isLower(*save)) save++;
2239             break;
2240           case OP_ANY:
2241             while(save<end && *save!='\n') save++;
2242             break;
2243           case OP_ANY_NL:
2244             save=end; // Big byte
2245             break;
2246           default:
2247             fxerror("FXRex::match: bad opcode (%d) at: %p on line: %d\n",op,prog-1,__LINE__);
2248             break;
2249           }
2250 
2251         // Matched fewer than the minimum desired so bail out
2252         if(save<beg+rep_min) return false;
2253 
2254         // We must match between beg and end characters
2255         beg+=rep_min;
2256         end=save;
2257 
2258         switch(greed){
2259           case 0:                     // Lazily match the fewest characters
2260             while(beg<=end){
2261               str=beg;
2262               if(match(prog)) return true;
2263               beg++;
2264               }
2265             return false;
2266           case 1:                     // Greedily match the most characters
2267             while(beg<=end){
2268               str=end;
2269               if(match(prog)) return true;
2270               end--;
2271               }
2272             return false;
2273           case 2:                     // Possessive match
2274             return match(prog);
2275           }
2276         return false;
2277       case OP_SUB_BEG+0:              // Capturing open parentheses
2278       case OP_SUB_BEG+1:
2279       case OP_SUB_BEG+2:
2280       case OP_SUB_BEG+3:
2281       case OP_SUB_BEG+4:
2282       case OP_SUB_BEG+5:
2283       case OP_SUB_BEG+6:
2284       case OP_SUB_BEG+7:
2285       case OP_SUB_BEG+8:
2286       case OP_SUB_BEG+9:
2287         no=op-OP_SUB_BEG;
2288         if(no>=npar) break;           // Match w/o capture if array too small
2289         keep=sub_beg[no];             // Keep old value
2290         sub_beg[no]=str-str_beg;      // Tentatively set new value
2291         if(match(prog)) return true;  // Match the rest
2292         sub_beg[no]=keep;             // Restore old value
2293         return false;
2294       case OP_SUB_END+0:              // Capturing close parentheses
2295       case OP_SUB_END+1:
2296       case OP_SUB_END+2:
2297       case OP_SUB_END+3:
2298       case OP_SUB_END+4:
2299       case OP_SUB_END+5:
2300       case OP_SUB_END+6:
2301       case OP_SUB_END+7:
2302       case OP_SUB_END+8:
2303       case OP_SUB_END+9:
2304         no=op-OP_SUB_END;
2305         if(no>=npar) break;           // Match w/o capture if array too small
2306         keep=sub_end[no];
2307         sub_end[no]=str-str_beg;      // Remember capture end for future back reference
2308         if(match(prog)) return true;
2309         sub_end[no]=keep;             // Restore old value
2310         return false;
2311       case OP_REF+0:                  // Back reference to capturing parentheses
2312       case OP_REF+1:
2313       case OP_REF+2:
2314       case OP_REF+3:
2315       case OP_REF+4:
2316       case OP_REF+5:
2317       case OP_REF+6:
2318       case OP_REF+7:
2319       case OP_REF+8:
2320       case OP_REF+9:
2321         no=op-OP_REF;
2322         if(no>=npar) return false;                    // Arrays were too small
2323         if(sub_beg[no]<0) return false;               // Not captured yet
2324         if(sub_end[no]<0) return false;               // Not captured yet
2325         beg=str_beg+sub_beg[no];
2326         end=str_beg+sub_end[no];
2327         if(beg<end){                                  // Empty capture matches!
2328           if(str+(end-beg)>str_end) return false;     // Not enough characters left
2329           do{
2330             if(*beg != *str) return false;            // No match
2331             beg++;
2332             str++;
2333             }
2334           while(beg<end);
2335           }
2336         break;
2337       case OP_REF_CI+0:               // Back reference to capturing parentheses
2338       case OP_REF_CI+1:
2339       case OP_REF_CI+2:
2340       case OP_REF_CI+3:
2341       case OP_REF_CI+4:
2342       case OP_REF_CI+5:
2343       case OP_REF_CI+6:
2344       case OP_REF_CI+7:
2345       case OP_REF_CI+8:
2346       case OP_REF_CI+9:
2347         no=op-OP_REF_CI;
2348         if(no>=npar) return false;                    // Arrays were too small
2349         if(sub_beg[no]<0) return false;               // Not captured yet
2350         if(sub_end[no]<0) return false;               // Not captured yet
2351         beg=str_beg+sub_beg[no];
2352         end=str_beg+sub_end[no];
2353         if(beg<end){                                  // Empty capture matches!
2354           if(str+(end-beg)>str_end) return false;     // Not enough characters left
2355           do{
2356             if(*beg != Ascii::toLower(*str)) return false;            // No match
2357             beg++;
2358             str++;
2359             }
2360           while(beg<end);
2361           }
2362         break;
2363       case OP_LOOK_NEG:               // Positive or negative look ahead
2364       case OP_LOOK_POS:
2365         save=str;
2366         keep=match(prog+1);
2367         str=save;
2368         if((op-OP_LOOK_NEG)!=keep) return false;      // Didn't get what we expected
2369         prog=prog+*prog;              // Jump to code after OP_SUCCEED
2370         break;
2371       case OP_ZERO+0:                 // Initialize counter for counting repeat
2372       case OP_ZERO+1:
2373       case OP_ZERO+2:
2374       case OP_ZERO+3:
2375       case OP_ZERO+4:
2376       case OP_ZERO+5:
2377       case OP_ZERO+6:
2378       case OP_ZERO+7:
2379       case OP_ZERO+8:
2380       case OP_ZERO+9:
2381         count[op-OP_ZERO]=0;
2382         break;
2383       case OP_INCR+0:                 // Increment counter for counting repeat
2384       case OP_INCR+1:
2385       case OP_INCR+2:
2386       case OP_INCR+3:
2387       case OP_INCR+4:
2388       case OP_INCR+5:
2389       case OP_INCR+6:
2390       case OP_INCR+7:
2391       case OP_INCR+8:
2392       case OP_INCR+9:
2393         count[op-OP_INCR]++;
2394         break;
2395       case OP_JUMPLT+0:               // Jump if counter less than value
2396       case OP_JUMPLT+1:
2397       case OP_JUMPLT+2:
2398       case OP_JUMPLT+3:
2399       case OP_JUMPLT+4:
2400       case OP_JUMPLT+5:
2401       case OP_JUMPLT+6:
2402       case OP_JUMPLT+7:
2403       case OP_JUMPLT+8:
2404       case OP_JUMPLT+9:
2405         if(count[op-OP_JUMPLT] < *prog++)   // Compare with value
2406           prog+=*prog;
2407         else
2408           prog++;
2409         break;
2410       case OP_JUMPGT+0:               // Jump if counter greater than value
2411       case OP_JUMPGT+1:
2412       case OP_JUMPGT+2:
2413       case OP_JUMPGT+3:
2414       case OP_JUMPGT+4:
2415       case OP_JUMPGT+5:
2416       case OP_JUMPGT+6:
2417       case OP_JUMPGT+7:
2418       case OP_JUMPGT+8:
2419       case OP_JUMPGT+9:
2420         if(count[op-OP_JUMPGT] > *prog++)   // Compare with value
2421           prog+=*prog;
2422         else
2423           prog++;
2424         break;
2425       default:
2426         fxerror("FXRex::match: bad opcode (%d) at: %p on line: %d\n",op,prog-1,__LINE__);
2427         break;
2428       }
2429     }
2430   return false;
2431   }
2432 
2433 
2434 // regtry - try match at specific point; 0 failure, 1 success
attempt(const FXchar * string)2435 bool FXExecute::attempt(const FXchar* string){
2436   register FXint i=npar;
2437   str=string;
2438   do{--i;sub_beg[i]=sub_end[i]=-1;}while(i);          // Possibly move this to FXExecute::execute?
2439   if(match(code+1)){
2440     if(string!=str || !(mode&REX_NOT_EMPTY)){         // Match if non-empty or empty is allowed!
2441       sub_beg[0]=string-str_beg;
2442       sub_end[0]=str-str_beg;
2443       return true;
2444       }
2445     }
2446   return false;
2447   }
2448 
2449 
2450 // Match subject string, returning number of matches found
execute(const FXchar * fm,const FXchar * to)2451 bool FXExecute::execute(const FXchar* fm,const FXchar* to){
2452   register FXchar ch;
2453 
2454   // Simple case
2455   if(fm==to) return attempt(fm);
2456 
2457   // Match backwards
2458   if(mode&REX_BACKWARD){
2459     if(code[1]==OP_STR_BEG){                          // Anchored at string start
2460       return (fm==str_beg) && attempt(str_beg);
2461       }
2462     if(code[1]==OP_LINE_BEG){                         // Anchored at BOL
2463       while(fm<=to){
2464         if(((to==str_beg)||(*(to-1)=='\n')) && attempt(to)) return true;
2465         to--;
2466         }
2467       return false;
2468       }
2469     if(code[1]==OP_CHAR || code[1]==OP_CHARS){        // Known starting character
2470       ch=(code[1]==OP_CHAR)?code[2]:code[3];
2471       if(to==str_end) to--;
2472       while(fm<=to){
2473         if(*to==ch && attempt(to)) return true;
2474         to--;
2475         }
2476       return false;
2477       }
2478     if(code[1]==OP_CHAR_CI || code[1]==OP_CHARS_CI){  // Known starting character, ignoring case
2479       ch=(code[1]==OP_CHAR_CI)?code[2]:code[3];
2480       if(to==str_end) to--;
2481       while(fm<=to){
2482         if(Ascii::toLower(*to)==ch && attempt(to)) return true;
2483         to--;
2484         }
2485       return false;
2486       }
2487     while(fm<=to){                                    // General case
2488       if(attempt(to)) return true;
2489       to--;
2490       }
2491     }
2492 
2493   // Match forwards
2494   else{
2495     if(code[1]==OP_STR_BEG){                          // Anchored at string start
2496       return (fm==str_beg) && attempt(str_beg);
2497       }
2498     if(code[1]==OP_LINE_BEG){                         // Anchored at BOL
2499       while(fm<=to){
2500         if(((fm==str_beg)||(*(fm-1)=='\n')) && attempt(fm)) return true;
2501         fm++;
2502         }
2503       return false;
2504       }
2505     if(code[1]==OP_CHAR || code[1]==OP_CHARS){        // Known starting character
2506       ch=(code[1]==OP_CHAR)?code[2]:code[3];
2507       if(to==str_end) to--;
2508       while(fm<=to){
2509         if(*fm==ch && attempt(fm)) return true;
2510         fm++;
2511         }
2512       return false;
2513       }
2514     if(code[1]==OP_CHAR_CI || code[1]==OP_CHARS_CI){  // Known starting character, ignoring case
2515       ch=(code[1]==OP_CHAR_CI)?code[2]:code[3];
2516       if(to==str_end) to--;
2517       while(fm<=to){
2518         if(Ascii::toLower(*fm)==ch && attempt(fm)) return true;
2519         fm++;
2520         }
2521       return false;
2522       }
2523     while(fm<=to){                                   // General case
2524       if(attempt(fm)) return true;
2525       fm++;
2526       }
2527     }
2528   return false;
2529   }
2530 
2531 }
2532 
2533 /*******************************************************************************/
2534 
2535 namespace FX {
2536 
2537 // Table of error messages
2538 const FXchar *const FXRex::errors[]={
2539   "OK",
2540   "Empty pattern",
2541   "Unmatched parenthesis",
2542   "Unmatched bracket",
2543   "Unmatched brace",
2544   "Bad character range",
2545   "Bad escape sequence",
2546   "Bad counted repeat",
2547   "No atom preceding repetition",
2548   "Repeat following repeat",
2549   "Bad backward reference",
2550   "Bad character class",
2551   "Expression too complex",
2552   "Out of memory",
2553   "Illegal token"
2554   };
2555 
2556 
2557 // Default program always fails
2558 const FXint FXRex::fallback[]={2,OP_FAIL};
2559 
2560 
2561 // Copy regex object
FXRex(const FXRex & orig)2562 FXRex::FXRex(const FXRex& orig){
2563   code=(FXint*)fallback;
2564   if(orig.code!=fallback){
2565     FXMEMDUP(&code,orig.code,FXint,orig.code[0]);
2566     }
2567   }
2568 
2569 
2570 // Compile expression from pattern; fail if error
FXRex(const FXchar * pattern,FXint mode,FXRexError * error)2571 FXRex::FXRex(const FXchar* pattern,FXint mode,FXRexError* error):code((FXint*)fallback){
2572   FXRexError err=parse(pattern,mode);
2573   if(error){ *error=err; }
2574   }
2575 
2576 
2577 // Compile expression from pattern; fail if error
FXRex(const FXString & pattern,FXint mode,FXRexError * error)2578 FXRex::FXRex(const FXString& pattern,FXint mode,FXRexError* error):code((FXint*)fallback){
2579   FXRexError err=parse(pattern.text(),mode);
2580   if(error){ *error=err; }
2581   }
2582 
2583 
2584 // Assignment
operator =(const FXRex & orig)2585 FXRex& FXRex::operator=(const FXRex& orig){
2586   if(code!=orig.code){
2587     if(code!=fallback) FXFREE(&code);
2588     code=(FXint*)fallback;
2589     if(orig.code!=fallback){
2590       FXMEMDUP(&code,orig.code,FXint,orig.code[0]);
2591       }
2592     }
2593   return *this;
2594   }
2595 
2596 
2597 // Parse pattern
parse(const FXchar * pattern,FXint mode)2598 FXRexError FXRex::parse(const FXchar* pattern,FXint mode){
2599   FXRexError err=REGERR_EMPTY;
2600   FXCompile cs;
2601   FXint flags,size;
2602 
2603   // Free old code, if any
2604   if(code!=fallback) FXFREE(&code);
2605   code=(FXint*)fallback;
2606 
2607   // Check
2608   if(pattern){
2609 
2610     // Fill in compile data
2611     cs.code=NULL;
2612     cs.pc=NULL;
2613     cs.pat=pattern;
2614     cs.mode=mode;
2615     cs.nbra=0;
2616     cs.npar=0;
2617 
2618     // Unknown size
2619     cs.append(0);
2620 
2621     // Check syntax and amount of memory needed
2622     err=cs.compile(flags);
2623     if(err==REGERR_OK){
2624 
2625       // Compile code unless only syntax checking
2626       if(!(mode&REX_SYNTAX)){
2627 
2628         // Allocate new code
2629         size=cs.pc-((FXint*)NULL);
2630         if(!FXMALLOC(&code,FXint,size)){
2631           code=(FXint*)fallback;
2632           return REGERR_MEMORY;
2633           }
2634 
2635         // Fill in compile data
2636         cs.code=code;
2637         cs.pc=code;
2638         cs.pat=pattern;
2639         cs.mode=mode;
2640         cs.nbra=0;
2641         cs.npar=0;
2642 
2643         // Size of program
2644         cs.append(size);
2645 
2646         // Generate program
2647         err=cs.compile(flags);
2648 
2649         // Dump for debugging
2650 #ifndef NDEBUG
2651         if(fxTraceLevel>100) dump(code);
2652 #endif
2653         }
2654       }
2655     }
2656   return err;
2657   }
2658 
2659 
2660 // Parse pattern, return error code if syntax error is found
parse(const FXString & pattern,FXint mode)2661 FXRexError FXRex::parse(const FXString& pattern,FXint mode){
2662   return parse(pattern.text(),mode);
2663   }
2664 
2665 
2666 /*******************************************************************************/
2667 
2668 
2669 // Match subject string, returning number of matches found
match(const FXchar * string,FXint len,FXint * beg,FXint * end,FXint mode,FXint npar,FXint fm,FXint to) const2670 bool FXRex::match(const FXchar* string,FXint len,FXint* beg,FXint* end,FXint mode,FXint npar,FXint fm,FXint to) const {
2671   if(!string || len<0 || npar<1 || NSUBEXP<npar){ fxerror("FXRex::match: bad argument.\n"); }
2672   if(fm<0) fm=0;
2673   if(to>len) to=len;
2674   if(fm<=to){
2675     FXint abeg[NSUBEXP];
2676     FXint aend[NSUBEXP];
2677     FXExecute ms;
2678     if(!beg) beg=abeg;
2679     if(!end) end=aend;
2680     ms.str_beg=string;
2681     ms.str_end=string+len;
2682     ms.sub_beg=beg;
2683     ms.sub_end=end;
2684     ms.code=code;
2685     ms.npar=npar;
2686     ms.mode=mode;
2687     return ms.execute(string+fm,string+to);
2688     }
2689   return false;
2690   }
2691 
2692 
2693 // Search for match in string
match(const FXString & string,FXint * beg,FXint * end,FXint mode,FXint npar,FXint fm,FXint to) const2694 bool FXRex::match(const FXString& string,FXint* beg,FXint* end,FXint mode,FXint npar,FXint fm,FXint to) const {
2695   return match(string.text(),string.length(),beg,end,mode,npar,fm,to);
2696   }
2697 
2698 
2699 // Return substitution string
substitute(const FXchar * string,FXint len,FXint * beg,FXint * end,const FXString & replace,FXint npar)2700 FXString FXRex::substitute(const FXchar* string,FXint len,FXint* beg,FXint* end,const FXString& replace,FXint npar){
2701   register FXint ch,n,i=0;
2702   FXString result;
2703   if(!string || len<0 || !beg || !end || npar<1 || NSUBEXP<npar){ fxerror("FXRex::substitute: bad argument.\n"); }
2704   while((ch=replace[i++])!='\0'){
2705     if(ch=='&'){
2706       if(0<=beg[0] && end[0]<=len){result.append(&string[beg[0]],end[0]-beg[0]);}
2707       }
2708     else if(ch=='\\' && '0'<=replace[i] && replace[i]<='9'){
2709       n=replace[i++]-'0';
2710       if(n<npar && 0<=beg[n] && end[n]<=len){result.append(&string[beg[n]],end[n]-beg[n]);}
2711       }
2712     else{
2713       if(ch=='\\' && (replace[i]=='\\' || replace[i]=='&')){ch=replace[i++];}
2714       result.append(ch);
2715       }
2716     }
2717   return result;
2718   }
2719 
2720 
2721 // Return substitution string
substitute(const FXString & string,FXint * beg,FXint * end,const FXString & replace,FXint npar)2722 FXString FXRex::substitute(const FXString& string,FXint* beg,FXint* end,const FXString& replace,FXint npar){
2723   return substitute(string.text(),string.length(),beg,end,replace,npar);
2724   }
2725 
2726 
2727 // Equality
operator ==(const FXRex & rex) const2728 bool FXRex::operator==(const FXRex& rex) const {
2729   return code==rex.code || (code[0]==rex.code[0] && memcmp(code,rex.code,sizeof(FXint)*code[0])==0);
2730   }
2731 
2732 
2733 // Inequality
operator !=(const FXRex & rex) const2734 bool FXRex::operator!=(const FXRex& rex) const {
2735   return !operator==(rex);
2736   }
2737 
2738 
2739 // Save
operator <<(FXStream & store,const FXRex & s)2740 FXStream& operator<<(FXStream& store,const FXRex& s){
2741   FXint size=s.code[0];
2742   store << size;
2743   store.save(s.code+1,size-1);
2744   return store;
2745   }
2746 
2747 
2748 // Load
operator >>(FXStream & store,FXRex & s)2749 FXStream& operator>>(FXStream& store,FXRex& s){
2750   FXint size;
2751   store >> size;
2752   FXMALLOC(&s.code,FXint,size);
2753   store.load(s.code+1,size-1);
2754   return store;
2755   }
2756 
2757 
2758 // Clean up
~FXRex()2759 FXRex::~FXRex(){
2760   if(code!=fallback) FXFREE(&code);
2761   }
2762 
2763 }
2764