1 /****************************************************************************
2 ** $Id: qt/qregexp.cpp   3.3.8   edited Jan 11 14:38 $
3 **
4 ** Implementation of QRegExp class
5 **
6 ** Created : 950126
7 **
8 ** Copyright (C) 1992-2007 Trolltech ASA.  All rights reserved.
9 **
10 ** This file is part of the tools module of the Qt GUI Toolkit.
11 **
12 ** This file may be distributed under the terms of the Q Public License
13 ** as defined by Trolltech ASA of Norway and appearing in the file
14 ** LICENSE.QPL included in the packaging of this file.
15 **
16 ** This file may be distributed and/or modified under the terms of the
17 ** GNU General Public License version 2 as published by the Free Software
18 ** Foundation and appearing in the file LICENSE.GPL included in the
19 ** packaging of this file.
20 **
21 ** Licensees holding valid Qt Enterprise Edition or Qt Professional Edition
22 ** licenses may use this file in accordance with the Qt Commercial License
23 ** Agreement provided with the Software.
24 **
25 ** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
26 ** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
27 **
28 ** See http://www.trolltech.com/pricing.html or email sales@trolltech.com for
29 **   information about Qt Commercial License Agreements.
30 ** See http://www.trolltech.com/qpl/ for QPL licensing information.
31 ** See http://www.trolltech.com/gpl/ for GPL licensing information.
32 **
33 ** Contact info@trolltech.com if any conditions of this licensing are
34 ** not clear to you.
35 **
36 **********************************************************************/
37 
38 #include "qregexp.h"
39 
40 #ifndef QT_NO_REGEXP
41 
42 #include "qmemarray.h"
43 #include "qbitarray.h"
44 #include "qcache.h"
45 #include "qcleanuphandler.h"
46 #include "qintdict.h"
47 #include "qmap.h"
48 #include "qptrvector.h"
49 #include "qstring.h"
50 #include "qtl.h"
51 
52 #ifdef QT_THREAD_SUPPORT
53 #include "qthreadstorage.h"
54 #include <private/qthreadinstance_p.h>
55 #endif // QT_THREAD_SUPPORT
56 
57 #undef QT_TRANSLATE_NOOP
58 #define QT_TRANSLATE_NOOP( context, sourceText ) sourceText
59 
60 #include <limits.h>
61 
62 // error strings for the regexp parser
63 #define RXERR_OK         QT_TRANSLATE_NOOP( "QRegExp", "no error occurred" )
64 #define RXERR_DISABLED   QT_TRANSLATE_NOOP( "QRegExp", "disabled feature used" )
65 #define RXERR_CHARCLASS  QT_TRANSLATE_NOOP( "QRegExp", "bad char class syntax" )
66 #define RXERR_LOOKAHEAD  QT_TRANSLATE_NOOP( "QRegExp", "bad lookahead syntax" )
67 #define RXERR_REPETITION QT_TRANSLATE_NOOP( "QRegExp", "bad repetition syntax" )
68 #define RXERR_OCTAL      QT_TRANSLATE_NOOP( "QRegExp", "invalid octal value" )
69 #define RXERR_LEFTDELIM  QT_TRANSLATE_NOOP( "QRegExp", "missing left delim" )
70 #define RXERR_END        QT_TRANSLATE_NOOP( "QRegExp", "unexpected end" )
71 #define RXERR_LIMIT      QT_TRANSLATE_NOOP( "QRegExp", "met internal limit" )
72 
73 /*
74   WARNING! Be sure to read qregexp.tex before modifying this file.
75 */
76 
77 /*!
78     \class QRegExp qregexp.h
79     \reentrant
80     \brief The QRegExp class provides pattern matching using regular expressions.
81 
82     \ingroup tools
83     \ingroup misc
84     \ingroup shared
85     \mainclass
86     \keyword regular expression
87 
88     Regular expressions, or "regexps", provide a way to find patterns
89     within text. This is useful in many contexts, for example:
90 
91     \table
92     \row \i Validation
93 	 \i A regexp can be used to check whether a piece of text
94 	 meets some criteria, e.g. is an integer or contains no
95 	 whitespace.
96     \row \i Searching
97 	 \i Regexps provide a much more powerful means of searching
98 	 text than simple string matching does. For example we can
99 	 create a regexp which says "find one of the words 'mail',
100 	 'letter' or 'correspondence' but not any of the words
101 	 'email', 'mailman' 'mailer', 'letterbox' etc."
102     \row \i Search and Replace
103 	 \i A regexp can be used to replace a pattern with a piece of
104 	 text, for example replace all occurrences of '&' with
105 	 '\&amp;' except where the '&' is already followed by 'amp;'.
106     \row \i String Splitting
107 	 \i A regexp can be used to identify where a string should be
108 	 split into its component fields, e.g. splitting tab-delimited
109 	 strings.
110     \endtable
111 
112     We present a very brief introduction to regexps, a description of
113     Qt's regexp language, some code examples, and finally the function
114     documentation itself. QRegExp is modeled on Perl's regexp
115     language, and also fully supports Unicode. QRegExp can also be
116     used in the weaker 'wildcard' (globbing) mode which works in a
117     similar way to command shells. A good text on regexps is \e
118     {Mastering Regular Expressions: Powerful Techniques for Perl and
119     Other Tools} by Jeffrey E. Friedl, ISBN 1565922573.
120 
121     Experienced regexp users may prefer to skip the introduction and
122     go directly to the relevant information.
123 
124     In case of multi-threaded programming, note that QRegExp depends on
125     QThreadStorage internally. For that reason, QRegExp should only be
126     used with threads started with QThread, i.e. not with threads
127     started with platform-specific APIs.
128 
129     \tableofcontents
130 
131     \section1 Introduction
132 
133     Regexps are built up from expressions, quantifiers, and assertions.
134     The simplest form of expression is simply a character, e.g.
135     <b>x</b> or <b>5</b>. An expression can also be a set of
136     characters. For example, <b>[ABCD]</b>, will match an <b>A</b> or
137     a <b>B</b> or a <b>C</b> or a <b>D</b>. As a shorthand we could
138     write this as <b>[A-D]</b>. If we want to match any of the
139     captital letters in the English alphabet we can write
140     <b>[A-Z]</b>. A quantifier tells the regexp engine how many
141     occurrences of the expression we want, e.g. <b>x{1,1}</b> means
142     match an <b>x</b> which occurs at least once and at most once.
143     We'll look at assertions and more complex expressions later.
144 
145     Note that in general regexps cannot be used to check for balanced
146     brackets or tags. For example if you want to match an opening html
147     \c <b> and its closing \c </b> you can only use a regexp if you
148     know that these tags are not nested; the html fragment, \c{<b>bold
149     <b>bolder</b></b>} will not match as expected. If you know the
150     maximum level of nesting it is possible to create a regexp that
151     will match correctly, but for an unknown level of nesting, regexps
152     will fail.
153 
154     We'll start by writing a regexp to match integers in the range 0
155     to 99. We will require at least one digit so we will start with
156     <b>[0-9]{1,1}</b> which means match a digit exactly once. This
157     regexp alone will match integers in the range 0 to 9. To match one
158     or two digits we can increase the maximum number of occurrences so
159     the regexp becomes <b>[0-9]{1,2}</b> meaning match a digit at
160     least once and at most twice. However, this regexp as it stands
161     will not match correctly. This regexp will match one or two digits
162     \e within a string. To ensure that we match against the whole
163     string we must use the anchor assertions. We need <b>^</b> (caret)
164     which when it is the first character in the regexp means that the
165     regexp must match from the beginning of the string. And we also
166     need <b>$</b> (dollar) which when it is the last character in the
167     regexp means that the regexp must match until the end of the
168     string. So now our regexp is <b>^[0-9]{1,2}$</b>. Note that
169     assertions, such as <b>^</b> and <b>$</b>, do not match any
170     characters.
171 
172     If you've seen regexps elsewhere they may have looked different from
173     the ones above. This is because some sets of characters and some
174     quantifiers are so common that they have special symbols to
175     represent them. <b>[0-9]</b> can be replaced with the symbol
176     <b>\d</b>. The quantifier to match exactly one occurrence,
177     <b>{1,1}</b>, can be replaced with the expression itself. This means
178     that <b>x{1,1}</b> is exactly the same as <b>x</b> alone. So our 0
179     to 99 matcher could be written <b>^\d{1,2}$</b>. Another way of
180     writing it would be <b>^\d\d{0,1}$</b>, i.e. from the start of the
181     string match a digit followed by zero or one digits. In practice
182     most people would write it <b>^\d\d?$</b>. The <b>?</b> is a
183     shorthand for the quantifier <b>{0,1}</b>, i.e. a minimum of no
184     occurrences a maximum of one occurrence. This is used to make an
185     expression optional. The regexp <b>^\d\d?$</b> means "from the
186     beginning of the string match one digit followed by zero or one
187     digits and then the end of the string".
188 
189     Our second example is matching the words 'mail', 'letter' or
190     'correspondence' but without matching 'email', 'mailman',
191     'mailer', 'letterbox' etc. We'll start by just matching 'mail'. In
192     full the regexp is, <b>m{1,1}a{1,1}i{1,1}l{1,1}</b>, but since
193     each expression itself is automatically quantified by <b>{1,1}</b>
194     we can simply write this as <b>mail</b>; an 'm' followed by an 'a'
195     followed by an 'i' followed by an 'l'. The symbol '|' (bar) is
196     used for \e alternation, so our regexp now becomes
197     <b>mail|letter|correspondence</b> which means match 'mail' \e or
198     'letter' \e or 'correspondence'. Whilst this regexp will find the
199     words we want it will also find words we don't want such as
200     'email'. We will start by putting our regexp in parentheses,
201     <b>(mail|letter|correspondence)</b>. Parentheses have two effects,
202     firstly they group expressions together and secondly they identify
203     parts of the regexp that we wish to \link #capturing-text capture
204     \endlink. Our regexp still matches any of the three words but now
205     they are grouped together as a unit. This is useful for building
206     up more complex regexps. It is also useful because it allows us to
207     examine which of the words actually matched. We need to use
208     another assertion, this time <b>\b</b> "word boundary":
209     <b>\b(mail|letter|correspondence)\b</b>. This regexp means "match
210     a word boundary followed by the expression in parentheses followed
211     by another word boundary". The <b>\b</b> assertion matches at a \e
212     position in the regexp not a \e character in the regexp. A word
213     boundary is any non-word character such as a space a newline or
214     the beginning or end of the string.
215 
216     For our third example we want to replace ampersands with the HTML
217     entity '\&amp;'. The regexp to match is simple: <b>\&</b>, i.e.
218     match one ampersand. Unfortunately this will mess up our text if
219     some of the ampersands have already been turned into HTML
220     entities. So what we really want to say is replace an ampersand
221     providing it is not followed by 'amp;'. For this we need the
222     negative lookahead assertion and our regexp becomes:
223     <b>\&(?!amp;)</b>. The negative lookahead assertion is introduced
224     with '(?!' and finishes at the ')'. It means that the text it
225     contains, 'amp;' in our example, must \e not follow the expression
226     that preceeds it.
227 
228     Regexps provide a rich language that can be used in a variety of
229     ways. For example suppose we want to count all the occurrences of
230     'Eric' and 'Eirik' in a string. Two valid regexps to match these
231     are <b>\\b(Eric|Eirik)\\b</b> and <b>\\bEi?ri[ck]\\b</b>. We need
232     the word boundary '\b' so we don't get 'Ericsson' etc. The second
233     regexp actually matches more than we want, 'Eric', 'Erik', 'Eiric'
234     and 'Eirik'.
235 
236     We will implement some the examples above in the
237     \link #code-examples code examples \endlink section.
238 
239     \target characters-and-abbreviations-for-sets-of-characters
240     \section1 Characters and Abbreviations for Sets of Characters
241 
242     \table
243     \header \i Element \i Meaning
244     \row \i <b>c</b>
245 	 \i Any character represents itself unless it has a special
246 	 regexp meaning. Thus <b>c</b> matches the character \e c.
247     \row \i <b>\\c</b>
248 	 \i A character that follows a backslash matches the character
249 	 itself except where mentioned below. For example if you
250 	 wished to match a literal caret at the beginning of a string
251 	 you would write <b>\^</b>.
252     \row \i <b>\\a</b>
253 	 \i This matches the ASCII bell character (BEL, 0x07).
254     \row \i <b>\\f</b>
255 	 \i This matches the ASCII form feed character (FF, 0x0C).
256     \row \i <b>\\n</b>
257 	 \i This matches the ASCII line feed character (LF, 0x0A, Unix newline).
258     \row \i <b>\\r</b>
259 	 \i This matches the ASCII carriage return character (CR, 0x0D).
260     \row \i <b>\\t</b>
261 	 \i This matches the ASCII horizontal tab character (HT, 0x09).
262     \row \i <b>\\v</b>
263 	 \i This matches the ASCII vertical tab character (VT, 0x0B).
264     \row \i <b>\\xhhhh</b>
265 	 \i This matches the Unicode character corresponding to the
266 	 hexadecimal number hhhh (between 0x0000 and 0xFFFF). \0ooo
267 	 (i.e., \zero ooo) matches the ASCII/Latin-1 character
268 	 corresponding to the octal number ooo (between 0 and 0377).
269     \row \i <b>. (dot)</b>
270 	 \i This matches any character (including newline).
271     \row \i <b>\\d</b>
272 	 \i This matches a digit (QChar::isDigit()).
273     \row \i <b>\\D</b>
274 	 \i This matches a non-digit.
275     \row \i <b>\\s</b>
276 	 \i This matches a whitespace (QChar::isSpace()).
277     \row \i <b>\\S</b>
278 	 \i This matches a non-whitespace.
279     \row \i <b>\\w</b>
280 	 \i This matches a word character (QChar::isLetterOrNumber() or '_').
281     \row \i <b>\\W</b>
282 	 \i This matches a non-word character.
283     \row \i <b>\\n</b>
284 	 \i The n-th \link #capturing-text backreference \endlink,
285 	 e.g. \1, \2, etc.
286     \endtable
287 
288     \e {Note that the C++ compiler transforms backslashes in strings
289     so to include a <b>\\</b> in a regexp you will need to enter it
290     twice, i.e. <b>\\\\</b>.}
291 
292     \target sets-of-characters
293     \section1 Sets of Characters
294 
295     Square brackets are used to match any character in the set of
296     characters contained within the square brackets. All the character
297     set abbreviations described above can be used within square
298     brackets. Apart from the character set abbreviations and the
299     following two exceptions no characters have special meanings in
300     square brackets.
301 
302     \table
303     \row \i <b>^</b>
304 	 \i The caret negates the character set if it occurs as the
305 	 first character, i.e. immediately after the opening square
306 	 bracket. For example, <b>[abc]</b> matches 'a' or 'b' or 'c',
307 	 but <b>[^abc]</b> matches anything \e except 'a' or 'b' or
308 	 'c'.
309     \row \i <b>-</b>
310 	 \i The dash is used to indicate a range of characters, for
311 	 example <b>[W-Z]</b> matches 'W' or 'X' or 'Y' or 'Z'.
312     \endtable
313 
314     Using the predefined character set abbreviations is more portable
315     than using character ranges across platforms and languages. For
316     example, <b>[0-9]</b> matches a digit in Western alphabets but
317     <b>\d</b> matches a digit in \e any alphabet.
318 
319     Note that in most regexp literature sets of characters are called
320     "character classes".
321 
322     \target quantifiers
323     \section1 Quantifiers
324 
325     By default an expression is automatically quantified by
326     <b>{1,1}</b>, i.e. it should occur exactly once. In the following
327     list <b>\e {E}</b> stands for any expression. An expression is a
328     character or an abbreviation for a set of characters or a set of
329     characters in square brackets or any parenthesised expression.
330 
331     \table
332     \row \i <b>\e {E}?</b>
333 	 \i Matches zero or one occurrence of \e E. This quantifier
334 	 means "the previous expression is optional" since it will
335 	 match whether or not the expression occurs in the string. It
336 	 is the same as <b>\e {E}{0,1}</b>. For example <b>dents?</b>
337 	 will match 'dent' and 'dents'.
338 
339     \row \i <b>\e {E}+</b>
340 	 \i Matches one or more occurrences of \e E. This is the same
341 	 as <b>\e {E}{1,MAXINT}</b>. For example, <b>0+</b> will match
342 	 '0', '00', '000', etc.
343 
344     \row \i <b>\e {E}*</b>
345 	 \i Matches zero or more occurrences of \e E. This is the same
346 	 as <b>\e {E}{0,MAXINT}</b>. The <b>*</b> quantifier is often
347 	 used by a mistake. Since it matches \e zero or more
348 	 occurrences it will match no occurrences at all. For example
349 	 if we want to match strings that end in whitespace and use
350 	 the regexp <b>\s*$</b> we would get a match on every string.
351 	 This is because we have said find zero or more whitespace
352 	 followed by the end of string, so even strings that don't end
353 	 in whitespace will match. The regexp we want in this case is
354 	 <b>\s+$</b> to match strings that have at least one
355 	 whitespace at the end.
356 
357     \row \i <b>\e {E}{n}</b>
358 	 \i Matches exactly \e n occurrences of the expression. This
359 	 is the same as repeating the expression \e n times. For
360 	 example, <b>x{5}</b> is the same as <b>xxxxx</b>. It is also
361 	 the same as <b>\e {E}{n,n}</b>, e.g. <b>x{5,5}</b>.
362 
363     \row \i <b>\e {E}{n,}</b>
364 	 \i Matches at least \e n occurrences of the expression. This
365 	 is the same as <b>\e {E}{n,MAXINT}</b>.
366 
367     \row \i <b>\e {E}{,m}</b>
368 	 \i Matches at most \e m occurrences of the expression. This
369 	 is the same as <b>\e {E}{0,m}</b>.
370 
371     \row \i <b>\e {E}{n,m}</b>
372 	 \i Matches at least \e n occurrences of the expression and at
373 	 most \e m occurrences of the expression.
374     \endtable
375 
376     (MAXINT is implementation dependent but will not be smaller than
377     1024.)
378 
379     If we wish to apply a quantifier to more than just the preceding
380     character we can use parentheses to group characters together in
381     an expression. For example, <b>tag+</b> matches a 't' followed by
382     an 'a' followed by at least one 'g', whereas <b>(tag)+</b> matches
383     at least one occurrence of 'tag'.
384 
385     Note that quantifiers are "greedy". They will match as much text
386     as they can. For example, <b>0+</b> will match as many zeros as it
387     can from the first zero it finds, e.g. '2.<u>000</u>5'.
388     Quantifiers can be made non-greedy, see setMinimal().
389 
390     \target capturing-text
391     \section1 Capturing Text
392 
393     Parentheses allow us to group elements together so that we can
394     quantify and capture them. For example if we have the expression
395     <b>mail|letter|correspondence</b> that matches a string we know
396     that \e one of the words matched but not which one. Using
397     parentheses allows us to "capture" whatever is matched within
398     their bounds, so if we used <b>(mail|letter|correspondence)</b>
399     and matched this regexp against the string "I sent you some email"
400     we can use the cap() or capturedTexts() functions to extract the
401     matched characters, in this case 'mail'.
402 
403     We can use captured text within the regexp itself. To refer to the
404     captured text we use \e backreferences which are indexed from 1,
405     the same as for cap(). For example we could search for duplicate
406     words in a string using <b>\b(\w+)\W+\1\b</b> which means match a
407     word boundary followed by one or more word characters followed by
408     one or more non-word characters followed by the same text as the
409     first parenthesised expression followed by a word boundary.
410 
411     If we want to use parentheses purely for grouping and not for
412     capturing we can use the non-capturing syntax, e.g.
413     <b>(?:green|blue)</b>. Non-capturing parentheses begin '(?:' and
414     end ')'. In this example we match either 'green' or 'blue' but we
415     do not capture the match so we only know whether or not we matched
416     but not which color we actually found. Using non-capturing
417     parentheses is more efficient than using capturing parentheses
418     since the regexp engine has to do less book-keeping.
419 
420     Both capturing and non-capturing parentheses may be nested.
421 
422     \target assertions
423     \section1 Assertions
424 
425     Assertions make some statement about the text at the point where
426     they occur in the regexp but they do not match any characters. In
427     the following list <b>\e {E}</b> stands for any expression.
428 
429     \table
430     \row \i <b>^</b>
431 	 \i The caret signifies the beginning of the string. If you
432 	 wish to match a literal \c{^} you must escape it by
433 	 writing <b>&#92;^</b>. For example, <b>^#include</b> will only
434 	 match strings which \e begin with the characters '#include'.
435 	 (When the caret is the first character of a character set it
436 	 has a special meaning, see \link #sets-of-characters Sets of
437 	 Characters \endlink.)
438 
439     \row \i <b>$</b>
440 	 \i The dollar signifies the end of the string. For example
441 	 <b>\d\s*$</b> will match strings which end with a digit
442 	 optionally followed by whitespace. If you wish to match a
443 	 literal \c{$} you must escape it by writing
444 	 <b>&#92;$</b>.
445 
446     \row \i <b>\\b</b>
447 	 \i A word boundary. For example the regexp
448 	 <b>\\bOK\\b</b> means match immediately after a word
449 	 boundary (e.g. start of string or whitespace) the letter 'O'
450 	 then the letter 'K' immediately before another word boundary
451 	 (e.g. end of string or whitespace). But note that the
452 	 assertion does not actually match any whitespace so if we
453 	 write <b>(\\bOK\\b)</b> and we have a match it will only
454 	 contain 'OK' even if the string is "Its <u>OK</u> now".
455 
456     \row \i <b>\\B</b>
457 	 \i A non-word boundary. This assertion is true wherever
458 	 <b>\\b</b> is false. For example if we searched for
459 	 <b>\\Bon\\B</b> in "Left on" the match would fail (space
460 	 and end of string aren't non-word boundaries), but it would
461 	 match in "t<u>on</u>ne".
462 
463     \row \i <b>(?=\e E)</b>
464 	 \i Positive lookahead. This assertion is true if the
465 	 expression matches at this point in the regexp. For example,
466 	 <b>const(?=\\s+char)</b> matches 'const' whenever it is
467 	 followed by 'char', as in 'static <u>const</u> char *'.
468 	 (Compare with <b>const\\s+char</b>, which matches 'static
469 	 <u>const char</u> *'.)
470 
471     \row \i <b>(?!\e E)</b>
472 	 \i Negative lookahead. This assertion is true if the
473 	 expression does not match at this point in the regexp. For
474 	 example, <b>const(?!\\s+char)</b> matches 'const' \e except
475 	 when it is followed by 'char'.
476     \endtable
477 
478     \target wildcard-matching
479     \section1 Wildcard Matching (globbing)
480 
481     Most command shells such as \e bash or \e cmd.exe support "file
482     globbing", the ability to identify a group of files by using
483     wildcards. The setWildcard() function is used to switch between
484     regexp and wildcard mode. Wildcard matching is much simpler than
485     full regexps and has only four features:
486 
487     \table
488     \row \i <b>c</b>
489 	 \i Any character represents itself apart from those mentioned
490 	 below. Thus <b>c</b> matches the character \e c.
491     \row \i <b>?</b>
492 	 \i This matches any single character. It is the same as
493 	 <b>.</b> in full regexps.
494     \row \i <b>*</b>
495 	 \i This matches zero or more of any characters. It is the
496 	 same as <b>.*</b> in full regexps.
497     \row \i <b>[...]</b>
498 	 \i Sets of characters can be represented in square brackets,
499 	 similar to full regexps. Within the character class, like
500 	 outside, backslash has no special meaning.
501     \endtable
502 
503     For example if we are in wildcard mode and have strings which
504     contain filenames we could identify HTML files with <b>*.html</b>.
505     This will match zero or more characters followed by a dot followed
506     by 'h', 't', 'm' and 'l'.
507 
508     \target perl-users
509     \section1 Notes for Perl Users
510 
511     Most of the character class abbreviations supported by Perl are
512     supported by QRegExp, see \link
513     #characters-and-abbreviations-for-sets-of-characters characters
514     and abbreviations for sets of characters \endlink.
515 
516     In QRegExp, apart from within character classes, \c{^} always
517     signifies the start of the string, so carets must always be
518     escaped unless used for that purpose. In Perl the meaning of caret
519     varies automagically depending on where it occurs so escaping it
520     is rarely necessary. The same applies to \c{$} which in
521     QRegExp always signifies the end of the string.
522 
523     QRegExp's quantifiers are the same as Perl's greedy quantifiers.
524     Non-greedy matching cannot be applied to individual quantifiers,
525     but can be applied to all the quantifiers in the pattern. For
526     example, to match the Perl regexp <b>ro+?m</b> requires:
527     \code
528     QRegExp rx( "ro+m" );
529     rx.setMinimal( TRUE );
530     \endcode
531 
532     The equivalent of Perl's \c{/i} option is
533     setCaseSensitive(FALSE).
534 
535     Perl's \c{/g} option can be emulated using a \link
536     #cap_in_a_loop loop \endlink.
537 
538     In QRegExp <b>.</b> matches any character, therefore all QRegExp
539     regexps have the equivalent of Perl's \c{/s} option. QRegExp
540     does not have an equivalent to Perl's \c{/m} option, but this
541     can be emulated in various ways for example by splitting the input
542     into lines or by looping with a regexp that searches for newlines.
543 
544     Because QRegExp is string oriented there are no \A, \Z or \z
545     assertions. The \G assertion is not supported but can be emulated
546     in a loop.
547 
548     Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp
549     equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,
550     ... correspond to cap(1) or capturedTexts()[1], cap(2) or
551     capturedTexts()[2], etc.
552 
553     To substitute a pattern use QString::replace().
554 
555     Perl's extended \c{/x} syntax is not supported, nor are
556     directives, e.g. (?i), or regexp comments, e.g. (?#comment). On
557     the other hand, C++'s rules for literal strings can be used to
558     achieve the same:
559     \code
560     QRegExp mark( "\\b" // word boundary
561 		  "[Mm]ark" // the word we want to match
562 		);
563     \endcode
564 
565     Both zero-width positive and zero-width negative lookahead
566     assertions (?=pattern) and (?!pattern) are supported with the same
567     syntax as Perl. Perl's lookbehind assertions, "independent"
568     subexpressions and conditional expressions are not supported.
569 
570     Non-capturing parentheses are also supported, with the same
571     (?:pattern) syntax.
572 
573     See QStringList::split() and QStringList::join() for equivalents
574     to Perl's split and join functions.
575 
576     Note: because C++ transforms \\'s they must be written \e twice in
577     code, e.g. <b>\\b</b> must be written <b>\\\\b</b>.
578 
579     \target code-examples
580     \section1 Code Examples
581 
582     \code
583     QRegExp rx( "^\\d\\d?$" );  // match integers 0 to 99
584     rx.search( "123" );         // returns -1 (no match)
585     rx.search( "-6" );          // returns -1 (no match)
586     rx.search( "6" );           // returns 0 (matched as position 0)
587     \endcode
588 
589     The third string matches '<u>6</u>'. This is a simple validation
590     regexp for integers in the range 0 to 99.
591 
592     \code
593     QRegExp rx( "^\\S+$" );     // match strings without whitespace
594     rx.search( "Hello world" ); // returns -1 (no match)
595     rx.search( "This_is-OK" );  // returns 0 (matched at position 0)
596     \endcode
597 
598     The second string matches '<u>This_is-OK</u>'. We've used the
599     character set abbreviation '\S' (non-whitespace) and the anchors
600     to match strings which contain no whitespace.
601 
602     In the following example we match strings containing 'mail' or
603     'letter' or 'correspondence' but only match whole words i.e. not
604     'email'
605 
606     \code
607     QRegExp rx( "\\b(mail|letter|correspondence)\\b" );
608     rx.search( "I sent you an email" );     // returns -1 (no match)
609     rx.search( "Please write the letter" ); // returns 17
610     \endcode
611 
612     The second string matches "Please write the <u>letter</u>". The
613     word 'letter' is also captured (because of the parentheses). We
614     can see what text we've captured like this:
615 
616     \code
617     QString captured = rx.cap( 1 ); // captured == "letter"
618     \endcode
619 
620     This will capture the text from the first set of capturing
621     parentheses (counting capturing left parentheses from left to
622     right). The parentheses are counted from 1 since cap( 0 ) is the
623     whole matched regexp (equivalent to '&' in most regexp engines).
624 
625     \code
626     QRegExp rx( "&(?!amp;)" );      // match ampersands but not &amp;
627     QString line1 = "This & that";
628     line1.replace( rx, "&amp;" );
629     // line1 == "This &amp; that"
630     QString line2 = "His &amp; hers & theirs";
631     line2.replace( rx, "&amp;" );
632     // line2 == "His &amp; hers &amp; theirs"
633     \endcode
634 
635     Here we've passed the QRegExp to QString's replace() function to
636     replace the matched text with new text.
637 
638     \code
639     QString str = "One Eric another Eirik, and an Ericsson."
640 		    " How many Eiriks, Eric?";
641     QRegExp rx( "\\b(Eric|Eirik)\\b" ); // match Eric or Eirik
642     int pos = 0;    // where we are in the string
643     int count = 0;  // how many Eric and Eirik's we've counted
644     while ( pos >= 0 ) {
645 	pos = rx.search( str, pos );
646 	if ( pos >= 0 ) {
647 	    pos++;      // move along in str
648 	    count++;    // count our Eric or Eirik
649 	}
650     }
651     \endcode
652 
653     We've used the search() function to repeatedly match the regexp in
654     the string. Note that instead of moving forward by one character
655     at a time \c pos++ we could have written \c {pos +=
656     rx.matchedLength()} to skip over the already matched string. The
657     count will equal 3, matching 'One <u>Eric</u> another
658     <u>Eirik</u>, and an Ericsson. How many Eiriks, <u>Eric</u>?'; it
659     doesn't match 'Ericsson' or 'Eiriks' because they are not bounded
660     by non-word boundaries.
661 
662     One common use of regexps is to split lines of delimited data into
663     their component fields.
664 
665     \code
666     str = "Trolltech AS\twww.trolltech.com\tNorway";
667     QString company, web, country;
668     rx.setPattern( "^([^\t]+)\t([^\t]+)\t([^\t]+)$" );
669     if ( rx.search( str ) != -1 ) {
670 	company = rx.cap( 1 );
671 	web = rx.cap( 2 );
672 	country = rx.cap( 3 );
673     }
674     \endcode
675 
676     In this example our input lines have the format company name, web
677     address and country. Unfortunately the regexp is rather long and
678     not very versatile -- the code will break if we add any more
679     fields. A simpler and better solution is to look for the
680     separator, '\t' in this case, and take the surrounding text. The
681     QStringList split() function can take a separator string or regexp
682     as an argument and split a string accordingly.
683 
684     \code
685     QStringList field = QStringList::split( "\t", str );
686     \endcode
687 
688     Here field[0] is the company, field[1] the web address and so on.
689 
690     To imitate the matching of a shell we can use wildcard mode.
691 
692     \code
693     QRegExp rx( "*.html" );         // invalid regexp: * doesn't quantify anything
694     rx.setWildcard( TRUE );         // now it's a valid wildcard regexp
695     rx.exactMatch( "index.html" );  // returns TRUE
696     rx.exactMatch( "default.htm" ); // returns FALSE
697     rx.exactMatch( "readme.txt" );  // returns FALSE
698     \endcode
699 
700     Wildcard matching can be convenient because of its simplicity, but
701     any wildcard regexp can be defined using full regexps, e.g.
702     <b>.*\.html$</b>. Notice that we can't match both \c .html and \c
703     .htm files with a wildcard unless we use <b>*.htm*</b> which will
704     also match 'test.html.bak'. A full regexp gives us the precision
705     we need, <b>.*\\.html?$</b>.
706 
707     QRegExp can match case insensitively using setCaseSensitive(), and
708     can use non-greedy matching, see setMinimal(). By default QRegExp
709     uses full regexps but this can be changed with setWildcard().
710     Searching can be forward with search() or backward with
711     searchRev(). Captured text can be accessed using capturedTexts()
712     which returns a string list of all captured strings, or using
713     cap() which returns the captured string for the given index. The
714     pos() function takes a match index and returns the position in the
715     string where the match was made (or -1 if there was no match).
716 
717     \sa QRegExpValidator QString QStringList
718 
719     \target member-function-documentation
720 */
721 
722 const int NumBadChars = 64;
723 #define BadChar( ch ) ( (ch).unicode() % NumBadChars )
724 
725 const int NoOccurrence = INT_MAX;
726 const int EmptyCapture = INT_MAX;
727 const int InftyLen = INT_MAX;
728 const int InftyRep = 1025;
729 const int EOS = -1;
730 
isWord(QChar ch)731 static bool isWord( QChar ch )
732 {
733     return ch.isLetterOrNumber() || ch == QChar( '_' );
734 }
735 
736 /*
737   Merges two QMemArrays of ints and puts the result into the first
738   one.
739 */
mergeInto(QMemArray<int> * a,const QMemArray<int> & b)740 static void mergeInto( QMemArray<int> *a, const QMemArray<int>& b )
741 {
742     int asize = a->size();
743     int bsize = b.size();
744     if ( asize == 0 ) {
745 	*a = b.copy();
746 #ifndef QT_NO_REGEXP_OPTIM
747     } else if ( bsize == 1 && (*a)[asize - 1] < b[0] ) {
748 	a->resize( asize + 1 );
749 	(*a)[asize] = b[0];
750 #endif
751     } else if ( bsize >= 1 ) {
752 	int csize = asize + bsize;
753 	QMemArray<int> c( csize );
754 	int i = 0, j = 0, k = 0;
755 	while ( i < asize ) {
756 	    if ( j < bsize ) {
757 		if ( (*a)[i] == b[j] ) {
758 		    i++;
759 		    csize--;
760 		} else if ( (*a)[i] < b[j] ) {
761 		    c[k++] = (*a)[i++];
762 		} else {
763 		    c[k++] = b[j++];
764 		}
765 	    } else {
766 		memcpy( c.data() + k, (*a).data() + i,
767 			(asize - i) * sizeof(int) );
768 		break;
769 	    }
770 	}
771 	c.resize( csize );
772 	if ( j < bsize )
773 	    memcpy( c.data() + k, b.data() + j, (bsize - j) * sizeof(int) );
774 	*a = c;
775     }
776 }
777 
778 /*
779   Merges two disjoint QMaps of (int, int) pairs and puts the result
780   into the first one.
781 */
mergeInto(QMap<int,int> * a,const QMap<int,int> & b)782 static void mergeInto( QMap<int, int> *a, const QMap<int, int>& b )
783 {
784     QMap<int, int>::ConstIterator it;
785     for ( it = b.begin(); it != b.end(); ++it )
786 	a->insert( it.key(), *it );
787 }
788 
789 /*
790   Returns the value associated to key k in QMap m of (int, int)
791   pairs, or 0 if no such value is explicitly present.
792 */
at(const QMap<int,int> & m,int k)793 static int at( const QMap<int, int>& m, int k )
794 {
795     QMap<int, int>::ConstIterator it = m.find( k );
796     if ( it == m.end() )
797 	return 0;
798     else
799 	return *it;
800 }
801 
802 #ifndef QT_NO_REGEXP_WILDCARD
803 /*
804   Translates a wildcard pattern to an equivalent regular expression
805   pattern (e.g., *.cpp to .*\.cpp).
806 */
wc2rx(const QString & wc_str)807 static QString wc2rx( const QString& wc_str )
808 {
809     int wclen = wc_str.length();
810     QString rx = QString::fromLatin1( "" );
811     int i = 0;
812     const QChar *wc = wc_str.unicode();
813     while ( i < wclen ) {
814 	QChar c = wc[i++];
815 	switch ( c.unicode() ) {
816 	case '*':
817 	    rx += QString::fromLatin1( ".*" );
818 	    break;
819 	case '?':
820 	    rx += QChar( '.' );
821 	    break;
822 	case '$':
823 	case '(':
824 	case ')':
825 	case '+':
826 	case '.':
827 	case '\\':
828 	case '^':
829 	case '{':
830 	case '|':
831 	case '}':
832 	    rx += QChar( '\\' );
833 	    rx += c;
834 	    break;
835 	case '[':
836 	    rx += c;
837 	    if ( wc[i] == QChar('^') )
838 		rx += wc[i++];
839 	    if ( i < wclen ) {
840 		if ( rx[i] == ']' )
841 		    rx += wc[i++];
842 		while ( i < wclen && wc[i] != QChar(']') ) {
843 		    if ( wc[i] == '\\' )
844 			rx += QChar( '\\' );
845 		    rx += wc[i++];
846 		}
847 	    }
848 	    break;
849 	default:
850 	    rx += c;
851 	}
852     }
853     return rx;
854 }
855 #endif
856 
857 /*
858   The class QRegExpEngine encapsulates a modified nondeterministic
859   finite automaton (NFA).
860 */
861 class QRegExpEngine : public QShared
862 {
863 public:
864 #ifndef QT_NO_REGEXP_CCLASS
865     /*
866       The class CharClass represents a set of characters, such as can
867       be found in regular expressions (e.g., [a-z] denotes the set
868       {a, b, ..., z}).
869     */
870     class CharClass
871     {
872     public:
873 	CharClass();
CharClass(const CharClass & cc)874 	CharClass( const CharClass& cc ) { operator=( cc ); }
875 
876 	CharClass& operator=( const CharClass& cc );
877 
878 	void clear();
negative() const879 	bool negative() const { return n; }
880 	void setNegative( bool negative );
881 	void addCategories( int cats );
882 	void addRange( ushort from, ushort to );
addSingleton(ushort ch)883 	void addSingleton( ushort ch ) { addRange( ch, ch ); }
884 
885 	bool in( QChar ch ) const;
886 #ifndef QT_NO_REGEXP_OPTIM
firstOccurrence() const887 	const QMemArray<int>& firstOccurrence() const { return occ1; }
888 #endif
889 
890 #if defined(QT_DEBUG)
891 	void dump() const;
892 #endif
893 
894     private:
895 	/*
896 	  The struct Range represents a range of characters (e.g.,
897 	  [0-9] denotes range 48 to 57).
898 	*/
899 	struct Range
900 	{
901 	    ushort from; // 48
902 	    ushort to; // 57
903 	};
904 
905 	int c; // character classes
906 	QMemArray<Range> r; // character ranges
907 	bool n; // negative?
908 #ifndef QT_NO_REGEXP_OPTIM
909 	QMemArray<int> occ1; // first-occurrence array
910 #endif
911     };
912 #else
913     struct CharClass
914     {
915 	int dummy;
916 
917 #ifndef QT_NO_REGEXP_OPTIM
918 	CharClass() { occ1.fill( 0, NumBadChars ); }
919 
920 	const QMemArray<int>& firstOccurrence() const { return occ1; }
921 	QMemArray<int> occ1;
922 #endif
923     };
924 #endif
925 
QRegExpEngine(bool caseSensitive)926     QRegExpEngine( bool caseSensitive ) { setup( caseSensitive ); }
927     QRegExpEngine( const QString& rx, bool caseSensitive );
928 #ifndef QT_NO_REGEXP_OPTIM
929     ~QRegExpEngine();
930 #endif
931 
isValid() const932     bool isValid() const { return valid; }
caseSensitive() const933     bool caseSensitive() const { return cs; }
errorString() const934     const QString& errorString() const { return yyError; }
numCaptures() const935     int numCaptures() const { return officialncap; }
936     void match( const QString& str, int pos, bool minimal, bool oneTest,
937 		int caretIndex, QMemArray<int>& captured );
partialMatchLength() const938     int partialMatchLength() const { return mmOneTestMatchedLen; }
939 
940     int createState( QChar ch );
941     int createState( const CharClass& cc );
942 #ifndef QT_NO_REGEXP_BACKREF
943     int createState( int bref );
944 #endif
945 
946     void addCatTransitions( const QMemArray<int>& from,
947 			    const QMemArray<int>& to );
948 #ifndef QT_NO_REGEXP_CAPTURE
949     void addPlusTransitions( const QMemArray<int>& from,
950 			     const QMemArray<int>& to, int atom );
951 #endif
952 
953 #ifndef QT_NO_REGEXP_ANCHOR_ALT
954     int anchorAlternation( int a, int b );
955     int anchorConcatenation( int a, int b );
956 #else
anchorAlternation(int a,int b)957     int anchorAlternation( int a, int b ) { return a & b; }
anchorConcatenation(int a,int b)958     int anchorConcatenation( int a, int b ) { return a | b; }
959 #endif
960     void addAnchors( int from, int to, int a );
961 
962 #ifndef QT_NO_REGEXP_OPTIM
963     void heuristicallyChooseHeuristic();
964 #endif
965 
966 #if defined(QT_DEBUG)
967     void dump() const;
968 #endif
969 
970 private:
971     enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };
972 
973     /*
974       The struct State represents one state in a modified NFA. The
975       input characters matched are stored in the state instead of on
976       the transitions, something possible for an automaton
977       constructed from a regular expression.
978     */
979     struct State
980     {
981 #ifndef QT_NO_REGEXP_CAPTURE
982 	int atom; // which atom does this state belong to?
983 #endif
984 	int match; // what does it match? (see CharClassBit and BackRefBit)
985 	QMemArray<int> outs; // out-transitions
986 	QMap<int, int> *reenter; // atoms reentered when transiting out
987 	QMap<int, int> *anchors; // anchors met when transiting out
988 
989 #ifndef QT_NO_REGEXP_CAPTURE
StateQRegExpEngine::State990 	State( int a, int m )
991 	    : atom( a ), match( m ), reenter( 0 ), anchors( 0 ) { }
992 #else
StateQRegExpEngine::State993 	State( int m )
994 	    : match( m ), reenter( 0 ), anchors( 0 ) { }
995 #endif
~StateQRegExpEngine::State996 	~State() { delete reenter; delete anchors; }
997     };
998 
999 #ifndef QT_NO_REGEXP_LOOKAHEAD
1000     /*
1001       The struct Lookahead represents a lookahead a la Perl (e.g.,
1002       (?=foo) and (?!bar)).
1003     */
1004     struct Lookahead
1005     {
1006 	QRegExpEngine *eng; // NFA representing the embedded regular expression
1007 	bool neg; // negative lookahead?
1008 
LookaheadQRegExpEngine::Lookahead1009 	Lookahead( QRegExpEngine *eng0, bool neg0 )
1010 	    : eng( eng0 ), neg( neg0 ) { }
~LookaheadQRegExpEngine::Lookahead1011 	~Lookahead() { delete eng; }
1012     };
1013 #endif
1014 
1015 #ifndef QT_NO_REGEXP_CAPTURE
1016     /*
1017       The struct Atom represents one node in the hierarchy of regular
1018       expression atoms.
1019     */
1020     struct Atom
1021     {
1022 	int parent; // index of parent in array of atoms
1023 	int capture; // index of capture, from 1 to ncap
1024     };
1025 #endif
1026 
1027 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1028     /*
1029       The struct AnchorAlternation represents a pair of anchors with
1030       OR semantics.
1031     */
1032     struct AnchorAlternation
1033     {
1034 	int a; // this anchor...
1035 	int b; // ...or this one
1036     };
1037 #endif
1038 
1039     enum { InitialState = 0, FinalState = 1 };
1040     void setup( bool caseSensitive );
1041     int setupState( int match );
1042 
1043     /*
1044       Let's hope that 13 lookaheads and 14 back-references are
1045       enough.
1046      */
1047     enum { MaxLookaheads = 13, MaxBackRefs = 14 };
1048     enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002,
1049 	   Anchor_Word = 0x00000004, Anchor_NonWord = 0x00000008,
1050 	   Anchor_FirstLookahead = 0x00000010,
1051 	   Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
1052 	   Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,
1053 	   Anchor_Alternation = Anchor_BackRef1Empty << MaxBackRefs,
1054 
1055 	   Anchor_LookaheadMask = ( Anchor_FirstLookahead - 1 ) ^
1056 		   ( (Anchor_FirstLookahead << MaxLookaheads) - 1 ) };
1057 #ifndef QT_NO_REGEXP_CAPTURE
1058     int startAtom( bool capture );
finishAtom(int atom)1059     void finishAtom( int atom ) { cf = f[atom].parent; }
1060 #endif
1061 
1062 #ifndef QT_NO_REGEXP_LOOKAHEAD
1063     int addLookahead( QRegExpEngine *eng, bool negative );
1064 #endif
1065 
1066 #ifndef QT_NO_REGEXP_CAPTURE
1067     bool isBetterCapture( const int *begin1, const int *end1, const int *begin2,
1068 			  const int *end2 );
1069 #endif
1070     bool testAnchor( int i, int a, const int *capBegin );
1071 
1072 #ifndef QT_NO_REGEXP_OPTIM
1073     bool goodStringMatch();
1074     bool badCharMatch();
1075 #else
1076     bool bruteMatch();
1077 #endif
1078     bool matchHere();
1079 
1080     QPtrVector<State> s; // array of states
1081     int ns; // number of states
1082 #ifndef QT_NO_REGEXP_CAPTURE
1083     QMemArray<Atom> f; // atom hierarchy
1084     int nf; // number of atoms
1085     int cf; // current atom
1086 #endif
1087     int officialncap; // number of captures, seen from the outside
1088     int ncap; // number of captures, seen from the inside
1089 #ifndef QT_NO_REGEXP_CCLASS
1090     QPtrVector<CharClass> cl; // array of character classes
1091 #endif
1092 #ifndef QT_NO_REGEXP_LOOKAHEAD
1093     QPtrVector<Lookahead> ahead; // array of lookaheads
1094 #endif
1095 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1096     QMemArray<AnchorAlternation> aa; // array of (a, b) pairs of anchors
1097 #endif
1098 #ifndef QT_NO_REGEXP_OPTIM
1099     bool caretAnchored; // does the regexp start with ^?
1100     bool trivial; // is the good-string all that needs to match?
1101 #endif
1102     bool valid; // is the regular expression valid?
1103     bool cs; // case sensitive?
1104 #ifndef QT_NO_REGEXP_BACKREF
1105     int nbrefs; // number of back-references
1106 #endif
1107 
1108 #ifndef QT_NO_REGEXP_OPTIM
1109     bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch
1110 
1111     int goodEarlyStart; // the index where goodStr can first occur in a match
1112     int goodLateStart; // the index where goodStr can last occur in a match
1113     QString goodStr; // the string that any match has to contain
1114 
1115     int minl; // the minimum length of a match
1116     QMemArray<int> occ1; // first-occurrence array
1117 #endif
1118 
1119     /*
1120       The class Box is an abstraction for a regular expression
1121       fragment. It can also be seen as one node in the syntax tree of
1122       a regular expression with synthetized attributes.
1123 
1124       Its interface is ugly for performance reasons.
1125     */
1126     class Box
1127     {
1128     public:
1129 	Box( QRegExpEngine *engine );
Box(const Box & b)1130 	Box( const Box& b ) { operator=( b ); }
1131 
1132 	Box& operator=( const Box& b );
1133 
clear()1134 	void clear() { operator=( Box(eng) ); }
1135 	void set( QChar ch );
1136 	void set( const CharClass& cc );
1137 #ifndef QT_NO_REGEXP_BACKREF
1138 	void set( int bref );
1139 #endif
1140 
1141 	void cat( const Box& b );
1142 	void orx( const Box& b );
1143 	void plus( int atom );
1144 	void opt();
1145 	void catAnchor( int a );
1146 #ifndef QT_NO_REGEXP_OPTIM
1147 	void setupHeuristics();
1148 #endif
1149 
1150 #if defined(QT_DEBUG)
1151 	void dump() const;
1152 #endif
1153 
1154     private:
1155 	void addAnchorsToEngine( const Box& to ) const;
1156 
1157 	QRegExpEngine *eng; // the automaton under construction
1158 	QMemArray<int> ls; // the left states (firstpos)
1159 	QMemArray<int> rs; // the right states (lastpos)
1160 	QMap<int, int> lanchors; // the left anchors
1161 	QMap<int, int> ranchors; // the right anchors
1162 	int skipanchors; // the anchors to match if the box is skipped
1163 
1164 #ifndef QT_NO_REGEXP_OPTIM
1165 	int earlyStart; // the index where str can first occur
1166 	int lateStart; // the index where str can last occur
1167 	QString str; // a string that has to occur in any match
1168 	QString leftStr; // a string occurring at the left of this box
1169 	QString rightStr; // a string occurring at the right of this box
1170 	int maxl; // the maximum length of this box (possibly InftyLen)
1171 #endif
1172 
1173 	int minl; // the minimum length of this box
1174 #ifndef QT_NO_REGEXP_OPTIM
1175 	QMemArray<int> occ1; // first-occurrence array
1176 #endif
1177     };
1178     friend class Box;
1179 
1180     /*
1181       This is the lexical analyzer for regular expressions.
1182     */
1183     enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen,
1184 	   Tok_PosLookahead, Tok_NegLookahead, Tok_RightParen, Tok_CharClass,
1185 	   Tok_Caret, Tok_Quantifier, Tok_Bar, Tok_Word, Tok_NonWord,
1186 	   Tok_Char = 0x10000, Tok_BackRef = 0x20000 };
1187     int getChar();
1188     int getEscape();
1189 #ifndef QT_NO_REGEXP_INTERVAL
1190     int getRep( int def );
1191 #endif
1192 #ifndef QT_NO_REGEXP_LOOKAHEAD
1193     void skipChars( int n );
1194 #endif
1195     void error( const char *msg );
1196     void startTokenizer( const QChar *rx, int len );
1197     int getToken();
1198 
1199     const QChar *yyIn; // a pointer to the input regular expression pattern
1200     int yyPos0; // the position of yyTok in the input pattern
1201     int yyPos; // the position of the next character to read
1202     int yyLen; // the length of yyIn
1203     int yyCh; // the last character read
1204     CharClass *yyCharClass; // attribute for Tok_CharClass tokens
1205     int yyMinRep; // attribute for Tok_Quantifier
1206     int yyMaxRep; // ditto
1207     QString yyError; // syntax error or overflow during parsing?
1208 
1209     /*
1210       This is the syntactic analyzer for regular expressions.
1211     */
1212     int parse( const QChar *rx, int len );
1213     void parseAtom( Box *box );
1214     void parseFactor( Box *box );
1215     void parseTerm( Box *box );
1216     void parseExpression( Box *box );
1217 
1218     int yyTok; // the last token read
1219     bool yyMayCapture; // set this to FALSE to disable capturing
1220 
1221     /*
1222       This is the engine state during matching.
1223     */
1224     const QString *mmStr; // a pointer to the input QString
1225     const QChar *mmIn; // a pointer to the input string data
1226     int mmPos; // the current position in the string
1227     int mmCaretPos;
1228     int mmLen; // the length of the input string
1229     bool mmMinimal; // minimal matching?
1230     QMemArray<int> mmBigArray; // big QMemArray<int> array
1231     int *mmInNextStack; // is state is mmNextStack?
1232     int *mmCurStack; // stack of current states
1233     int *mmNextStack; // stack of next states
1234     int *mmCurCapBegin; // start of current states' captures
1235     int *mmNextCapBegin; // start of next states' captures
1236     int *mmCurCapEnd; // end of current states' captures
1237     int *mmNextCapEnd; // end of next states' captures
1238     int *mmTempCapBegin; // start of temporary captures
1239     int *mmTempCapEnd; // end of temporary captures
1240     int *mmCapBegin; // start of captures for a next state
1241     int *mmCapEnd; // end of captures for a next state
1242     int *mmSlideTab; // bump-along slide table for bad-character heuristic
1243     int mmSlideTabSize; // size of slide table
1244 #ifndef QT_NO_REGEXP_BACKREF
1245     QIntDict<int> mmSleeping; // dictionary of back-reference sleepers
1246 #endif
1247     int mmMatchLen; // length of match
1248     int mmOneTestMatchedLen; // length of partial match
1249 };
1250 
QRegExpEngine(const QString & rx,bool caseSensitive)1251 QRegExpEngine::QRegExpEngine( const QString& rx, bool caseSensitive )
1252 #ifndef QT_NO_REGEXP_BACKREF
1253     : mmSleeping( 101 )
1254 #endif
1255 {
1256     setup( caseSensitive );
1257     valid = ( parse(rx.unicode(), rx.length()) == (int) rx.length() );
1258     if ( !valid ) {
1259 #ifndef QT_NO_REGEXP_OPTIM
1260 	trivial = FALSE;
1261 #endif
1262 	error( RXERR_LEFTDELIM );
1263     }
1264 }
1265 
1266 #ifndef QT_NO_REGEXP_OPTIM
~QRegExpEngine()1267 QRegExpEngine::~QRegExpEngine()
1268 {
1269 }
1270 #endif
1271 
1272 /*
1273   Tries to match in str and returns an array of (begin, length) pairs
1274   for captured text. If there is no match, all pairs are (-1, -1).
1275 */
match(const QString & str,int pos,bool minimal,bool oneTest,int caretIndex,QMemArray<int> & captured)1276 void QRegExpEngine::match( const QString& str, int pos, bool minimal,
1277 			   bool oneTest, int caretIndex,
1278 			   QMemArray<int>& captured )
1279 {
1280     bool matched = FALSE;
1281 
1282 #ifndef QT_NO_REGEXP_OPTIM
1283     if ( trivial && !oneTest ) {
1284 	mmPos = str.find( goodStr, pos, cs );
1285 	mmMatchLen = goodStr.length();
1286 	matched = ( mmPos != -1 );
1287     } else
1288 #endif
1289     {
1290 	mmStr = &str;
1291 	mmIn = str.unicode();
1292 	if ( mmIn == 0 )
1293 	    mmIn = &QChar::null;
1294 	mmPos = pos;
1295 	mmCaretPos = caretIndex;
1296 	mmLen = str.length();
1297 	mmMinimal = minimal;
1298 	mmMatchLen = 0;
1299 	mmOneTestMatchedLen = 0;
1300 
1301 	if ( valid && mmPos >= 0 && mmPos <= mmLen ) {
1302 #ifndef QT_NO_REGEXP_OPTIM
1303 	    if ( oneTest ) {
1304 		matched = matchHere();
1305 	    } else {
1306 		if ( mmPos <= mmLen - minl ) {
1307 		    if ( caretAnchored ) {
1308 			matched = matchHere();
1309 		    } else if ( useGoodStringHeuristic ) {
1310 			matched = goodStringMatch();
1311 		    } else {
1312 			matched = badCharMatch();
1313 		    }
1314 		}
1315 	    }
1316 #else
1317 	    matched = oneTest ? matchHere() : bruteMatch();
1318 #endif
1319 	}
1320     }
1321 
1322     int capturedSize = 2 + 2 * officialncap;
1323     captured.detach();
1324     captured.resize( capturedSize );
1325     if ( matched ) {
1326 	captured[0] = mmPos;
1327 	captured[1] = mmMatchLen;
1328 	for ( int j = 0; j < officialncap; j++ ) {
1329 	    int len = mmCapEnd[j] - mmCapBegin[j];
1330 	    captured[2 + 2 * j] = len > 0 ? mmPos + mmCapBegin[j] : 0;
1331 	    captured[2 + 2 * j + 1] = len;
1332 	}
1333     } else {
1334 	// we rely on 2's complement here
1335 	memset( captured.data(), -1, capturedSize * sizeof(int) );
1336     }
1337 }
1338 
1339 /*
1340   The three following functions add one state to the automaton and
1341   return the number of the state.
1342 */
1343 
createState(QChar ch)1344 int QRegExpEngine::createState( QChar ch )
1345 {
1346     return setupState( ch.unicode() );
1347 }
1348 
createState(const CharClass & cc)1349 int QRegExpEngine::createState( const CharClass& cc )
1350 {
1351 #ifndef QT_NO_REGEXP_CCLASS
1352     int n = cl.size();
1353     cl.resize( n + 1 );
1354     cl.insert( n, new CharClass(cc) );
1355     return setupState( CharClassBit | n );
1356 #else
1357     Q_UNUSED( cc );
1358     return setupState( CharClassBit );
1359 #endif
1360 }
1361 
1362 #ifndef QT_NO_REGEXP_BACKREF
createState(int bref)1363 int QRegExpEngine::createState( int bref )
1364 {
1365     if ( bref > nbrefs ) {
1366 	nbrefs = bref;
1367 	if ( nbrefs > MaxBackRefs ) {
1368 	    error( RXERR_LIMIT );
1369 	    return 0;
1370 	}
1371     }
1372     return setupState( BackRefBit | bref );
1373 }
1374 #endif
1375 
1376 /*
1377   The two following functions add a transition between all pairs of
1378   states (i, j) where i is fond in from, and j is found in to.
1379 
1380   Cat-transitions are distinguished from plus-transitions for
1381   capturing.
1382 */
1383 
addCatTransitions(const QMemArray<int> & from,const QMemArray<int> & to)1384 void QRegExpEngine::addCatTransitions( const QMemArray<int>& from,
1385 				       const QMemArray<int>& to )
1386 {
1387     for ( int i = 0; i < (int) from.size(); i++ ) {
1388 	State *st = s[from[i]];
1389 	mergeInto( &st->outs, to );
1390     }
1391 }
1392 
1393 #ifndef QT_NO_REGEXP_CAPTURE
addPlusTransitions(const QMemArray<int> & from,const QMemArray<int> & to,int atom)1394 void QRegExpEngine::addPlusTransitions( const QMemArray<int>& from,
1395 					const QMemArray<int>& to, int atom )
1396 {
1397     for ( int i = 0; i < (int) from.size(); i++ ) {
1398 	State *st = s[from[i]];
1399 	QMemArray<int> oldOuts = st->outs.copy();
1400 	mergeInto( &st->outs, to );
1401 	if ( f[atom].capture >= 0 ) {
1402 	    if ( st->reenter == 0 )
1403 		st->reenter = new QMap<int, int>;
1404 	    for ( int j = 0; j < (int) to.size(); j++ ) {
1405 		if ( !st->reenter->contains(to[j]) &&
1406 		     oldOuts.bsearch(to[j]) < 0 )
1407 		    st->reenter->insert( to[j], atom );
1408 	    }
1409 	}
1410     }
1411 }
1412 #endif
1413 
1414 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1415 /*
1416   Returns an anchor that means a OR b.
1417 */
anchorAlternation(int a,int b)1418 int QRegExpEngine::anchorAlternation( int a, int b )
1419 {
1420     if ( ((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0 )
1421 	return a & b;
1422 
1423     int n = aa.size();
1424 #ifndef QT_NO_REGEXP_OPTIM
1425     if ( n > 0 && aa[n - 1].a == a && aa[n - 1].b == b )
1426 	return Anchor_Alternation | ( n - 1 );
1427 #endif
1428 
1429     aa.resize( n + 1 );
1430     aa[n].a = a;
1431     aa[n].b = b;
1432     return Anchor_Alternation | n;
1433 }
1434 
1435 /*
1436   Returns an anchor that means a AND b.
1437 */
anchorConcatenation(int a,int b)1438 int QRegExpEngine::anchorConcatenation( int a, int b )
1439 {
1440     if ( ((a | b) & Anchor_Alternation) == 0 )
1441 	return a | b;
1442     if ( (b & Anchor_Alternation) != 0 )
1443 	qSwap( a, b );
1444 
1445     int aprime = anchorConcatenation( aa[a ^ Anchor_Alternation].a, b );
1446     int bprime = anchorConcatenation( aa[a ^ Anchor_Alternation].b, b );
1447     return anchorAlternation( aprime, bprime );
1448 }
1449 #endif
1450 
1451 /*
1452   Adds anchor a on a transition caracterised by its from state and
1453   its to state.
1454 */
addAnchors(int from,int to,int a)1455 void QRegExpEngine::addAnchors( int from, int to, int a )
1456 {
1457     State *st = s[from];
1458     if ( st->anchors == 0 )
1459 	st->anchors = new QMap<int, int>;
1460     if ( st->anchors->contains(to) )
1461 	a = anchorAlternation( (*st->anchors)[to], a );
1462     st->anchors->insert( to, a );
1463 }
1464 
1465 #ifndef QT_NO_REGEXP_OPTIM
1466 /*
1467   This function chooses between the good-string and the bad-character
1468   heuristics. It computes two scores and chooses the heuristic with
1469   the highest score.
1470 
1471   Here are some common-sense constraints on the scores that should be
1472   respected if the formulas are ever modified: (1) If goodStr is
1473   empty, the good-string heuristic scores 0. (2) If the regular
1474   expression is trivial, the good-string heuristic should be used.
1475   (3) If the search is case insensitive, the good-string heuristic
1476   should be used, unless it scores 0. (Case insensitivity turns all
1477   entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is
1478   big, the good-string heuristic should score less.
1479 */
heuristicallyChooseHeuristic()1480 void QRegExpEngine::heuristicallyChooseHeuristic()
1481 {
1482     if ( minl == 0 ) {
1483 	useGoodStringHeuristic = FALSE;
1484     } else if ( trivial ) {
1485 	useGoodStringHeuristic = TRUE;
1486     } else {
1487 	/*
1488 	  Magic formula: The good string has to constitute a good
1489 	  proportion of the minimum-length string, and appear at a
1490 	  more-or-less known index.
1491 	*/
1492 	int goodStringScore = ( 64 * goodStr.length() / minl ) -
1493 			      ( goodLateStart - goodEarlyStart );
1494 	/*
1495 	  Less magic formula: We pick some characters at random, and
1496 	  check whether they are good or bad.
1497 	*/
1498 	int badCharScore = 0;
1499 	int step = QMAX( 1, NumBadChars / 32 );
1500 	for ( int i = 1; i < NumBadChars; i += step ) {
1501 	    if ( occ1[i] == NoOccurrence )
1502 		badCharScore += minl;
1503 	    else
1504 		badCharScore += occ1[i];
1505 	}
1506 	badCharScore /= minl;
1507 	useGoodStringHeuristic = ( goodStringScore > badCharScore );
1508     }
1509 }
1510 #endif
1511 
1512 #if defined(QT_DEBUG)
dump() const1513 void QRegExpEngine::dump() const
1514 {
1515     int i, j;
1516     qDebug( "Case %ssensitive engine", cs ? "" : "in" );
1517     qDebug( "  States" );
1518     for ( i = 0; i < ns; i++ ) {
1519 	qDebug( "  %d%s", i,
1520 		i == InitialState ? " (initial)" :
1521 		i == FinalState ? " (final)" : "" );
1522 #ifndef QT_NO_REGEXP_CAPTURE
1523 	qDebug( "    in atom %d", s[i]->atom );
1524 #endif
1525 	int m = s[i]->match;
1526 	if ( (m & CharClassBit) != 0 ) {
1527 	    qDebug( "    match character class %d", m ^ CharClassBit );
1528 #ifndef QT_NO_REGEXP_CCLASS
1529 	    cl[m ^ CharClassBit]->dump();
1530 #else
1531 	    qDebug( "    negative character class" );
1532 #endif
1533 	} else if ( (m & BackRefBit) != 0 ) {
1534 	    qDebug( "    match back-reference %d", m ^ BackRefBit );
1535 	} else if ( m >= 0x20 && m <= 0x7e ) {
1536 	    qDebug( "    match 0x%.4x (%c)", m, m );
1537 	} else {
1538 	    qDebug( "    match 0x%.4x", m );
1539 	}
1540 	for ( j = 0; j < (int) s[i]->outs.size(); j++ ) {
1541 	    int next = s[i]->outs[j];
1542 	    qDebug( "    -> %d", next );
1543 	    if ( s[i]->reenter != 0 && s[i]->reenter->contains(next) )
1544 		qDebug( "       [reenter %d]", (*s[i]->reenter)[next] );
1545 	    if ( s[i]->anchors != 0 && at(*s[i]->anchors, next) != 0 )
1546 		qDebug( "       [anchors 0x%.8x]", (*s[i]->anchors)[next] );
1547 	}
1548     }
1549 #ifndef QT_NO_REGEXP_CAPTURE
1550     if ( nf > 0 ) {
1551 	qDebug( "  Atom    Parent  Capture" );
1552 	for ( i = 0; i < nf; i++ )
1553 	    qDebug( "  %6d  %6d  %6d", i, f[i].parent, f[i].capture );
1554     }
1555 #endif
1556 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1557     for ( i = 0; i < (int) aa.size(); i++ )
1558 	qDebug( "  Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a,
1559 		aa[i].b );
1560 #endif
1561 }
1562 #endif
1563 
setup(bool caseSensitive)1564 void QRegExpEngine::setup( bool caseSensitive )
1565 {
1566     s.setAutoDelete( TRUE );
1567     s.resize( 32 );
1568     ns = 0;
1569 #ifndef QT_NO_REGEXP_CAPTURE
1570     f.resize( 32 );
1571     nf = 0;
1572     cf = -1;
1573 #endif
1574     officialncap = 0;
1575     ncap = 0;
1576 #ifndef QT_NO_REGEXP_CCLASS
1577     cl.setAutoDelete( TRUE );
1578 #endif
1579 #ifndef QT_NO_REGEXP_LOOKAHEAD
1580     ahead.setAutoDelete( TRUE );
1581 #endif
1582 #ifndef QT_NO_REGEXP_OPTIM
1583     caretAnchored = TRUE;
1584     trivial = TRUE;
1585 #endif
1586     valid = FALSE;
1587     cs = caseSensitive;
1588 #ifndef QT_NO_REGEXP_BACKREF
1589     nbrefs = 0;
1590 #endif
1591 #ifndef QT_NO_REGEXP_OPTIM
1592     useGoodStringHeuristic = TRUE;
1593     minl = 0;
1594     occ1.fill( 0, NumBadChars );
1595 #endif
1596 }
1597 
setupState(int match)1598 int QRegExpEngine::setupState( int match )
1599 {
1600     if ( (ns & (ns + 1)) == 0 && ns + 1 >= (int) s.size() )
1601 	s.resize( (ns + 1) << 1 );
1602 #ifndef QT_NO_REGEXP_CAPTURE
1603     s.insert( ns, new State(cf, match) );
1604 #else
1605     s.insert( ns, new State(match) );
1606 #endif
1607     return ns++;
1608 }
1609 
1610 #ifndef QT_NO_REGEXP_CAPTURE
1611 /*
1612   Functions startAtom() and finishAtom() should be called to delimit
1613   atoms. When a state is created, it is assigned to the current atom.
1614   The information is later used for capturing.
1615 */
startAtom(bool capture)1616 int QRegExpEngine::startAtom( bool capture )
1617 {
1618     if ( (nf & (nf + 1)) == 0 && nf + 1 >= (int) f.size() )
1619 	f.resize( (nf + 1) << 1 );
1620     f[nf].parent = cf;
1621     cf = nf++;
1622     f[cf].capture = capture ? ncap++ : -1;
1623     return cf;
1624 }
1625 #endif
1626 
1627 #ifndef QT_NO_REGEXP_LOOKAHEAD
1628 /*
1629   Creates a lookahead anchor.
1630 */
addLookahead(QRegExpEngine * eng,bool negative)1631 int QRegExpEngine::addLookahead( QRegExpEngine *eng, bool negative )
1632 {
1633     int n = ahead.size();
1634     if ( n == MaxLookaheads ) {
1635 	error( RXERR_LIMIT );
1636 	return 0;
1637     }
1638     ahead.resize( n + 1 );
1639     ahead.insert( n, new Lookahead(eng, negative) );
1640     return Anchor_FirstLookahead << n;
1641 }
1642 #endif
1643 
1644 #ifndef QT_NO_REGEXP_CAPTURE
1645 /*
1646   We want the longest leftmost captures.
1647 */
isBetterCapture(const int * begin1,const int * end1,const int * begin2,const int * end2)1648 bool QRegExpEngine::isBetterCapture( const int *begin1, const int *end1,
1649 				     const int *begin2, const int *end2 )
1650 {
1651     for ( int i = 0; i < ncap; i++ ) {
1652 	int delta = begin2[i] - begin1[i]; // it has to start early...
1653 	if ( delta == 0 )
1654 	    delta = end1[i] - end2[i]; // ...and end late (like a party)
1655 
1656 	if ( delta != 0 )
1657 	    return delta > 0;
1658     }
1659     return FALSE;
1660 }
1661 #endif
1662 
1663 /*
1664   Returns TRUE if anchor a matches at position mmPos + i in the input
1665   string, otherwise FALSE.
1666 */
testAnchor(int i,int a,const int * capBegin)1667 bool QRegExpEngine::testAnchor( int i, int a, const int *capBegin )
1668 {
1669     int j;
1670 
1671 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1672     if ( (a & Anchor_Alternation) != 0 ) {
1673 	return testAnchor( i, aa[a ^ Anchor_Alternation].a, capBegin ) ||
1674 	       testAnchor( i, aa[a ^ Anchor_Alternation].b, capBegin );
1675     }
1676 #endif
1677 
1678     if ( (a & Anchor_Caret) != 0 ) {
1679 	if ( mmPos + i != mmCaretPos )
1680 	    return FALSE;
1681     }
1682     if ( (a & Anchor_Dollar) != 0 ) {
1683 	if ( mmPos + i != mmLen )
1684 	    return FALSE;
1685     }
1686 #ifndef QT_NO_REGEXP_ESCAPE
1687     if ( (a & (Anchor_Word | Anchor_NonWord)) != 0 ) {
1688 	bool before = FALSE;
1689 	bool after = FALSE;
1690 	if ( mmPos + i != 0 )
1691 	    before = isWord( mmIn[mmPos + i - 1] );
1692 	if ( mmPos + i != mmLen )
1693 	    after = isWord( mmIn[mmPos + i] );
1694 	if ( (a & Anchor_Word) != 0 && (before == after) )
1695 	    return FALSE;
1696 	if ( (a & Anchor_NonWord) != 0 && (before != after) )
1697 	    return FALSE;
1698     }
1699 #endif
1700 #ifndef QT_NO_REGEXP_LOOKAHEAD
1701     if ( (a & Anchor_LookaheadMask) != 0 ) {
1702 	QConstString cstr = QConstString( (QChar *) mmIn + mmPos + i,
1703 					   mmLen - mmPos - i );
1704 	for ( j = 0; j < (int) ahead.size(); j++ ) {
1705 	    if ( (a & (Anchor_FirstLookahead << j)) != 0 ) {
1706 		QMemArray<int> captured;
1707 		ahead[j]->eng->match( cstr.string(), 0, TRUE, TRUE,
1708 				      mmCaretPos - mmPos - i, captured );
1709 		if ( (captured[0] == 0) == ahead[j]->neg )
1710 		    return FALSE;
1711 	    }
1712 	}
1713     }
1714 #endif
1715 #ifndef QT_NO_REGEXP_CAPTURE
1716 #ifndef QT_NO_REGEXP_BACKREF
1717     for ( j = 0; j < nbrefs; j++ ) {
1718 	if ( (a & (Anchor_BackRef1Empty << j)) != 0 ) {
1719 	    if ( capBegin[j] != EmptyCapture )
1720 		return FALSE;
1721 	}
1722     }
1723 #endif
1724 #endif
1725     return TRUE;
1726 }
1727 
1728 #ifndef QT_NO_REGEXP_OPTIM
1729 /*
1730   The three following functions are what Jeffrey Friedl would call
1731   transmissions (or bump-alongs). Using one or the other should make
1732   no difference except in performance.
1733 */
1734 
goodStringMatch()1735 bool QRegExpEngine::goodStringMatch()
1736 {
1737     int k = mmPos + goodEarlyStart;
1738     while ( (k = mmStr->find(goodStr, k, cs)) != -1 ) {
1739 	int from = k - goodLateStart;
1740 	int to = k - goodEarlyStart;
1741 	if ( from > mmPos )
1742 	    mmPos = from;
1743 
1744 	while ( mmPos <= to ) {
1745 	    if ( matchHere() )
1746 		return TRUE;
1747 	    mmPos++;
1748 	}
1749 	k++;
1750     }
1751     return FALSE;
1752 }
1753 
badCharMatch()1754 bool QRegExpEngine::badCharMatch()
1755 {
1756     int slideHead = 0;
1757     int slideNext = 0;
1758     int i;
1759     int lastPos = mmLen - minl;
1760     memset( mmSlideTab, 0, mmSlideTabSize * sizeof(int) );
1761 
1762     /*
1763       Set up the slide table, used for the bad-character heuristic,
1764       using the table of first occurrence of each character.
1765     */
1766     for ( i = 0; i < minl; i++ ) {
1767 	int sk = occ1[BadChar(mmIn[mmPos + i])];
1768 	if ( sk == NoOccurrence )
1769 	    sk = i + 1;
1770 	if ( sk > 0 ) {
1771 	    int k = i + 1 - sk;
1772 	    if ( k < 0 ) {
1773 		sk = i + 1;
1774 		k = 0;
1775 	    }
1776 	    if ( sk > mmSlideTab[k] )
1777 		mmSlideTab[k] = sk;
1778 	}
1779     }
1780 
1781     if ( mmPos > lastPos )
1782 	return FALSE;
1783 
1784     for ( ;; ) {
1785 	if ( ++slideNext >= mmSlideTabSize )
1786 	    slideNext = 0;
1787 	if ( mmSlideTab[slideHead] > 0 ) {
1788 	    if ( mmSlideTab[slideHead] - 1 > mmSlideTab[slideNext] )
1789 		mmSlideTab[slideNext] = mmSlideTab[slideHead] - 1;
1790 	    mmSlideTab[slideHead] = 0;
1791 	} else {
1792 	    if ( matchHere() )
1793 		return TRUE;
1794 	}
1795 
1796 	if ( mmPos == lastPos )
1797 	    break;
1798 
1799 	/*
1800 	  Update the slide table. This code has much in common with
1801 	  the initialization code.
1802 	*/
1803 	int sk = occ1[BadChar(mmIn[mmPos + minl])];
1804 	if ( sk == NoOccurrence ) {
1805 	    mmSlideTab[slideNext] = minl;
1806 	} else if ( sk > 0 ) {
1807 	    int k = slideNext + minl - sk;
1808 	    if ( k >= mmSlideTabSize )
1809 		k -= mmSlideTabSize;
1810 	    if ( sk > mmSlideTab[k] )
1811 		mmSlideTab[k] = sk;
1812 	}
1813 	slideHead = slideNext;
1814 	mmPos++;
1815     }
1816     return FALSE;
1817 }
1818 #else
bruteMatch()1819 bool QRegExpEngine::bruteMatch()
1820 {
1821     while ( mmPos <= mmLen ) {
1822 	if ( matchHere() )
1823 	    return TRUE;
1824 	mmPos++;
1825     }
1826     return FALSE;
1827 }
1828 #endif
1829 
1830 /*
1831   Here's the core of the engine. It tries to do a match here and now.
1832 */
matchHere()1833 bool QRegExpEngine::matchHere()
1834 {
1835     int ncur = 1, nnext = 0;
1836     int i = 0, j, k, m;
1837     bool stop = FALSE;
1838 
1839     mmMatchLen = -1;
1840     mmOneTestMatchedLen = -1;
1841     mmCurStack[0] = InitialState;
1842 
1843 #ifndef QT_NO_REGEXP_CAPTURE
1844     if ( ncap > 0 ) {
1845 	for ( j = 0; j < ncap; j++ ) {
1846 	    mmCurCapBegin[j] = EmptyCapture;
1847 	    mmCurCapEnd[j] = EmptyCapture;
1848 	}
1849     }
1850 #endif
1851 
1852 #ifndef QT_NO_REGEXP_BACKREF
1853     int *zzZ = 0;
1854 
1855     while ( (ncur > 0 || !mmSleeping.isEmpty()) && i <= mmLen - mmPos &&
1856 	    !stop )
1857 #else
1858     while ( ncur > 0 && i <= mmLen - mmPos && !stop )
1859 #endif
1860     {
1861 	int ch = ( i < mmLen - mmPos ) ? mmIn[mmPos + i].unicode() : 0;
1862 	for ( j = 0; j < ncur; j++ ) {
1863 	    int cur = mmCurStack[j];
1864 	    State *scur = s[cur];
1865 	    QMemArray<int>& outs = scur->outs;
1866 	    for ( k = 0; k < (int) outs.size(); k++ ) {
1867 		int next = outs[k];
1868 		State *snext = s[next];
1869 		bool in = TRUE;
1870 #ifndef QT_NO_REGEXP_BACKREF
1871 		int needSomeSleep = 0;
1872 #endif
1873 
1874 		/*
1875 		  First, check if the anchors are anchored properly.
1876 		*/
1877 		if ( scur->anchors != 0 ) {
1878 		    int a = at( *scur->anchors, next );
1879 		    if ( a != 0 && !testAnchor(i, a, mmCurCapBegin + j * ncap) )
1880 			in = FALSE;
1881 		}
1882 		/*
1883 		  If indeed they are, check if the input character is
1884 		  correct for this transition.
1885 		*/
1886 		if ( in ) {
1887 		    m = snext->match;
1888 		    if ( (m & (CharClassBit | BackRefBit)) == 0 ) {
1889 			if ( cs )
1890 			    in = ( m == ch );
1891 			else
1892 			    in = ( QChar(m).lower() == QChar(ch).lower() );
1893 		    } else if ( next == FinalState ) {
1894 			mmMatchLen = i;
1895 			stop = mmMinimal;
1896 			in = TRUE;
1897 		    } else if ( (m & CharClassBit) != 0 ) {
1898 #ifndef QT_NO_REGEXP_CCLASS
1899 			const CharClass *cc = cl[m ^ CharClassBit];
1900 			if ( cs )
1901 			    in = cc->in( ch );
1902 			else if ( cc->negative() )
1903 			    in = cc->in( QChar(ch).lower() ) &&
1904 				 cc->in( QChar(ch).upper() );
1905 			else
1906 			    in = cc->in( QChar(ch).lower() ) ||
1907 				 cc->in( QChar(ch).upper() );
1908 #endif
1909 #ifndef QT_NO_REGEXP_BACKREF
1910 		    } else { /* ( (m & BackRefBit) != 0 ) */
1911 			int bref = m ^ BackRefBit;
1912 			int ell = j * ncap + ( bref - 1 );
1913 
1914 			in = bref <= ncap && mmCurCapBegin[ell] != EmptyCapture;
1915 			if ( in ) {
1916 			    if ( cs )
1917 				in = ( mmIn[mmPos + mmCurCapBegin[ell]]
1918 				       == QChar(ch) );
1919 			    else
1920 				in = ( mmIn[mmPos + mmCurCapBegin[ell]].lower()
1921 				       == QChar(ch).lower() );
1922 			}
1923 
1924 			if ( in ) {
1925 			    int delta;
1926 			    if ( mmCurCapEnd[ell] == EmptyCapture )
1927 				delta = i - mmCurCapBegin[ell];
1928 			    else
1929 				delta = mmCurCapEnd[ell] - mmCurCapBegin[ell];
1930 
1931 			    in = ( delta <= mmLen - (mmPos + i) );
1932 			    if ( in && delta > 1 ) {
1933 				int n = 1;
1934 				if ( cs ) {
1935 				    while ( n < delta ) {
1936 					if ( mmIn[mmPos +
1937 						  mmCurCapBegin[ell] + n] !=
1938 					     mmIn[mmPos + i + n] )
1939 					    break;
1940 					n++;
1941 				    }
1942 				} else {
1943 				    while ( n < delta ) {
1944 					QChar a = mmIn[mmPos +
1945 						       mmCurCapBegin[ell] + n];
1946 					QChar b = mmIn[mmPos + i + n];
1947 					if ( a.lower() != b.lower() )
1948 					    break;
1949 					n++;
1950 				    }
1951 				}
1952 				in = ( n == delta );
1953 				if ( in )
1954 				    needSomeSleep = delta - 1;
1955 			    }
1956 			}
1957 #endif
1958 		    }
1959 		}
1960 
1961 		/*
1962 		  We must now update our data structures.
1963 		*/
1964 		if ( in ) {
1965 #ifndef QT_NO_REGEXP_CAPTURE
1966 		    int *capBegin, *capEnd;
1967 #endif
1968 		    /*
1969 		      If the next state was not encountered yet, all
1970 		      is fine.
1971 		    */
1972 		    if ( (m = mmInNextStack[next]) == -1 ) {
1973 			m = nnext++;
1974 			mmNextStack[m] = next;
1975 			mmInNextStack[next] = m;
1976 #ifndef QT_NO_REGEXP_CAPTURE
1977 			capBegin = mmNextCapBegin + m * ncap;
1978 			capEnd = mmNextCapEnd + m * ncap;
1979 
1980 		    /*
1981 		      Otherwise, we'll first maintain captures in
1982 		      temporary arrays, and decide at the end whether
1983 		      it's best to keep the previous capture zones or
1984 		      the new ones.
1985 		    */
1986 		    } else {
1987 			capBegin = mmTempCapBegin;
1988 			capEnd = mmTempCapEnd;
1989 #endif
1990 		    }
1991 
1992 #ifndef QT_NO_REGEXP_CAPTURE
1993 		    /*
1994 		      Updating the capture zones is much of a task.
1995 		    */
1996 		    if ( ncap > 0 ) {
1997 			memcpy( capBegin, mmCurCapBegin + j * ncap,
1998 				ncap * sizeof(int) );
1999 			memcpy( capEnd, mmCurCapEnd + j * ncap,
2000 				ncap * sizeof(int) );
2001 			int c = scur->atom, n = snext->atom;
2002 			int p = -1, q = -1;
2003 			int cap;
2004 
2005 			/*
2006 			  Lemma 1. For any x in the range [0..nf), we
2007 			  have f[x].parent < x.
2008 
2009 			  Proof. By looking at startAtom(), it is
2010 			  clear that cf < nf holds all the time, and
2011 			  thus that f[nf].parent < nf.
2012 			*/
2013 
2014 			/*
2015 			  If we are reentering an atom, we empty all
2016 			  capture zones inside it.
2017 			*/
2018 			if ( scur->reenter != 0 &&
2019 			     (q = at(*scur->reenter, next)) != 0 ) {
2020 			    QBitArray b;
2021 			    b.fill( FALSE, nf );
2022 			    b.setBit( q, TRUE );
2023 			    for ( int ell = q + 1; ell < nf; ell++ ) {
2024 				if ( b.testBit(f[ell].parent) ) {
2025 				    b.setBit( ell, TRUE );
2026 				    cap = f[ell].capture;
2027 				    if ( cap >= 0 ) {
2028 					capBegin[cap] = EmptyCapture;
2029 					capEnd[cap] = EmptyCapture;
2030 				    }
2031 				}
2032 			    }
2033 			    p = f[q].parent;
2034 
2035 			/*
2036 			  Otherwise, close the capture zones we are
2037 			  leaving. We are leaving f[c].capture,
2038 			  f[f[c].parent].capture,
2039 			  f[f[f[c].parent].parent].capture, ...,
2040 			  until f[x].capture, with x such that
2041 			  f[x].parent is the youngest common ancestor
2042 			  for c and n.
2043 
2044 			  We go up along c's and n's ancestry until
2045 			  we find x.
2046 			*/
2047 			} else {
2048 			    p = c;
2049 			    q = n;
2050 			    while ( p != q ) {
2051 				if ( p > q ) {
2052 				    cap = f[p].capture;
2053 				    if ( cap >= 0 ) {
2054 					if ( capBegin[cap] == i ) {
2055 					    capBegin[cap] = EmptyCapture;
2056 					    capEnd[cap] = EmptyCapture;
2057 					} else {
2058 					    capEnd[cap] = i;
2059 					}
2060 				    }
2061 				    p = f[p].parent;
2062 				} else {
2063 				    q = f[q].parent;
2064 				}
2065 			    }
2066 			}
2067 
2068 			/*
2069 			  In any case, we now open the capture zones
2070 			  we are entering. We work upwards from n
2071 			  until we reach p (the parent of the atom we
2072 			  reenter or the youngest common ancestor).
2073 			*/
2074 			while ( n > p ) {
2075 			    cap = f[n].capture;
2076 			    if ( cap >= 0 ) {
2077 				capBegin[cap] = i;
2078 				capEnd[cap] = EmptyCapture;
2079 			    }
2080 			    n = f[n].parent;
2081 			}
2082 			/*
2083 			  If the next state was already in
2084 			  mmNextStack, we must choose carefully which
2085 			  capture zones we want to keep.
2086 			*/
2087 			if ( capBegin == mmTempCapBegin &&
2088 			     isBetterCapture(capBegin, capEnd,
2089 					     mmNextCapBegin + m * ncap,
2090 					     mmNextCapEnd + m * ncap) ) {
2091 			    memcpy( mmNextCapBegin + m * ncap, capBegin,
2092 				    ncap * sizeof(int) );
2093 			    memcpy( mmNextCapEnd + m * ncap, capEnd,
2094 				    ncap * sizeof(int) );
2095 			}
2096 		    }
2097 #ifndef QT_NO_REGEXP_BACKREF
2098 		    /*
2099 		      We are done with updating the capture zones.
2100 		      It's now time to put the next state to sleep,
2101 		      if it needs to, and to remove it from
2102 		      mmNextStack.
2103 		    */
2104 		    if ( needSomeSleep > 0 ) {
2105 			zzZ = new int[1 + 2 * ncap];
2106 			zzZ[0] = next;
2107 			if ( ncap > 0 ) {
2108 			    memcpy( zzZ + 1, capBegin, ncap * sizeof(int) );
2109 			    memcpy( zzZ + 1 + ncap, capEnd,
2110 				    ncap * sizeof(int) );
2111 			}
2112 			mmInNextStack[mmNextStack[--nnext]] = -1;
2113 			mmSleeping.insert( i + needSomeSleep, zzZ );
2114 		    }
2115 #endif
2116 #endif
2117 		}
2118 	    }
2119 	}
2120 #ifndef QT_NO_REGEXP_CAPTURE
2121 	/*
2122 	  If we reached the final state, hurray! Copy the captured
2123 	  zone.
2124 	*/
2125 	if ( ncap > 0 && (m = mmInNextStack[FinalState]) != -1 ) {
2126 	    memcpy( mmCapBegin, mmNextCapBegin + m * ncap, ncap * sizeof(int) );
2127 	    memcpy( mmCapEnd, mmNextCapEnd + m * ncap, ncap * sizeof(int) );
2128 	}
2129 #ifndef QT_NO_REGEXP_BACKREF
2130 	/*
2131 	  It's time to wake up the sleepers.
2132 	*/
2133 	if ( !mmSleeping.isEmpty() ) {
2134 	    while ( (zzZ = mmSleeping.take(i)) != 0 ) {
2135 		int next = zzZ[0];
2136 		int *capBegin = zzZ + 1;
2137 		int *capEnd = zzZ + 1 + ncap;
2138 		bool copyOver = TRUE;
2139 
2140 		if ( (m = mmInNextStack[zzZ[0]]) == -1 ) {
2141 		    m = nnext++;
2142 		    mmNextStack[m] = next;
2143 		    mmInNextStack[next] = m;
2144 		} else {
2145 		    copyOver = isBetterCapture( mmNextCapBegin + m * ncap,
2146 						mmNextCapEnd + m * ncap,
2147 						capBegin, capEnd );
2148 		}
2149 		if ( copyOver ) {
2150 		    memcpy( mmNextCapBegin + m * ncap, capBegin,
2151 			    ncap * sizeof(int) );
2152 		    memcpy( mmNextCapEnd + m * ncap, capEnd,
2153 			    ncap * sizeof(int) );
2154 		}
2155 		delete[] zzZ;
2156 	    }
2157 	}
2158 #endif
2159 #endif
2160 	for ( j = 0; j < nnext; j++ )
2161 	    mmInNextStack[mmNextStack[j]] = -1;
2162 
2163 	// avoid needless iteration that confuses mmOneTestMatchedLen
2164 	if ( nnext == 1 && mmNextStack[0] == FinalState
2165 #ifndef QT_NO_REGEXP_BACKREF
2166 	     && mmSleeping.isEmpty()
2167 #endif
2168 	     )
2169 	    stop = TRUE;
2170 
2171 	qSwap( mmCurStack, mmNextStack );
2172 #ifndef QT_NO_REGEXP_CAPTURE
2173 	qSwap( mmCurCapBegin, mmNextCapBegin );
2174 	qSwap( mmCurCapEnd, mmNextCapEnd );
2175 #endif
2176 	ncur = nnext;
2177 	nnext = 0;
2178 	i++;
2179     }
2180 
2181 #ifndef QT_NO_REGEXP_BACKREF
2182     /*
2183       If minimal matching is enabled, we might have some sleepers
2184       left.
2185     */
2186     while ( !mmSleeping.isEmpty() ) {
2187 	zzZ = mmSleeping.take( *QIntDictIterator<int>(mmSleeping) );
2188 	delete[] zzZ;
2189     }
2190 #endif
2191 
2192     mmOneTestMatchedLen = i - 1;
2193     return ( mmMatchLen >= 0 );
2194 }
2195 
2196 #ifndef QT_NO_REGEXP_CCLASS
2197 
CharClass()2198 QRegExpEngine::CharClass::CharClass()
2199     : c( 0 ), n( FALSE )
2200 {
2201 #ifndef QT_NO_REGEXP_OPTIM
2202     occ1.fill( NoOccurrence, NumBadChars );
2203 #endif
2204 }
2205 
operator =(const CharClass & cc)2206 QRegExpEngine::CharClass& QRegExpEngine::CharClass::operator=(
2207 	const CharClass& cc )
2208 {
2209     c = cc.c;
2210     r = cc.r.copy();
2211     n = cc.n;
2212 #ifndef QT_NO_REGEXP_OPTIM
2213     occ1 = cc.occ1;
2214 #endif
2215     return *this;
2216 }
2217 
clear()2218 void QRegExpEngine::CharClass::clear()
2219 {
2220     c = 0;
2221     r.resize( 0 );
2222     n = FALSE;
2223 }
2224 
setNegative(bool negative)2225 void QRegExpEngine::CharClass::setNegative( bool negative )
2226 {
2227     n = negative;
2228 #ifndef QT_NO_REGEXP_OPTIM
2229     occ1.fill( 0, NumBadChars );
2230 #endif
2231 }
2232 
addCategories(int cats)2233 void QRegExpEngine::CharClass::addCategories( int cats )
2234 {
2235     c |= cats;
2236 #ifndef QT_NO_REGEXP_OPTIM
2237     occ1.fill( 0, NumBadChars );
2238 #endif
2239 }
2240 
addRange(ushort from,ushort to)2241 void QRegExpEngine::CharClass::addRange( ushort from, ushort to )
2242 {
2243     if ( from > to )
2244 	qSwap( from, to );
2245     int m = r.size();
2246     r.resize( m + 1 );
2247     r[m].from = from;
2248     r[m].to = to;
2249 
2250 #ifndef QT_NO_REGEXP_OPTIM
2251     int i;
2252 
2253     if ( to - from < NumBadChars ) {
2254 	occ1.detach();
2255 	if ( from % NumBadChars <= to % NumBadChars ) {
2256 	    for ( i = from % NumBadChars; i <= to % NumBadChars; i++ )
2257 		occ1[i] = 0;
2258 	} else {
2259 	    for ( i = 0; i <= to % NumBadChars; i++ )
2260 		occ1[i] = 0;
2261 	    for ( i = from % NumBadChars; i < NumBadChars; i++ )
2262 		occ1[i] = 0;
2263 	}
2264     } else {
2265 	occ1.fill( 0, NumBadChars );
2266     }
2267 #endif
2268 }
2269 
in(QChar ch) const2270 bool QRegExpEngine::CharClass::in( QChar ch ) const
2271 {
2272 #ifndef QT_NO_REGEXP_OPTIM
2273     if ( occ1[BadChar(ch)] == NoOccurrence )
2274 	return n;
2275 #endif
2276 
2277     if ( c != 0 && (c & (1 << (int) ch.category())) != 0 )
2278 	return !n;
2279     for ( int i = 0; i < (int) r.size(); i++ ) {
2280 	if ( ch.unicode() >= r[i].from && ch.unicode() <= r[i].to )
2281 	    return !n;
2282     }
2283     return n;
2284 }
2285 
2286 #if defined(QT_DEBUG)
dump() const2287 void QRegExpEngine::CharClass::dump() const
2288 {
2289     int i;
2290     qDebug( "    %stive character class", n ? "nega" : "posi" );
2291 #ifndef QT_NO_REGEXP_CCLASS
2292     if ( c != 0 )
2293 	qDebug( "      categories 0x%.8x", c );
2294 #endif
2295     for ( i = 0; i < (int) r.size(); i++ )
2296 	qDebug( "      0x%.4x through 0x%.4x", r[i].from, r[i].to );
2297 }
2298 #endif
2299 #endif
2300 
Box(QRegExpEngine * engine)2301 QRegExpEngine::Box::Box( QRegExpEngine *engine )
2302     : eng( engine ), skipanchors( 0 )
2303 #ifndef QT_NO_REGEXP_OPTIM
2304       , earlyStart( 0 ), lateStart( 0 ), maxl( 0 )
2305 #endif
2306 {
2307 #ifndef QT_NO_REGEXP_OPTIM
2308     occ1.fill( NoOccurrence, NumBadChars );
2309 #endif
2310     minl = 0;
2311 }
2312 
operator =(const Box & b)2313 QRegExpEngine::Box& QRegExpEngine::Box::operator=( const Box& b )
2314 {
2315     eng = b.eng;
2316     ls = b.ls;
2317     rs = b.rs;
2318     lanchors = b.lanchors;
2319     ranchors = b.ranchors;
2320     skipanchors = b.skipanchors;
2321 #ifndef QT_NO_REGEXP_OPTIM
2322     earlyStart = b.earlyStart;
2323     lateStart = b.lateStart;
2324     str = b.str;
2325     leftStr = b.leftStr;
2326     rightStr = b.rightStr;
2327     maxl = b.maxl;
2328     occ1 = b.occ1;
2329 #endif
2330     minl = b.minl;
2331     return *this;
2332 }
2333 
set(QChar ch)2334 void QRegExpEngine::Box::set( QChar ch )
2335 {
2336     ls.resize( 1 );
2337     ls[0] = eng->createState( ch );
2338     rs = ls;
2339     rs.detach();
2340 #ifndef QT_NO_REGEXP_OPTIM
2341     str = ch;
2342     leftStr = ch;
2343     rightStr = ch;
2344     maxl = 1;
2345     occ1.detach();
2346     occ1[BadChar(ch)] = 0;
2347 #endif
2348     minl = 1;
2349 }
2350 
set(const CharClass & cc)2351 void QRegExpEngine::Box::set( const CharClass& cc )
2352 {
2353     ls.resize( 1 );
2354     ls[0] = eng->createState( cc );
2355     rs = ls;
2356     rs.detach();
2357 #ifndef QT_NO_REGEXP_OPTIM
2358     maxl = 1;
2359     occ1 = cc.firstOccurrence();
2360 #endif
2361     minl = 1;
2362 }
2363 
2364 #ifndef QT_NO_REGEXP_BACKREF
set(int bref)2365 void QRegExpEngine::Box::set( int bref )
2366 {
2367     ls.resize( 1 );
2368     ls[0] = eng->createState( bref );
2369     rs = ls;
2370     rs.detach();
2371     if ( bref >= 1 && bref <= MaxBackRefs )
2372 	skipanchors = Anchor_BackRef0Empty << bref;
2373 #ifndef QT_NO_REGEXP_OPTIM
2374     maxl = InftyLen;
2375 #endif
2376     minl = 0;
2377 }
2378 #endif
2379 
cat(const Box & b)2380 void QRegExpEngine::Box::cat( const Box& b )
2381 {
2382     eng->addCatTransitions( rs, b.ls );
2383     addAnchorsToEngine( b );
2384     if ( minl == 0 ) {
2385 	mergeInto( &lanchors, b.lanchors );
2386 	if ( skipanchors != 0 ) {
2387 	    for ( int i = 0; i < (int) b.ls.size(); i++ ) {
2388 		int a = eng->anchorConcatenation( at(lanchors, b.ls[i]),
2389 						  skipanchors );
2390 		lanchors.insert( b.ls[i], a );
2391 	    }
2392 	}
2393 	mergeInto( &ls, b.ls );
2394     }
2395     if ( b.minl == 0 ) {
2396 	mergeInto( &ranchors, b.ranchors );
2397 	if ( b.skipanchors != 0 ) {
2398 	    for ( int i = 0; i < (int) rs.size(); i++ ) {
2399 		int a = eng->anchorConcatenation( at(ranchors, rs[i]),
2400 						  b.skipanchors );
2401 		ranchors.insert( rs[i], a );
2402 	    }
2403 	}
2404 	mergeInto( &rs, b.rs );
2405     } else {
2406 	ranchors = b.ranchors;
2407 	rs = b.rs;
2408     }
2409 
2410 #ifndef QT_NO_REGEXP_OPTIM
2411     if ( maxl != InftyLen ) {
2412 	if ( rightStr.length() + b.leftStr.length() >
2413 	     QMAX(str.length(), b.str.length()) ) {
2414 	    earlyStart = minl - rightStr.length();
2415 	    lateStart = maxl - rightStr.length();
2416 	    str = rightStr + b.leftStr;
2417 	} else if ( b.str.length() > str.length() ) {
2418 	    earlyStart = minl + b.earlyStart;
2419 	    lateStart = maxl + b.lateStart;
2420 	    str = b.str;
2421 	}
2422     }
2423 
2424     if ( (int) leftStr.length() == maxl )
2425 	leftStr += b.leftStr;
2426 
2427     if ( (int) b.rightStr.length() == b.maxl ) {
2428 	rightStr += b.rightStr;
2429     } else {
2430 	rightStr = b.rightStr;
2431     }
2432 
2433     if ( maxl == InftyLen || b.maxl == InftyLen ) {
2434 	maxl = InftyLen;
2435     } else {
2436 	maxl += b.maxl;
2437     }
2438 
2439     occ1.detach();
2440     for ( int i = 0; i < NumBadChars; i++ ) {
2441 	if ( b.occ1[i] != NoOccurrence && minl + b.occ1[i] < occ1[i] )
2442 	    occ1[i] = minl + b.occ1[i];
2443     }
2444 #endif
2445 
2446     minl += b.minl;
2447     if ( minl == 0 )
2448 	skipanchors = eng->anchorConcatenation( skipanchors, b.skipanchors );
2449     else
2450 	skipanchors = 0;
2451 }
2452 
orx(const Box & b)2453 void QRegExpEngine::Box::orx( const Box& b )
2454 {
2455     mergeInto( &ls, b.ls );
2456     mergeInto( &lanchors, b.lanchors );
2457     mergeInto( &rs, b.rs );
2458     mergeInto( &ranchors, b.ranchors );
2459 
2460     if ( b.minl == 0 ) {
2461 	if ( minl == 0 )
2462 	    skipanchors = eng->anchorAlternation( skipanchors, b.skipanchors );
2463 	else
2464 	    skipanchors = b.skipanchors;
2465     }
2466 
2467 #ifndef QT_NO_REGEXP_OPTIM
2468     occ1.detach();
2469     for ( int i = 0; i < NumBadChars; i++ ) {
2470 	if ( occ1[i] > b.occ1[i] )
2471 	    occ1[i] = b.occ1[i];
2472     }
2473     earlyStart = 0;
2474     lateStart = 0;
2475     str = QString();
2476     leftStr = QString();
2477     rightStr = QString();
2478     if ( b.maxl > maxl )
2479 	maxl = b.maxl;
2480 #endif
2481     if ( b.minl < minl )
2482 	minl = b.minl;
2483 }
2484 
plus(int atom)2485 void QRegExpEngine::Box::plus( int atom )
2486 {
2487 #ifndef QT_NO_REGEXP_CAPTURE
2488     eng->addPlusTransitions( rs, ls, atom );
2489 #else
2490     Q_UNUSED( atom );
2491     eng->addCatTransitions( rs, ls );
2492 #endif
2493     addAnchorsToEngine( *this );
2494 #ifndef QT_NO_REGEXP_OPTIM
2495     maxl = InftyLen;
2496 #endif
2497 }
2498 
opt()2499 void QRegExpEngine::Box::opt()
2500 {
2501 #ifndef QT_NO_REGEXP_OPTIM
2502     earlyStart = 0;
2503     lateStart = 0;
2504     str = QString();
2505     leftStr = QString();
2506     rightStr = QString();
2507 #endif
2508     skipanchors = 0;
2509     minl = 0;
2510 }
2511 
catAnchor(int a)2512 void QRegExpEngine::Box::catAnchor( int a )
2513 {
2514     if ( a != 0 ) {
2515 	for ( int i = 0; i < (int) rs.size(); i++ ) {
2516 	    a = eng->anchorConcatenation( at(ranchors, rs[i]), a );
2517 	    ranchors.insert( rs[i], a );
2518 	}
2519 	if ( minl == 0 )
2520 	    skipanchors = eng->anchorConcatenation( skipanchors, a );
2521     }
2522 }
2523 
2524 #ifndef QT_NO_REGEXP_OPTIM
setupHeuristics()2525 void QRegExpEngine::Box::setupHeuristics()
2526 {
2527     eng->goodEarlyStart = earlyStart;
2528     eng->goodLateStart = lateStart;
2529     eng->goodStr = eng->cs ? str : str.lower();
2530 
2531     eng->minl = minl;
2532     if ( eng->cs ) {
2533 	/*
2534 	  A regular expression such as 112|1 has occ1['2'] = 2 and minl =
2535 	  1 at this point. An entry of occ1 has to be at most minl or
2536 	  infinity for the rest of the algorithm to go well.
2537 
2538 	  We waited until here before normalizing these cases (instead of
2539 	  doing it in Box::orx()) because sometimes things improve by
2540 	  themselves. Consider for example (112|1)34.
2541 	*/
2542 	for ( int i = 0; i < NumBadChars; i++ ) {
2543 	    if ( occ1[i] != NoOccurrence && occ1[i] >= minl )
2544 		occ1[i] = minl;
2545 	}
2546 	eng->occ1 = occ1;
2547     } else {
2548 	eng->occ1.fill( 0, NumBadChars );
2549     }
2550 
2551     eng->heuristicallyChooseHeuristic();
2552 }
2553 #endif
2554 
2555 #if defined(QT_DEBUG)
dump() const2556 void QRegExpEngine::Box::dump() const
2557 {
2558     int i;
2559     qDebug( "Box of at least %d character%s", minl, minl == 1 ? "" : "s" );
2560     qDebug( "  Left states:" );
2561     for ( i = 0; i < (int) ls.size(); i++ ) {
2562 	if ( at(lanchors, ls[i]) == 0 )
2563 	    qDebug( "    %d", ls[i] );
2564 	else
2565 	    qDebug( "    %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]] );
2566     }
2567     qDebug( "  Right states:" );
2568     for ( i = 0; i < (int) rs.size(); i++ ) {
2569 	if ( at(ranchors, rs[i]) == 0 )
2570 	    qDebug( "    %d", rs[i] );
2571 	else
2572 	    qDebug( "    %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]] );
2573     }
2574     qDebug( "  Skip anchors: 0x%.8x", skipanchors );
2575 }
2576 #endif
2577 
addAnchorsToEngine(const Box & to) const2578 void QRegExpEngine::Box::addAnchorsToEngine( const Box& to ) const
2579 {
2580     for ( int i = 0; i < (int) to.ls.size(); i++ ) {
2581 	for ( int j = 0; j < (int) rs.size(); j++ ) {
2582 	    int a = eng->anchorConcatenation( at(ranchors, rs[j]),
2583 					      at(to.lanchors, to.ls[i]) );
2584 	    eng->addAnchors( rs[j], to.ls[i], a );
2585 	}
2586     }
2587 }
2588 
getChar()2589 int QRegExpEngine::getChar()
2590 {
2591     return ( yyPos == yyLen ) ? EOS : yyIn[yyPos++].unicode();
2592 }
2593 
getEscape()2594 int QRegExpEngine::getEscape()
2595 {
2596 #ifndef QT_NO_REGEXP_ESCAPE
2597     const char tab[] = "afnrtv"; // no b, as \b means word boundary
2598     const char backTab[] = "\a\f\n\r\t\v";
2599     ushort low;
2600     int i;
2601 #endif
2602     ushort val;
2603     int prevCh = yyCh;
2604 
2605     if ( prevCh == EOS ) {
2606 	error( RXERR_END );
2607 	return Tok_Char | '\\';
2608     }
2609     yyCh = getChar();
2610 #ifndef QT_NO_REGEXP_ESCAPE
2611     if ( (prevCh & ~0xff) == 0 ) {
2612 	const char *p = strchr( tab, prevCh );
2613 	if ( p != 0 )
2614 	    return Tok_Char | backTab[p - tab];
2615     }
2616 #endif
2617 
2618     switch ( prevCh ) {
2619 #ifndef QT_NO_REGEXP_ESCAPE
2620     case '0':
2621 	val = 0;
2622 	for ( i = 0; i < 3; i++ ) {
2623 	    if ( yyCh >= '0' && yyCh <= '7' )
2624 		val = ( val << 3 ) | ( yyCh - '0' );
2625 	    else
2626 		break;
2627 	    yyCh = getChar();
2628 	}
2629 	if ( (val & ~0377) != 0 )
2630 	    error( RXERR_OCTAL );
2631 	return Tok_Char | val;
2632 #endif
2633 #ifndef QT_NO_REGEXP_ESCAPE
2634     case 'B':
2635 	return Tok_NonWord;
2636 #endif
2637 #ifndef QT_NO_REGEXP_CCLASS
2638     case 'D':
2639 	// see QChar::isDigit()
2640 	yyCharClass->addCategories( 0x7fffffef );
2641 	return Tok_CharClass;
2642     case 'S':
2643 	// see QChar::isSpace()
2644 	yyCharClass->addCategories( 0x7ffff87f );
2645 	yyCharClass->addRange( 0x0000, 0x0008 );
2646 	yyCharClass->addRange( 0x000e, 0x001f );
2647 	yyCharClass->addRange( 0x007f, 0x009f );
2648 	return Tok_CharClass;
2649     case 'W':
2650 	// see QChar::isLetterOrNumber()
2651 	yyCharClass->addCategories( 0x7fe07f8f );
2652 	yyCharClass->addRange( 0x203f, 0x2040 );
2653 	yyCharClass->addSingleton( 0x2040 );
2654 	yyCharClass->addSingleton( 0x30fb );
2655 	yyCharClass->addRange( 0xfe33, 0xfe34 );
2656 	yyCharClass->addRange( 0xfe4d, 0xfe4f );
2657 	yyCharClass->addSingleton( 0xff3f );
2658 	yyCharClass->addSingleton( 0xff65 );
2659 	return Tok_CharClass;
2660 #endif
2661 #ifndef QT_NO_REGEXP_ESCAPE
2662     case 'b':
2663 	return Tok_Word;
2664 #endif
2665 #ifndef QT_NO_REGEXP_CCLASS
2666     case 'd':
2667 	// see QChar::isDigit()
2668 	yyCharClass->addCategories( 0x00000010 );
2669 	return Tok_CharClass;
2670     case 's':
2671 	// see QChar::isSpace()
2672 	yyCharClass->addCategories( 0x00000380 );
2673 	yyCharClass->addRange( 0x0009, 0x000d );
2674 	return Tok_CharClass;
2675     case 'w':
2676 	// see QChar::isLetterOrNumber()
2677 	yyCharClass->addCategories( 0x000f8070 );
2678 	yyCharClass->addSingleton( 0x005f ); // '_'
2679 	return Tok_CharClass;
2680 #endif
2681 #ifndef QT_NO_REGEXP_ESCAPE
2682     case 'x':
2683 	val = 0;
2684 	for ( i = 0; i < 4; i++ ) {
2685 	    low = QChar( yyCh ).lower();
2686 	    if ( low >= '0' && low <= '9' )
2687 		val = ( val << 4 ) | ( low - '0' );
2688 	    else if ( low >= 'a' && low <= 'f' )
2689 		val = ( val << 4 ) | ( low - 'a' + 10 );
2690 	    else
2691 		break;
2692 	    yyCh = getChar();
2693 	}
2694 	return Tok_Char | val;
2695 #endif
2696     default:
2697 	if ( prevCh >= '1' && prevCh <= '9' ) {
2698 #ifndef QT_NO_REGEXP_BACKREF
2699 	    val = prevCh - '0';
2700 	    while ( yyCh >= '0' && yyCh <= '9' ) {
2701 		val = ( val * 10 ) + ( yyCh - '0' );
2702 		yyCh = getChar();
2703 	    }
2704 	    return Tok_BackRef | val;
2705 #else
2706 	    error( RXERR_DISABLED );
2707 #endif
2708 	}
2709 	return Tok_Char | prevCh;
2710     }
2711 }
2712 
2713 #ifndef QT_NO_REGEXP_INTERVAL
getRep(int def)2714 int QRegExpEngine::getRep( int def )
2715 {
2716     if ( yyCh >= '0' && yyCh <= '9' ) {
2717 	int rep = 0;
2718 	do {
2719 	    rep = 10 * rep + yyCh - '0';
2720 	    if ( rep >= InftyRep ) {
2721 		error( RXERR_REPETITION );
2722 		rep = def;
2723 	    }
2724 	    yyCh = getChar();
2725 	} while ( yyCh >= '0' && yyCh <= '9' );
2726 	return rep;
2727     } else {
2728 	return def;
2729     }
2730 }
2731 #endif
2732 
2733 #ifndef QT_NO_REGEXP_LOOKAHEAD
skipChars(int n)2734 void QRegExpEngine::skipChars( int n )
2735 {
2736     if ( n > 0 ) {
2737 	yyPos += n - 1;
2738 	yyCh = getChar();
2739     }
2740 }
2741 #endif
2742 
error(const char * msg)2743 void QRegExpEngine::error( const char *msg )
2744 {
2745     if ( yyError.isEmpty() )
2746 	yyError = QString::fromLatin1( msg );
2747 }
2748 
startTokenizer(const QChar * rx,int len)2749 void QRegExpEngine::startTokenizer( const QChar *rx, int len )
2750 {
2751     yyIn = rx;
2752     yyPos0 = 0;
2753     yyPos = 0;
2754     yyLen = len;
2755     yyCh = getChar();
2756     yyCharClass = new CharClass;
2757     yyMinRep = 0;
2758     yyMaxRep = 0;
2759     yyError = QString();
2760 }
2761 
getToken()2762 int QRegExpEngine::getToken()
2763 {
2764 #ifndef QT_NO_REGEXP_CCLASS
2765     ushort pendingCh = 0;
2766     bool charPending;
2767     bool rangePending;
2768     int tok;
2769 #endif
2770     int prevCh = yyCh;
2771 
2772     yyPos0 = yyPos - 1;
2773 #ifndef QT_NO_REGEXP_CCLASS
2774     yyCharClass->clear();
2775 #endif
2776     yyMinRep = 0;
2777     yyMaxRep = 0;
2778     yyCh = getChar();
2779 
2780     switch ( prevCh ) {
2781     case EOS:
2782 	yyPos0 = yyPos;
2783 	return Tok_Eos;
2784     case '$':
2785 	return Tok_Dollar;
2786     case '(':
2787 	if ( yyCh == '?' ) {
2788 	    prevCh = getChar();
2789 	    yyCh = getChar();
2790 	    switch ( prevCh ) {
2791 #ifndef QT_NO_REGEXP_LOOKAHEAD
2792 	    case '!':
2793 		return Tok_NegLookahead;
2794 	    case '=':
2795 		return Tok_PosLookahead;
2796 #endif
2797 	    case ':':
2798 		return Tok_MagicLeftParen;
2799 	    default:
2800 		error( RXERR_LOOKAHEAD );
2801 		return Tok_MagicLeftParen;
2802 	    }
2803 	} else {
2804 	    return Tok_LeftParen;
2805 	}
2806     case ')':
2807 	return Tok_RightParen;
2808     case '*':
2809 	yyMinRep = 0;
2810 	yyMaxRep = InftyRep;
2811 	return Tok_Quantifier;
2812     case '+':
2813 	yyMinRep = 1;
2814 	yyMaxRep = InftyRep;
2815 	return Tok_Quantifier;
2816     case '.':
2817 #ifndef QT_NO_REGEXP_CCLASS
2818 	yyCharClass->setNegative( TRUE );
2819 #endif
2820 	return Tok_CharClass;
2821     case '?':
2822 	yyMinRep = 0;
2823 	yyMaxRep = 1;
2824 	return Tok_Quantifier;
2825     case '[':
2826 #ifndef QT_NO_REGEXP_CCLASS
2827 	if ( yyCh == '^' ) {
2828 	    yyCharClass->setNegative( TRUE );
2829 	    yyCh = getChar();
2830 	}
2831 	charPending = FALSE;
2832 	rangePending = FALSE;
2833 	do {
2834 	    if ( yyCh == '-' && charPending && !rangePending ) {
2835 		rangePending = TRUE;
2836 		yyCh = getChar();
2837 	    } else {
2838 		if ( charPending && !rangePending ) {
2839 		    yyCharClass->addSingleton( pendingCh );
2840 		    charPending = FALSE;
2841 		}
2842 		if ( yyCh == '\\' ) {
2843 		    yyCh = getChar();
2844 		    tok = getEscape();
2845 		    if ( tok == Tok_Word )
2846 			tok = '\b';
2847 		} else {
2848 		    tok = Tok_Char | yyCh;
2849 		    yyCh = getChar();
2850 		}
2851 		if ( tok == Tok_CharClass ) {
2852 		    if ( rangePending ) {
2853 			yyCharClass->addSingleton( '-' );
2854 			yyCharClass->addSingleton( pendingCh );
2855 			charPending = FALSE;
2856 			rangePending = FALSE;
2857 		    }
2858 		} else if ( (tok & Tok_Char) != 0 ) {
2859 		    if ( rangePending ) {
2860 			yyCharClass->addRange( pendingCh, tok ^ Tok_Char );
2861 			charPending = FALSE;
2862 			rangePending = FALSE;
2863 		    } else {
2864 			pendingCh = tok ^ Tok_Char;
2865 			charPending = TRUE;
2866 		    }
2867 		} else {
2868 		    error( RXERR_CHARCLASS );
2869 		}
2870 	    }
2871 	}  while ( yyCh != ']' && yyCh != EOS );
2872 	if ( rangePending )
2873 	    yyCharClass->addSingleton( '-' );
2874 	if ( charPending )
2875 	    yyCharClass->addSingleton( pendingCh );
2876 	if ( yyCh == EOS )
2877 	    error( RXERR_END );
2878 	else
2879 	    yyCh = getChar();
2880 	return Tok_CharClass;
2881 #else
2882 	error( RXERR_END );
2883 	return Tok_Char | '[';
2884 #endif
2885     case '\\':
2886 	return getEscape();
2887     case ']':
2888 	error( RXERR_LEFTDELIM );
2889 	return Tok_Char | ']';
2890     case '^':
2891 	return Tok_Caret;
2892     case '{':
2893 #ifndef QT_NO_REGEXP_INTERVAL
2894 	yyMinRep = getRep( 0 );
2895 	yyMaxRep = yyMinRep;
2896 	if ( yyCh == ',' ) {
2897 	    yyCh = getChar();
2898 	    yyMaxRep = getRep( InftyRep );
2899 	}
2900 	if ( yyMaxRep < yyMinRep )
2901 	    qSwap( yyMinRep, yyMaxRep );
2902 	if ( yyCh != '}' )
2903 	    error( RXERR_REPETITION );
2904 	yyCh = getChar();
2905 	return Tok_Quantifier;
2906 #else
2907 	error( RXERR_DISABLED );
2908 	return Tok_Char | '{';
2909 #endif
2910     case '|':
2911 	return Tok_Bar;
2912     case '}':
2913 	error( RXERR_LEFTDELIM );
2914 	return Tok_Char | '}';
2915     default:
2916 	return Tok_Char | prevCh;
2917     }
2918 }
2919 
parse(const QChar * pattern,int len)2920 int QRegExpEngine::parse( const QChar *pattern, int len )
2921 {
2922     valid = TRUE;
2923     startTokenizer( pattern, len );
2924     yyTok = getToken();
2925 #ifndef QT_NO_REGEXP_CAPTURE
2926     yyMayCapture = TRUE;
2927 #else
2928     yyMayCapture = FALSE;
2929 #endif
2930 
2931 #ifndef QT_NO_REGEXP_CAPTURE
2932     int atom = startAtom( FALSE );
2933 #endif
2934     CharClass anything;
2935     Box box( this ); // create InitialState
2936     box.set( anything );
2937     Box rightBox( this ); // create FinalState
2938     rightBox.set( anything );
2939 
2940     Box middleBox( this );
2941     parseExpression( &middleBox );
2942 #ifndef QT_NO_REGEXP_CAPTURE
2943     finishAtom( atom );
2944 #endif
2945 #ifndef QT_NO_REGEXP_OPTIM
2946     middleBox.setupHeuristics();
2947 #endif
2948     box.cat( middleBox );
2949     box.cat( rightBox );
2950     delete yyCharClass;
2951     yyCharClass = 0;
2952 
2953     officialncap = ncap;
2954 #ifndef QT_NO_REGEXP_BACKREF
2955     if ( nbrefs > ncap )
2956 	ncap = nbrefs;
2957 #endif
2958 
2959     /*
2960       We use one QMemArray<int> for all the big data used a lot in
2961       matchHere() and friends.
2962     */
2963 #ifndef QT_NO_REGEXP_OPTIM
2964     mmSlideTabSize = QMAX( minl + 1, 16 );
2965 #else
2966     mmSlideTabSize = 0;
2967 #endif
2968     mmBigArray.resize( (3 + 4 * ncap) * ns + 4 * ncap + mmSlideTabSize );
2969 
2970     mmInNextStack = mmBigArray.data();
2971     memset( mmInNextStack, -1, ns * sizeof(int) );
2972     mmCurStack = mmInNextStack + ns;
2973     mmNextStack = mmInNextStack + 2 * ns;
2974 
2975     mmCurCapBegin = mmInNextStack + 3 * ns;
2976     mmNextCapBegin = mmCurCapBegin + ncap * ns;
2977     mmCurCapEnd = mmCurCapBegin + 2 * ncap * ns;
2978     mmNextCapEnd = mmCurCapBegin + 3 * ncap * ns;
2979 
2980     mmTempCapBegin = mmCurCapBegin + 4 * ncap * ns;
2981     mmTempCapEnd = mmTempCapBegin + ncap;
2982     mmCapBegin = mmTempCapBegin + 2 * ncap;
2983     mmCapEnd = mmTempCapBegin + 3 * ncap;
2984 
2985     mmSlideTab = mmTempCapBegin + 4 * ncap;
2986 
2987     if ( !yyError.isEmpty() )
2988 	return -1;
2989 
2990 #ifndef QT_NO_REGEXP_OPTIM
2991     State *sinit = s[InitialState];
2992     caretAnchored = ( sinit->anchors != 0 );
2993     if ( caretAnchored ) {
2994 	QMap<int, int>& anchors = *sinit->anchors;
2995 	QMap<int, int>::ConstIterator a;
2996 	for ( a = anchors.begin(); a != anchors.end(); ++a ) {
2997 	    if (
2998 #ifndef QT_NO_REGEXP_ANCHOR_ALT
2999 		 (*a & Anchor_Alternation) != 0 ||
3000 #endif
3001 		 (*a & Anchor_Caret) == 0 ) {
3002 		caretAnchored = FALSE;
3003 		break;
3004 	    }
3005 	}
3006     }
3007 #endif
3008     return yyPos0;
3009 }
3010 
parseAtom(Box * box)3011 void QRegExpEngine::parseAtom( Box *box )
3012 {
3013 #ifndef QT_NO_REGEXP_LOOKAHEAD
3014     QRegExpEngine *eng = 0;
3015     bool neg;
3016     int len;
3017 #endif
3018 
3019     if ( (yyTok & Tok_Char) != 0 ) {
3020 	box->set( QChar(yyTok ^ Tok_Char) );
3021     } else {
3022 #ifndef QT_NO_REGEXP_OPTIM
3023 	trivial = FALSE;
3024 #endif
3025 	switch ( yyTok ) {
3026 	case Tok_Dollar:
3027 	    box->catAnchor( Anchor_Dollar );
3028 	    break;
3029 	case Tok_Caret:
3030 	    box->catAnchor( Anchor_Caret );
3031 	    break;
3032 #ifndef QT_NO_REGEXP_LOOKAHEAD
3033 	case Tok_PosLookahead:
3034 	case Tok_NegLookahead:
3035 	    neg = ( yyTok == Tok_NegLookahead );
3036 	    eng = new QRegExpEngine( cs );
3037 	    len = eng->parse( yyIn + yyPos - 1, yyLen - yyPos + 1 );
3038 	    if ( len >= 0 )
3039 		skipChars( len );
3040 	    else
3041 		error( RXERR_LOOKAHEAD );
3042 	    box->catAnchor( addLookahead(eng, neg) );
3043 	    yyTok = getToken();
3044 	    if ( yyTok != Tok_RightParen )
3045 		error( RXERR_LOOKAHEAD );
3046 	    break;
3047 #endif
3048 #ifndef QT_NO_REGEXP_ESCAPE
3049 	case Tok_Word:
3050 	    box->catAnchor( Anchor_Word );
3051 	    break;
3052 	case Tok_NonWord:
3053 	    box->catAnchor( Anchor_NonWord );
3054 	    break;
3055 #endif
3056 	case Tok_LeftParen:
3057 	case Tok_MagicLeftParen:
3058 	    yyTok = getToken();
3059 	    parseExpression( box );
3060 	    if ( yyTok != Tok_RightParen )
3061 		error( RXERR_END );
3062 	    break;
3063 	case Tok_CharClass:
3064 	    box->set( *yyCharClass );
3065 	    break;
3066 	case Tok_Quantifier:
3067 	    error( RXERR_REPETITION );
3068 	    break;
3069 	default:
3070 #ifndef QT_NO_REGEXP_BACKREF
3071 	    if ( (yyTok & Tok_BackRef) != 0 )
3072 		box->set( yyTok ^ Tok_BackRef );
3073 	    else
3074 #endif
3075 		error( RXERR_DISABLED );
3076 	}
3077     }
3078     yyTok = getToken();
3079 }
3080 
parseFactor(Box * box)3081 void QRegExpEngine::parseFactor( Box *box )
3082 {
3083 #ifndef QT_NO_REGEXP_CAPTURE
3084     int atom = startAtom( yyMayCapture && yyTok == Tok_LeftParen );
3085 #else
3086     static const int atom = 0;
3087 #endif
3088 
3089 #ifndef QT_NO_REGEXP_INTERVAL
3090 #define YYREDO() \
3091 	yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \
3092 	*yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok
3093 
3094     const QChar *in = yyIn;
3095     int pos0 = yyPos0;
3096     int pos = yyPos;
3097     int len = yyLen;
3098     int ch = yyCh;
3099     CharClass charClass;
3100     if ( yyTok == Tok_CharClass )
3101 	charClass = *yyCharClass;
3102     int tok = yyTok;
3103     bool mayCapture = yyMayCapture;
3104 #endif
3105 
3106     parseAtom( box );
3107 #ifndef QT_NO_REGEXP_CAPTURE
3108     finishAtom( atom );
3109 #endif
3110 
3111     if ( yyTok == Tok_Quantifier ) {
3112 #ifndef QT_NO_REGEXP_OPTIM
3113 	trivial = FALSE;
3114 #endif
3115 	if ( yyMaxRep == InftyRep ) {
3116 	    box->plus( atom );
3117 #ifndef QT_NO_REGEXP_INTERVAL
3118 	} else if ( yyMaxRep == 0 ) {
3119 	    box->clear();
3120 #endif
3121 	}
3122 	if ( yyMinRep == 0 )
3123 	    box->opt();
3124 
3125 #ifndef QT_NO_REGEXP_INTERVAL
3126 	yyMayCapture = FALSE;
3127 	int alpha = ( yyMinRep == 0 ) ? 0 : yyMinRep - 1;
3128 	int beta = ( yyMaxRep == InftyRep ) ? 0 : yyMaxRep - ( alpha + 1 );
3129 
3130 	Box rightBox( this );
3131 	int i;
3132 
3133 	for ( i = 0; i < beta; i++ ) {
3134 	    YYREDO();
3135 	    Box leftBox( this );
3136 	    parseAtom( &leftBox );
3137 	    leftBox.cat( rightBox );
3138 	    leftBox.opt();
3139 	    rightBox = leftBox;
3140 	}
3141 	for ( i = 0; i < alpha; i++ ) {
3142 	    YYREDO();
3143 	    Box leftBox( this );
3144 	    parseAtom( &leftBox );
3145 	    leftBox.cat( rightBox );
3146 	    rightBox = leftBox;
3147 	}
3148 	rightBox.cat( *box );
3149 	*box = rightBox;
3150 #endif
3151 	yyTok = getToken();
3152 #ifndef QT_NO_REGEXP_INTERVAL
3153 	yyMayCapture = mayCapture;
3154 #endif
3155     }
3156 #undef YYREDO
3157 }
3158 
parseTerm(Box * box)3159 void QRegExpEngine::parseTerm( Box *box )
3160 {
3161 #ifndef QT_NO_REGEXP_OPTIM
3162     if ( yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar )
3163 	parseFactor( box );
3164 #endif
3165     while ( yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar ) {
3166 	Box rightBox( this );
3167 	parseFactor( &rightBox );
3168 	box->cat( rightBox );
3169     }
3170 }
3171 
parseExpression(Box * box)3172 void QRegExpEngine::parseExpression( Box *box )
3173 {
3174     parseTerm( box );
3175     while ( yyTok == Tok_Bar ) {
3176 #ifndef QT_NO_REGEXP_OPTIM
3177 	trivial = FALSE;
3178 #endif
3179 	Box rightBox( this );
3180 	yyTok = getToken();
3181 	parseTerm( &rightBox );
3182 	box->orx( rightBox );
3183     }
3184 }
3185 
3186 /*
3187   The struct QRegExpPrivate contains the private data of a regular
3188   expression other than the automaton. It makes it possible for many
3189   QRegExp objects to use the same QRegExpEngine object with different
3190   QRegExpPrivate objects.
3191 */
3192 struct QRegExpPrivate
3193 {
3194     QString pattern; // regular-expression or wildcard pattern
3195     QString rxpattern; // regular-expression pattern
3196 #ifndef QT_NO_REGEXP_WILDCARD
3197     bool wc : 1; // wildcard mode?
3198 #endif
3199     bool min : 1; // minimal matching? (instead of maximal)
3200     bool cs : 1; // case sensitive?
3201 #ifndef QT_NO_REGEXP_CAPTURE
3202     QString t; // last string passed to QRegExp::search() or searchRev()
3203     QStringList capturedCache; // what QRegExp::capturedTexts() returned last
3204 #endif
3205     QMemArray<int> captured; // what QRegExpEngine::search() returned last
3206 
QRegExpPrivateQRegExpPrivate3207     QRegExpPrivate() { captured.fill( -1, 2 ); }
3208 };
3209 
3210 #ifndef QT_NO_REGEXP_OPTIM
3211 static QSingleCleanupHandler<QCache<QRegExpEngine> > cleanup_cache;
3212 #  ifndef QT_THREAD_SUPPORT
3213 static QCache<QRegExpEngine> *engineCache = 0;
3214 #  endif // QT_THREAD_SUPPORT
3215 #endif // QT_NO_REGEXP_OPTIM
3216 
regexpEngine(QRegExpEngine * & eng,const QString & pattern,bool caseSensitive,bool deref)3217 static void regexpEngine( QRegExpEngine *&eng, const QString &pattern,
3218 			  bool caseSensitive, bool deref )
3219 {
3220 #  ifdef QT_THREAD_SUPPORT
3221     static QThreadStorage<QCache<QRegExpEngine> *> engineCaches;
3222     QCache<QRegExpEngine> *engineCache = 0;
3223     QThreadInstance *currentThread = QThreadInstance::current();
3224     if (currentThread)
3225         engineCache = engineCaches.localData();
3226 #endif // QT_THREAD_SUPPORT
3227 
3228     if ( !deref ) {
3229 #ifndef QT_NO_REGEXP_OPTIM
3230 #  ifdef QT_THREAD_SUPPORT
3231         if ( currentThread )
3232 #  endif
3233             {
3234                 if ( engineCache != 0 ) {
3235                     eng = engineCache->take( pattern );
3236                     if ( eng == 0 || eng->caseSensitive() != caseSensitive ) {
3237                         delete eng;
3238                     } else {
3239                         eng->ref();
3240                         return;
3241                     }
3242                 }
3243             }
3244 #endif // QT_NO_REGEXP_OPTIM
3245 	eng = new QRegExpEngine( pattern, caseSensitive );
3246 	return;
3247     }
3248 
3249     if ( eng->deref() ) {
3250 #ifndef QT_NO_REGEXP_OPTIM
3251 #  ifdef QT_THREAD_SUPPORT
3252         if ( currentThread )
3253 #  endif
3254             {
3255                 if ( engineCache == 0 ) {
3256                     engineCache = new QCache<QRegExpEngine>;
3257                     engineCache->setAutoDelete( TRUE );
3258 #  ifdef QT_THREAD_SUPPORT
3259                     engineCaches.setLocalData(engineCache);
3260 #  else
3261                     cleanup_cache.set( &engineCache );
3262 #  endif // !QT_THREAD_SUPPORT
3263                 }
3264                 if ( !pattern.isNull() &&
3265                      engineCache->insert(pattern, eng, 4 + pattern.length() / 4) )
3266                     return;
3267             }
3268 #else
3269 	Q_UNUSED( pattern );
3270 #endif // QT_NO_REGEXP_OPTIM
3271 	delete eng;
3272 	eng = 0;
3273     }
3274 }
3275 
3276 /*!
3277     \enum QRegExp::CaretMode
3278 
3279     The CaretMode enum defines the different meanings of the caret
3280     (<b>^</b>) in a regular expression. The possible values are:
3281 
3282     \value CaretAtZero
3283 	   The caret corresponds to index 0 in the searched string.
3284 
3285     \value CaretAtOffset
3286 	   The caret corresponds to the start offset of the search.
3287 
3288     \value CaretWontMatch
3289 	   The caret never matches.
3290 */
3291 
3292 /*!
3293     Constructs an empty regexp.
3294 
3295     \sa isValid() errorString()
3296 */
QRegExp()3297 QRegExp::QRegExp()
3298     : eng( 0 )
3299 {
3300     priv = new QRegExpPrivate;
3301 #ifndef QT_NO_REGEXP_WILDCARD
3302     priv->wc = FALSE;
3303 #endif
3304     priv->min = FALSE;
3305     priv->cs = TRUE;
3306 }
3307 
3308 /*!
3309     Constructs a regular expression object for the given \a pattern
3310     string. The pattern must be given using wildcard notation if \a
3311     wildcard is TRUE (default is FALSE). The pattern is case
3312     sensitive, unless \a caseSensitive is FALSE. Matching is greedy
3313     (maximal), but can be changed by calling setMinimal().
3314 
3315     \sa setPattern() setCaseSensitive() setWildcard() setMinimal()
3316 */
QRegExp(const QString & pattern,bool caseSensitive,bool wildcard)3317 QRegExp::QRegExp( const QString& pattern, bool caseSensitive, bool wildcard )
3318     : eng( 0 )
3319 {
3320     priv = new QRegExpPrivate;
3321     priv->pattern = pattern;
3322 #ifndef QT_NO_REGEXP_WILDCARD
3323     priv->wc = wildcard;
3324 #endif
3325     priv->min = FALSE;
3326     priv->cs = caseSensitive;
3327 }
3328 
3329 /*!
3330     Constructs a regular expression as a copy of \a rx.
3331 
3332     \sa operator=()
3333 */
QRegExp(const QRegExp & rx)3334 QRegExp::QRegExp( const QRegExp& rx )
3335     : eng( 0 )
3336 {
3337     priv = new QRegExpPrivate;
3338     operator=( rx );
3339 }
3340 
3341 /*!
3342     Destroys the regular expression and cleans up its internal data.
3343 */
~QRegExp()3344 QRegExp::~QRegExp()
3345 {
3346     invalidateEngine();
3347     delete priv;
3348 }
3349 
3350 /*!
3351     Copies the regular expression \a rx and returns a reference to the
3352     copy. The case sensitivity, wildcard and minimal matching options
3353     are also copied.
3354 */
operator =(const QRegExp & rx)3355 QRegExp& QRegExp::operator=( const QRegExp& rx )
3356 {
3357     QRegExpEngine *otherEng = rx.eng;
3358     if ( otherEng != 0 )
3359 	otherEng->ref();
3360     invalidateEngine();
3361     eng = otherEng;
3362     priv->pattern = rx.priv->pattern;
3363     priv->rxpattern = rx.priv->rxpattern;
3364 #ifndef QT_NO_REGEXP_WILDCARD
3365     priv->wc = rx.priv->wc;
3366 #endif
3367     priv->min = rx.priv->min;
3368     priv->cs = rx.priv->cs;
3369 #ifndef QT_NO_REGEXP_CAPTURE
3370     priv->t = rx.priv->t;
3371     priv->capturedCache = rx.priv->capturedCache;
3372 #endif
3373     priv->captured = rx.priv->captured;
3374     return *this;
3375 }
3376 
3377 /*!
3378     Returns TRUE if this regular expression is equal to \a rx;
3379     otherwise returns FALSE.
3380 
3381     Two QRegExp objects are equal if they have the same pattern
3382     strings and the same settings for case sensitivity, wildcard and
3383     minimal matching.
3384 */
operator ==(const QRegExp & rx) const3385 bool QRegExp::operator==( const QRegExp& rx ) const
3386 {
3387     return priv->pattern == rx.priv->pattern &&
3388 #ifndef QT_NO_REGEXP_WILDCARD
3389 	   priv->wc == rx.priv->wc &&
3390 #endif
3391 	   priv->min == rx.priv->min &&
3392 	   priv->cs == rx.priv->cs;
3393 }
3394 
3395 /*!
3396     \fn bool QRegExp::operator!=( const QRegExp& rx ) const
3397 
3398     Returns TRUE if this regular expression is not equal to \a rx;
3399     otherwise returns FALSE.
3400 
3401     \sa operator==()
3402 */
3403 
3404 /*!
3405     Returns TRUE if the pattern string is empty; otherwise returns
3406     FALSE.
3407 
3408     If you call exactMatch() with an empty pattern on an empty string
3409     it will return TRUE; otherwise it returns FALSE since it operates
3410     over the whole string. If you call search() with an empty pattern
3411     on \e any string it will return the start offset (0 by default)
3412     because the empty pattern matches the 'emptiness' at the start of
3413     the string. In this case the length of the match returned by
3414     matchedLength() will be 0.
3415 
3416     See QString::isEmpty().
3417 */
3418 
isEmpty() const3419 bool QRegExp::isEmpty() const
3420 {
3421     return priv->pattern.isEmpty();
3422 }
3423 
3424 /*!
3425     Returns TRUE if the regular expression is valid; otherwise returns
3426     FALSE. An invalid regular expression never matches.
3427 
3428     The pattern <b>[a-z</b> is an example of an invalid pattern, since
3429     it lacks a closing square bracket.
3430 
3431     Note that the validity of a regexp may also depend on the setting
3432     of the wildcard flag, for example <b>*.html</b> is a valid
3433     wildcard regexp but an invalid full regexp.
3434 
3435     \sa errorString()
3436 */
isValid() const3437 bool QRegExp::isValid() const
3438 {
3439     if ( priv->pattern.isEmpty() ) {
3440 	return TRUE;
3441     } else {
3442 	prepareEngine();
3443 	return eng->isValid();
3444     }
3445 }
3446 
3447 /*!
3448     Returns the pattern string of the regular expression. The pattern
3449     has either regular expression syntax or wildcard syntax, depending
3450     on wildcard().
3451 
3452     \sa setPattern()
3453 */
pattern() const3454 QString QRegExp::pattern() const
3455 {
3456     return priv->pattern;
3457 }
3458 
3459 /*!
3460     Sets the pattern string to \a pattern. The case sensitivity,
3461     wildcard and minimal matching options are not changed.
3462 
3463     \sa pattern()
3464 */
setPattern(const QString & pattern)3465 void QRegExp::setPattern( const QString& pattern )
3466 {
3467     if ( priv->pattern != pattern ) {
3468 	priv->pattern = pattern;
3469 	invalidateEngine();
3470     }
3471 }
3472 
3473 /*!
3474     Returns TRUE if case sensitivity is enabled; otherwise returns
3475     FALSE. The default is TRUE.
3476 
3477     \sa setCaseSensitive()
3478 */
caseSensitive() const3479 bool QRegExp::caseSensitive() const
3480 {
3481     return priv->cs;
3482 }
3483 
3484 /*!
3485     Sets case sensitive matching to \a sensitive.
3486 
3487     If \a sensitive is TRUE, <b>\\.txt$</b> matches \c{readme.txt} but
3488     not \c{README.TXT}.
3489 
3490     \sa caseSensitive()
3491 */
setCaseSensitive(bool sensitive)3492 void QRegExp::setCaseSensitive( bool sensitive )
3493 {
3494     if ( sensitive != priv->cs ) {
3495 	priv->cs = sensitive;
3496 	invalidateEngine();
3497     }
3498 }
3499 
3500 #ifndef QT_NO_REGEXP_WILDCARD
3501 /*!
3502     Returns TRUE if wildcard mode is enabled; otherwise returns FALSE.
3503     The default is FALSE.
3504 
3505     \sa setWildcard()
3506 */
wildcard() const3507 bool QRegExp::wildcard() const
3508 {
3509     return priv->wc;
3510 }
3511 
3512 /*!
3513     Sets the wildcard mode for the regular expression. The default is
3514     FALSE.
3515 
3516     Setting \a wildcard to TRUE enables simple shell-like wildcard
3517     matching. (See \link #wildcard-matching wildcard matching
3518     (globbing) \endlink.)
3519 
3520     For example, <b>r*.txt</b> matches the string \c{readme.txt} in
3521     wildcard mode, but does not match \c{readme}.
3522 
3523     \sa wildcard()
3524 */
setWildcard(bool wildcard)3525 void QRegExp::setWildcard( bool wildcard )
3526 {
3527     if ( wildcard != priv->wc ) {
3528 	priv->wc = wildcard;
3529 	invalidateEngine();
3530     }
3531 }
3532 #endif
3533 
3534 /*!
3535     Returns TRUE if minimal (non-greedy) matching is enabled;
3536     otherwise returns FALSE.
3537 
3538     \sa setMinimal()
3539 */
minimal() const3540 bool QRegExp::minimal() const
3541 {
3542     return priv->min;
3543 }
3544 
3545 /*!
3546     Enables or disables minimal matching. If \a minimal is FALSE,
3547     matching is greedy (maximal) which is the default.
3548 
3549     For example, suppose we have the input string "We must be
3550     \<b>bold\</b>, very \<b>bold\</b>!" and the pattern
3551     <b>\<b>.*\</b></b>. With the default greedy (maximal) matching,
3552     the match is "We must be <u>\<b>bold\</b>, very
3553     \<b>bold\</b></u>!". But with minimal (non-greedy) matching the
3554     first match is: "We must be <u>\<b>bold\</b></u>, very
3555     \<b>bold\</b>!" and the second match is "We must be \<b>bold\</b>,
3556     very <u>\<b>bold\</b></u>!". In practice we might use the pattern
3557     <b>\<b>[^\<]+\</b></b> instead, although this will still fail for
3558     nested tags.
3559 
3560     \sa minimal()
3561 */
setMinimal(bool minimal)3562 void QRegExp::setMinimal( bool minimal )
3563 {
3564     priv->min = minimal;
3565 }
3566 
3567 /*!
3568     Returns TRUE if \a str is matched exactly by this regular
3569     expression; otherwise returns FALSE. You can determine how much of
3570     the string was matched by calling matchedLength().
3571 
3572     For a given regexp string, R, exactMatch("R") is the equivalent of
3573     search("^R$") since exactMatch() effectively encloses the regexp
3574     in the start of string and end of string anchors, except that it
3575     sets matchedLength() differently.
3576 
3577     For example, if the regular expression is <b>blue</b>, then
3578     exactMatch() returns TRUE only for input \c blue. For inputs \c
3579     bluebell, \c blutak and \c lightblue, exactMatch() returns FALSE
3580     and matchedLength() will return 4, 3 and 0 respectively.
3581 
3582     Although const, this function sets matchedLength(),
3583     capturedTexts() and pos().
3584 
3585     \sa search() searchRev() QRegExpValidator
3586 */
exactMatch(const QString & str) const3587 bool QRegExp::exactMatch( const QString& str ) const
3588 {
3589     prepareEngineForMatch( str );
3590     eng->match( str, 0, priv->min, TRUE, 0, priv->captured );
3591     if ( priv->captured[1] == (int) str.length() ) {
3592 	return TRUE;
3593     } else {
3594 	priv->captured[0] = 0;
3595 	priv->captured[1] = eng->partialMatchLength();
3596 	return FALSE;
3597     }
3598 }
3599 
3600 #ifndef QT_NO_COMPAT
3601 /*! \obsolete
3602 
3603   Attempts to match in \a str, starting from position \a index.
3604   Returns the position of the match, or -1 if there was no match.
3605 
3606   The length of the match is stored in \a *len, unless \a len is a
3607   null pointer.
3608 
3609   If \a indexIsStart is TRUE (the default), the position \a index in
3610   the string will match the start of string anchor, <b>^</b>, in the
3611   regexp, if present. Otherwise, position 0 in \a str will match.
3612 
3613   Use search() and matchedLength() instead of this function.
3614 
3615   \sa QString::mid() QConstString
3616 */
match(const QString & str,int index,int * len,bool indexIsStart) const3617 int QRegExp::match( const QString& str, int index, int *len,
3618 		    bool indexIsStart ) const
3619 {
3620     int pos = search( str, index, indexIsStart ? CaretAtOffset : CaretAtZero );
3621     if ( len != 0 )
3622 	*len = matchedLength();
3623     return pos;
3624 }
3625 #endif // QT_NO_COMPAT
3626 
search(const QString & str,int offset) const3627 int QRegExp::search( const QString& str, int offset ) const
3628 {
3629     return search( str, offset, CaretAtZero );
3630 }
3631 
3632 /*!
3633     Attempts to find a match in \a str from position \a offset (0 by
3634     default). If \a offset is -1, the search starts at the last
3635     character; if -2, at the next to last character; etc.
3636 
3637     Returns the position of the first match, or -1 if there was no
3638     match.
3639 
3640     The \a caretMode parameter can be used to instruct whether <b>^</b>
3641     should match at index 0 or at \a offset.
3642 
3643     You might prefer to use QString::find(), QString::contains() or
3644     even QStringList::grep(). To replace matches use
3645     QString::replace().
3646 
3647     Example:
3648     \code
3649 	QString str = "offsets: 1.23 .50 71.00 6.00";
3650 	QRegExp rx( "\\d*\\.\\d+" );    // primitive floating point matching
3651 	int count = 0;
3652 	int pos = 0;
3653 	while ( (pos = rx.search(str, pos)) != -1 ) {
3654 	    count++;
3655 	    pos += rx.matchedLength();
3656 	}
3657 	// pos will be 9, 14, 18 and finally 24; count will end up as 4
3658     \endcode
3659 
3660     Although const, this function sets matchedLength(),
3661     capturedTexts() and pos().
3662 
3663     \sa searchRev() exactMatch()
3664 */
3665 
search(const QString & str,int offset,CaretMode caretMode) const3666 int QRegExp::search( const QString& str, int offset, CaretMode caretMode ) const
3667 {
3668     prepareEngineForMatch( str );
3669     if ( offset < 0 )
3670 	offset += str.length();
3671     eng->match( str, offset, priv->min, FALSE, caretIndex(offset, caretMode),
3672 		priv->captured );
3673     return priv->captured[0];
3674 }
3675 
3676 
searchRev(const QString & str,int offset) const3677 int QRegExp::searchRev( const QString& str, int offset ) const
3678 {
3679     return searchRev( str, offset, CaretAtZero );
3680 }
3681 
3682 /*!
3683     Attempts to find a match backwards in \a str from position \a
3684     offset. If \a offset is -1 (the default), the search starts at the
3685     last character; if -2, at the next to last character; etc.
3686 
3687     Returns the position of the first match, or -1 if there was no
3688     match.
3689 
3690     The \a caretMode parameter can be used to instruct whether <b>^</b>
3691     should match at index 0 or at \a offset.
3692 
3693     Although const, this function sets matchedLength(),
3694     capturedTexts() and pos().
3695 
3696     \warning Searching backwards is much slower than searching
3697     forwards.
3698 
3699     \sa search() exactMatch()
3700 */
3701 
searchRev(const QString & str,int offset,CaretMode caretMode) const3702 int QRegExp::searchRev( const QString& str, int offset,
3703 			CaretMode caretMode ) const
3704 {
3705     prepareEngineForMatch( str );
3706     if ( offset < 0 )
3707 	offset += str.length();
3708     if ( offset < 0 || offset > (int) str.length() ) {
3709 	priv->captured.detach();
3710 	priv->captured.fill( -1 );
3711 	return -1;
3712     }
3713 
3714     while ( offset >= 0 ) {
3715 	eng->match( str, offset, priv->min, TRUE, caretIndex(offset, caretMode),
3716 		    priv->captured );
3717 	if ( priv->captured[0] == offset )
3718 	    return offset;
3719 	offset--;
3720     }
3721     return -1;
3722 }
3723 
3724 /*!
3725     Returns the length of the last matched string, or -1 if there was
3726     no match.
3727 
3728     \sa exactMatch() search() searchRev()
3729 */
matchedLength() const3730 int QRegExp::matchedLength() const
3731 {
3732     return priv->captured[1];
3733 }
3734 
3735 #ifndef QT_NO_REGEXP_CAPTURE
3736 /*!
3737   Returns the number of captures contained in the regular expression.
3738  */
numCaptures() const3739 int QRegExp::numCaptures() const
3740 {
3741     prepareEngine();
3742     return eng->numCaptures();
3743 }
3744 
3745 /*!
3746     Returns a list of the captured text strings.
3747 
3748     The first string in the list is the entire matched string. Each
3749     subsequent list element contains a string that matched a
3750     (capturing) subexpression of the regexp.
3751 
3752     For example:
3753     \code
3754 	QRegExp rx( "(\\d+)(\\s*)(cm|inch(es)?)" );
3755 	int pos = rx.search( "Length: 36 inches" );
3756 	QStringList list = rx.capturedTexts();
3757 	// list is now ( "36 inches", "36", " ", "inches", "es" )
3758     \endcode
3759 
3760     The above example also captures elements that may be present but
3761     which we have no interest in. This problem can be solved by using
3762     non-capturing parentheses:
3763 
3764     \code
3765 	QRegExp rx( "(\\d+)(?:\\s*)(cm|inch(?:es)?)" );
3766 	int pos = rx.search( "Length: 36 inches" );
3767 	QStringList list = rx.capturedTexts();
3768 	// list is now ( "36 inches", "36", "inches" )
3769     \endcode
3770 
3771     Note that if you want to iterate over the list, you should iterate
3772     over a copy, e.g.
3773     \code
3774 	QStringList list = rx.capturedTexts();
3775 	QStringList::Iterator it = list.begin();
3776 	while( it != list.end() ) {
3777 	    myProcessing( *it );
3778 	    ++it;
3779 	}
3780     \endcode
3781 
3782     Some regexps can match an indeterminate number of times. For
3783     example if the input string is "Offsets: 12 14 99 231 7" and the
3784     regexp, \c{rx}, is <b>(\\d+)+</b>, we would hope to get a list of
3785     all the numbers matched. However, after calling
3786     \c{rx.search(str)}, capturedTexts() will return the list ( "12",
3787     "12" ), i.e. the entire match was "12" and the first subexpression
3788     matched was "12". The correct approach is to use cap() in a \link
3789     #cap_in_a_loop loop \endlink.
3790 
3791     The order of elements in the string list is as follows. The first
3792     element is the entire matching string. Each subsequent element
3793     corresponds to the next capturing open left parentheses. Thus
3794     capturedTexts()[1] is the text of the first capturing parentheses,
3795     capturedTexts()[2] is the text of the second and so on
3796     (corresponding to $1, $2, etc., in some other regexp languages).
3797 
3798     \sa cap() pos() exactMatch() search() searchRev()
3799 */
capturedTexts()3800 QStringList QRegExp::capturedTexts()
3801 {
3802     if ( priv->capturedCache.isEmpty() ) {
3803 	for ( int i = 0; i < (int) priv->captured.size(); i += 2 ) {
3804 	    QString m;
3805 	    if ( priv->captured[i + 1] == 0 )
3806 		m = QString::fromLatin1( "" );
3807 	    else if ( priv->captured[i] >= 0 )
3808 		m = priv->t.mid( priv->captured[i],
3809 				 priv->captured[i + 1] );
3810 	    priv->capturedCache.append( m );
3811 	}
3812 	priv->t = QString::null;
3813     }
3814     return priv->capturedCache;
3815 }
3816 
3817 /*!
3818     Returns the text captured by the \a nth subexpression. The entire
3819     match has index 0 and the parenthesized subexpressions have
3820     indices starting from 1 (excluding non-capturing parentheses).
3821 
3822     \code
3823     QRegExp rxlen( "(\\d+)(?:\\s*)(cm|inch)" );
3824     int pos = rxlen.search( "Length: 189cm" );
3825     if ( pos > -1 ) {
3826 	QString value = rxlen.cap( 1 ); // "189"
3827 	QString unit = rxlen.cap( 2 );  // "cm"
3828 	// ...
3829     }
3830     \endcode
3831 
3832     The order of elements matched by cap() is as follows. The first
3833     element, cap(0), is the entire matching string. Each subsequent
3834     element corresponds to the next capturing open left parentheses.
3835     Thus cap(1) is the text of the first capturing parentheses, cap(2)
3836     is the text of the second, and so on.
3837 
3838     \target cap_in_a_loop
3839     Some patterns may lead to a number of matches which cannot be
3840     determined in advance, for example:
3841 
3842     \code
3843     QRegExp rx( "(\\d+)" );
3844     str = "Offsets: 12 14 99 231 7";
3845     QStringList list;
3846     pos = 0;
3847     while ( pos >= 0 ) {
3848 	pos = rx.search( str, pos );
3849 	if ( pos > -1 ) {
3850 	    list += rx.cap( 1 );
3851 	    pos  += rx.matchedLength();
3852 	}
3853     }
3854     // list contains "12", "14", "99", "231", "7"
3855     \endcode
3856 
3857     \sa capturedTexts() pos() exactMatch() search() searchRev()
3858 */
cap(int nth)3859 QString QRegExp::cap( int nth )
3860 {
3861     if ( nth < 0 || nth >= (int) priv->captured.size() / 2 ) {
3862 	return QString::null;
3863     } else {
3864 	return capturedTexts()[nth];
3865     }
3866 }
3867 
3868 /*!
3869     Returns the position of the \a nth captured text in the searched
3870     string. If \a nth is 0 (the default), pos() returns the position
3871     of the whole match.
3872 
3873     Example:
3874     \code
3875     QRegExp rx( "/([a-z]+)/([a-z]+)" );
3876     rx.search( "Output /dev/null" );    // returns 7 (position of /dev/null)
3877     rx.pos( 0 );                        // returns 7 (position of /dev/null)
3878     rx.pos( 1 );                        // returns 8 (position of dev)
3879     rx.pos( 2 );                        // returns 12 (position of null)
3880     \endcode
3881 
3882     For zero-length matches, pos() always returns -1. (For example, if
3883     cap(4) would return an empty string, pos(4) returns -1.) This is
3884     due to an implementation tradeoff.
3885 
3886     \sa capturedTexts() exactMatch() search() searchRev()
3887 */
pos(int nth)3888 int QRegExp::pos( int nth )
3889 {
3890     if ( nth < 0 || nth >= (int) priv->captured.size() / 2 )
3891 	return -1;
3892     else
3893 	return priv->captured[2 * nth];
3894 }
3895 
3896 /*!
3897   Returns a text string that explains why a regexp pattern is
3898   invalid the case being; otherwise returns "no error occurred".
3899 
3900   \sa isValid()
3901 */
errorString()3902 QString QRegExp::errorString()
3903 {
3904     if ( isValid() ) {
3905 	return QString( RXERR_OK );
3906     } else {
3907 	return eng->errorString();
3908     }
3909 }
3910 #endif
3911 
3912 /*!
3913   Returns the string \a str with every regexp special character
3914   escaped with a backslash. The special characters are $, (, ), *, +,
3915   ., ?, [, \, ], ^, {, | and }.
3916 
3917   Example:
3918   \code
3919      s1 = QRegExp::escape( "bingo" );   // s1 == "bingo"
3920      s2 = QRegExp::escape( "f(x)" );    // s2 == "f\\(x\\)"
3921   \endcode
3922 
3923   This function is useful to construct regexp patterns dynamically:
3924 
3925   \code
3926     QRegExp rx( "(" + QRegExp::escape(name) +
3927 		"|" + QRegExp::escape(alias) + ")" );
3928   \endcode
3929 */
escape(const QString & str)3930 QString QRegExp::escape( const QString& str )
3931 {
3932     static const char meta[] = "$()*+.?[\\]^{|}";
3933     QString quoted = str;
3934     int i = 0;
3935 
3936     while ( i < (int) quoted.length() ) {
3937 	if ( strchr(meta, quoted[i].latin1()) != 0 )
3938 	    quoted.insert( i++, "\\" );
3939 	i++;
3940     }
3941     return quoted;
3942 }
3943 
prepareEngine() const3944 void QRegExp::prepareEngine() const
3945 {
3946     if ( eng == 0 ) {
3947 #ifndef QT_NO_REGEXP_WILDCARD
3948 	if ( priv->wc )
3949 	    priv->rxpattern = wc2rx( priv->pattern );
3950 	else
3951 #endif
3952 	    priv->rxpattern = priv->pattern.isNull() ? QString::fromLatin1( "" )
3953 			      : priv->pattern;
3954 	QRegExp *that = (QRegExp *) this;
3955 	// that->eng = newEngine( priv->rxpattern, priv->cs );
3956 	regexpEngine( that->eng, priv->rxpattern, priv->cs, FALSE );
3957 	priv->captured.detach();
3958 	priv->captured.fill( -1, 2 + 2 * eng->numCaptures() );
3959     }
3960 }
3961 
prepareEngineForMatch(const QString & str) const3962 void QRegExp::prepareEngineForMatch( const QString& str ) const
3963 {
3964     prepareEngine();
3965 #ifndef QT_NO_REGEXP_CAPTURE
3966     priv->t = str;
3967     priv->capturedCache.clear();
3968 #else
3969     Q_UNUSED( str );
3970 #endif
3971 }
3972 
invalidateEngine()3973 void QRegExp::invalidateEngine()
3974 {
3975     if ( eng != 0 ) {
3976 	regexpEngine( eng, priv->rxpattern, priv->cs, TRUE );
3977 	priv->rxpattern = QString();
3978 	eng = 0;
3979     }
3980 }
3981 
caretIndex(int offset,CaretMode caretMode)3982 int QRegExp::caretIndex( int offset, CaretMode caretMode )
3983 {
3984     if ( caretMode == CaretAtZero ) {
3985 	return 0;
3986     } else if ( caretMode == CaretAtOffset ) {
3987 	return offset;
3988     } else { // CaretWontMatch
3989 	return -1;
3990     }
3991 }
3992 
3993 #endif // QT_NO_REGEXP
3994