1 /****************************************************************************
2 **
3 ** Copyright (C) 2015 The Qt Company Ltd.
4 ** Contact: http://www.qt.io/licensing/
5 **
6 ** This file is part of the QtCore module of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and The Qt Company. For licensing terms
14 ** and conditions see http://www.qt.io/terms-conditions. For further
15 ** information use the contact form at http://www.qt.io/contact-us.
16 **
17 ** GNU Lesser General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU Lesser
19 ** General Public License version 2.1 or version 3 as published by the Free
20 ** Software Foundation and appearing in the file LICENSE.LGPLv21 and
21 ** LICENSE.LGPLv3 included in the packaging of this file. Please review the
22 ** following information to ensure the GNU Lesser General Public License
23 ** requirements will be met: https://www.gnu.org/licenses/lgpl.html and
24 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
25 **
26 ** As a special exception, The Qt Company gives you certain additional
27 ** rights. These rights are described in The Qt Company LGPL Exception
28 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
29 **
30 ** GNU General Public License Usage
31 ** Alternatively, this file may be used under the terms of the GNU
32 ** General Public License version 3.0 as published by the Free Software
33 ** Foundation and appearing in the file LICENSE.GPL included in the
34 ** packaging of this file.  Please review the following information to
35 ** ensure the GNU General Public License version 3.0 requirements will be
36 ** met: http://www.gnu.org/copyleft/gpl.html.
37 **
38 ** $QT_END_LICENSE$
39 **
40 ****************************************************************************/
41 
42 #include "qregexp.h"
43 
44 #include "qalgorithms.h"
45 #include "qbitarray.h"
46 #include "qcache.h"
47 #include "qdatastream.h"
48 #include "qlist.h"
49 #include "qmap.h"
50 #include "qmutex.h"
51 #include "qstring.h"
52 #include "qstringlist.h"
53 #include "qstringmatcher.h"
54 #include "qvector.h"
55 #include "private/qfunctions_p.h"
56 
57 #include <limits.h>
58 
59 QT_BEGIN_NAMESPACE
60 
61 int qFindString(const QChar *haystack, int haystackLen, int from,
62     const QChar *needle, int needleLen, Qt::CaseSensitivity cs);
63 
64 // error strings for the regexp parser
65 #define RXERR_OK         QT_TRANSLATE_NOOP("QRegExp", "no error occurred")
66 #define RXERR_DISABLED   QT_TRANSLATE_NOOP("QRegExp", "disabled feature used")
67 #define RXERR_CHARCLASS  QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax")
68 #define RXERR_LOOKAHEAD  QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax")
69 #define RXERR_LOOKBEHIND QT_TRANSLATE_NOOP("QRegExp", "lookbehinds not supported, see QTBUG-2371")
70 #define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax")
71 #define RXERR_OCTAL      QT_TRANSLATE_NOOP("QRegExp", "invalid octal value")
72 #define RXERR_LEFTDELIM  QT_TRANSLATE_NOOP("QRegExp", "missing left delim")
73 #define RXERR_END        QT_TRANSLATE_NOOP("QRegExp", "unexpected end")
74 #define RXERR_LIMIT      QT_TRANSLATE_NOOP("QRegExp", "met internal limit")
75 #define RXERR_INTERVAL   QT_TRANSLATE_NOOP("QRegExp", "invalid interval")
76 #define RXERR_CATEGORY   QT_TRANSLATE_NOOP("QRegExp", "invalid category")
77 
78 /*!
79     \class QRegExp
80     \reentrant
81     \brief The QRegExp class provides pattern matching using regular expressions.
82 
83     \ingroup tools
84     \ingroup shared
85 
86     \keyword regular expression
87 
88     A regular expression, or "regexp", is a pattern for matching
89     substrings in a text. This is useful in many contexts, e.g.,
90 
91     \table
92     \row \i Validation
93          \i A regexp can test whether a substring meets some criteria,
94          e.g. is an integer or contains no whitespace.
95     \row \i Searching
96          \i A regexp provides more powerful pattern matching than
97          simple substring matching, e.g., match one of the words
98          \e{mail}, \e{letter} or \e{correspondence}, but none of the
99          words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc.
100      \row \i Search and Replace
101          \i A regexp can replace all occurrences of a substring with a
102          different substring, e.g., replace all occurrences of \e{&}
103          with \e{\&amp;} except where the \e{&} is already followed by
104          an \e{amp;}.
105     \row \i String Splitting
106          \i A regexp can be used to identify where a string should be
107          split apart, e.g. splitting tab-delimited strings.
108     \endtable
109 
110     A brief introduction to regexps is presented, a description of
111     Qt's regexp language, some examples, and the function
112     documentation itself. QRegExp is modeled on Perl's regexp
113     language. It fully supports Unicode. QRegExp can also be used in a
114     simpler, \e{wildcard mode} that is similar to the functionality
115     found in command shells. The syntax rules used by QRegExp can be
116     changed with setPatternSyntax(). In particular, the pattern syntax
117     can be set to QRegExp::FixedString, which means the pattern to be
118     matched is interpreted as a plain string, i.e., special characters
119     (e.g., backslash) are not escaped.
120 
121     A good text on regexps is \e {Mastering Regular Expressions}
122     (Third Edition) by Jeffrey E. F.  Friedl, ISBN 0-596-52812-4.
123 
124     \tableofcontents
125 
126     \section1 Introduction
127 
128     Regexps are built up from expressions, quantifiers, and
129     assertions. The simplest expression is a character, e.g. \bold{x}
130     or \bold{5}. An expression can also be a set of characters
131     enclosed in square brackets. \bold{[ABCD]} will match an \bold{A}
132     or a \bold{B} or a \bold{C} or a \bold{D}. We can write this same
133     expression as \bold{[A-D]}, and an experession to match any
134     captital letter in the English alphabet is written as
135     \bold{[A-Z]}.
136 
137     A quantifier specifies the number of occurrences of an expression
138     that must be matched. \bold{x{1,1}} means match one and only one
139     \bold{x}. \bold{x{1,5}} means match a sequence of \bold{x}
140     characters that contains at least one \bold{x} but no more than
141     five.
142 
143     Note that in general regexps cannot be used to check for balanced
144     brackets or tags. For example, a regexp can be written to match an
145     opening html \c{<b>} and its closing \c{</b>}, if the \c{<b>} tags
146     are not nested, but if the \c{<b>} tags are nested, that same
147     regexp will match an opening \c{<b>} tag with the wrong closing
148     \c{</b>}.  For the fragment \c{<b>bold <b>bolder</b></b>}, the
149     first \c{<b>} would be matched with the first \c{</b>}, which is
150     not correct. However, it is possible to write a regexp that will
151     match nested brackets or tags correctly, but only if the number of
152     nesting levels is fixed and known. If the number of nesting levels
153     is not fixed and known, it is impossible to write a regexp that
154     will not fail.
155 
156     Suppose we want a regexp to match integers in the range 0 to 99.
157     At least one digit is required, so we start with the expression
158     \bold{[0-9]{1,1}}, which matches a single digit exactly once. This
159     regexp matches integers in the range 0 to 9. To match integers up
160     to 99, increase the maximum number of occurrences to 2, so the
161     regexp becomes \bold{[0-9]{1,2}}. This regexp satisfies the
162     original requirement to match integers from 0 to 99, but it will
163     also match integers that occur in the middle of strings. If we
164     want the matched integer to be the whole string, we must use the
165     anchor assertions, \bold{^} (caret) and \bold{$} (dollar). When
166     \bold{^} is the first character in a regexp, it means the regexp
167     must match from the beginning of the string. When \bold{$} is the
168     last character of the regexp, it means the regexp must match to
169     the end of the string. The regexp becomes \bold{^[0-9]{1,2}$}.
170     Note that assertions, e.g. \bold{^} and \bold{$}, do not match
171     characters but locations in the string.
172 
173     If you have seen regexps described elsewhere, they may have looked
174     different from the ones shown here. This is because some sets of
175     characters and some quantifiers are so common that they have been
176     given special symbols to represent them. \bold{[0-9]} can be
177     replaced with the symbol \bold{\\d}. The quantifier to match
178     exactly one occurrence, \bold{{1,1}}, can be replaced with the
179     expression itself, i.e. \bold{x{1,1}} is the same as \bold{x}. So
180     our 0 to 99 matcher could be written as \bold{^\\d{1,2}$}. It can
181     also be written \bold{^\\d\\d{0,1}$}, i.e. \e{From the start of
182     the string, match a digit, followed immediately by 0 or 1 digits}.
183     In practice, it would be written as \bold{^\\d\\d?$}. The \bold{?}
184     is shorthand for the quantifier \bold{{0,1}}, i.e. 0 or 1
185     occurrences. \bold{?} makes an expression optional. The regexp
186     \bold{^\\d\\d?$} means \e{From the beginning of the string, match
187     one digit, followed immediately by 0 or 1 more digit, followed
188     immediately by end of string}.
189 
190     To write a regexp that matches one of the words 'mail' \e or
191     'letter' \e or 'correspondence' but does not match words that
192     contain these words, e.g., 'email', 'mailman', 'mailer', and
193     'letterbox', start with a regexp that matches 'mail'. Expressed
194     fully, the regexp is \bold{m{1,1}a{1,1}i{1,1}l{1,1}}, but because
195     a character expression is automatically quantified by
196     \bold{{1,1}}, we can simplify the regexp to \bold{mail}, i.e., an
197     'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now
198     we can use the vertical bar \bold{|}, which means \bold{or}, to
199     include the other two words, so our regexp for matching any of the
200     three words becomes \bold{mail|letter|correspondence}. Match
201     'mail' \bold{or} 'letter' \bold{or} 'correspondence'. While this
202     regexp will match one of the three words we want to match, it will
203     also match words we don't want to match, e.g., 'email'.  To
204     prevent the regexp from matching unwanted words, we must tell it
205     to begin and end the match at word boundaries. First we enclose
206     our regexp in parentheses, \bold{(mail|letter|correspondence)}.
207     Parentheses group expressions together, and they identify a part
208     of the regexp that we wish to \l{capturing text}{capture}.
209     Enclosing the expression in parentheses allows us to use it as a
210     component in more complex regexps. It also allows us to examine
211     which of the three words was actually matched. To force the match
212     to begin and end on word boundaries, we enclose the regexp in
213     \bold{\\b} \e{word boundary} assertions:
214     \bold{\\b(mail|letter|correspondence)\\b}.  Now the regexp means:
215     \e{Match a word boundary, followed by the regexp in parentheses,
216     followed by a word boundary}. The \bold{\\b} assertion matches a
217     \e position in the regexp, not a \e character. A word boundary is
218     any non-word character, e.g., a space, newline, or the beginning
219     or ending of a string.
220 
221     If we want to replace ampersand characters with the HTML entity
222     \bold{\&amp;}, the regexp to match is simply \bold{\&}. But this
223     regexp will also match ampersands that have already been converted
224     to HTML entities. We want to replace only ampersands that are not
225     already followed by \bold{amp;}. For this, we need the negative
226     lookahead assertion, \bold{(?!}__\bold{)}. The regexp can then be
227     written as \bold{\&(?!amp;)}, i.e. \e{Match an ampersand that is}
228     \bold{not} \e{followed by} \bold{amp;}.
229 
230     If we want to count all the occurrences of 'Eric' and 'Eirik' in a
231     string, two valid solutions are \bold{\\b(Eric|Eirik)\\b} and
232     \bold{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is
233     required to avoid matching words that contain either name,
234     e.g. 'Ericsson'. Note that the second regexp matches more
235     spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'.
236 
237     Some of the examples discussed above are implemented in the
238     \link #code-examples code examples \endlink section.
239 
240     \target characters-and-abbreviations-for-sets-of-characters
241     \section1 Characters and Abbreviations for Sets of Characters
242 
243     \table
244     \header \i Element \i Meaning
245     \row \i \bold{c}
246          \i A character represents itself unless it has a special
247          regexp meaning. e.g. \bold{c} matches the character \e c.
248     \row \i \bold{\\c}
249          \i A character that follows a backslash matches the character
250          itself, except as specified below. e.g., To match a literal
251          caret at the beginning of a string, write \bold{\\^}.
252     \row \i \bold{\\a}
253          \i Matches the ASCII bell (BEL, 0x07).
254     \row \i \bold{\\f}
255          \i Matches the ASCII form feed (FF, 0x0C).
256     \row \i \bold{\\n}
257          \i Matches the ASCII line feed (LF, 0x0A, Unix newline).
258     \row \i \bold{\\r}
259          \i Matches the ASCII carriage return (CR, 0x0D).
260     \row \i \bold{\\t}
261          \i Matches the ASCII horizontal tab (HT, 0x09).
262     \row \i \bold{\\v}
263          \i Matches the ASCII vertical tab (VT, 0x0B).
264     \row \i \bold{\\x\e{hhhh}}
265          \i Matches the Unicode character corresponding to the
266          hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF).
267     \row \i \bold{\\0\e{ooo}} (i.e., \\zero \e{ooo})
268          \i matches the ASCII/Latin1 character for the octal number
269          \e{ooo} (between 0 and 0377).
270     \row \i \bold{. (dot)}
271          \i Matches any character (including newline).
272     \row \i \bold{\\d}
273          \i Matches a digit (QChar::isDigit()).
274     \row \i \bold{\\D}
275          \i Matches a non-digit.
276     \row \i \bold{\\s}
277          \i Matches a whitespace character (QChar::isSpace()).
278     \row \i \bold{\\S}
279          \i Matches a non-whitespace character.
280     \row \i \bold{\\w}
281          \i Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_').
282     \row \i \bold{\\W}
283          \i Matches a non-word character.
284     \row \i \bold{\\\e{n}}
285          \i The \e{n}-th \l backreference, e.g. \\1, \\2, etc.
286     \endtable
287 
288     \bold{Note:} The C++ compiler transforms backslashes in strings.
289     To include a \bold{\\} in a regexp, enter it twice, i.e. \c{\\}.
290     To match the backslash character itself, enter it four times, i.e.
291     \c{\\\\}.
292 
293     \target sets-of-characters
294     \section1 Sets of Characters
295 
296     Square brackets mean match any character contained in the square
297     brackets. The character set abbreviations described above can
298     appear in a character set in square brackets. Except for the
299     character set abbreviations and the following two exceptions,
300     characters do not have special meanings in square brackets.
301 
302     \table
303     \row \i \bold{^}
304 
305          \i The caret negates the character set if it occurs as the
306          first character (i.e. immediately after the opening square
307          bracket). \bold{[abc]} matches 'a' or 'b' or 'c', but
308          \bold{[^abc]} matches anything \e but 'a' or 'b' or 'c'.
309 
310     \row \i \bold{-}
311 
312          \i The dash indicates a range of characters. \bold{[W-Z]}
313          matches 'W' or 'X' or 'Y' or 'Z'.
314 
315     \endtable
316 
317     Using the predefined character set abbreviations is more portable
318     than using character ranges across platforms and languages. For
319     example, \bold{[0-9]} matches a digit in Western alphabets but
320     \bold{\\d} matches a digit in \e any alphabet.
321 
322     Note: In other regexp documentation, sets of characters are often
323     called "character classes".
324 
325     \target quantifiers
326     \section1 Quantifiers
327 
328     By default, an expression is automatically quantified by
329     \bold{{1,1}}, i.e. it should occur exactly once. In the following
330     list, \bold{\e {E}} stands for expression. An expression is a
331     character, or an abbreviation for a set of characters, or a set of
332     characters in square brackets, or an expression in parentheses.
333 
334     \table
335     \row \i \bold{\e {E}?}
336 
337          \i Matches zero or one occurrences of \e E. This quantifier
338          means \e{The previous expression is optional}, because it
339          will match whether or not the expression is found. \bold{\e
340          {E}?} is the same as \bold{\e {E}{0,1}}. e.g., \bold{dents?}
341          matches 'dent' or 'dents'.
342 
343     \row \i \bold{\e {E}+}
344 
345          \i Matches one or more occurrences of \e E. \bold{\e {E}+} is
346          the same as \bold{\e {E}{1,}}. e.g., \bold{0+} matches '0',
347          '00', '000', etc.
348 
349     \row \i \bold{\e {E}*}
350 
351          \i Matches zero or more occurrences of \e E. It is the same
352          as \bold{\e {E}{0,}}. The \bold{*} quantifier is often used
353          in error where \bold{+} should be used. For example, if
354          \bold{\\s*$} is used in an expression to match strings that
355          end in whitespace, it will match every string because
356          \bold{\\s*$} means \e{Match zero or more whitespaces followed
357          by end of string}. The correct regexp to match strings that
358          have at least one trailing whitespace character is
359          \bold{\\s+$}.
360 
361     \row \i \bold{\e {E}{n}}
362 
363          \i Matches exactly \e n occurrences of \e E. \bold{\e {E}{n}}
364          is the same as repeating \e E \e n times. For example,
365          \bold{x{5}} is the same as \bold{xxxxx}. It is also the same
366          as \bold{\e {E}{n,n}}, e.g. \bold{x{5,5}}.
367 
368     \row \i \bold{\e {E}{n,}}
369          \i Matches at least \e n occurrences of \e E.
370 
371     \row \i \bold{\e {E}{,m}}
372          \i Matches at most \e m occurrences of \e E. \bold{\e {E}{,m}}
373          is the same as \bold{\e {E}{0,m}}.
374 
375     \row \i \bold{\e {E}{n,m}}
376          \i Matches at least \e n and at most \e m occurrences of \e E.
377     \endtable
378 
379     To apply a quantifier to more than just the preceding character,
380     use parentheses to group characters together in an expression. For
381     example, \bold{tag+} matches a 't' followed by an 'a' followed by
382     at least one 'g', whereas \bold{(tag)+} matches at least one
383     occurrence of 'tag'.
384 
385     Note: Quantifiers are normally "greedy". They always match as much
386     text as they can. For example, \bold{0+} matches the first zero it
387     finds and all the consecutive zeros after the first zero. Applied
388     to '20005', it matches'2\underline{000}5'. Quantifiers can be made
389     non-greedy, see setMinimal().
390 
391     \target capturing parentheses
392     \target backreferences
393     \section1 Capturing Text
394 
395     Parentheses allow us to group elements together so that we can
396     quantify and capture them. For example if we have the expression
397     \bold{mail|letter|correspondence} that matches a string we know
398     that \e one of the words matched but not which one. Using
399     parentheses allows us to "capture" whatever is matched within
400     their bounds, so if we used \bold{(mail|letter|correspondence)}
401     and matched this regexp against the string "I sent you some email"
402     we can use the cap() or capturedTexts() functions to extract the
403     matched characters, in this case 'mail'.
404 
405     We can use captured text within the regexp itself. To refer to the
406     captured text we use \e backreferences which are indexed from 1,
407     the same as for cap(). For example we could search for duplicate
408     words in a string using \bold{\\b(\\w+)\\W+\\1\\b} which means match a
409     word boundary followed by one or more word characters followed by
410     one or more non-word characters followed by the same text as the
411     first parenthesized expression followed by a word boundary.
412 
413     If we want to use parentheses purely for grouping and not for
414     capturing we can use the non-capturing syntax, e.g.
415     \bold{(?:green|blue)}. Non-capturing parentheses begin '(?:' and
416     end ')'. In this example we match either 'green' or 'blue' but we
417     do not capture the match so we only know whether or not we matched
418     but not which color we actually found. Using non-capturing
419     parentheses is more efficient than using capturing parentheses
420     since the regexp engine has to do less book-keeping.
421 
422     Both capturing and non-capturing parentheses may be nested.
423 
424     \target greedy quantifiers
425 
426     For historical reasons, quantifiers (e.g. \bold{*}) that apply to
427     capturing parentheses are more "greedy" than other quantifiers.
428     For example, \bold{a*(a*)} will match "aaa" with cap(1) == "aaa".
429     This behavior is different from what other regexp engines do
430     (notably, Perl). To obtain a more intuitive capturing behavior,
431     specify QRegExp::RegExp2 to the QRegExp constructor or call
432     setPatternSyntax(QRegExp::RegExp2).
433 
434     \target cap_in_a_loop
435 
436     When the number of matches cannot be determined in advance, a
437     common idiom is to use cap() in a loop. For example:
438 
439     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 0
440 
441     \target assertions
442     \section1 Assertions
443 
444     Assertions make some statement about the text at the point where
445     they occur in the regexp but they do not match any characters. In
446     the following list \bold{\e {E}} stands for any expression.
447 
448     \table
449     \row \i \bold{^}
450          \i The caret signifies the beginning of the string. If you
451          wish to match a literal \c{^} you must escape it by
452          writing \c{\\^}. For example, \bold{^#include} will only
453          match strings which \e begin with the characters '#include'.
454          (When the caret is the first character of a character set it
455          has a special meaning, see \link #sets-of-characters Sets of
456          Characters \endlink.)
457 
458     \row \i \bold{$}
459          \i The dollar signifies the end of the string. For example
460          \bold{\\d\\s*$} will match strings which end with a digit
461          optionally followed by whitespace. If you wish to match a
462          literal \c{$} you must escape it by writing
463          \c{\\$}.
464 
465     \row \i \bold{\\b}
466          \i A word boundary. For example the regexp
467          \bold{\\bOK\\b} means match immediately after a word
468          boundary (e.g. start of string or whitespace) the letter 'O'
469          then the letter 'K' immediately before another word boundary
470          (e.g. end of string or whitespace). But note that the
471          assertion does not actually match any whitespace so if we
472          write \bold{(\\bOK\\b)} and we have a match it will only
473          contain 'OK' even if the string is "It's \underline{OK} now".
474 
475     \row \i \bold{\\B}
476          \i A non-word boundary. This assertion is true wherever
477          \bold{\\b} is false. For example if we searched for
478          \bold{\\Bon\\B} in "Left on" the match would fail (space
479          and end of string aren't non-word boundaries), but it would
480          match in "t\underline{on}ne".
481 
482     \row \i \bold{(?=\e E)}
483          \i Positive lookahead. This assertion is true if the
484          expression matches at this point in the regexp. For example,
485          \bold{const(?=\\s+char)} matches 'const' whenever it is
486          followed by 'char', as in 'static \underline{const} char *'.
487          (Compare with \bold{const\\s+char}, which matches 'static
488          \underline{const char} *'.)
489 
490     \row \i \bold{(?!\e E)}
491          \i Negative lookahead. This assertion is true if the
492          expression does not match at this point in the regexp. For
493          example, \bold{const(?!\\s+char)} matches 'const' \e except
494          when it is followed by 'char'.
495     \endtable
496 
497     \keyword QRegExp wildcard matching
498     \section1 Wildcard Matching
499 
500     Most command shells such as \e bash or \e cmd.exe support "file
501     globbing", the ability to identify a group of files by using
502     wildcards. The setPatternSyntax() function is used to switch
503     between regexp and wildcard mode. Wildcard matching is much
504     simpler than full regexps and has only four features:
505 
506     \table
507     \row \i \bold{c}
508          \i Any character represents itself apart from those mentioned
509          below. Thus \bold{c} matches the character \e c.
510     \row \i \bold{?}
511          \i Matches any single character. It is the same as
512          \bold{.} in full regexps.
513     \row \i \bold{*}
514          \i Matches zero or more of any characters. It is the
515          same as \bold{.*} in full regexps.
516     \row \i \bold{[...]}
517          \i Sets of characters can be represented in square brackets,
518          similar to full regexps. Within the character class, like
519          outside, backslash has no special meaning.
520     \endtable
521 
522     In the mode Wildcard, the wildcard characters cannot be
523     escaped. In the mode WildcardUnix, the character '\\' escapes the
524     wildcard.
525 
526     For example if we are in wildcard mode and have strings which
527     contain filenames we could identify HTML files with \bold{*.html}.
528     This will match zero or more characters followed by a dot followed
529     by 'h', 't', 'm' and 'l'.
530 
531     To test a string against a wildcard expression, use exactMatch().
532     For example:
533 
534     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 1
535 
536     \target perl-users
537     \section1 Notes for Perl Users
538 
539     Most of the character class abbreviations supported by Perl are
540     supported by QRegExp, see \link
541     #characters-and-abbreviations-for-sets-of-characters characters
542     and abbreviations for sets of characters \endlink.
543 
544     In QRegExp, apart from within character classes, \c{^} always
545     signifies the start of the string, so carets must always be
546     escaped unless used for that purpose. In Perl the meaning of caret
547     varies automagically depending on where it occurs so escaping it
548     is rarely necessary. The same applies to \c{$} which in
549     QRegExp always signifies the end of the string.
550 
551     QRegExp's quantifiers are the same as Perl's greedy quantifiers
552     (but see the \l{greedy quantifiers}{note above}). Non-greedy
553     matching cannot be applied to individual quantifiers, but can be
554     applied to all the quantifiers in the pattern. For example, to
555     match the Perl regexp \bold{ro+?m} requires:
556 
557     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 2
558 
559     The equivalent of Perl's \c{/i} option is
560     setCaseSensitivity(Qt::CaseInsensitive).
561 
562     Perl's \c{/g} option can be emulated using a \l{#cap_in_a_loop}{loop}.
563 
564     In QRegExp \bold{.} matches any character, therefore all QRegExp
565     regexps have the equivalent of Perl's \c{/s} option. QRegExp
566     does not have an equivalent to Perl's \c{/m} option, but this
567     can be emulated in various ways for example by splitting the input
568     into lines or by looping with a regexp that searches for newlines.
569 
570     Because QRegExp is string oriented, there are no \\A, \\Z, or \\z
571     assertions. The \\G assertion is not supported but can be emulated
572     in a loop.
573 
574     Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp
575     equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,
576     ... correspond to cap(1) or capturedTexts()[1], cap(2) or
577     capturedTexts()[2], etc.
578 
579     To substitute a pattern use QString::replace().
580 
581     Perl's extended \c{/x} syntax is not supported, nor are
582     directives, e.g. (?i), or regexp comments, e.g. (?#comment). On
583     the other hand, C++'s rules for literal strings can be used to
584     achieve the same:
585 
586     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 3
587 
588     Both zero-width positive and zero-width negative lookahead
589     assertions (?=pattern) and (?!pattern) are supported with the same
590     syntax as Perl. Perl's lookbehind assertions, "independent"
591     subexpressions and conditional expressions are not supported.
592 
593     Non-capturing parentheses are also supported, with the same
594     (?:pattern) syntax.
595 
596     See QString::split() and QStringList::join() for equivalents
597     to Perl's split and join functions.
598 
599     Note: because C++ transforms \\'s they must be written \e twice in
600     code, e.g. \bold{\\b} must be written \bold{\\\\b}.
601 
602     \target code-examples
603     \section1 Code Examples
604 
605     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 4
606 
607     The third string matches '\underline{6}'. This is a simple validation
608     regexp for integers in the range 0 to 99.
609 
610     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 5
611 
612     The second string matches '\underline{This_is-OK}'. We've used the
613     character set abbreviation '\\S' (non-whitespace) and the anchors
614     to match strings which contain no whitespace.
615 
616     In the following example we match strings containing 'mail' or
617     'letter' or 'correspondence' but only match whole words i.e. not
618     'email'
619 
620     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 6
621 
622     The second string matches "Please write the \underline{letter}". The
623     word 'letter' is also captured (because of the parentheses). We
624     can see what text we've captured like this:
625 
626     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 7
627 
628     This will capture the text from the first set of capturing
629     parentheses (counting capturing left parentheses from left to
630     right). The parentheses are counted from 1 since cap(0) is the
631     whole matched regexp (equivalent to '&' in most regexp engines).
632 
633     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 8
634 
635     Here we've passed the QRegExp to QString's replace() function to
636     replace the matched text with new text.
637 
638     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 9
639 
640     We've used the indexIn() function to repeatedly match the regexp in
641     the string. Note that instead of moving forward by one character
642     at a time \c pos++ we could have written \c {pos +=
643     rx.matchedLength()} to skip over the already matched string. The
644     count will equal 3, matching 'One \underline{Eric} another
645     \underline{Eirik}, and an Ericsson. How many Eiriks, \underline{Eric}?'; it
646     doesn't match 'Ericsson' or 'Eiriks' because they are not bounded
647     by non-word boundaries.
648 
649     One common use of regexps is to split lines of delimited data into
650     their component fields.
651 
652     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 10
653 
654     In this example our input lines have the format company name, web
655     address and country. Unfortunately the regexp is rather long and
656     not very versatile -- the code will break if we add any more
657     fields. A simpler and better solution is to look for the
658     separator, '\\t' in this case, and take the surrounding text. The
659     QString::split() function can take a separator string or regexp
660     as an argument and split a string accordingly.
661 
662     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 11
663 
664     Here field[0] is the company, field[1] the web address and so on.
665 
666     To imitate the matching of a shell we can use wildcard mode.
667 
668     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 12
669 
670     Wildcard matching can be convenient because of its simplicity, but
671     any wildcard regexp can be defined using full regexps, e.g.
672     \bold{.*\\.html$}. Notice that we can't match both \c .html and \c
673     .htm files with a wildcard unless we use \bold{*.htm*} which will
674     also match 'test.html.bak'. A full regexp gives us the precision
675     we need, \bold{.*\\.html?$}.
676 
677     QRegExp can match case insensitively using setCaseSensitivity(),
678     and can use non-greedy matching, see setMinimal(). By
679     default QRegExp uses full regexps but this can be changed with
680     setWildcard(). Searching can be forward with indexIn() or backward
681     with lastIndexIn(). Captured text can be accessed using
682     capturedTexts() which returns a string list of all captured
683     strings, or using cap() which returns the captured string for the
684     given index. The pos() function takes a match index and returns
685     the position in the string where the match was made (or -1 if
686     there was no match).
687 
688     \sa QString, QStringList, QRegExpValidator, QSortFilterProxyModel,
689         {tools/regexp}{Regular Expression Example}
690 */
691 
692 #if defined(Q_OS_VXWORKS) && defined(EOS)
693 #  undef EOS
694 #endif
695 
696 const int NumBadChars = 64;
697 #define BadChar(ch) ((ch).unicode() % NumBadChars)
698 
699 const int NoOccurrence = INT_MAX;
700 const int EmptyCapture = INT_MAX;
701 const int InftyLen = INT_MAX;
702 const int InftyRep = 1025;
703 const int EOS = -1;
704 
isWord(QChar ch)705 static bool isWord(QChar ch)
706 {
707     return ch.isLetterOrNumber() || ch.isMark() || ch == QLatin1Char('_');
708 }
709 
710 /*
711   Merges two vectors of ints and puts the result into the first
712   one.
713 */
mergeInto(QVector<int> * a,const QVector<int> & b)714 static void mergeInto(QVector<int> *a, const QVector<int> &b)
715 {
716     int asize = a->size();
717     int bsize = b.size();
718     if (asize == 0) {
719         *a = b;
720 #ifndef QT_NO_REGEXP_OPTIM
721     } else if (bsize == 1 && a->at(asize - 1) < b.at(0)) {
722         a->resize(asize + 1);
723         (*a)[asize] = b.at(0);
724 #endif
725     } else if (bsize >= 1) {
726         int csize = asize + bsize;
727         QVector<int> c(csize);
728         int i = 0, j = 0, k = 0;
729         while (i < asize) {
730             if (j < bsize) {
731                 if (a->at(i) == b.at(j)) {
732                     ++i;
733                     --csize;
734                 } else if (a->at(i) < b.at(j)) {
735                     c[k++] = a->at(i++);
736                 } else {
737                     c[k++] = b.at(j++);
738                 }
739             } else {
740                 memcpy(c.data() + k, a->constData() + i, (asize - i) * sizeof(int));
741                 break;
742             }
743         }
744         c.resize(csize);
745         if (j < bsize)
746             memcpy(c.data() + k, b.constData() + j, (bsize - j) * sizeof(int));
747         *a = c;
748     }
749 }
750 
751 #ifndef QT_NO_REGEXP_WILDCARD
752 /*
753   Translates a wildcard pattern to an equivalent regular expression
754   pattern (e.g., *.cpp to .*\.cpp).
755 
756   If enableEscaping is true, it is possible to escape the wildcard
757   characters with \
758 */
wc2rx(const QString & wc_str,const bool enableEscaping)759 static QString wc2rx(const QString &wc_str, const bool enableEscaping)
760 {
761     const int wclen = wc_str.length();
762     QString rx;
763     int i = 0;
764     bool isEscaping = false; // the previous character is '\'
765     const QChar *wc = wc_str.unicode();
766 
767     while (i < wclen) {
768         const QChar c = wc[i++];
769         switch (c.unicode()) {
770         case '\\':
771             if (enableEscaping) {
772                 if (isEscaping) {
773                     rx += QLatin1String("\\\\");
774                 } // we insert the \\ later if necessary
775                 if (i == wclen) { // the end
776                     rx += QLatin1String("\\\\");
777                 }
778             } else {
779                 rx += QLatin1String("\\\\");
780             }
781             isEscaping = true;
782             break;
783         case '*':
784             if (isEscaping) {
785                 rx += QLatin1String("\\*");
786                 isEscaping = false;
787             } else {
788                 rx += QLatin1String(".*");
789             }
790             break;
791         case '?':
792             if (isEscaping) {
793                 rx += QLatin1String("\\?");
794                 isEscaping = false;
795             } else {
796                 rx += QLatin1Char('.');
797             }
798 
799             break;
800         case '$':
801         case '(':
802         case ')':
803         case '+':
804         case '.':
805         case '^':
806         case '{':
807         case '|':
808         case '}':
809             if (isEscaping) {
810                 isEscaping = false;
811                 rx += QLatin1String("\\\\");
812             }
813             rx += QLatin1Char('\\');
814             rx += c;
815             break;
816          case '[':
817             if (isEscaping) {
818                 isEscaping = false;
819                 rx += QLatin1String("\\[");
820             } else {
821                 rx += c;
822                 if (wc[i] == QLatin1Char('^'))
823                     rx += wc[i++];
824                 if (i < wclen) {
825                     if (rx[i] == QLatin1Char(']'))
826                         rx += wc[i++];
827                     while (i < wclen && wc[i] != QLatin1Char(']')) {
828                         if (wc[i] == QLatin1Char('\\'))
829                             rx += QLatin1Char('\\');
830                         rx += wc[i++];
831                     }
832                 }
833             }
834              break;
835 
836         case ']':
837             if(isEscaping){
838                 isEscaping = false;
839                 rx += QLatin1String("\\");
840             }
841             rx += c;
842             break;
843 
844         default:
845             if(isEscaping){
846                 isEscaping = false;
847                 rx += QLatin1String("\\\\");
848             }
849             rx += c;
850         }
851     }
852     return rx;
853 }
854 #endif
855 
caretIndex(int offset,QRegExp::CaretMode caretMode)856 static int caretIndex(int offset, QRegExp::CaretMode caretMode)
857 {
858     if (caretMode == QRegExp::CaretAtZero) {
859         return 0;
860     } else if (caretMode == QRegExp::CaretAtOffset) {
861         return offset;
862     } else { // QRegExp::CaretWontMatch
863         return -1;
864     }
865 }
866 
867 /*
868     The QRegExpEngineKey struct uniquely identifies an engine.
869 */
870 struct QRegExpEngineKey
871 {
872     QString pattern;
873     QRegExp::PatternSyntax patternSyntax;
874     Qt::CaseSensitivity cs;
875 
QRegExpEngineKeyQRegExpEngineKey876     inline QRegExpEngineKey(const QString &pattern, QRegExp::PatternSyntax patternSyntax,
877                             Qt::CaseSensitivity cs)
878         : pattern(pattern), patternSyntax(patternSyntax), cs(cs) {}
879 
clearQRegExpEngineKey880     inline void clear() {
881         pattern.clear();
882         patternSyntax = QRegExp::RegExp;
883         cs = Qt::CaseSensitive;
884     }
885 };
886 
operator ==(const QRegExpEngineKey & key1,const QRegExpEngineKey & key2)887 Q_STATIC_GLOBAL_OPERATOR bool operator==(const QRegExpEngineKey &key1, const QRegExpEngineKey &key2)
888 {
889     return key1.pattern == key2.pattern && key1.patternSyntax == key2.patternSyntax
890            && key1.cs == key2.cs;
891 }
892 
893 class QRegExpEngine;
894 
895 //Q_DECLARE_TYPEINFO(QVector<int>, Q_MOVABLE_TYPE);
896 
897 /*
898   This is the engine state during matching.
899 */
900 struct QRegExpMatchState
901 {
902     const QChar *in; // a pointer to the input string data
903     int pos; // the current position in the string
904     int caretPos;
905     int len; // the length of the input string
906     bool minimal; // minimal matching?
907     int *bigArray; // big array holding the data for the next pointers
908     int *inNextStack; // is state is nextStack?
909     int *curStack; // stack of current states
910     int *nextStack; // stack of next states
911     int *curCapBegin; // start of current states' captures
912     int *nextCapBegin; // start of next states' captures
913     int *curCapEnd; // end of current states' captures
914     int *nextCapEnd; // end of next states' captures
915     int *tempCapBegin; // start of temporary captures
916     int *tempCapEnd; // end of temporary captures
917     int *capBegin; // start of captures for a next state
918     int *capEnd; // end of captures for a next state
919     int *slideTab; // bump-along slide table for bad-character heuristic
920     int *captured; // what match() returned last
921     int slideTabSize; // size of slide table
922     int capturedSize;
923 #ifndef QT_NO_REGEXP_BACKREF
924     QList<QVector<int> > sleeping; // list of back-reference sleepers
925 #endif
926     int matchLen; // length of match
927     int oneTestMatchedLen; // length of partial match
928 
929     const QRegExpEngine *eng;
930 
QRegExpMatchStateQRegExpMatchState931     inline QRegExpMatchState() : bigArray(0), captured(0) {}
~QRegExpMatchStateQRegExpMatchState932     inline ~QRegExpMatchState() { free(bigArray); }
933 
drainQRegExpMatchState934     void drain() { free(bigArray); bigArray = 0; captured = 0; } // to save memory
935     void prepareForMatch(QRegExpEngine *eng);
936     void match(const QChar *str, int len, int pos, bool minimal,
937         bool oneTest, int caretIndex);
938     bool matchHere();
939     bool testAnchor(int i, int a, const int *capBegin);
940 };
941 
942 /*
943   The struct QRegExpAutomatonState represents one state in a modified NFA. The
944   input characters matched are stored in the state instead of on
945   the transitions, something possible for an automaton
946   constructed from a regular expression.
947 */
948 struct QRegExpAutomatonState
949 {
950 #ifndef QT_NO_REGEXP_CAPTURE
951     int atom; // which atom does this state belong to?
952 #endif
953     int match; // what does it match? (see CharClassBit and BackRefBit)
954     QVector<int> outs; // out-transitions
955     QMap<int, int> reenter; // atoms reentered when transiting out
956     QMap<int, int> anchors; // anchors met when transiting out
957 
QRegExpAutomatonStateQRegExpAutomatonState958     inline QRegExpAutomatonState() { }
959 #ifndef QT_NO_REGEXP_CAPTURE
QRegExpAutomatonStateQRegExpAutomatonState960     inline QRegExpAutomatonState(int a, int m)
961         : atom(a), match(m) { }
962 #else
QRegExpAutomatonStateQRegExpAutomatonState963     inline QRegExpAutomatonState(int m)
964         : match(m) { }
965 #endif
966 };
967 
968 Q_DECLARE_TYPEINFO(QRegExpAutomatonState, Q_MOVABLE_TYPE);
969 
970 /*
971   The struct QRegExpCharClassRange represents a range of characters (e.g.,
972   [0-9] denotes range 48 to 57).
973 */
974 struct QRegExpCharClassRange
975 {
976     ushort from; // 48
977     ushort len; // 10
978 };
979 
980 Q_DECLARE_TYPEINFO(QRegExpCharClassRange, Q_PRIMITIVE_TYPE);
981 
982 #ifndef QT_NO_REGEXP_CAPTURE
983 /*
984   The struct QRegExpAtom represents one node in the hierarchy of regular
985   expression atoms.
986 */
987 struct QRegExpAtom
988 {
989     enum { NoCapture = -1, OfficialCapture = -2, UnofficialCapture = -3 };
990 
991     int parent; // index of parent in array of atoms
992     int capture; // index of capture, from 1 to ncap - 1
993 };
994 
995 Q_DECLARE_TYPEINFO(QRegExpAtom, Q_PRIMITIVE_TYPE);
996 #endif
997 
998 struct QRegExpLookahead;
999 
1000 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1001 /*
1002   The struct QRegExpAnchorAlternation represents a pair of anchors with
1003   OR semantics.
1004 */
1005 struct QRegExpAnchorAlternation
1006 {
1007     int a; // this anchor...
1008     int b; // ...or this one
1009 };
1010 
1011 Q_DECLARE_TYPEINFO(QRegExpAnchorAlternation, Q_PRIMITIVE_TYPE);
1012 #endif
1013 
1014 #ifndef QT_NO_REGEXP_CCLASS
1015 /*
1016   The class QRegExpCharClass represents a set of characters, such as can
1017   be found in regular expressions (e.g., [a-z] denotes the set
1018   {a, b, ..., z}).
1019 */
1020 class QRegExpCharClass
1021 {
1022 public:
1023     QRegExpCharClass();
QRegExpCharClass(const QRegExpCharClass & cc)1024     inline QRegExpCharClass(const QRegExpCharClass &cc) { operator=(cc); }
1025 
1026     QRegExpCharClass &operator=(const QRegExpCharClass &cc);
1027 
1028     void clear();
negative() const1029     bool negative() const { return n; }
1030     void setNegative(bool negative);
1031     void addCategories(int cats);
1032     void addRange(ushort from, ushort to);
addSingleton(ushort ch)1033     void addSingleton(ushort ch) { addRange(ch, ch); }
1034 
1035     bool in(QChar ch) const;
1036 #ifndef QT_NO_REGEXP_OPTIM
firstOccurrence() const1037     const QVector<int> &firstOccurrence() const { return occ1; }
1038 #endif
1039 
1040 #if defined(QT_DEBUG)
1041     void dump() const;
1042 #endif
1043 
1044 private:
1045     int c; // character classes
1046     QVector<QRegExpCharClassRange> r; // character ranges
1047     bool n; // negative?
1048 #ifndef QT_NO_REGEXP_OPTIM
1049     QVector<int> occ1; // first-occurrence array
1050 #endif
1051 };
1052 #else
1053 struct QRegExpCharClass
1054 {
1055     int dummy;
1056 
1057 #ifndef QT_NO_REGEXP_OPTIM
QRegExpCharClassQRegExpCharClass1058     QRegExpCharClass() { occ1.fill(0, NumBadChars); }
1059 
firstOccurrenceQRegExpCharClass1060     const QVector<int> &firstOccurrence() const { return occ1; }
1061     QVector<int> occ1;
1062 #endif
1063 };
1064 #endif
1065 
1066 Q_DECLARE_TYPEINFO(QRegExpCharClass, Q_MOVABLE_TYPE);
1067 
1068 /*
1069   The QRegExpEngine class encapsulates a modified nondeterministic
1070   finite automaton (NFA).
1071 */
1072 class QRegExpEngine
1073 {
1074 public:
QRegExpEngine(Qt::CaseSensitivity cs,bool greedyQuantifiers)1075     QRegExpEngine(Qt::CaseSensitivity cs, bool greedyQuantifiers)
1076         : cs(cs), greedyQuantifiers(greedyQuantifiers) { setup(); }
1077 
1078     QRegExpEngine(const QRegExpEngineKey &key);
1079     ~QRegExpEngine();
1080 
isValid() const1081     bool isValid() const { return valid; }
errorString() const1082     const QString &errorString() const { return yyError; }
captureCount() const1083     int captureCount() const { return officialncap; }
1084 
1085     int createState(QChar ch);
1086     int createState(const QRegExpCharClass &cc);
1087 #ifndef QT_NO_REGEXP_BACKREF
1088     int createState(int bref);
1089 #endif
1090 
1091     void addCatTransitions(const QVector<int> &from, const QVector<int> &to);
1092 #ifndef QT_NO_REGEXP_CAPTURE
1093     void addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom);
1094 #endif
1095 
1096 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1097     int anchorAlternation(int a, int b);
1098     int anchorConcatenation(int a, int b);
1099 #else
anchorAlternation(int a,int b)1100     int anchorAlternation(int a, int b) { return a & b; }
anchorConcatenation(int a,int b)1101     int anchorConcatenation(int a, int b) { return a | b; }
1102 #endif
1103     void addAnchors(int from, int to, int a);
1104 
1105 #ifndef QT_NO_REGEXP_OPTIM
1106     void heuristicallyChooseHeuristic();
1107 #endif
1108 
1109 #if defined(QT_DEBUG)
1110     void dump() const;
1111 #endif
1112 
1113     QAtomicInt ref;
1114 
1115 private:
1116     enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };
1117     enum { InitialState = 0, FinalState = 1 };
1118 
1119     void setup();
1120     int setupState(int match);
1121 
1122     /*
1123       Let's hope that 13 lookaheads and 14 back-references are
1124       enough.
1125      */
1126     enum { MaxLookaheads = 13, MaxBackRefs = 14 };
1127     enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, Anchor_Word = 0x00000004,
1128            Anchor_NonWord = 0x00000008, Anchor_FirstLookahead = 0x00000010,
1129            Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
1130            Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,
1131            Anchor_Alternation = unsigned(Anchor_BackRef1Empty) << MaxBackRefs,
1132 
1133            Anchor_LookaheadMask = (Anchor_FirstLookahead - 1) ^
1134                    ((Anchor_FirstLookahead << MaxLookaheads) - 1) };
1135 #ifndef QT_NO_REGEXP_CAPTURE
1136     int startAtom(bool officialCapture);
1137     void finishAtom(int atom, bool needCapture);
1138 #endif
1139 
1140 #ifndef QT_NO_REGEXP_LOOKAHEAD
1141     int addLookahead(QRegExpEngine *eng, bool negative);
1142 #endif
1143 
1144 #ifndef QT_NO_REGEXP_OPTIM
1145     bool goodStringMatch(QRegExpMatchState &matchState) const;
1146     bool badCharMatch(QRegExpMatchState &matchState) const;
1147 #else
1148     bool bruteMatch(QRegExpMatchState &matchState) const;
1149 #endif
1150 
1151     QVector<QRegExpAutomatonState> s; // array of states
1152 #ifndef QT_NO_REGEXP_CAPTURE
1153     QVector<QRegExpAtom> f; // atom hierarchy
1154     int nf; // number of atoms
1155     int cf; // current atom
1156     QVector<int> captureForOfficialCapture;
1157 #endif
1158     int officialncap; // number of captures, seen from the outside
1159     int ncap; // number of captures, seen from the inside
1160 #ifndef QT_NO_REGEXP_CCLASS
1161     QVector<QRegExpCharClass> cl; // array of character classes
1162 #endif
1163 #ifndef QT_NO_REGEXP_LOOKAHEAD
1164     QVector<QRegExpLookahead *> ahead; // array of lookaheads
1165 #endif
1166 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1167     QVector<QRegExpAnchorAlternation> aa; // array of (a, b) pairs of anchors
1168 #endif
1169 #ifndef QT_NO_REGEXP_OPTIM
1170     bool caretAnchored; // does the regexp start with ^?
1171     bool trivial; // is the good-string all that needs to match?
1172 #endif
1173     bool valid; // is the regular expression valid?
1174     Qt::CaseSensitivity cs; // case sensitive?
1175     bool greedyQuantifiers; // RegExp2?
1176     bool xmlSchemaExtensions;
1177 #ifndef QT_NO_REGEXP_BACKREF
1178     int nbrefs; // number of back-references
1179 #endif
1180 
1181 #ifndef QT_NO_REGEXP_OPTIM
1182     bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch
1183 
1184     int goodEarlyStart; // the index where goodStr can first occur in a match
1185     int goodLateStart; // the index where goodStr can last occur in a match
1186     QString goodStr; // the string that any match has to contain
1187 
1188     int minl; // the minimum length of a match
1189     QVector<int> occ1; // first-occurrence array
1190 #endif
1191 
1192     /*
1193       The class Box is an abstraction for a regular expression
1194       fragment. It can also be seen as one node in the syntax tree of
1195       a regular expression with synthetized attributes.
1196 
1197       Its interface is ugly for performance reasons.
1198     */
1199     class Box
1200     {
1201     public:
1202         Box(QRegExpEngine *engine);
Box(const Box & b)1203         Box(const Box &b) { operator=(b); }
1204 
1205         Box &operator=(const Box &b);
1206 
clear()1207         void clear() { operator=(Box(eng)); }
1208         void set(QChar ch);
1209         void set(const QRegExpCharClass &cc);
1210 #ifndef QT_NO_REGEXP_BACKREF
1211         void set(int bref);
1212 #endif
1213 
1214         void cat(const Box &b);
1215         void orx(const Box &b);
1216         void plus(int atom);
1217         void opt();
1218         void catAnchor(int a);
1219 #ifndef QT_NO_REGEXP_OPTIM
1220         void setupHeuristics();
1221 #endif
1222 
1223 #if defined(QT_DEBUG)
1224         void dump() const;
1225 #endif
1226 
1227     private:
1228         void addAnchorsToEngine(const Box &to) const;
1229 
1230         QRegExpEngine *eng; // the automaton under construction
1231         QVector<int> ls; // the left states (firstpos)
1232         QVector<int> rs; // the right states (lastpos)
1233         QMap<int, int> lanchors; // the left anchors
1234         QMap<int, int> ranchors; // the right anchors
1235         int skipanchors; // the anchors to match if the box is skipped
1236 
1237 #ifndef QT_NO_REGEXP_OPTIM
1238         int earlyStart; // the index where str can first occur
1239         int lateStart; // the index where str can last occur
1240         QString str; // a string that has to occur in any match
1241         QString leftStr; // a string occurring at the left of this box
1242         QString rightStr; // a string occurring at the right of this box
1243         int maxl; // the maximum length of this box (possibly InftyLen)
1244 #endif
1245 
1246         int minl; // the minimum length of this box
1247 #ifndef QT_NO_REGEXP_OPTIM
1248         QVector<int> occ1; // first-occurrence array
1249 #endif
1250     };
1251 
1252     friend class Box;
1253 
1254     void setupCategoriesRangeMap();
1255 
1256     /*
1257       This is the lexical analyzer for regular expressions.
1258     */
1259     enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead,
1260            Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar,
1261            Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 };
1262     int getChar();
1263     int getEscape();
1264 #ifndef QT_NO_REGEXP_INTERVAL
1265     int getRep(int def);
1266 #endif
1267 #ifndef QT_NO_REGEXP_LOOKAHEAD
1268     void skipChars(int n);
1269 #endif
1270     void error(const char *msg);
1271     void startTokenizer(const QChar *rx, int len);
1272     int getToken();
1273 
1274     const QChar *yyIn; // a pointer to the input regular expression pattern
1275     int yyPos0; // the position of yyTok in the input pattern
1276     int yyPos; // the position of the next character to read
1277     int yyLen; // the length of yyIn
1278     int yyCh; // the last character read
1279     QScopedPointer<QRegExpCharClass> yyCharClass; // attribute for Tok_CharClass tokens
1280     int yyMinRep; // attribute for Tok_Quantifier
1281     int yyMaxRep; // ditto
1282     QString yyError; // syntax error or overflow during parsing?
1283 
1284     /*
1285       This is the syntactic analyzer for regular expressions.
1286     */
1287     int parse(const QChar *rx, int len);
1288     void parseAtom(Box *box);
1289     void parseFactor(Box *box);
1290     void parseTerm(Box *box);
1291     void parseExpression(Box *box);
1292 
1293     int yyTok; // the last token read
1294     bool yyMayCapture; // set this to false to disable capturing
1295     QHash<QByteArray, QPair<int, int> > categoriesRangeMap; // fast lookup hash for xml schema extensions
1296 
1297     friend struct QRegExpMatchState;
1298 };
1299 
1300 #ifndef QT_NO_REGEXP_LOOKAHEAD
1301 /*
1302   The struct QRegExpLookahead represents a lookahead a la Perl (e.g.,
1303   (?=foo) and (?!bar)).
1304 */
1305 struct QRegExpLookahead
1306 {
1307     QRegExpEngine *eng; // NFA representing the embedded regular expression
1308     bool neg; // negative lookahead?
1309 
QRegExpLookaheadQRegExpLookahead1310     inline QRegExpLookahead(QRegExpEngine *eng0, bool neg0)
1311         : eng(eng0), neg(neg0) { }
~QRegExpLookaheadQRegExpLookahead1312     inline ~QRegExpLookahead() { delete eng; }
1313 };
1314 #endif
1315 
1316 /*! \internal
1317     convert the pattern string to the RegExp syntax.
1318 
1319     This is also used by QScriptEngine::newRegExp to convert to a pattern that JavaScriptCore can understan
1320  */
qt_regexp_toCanonical(const QString & pattern,QRegExp::PatternSyntax patternSyntax)1321 Q_CORE_EXPORT QString qt_regexp_toCanonical(const QString &pattern, QRegExp::PatternSyntax patternSyntax)
1322 {
1323     switch (patternSyntax) {
1324 #ifndef QT_NO_REGEXP_WILDCARD
1325     case QRegExp::Wildcard:
1326         return wc2rx(pattern, false);
1327         break;
1328     case QRegExp::WildcardUnix:
1329         return wc2rx(pattern, true);
1330         break;
1331 #endif
1332     case QRegExp::FixedString:
1333         return QRegExp::escape(pattern);
1334         break;
1335     case QRegExp::W3CXmlSchema11:
1336     default:
1337         return pattern;
1338     }
1339 }
1340 
QRegExpEngine(const QRegExpEngineKey & key)1341 QRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key)
1342     : cs(key.cs), greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2),
1343       xmlSchemaExtensions(key.patternSyntax == QRegExp::W3CXmlSchema11)
1344 {
1345     setup();
1346 
1347     QString rx = qt_regexp_toCanonical(key.pattern, key.patternSyntax);
1348 
1349     valid = (parse(rx.unicode(), rx.length()) == rx.length());
1350     if (!valid) {
1351 #ifndef QT_NO_REGEXP_OPTIM
1352         trivial = false;
1353 #endif
1354         error(RXERR_LEFTDELIM);
1355     }
1356 }
1357 
~QRegExpEngine()1358 QRegExpEngine::~QRegExpEngine()
1359 {
1360 #ifndef QT_NO_REGEXP_LOOKAHEAD
1361     qDeleteAll(ahead);
1362 #endif
1363 }
1364 
prepareForMatch(QRegExpEngine * eng)1365 void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng)
1366 {
1367     /*
1368       We use one QVector<int> for all the big data used a lot in
1369       matchHere() and friends.
1370     */
1371     int ns = eng->s.size(); // number of states
1372     int ncap = eng->ncap;
1373 #ifndef QT_NO_REGEXP_OPTIM
1374     int newSlideTabSize = qMax(eng->minl + 1, 16);
1375 #else
1376     int newSlideTabSize = 0;
1377 #endif
1378     int numCaptures = eng->captureCount();
1379     int newCapturedSize = 2 + 2 * numCaptures;
1380     bigArray = q_check_ptr((int *)realloc(bigArray, ((3 + 4 * ncap) * ns + 4 * ncap + newSlideTabSize + newCapturedSize)*sizeof(int)));
1381 
1382     // set all internal variables only _after_ bigArray is realloc'ed
1383     // to prevent a broken regexp in oom case
1384 
1385     slideTabSize = newSlideTabSize;
1386     capturedSize = newCapturedSize;
1387     inNextStack = bigArray;
1388     memset(inNextStack, -1, ns * sizeof(int));
1389     curStack = inNextStack + ns;
1390     nextStack = inNextStack + 2 * ns;
1391 
1392     curCapBegin = inNextStack + 3 * ns;
1393     nextCapBegin = curCapBegin + ncap * ns;
1394     curCapEnd = curCapBegin + 2 * ncap * ns;
1395     nextCapEnd = curCapBegin + 3 * ncap * ns;
1396 
1397     tempCapBegin = curCapBegin + 4 * ncap * ns;
1398     tempCapEnd = tempCapBegin + ncap;
1399     capBegin = tempCapBegin + 2 * ncap;
1400     capEnd = tempCapBegin + 3 * ncap;
1401 
1402     slideTab = tempCapBegin + 4 * ncap;
1403     captured = slideTab + slideTabSize;
1404     memset(captured, -1, capturedSize*sizeof(int));
1405     this->eng = eng;
1406 }
1407 
1408 /*
1409   Tries to match in str and returns an array of (begin, length) pairs
1410   for captured text. If there is no match, all pairs are (-1, -1).
1411 */
match(const QChar * str0,int len0,int pos0,bool minimal0,bool oneTest,int caretIndex)1412 void QRegExpMatchState::match(const QChar *str0, int len0, int pos0,
1413     bool minimal0, bool oneTest, int caretIndex)
1414 {
1415     bool matched = false;
1416     QChar char_null;
1417 
1418 #ifndef QT_NO_REGEXP_OPTIM
1419     if (eng->trivial && !oneTest) {
1420         pos = qFindString(str0, len0, pos0, eng->goodStr.unicode(), eng->goodStr.length(), eng->cs);
1421         matchLen = eng->goodStr.length();
1422         matched = (pos != -1);
1423     } else
1424 #endif
1425     {
1426         in = str0;
1427         if (in == 0)
1428             in = &char_null;
1429         pos = pos0;
1430         caretPos = caretIndex;
1431         len = len0;
1432         minimal = minimal0;
1433         matchLen = 0;
1434         oneTestMatchedLen = 0;
1435 
1436         if (eng->valid && pos >= 0 && pos <= len) {
1437 #ifndef QT_NO_REGEXP_OPTIM
1438             if (oneTest) {
1439                 matched = matchHere();
1440             } else {
1441                 if (pos <= len - eng->minl) {
1442                     if (eng->caretAnchored) {
1443                         matched = matchHere();
1444                     } else if (eng->useGoodStringHeuristic) {
1445                         matched = eng->goodStringMatch(*this);
1446                     } else {
1447                         matched = eng->badCharMatch(*this);
1448                     }
1449                 }
1450             }
1451 #else
1452             matched = oneTest ? matchHere() : eng->bruteMatch(*this);
1453 #endif
1454         }
1455     }
1456 
1457     if (matched) {
1458         int *c = captured;
1459         *c++ = pos;
1460         *c++ = matchLen;
1461 
1462         int numCaptures = (capturedSize - 2) >> 1;
1463 #ifndef QT_NO_REGEXP_CAPTURE
1464         for (int i = 0; i < numCaptures; ++i) {
1465             int j = eng->captureForOfficialCapture.at(i);
1466             if (capBegin[j] != EmptyCapture) {
1467                 int len = capEnd[j] - capBegin[j];
1468                 *c++ = (len > 0) ? pos + capBegin[j] : 0;
1469                 *c++ = len;
1470             } else {
1471                 *c++ = -1;
1472                 *c++ = -1;
1473             }
1474         }
1475 #endif
1476     } else {
1477         // we rely on 2's complement here
1478         memset(captured, -1, capturedSize * sizeof(int));
1479     }
1480 }
1481 
1482 /*
1483   The three following functions add one state to the automaton and
1484   return the number of the state.
1485 */
1486 
createState(QChar ch)1487 int QRegExpEngine::createState(QChar ch)
1488 {
1489     return setupState(ch.unicode());
1490 }
1491 
createState(const QRegExpCharClass & cc)1492 int QRegExpEngine::createState(const QRegExpCharClass &cc)
1493 {
1494 #ifndef QT_NO_REGEXP_CCLASS
1495     int n = cl.size();
1496     cl += QRegExpCharClass(cc);
1497     return setupState(CharClassBit | n);
1498 #else
1499     Q_UNUSED(cc);
1500     return setupState(CharClassBit);
1501 #endif
1502 }
1503 
1504 #ifndef QT_NO_REGEXP_BACKREF
createState(int bref)1505 int QRegExpEngine::createState(int bref)
1506 {
1507     if (bref > nbrefs) {
1508         nbrefs = bref;
1509         if (nbrefs > MaxBackRefs) {
1510             error(RXERR_LIMIT);
1511             return 0;
1512         }
1513     }
1514     return setupState(BackRefBit | bref);
1515 }
1516 #endif
1517 
1518 /*
1519   The two following functions add a transition between all pairs of
1520   states (i, j) where i is found in from, and j is found in to.
1521 
1522   Cat-transitions are distinguished from plus-transitions for
1523   capturing.
1524 */
1525 
addCatTransitions(const QVector<int> & from,const QVector<int> & to)1526 void QRegExpEngine::addCatTransitions(const QVector<int> &from, const QVector<int> &to)
1527 {
1528     for (int i = 0; i < from.size(); i++)
1529         mergeInto(&s[from.at(i)].outs, to);
1530 }
1531 
1532 #ifndef QT_NO_REGEXP_CAPTURE
addPlusTransitions(const QVector<int> & from,const QVector<int> & to,int atom)1533 void QRegExpEngine::addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom)
1534 {
1535     for (int i = 0; i < from.size(); i++) {
1536         QRegExpAutomatonState &st = s[from.at(i)];
1537         const QVector<int> oldOuts = st.outs;
1538         mergeInto(&st.outs, to);
1539         if (f.at(atom).capture != QRegExpAtom::NoCapture) {
1540             for (int j = 0; j < to.size(); j++) {
1541                 // ### st.reenter.contains(to.at(j)) check looks suspicious
1542                 if (!st.reenter.contains(to.at(j)) &&
1543                      qBinaryFind(oldOuts.constBegin(), oldOuts.constEnd(), to.at(j)) == oldOuts.end())
1544                     st.reenter.insert(to.at(j), atom);
1545             }
1546         }
1547     }
1548 }
1549 #endif
1550 
1551 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1552 /*
1553   Returns an anchor that means a OR b.
1554 */
anchorAlternation(int a,int b)1555 int QRegExpEngine::anchorAlternation(int a, int b)
1556 {
1557     if (((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0)
1558         return a & b;
1559 
1560     int n = aa.size();
1561 #ifndef QT_NO_REGEXP_OPTIM
1562     if (n > 0 && aa.at(n - 1).a == a && aa.at(n - 1).b == b)
1563         return Anchor_Alternation | (n - 1);
1564 #endif
1565 
1566     QRegExpAnchorAlternation element = {a, b};
1567     aa.append(element);
1568     return Anchor_Alternation | n;
1569 }
1570 
1571 /*
1572   Returns an anchor that means a AND b.
1573 */
anchorConcatenation(int a,int b)1574 int QRegExpEngine::anchorConcatenation(int a, int b)
1575 {
1576     if (((a | b) & Anchor_Alternation) == 0)
1577         return a | b;
1578     if ((b & Anchor_Alternation) != 0)
1579         qSwap(a, b);
1580 
1581     int aprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).a, b);
1582     int bprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).b, b);
1583     return anchorAlternation(aprime, bprime);
1584 }
1585 #endif
1586 
1587 /*
1588   Adds anchor a on a transition caracterised by its from state and
1589   its to state.
1590 */
addAnchors(int from,int to,int a)1591 void QRegExpEngine::addAnchors(int from, int to, int a)
1592 {
1593     QRegExpAutomatonState &st = s[from];
1594     if (st.anchors.contains(to))
1595         a = anchorAlternation(st.anchors.value(to), a);
1596     st.anchors.insert(to, a);
1597 }
1598 
1599 #ifndef QT_NO_REGEXP_OPTIM
1600 /*
1601   This function chooses between the good-string and the bad-character
1602   heuristics. It computes two scores and chooses the heuristic with
1603   the highest score.
1604 
1605   Here are some common-sense constraints on the scores that should be
1606   respected if the formulas are ever modified: (1) If goodStr is
1607   empty, the good-string heuristic scores 0. (2) If the regular
1608   expression is trivial, the good-string heuristic should be used.
1609   (3) If the search is case insensitive, the good-string heuristic
1610   should be used, unless it scores 0. (Case insensitivity turns all
1611   entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is
1612   big, the good-string heuristic should score less.
1613 */
heuristicallyChooseHeuristic()1614 void QRegExpEngine::heuristicallyChooseHeuristic()
1615 {
1616     if (minl == 0) {
1617         useGoodStringHeuristic = false;
1618     } else if (trivial) {
1619         useGoodStringHeuristic = true;
1620     } else {
1621         /*
1622           Magic formula: The good string has to constitute a good
1623           proportion of the minimum-length string, and appear at a
1624           more-or-less known index.
1625         */
1626         int goodStringScore = (64 * goodStr.length() / minl) -
1627                               (goodLateStart - goodEarlyStart);
1628         /*
1629           Less magic formula: We pick some characters at random, and
1630           check whether they are good or bad.
1631         */
1632         int badCharScore = 0;
1633         int step = qMax(1, NumBadChars / 32);
1634         for (int i = 1; i < NumBadChars; i += step) {
1635             if (occ1.at(i) == NoOccurrence)
1636                 badCharScore += minl;
1637             else
1638                 badCharScore += occ1.at(i);
1639         }
1640         badCharScore /= minl;
1641         useGoodStringHeuristic = (goodStringScore > badCharScore);
1642     }
1643 }
1644 #endif
1645 
1646 #if defined(QT_DEBUG)
dump() const1647 void QRegExpEngine::dump() const
1648 {
1649     int i, j;
1650     qDebug("Case %ssensitive engine", cs ? "" : "in");
1651     qDebug("  States");
1652     for (i = 0; i < s.size(); i++) {
1653         qDebug("  %d%s", i, i == InitialState ? " (initial)" : i == FinalState ? " (final)" : "");
1654 #ifndef QT_NO_REGEXP_CAPTURE
1655         if (nf > 0)
1656             qDebug("    in atom %d", s[i].atom);
1657 #endif
1658         int m = s[i].match;
1659         if ((m & CharClassBit) != 0) {
1660             qDebug("    match character class %d", m ^ CharClassBit);
1661 #ifndef QT_NO_REGEXP_CCLASS
1662             cl[m ^ CharClassBit].dump();
1663 #else
1664             qDebug("    negative character class");
1665 #endif
1666         } else if ((m & BackRefBit) != 0) {
1667             qDebug("    match back-reference %d", m ^ BackRefBit);
1668         } else if (m >= 0x20 && m <= 0x7e) {
1669             qDebug("    match 0x%.4x (%c)", m, m);
1670         } else {
1671             qDebug("    match 0x%.4x", m);
1672         }
1673         for (j = 0; j < s[i].outs.size(); j++) {
1674             int next = s[i].outs[j];
1675             qDebug("    -> %d", next);
1676             if (s[i].reenter.contains(next))
1677                 qDebug("       [reenter %d]", s[i].reenter[next]);
1678             if (s[i].anchors.value(next) != 0)
1679                 qDebug("       [anchors 0x%.8x]", s[i].anchors[next]);
1680         }
1681     }
1682 #ifndef QT_NO_REGEXP_CAPTURE
1683     if (nf > 0) {
1684         qDebug("  Atom    Parent  Capture");
1685         for (i = 0; i < nf; i++) {
1686             if (f[i].capture == QRegExpAtom::NoCapture) {
1687                 qDebug("  %6d  %6d     nil", i, f[i].parent);
1688             } else {
1689                 int cap = f[i].capture;
1690                 bool official = captureForOfficialCapture.contains(cap);
1691                 qDebug("  %6d  %6d  %6d  %s", i, f[i].parent, f[i].capture,
1692                        official ? "official" : "");
1693             }
1694         }
1695     }
1696 #endif
1697 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1698     for (i = 0; i < aa.size(); i++)
1699         qDebug("  Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a, aa[i].b);
1700 #endif
1701 }
1702 #endif
1703 
setup()1704 void QRegExpEngine::setup()
1705 {
1706     ref = 1;
1707 #ifndef QT_NO_REGEXP_CAPTURE
1708     f.resize(32);
1709     nf = 0;
1710     cf = -1;
1711 #endif
1712     officialncap = 0;
1713     ncap = 0;
1714 #ifndef QT_NO_REGEXP_OPTIM
1715     caretAnchored = true;
1716     trivial = true;
1717 #endif
1718     valid = false;
1719 #ifndef QT_NO_REGEXP_BACKREF
1720     nbrefs = 0;
1721 #endif
1722 #ifndef QT_NO_REGEXP_OPTIM
1723     useGoodStringHeuristic = true;
1724     minl = 0;
1725     occ1.fill(0, NumBadChars);
1726 #endif
1727 }
1728 
setupState(int match)1729 int QRegExpEngine::setupState(int match)
1730 {
1731 #ifndef QT_NO_REGEXP_CAPTURE
1732     s += QRegExpAutomatonState(cf, match);
1733 #else
1734     s += QRegExpAutomatonState(match);
1735 #endif
1736     return s.size() - 1;
1737 }
1738 
1739 #ifndef QT_NO_REGEXP_CAPTURE
1740 /*
1741   Functions startAtom() and finishAtom() should be called to delimit
1742   atoms. When a state is created, it is assigned to the current atom.
1743   The information is later used for capturing.
1744 */
startAtom(bool officialCapture)1745 int QRegExpEngine::startAtom(bool officialCapture)
1746 {
1747     if ((nf & (nf + 1)) == 0 && nf + 1 >= f.size())
1748         f.resize((nf + 1) << 1);
1749     f[nf].parent = cf;
1750     cf = nf++;
1751     f[cf].capture = officialCapture ? QRegExpAtom::OfficialCapture : QRegExpAtom::NoCapture;
1752     return cf;
1753 }
1754 
finishAtom(int atom,bool needCapture)1755 void QRegExpEngine::finishAtom(int atom, bool needCapture)
1756 {
1757     if (greedyQuantifiers && needCapture && f[atom].capture == QRegExpAtom::NoCapture)
1758         f[atom].capture = QRegExpAtom::UnofficialCapture;
1759     cf = f.at(atom).parent;
1760 }
1761 #endif
1762 
1763 #ifndef QT_NO_REGEXP_LOOKAHEAD
1764 /*
1765   Creates a lookahead anchor.
1766 */
addLookahead(QRegExpEngine * eng,bool negative)1767 int QRegExpEngine::addLookahead(QRegExpEngine *eng, bool negative)
1768 {
1769     int n = ahead.size();
1770     if (n == MaxLookaheads) {
1771         error(RXERR_LIMIT);
1772         return 0;
1773     }
1774     ahead += new QRegExpLookahead(eng, negative);
1775     return Anchor_FirstLookahead << n;
1776 }
1777 #endif
1778 
1779 #ifndef QT_NO_REGEXP_CAPTURE
1780 /*
1781   We want the longest leftmost captures.
1782 */
isBetterCapture(int ncap,const int * begin1,const int * end1,const int * begin2,const int * end2)1783 static bool isBetterCapture(int ncap, const int *begin1, const int *end1, const int *begin2,
1784                             const int *end2)
1785 {
1786     for (int i = 0; i < ncap; i++) {
1787         int delta = begin2[i] - begin1[i]; // it has to start early...
1788         if (delta == 0)
1789             delta = end1[i] - end2[i]; // ...and end late
1790 
1791         if (delta != 0)
1792             return delta > 0;
1793     }
1794     return false;
1795 }
1796 #endif
1797 
1798 /*
1799   Returns true if anchor a matches at position pos + i in the input
1800   string, otherwise false.
1801 */
testAnchor(int i,int a,const int * capBegin)1802 bool QRegExpMatchState::testAnchor(int i, int a, const int *capBegin)
1803 {
1804     int j;
1805 
1806 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1807     if ((a & QRegExpEngine::Anchor_Alternation) != 0)
1808         return testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).a, capBegin)
1809                || testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).b, capBegin);
1810 #endif
1811 
1812     if ((a & QRegExpEngine::Anchor_Caret) != 0) {
1813         if (pos + i != caretPos)
1814             return false;
1815     }
1816     if ((a & QRegExpEngine::Anchor_Dollar) != 0) {
1817         if (pos + i != len)
1818             return false;
1819     }
1820 #ifndef QT_NO_REGEXP_ESCAPE
1821     if ((a & (QRegExpEngine::Anchor_Word | QRegExpEngine::Anchor_NonWord)) != 0) {
1822         bool before = false;
1823         bool after = false;
1824         if (pos + i != 0)
1825             before = isWord(in[pos + i - 1]);
1826         if (pos + i != len)
1827             after = isWord(in[pos + i]);
1828         if ((a & QRegExpEngine::Anchor_Word) != 0 && (before == after))
1829             return false;
1830         if ((a & QRegExpEngine::Anchor_NonWord) != 0 && (before != after))
1831             return false;
1832     }
1833 #endif
1834 #ifndef QT_NO_REGEXP_LOOKAHEAD
1835     if ((a & QRegExpEngine::Anchor_LookaheadMask) != 0) {
1836         const QVector<QRegExpLookahead *> &ahead = eng->ahead;
1837         for (j = 0; j < ahead.size(); j++) {
1838             if ((a & (QRegExpEngine::Anchor_FirstLookahead << j)) != 0) {
1839                 QRegExpMatchState matchState;
1840                 matchState.prepareForMatch(ahead[j]->eng);
1841                 matchState.match(in + pos + i, len - pos - i, 0,
1842                     true, true, caretPos - pos - i);
1843                 if ((matchState.captured[0] == 0) == ahead[j]->neg)
1844                     return false;
1845             }
1846         }
1847     }
1848 #endif
1849 #ifndef QT_NO_REGEXP_CAPTURE
1850 #ifndef QT_NO_REGEXP_BACKREF
1851     for (j = 0; j < eng->nbrefs; j++) {
1852         if ((a & (QRegExpEngine::Anchor_BackRef1Empty << j)) != 0) {
1853             int i = eng->captureForOfficialCapture.at(j);
1854             if (capBegin[i] != EmptyCapture)
1855                 return false;
1856         }
1857     }
1858 #endif
1859 #endif
1860     return true;
1861 }
1862 
1863 #ifndef QT_NO_REGEXP_OPTIM
1864 /*
1865   The three following functions are what Jeffrey Friedl would call
1866   transmissions (or bump-alongs). Using one or the other should make
1867   no difference except in performance.
1868 */
1869 
goodStringMatch(QRegExpMatchState & matchState) const1870 bool QRegExpEngine::goodStringMatch(QRegExpMatchState &matchState) const
1871 {
1872     int k = matchState.pos + goodEarlyStart;
1873     QStringMatcher matcher(goodStr.unicode(), goodStr.length(), cs);
1874     while ((k = matcher.indexIn(matchState.in, matchState.len, k)) != -1) {
1875         int from = k - goodLateStart;
1876         int to = k - goodEarlyStart;
1877         if (from > matchState.pos)
1878             matchState.pos = from;
1879 
1880         while (matchState.pos <= to) {
1881             if (matchState.matchHere())
1882                 return true;
1883             ++matchState.pos;
1884         }
1885         ++k;
1886     }
1887     return false;
1888 }
1889 
badCharMatch(QRegExpMatchState & matchState) const1890 bool QRegExpEngine::badCharMatch(QRegExpMatchState &matchState) const
1891 {
1892     int slideHead = 0;
1893     int slideNext = 0;
1894     int i;
1895     int lastPos = matchState.len - minl;
1896     memset(matchState.slideTab, 0, matchState.slideTabSize * sizeof(int));
1897 
1898     /*
1899       Set up the slide table, used for the bad-character heuristic,
1900       using the table of first occurrence of each character.
1901     */
1902     for (i = 0; i < minl; i++) {
1903         int sk = occ1[BadChar(matchState.in[matchState.pos + i])];
1904         if (sk == NoOccurrence)
1905             sk = i + 1;
1906         if (sk > 0) {
1907             int k = i + 1 - sk;
1908             if (k < 0) {
1909                 sk = i + 1;
1910                 k = 0;
1911             }
1912             if (sk > matchState.slideTab[k])
1913                 matchState.slideTab[k] = sk;
1914         }
1915     }
1916 
1917     if (matchState.pos > lastPos)
1918         return false;
1919 
1920     for (;;) {
1921         if (++slideNext >= matchState.slideTabSize)
1922             slideNext = 0;
1923         if (matchState.slideTab[slideHead] > 0) {
1924             if (matchState.slideTab[slideHead] - 1 > matchState.slideTab[slideNext])
1925                 matchState.slideTab[slideNext] = matchState.slideTab[slideHead] - 1;
1926             matchState.slideTab[slideHead] = 0;
1927         } else {
1928             if (matchState.matchHere())
1929                 return true;
1930         }
1931 
1932         if (matchState.pos == lastPos)
1933             break;
1934 
1935         /*
1936           Update the slide table. This code has much in common with
1937           the initialization code.
1938         */
1939         int sk = occ1[BadChar(matchState.in[matchState.pos + minl])];
1940         if (sk == NoOccurrence) {
1941             matchState.slideTab[slideNext] = minl;
1942         } else if (sk > 0) {
1943             int k = slideNext + minl - sk;
1944             if (k >= matchState.slideTabSize)
1945                 k -= matchState.slideTabSize;
1946             if (sk > matchState.slideTab[k])
1947                 matchState.slideTab[k] = sk;
1948         }
1949         slideHead = slideNext;
1950         ++matchState.pos;
1951     }
1952     return false;
1953 }
1954 #else
bruteMatch(QRegExpMatchState & matchState) const1955 bool QRegExpEngine::bruteMatch(QRegExpMatchState &matchState) const
1956 {
1957     while (matchState.pos <= matchState.len) {
1958         if (matchState.matchHere())
1959             return true;
1960         ++matchState.pos;
1961     }
1962     return false;
1963 }
1964 #endif
1965 
1966 /*
1967   Here's the core of the engine. It tries to do a match here and now.
1968 */
matchHere()1969 bool QRegExpMatchState::matchHere()
1970 {
1971     int ncur = 1, nnext = 0;
1972     int i = 0, j, k, m;
1973     bool stop = false;
1974 
1975     matchLen = -1;
1976     oneTestMatchedLen = -1;
1977     curStack[0] = QRegExpEngine::InitialState;
1978 
1979     int ncap = eng->ncap;
1980 #ifndef QT_NO_REGEXP_CAPTURE
1981     if (ncap > 0) {
1982         for (j = 0; j < ncap; j++) {
1983             curCapBegin[j] = EmptyCapture;
1984             curCapEnd[j] = EmptyCapture;
1985         }
1986     }
1987 #endif
1988 
1989 #ifndef QT_NO_REGEXP_BACKREF
1990     while ((ncur > 0 || !sleeping.isEmpty()) && i <= len - pos && !stop)
1991 #else
1992     while (ncur > 0 && i <= len - pos && !stop)
1993 #endif
1994     {
1995         int ch = (i < len - pos) ? in[pos + i].unicode() : 0;
1996         for (j = 0; j < ncur; j++) {
1997             int cur = curStack[j];
1998             const QRegExpAutomatonState &scur = eng->s.at(cur);
1999             const QVector<int> &outs = scur.outs;
2000             for (k = 0; k < outs.size(); k++) {
2001                 int next = outs.at(k);
2002                 const QRegExpAutomatonState &snext = eng->s.at(next);
2003                 bool inside = true;
2004 #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2005                 int needSomeSleep = 0;
2006 #endif
2007 
2008                 /*
2009                   First, check if the anchors are anchored properly.
2010                 */
2011                 int a = scur.anchors.value(next);
2012                 if (a != 0 && !testAnchor(i, a, curCapBegin + j * ncap))
2013                     inside = false;
2014 
2015                 /*
2016                   If indeed they are, check if the input character is
2017                   correct for this transition.
2018                 */
2019                 if (inside) {
2020                     m = snext.match;
2021                     if ((m & (QRegExpEngine::CharClassBit | QRegExpEngine::BackRefBit)) == 0) {
2022                         if (eng->cs)
2023                             inside = (m == ch);
2024                         else
2025                             inside = (QChar(m).toLower() == QChar(ch).toLower());
2026                     } else if (next == QRegExpEngine::FinalState) {
2027                         matchLen = i;
2028                         stop = minimal;
2029                         inside = true;
2030                     } else if ((m & QRegExpEngine::CharClassBit) != 0) {
2031 #ifndef QT_NO_REGEXP_CCLASS
2032                         const QRegExpCharClass &cc = eng->cl.at(m ^ QRegExpEngine::CharClassBit);
2033                         if (eng->cs)
2034                             inside = cc.in(ch);
2035                         else if (cc.negative())
2036                             inside = cc.in(QChar(ch).toLower()) &&
2037                                      cc.in(QChar(ch).toUpper());
2038                         else
2039                             inside = cc.in(QChar(ch).toLower()) ||
2040                                      cc.in(QChar(ch).toUpper());
2041 #endif
2042 #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2043                     } else { /* ((m & QRegExpEngine::BackRefBit) != 0) */
2044                         int bref = m ^ QRegExpEngine::BackRefBit;
2045                         int ell = j * ncap + eng->captureForOfficialCapture.at(bref - 1);
2046 
2047                         inside = bref <= ncap && curCapBegin[ell] != EmptyCapture;
2048                         if (inside) {
2049                             if (eng->cs)
2050                                 inside = (in[pos + curCapBegin[ell]] == QChar(ch));
2051                             else
2052                                 inside = (in[pos + curCapBegin[ell]].toLower()
2053                                        == QChar(ch).toLower());
2054                         }
2055 
2056                         if (inside) {
2057                             int delta;
2058                             if (curCapEnd[ell] == EmptyCapture)
2059                                 delta = i - curCapBegin[ell];
2060                             else
2061                                 delta = curCapEnd[ell] - curCapBegin[ell];
2062 
2063                             inside = (delta <= len - (pos + i));
2064                             if (inside && delta > 1) {
2065                                 int n = 1;
2066                                 if (eng->cs) {
2067                                     while (n < delta) {
2068                                         if (in[pos + curCapBegin[ell] + n]
2069                                             != in[pos + i + n])
2070                                             break;
2071                                         ++n;
2072                                     }
2073                                 } else {
2074                                     while (n < delta) {
2075                                         QChar a = in[pos + curCapBegin[ell] + n];
2076                                         QChar b = in[pos + i + n];
2077                                         if (a.toLower() != b.toLower())
2078                                             break;
2079                                         ++n;
2080                                     }
2081                                 }
2082                                 inside = (n == delta);
2083                                 if (inside)
2084                                     needSomeSleep = delta - 1;
2085                             }
2086                         }
2087 #endif
2088                     }
2089                 }
2090 
2091                 /*
2092                   We must now update our data structures.
2093                 */
2094                 if (inside) {
2095 #ifndef QT_NO_REGEXP_CAPTURE
2096                     int *capBegin, *capEnd;
2097 #endif
2098                     /*
2099                       If the next state was not encountered yet, all
2100                       is fine.
2101                     */
2102                     if ((m = inNextStack[next]) == -1) {
2103                         m = nnext++;
2104                         nextStack[m] = next;
2105                         inNextStack[next] = m;
2106 #ifndef QT_NO_REGEXP_CAPTURE
2107                         capBegin = nextCapBegin + m * ncap;
2108                         capEnd = nextCapEnd + m * ncap;
2109 
2110                     /*
2111                       Otherwise, we'll first maintain captures in
2112                       temporary arrays, and decide at the end whether
2113                       it's best to keep the previous capture zones or
2114                       the new ones.
2115                     */
2116                     } else {
2117                         capBegin = tempCapBegin;
2118                         capEnd = tempCapEnd;
2119 #endif
2120                     }
2121 
2122 #ifndef QT_NO_REGEXP_CAPTURE
2123                     /*
2124                       Updating the capture zones is much of a task.
2125                     */
2126                     if (ncap > 0) {
2127                         memcpy(capBegin, curCapBegin + j * ncap, ncap * sizeof(int));
2128                         memcpy(capEnd, curCapEnd + j * ncap, ncap * sizeof(int));
2129                         int c = scur.atom, n = snext.atom;
2130                         int p = -1, q = -1;
2131                         int cap;
2132 
2133                         /*
2134                           Lemma 1. For any x in the range [0..nf), we
2135                           have f[x].parent < x.
2136 
2137                           Proof. By looking at startAtom(), it is
2138                           clear that cf < nf holds all the time, and
2139                           thus that f[nf].parent < nf.
2140                         */
2141 
2142                         /*
2143                           If we are reentering an atom, we empty all
2144                           capture zones inside it.
2145                         */
2146                         if ((q = scur.reenter.value(next)) != 0) {
2147                             QBitArray b(eng->nf, false);
2148                             b.setBit(q, true);
2149                             for (int ell = q + 1; ell < eng->nf; ell++) {
2150                                 if (b.testBit(eng->f.at(ell).parent)) {
2151                                     b.setBit(ell, true);
2152                                     cap = eng->f.at(ell).capture;
2153                                     if (cap >= 0) {
2154                                         capBegin[cap] = EmptyCapture;
2155                                         capEnd[cap] = EmptyCapture;
2156                                     }
2157                                 }
2158                             }
2159                             p = eng->f.at(q).parent;
2160 
2161                         /*
2162                           Otherwise, close the capture zones we are
2163                           leaving. We are leaving f[c].capture,
2164                           f[f[c].parent].capture,
2165                           f[f[f[c].parent].parent].capture, ...,
2166                           until f[x].capture, with x such that
2167                           f[x].parent is the youngest common ancestor
2168                           for c and n.
2169 
2170                           We go up along c's and n's ancestry until
2171                           we find x.
2172                         */
2173                         } else {
2174                             p = c;
2175                             q = n;
2176                             while (p != q) {
2177                                 if (p > q) {
2178                                     cap = eng->f.at(p).capture;
2179                                     if (cap >= 0) {
2180                                         if (capBegin[cap] == i) {
2181                                             capBegin[cap] = EmptyCapture;
2182                                             capEnd[cap] = EmptyCapture;
2183                                         } else {
2184                                             capEnd[cap] = i;
2185                                         }
2186                                     }
2187                                     p = eng->f.at(p).parent;
2188                                 } else {
2189                                     q = eng->f.at(q).parent;
2190                                 }
2191                             }
2192                         }
2193 
2194                         /*
2195                           In any case, we now open the capture zones
2196                           we are entering. We work upwards from n
2197                           until we reach p (the parent of the atom we
2198                           reenter or the youngest common ancestor).
2199                         */
2200                         while (n > p) {
2201                             cap = eng->f.at(n).capture;
2202                             if (cap >= 0) {
2203                                 capBegin[cap] = i;
2204                                 capEnd[cap] = EmptyCapture;
2205                             }
2206                             n = eng->f.at(n).parent;
2207                         }
2208                         /*
2209                           If the next state was already in
2210                           nextStack, we must choose carefully which
2211                           capture zones we want to keep.
2212                         */
2213                         if (capBegin == tempCapBegin &&
2214                                 isBetterCapture(ncap, capBegin, capEnd, nextCapBegin + m * ncap,
2215                                                 nextCapEnd + m * ncap)) {
2216                             memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
2217                             memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
2218                         }
2219                     }
2220 #ifndef QT_NO_REGEXP_BACKREF
2221                     /*
2222                       We are done with updating the capture zones.
2223                       It's now time to put the next state to sleep,
2224                       if it needs to, and to remove it from
2225                       nextStack.
2226                     */
2227                     if (needSomeSleep > 0) {
2228                         QVector<int> zzZ(2 + 2 * ncap);
2229                         zzZ[0] = i + needSomeSleep;
2230                         zzZ[1] = next;
2231                         if (ncap > 0) {
2232                             memcpy(zzZ.data() + 2, capBegin, ncap * sizeof(int));
2233                             memcpy(zzZ.data() + 2 + ncap, capEnd, ncap * sizeof(int));
2234                         }
2235                         inNextStack[nextStack[--nnext]] = -1;
2236                         sleeping.append(zzZ);
2237                     }
2238 #endif
2239 #endif
2240                 }
2241             }
2242         }
2243 #ifndef QT_NO_REGEXP_CAPTURE
2244         /*
2245           If we reached the final state, hurray! Copy the captured
2246           zone.
2247         */
2248         if (ncap > 0 && (m = inNextStack[QRegExpEngine::FinalState]) != -1) {
2249             memcpy(capBegin, nextCapBegin + m * ncap, ncap * sizeof(int));
2250             memcpy(capEnd, nextCapEnd + m * ncap, ncap * sizeof(int));
2251         }
2252 #ifndef QT_NO_REGEXP_BACKREF
2253         /*
2254           It's time to wake up the sleepers.
2255         */
2256         j = 0;
2257         while (j < sleeping.count()) {
2258             if (sleeping.at(j)[0] == i) {
2259                 const QVector<int> &zzZ = sleeping.at(j);
2260                 int next = zzZ[1];
2261                 const int *capBegin = zzZ.data() + 2;
2262                 const int *capEnd = zzZ.data() + 2 + ncap;
2263                 bool copyOver = true;
2264 
2265                 if ((m = inNextStack[next]) == -1) {
2266                     m = nnext++;
2267                     nextStack[m] = next;
2268                     inNextStack[next] = m;
2269                 } else {
2270                     copyOver = isBetterCapture(ncap, nextCapBegin + m * ncap, nextCapEnd + m * ncap,
2271                                                capBegin, capEnd);
2272                 }
2273                 if (copyOver) {
2274                     memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
2275                     memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
2276                 }
2277 
2278                 sleeping.removeAt(j);
2279             } else {
2280                 ++j;
2281             }
2282         }
2283 #endif
2284 #endif
2285         for (j = 0; j < nnext; j++)
2286             inNextStack[nextStack[j]] = -1;
2287 
2288         // avoid needless iteration that confuses oneTestMatchedLen
2289         if (nnext == 1 && nextStack[0] == QRegExpEngine::FinalState
2290 #ifndef QT_NO_REGEXP_BACKREF
2291              && sleeping.isEmpty()
2292 #endif
2293            )
2294             stop = true;
2295 
2296         qSwap(curStack, nextStack);
2297 #ifndef QT_NO_REGEXP_CAPTURE
2298         qSwap(curCapBegin, nextCapBegin);
2299         qSwap(curCapEnd, nextCapEnd);
2300 #endif
2301         ncur = nnext;
2302         nnext = 0;
2303         ++i;
2304     }
2305 
2306 #ifndef QT_NO_REGEXP_BACKREF
2307     /*
2308       If minimal matching is enabled, we might have some sleepers
2309       left.
2310     */
2311     if (!sleeping.isEmpty())
2312         sleeping.clear();
2313 #endif
2314 
2315     oneTestMatchedLen = i - 1;
2316     return (matchLen >= 0);
2317 }
2318 
2319 #ifndef QT_NO_REGEXP_CCLASS
2320 
QRegExpCharClass()2321 QRegExpCharClass::QRegExpCharClass()
2322     : c(0), n(false)
2323 {
2324 #ifndef QT_NO_REGEXP_OPTIM
2325     occ1.fill(NoOccurrence, NumBadChars);
2326 #endif
2327 }
2328 
operator =(const QRegExpCharClass & cc)2329 QRegExpCharClass &QRegExpCharClass::operator=(const QRegExpCharClass &cc)
2330 {
2331     c = cc.c;
2332     r = cc.r;
2333     n = cc.n;
2334 #ifndef QT_NO_REGEXP_OPTIM
2335     occ1 = cc.occ1;
2336 #endif
2337     return *this;
2338 }
2339 
clear()2340 void QRegExpCharClass::clear()
2341 {
2342     c = 0;
2343     r.resize(0);
2344     n = false;
2345 }
2346 
setNegative(bool negative)2347 void QRegExpCharClass::setNegative(bool negative)
2348 {
2349     n = negative;
2350 #ifndef QT_NO_REGEXP_OPTIM
2351     occ1.fill(0, NumBadChars);
2352 #endif
2353 }
2354 
addCategories(int cats)2355 void QRegExpCharClass::addCategories(int cats)
2356 {
2357     c |= cats;
2358 #ifndef QT_NO_REGEXP_OPTIM
2359     occ1.fill(0, NumBadChars);
2360 #endif
2361 }
2362 
addRange(ushort from,ushort to)2363 void QRegExpCharClass::addRange(ushort from, ushort to)
2364 {
2365     if (from > to)
2366         qSwap(from, to);
2367     int m = r.size();
2368     r.resize(m + 1);
2369     r[m].from = from;
2370     r[m].len = to - from + 1;
2371 
2372 #ifndef QT_NO_REGEXP_OPTIM
2373     int i;
2374 
2375     if (to - from < NumBadChars) {
2376         if (from % NumBadChars <= to % NumBadChars) {
2377             for (i = from % NumBadChars; i <= to % NumBadChars; i++)
2378                 occ1[i] = 0;
2379         } else {
2380             for (i = 0; i <= to % NumBadChars; i++)
2381                 occ1[i] = 0;
2382             for (i = from % NumBadChars; i < NumBadChars; i++)
2383                 occ1[i] = 0;
2384         }
2385     } else {
2386         occ1.fill(0, NumBadChars);
2387     }
2388 #endif
2389 }
2390 
in(QChar ch) const2391 bool QRegExpCharClass::in(QChar ch) const
2392 {
2393 #ifndef QT_NO_REGEXP_OPTIM
2394     if (occ1.at(BadChar(ch)) == NoOccurrence)
2395         return n;
2396 #endif
2397 
2398     if (c != 0 && (c & (1 << (int)ch.category())) != 0)
2399         return !n;
2400 
2401     const int uc = ch.unicode();
2402     int size = r.size();
2403 
2404     for (int i = 0; i < size; ++i) {
2405         const QRegExpCharClassRange &range = r.at(i);
2406         if (uint(uc - range.from) < uint(r.at(i).len))
2407             return !n;
2408     }
2409     return n;
2410 }
2411 
2412 #if defined(QT_DEBUG)
dump() const2413 void QRegExpCharClass::dump() const
2414 {
2415     int i;
2416     qDebug("    %stive character class", n ? "nega" : "posi");
2417 #ifndef QT_NO_REGEXP_CCLASS
2418     if (c != 0)
2419         qDebug("      categories 0x%.8x", c);
2420 #endif
2421     for (i = 0; i < r.size(); i++)
2422         qDebug("      0x%.4x through 0x%.4x", r[i].from, r[i].from + r[i].len - 1);
2423 }
2424 #endif
2425 #endif
2426 
Box(QRegExpEngine * engine)2427 QRegExpEngine::Box::Box(QRegExpEngine *engine)
2428     : eng(engine), skipanchors(0)
2429 #ifndef QT_NO_REGEXP_OPTIM
2430       , earlyStart(0), lateStart(0), maxl(0)
2431 #endif
2432 {
2433 #ifndef QT_NO_REGEXP_OPTIM
2434     occ1.fill(NoOccurrence, NumBadChars);
2435 #endif
2436     minl = 0;
2437 }
2438 
operator =(const Box & b)2439 QRegExpEngine::Box &QRegExpEngine::Box::operator=(const Box &b)
2440 {
2441     eng = b.eng;
2442     ls = b.ls;
2443     rs = b.rs;
2444     lanchors = b.lanchors;
2445     ranchors = b.ranchors;
2446     skipanchors = b.skipanchors;
2447 #ifndef QT_NO_REGEXP_OPTIM
2448     earlyStart = b.earlyStart;
2449     lateStart = b.lateStart;
2450     str = b.str;
2451     leftStr = b.leftStr;
2452     rightStr = b.rightStr;
2453     maxl = b.maxl;
2454     occ1 = b.occ1;
2455 #endif
2456     minl = b.minl;
2457     return *this;
2458 }
2459 
set(QChar ch)2460 void QRegExpEngine::Box::set(QChar ch)
2461 {
2462     ls.resize(1);
2463     ls[0] = eng->createState(ch);
2464     rs = ls;
2465 #ifndef QT_NO_REGEXP_OPTIM
2466     str = ch;
2467     leftStr = ch;
2468     rightStr = ch;
2469     maxl = 1;
2470     occ1[BadChar(ch)] = 0;
2471 #endif
2472     minl = 1;
2473 }
2474 
set(const QRegExpCharClass & cc)2475 void QRegExpEngine::Box::set(const QRegExpCharClass &cc)
2476 {
2477     ls.resize(1);
2478     ls[0] = eng->createState(cc);
2479     rs = ls;
2480 #ifndef QT_NO_REGEXP_OPTIM
2481     maxl = 1;
2482     occ1 = cc.firstOccurrence();
2483 #endif
2484     minl = 1;
2485 }
2486 
2487 #ifndef QT_NO_REGEXP_BACKREF
set(int bref)2488 void QRegExpEngine::Box::set(int bref)
2489 {
2490     ls.resize(1);
2491     ls[0] = eng->createState(bref);
2492     rs = ls;
2493     if (bref >= 1 && bref <= MaxBackRefs)
2494         skipanchors = Anchor_BackRef0Empty << bref;
2495 #ifndef QT_NO_REGEXP_OPTIM
2496     maxl = InftyLen;
2497 #endif
2498     minl = 0;
2499 }
2500 #endif
2501 
cat(const Box & b)2502 void QRegExpEngine::Box::cat(const Box &b)
2503 {
2504     eng->addCatTransitions(rs, b.ls);
2505     addAnchorsToEngine(b);
2506     if (minl == 0) {
2507         lanchors.unite(b.lanchors);
2508         if (skipanchors != 0) {
2509             for (int i = 0; i < b.ls.size(); i++) {
2510                 int a = eng->anchorConcatenation(lanchors.value(b.ls.at(i), 0), skipanchors);
2511                 lanchors.insert(b.ls.at(i), a);
2512             }
2513         }
2514         mergeInto(&ls, b.ls);
2515     }
2516     if (b.minl == 0) {
2517         ranchors.unite(b.ranchors);
2518         if (b.skipanchors != 0) {
2519             for (int i = 0; i < rs.size(); i++) {
2520                 int a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), b.skipanchors);
2521                 ranchors.insert(rs.at(i), a);
2522             }
2523         }
2524         mergeInto(&rs, b.rs);
2525     } else {
2526         ranchors = b.ranchors;
2527         rs = b.rs;
2528     }
2529 
2530 #ifndef QT_NO_REGEXP_OPTIM
2531     if (maxl != InftyLen) {
2532         if (rightStr.length() + b.leftStr.length() >
2533              qMax(str.length(), b.str.length())) {
2534             earlyStart = minl - rightStr.length();
2535             lateStart = maxl - rightStr.length();
2536             str = rightStr + b.leftStr;
2537         } else if (b.str.length() > str.length()) {
2538             earlyStart = minl + b.earlyStart;
2539             lateStart = maxl + b.lateStart;
2540             str = b.str;
2541         }
2542     }
2543 
2544     if (leftStr.length() == maxl)
2545         leftStr += b.leftStr;
2546 
2547     if (b.rightStr.length() == b.maxl) {
2548         rightStr += b.rightStr;
2549     } else {
2550         rightStr = b.rightStr;
2551     }
2552 
2553     if (maxl == InftyLen || b.maxl == InftyLen) {
2554         maxl = InftyLen;
2555     } else {
2556         maxl += b.maxl;
2557     }
2558 
2559     for (int i = 0; i < NumBadChars; i++) {
2560         if (b.occ1.at(i) != NoOccurrence && minl + b.occ1.at(i) < occ1.at(i))
2561             occ1[i] = minl + b.occ1.at(i);
2562     }
2563 #endif
2564 
2565     minl += b.minl;
2566     if (minl == 0)
2567         skipanchors = eng->anchorConcatenation(skipanchors, b.skipanchors);
2568     else
2569         skipanchors = 0;
2570 }
2571 
orx(const Box & b)2572 void QRegExpEngine::Box::orx(const Box &b)
2573 {
2574     mergeInto(&ls, b.ls);
2575     lanchors.unite(b.lanchors);
2576     mergeInto(&rs, b.rs);
2577     ranchors.unite(b.ranchors);
2578 
2579     if (b.minl == 0) {
2580         if (minl == 0)
2581             skipanchors = eng->anchorAlternation(skipanchors, b.skipanchors);
2582         else
2583             skipanchors = b.skipanchors;
2584     }
2585 
2586 #ifndef QT_NO_REGEXP_OPTIM
2587     for (int i = 0; i < NumBadChars; i++) {
2588         if (occ1.at(i) > b.occ1.at(i))
2589             occ1[i] = b.occ1.at(i);
2590     }
2591     earlyStart = 0;
2592     lateStart = 0;
2593     str = QString();
2594     leftStr = QString();
2595     rightStr = QString();
2596     if (b.maxl > maxl)
2597         maxl = b.maxl;
2598 #endif
2599     if (b.minl < minl)
2600         minl = b.minl;
2601 }
2602 
plus(int atom)2603 void QRegExpEngine::Box::plus(int atom)
2604 {
2605 #ifndef QT_NO_REGEXP_CAPTURE
2606     eng->addPlusTransitions(rs, ls, atom);
2607 #else
2608     Q_UNUSED(atom);
2609     eng->addCatTransitions(rs, ls);
2610 #endif
2611     addAnchorsToEngine(*this);
2612 #ifndef QT_NO_REGEXP_OPTIM
2613     maxl = InftyLen;
2614 #endif
2615 }
2616 
opt()2617 void QRegExpEngine::Box::opt()
2618 {
2619 #ifndef QT_NO_REGEXP_OPTIM
2620     earlyStart = 0;
2621     lateStart = 0;
2622     str = QString();
2623     leftStr = QString();
2624     rightStr = QString();
2625 #endif
2626     skipanchors = 0;
2627     minl = 0;
2628 }
2629 
catAnchor(int a)2630 void QRegExpEngine::Box::catAnchor(int a)
2631 {
2632     if (a != 0) {
2633         for (int i = 0; i < rs.size(); i++) {
2634             a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), a);
2635             ranchors.insert(rs.at(i), a);
2636         }
2637         if (minl == 0)
2638             skipanchors = eng->anchorConcatenation(skipanchors, a);
2639     }
2640 }
2641 
2642 #ifndef QT_NO_REGEXP_OPTIM
setupHeuristics()2643 void QRegExpEngine::Box::setupHeuristics()
2644 {
2645     eng->goodEarlyStart = earlyStart;
2646     eng->goodLateStart = lateStart;
2647     eng->goodStr = eng->cs ? str : str.toLower();
2648 
2649     eng->minl = minl;
2650     if (eng->cs) {
2651         /*
2652           A regular expression such as 112|1 has occ1['2'] = 2 and minl =
2653           1 at this point. An entry of occ1 has to be at most minl or
2654           infinity for the rest of the algorithm to go well.
2655 
2656           We waited until here before normalizing these cases (instead of
2657           doing it in Box::orx()) because sometimes things improve by
2658           themselves. Consider for example (112|1)34.
2659         */
2660         for (int i = 0; i < NumBadChars; i++) {
2661             if (occ1.at(i) != NoOccurrence && occ1.at(i) >= minl)
2662                 occ1[i] = minl;
2663         }
2664         eng->occ1 = occ1;
2665     } else {
2666         eng->occ1.fill(0, NumBadChars);
2667     }
2668 
2669     eng->heuristicallyChooseHeuristic();
2670 }
2671 #endif
2672 
2673 #if defined(QT_DEBUG)
dump() const2674 void QRegExpEngine::Box::dump() const
2675 {
2676     int i;
2677     qDebug("Box of at least %d character%s", minl, minl == 1 ? "" : "s");
2678     qDebug("  Left states:");
2679     for (i = 0; i < ls.size(); i++) {
2680         if (lanchors.value(ls[i], 0) == 0)
2681             qDebug("    %d", ls[i]);
2682         else
2683             qDebug("    %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]]);
2684     }
2685     qDebug("  Right states:");
2686     for (i = 0; i < rs.size(); i++) {
2687         if (ranchors.value(rs[i], 0) == 0)
2688             qDebug("    %d", rs[i]);
2689         else
2690             qDebug("    %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]]);
2691     }
2692     qDebug("  Skip anchors: 0x%.8x", skipanchors);
2693 }
2694 #endif
2695 
addAnchorsToEngine(const Box & to) const2696 void QRegExpEngine::Box::addAnchorsToEngine(const Box &to) const
2697 {
2698     for (int i = 0; i < to.ls.size(); i++) {
2699         for (int j = 0; j < rs.size(); j++) {
2700             int a = eng->anchorConcatenation(ranchors.value(rs.at(j), 0),
2701                                              to.lanchors.value(to.ls.at(i), 0));
2702             eng->addAnchors(rs[j], to.ls[i], a);
2703         }
2704     }
2705 }
2706 
setupCategoriesRangeMap()2707 void QRegExpEngine::setupCategoriesRangeMap()
2708 {
2709    categoriesRangeMap.insert("IsBasicLatin",                           qMakePair(0x0000, 0x007F));
2710    categoriesRangeMap.insert("IsLatin-1Supplement",                    qMakePair(0x0080, 0x00FF));
2711    categoriesRangeMap.insert("IsLatinExtended-A",                      qMakePair(0x0100, 0x017F));
2712    categoriesRangeMap.insert("IsLatinExtended-B",                      qMakePair(0x0180, 0x024F));
2713    categoriesRangeMap.insert("IsIPAExtensions",                        qMakePair(0x0250, 0x02AF));
2714    categoriesRangeMap.insert("IsSpacingModifierLetters",               qMakePair(0x02B0, 0x02FF));
2715    categoriesRangeMap.insert("IsCombiningDiacriticalMarks",            qMakePair(0x0300, 0x036F));
2716    categoriesRangeMap.insert("IsGreek",                                qMakePair(0x0370, 0x03FF));
2717    categoriesRangeMap.insert("IsCyrillic",                             qMakePair(0x0400, 0x04FF));
2718    categoriesRangeMap.insert("IsCyrillicSupplement",                   qMakePair(0x0500, 0x052F));
2719    categoriesRangeMap.insert("IsArmenian",                             qMakePair(0x0530, 0x058F));
2720    categoriesRangeMap.insert("IsHebrew",                               qMakePair(0x0590, 0x05FF));
2721    categoriesRangeMap.insert("IsArabic",                               qMakePair(0x0600, 0x06FF));
2722    categoriesRangeMap.insert("IsSyriac",                               qMakePair(0x0700, 0x074F));
2723    categoriesRangeMap.insert("IsArabicSupplement",                     qMakePair(0x0750, 0x077F));
2724    categoriesRangeMap.insert("IsThaana",                               qMakePair(0x0780, 0x07BF));
2725    categoriesRangeMap.insert("IsDevanagari",                           qMakePair(0x0900, 0x097F));
2726    categoriesRangeMap.insert("IsBengali",                              qMakePair(0x0980, 0x09FF));
2727    categoriesRangeMap.insert("IsGurmukhi",                             qMakePair(0x0A00, 0x0A7F));
2728    categoriesRangeMap.insert("IsGujarati",                             qMakePair(0x0A80, 0x0AFF));
2729    categoriesRangeMap.insert("IsOriya",                                qMakePair(0x0B00, 0x0B7F));
2730    categoriesRangeMap.insert("IsTamil",                                qMakePair(0x0B80, 0x0BFF));
2731    categoriesRangeMap.insert("IsTelugu",                               qMakePair(0x0C00, 0x0C7F));
2732    categoriesRangeMap.insert("IsKannada",                              qMakePair(0x0C80, 0x0CFF));
2733    categoriesRangeMap.insert("IsMalayalam",                            qMakePair(0x0D00, 0x0D7F));
2734    categoriesRangeMap.insert("IsSinhala",                              qMakePair(0x0D80, 0x0DFF));
2735    categoriesRangeMap.insert("IsThai",                                 qMakePair(0x0E00, 0x0E7F));
2736    categoriesRangeMap.insert("IsLao",                                  qMakePair(0x0E80, 0x0EFF));
2737    categoriesRangeMap.insert("IsTibetan",                              qMakePair(0x0F00, 0x0FFF));
2738    categoriesRangeMap.insert("IsMyanmar",                              qMakePair(0x1000, 0x109F));
2739    categoriesRangeMap.insert("IsGeorgian",                             qMakePair(0x10A0, 0x10FF));
2740    categoriesRangeMap.insert("IsHangulJamo",                           qMakePair(0x1100, 0x11FF));
2741    categoriesRangeMap.insert("IsEthiopic",                             qMakePair(0x1200, 0x137F));
2742    categoriesRangeMap.insert("IsEthiopicSupplement",                   qMakePair(0x1380, 0x139F));
2743    categoriesRangeMap.insert("IsCherokee",                             qMakePair(0x13A0, 0x13FF));
2744    categoriesRangeMap.insert("IsUnifiedCanadianAboriginalSyllabics",   qMakePair(0x1400, 0x167F));
2745    categoriesRangeMap.insert("IsOgham",                                qMakePair(0x1680, 0x169F));
2746    categoriesRangeMap.insert("IsRunic",                                qMakePair(0x16A0, 0x16FF));
2747    categoriesRangeMap.insert("IsTagalog",                              qMakePair(0x1700, 0x171F));
2748    categoriesRangeMap.insert("IsHanunoo",                              qMakePair(0x1720, 0x173F));
2749    categoriesRangeMap.insert("IsBuhid",                                qMakePair(0x1740, 0x175F));
2750    categoriesRangeMap.insert("IsTagbanwa",                             qMakePair(0x1760, 0x177F));
2751    categoriesRangeMap.insert("IsKhmer",                                qMakePair(0x1780, 0x17FF));
2752    categoriesRangeMap.insert("IsMongolian",                            qMakePair(0x1800, 0x18AF));
2753    categoriesRangeMap.insert("IsLimbu",                                qMakePair(0x1900, 0x194F));
2754    categoriesRangeMap.insert("IsTaiLe",                                qMakePair(0x1950, 0x197F));
2755    categoriesRangeMap.insert("IsNewTaiLue",                            qMakePair(0x1980, 0x19DF));
2756    categoriesRangeMap.insert("IsKhmerSymbols",                         qMakePair(0x19E0, 0x19FF));
2757    categoriesRangeMap.insert("IsBuginese",                             qMakePair(0x1A00, 0x1A1F));
2758    categoriesRangeMap.insert("IsPhoneticExtensions",                   qMakePair(0x1D00, 0x1D7F));
2759    categoriesRangeMap.insert("IsPhoneticExtensionsSupplement",         qMakePair(0x1D80, 0x1DBF));
2760    categoriesRangeMap.insert("IsCombiningDiacriticalMarksSupplement",  qMakePair(0x1DC0, 0x1DFF));
2761    categoriesRangeMap.insert("IsLatinExtendedAdditional",              qMakePair(0x1E00, 0x1EFF));
2762    categoriesRangeMap.insert("IsGreekExtended",                        qMakePair(0x1F00, 0x1FFF));
2763    categoriesRangeMap.insert("IsGeneralPunctuation",                   qMakePair(0x2000, 0x206F));
2764    categoriesRangeMap.insert("IsSuperscriptsandSubscripts",            qMakePair(0x2070, 0x209F));
2765    categoriesRangeMap.insert("IsCurrencySymbols",                      qMakePair(0x20A0, 0x20CF));
2766    categoriesRangeMap.insert("IsCombiningMarksforSymbols",             qMakePair(0x20D0, 0x20FF));
2767    categoriesRangeMap.insert("IsLetterlikeSymbols",                    qMakePair(0x2100, 0x214F));
2768    categoriesRangeMap.insert("IsNumberForms",                          qMakePair(0x2150, 0x218F));
2769    categoriesRangeMap.insert("IsArrows",                               qMakePair(0x2190, 0x21FF));
2770    categoriesRangeMap.insert("IsMathematicalOperators",                qMakePair(0x2200, 0x22FF));
2771    categoriesRangeMap.insert("IsMiscellaneousTechnical",               qMakePair(0x2300, 0x23FF));
2772    categoriesRangeMap.insert("IsControlPictures",                      qMakePair(0x2400, 0x243F));
2773    categoriesRangeMap.insert("IsOpticalCharacterRecognition",          qMakePair(0x2440, 0x245F));
2774    categoriesRangeMap.insert("IsEnclosedAlphanumerics",                qMakePair(0x2460, 0x24FF));
2775    categoriesRangeMap.insert("IsBoxDrawing",                           qMakePair(0x2500, 0x257F));
2776    categoriesRangeMap.insert("IsBlockElements",                        qMakePair(0x2580, 0x259F));
2777    categoriesRangeMap.insert("IsGeometricShapes",                      qMakePair(0x25A0, 0x25FF));
2778    categoriesRangeMap.insert("IsMiscellaneousSymbols",                 qMakePair(0x2600, 0x26FF));
2779    categoriesRangeMap.insert("IsDingbats",                             qMakePair(0x2700, 0x27BF));
2780    categoriesRangeMap.insert("IsMiscellaneousMathematicalSymbols-A",   qMakePair(0x27C0, 0x27EF));
2781    categoriesRangeMap.insert("IsSupplementalArrows-A",                 qMakePair(0x27F0, 0x27FF));
2782    categoriesRangeMap.insert("IsBraillePatterns",                      qMakePair(0x2800, 0x28FF));
2783    categoriesRangeMap.insert("IsSupplementalArrows-B",                 qMakePair(0x2900, 0x297F));
2784    categoriesRangeMap.insert("IsMiscellaneousMathematicalSymbols-B",   qMakePair(0x2980, 0x29FF));
2785    categoriesRangeMap.insert("IsSupplementalMathematicalOperators",    qMakePair(0x2A00, 0x2AFF));
2786    categoriesRangeMap.insert("IsMiscellaneousSymbolsandArrows",        qMakePair(0x2B00, 0x2BFF));
2787    categoriesRangeMap.insert("IsGlagolitic",                           qMakePair(0x2C00, 0x2C5F));
2788    categoriesRangeMap.insert("IsCoptic",                               qMakePair(0x2C80, 0x2CFF));
2789    categoriesRangeMap.insert("IsGeorgianSupplement",                   qMakePair(0x2D00, 0x2D2F));
2790    categoriesRangeMap.insert("IsTifinagh",                             qMakePair(0x2D30, 0x2D7F));
2791    categoriesRangeMap.insert("IsEthiopicExtended",                     qMakePair(0x2D80, 0x2DDF));
2792    categoriesRangeMap.insert("IsSupplementalPunctuation",              qMakePair(0x2E00, 0x2E7F));
2793    categoriesRangeMap.insert("IsCJKRadicalsSupplement",                qMakePair(0x2E80, 0x2EFF));
2794    categoriesRangeMap.insert("IsKangxiRadicals",                       qMakePair(0x2F00, 0x2FDF));
2795    categoriesRangeMap.insert("IsIdeographicDescriptionCharacters",     qMakePair(0x2FF0, 0x2FFF));
2796    categoriesRangeMap.insert("IsCJKSymbolsandPunctuation",             qMakePair(0x3000, 0x303F));
2797    categoriesRangeMap.insert("IsHiragana",                             qMakePair(0x3040, 0x309F));
2798    categoriesRangeMap.insert("IsKatakana",                             qMakePair(0x30A0, 0x30FF));
2799    categoriesRangeMap.insert("IsBopomofo",                             qMakePair(0x3100, 0x312F));
2800    categoriesRangeMap.insert("IsHangulCompatibilityJamo",              qMakePair(0x3130, 0x318F));
2801    categoriesRangeMap.insert("IsKanbun",                               qMakePair(0x3190, 0x319F));
2802    categoriesRangeMap.insert("IsBopomofoExtended",                     qMakePair(0x31A0, 0x31BF));
2803    categoriesRangeMap.insert("IsCJKStrokes",                           qMakePair(0x31C0, 0x31EF));
2804    categoriesRangeMap.insert("IsKatakanaPhoneticExtensions",           qMakePair(0x31F0, 0x31FF));
2805    categoriesRangeMap.insert("IsEnclosedCJKLettersandMonths",          qMakePair(0x3200, 0x32FF));
2806    categoriesRangeMap.insert("IsCJKCompatibility",                     qMakePair(0x3300, 0x33FF));
2807    categoriesRangeMap.insert("IsCJKUnifiedIdeographsExtensionA",       qMakePair(0x3400, 0x4DB5));
2808    categoriesRangeMap.insert("IsYijingHexagramSymbols",                qMakePair(0x4DC0, 0x4DFF));
2809    categoriesRangeMap.insert("IsCJKUnifiedIdeographs",                 qMakePair(0x4E00, 0x9FFF));
2810    categoriesRangeMap.insert("IsYiSyllables",                          qMakePair(0xA000, 0xA48F));
2811    categoriesRangeMap.insert("IsYiRadicals",                           qMakePair(0xA490, 0xA4CF));
2812    categoriesRangeMap.insert("IsModifierToneLetters",                  qMakePair(0xA700, 0xA71F));
2813    categoriesRangeMap.insert("IsSylotiNagri",                          qMakePair(0xA800, 0xA82F));
2814    categoriesRangeMap.insert("IsHangulSyllables",                      qMakePair(0xAC00, 0xD7A3));
2815    categoriesRangeMap.insert("IsPrivateUse",                           qMakePair(0xE000, 0xF8FF));
2816    categoriesRangeMap.insert("IsCJKCompatibilityIdeographs",           qMakePair(0xF900, 0xFAFF));
2817    categoriesRangeMap.insert("IsAlphabeticPresentationForms",          qMakePair(0xFB00, 0xFB4F));
2818    categoriesRangeMap.insert("IsArabicPresentationForms-A",            qMakePair(0xFB50, 0xFDFF));
2819    categoriesRangeMap.insert("IsVariationSelectors",                   qMakePair(0xFE00, 0xFE0F));
2820    categoriesRangeMap.insert("IsVerticalForms",                        qMakePair(0xFE10, 0xFE1F));
2821    categoriesRangeMap.insert("IsCombiningHalfMarks",                   qMakePair(0xFE20, 0xFE2F));
2822    categoriesRangeMap.insert("IsCJKCompatibilityForms",                qMakePair(0xFE30, 0xFE4F));
2823    categoriesRangeMap.insert("IsSmallFormVariants",                    qMakePair(0xFE50, 0xFE6F));
2824    categoriesRangeMap.insert("IsArabicPresentationForms-B",            qMakePair(0xFE70, 0xFEFF));
2825    categoriesRangeMap.insert("IsHalfwidthandFullwidthForms",           qMakePair(0xFF00, 0xFFEF));
2826    categoriesRangeMap.insert("IsSpecials",                             qMakePair(0xFFF0, 0xFFFF));
2827    categoriesRangeMap.insert("IsLinearBSyllabary",                     qMakePair(0x10000, 0x1007F));
2828    categoriesRangeMap.insert("IsLinearBIdeograms",                     qMakePair(0x10080, 0x100FF));
2829    categoriesRangeMap.insert("IsAegeanNumbers",                        qMakePair(0x10100, 0x1013F));
2830    categoriesRangeMap.insert("IsAncientGreekNumbers",                  qMakePair(0x10140, 0x1018F));
2831    categoriesRangeMap.insert("IsOldItalic",                            qMakePair(0x10300, 0x1032F));
2832    categoriesRangeMap.insert("IsGothic",                               qMakePair(0x10330, 0x1034F));
2833    categoriesRangeMap.insert("IsUgaritic",                             qMakePair(0x10380, 0x1039F));
2834    categoriesRangeMap.insert("IsOldPersian",                           qMakePair(0x103A0, 0x103DF));
2835    categoriesRangeMap.insert("IsDeseret",                              qMakePair(0x10400, 0x1044F));
2836    categoriesRangeMap.insert("IsShavian",                              qMakePair(0x10450, 0x1047F));
2837    categoriesRangeMap.insert("IsOsmanya",                              qMakePair(0x10480, 0x104AF));
2838    categoriesRangeMap.insert("IsCypriotSyllabary",                     qMakePair(0x10800, 0x1083F));
2839    categoriesRangeMap.insert("IsKharoshthi",                           qMakePair(0x10A00, 0x10A5F));
2840    categoriesRangeMap.insert("IsByzantineMusicalSymbols",              qMakePair(0x1D000, 0x1D0FF));
2841    categoriesRangeMap.insert("IsMusicalSymbols",                       qMakePair(0x1D100, 0x1D1FF));
2842    categoriesRangeMap.insert("IsAncientGreekMusicalNotation",          qMakePair(0x1D200, 0x1D24F));
2843    categoriesRangeMap.insert("IsTaiXuanJingSymbols",                   qMakePair(0x1D300, 0x1D35F));
2844    categoriesRangeMap.insert("IsMathematicalAlphanumericSymbols",      qMakePair(0x1D400, 0x1D7FF));
2845    categoriesRangeMap.insert("IsCJKUnifiedIdeographsExtensionB",       qMakePair(0x20000, 0x2A6DF));
2846    categoriesRangeMap.insert("IsCJKCompatibilityIdeographsSupplement", qMakePair(0x2F800, 0x2FA1F));
2847    categoriesRangeMap.insert("IsTags",                                 qMakePair(0xE0000, 0xE007F));
2848    categoriesRangeMap.insert("IsVariationSelectorsSupplement",         qMakePair(0xE0100, 0xE01EF));
2849    categoriesRangeMap.insert("IsSupplementaryPrivateUseArea-A",        qMakePair(0xF0000, 0xFFFFF));
2850    categoriesRangeMap.insert("IsSupplementaryPrivateUseArea-B",        qMakePair(0x100000, 0x10FFFF));
2851 }
2852 
getChar()2853 int QRegExpEngine::getChar()
2854 {
2855     return (yyPos == yyLen) ? EOS : yyIn[yyPos++].unicode();
2856 }
2857 
getEscape()2858 int QRegExpEngine::getEscape()
2859 {
2860 #ifndef QT_NO_REGEXP_ESCAPE
2861     const char tab[] = "afnrtv"; // no b, as \b means word boundary
2862     const char backTab[] = "\a\f\n\r\t\v";
2863     ushort low;
2864     int i;
2865 #endif
2866     ushort val;
2867     int prevCh = yyCh;
2868 
2869     if (prevCh == EOS) {
2870         error(RXERR_END);
2871         return Tok_Char | '\\';
2872     }
2873     yyCh = getChar();
2874 #ifndef QT_NO_REGEXP_ESCAPE
2875     if ((prevCh & ~0xff) == 0) {
2876         const char *p = strchr(tab, prevCh);
2877         if (p != 0)
2878             return Tok_Char | backTab[p - tab];
2879     }
2880 #endif
2881 
2882     switch (prevCh) {
2883 #ifndef QT_NO_REGEXP_ESCAPE
2884     case '0':
2885         val = 0;
2886         for (i = 0; i < 3; i++) {
2887             if (yyCh >= '0' && yyCh <= '7')
2888                 val = (val << 3) | (yyCh - '0');
2889             else
2890                 break;
2891             yyCh = getChar();
2892         }
2893         if ((val & ~0377) != 0)
2894             error(RXERR_OCTAL);
2895         return Tok_Char | val;
2896 #endif
2897 #ifndef QT_NO_REGEXP_ESCAPE
2898     case 'B':
2899         return Tok_NonWord;
2900 #endif
2901 #ifndef QT_NO_REGEXP_CCLASS
2902     case 'D':
2903         // see QChar::isDigit()
2904         yyCharClass->addCategories(0x7fffffef);
2905         return Tok_CharClass;
2906     case 'S':
2907         // see QChar::isSpace()
2908         yyCharClass->addCategories(0x7ffff87f);
2909         yyCharClass->addRange(0x0000, 0x0008);
2910         yyCharClass->addRange(0x000e, 0x001f);
2911         yyCharClass->addRange(0x007f, 0x009f);
2912         return Tok_CharClass;
2913     case 'W':
2914         // see QChar::isLetterOrNumber() and QChar::isMark()
2915         yyCharClass->addCategories(0x7fe07f81);
2916         yyCharClass->addRange(0x203f, 0x2040);
2917         yyCharClass->addSingleton(0x2040);
2918         yyCharClass->addSingleton(0x2054);
2919         yyCharClass->addSingleton(0x30fb);
2920         yyCharClass->addRange(0xfe33, 0xfe34);
2921         yyCharClass->addRange(0xfe4d, 0xfe4f);
2922         yyCharClass->addSingleton(0xff3f);
2923         yyCharClass->addSingleton(0xff65);
2924         return Tok_CharClass;
2925 #endif
2926 #ifndef QT_NO_REGEXP_ESCAPE
2927     case 'b':
2928         return Tok_Word;
2929 #endif
2930 #ifndef QT_NO_REGEXP_CCLASS
2931     case 'd':
2932         // see QChar::isDigit()
2933         yyCharClass->addCategories(0x00000010);
2934         return Tok_CharClass;
2935     case 's':
2936         // see QChar::isSpace()
2937         yyCharClass->addCategories(0x00000380);
2938         yyCharClass->addRange(0x0009, 0x000d);
2939         return Tok_CharClass;
2940     case 'w':
2941         // see QChar::isLetterOrNumber() and QChar::isMark()
2942         yyCharClass->addCategories(0x000f807e);
2943         yyCharClass->addSingleton(0x005f); // '_'
2944         return Tok_CharClass;
2945     case 'I':
2946         if (xmlSchemaExtensions) {
2947             yyCharClass->setNegative(!yyCharClass->negative());
2948             // fall through
2949         }
2950     case 'i':
2951         if (xmlSchemaExtensions) {
2952             yyCharClass->addCategories(0x000f807e);
2953             yyCharClass->addSingleton(0x003a); // ':'
2954             yyCharClass->addSingleton(0x005f); // '_'
2955             yyCharClass->addRange(0x0041, 0x005a); // [A-Z]
2956             yyCharClass->addRange(0x0061, 0x007a); // [a-z]
2957             yyCharClass->addRange(0xc0, 0xd6);
2958             yyCharClass->addRange(0xd8, 0xf6);
2959             yyCharClass->addRange(0xf8, 0x2ff);
2960             yyCharClass->addRange(0x370, 0x37d);
2961             yyCharClass->addRange(0x37f, 0x1fff);
2962             yyCharClass->addRange(0x200c, 0x200d);
2963             yyCharClass->addRange(0x2070, 0x218f);
2964             yyCharClass->addRange(0x2c00, 0x2fef);
2965             yyCharClass->addRange(0x3001, 0xd7ff);
2966             yyCharClass->addRange(0xf900, 0xfdcf);
2967             yyCharClass->addRange(0xfdf0, 0xfffd);
2968             yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);
2969         }
2970         return Tok_CharClass;
2971     case 'C':
2972         if (xmlSchemaExtensions) {
2973             yyCharClass->setNegative(!yyCharClass->negative());
2974             // fall through
2975         }
2976     case 'c':
2977         if (xmlSchemaExtensions) {
2978             yyCharClass->addCategories(0x000f807e);
2979             yyCharClass->addSingleton(0x002d); // '-'
2980             yyCharClass->addSingleton(0x002e); // '.'
2981             yyCharClass->addSingleton(0x003a); // ':'
2982             yyCharClass->addSingleton(0x005f); // '_'
2983             yyCharClass->addSingleton(0xb7);
2984             yyCharClass->addRange(0x0030, 0x0039); // [0-9]
2985             yyCharClass->addRange(0x0041, 0x005a); // [A-Z]
2986             yyCharClass->addRange(0x0061, 0x007a); // [a-z]
2987             yyCharClass->addRange(0xc0, 0xd6);
2988             yyCharClass->addRange(0xd8, 0xf6);
2989             yyCharClass->addRange(0xf8, 0x2ff);
2990             yyCharClass->addRange(0x370, 0x37d);
2991             yyCharClass->addRange(0x37f, 0x1fff);
2992             yyCharClass->addRange(0x200c, 0x200d);
2993             yyCharClass->addRange(0x2070, 0x218f);
2994             yyCharClass->addRange(0x2c00, 0x2fef);
2995             yyCharClass->addRange(0x3001, 0xd7ff);
2996             yyCharClass->addRange(0xf900, 0xfdcf);
2997             yyCharClass->addRange(0xfdf0, 0xfffd);
2998             yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);
2999             yyCharClass->addRange(0x0300, 0x036f);
3000             yyCharClass->addRange(0x203f, 0x2040);
3001         }
3002         return Tok_CharClass;
3003     case 'P':
3004         if (xmlSchemaExtensions) {
3005             yyCharClass->setNegative(!yyCharClass->negative());
3006             // fall through
3007         }
3008     case 'p':
3009         if (xmlSchemaExtensions) {
3010             if (yyCh != '{') {
3011                 error(RXERR_CHARCLASS);
3012                 return Tok_CharClass;
3013             }
3014 
3015             QByteArray category;
3016             yyCh = getChar();
3017             while (yyCh != '}') {
3018                 if (yyCh == EOS) {
3019                     error(RXERR_END);
3020                     return Tok_CharClass;
3021                 }
3022                 category.append(yyCh);
3023                 yyCh = getChar();
3024             }
3025             yyCh = getChar(); // skip closing '}'
3026 
3027             if (category == "M") {
3028                 yyCharClass->addCategories(0x0000000e);
3029             } else if (category == "Mn") {
3030                 yyCharClass->addCategories(0x00000002);
3031             } else if (category == "Mc") {
3032                 yyCharClass->addCategories(0x00000004);
3033             } else if (category == "Me") {
3034                 yyCharClass->addCategories(0x00000008);
3035             } else if (category == "N") {
3036                 yyCharClass->addCategories(0x00000070);
3037             } else if (category == "Nd") {
3038                 yyCharClass->addCategories(0x00000010);
3039             } else if (category == "Nl") {
3040                 yyCharClass->addCategories(0x00000020);
3041             } else if (category == "No") {
3042                 yyCharClass->addCategories(0x00000040);
3043             } else if (category == "Z") {
3044                 yyCharClass->addCategories(0x00000380);
3045             } else if (category == "Zs") {
3046                 yyCharClass->addCategories(0x00000080);
3047             } else if (category == "Zl") {
3048                 yyCharClass->addCategories(0x00000100);
3049             } else if (category == "Zp") {
3050                 yyCharClass->addCategories(0x00000200);
3051             } else if (category == "C") {
3052                 yyCharClass->addCategories(0x00006c00);
3053             } else if (category == "Cc") {
3054                 yyCharClass->addCategories(0x00000400);
3055             } else if (category == "Cf") {
3056                 yyCharClass->addCategories(0x00000800);
3057             } else if (category == "Cs") {
3058                 yyCharClass->addCategories(0x00001000);
3059             } else if (category == "Co") {
3060                 yyCharClass->addCategories(0x00002000);
3061             } else if (category == "Cn") {
3062                 yyCharClass->addCategories(0x00004000);
3063             } else if (category == "L") {
3064                 yyCharClass->addCategories(0x000f8000);
3065             } else if (category == "Lu") {
3066                 yyCharClass->addCategories(0x00008000);
3067             } else if (category == "Ll") {
3068                 yyCharClass->addCategories(0x00010000);
3069             } else if (category == "Lt") {
3070                 yyCharClass->addCategories(0x00020000);
3071             } else if (category == "Lm") {
3072                 yyCharClass->addCategories(0x00040000);
3073             } else if (category == "Lo") {
3074                 yyCharClass->addCategories(0x00080000);
3075             } else if (category == "P") {
3076                 yyCharClass->addCategories(0x4f580780);
3077             } else if (category == "Pc") {
3078                 yyCharClass->addCategories(0x00100000);
3079             } else if (category == "Pd") {
3080                 yyCharClass->addCategories(0x00200000);
3081             } else if (category == "Ps") {
3082                 yyCharClass->addCategories(0x00400000);
3083             } else if (category == "Pe") {
3084                 yyCharClass->addCategories(0x00800000);
3085             } else if (category == "Pi") {
3086                 yyCharClass->addCategories(0x01000000);
3087             } else if (category == "Pf") {
3088                 yyCharClass->addCategories(0x02000000);
3089             } else if (category == "Po") {
3090                 yyCharClass->addCategories(0x04000000);
3091             } else if (category == "S") {
3092                 yyCharClass->addCategories(0x78000000);
3093             } else if (category == "Sm") {
3094                 yyCharClass->addCategories(0x08000000);
3095             } else if (category == "Sc") {
3096                 yyCharClass->addCategories(0x10000000);
3097             } else if (category == "Sk") {
3098                 yyCharClass->addCategories(0x20000000);
3099             } else if (category == "So") {
3100                 yyCharClass->addCategories(0x40000000);
3101             } else if (category.startsWith("Is")) {
3102                 if (categoriesRangeMap.isEmpty())
3103                     setupCategoriesRangeMap();
3104 
3105                 if (categoriesRangeMap.contains(category)) {
3106                     const QPair<int, int> range = categoriesRangeMap.value(category);
3107                     yyCharClass->addRange(range.first, range.second);
3108                 } else {
3109                     error(RXERR_CATEGORY);
3110                 }
3111             } else {
3112                 error(RXERR_CATEGORY);
3113             }
3114         }
3115         return Tok_CharClass;
3116 #endif
3117 #ifndef QT_NO_REGEXP_ESCAPE
3118     case 'x':
3119         val = 0;
3120         for (i = 0; i < 4; i++) {
3121             low = QChar(yyCh).toLower().unicode();
3122             if (low >= '0' && low <= '9')
3123                 val = (val << 4) | (low - '0');
3124             else if (low >= 'a' && low <= 'f')
3125                 val = (val << 4) | (low - 'a' + 10);
3126             else
3127                 break;
3128             yyCh = getChar();
3129         }
3130         return Tok_Char | val;
3131 #endif
3132     default:
3133         if (prevCh >= '1' && prevCh <= '9') {
3134 #ifndef QT_NO_REGEXP_BACKREF
3135             val = prevCh - '0';
3136             while (yyCh >= '0' && yyCh <= '9') {
3137                 val = (val * 10) + (yyCh - '0');
3138                 yyCh = getChar();
3139             }
3140             return Tok_BackRef | val;
3141 #else
3142             error(RXERR_DISABLED);
3143 #endif
3144         }
3145         return Tok_Char | prevCh;
3146     }
3147 }
3148 
3149 #ifndef QT_NO_REGEXP_INTERVAL
getRep(int def)3150 int QRegExpEngine::getRep(int def)
3151 {
3152     if (yyCh >= '0' && yyCh <= '9') {
3153         int rep = 0;
3154         do {
3155             rep = 10 * rep + yyCh - '0';
3156             if (rep >= InftyRep) {
3157                 error(RXERR_REPETITION);
3158                 rep = def;
3159             }
3160             yyCh = getChar();
3161         } while (yyCh >= '0' && yyCh <= '9');
3162         return rep;
3163     } else {
3164         return def;
3165     }
3166 }
3167 #endif
3168 
3169 #ifndef QT_NO_REGEXP_LOOKAHEAD
skipChars(int n)3170 void QRegExpEngine::skipChars(int n)
3171 {
3172     if (n > 0) {
3173         yyPos += n - 1;
3174         yyCh = getChar();
3175     }
3176 }
3177 #endif
3178 
error(const char * msg)3179 void QRegExpEngine::error(const char *msg)
3180 {
3181     if (yyError.isEmpty())
3182         yyError = QLatin1String(msg);
3183 }
3184 
startTokenizer(const QChar * rx,int len)3185 void QRegExpEngine::startTokenizer(const QChar *rx, int len)
3186 {
3187     yyIn = rx;
3188     yyPos0 = 0;
3189     yyPos = 0;
3190     yyLen = len;
3191     yyCh = getChar();
3192     yyCharClass.reset(new QRegExpCharClass);
3193     yyMinRep = 0;
3194     yyMaxRep = 0;
3195     yyError = QString();
3196 }
3197 
getToken()3198 int QRegExpEngine::getToken()
3199 {
3200 #ifndef QT_NO_REGEXP_CCLASS
3201     ushort pendingCh = 0;
3202     bool charPending;
3203     bool rangePending;
3204     int tok;
3205 #endif
3206     int prevCh = yyCh;
3207 
3208     yyPos0 = yyPos - 1;
3209 #ifndef QT_NO_REGEXP_CCLASS
3210     yyCharClass->clear();
3211 #endif
3212     yyMinRep = 0;
3213     yyMaxRep = 0;
3214     yyCh = getChar();
3215 
3216     switch (prevCh) {
3217     case EOS:
3218         yyPos0 = yyPos;
3219         return Tok_Eos;
3220     case '$':
3221         return Tok_Dollar;
3222     case '(':
3223         if (yyCh == '?') {
3224             prevCh = getChar();
3225             yyCh = getChar();
3226             switch (prevCh) {
3227 #ifndef QT_NO_REGEXP_LOOKAHEAD
3228             case '!':
3229                 return Tok_NegLookahead;
3230             case '=':
3231                 return Tok_PosLookahead;
3232 #endif
3233             case ':':
3234                 return Tok_MagicLeftParen;
3235             case '<':
3236                 error(RXERR_LOOKBEHIND);
3237                 return Tok_MagicLeftParen;
3238             default:
3239                 error(RXERR_LOOKAHEAD);
3240                 return Tok_MagicLeftParen;
3241             }
3242         } else {
3243             return Tok_LeftParen;
3244         }
3245     case ')':
3246         return Tok_RightParen;
3247     case '*':
3248         yyMinRep = 0;
3249         yyMaxRep = InftyRep;
3250         return Tok_Quantifier;
3251     case '+':
3252         yyMinRep = 1;
3253         yyMaxRep = InftyRep;
3254         return Tok_Quantifier;
3255     case '.':
3256 #ifndef QT_NO_REGEXP_CCLASS
3257         yyCharClass->setNegative(true);
3258 #endif
3259         return Tok_CharClass;
3260     case '?':
3261         yyMinRep = 0;
3262         yyMaxRep = 1;
3263         return Tok_Quantifier;
3264     case '[':
3265 #ifndef QT_NO_REGEXP_CCLASS
3266         if (yyCh == '^') {
3267             yyCharClass->setNegative(true);
3268             yyCh = getChar();
3269         }
3270         charPending = false;
3271         rangePending = false;
3272         do {
3273             if (yyCh == '-' && charPending && !rangePending) {
3274                 rangePending = true;
3275                 yyCh = getChar();
3276             } else {
3277                 if (charPending && !rangePending) {
3278                     yyCharClass->addSingleton(pendingCh);
3279                     charPending = false;
3280                 }
3281                 if (yyCh == '\\') {
3282                     yyCh = getChar();
3283                     tok = getEscape();
3284                     if (tok == Tok_Word)
3285                         tok = '\b';
3286                 } else {
3287                     tok = Tok_Char | yyCh;
3288                     yyCh = getChar();
3289                 }
3290                 if (tok == Tok_CharClass) {
3291                     if (rangePending) {
3292                         yyCharClass->addSingleton('-');
3293                         yyCharClass->addSingleton(pendingCh);
3294                         charPending = false;
3295                         rangePending = false;
3296                     }
3297                 } else if ((tok & Tok_Char) != 0) {
3298                     if (rangePending) {
3299                         yyCharClass->addRange(pendingCh, tok ^ Tok_Char);
3300                         charPending = false;
3301                         rangePending = false;
3302                     } else {
3303                         pendingCh = tok ^ Tok_Char;
3304                         charPending = true;
3305                     }
3306                 } else {
3307                     error(RXERR_CHARCLASS);
3308                 }
3309             }
3310         }  while (yyCh != ']' && yyCh != EOS);
3311         if (rangePending)
3312             yyCharClass->addSingleton('-');
3313         if (charPending)
3314             yyCharClass->addSingleton(pendingCh);
3315         if (yyCh == EOS)
3316             error(RXERR_END);
3317         else
3318             yyCh = getChar();
3319         return Tok_CharClass;
3320 #else
3321         error(RXERR_END);
3322         return Tok_Char | '[';
3323 #endif
3324     case '\\':
3325         return getEscape();
3326     case ']':
3327         error(RXERR_LEFTDELIM);
3328         return Tok_Char | ']';
3329     case '^':
3330         return Tok_Caret;
3331     case '{':
3332 #ifndef QT_NO_REGEXP_INTERVAL
3333         yyMinRep = getRep(0);
3334         yyMaxRep = yyMinRep;
3335         if (yyCh == ',') {
3336             yyCh = getChar();
3337             yyMaxRep = getRep(InftyRep);
3338         }
3339         if (yyMaxRep < yyMinRep)
3340             error(RXERR_INTERVAL);
3341         if (yyCh != '}')
3342             error(RXERR_REPETITION);
3343         yyCh = getChar();
3344         return Tok_Quantifier;
3345 #else
3346         error(RXERR_DISABLED);
3347         return Tok_Char | '{';
3348 #endif
3349     case '|':
3350         return Tok_Bar;
3351     case '}':
3352         error(RXERR_LEFTDELIM);
3353         return Tok_Char | '}';
3354     default:
3355         return Tok_Char | prevCh;
3356     }
3357 }
3358 
parse(const QChar * pattern,int len)3359 int QRegExpEngine::parse(const QChar *pattern, int len)
3360 {
3361     valid = true;
3362     startTokenizer(pattern, len);
3363     yyTok = getToken();
3364 #ifndef QT_NO_REGEXP_CAPTURE
3365     yyMayCapture = true;
3366 #else
3367     yyMayCapture = false;
3368 #endif
3369 
3370 #ifndef QT_NO_REGEXP_CAPTURE
3371     int atom = startAtom(false);
3372 #endif
3373     QRegExpCharClass anything;
3374     Box box(this); // create InitialState
3375     box.set(anything);
3376     Box rightBox(this); // create FinalState
3377     rightBox.set(anything);
3378 
3379     Box middleBox(this);
3380     parseExpression(&middleBox);
3381 #ifndef QT_NO_REGEXP_CAPTURE
3382     finishAtom(atom, false);
3383 #endif
3384 #ifndef QT_NO_REGEXP_OPTIM
3385     middleBox.setupHeuristics();
3386 #endif
3387     box.cat(middleBox);
3388     box.cat(rightBox);
3389     yyCharClass.reset(0);
3390 
3391 #ifndef QT_NO_REGEXP_CAPTURE
3392     for (int i = 0; i < nf; ++i) {
3393         switch (f[i].capture) {
3394         case QRegExpAtom::NoCapture:
3395             break;
3396         case QRegExpAtom::OfficialCapture:
3397             f[i].capture = ncap;
3398             captureForOfficialCapture.append(ncap);
3399             ++ncap;
3400             ++officialncap;
3401             break;
3402         case QRegExpAtom::UnofficialCapture:
3403             f[i].capture = greedyQuantifiers ? ncap++ : QRegExpAtom::NoCapture;
3404         }
3405     }
3406 
3407 #ifndef QT_NO_REGEXP_BACKREF
3408 #ifndef QT_NO_REGEXP_OPTIM
3409     if (officialncap == 0 && nbrefs == 0) {
3410         ncap = nf = 0;
3411         f.clear();
3412     }
3413 #endif
3414     // handle the case where there's a \5 with no corresponding capture
3415     // (captureForOfficialCapture.size() != officialncap)
3416     for (int i = 0; i < nbrefs - officialncap; ++i) {
3417         captureForOfficialCapture.append(ncap);
3418         ++ncap;
3419     }
3420 #endif
3421 #endif
3422 
3423     if (!yyError.isEmpty())
3424         return -1;
3425 
3426 #ifndef QT_NO_REGEXP_OPTIM
3427     const QRegExpAutomatonState &sinit = s.at(InitialState);
3428     caretAnchored = !sinit.anchors.isEmpty();
3429     if (caretAnchored) {
3430         const QMap<int, int> &anchors = sinit.anchors;
3431         QMap<int, int>::const_iterator a;
3432         for (a = anchors.constBegin(); a != anchors.constEnd(); ++a) {
3433             if (
3434 #ifndef QT_NO_REGEXP_ANCHOR_ALT
3435                 (*a & Anchor_Alternation) != 0 ||
3436 #endif
3437                 (*a & Anchor_Caret) == 0)
3438             {
3439                 caretAnchored = false;
3440                 break;
3441             }
3442         }
3443     }
3444 #endif
3445 
3446     // cleanup anchors
3447     int numStates = s.count();
3448     for (int i = 0; i < numStates; ++i) {
3449         QRegExpAutomatonState &state = s[i];
3450         if (!state.anchors.isEmpty()) {
3451             QMap<int, int>::iterator a = state.anchors.begin();
3452             while (a != state.anchors.end()) {
3453                 if (a.value() == 0)
3454                     a = state.anchors.erase(a);
3455                 else
3456                     ++a;
3457             }
3458         }
3459     }
3460 
3461     return yyPos0;
3462 }
3463 
parseAtom(Box * box)3464 void QRegExpEngine::parseAtom(Box *box)
3465 {
3466 #ifndef QT_NO_REGEXP_LOOKAHEAD
3467     QRegExpEngine *eng = 0;
3468     bool neg;
3469     int len;
3470 #endif
3471 
3472     if ((yyTok & Tok_Char) != 0) {
3473         box->set(QChar(yyTok ^ Tok_Char));
3474     } else {
3475 #ifndef QT_NO_REGEXP_OPTIM
3476         trivial = false;
3477 #endif
3478         switch (yyTok) {
3479         case Tok_Dollar:
3480             box->catAnchor(Anchor_Dollar);
3481             break;
3482         case Tok_Caret:
3483             box->catAnchor(Anchor_Caret);
3484             break;
3485 #ifndef QT_NO_REGEXP_LOOKAHEAD
3486         case Tok_PosLookahead:
3487         case Tok_NegLookahead:
3488             neg = (yyTok == Tok_NegLookahead);
3489             eng = new QRegExpEngine(cs, greedyQuantifiers);
3490             len = eng->parse(yyIn + yyPos - 1, yyLen - yyPos + 1);
3491             if (len >= 0)
3492                 skipChars(len);
3493             else
3494                 error(RXERR_LOOKAHEAD);
3495             box->catAnchor(addLookahead(eng, neg));
3496             yyTok = getToken();
3497             if (yyTok != Tok_RightParen)
3498                 error(RXERR_LOOKAHEAD);
3499             break;
3500 #endif
3501 #ifndef QT_NO_REGEXP_ESCAPE
3502         case Tok_Word:
3503             box->catAnchor(Anchor_Word);
3504             break;
3505         case Tok_NonWord:
3506             box->catAnchor(Anchor_NonWord);
3507             break;
3508 #endif
3509         case Tok_LeftParen:
3510         case Tok_MagicLeftParen:
3511             yyTok = getToken();
3512             parseExpression(box);
3513             if (yyTok != Tok_RightParen)
3514                 error(RXERR_END);
3515             break;
3516         case Tok_CharClass:
3517             box->set(*yyCharClass);
3518             break;
3519         case Tok_Quantifier:
3520             error(RXERR_REPETITION);
3521             break;
3522         default:
3523 #ifndef QT_NO_REGEXP_BACKREF
3524             if ((yyTok & Tok_BackRef) != 0)
3525                 box->set(yyTok ^ Tok_BackRef);
3526             else
3527 #endif
3528                 error(RXERR_DISABLED);
3529         }
3530     }
3531     yyTok = getToken();
3532 }
3533 
parseFactor(Box * box)3534 void QRegExpEngine::parseFactor(Box *box)
3535 {
3536 #ifndef QT_NO_REGEXP_CAPTURE
3537     int outerAtom = greedyQuantifiers ? startAtom(false) : -1;
3538     int innerAtom = startAtom(yyMayCapture && yyTok == Tok_LeftParen);
3539     bool magicLeftParen = (yyTok == Tok_MagicLeftParen);
3540 #else
3541     const int innerAtom = -1;
3542 #endif
3543 
3544 #ifndef QT_NO_REGEXP_INTERVAL
3545 #define YYREDO() \
3546         yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \
3547         *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok
3548 
3549     const QChar *in = yyIn;
3550     int pos0 = yyPos0;
3551     int pos = yyPos;
3552     int len = yyLen;
3553     int ch = yyCh;
3554     QRegExpCharClass charClass;
3555     if (yyTok == Tok_CharClass)
3556         charClass = *yyCharClass;
3557     int tok = yyTok;
3558     bool mayCapture = yyMayCapture;
3559 #endif
3560 
3561     parseAtom(box);
3562 #ifndef QT_NO_REGEXP_CAPTURE
3563     finishAtom(innerAtom, magicLeftParen);
3564 #endif
3565 
3566     bool hasQuantifier = (yyTok == Tok_Quantifier);
3567     if (hasQuantifier) {
3568 #ifndef QT_NO_REGEXP_OPTIM
3569         trivial = false;
3570 #endif
3571         if (yyMaxRep == InftyRep) {
3572             box->plus(innerAtom);
3573 #ifndef QT_NO_REGEXP_INTERVAL
3574         } else if (yyMaxRep == 0) {
3575             box->clear();
3576 #endif
3577         }
3578         if (yyMinRep == 0)
3579             box->opt();
3580 
3581 #ifndef QT_NO_REGEXP_INTERVAL
3582         yyMayCapture = false;
3583         int alpha = (yyMinRep == 0) ? 0 : yyMinRep - 1;
3584         int beta = (yyMaxRep == InftyRep) ? 0 : yyMaxRep - (alpha + 1);
3585 
3586         Box rightBox(this);
3587         int i;
3588 
3589         for (i = 0; i < beta; i++) {
3590             YYREDO();
3591             Box leftBox(this);
3592             parseAtom(&leftBox);
3593             leftBox.cat(rightBox);
3594             leftBox.opt();
3595             rightBox = leftBox;
3596         }
3597         for (i = 0; i < alpha; i++) {
3598             YYREDO();
3599             Box leftBox(this);
3600             parseAtom(&leftBox);
3601             leftBox.cat(rightBox);
3602             rightBox = leftBox;
3603         }
3604         rightBox.cat(*box);
3605         *box = rightBox;
3606 #endif
3607         yyTok = getToken();
3608 #ifndef QT_NO_REGEXP_INTERVAL
3609         yyMayCapture = mayCapture;
3610 #endif
3611     }
3612 #undef YYREDO
3613 #ifndef QT_NO_REGEXP_CAPTURE
3614     if (greedyQuantifiers)
3615         finishAtom(outerAtom, hasQuantifier);
3616 #endif
3617 }
3618 
parseTerm(Box * box)3619 void QRegExpEngine::parseTerm(Box *box)
3620 {
3621 #ifndef QT_NO_REGEXP_OPTIM
3622     if (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar)
3623         parseFactor(box);
3624 #endif
3625     while (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) {
3626         Box rightBox(this);
3627         parseFactor(&rightBox);
3628         box->cat(rightBox);
3629     }
3630 }
3631 
parseExpression(Box * box)3632 void QRegExpEngine::parseExpression(Box *box)
3633 {
3634     parseTerm(box);
3635     while (yyTok == Tok_Bar) {
3636 #ifndef QT_NO_REGEXP_OPTIM
3637         trivial = false;
3638 #endif
3639         Box rightBox(this);
3640         yyTok = getToken();
3641         parseTerm(&rightBox);
3642         box->orx(rightBox);
3643     }
3644 }
3645 
3646 /*
3647   The struct QRegExpPrivate contains the private data of a regular
3648   expression other than the automaton. It makes it possible for many
3649   QRegExp objects to use the same QRegExpEngine object with different
3650   QRegExpPrivate objects.
3651 */
3652 struct QRegExpPrivate
3653 {
3654     QRegExpEngine *eng;
3655     QRegExpEngineKey engineKey;
3656     bool minimal;
3657 #ifndef QT_NO_REGEXP_CAPTURE
3658     QString t; // last string passed to QRegExp::indexIn() or lastIndexIn()
3659     QStringList capturedCache; // what QRegExp::capturedTexts() returned last
3660 #endif
3661     QRegExpMatchState matchState;
3662 
QRegExpPrivateQRegExpPrivate3663     inline QRegExpPrivate()
3664         : eng(0), engineKey(QString(), QRegExp::RegExp, Qt::CaseSensitive), minimal(false) { }
QRegExpPrivateQRegExpPrivate3665     inline QRegExpPrivate(const QRegExpEngineKey &key)
3666         : eng(0), engineKey(key), minimal(false) {}
3667 };
3668 
3669 #if !defined(QT_NO_REGEXP_OPTIM)
qHash(const QRegExpEngineKey & key)3670 uint qHash(const QRegExpEngineKey &key)
3671 {
3672     return qHash(key.pattern);
3673 }
3674 
3675 typedef QCache<QRegExpEngineKey, QRegExpEngine> EngineCache;
Q_GLOBAL_STATIC(EngineCache,globalEngineCache)3676 Q_GLOBAL_STATIC(EngineCache, globalEngineCache)
3677 Q_GLOBAL_STATIC(QMutex, mutex)
3678 #endif // QT_NO_REGEXP_OPTIM
3679 
3680 static void derefEngine(QRegExpEngine *eng, const QRegExpEngineKey &key)
3681 {
3682     if (!eng->ref.deref()) {
3683 #if !defined(QT_NO_REGEXP_OPTIM)
3684         if (globalEngineCache()) {
3685             QMutexLocker locker(mutex());
3686             QT_TRY {
3687                 globalEngineCache()->insert(key, eng, 4 + key.pattern.length() / 4);
3688             } QT_CATCH(const std::bad_alloc &) {
3689                 // in case of an exception (e.g. oom), just delete the engine
3690                 delete eng;
3691             }
3692         } else {
3693             delete eng;
3694         }
3695 #else
3696         Q_UNUSED(key);
3697         delete eng;
3698 #endif
3699     }
3700 }
3701 
prepareEngine_helper(QRegExpPrivate * priv)3702 static void prepareEngine_helper(QRegExpPrivate *priv)
3703 {
3704     bool initMatchState = !priv->eng;
3705 #if !defined(QT_NO_REGEXP_OPTIM)
3706     if (!priv->eng && globalEngineCache()) {
3707         QMutexLocker locker(mutex());
3708         priv->eng = globalEngineCache()->take(priv->engineKey);
3709         if (priv->eng != 0)
3710             priv->eng->ref.ref();
3711     }
3712 #endif // QT_NO_REGEXP_OPTIM
3713 
3714     if (!priv->eng)
3715         priv->eng = new QRegExpEngine(priv->engineKey);
3716 
3717     if (initMatchState)
3718         priv->matchState.prepareForMatch(priv->eng);
3719 }
3720 
prepareEngine(QRegExpPrivate * priv)3721 inline static void prepareEngine(QRegExpPrivate *priv)
3722 {
3723     if (priv->eng)
3724         return;
3725     prepareEngine_helper(priv);
3726 }
3727 
prepareEngineForMatch(QRegExpPrivate * priv,const QString & str)3728 static void prepareEngineForMatch(QRegExpPrivate *priv, const QString &str)
3729 {
3730     prepareEngine(priv);
3731     priv->matchState.prepareForMatch(priv->eng);
3732 #ifndef QT_NO_REGEXP_CAPTURE
3733     priv->t = str;
3734     priv->capturedCache.clear();
3735 #else
3736     Q_UNUSED(str);
3737 #endif
3738 }
3739 
invalidateEngine(QRegExpPrivate * priv)3740 static void invalidateEngine(QRegExpPrivate *priv)
3741 {
3742     if (priv->eng != 0) {
3743         derefEngine(priv->eng, priv->engineKey);
3744         priv->eng = 0;
3745         priv->matchState.drain();
3746     }
3747 }
3748 
3749 /*!
3750     \enum QRegExp::CaretMode
3751 
3752     The CaretMode enum defines the different meanings of the caret
3753     (\bold{^}) in a regular expression. The possible values are:
3754 
3755     \value CaretAtZero
3756            The caret corresponds to index 0 in the searched string.
3757 
3758     \value CaretAtOffset
3759            The caret corresponds to the start offset of the search.
3760 
3761     \value CaretWontMatch
3762            The caret never matches.
3763 */
3764 
3765 /*!
3766     \enum QRegExp::PatternSyntax
3767 
3768     The syntax used to interpret the meaning of the pattern.
3769 
3770     \value RegExp A rich Perl-like pattern matching syntax. This is
3771     the default.
3772 
3773     \value RegExp2 Like RegExp, but with \l{greedy quantifiers}. This
3774     will be the default in Qt 5. (Introduced in Qt 4.2.)
3775 
3776     \value Wildcard This provides a simple pattern matching syntax
3777     similar to that used by shells (command interpreters) for "file
3778     globbing". See \l{Wildcard Matching}.
3779 
3780     \value WildcardUnix This is similar to Wildcard but with the
3781     behavior of a Unix shell. The wildcard characters can be escaped
3782     with the character "\\".
3783 
3784     \value FixedString The pattern is a fixed string. This is
3785     equivalent to using the RegExp pattern on a string in
3786     which all metacharacters are escaped using escape().
3787 
3788     \value W3CXmlSchema11 The pattern is a regular expression as
3789     defined by the W3C XML Schema 1.1 specification.
3790 
3791     \sa setPatternSyntax()
3792 */
3793 
3794 /*!
3795     Constructs an empty regexp.
3796 
3797     \sa isValid(), errorString()
3798 */
QRegExp()3799 QRegExp::QRegExp()
3800 {
3801     priv = new QRegExpPrivate;
3802     prepareEngine(priv);
3803 }
3804 
3805 /*!
3806     Constructs a regular expression object for the given \a pattern
3807     string. The pattern must be given using wildcard notation if \a
3808     syntax is \l Wildcard; the default is \l RegExp. The pattern is
3809     case sensitive, unless \a cs is Qt::CaseInsensitive. Matching is
3810     greedy (maximal), but can be changed by calling
3811     setMinimal().
3812 
3813     \sa setPattern(), setCaseSensitivity(), setPatternSyntax()
3814 */
QRegExp(const QString & pattern,Qt::CaseSensitivity cs,PatternSyntax syntax)3815 QRegExp::QRegExp(const QString &pattern, Qt::CaseSensitivity cs, PatternSyntax syntax)
3816 {
3817     priv = new QRegExpPrivate(QRegExpEngineKey(pattern, syntax, cs));
3818     prepareEngine(priv);
3819 }
3820 
3821 /*!
3822     Constructs a regular expression as a copy of \a rx.
3823 
3824     \sa operator=()
3825 */
QRegExp(const QRegExp & rx)3826 QRegExp::QRegExp(const QRegExp &rx)
3827 {
3828     priv = new QRegExpPrivate;
3829     operator=(rx);
3830 }
3831 
3832 /*!
3833     Destroys the regular expression and cleans up its internal data.
3834 */
~QRegExp()3835 QRegExp::~QRegExp()
3836 {
3837     invalidateEngine(priv);
3838     delete priv;
3839 }
3840 
3841 /*!
3842     Copies the regular expression \a rx and returns a reference to the
3843     copy. The case sensitivity, wildcard, and minimal matching options
3844     are also copied.
3845 */
operator =(const QRegExp & rx)3846 QRegExp &QRegExp::operator=(const QRegExp &rx)
3847 {
3848     prepareEngine(rx.priv); // to allow sharing
3849     QRegExpEngine *otherEng = rx.priv->eng;
3850     if (otherEng)
3851         otherEng->ref.ref();
3852     invalidateEngine(priv);
3853     priv->eng = otherEng;
3854     priv->engineKey = rx.priv->engineKey;
3855     priv->minimal = rx.priv->minimal;
3856 #ifndef QT_NO_REGEXP_CAPTURE
3857     priv->t = rx.priv->t;
3858     priv->capturedCache = rx.priv->capturedCache;
3859 #endif
3860     if (priv->eng)
3861         priv->matchState.prepareForMatch(priv->eng);
3862     priv->matchState.captured = rx.priv->matchState.captured;
3863     return *this;
3864 }
3865 
3866 /*!
3867     \fn void QRegExp::swap(QRegExp &other)
3868     \since 4.8
3869 
3870     Swaps regular expression \a other with this regular
3871     expression. This operation is very fast and never fails.
3872 */
3873 
3874 /*!
3875     Returns true if this regular expression is equal to \a rx;
3876     otherwise returns false.
3877 
3878     Two QRegExp objects are equal if they have the same pattern
3879     strings and the same settings for case sensitivity, wildcard and
3880     minimal matching.
3881 */
operator ==(const QRegExp & rx) const3882 bool QRegExp::operator==(const QRegExp &rx) const
3883 {
3884     return priv->engineKey == rx.priv->engineKey && priv->minimal == rx.priv->minimal;
3885 }
3886 
3887 /*!
3888     \fn bool QRegExp::operator!=(const QRegExp &rx) const
3889 
3890     Returns true if this regular expression is not equal to \a rx;
3891     otherwise returns false.
3892 
3893     \sa operator==()
3894 */
3895 
3896 /*!
3897     Returns true if the pattern string is empty; otherwise returns
3898     false.
3899 
3900     If you call exactMatch() with an empty pattern on an empty string
3901     it will return true; otherwise it returns false since it operates
3902     over the whole string. If you call indexIn() with an empty pattern
3903     on \e any string it will return the start offset (0 by default)
3904     because the empty pattern matches the 'emptiness' at the start of
3905     the string. In this case the length of the match returned by
3906     matchedLength() will be 0.
3907 
3908     See QString::isEmpty().
3909 */
3910 
isEmpty() const3911 bool QRegExp::isEmpty() const
3912 {
3913     return priv->engineKey.pattern.isEmpty();
3914 }
3915 
3916 /*!
3917     Returns true if the regular expression is valid; otherwise returns
3918     false. An invalid regular expression never matches.
3919 
3920     The pattern \bold{[a-z} is an example of an invalid pattern, since
3921     it lacks a closing square bracket.
3922 
3923     Note that the validity of a regexp may also depend on the setting
3924     of the wildcard flag, for example \bold{*.html} is a valid
3925     wildcard regexp but an invalid full regexp.
3926 
3927     \sa errorString()
3928 */
isValid() const3929 bool QRegExp::isValid() const
3930 {
3931     if (priv->engineKey.pattern.isEmpty()) {
3932         return true;
3933     } else {
3934         prepareEngine(priv);
3935         return priv->eng->isValid();
3936     }
3937 }
3938 
3939 /*!
3940     Returns the pattern string of the regular expression. The pattern
3941     has either regular expression syntax or wildcard syntax, depending
3942     on patternSyntax().
3943 
3944     \sa patternSyntax(), caseSensitivity()
3945 */
pattern() const3946 QString QRegExp::pattern() const
3947 {
3948     return priv->engineKey.pattern;
3949 }
3950 
3951 /*!
3952     Sets the pattern string to \a pattern. The case sensitivity,
3953     wildcard, and minimal matching options are not changed.
3954 
3955     \sa setPatternSyntax(), setCaseSensitivity()
3956 */
setPattern(const QString & pattern)3957 void QRegExp::setPattern(const QString &pattern)
3958 {
3959     if (priv->engineKey.pattern != pattern) {
3960         invalidateEngine(priv);
3961         priv->engineKey.pattern = pattern;
3962     }
3963 }
3964 
3965 /*!
3966     Returns Qt::CaseSensitive if the regexp is matched case
3967     sensitively; otherwise returns Qt::CaseInsensitive.
3968 
3969     \sa patternSyntax(), pattern(), isMinimal()
3970 */
caseSensitivity() const3971 Qt::CaseSensitivity QRegExp::caseSensitivity() const
3972 {
3973     return priv->engineKey.cs;
3974 }
3975 
3976 /*!
3977     Sets case sensitive matching to \a cs.
3978 
3979     If \a cs is Qt::CaseSensitive, \bold{\\.txt$} matches
3980     \c{readme.txt} but not \c{README.TXT}.
3981 
3982     \sa setPatternSyntax(), setPattern(), setMinimal()
3983 */
setCaseSensitivity(Qt::CaseSensitivity cs)3984 void QRegExp::setCaseSensitivity(Qt::CaseSensitivity cs)
3985 {
3986     if ((bool)cs != (bool)priv->engineKey.cs) {
3987         invalidateEngine(priv);
3988         priv->engineKey.cs = cs;
3989     }
3990 }
3991 
3992 /*!
3993     Returns the syntax used by the regular expression. The default is
3994     QRegExp::RegExp.
3995 
3996     \sa pattern(), caseSensitivity()
3997 */
patternSyntax() const3998 QRegExp::PatternSyntax QRegExp::patternSyntax() const
3999 {
4000     return priv->engineKey.patternSyntax;
4001 }
4002 
4003 /*!
4004     Sets the syntax mode for the regular expression. The default is
4005     QRegExp::RegExp.
4006 
4007     Setting \a syntax to QRegExp::Wildcard enables simple shell-like
4008     \l{wildcard matching}. For example, \bold{r*.txt} matches the
4009     string \c{readme.txt} in wildcard mode, but does not match
4010     \c{readme}.
4011 
4012     Setting \a syntax to QRegExp::FixedString means that the pattern
4013     is interpreted as a plain string. Special characters (e.g.,
4014     backslash) don't need to be escaped then.
4015 
4016     \sa setPattern(), setCaseSensitivity(), escape()
4017 */
setPatternSyntax(PatternSyntax syntax)4018 void QRegExp::setPatternSyntax(PatternSyntax syntax)
4019 {
4020     if (syntax != priv->engineKey.patternSyntax) {
4021         invalidateEngine(priv);
4022         priv->engineKey.patternSyntax = syntax;
4023     }
4024 }
4025 
4026 /*!
4027     Returns true if minimal (non-greedy) matching is enabled;
4028     otherwise returns false.
4029 
4030     \sa caseSensitivity(), setMinimal()
4031 */
isMinimal() const4032 bool QRegExp::isMinimal() const
4033 {
4034     return priv->minimal;
4035 }
4036 
4037 /*!
4038     Enables or disables minimal matching. If \a minimal is false,
4039     matching is greedy (maximal) which is the default.
4040 
4041     For example, suppose we have the input string "We must be
4042     <b>bold</b>, very <b>bold</b>!" and the pattern
4043     \bold{<b>.*</b>}. With the default greedy (maximal) matching,
4044     the match is "We must be \underline{<b>bold</b>, very
4045     <b>bold</b>}!". But with minimal (non-greedy) matching, the
4046     first match is: "We must be \underline{<b>bold</b>}, very
4047     <b>bold</b>!" and the second match is "We must be <b>bold</b>,
4048     very \underline{<b>bold</b>}!". In practice we might use the pattern
4049     \bold{<b>[^<]*\</b>} instead, although this will still fail for
4050     nested tags.
4051 
4052     \sa setCaseSensitivity()
4053 */
setMinimal(bool minimal)4054 void QRegExp::setMinimal(bool minimal)
4055 {
4056     priv->minimal = minimal;
4057 }
4058 
4059 // ### Qt 5: make non-const
4060 /*!
4061     Returns true if \a str is matched exactly by this regular
4062     expression; otherwise returns false. You can determine how much of
4063     the string was matched by calling matchedLength().
4064 
4065     For a given regexp string R, exactMatch("R") is the equivalent of
4066     indexIn("^R$") since exactMatch() effectively encloses the regexp
4067     in the start of string and end of string anchors, except that it
4068     sets matchedLength() differently.
4069 
4070     For example, if the regular expression is \bold{blue}, then
4071     exactMatch() returns true only for input \c blue. For inputs \c
4072     bluebell, \c blutak and \c lightblue, exactMatch() returns false
4073     and matchedLength() will return 4, 3 and 0 respectively.
4074 
4075     Although const, this function sets matchedLength(),
4076     capturedTexts(), and pos().
4077 
4078     \sa indexIn(), lastIndexIn()
4079 */
exactMatch(const QString & str) const4080 bool QRegExp::exactMatch(const QString &str) const
4081 {
4082     prepareEngineForMatch(priv, str);
4083     priv->matchState.match(str.unicode(), str.length(), 0, priv->minimal, true, 0);
4084     if (priv->matchState.captured[1] == str.length()) {
4085         return true;
4086     } else {
4087         priv->matchState.captured[0] = 0;
4088         priv->matchState.captured[1] = priv->matchState.oneTestMatchedLen;
4089         return false;
4090     }
4091 }
4092 
4093 // ### Qt 5: make non-const
4094 /*!
4095     Attempts to find a match in \a str from position \a offset (0 by
4096     default). If \a offset is -1, the search starts at the last
4097     character; if -2, at the next to last character; etc.
4098 
4099     Returns the position of the first match, or -1 if there was no
4100     match.
4101 
4102     The \a caretMode parameter can be used to instruct whether \bold{^}
4103     should match at index 0 or at \a offset.
4104 
4105     You might prefer to use QString::indexOf(), QString::contains(),
4106     or even QStringList::filter(). To replace matches use
4107     QString::replace().
4108 
4109     Example:
4110     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 13
4111 
4112     Although const, this function sets matchedLength(),
4113     capturedTexts() and pos().
4114 
4115     If the QRegExp is a wildcard expression (see setPatternSyntax())
4116     and want to test a string against the whole wildcard expression,
4117     use exactMatch() instead of this function.
4118 
4119     \sa lastIndexIn(), exactMatch()
4120 */
4121 
indexIn(const QString & str,int offset,CaretMode caretMode) const4122 int QRegExp::indexIn(const QString &str, int offset, CaretMode caretMode) const
4123 {
4124     prepareEngineForMatch(priv, str);
4125     if (offset < 0)
4126         offset += str.length();
4127     priv->matchState.match(str.unicode(), str.length(), offset,
4128         priv->minimal, false, caretIndex(offset, caretMode));
4129     return priv->matchState.captured[0];
4130 }
4131 
4132 // ### Qt 5: make non-const
4133 /*!
4134     Attempts to find a match backwards in \a str from position \a
4135     offset. If \a offset is -1 (the default), the search starts at the
4136     last character; if -2, at the next to last character; etc.
4137 
4138     Returns the position of the first match, or -1 if there was no
4139     match.
4140 
4141     The \a caretMode parameter can be used to instruct whether \bold{^}
4142     should match at index 0 or at \a offset.
4143 
4144     Although const, this function sets matchedLength(),
4145     capturedTexts() and pos().
4146 
4147     \warning Searching backwards is much slower than searching
4148     forwards.
4149 
4150     \sa indexIn(), exactMatch()
4151 */
4152 
lastIndexIn(const QString & str,int offset,CaretMode caretMode) const4153 int QRegExp::lastIndexIn(const QString &str, int offset, CaretMode caretMode) const
4154 {
4155     prepareEngineForMatch(priv, str);
4156     if (offset < 0)
4157         offset += str.length();
4158     if (offset < 0 || offset > str.length()) {
4159         memset(priv->matchState.captured, -1, priv->matchState.capturedSize*sizeof(int));
4160         return -1;
4161     }
4162 
4163     while (offset >= 0) {
4164         priv->matchState.match(str.unicode(), str.length(), offset,
4165             priv->minimal, true, caretIndex(offset, caretMode));
4166         if (priv->matchState.captured[0] == offset)
4167             return offset;
4168         --offset;
4169     }
4170     return -1;
4171 }
4172 
4173 /*!
4174     Returns the length of the last matched string, or -1 if there was
4175     no match.
4176 
4177     \sa exactMatch(), indexIn(), lastIndexIn()
4178 */
matchedLength() const4179 int QRegExp::matchedLength() const
4180 {
4181     return priv->matchState.captured[1];
4182 }
4183 
4184 #ifndef QT_NO_REGEXP_CAPTURE
4185 
4186 #ifndef QT_NO_DEPRECATED
4187 /*!
4188   \obsolete
4189   Returns the number of captures contained in the regular expression.
4190 
4191   \sa captureCount()
4192  */
numCaptures() const4193 int QRegExp::numCaptures() const
4194 {
4195     return captureCount();
4196 }
4197 #endif
4198 
4199 /*!
4200   \since 4.6
4201   Returns the number of captures contained in the regular expression.
4202  */
captureCount() const4203 int QRegExp::captureCount() const
4204 {
4205     prepareEngine(priv);
4206     return priv->eng->captureCount();
4207 }
4208 
4209 /*!
4210     Returns a list of the captured text strings.
4211 
4212     The first string in the list is the entire matched string. Each
4213     subsequent list element contains a string that matched a
4214     (capturing) subexpression of the regexp.
4215 
4216     For example:
4217     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 14
4218 
4219     The above example also captures elements that may be present but
4220     which we have no interest in. This problem can be solved by using
4221     non-capturing parentheses:
4222 
4223     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 15
4224 
4225     Note that if you want to iterate over the list, you should iterate
4226     over a copy, e.g.
4227     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 16
4228 
4229     Some regexps can match an indeterminate number of times. For
4230     example if the input string is "Offsets: 12 14 99 231 7" and the
4231     regexp, \c{rx}, is \bold{(\\d+)+}, we would hope to get a list of
4232     all the numbers matched. However, after calling
4233     \c{rx.indexIn(str)}, capturedTexts() will return the list ("12",
4234     "12"), i.e. the entire match was "12" and the first subexpression
4235     matched was "12". The correct approach is to use cap() in a
4236     \l{QRegExp#cap_in_a_loop}{loop}.
4237 
4238     The order of elements in the string list is as follows. The first
4239     element is the entire matching string. Each subsequent element
4240     corresponds to the next capturing open left parentheses. Thus
4241     capturedTexts()[1] is the text of the first capturing parentheses,
4242     capturedTexts()[2] is the text of the second and so on
4243     (corresponding to $1, $2, etc., in some other regexp languages).
4244 
4245     \sa cap(), pos()
4246 */
capturedTexts() const4247 QStringList QRegExp::capturedTexts() const
4248 {
4249     if (priv->capturedCache.isEmpty()) {
4250         prepareEngine(priv);
4251         const int *captured = priv->matchState.captured;
4252         int n = priv->matchState.capturedSize;
4253 
4254         for (int i = 0; i < n; i += 2) {
4255             QString m;
4256             if (captured[i + 1] == 0)
4257                 m = QLatin1String(""); // ### Qt 5: don't distinguish between null and empty
4258             else if (captured[i] >= 0)
4259                 m = priv->t.mid(captured[i], captured[i + 1]);
4260             priv->capturedCache.append(m);
4261         }
4262         priv->t.clear();
4263     }
4264     return priv->capturedCache;
4265 }
4266 
4267 /*!
4268     \internal
4269 */
capturedTexts()4270 QStringList QRegExp::capturedTexts()
4271 {
4272     return const_cast<const QRegExp *>(this)->capturedTexts();
4273 }
4274 
4275 /*!
4276     Returns the text captured by the \a nth subexpression. The entire
4277     match has index 0 and the parenthesized subexpressions have
4278     indexes starting from 1 (excluding non-capturing parentheses).
4279 
4280     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 17
4281 
4282     The order of elements matched by cap() is as follows. The first
4283     element, cap(0), is the entire matching string. Each subsequent
4284     element corresponds to the next capturing open left parentheses.
4285     Thus cap(1) is the text of the first capturing parentheses, cap(2)
4286     is the text of the second, and so on.
4287 
4288     \sa capturedTexts(), pos()
4289 */
cap(int nth) const4290 QString QRegExp::cap(int nth) const
4291 {
4292     return capturedTexts().value(nth);
4293 }
4294 
4295 /*!
4296     \internal
4297 */
cap(int nth)4298 QString QRegExp::cap(int nth)
4299 {
4300     return const_cast<const QRegExp *>(this)->cap(nth);
4301 }
4302 
4303 /*!
4304     Returns the position of the \a nth captured text in the searched
4305     string. If \a nth is 0 (the default), pos() returns the position
4306     of the whole match.
4307 
4308     Example:
4309     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 18
4310 
4311     For zero-length matches, pos() always returns -1. (For example, if
4312     cap(4) would return an empty string, pos(4) returns -1.) This is
4313     a feature of the implementation.
4314 
4315     \sa cap(), capturedTexts()
4316 */
pos(int nth) const4317 int QRegExp::pos(int nth) const
4318 {
4319     if (nth < 0 || nth >= priv->matchState.capturedSize / 2)
4320         return -1;
4321     else
4322         return priv->matchState.captured[2 * nth];
4323 }
4324 
4325 /*!
4326     \internal
4327 */
pos(int nth)4328 int QRegExp::pos(int nth)
4329 {
4330     return const_cast<const QRegExp *>(this)->pos(nth);
4331 }
4332 
4333 /*!
4334   Returns a text string that explains why a regexp pattern is
4335   invalid the case being; otherwise returns "no error occurred".
4336 
4337   \sa isValid()
4338 */
errorString() const4339 QString QRegExp::errorString() const
4340 {
4341     if (isValid()) {
4342         return QString::fromLatin1(RXERR_OK);
4343     } else {
4344         return priv->eng->errorString();
4345     }
4346 }
4347 
4348 /*!
4349     \internal
4350 */
errorString()4351 QString QRegExp::errorString()
4352 {
4353     return const_cast<const QRegExp *>(this)->errorString();
4354 }
4355 #endif
4356 
4357 /*!
4358     Returns the string \a str with every regexp special character
4359     escaped with a backslash. The special characters are $, (,), *, +,
4360     ., ?, [, \,], ^, {, | and }.
4361 
4362     Example:
4363 
4364     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 19
4365 
4366     This function is useful to construct regexp patterns dynamically:
4367 
4368     \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 20
4369 
4370     \sa setPatternSyntax()
4371 */
escape(const QString & str)4372 QString QRegExp::escape(const QString &str)
4373 {
4374     QString quoted;
4375     const int count = str.count();
4376     quoted.reserve(count * 2);
4377     const QLatin1Char backslash('\\');
4378     for (int i = 0; i < count; i++) {
4379         switch (str.at(i).toLatin1()) {
4380         case '$':
4381         case '(':
4382         case ')':
4383         case '*':
4384         case '+':
4385         case '.':
4386         case '?':
4387         case '[':
4388         case '\\':
4389         case ']':
4390         case '^':
4391         case '{':
4392         case '|':
4393         case '}':
4394             quoted.append(backslash);
4395         }
4396         quoted.append(str.at(i));
4397     }
4398     return quoted;
4399 }
4400 
4401 /*!
4402     \fn bool QRegExp::caseSensitive() const
4403 
4404     Use \l caseSensitivity() instead.
4405 */
4406 
4407 /*!
4408     \fn void QRegExp::setCaseSensitive(bool sensitive)
4409 
4410     Use \l setCaseSensitivity() instead.
4411 */
4412 
4413 /*!
4414     \fn bool QRegExp::wildcard() const
4415 
4416     Use \l patternSyntax() instead.
4417 
4418     \oldcode
4419         bool wc = rx.wildcard();
4420     \newcode
4421         bool wc = (rx.patternSyntax() == QRegExp::Wildcard);
4422     \endcode
4423 */
4424 
4425 /*!
4426     \fn void QRegExp::setWildcard(bool wildcard)
4427 
4428     Use \l setPatternSyntax() instead.
4429 
4430     \oldcode
4431         rx.setWildcard(wc);
4432     \newcode
4433         rx.setPatternSyntax(wc ? QRegExp::Wildcard : QRegExp::RegExp);
4434     \endcode
4435 */
4436 
4437 /*!
4438     \fn bool QRegExp::minimal() const
4439 
4440     Use \l isMinimal() instead.
4441 */
4442 
4443 /*!
4444     \fn int QRegExp::search(const QString &str, int from = 0,
4445                             CaretMode caretMode = CaretAtZero) const
4446 
4447     Use \l indexIn() instead.
4448 */
4449 
4450 /*!
4451     \fn int QRegExp::searchRev(const QString &str, int from = -1, \
4452                                CaretMode caretMode = CaretAtZero) const
4453 
4454     Use \l lastIndexIn() instead.
4455 */
4456 
4457 /*!
4458     \fn QRegExp::QRegExp(const QString &pattern, bool cs, bool wildcard = false)
4459 
4460     Use another constructor instead.
4461 
4462     \oldcode
4463         QRegExp rx("*.txt", false, true);
4464     \newcode
4465         QRegExp rx("*.txt", Qt::CaseInsensitive, QRegExp::Wildcard);
4466     \endcode
4467 */
4468 
4469 #ifndef QT_NO_DATASTREAM
4470 /*!
4471     \relates QRegExp
4472 
4473     Writes the regular expression \a regExp to stream \a out.
4474 
4475     \sa {Serializing Qt Data Types}
4476 */
operator <<(QDataStream & out,const QRegExp & regExp)4477 QDataStream &operator<<(QDataStream &out, const QRegExp &regExp)
4478 {
4479     return out << regExp.pattern() << (quint8)regExp.caseSensitivity()
4480                << (quint8)regExp.patternSyntax()
4481                << (quint8)!!regExp.isMinimal();
4482 }
4483 
4484 /*!
4485     \relates QRegExp
4486 
4487     Reads a regular expression from stream \a in into \a regExp.
4488 
4489     \sa {Serializing Qt Data Types}
4490 */
operator >>(QDataStream & in,QRegExp & regExp)4491 QDataStream &operator>>(QDataStream &in, QRegExp &regExp)
4492 {
4493     QString pattern;
4494     quint8 cs;
4495     quint8 patternSyntax;
4496     quint8 isMinimal;
4497 
4498     in >> pattern >> cs >> patternSyntax >> isMinimal;
4499 
4500     QRegExp newRegExp(pattern, Qt::CaseSensitivity(cs),
4501                       QRegExp::PatternSyntax(patternSyntax));
4502 
4503     newRegExp.setMinimal(isMinimal);
4504     regExp = newRegExp;
4505     return in;
4506 }
4507 #endif // QT_NO_DATASTREAM
4508 
4509 QT_END_NAMESPACE
4510