1 /****************************************************************************
2 **
3 ** Copyright (C) 2016 The Qt Company Ltd.
4 ** Contact: https://www.qt.io/licensing/
5 **
6 ** This file is part of the QtCore module of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and The Qt Company. For licensing terms
14 ** and conditions see https://www.qt.io/terms-conditions. For further
15 ** information use the contact form at https://www.qt.io/contact-us.
16 **
17 ** GNU Lesser General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU Lesser
19 ** General Public License version 3 as published by the Free Software
20 ** Foundation and appearing in the file LICENSE.LGPL3 included in the
21 ** packaging of this file. Please review the following information to
22 ** ensure the GNU Lesser General Public License version 3 requirements
23 ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24 **
25 ** GNU General Public License Usage
26 ** Alternatively, this file may be used under the terms of the GNU
27 ** General Public License version 2.0 or (at your option) the GNU General
28 ** Public license version 3 or any later version approved by the KDE Free
29 ** Qt Foundation. The licenses are as published by the Free Software
30 ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31 ** included in the packaging of this file. Please review the following
32 ** information to ensure the GNU General Public License requirements will
33 ** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34 ** https://www.gnu.org/licenses/gpl-3.0.html.
35 **
36 ** $QT_END_LICENSE$
37 **
38 ****************************************************************************/
39 
40 #include "qregexp.h"
41 
42 #include "qalgorithms.h"
43 #include "qbitarray.h"
44 #include "qcache.h"
45 #include "qdatastream.h"
46 #include "qdebug.h"
47 #include "qhashfunctions.h"
48 #include "qlist.h"
49 #include "qmap.h"
50 #include "qmutex.h"
51 #include "qstring.h"
52 #include "qstringlist.h"
53 #include "qstringmatcher.h"
54 #include "qvector.h"
55 #include "private/qlocking_p.h"
56 
57 #include <limits.h>
58 #include <algorithm>
59 
60 QT_BEGIN_NAMESPACE
61 
62 // error strings for the regexp parser
63 #define RXERR_OK         QT_TRANSLATE_NOOP("QRegExp", "no error occurred")
64 #define RXERR_DISABLED   QT_TRANSLATE_NOOP("QRegExp", "disabled feature used")
65 #define RXERR_CHARCLASS  QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax")
66 #define RXERR_LOOKAHEAD  QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax")
67 #define RXERR_LOOKBEHIND QT_TRANSLATE_NOOP("QRegExp", "lookbehinds not supported, see QTBUG-2371")
68 #define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax")
69 #define RXERR_OCTAL      QT_TRANSLATE_NOOP("QRegExp", "invalid octal value")
70 #define RXERR_LEFTDELIM  QT_TRANSLATE_NOOP("QRegExp", "missing left delim")
71 #define RXERR_END        QT_TRANSLATE_NOOP("QRegExp", "unexpected end")
72 #define RXERR_LIMIT      QT_TRANSLATE_NOOP("QRegExp", "met internal limit")
73 #define RXERR_INTERVAL   QT_TRANSLATE_NOOP("QRegExp", "invalid interval")
74 #define RXERR_CATEGORY   QT_TRANSLATE_NOOP("QRegExp", "invalid category")
75 
76 /*!
77     \class QRegExp
78     \inmodule QtCore
79     \reentrant
80     \brief The QRegExp class provides pattern matching using regular expressions.
81 
82     \ingroup tools
83     \ingroup shared
84 
85     \keyword regular expression
86 
87     A regular expression, or "regexp", is a pattern for matching
88     substrings in a text. This is useful in many contexts, e.g.,
89 
90     \table
91     \row \li Validation
92          \li A regexp can test whether a substring meets some criteria,
93          e.g. is an integer or contains no whitespace.
94     \row \li Searching
95          \li A regexp provides more powerful pattern matching than
96          simple substring matching, e.g., match one of the words
97          \e{mail}, \e{letter} or \e{correspondence}, but none of the
98          words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc.
99      \row \li Search and Replace
100          \li A regexp can replace all occurrences of a substring with a
101          different substring, e.g., replace all occurrences of \e{&}
102          with \e{\&amp;} except where the \e{&} is already followed by
103          an \e{amp;}.
104     \row \li String Splitting
105          \li A regexp can be used to identify where a string should be
106          split apart, e.g. splitting tab-delimited strings.
107     \endtable
108 
109     A brief introduction to regexps is presented, a description of
110     Qt's regexp language, some examples, and the function
111     documentation itself. QRegExp is modeled on Perl's regexp
112     language. It fully supports Unicode. QRegExp can also be used in a
113     simpler, \e{wildcard mode} that is similar to the functionality
114     found in command shells. The syntax rules used by QRegExp can be
115     changed with setPatternSyntax(). In particular, the pattern syntax
116     can be set to QRegExp::FixedString, which means the pattern to be
117     matched is interpreted as a plain string, i.e., special characters
118     (e.g., backslash) are not escaped.
119 
120     A good text on regexps is \e {Mastering Regular Expressions}
121     (Third Edition) by Jeffrey E. F.  Friedl, ISBN 0-596-52812-4.
122 
123     \note In Qt 5, the new QRegularExpression class provides a Perl
124     compatible implementation of regular expressions and is recommended
125     in place of QRegExp.
126 
127     \tableofcontents
128 
129     \section1 Introduction
130 
131     Regexps are built up from expressions, quantifiers, and
132     assertions. The simplest expression is a character, e.g. \b{x}
133     or \b{5}. An expression can also be a set of characters
134     enclosed in square brackets. \b{[ABCD]} will match an \b{A}
135     or a \b{B} or a \b{C} or a \b{D}. We can write this same
136     expression as \b{[A-D]}, and an expression to match any
137     capital letter in the English alphabet is written as
138     \b{[A-Z]}.
139 
140     A quantifier specifies the number of occurrences of an expression
141     that must be matched. \b{x{1,1}} means match one and only one
142     \b{x}. \b{x{1,5}} means match a sequence of \b{x}
143     characters that contains at least one \b{x} but no more than
144     five.
145 
146     Note that in general regexps cannot be used to check for balanced
147     brackets or tags. For example, a regexp can be written to match an
148     opening html \c{<b>} and its closing \c{</b>}, if the \c{<b>} tags
149     are not nested, but if the \c{<b>} tags are nested, that same
150     regexp will match an opening \c{<b>} tag with the wrong closing
151     \c{</b>}.  For the fragment \c{<b>bold <b>bolder</b></b>}, the
152     first \c{<b>} would be matched with the first \c{</b>}, which is
153     not correct. However, it is possible to write a regexp that will
154     match nested brackets or tags correctly, but only if the number of
155     nesting levels is fixed and known. If the number of nesting levels
156     is not fixed and known, it is impossible to write a regexp that
157     will not fail.
158 
159     Suppose we want a regexp to match integers in the range 0 to 99.
160     At least one digit is required, so we start with the expression
161     \b{[0-9]{1,1}}, which matches a single digit exactly once. This
162     regexp matches integers in the range 0 to 9. To match integers up
163     to 99, increase the maximum number of occurrences to 2, so the
164     regexp becomes \b{[0-9]{1,2}}. This regexp satisfies the
165     original requirement to match integers from 0 to 99, but it will
166     also match integers that occur in the middle of strings. If we
167     want the matched integer to be the whole string, we must use the
168     anchor assertions, \b{^} (caret) and \b{$} (dollar). When
169     \b{^} is the first character in a regexp, it means the regexp
170     must match from the beginning of the string. When \b{$} is the
171     last character of the regexp, it means the regexp must match to
172     the end of the string. The regexp becomes \b{^[0-9]{1,2}$}.
173     Note that assertions, e.g. \b{^} and \b{$}, do not match
174     characters but locations in the string.
175 
176     If you have seen regexps described elsewhere, they may have looked
177     different from the ones shown here. This is because some sets of
178     characters and some quantifiers are so common that they have been
179     given special symbols to represent them. \b{[0-9]} can be
180     replaced with the symbol \b{\\d}. The quantifier to match
181     exactly one occurrence, \b{{1,1}}, can be replaced with the
182     expression itself, i.e. \b{x{1,1}} is the same as \b{x}. So
183     our 0 to 99 matcher could be written as \b{^\\d{1,2}$}. It can
184     also be written \b{^\\d\\d{0,1}$}, i.e. \e{From the start of
185     the string, match a digit, followed immediately by 0 or 1 digits}.
186     In practice, it would be written as \b{^\\d\\d?$}. The \b{?}
187     is shorthand for the quantifier \b{{0,1}}, i.e. 0 or 1
188     occurrences. \b{?} makes an expression optional. The regexp
189     \b{^\\d\\d?$} means \e{From the beginning of the string, match
190     one digit, followed immediately by 0 or 1 more digit, followed
191     immediately by end of string}.
192 
193     To write a regexp that matches one of the words 'mail' \e or
194     'letter' \e or 'correspondence' but does not match words that
195     contain these words, e.g., 'email', 'mailman', 'mailer', and
196     'letterbox', start with a regexp that matches 'mail'. Expressed
197     fully, the regexp is \b{m{1,1}a{1,1}i{1,1}l{1,1}}, but because
198     a character expression is automatically quantified by
199     \b{{1,1}}, we can simplify the regexp to \b{mail}, i.e., an
200     'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now
201     we can use the vertical bar \b{|}, which means \b{or}, to
202     include the other two words, so our regexp for matching any of the
203     three words becomes \b{mail|letter|correspondence}. Match
204     'mail' \b{or} 'letter' \b{or} 'correspondence'. While this
205     regexp will match one of the three words we want to match, it will
206     also match words we don't want to match, e.g., 'email'.  To
207     prevent the regexp from matching unwanted words, we must tell it
208     to begin and end the match at word boundaries. First we enclose
209     our regexp in parentheses, \b{(mail|letter|correspondence)}.
210     Parentheses group expressions together, and they identify a part
211     of the regexp that we wish to \l{capturing text}{capture}.
212     Enclosing the expression in parentheses allows us to use it as a
213     component in more complex regexps. It also allows us to examine
214     which of the three words was actually matched. To force the match
215     to begin and end on word boundaries, we enclose the regexp in
216     \b{\\b} \e{word boundary} assertions:
217     \b{\\b(mail|letter|correspondence)\\b}.  Now the regexp means:
218     \e{Match a word boundary, followed by the regexp in parentheses,
219     followed by a word boundary}. The \b{\\b} assertion matches a
220     \e position in the regexp, not a \e character. A word boundary is
221     any non-word character, e.g., a space, newline, or the beginning
222     or ending of a string.
223 
224     If we want to replace ampersand characters with the HTML entity
225     \b{\&amp;}, the regexp to match is simply \b{\&}. But this
226     regexp will also match ampersands that have already been converted
227     to HTML entities. We want to replace only ampersands that are not
228     already followed by \b{amp;}. For this, we need the negative
229     lookahead assertion, \b{(?!}__\b{)}. The regexp can then be
230     written as \b{\&(?!amp;)}, i.e. \e{Match an ampersand that is}
231     \b{not} \e{followed by} \b{amp;}.
232 
233     If we want to count all the occurrences of 'Eric' and 'Eirik' in a
234     string, two valid solutions are \b{\\b(Eric|Eirik)\\b} and
235     \b{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is
236     required to avoid matching words that contain either name,
237     e.g. 'Ericsson'. Note that the second regexp matches more
238     spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'.
239 
240     Some of the examples discussed above are implemented in the
241     \l{#code-examples}{code examples} section.
242 
243     \target characters-and-abbreviations-for-sets-of-characters
244     \section1 Characters and Abbreviations for Sets of Characters
245 
246     \table
247     \header \li Element \li Meaning
248     \row \li \b{c}
249          \li A character represents itself unless it has a special
250          regexp meaning. e.g. \b{c} matches the character \e c.
251     \row \li \b{\\c}
252          \li A character that follows a backslash matches the character
253          itself, except as specified below. e.g., To match a literal
254          caret at the beginning of a string, write \b{\\^}.
255     \row \li \b{\\a}
256          \li Matches the ASCII bell (BEL, 0x07).
257     \row \li \b{\\f}
258          \li Matches the ASCII form feed (FF, 0x0C).
259     \row \li \b{\\n}
260          \li Matches the ASCII line feed (LF, 0x0A, Unix newline).
261     \row \li \b{\\r}
262          \li Matches the ASCII carriage return (CR, 0x0D).
263     \row \li \b{\\t}
264          \li Matches the ASCII horizontal tab (HT, 0x09).
265     \row \li \b{\\v}
266          \li Matches the ASCII vertical tab (VT, 0x0B).
267     \row \li \b{\\x\e{hhhh}}
268          \li Matches the Unicode character corresponding to the
269          hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF).
270     \row \li \b{\\0\e{ooo}} (i.e., \\zero \e{ooo})
271          \li matches the ASCII/Latin1 character for the octal number
272          \e{ooo} (between 0 and 0377).
273     \row \li \b{. (dot)}
274          \li Matches any character (including newline).
275     \row \li \b{\\d}
276          \li Matches a digit (QChar::isDigit()).
277     \row \li \b{\\D}
278          \li Matches a non-digit.
279     \row \li \b{\\s}
280          \li Matches a whitespace character (QChar::isSpace()).
281     \row \li \b{\\S}
282          \li Matches a non-whitespace character.
283     \row \li \b{\\w}
284          \li Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_').
285     \row \li \b{\\W}
286          \li Matches a non-word character.
287     \row \li \b{\\\e{n}}
288          \li The \e{n}-th backreference, e.g. \\1, \\2, etc.
289     \endtable
290 
291     \b{Note:} The C++ compiler transforms backslashes in strings.
292     To include a \b{\\} in a regexp, enter it twice, i.e. \c{\\}.
293     To match the backslash character itself, enter it four times, i.e.
294     \c{\\\\}.
295 
296     \target sets-of-characters
297     \section1 Sets of Characters
298 
299     Square brackets mean match any character contained in the square
300     brackets. The character set abbreviations described above can
301     appear in a character set in square brackets. Except for the
302     character set abbreviations and the following two exceptions,
303     characters do not have special meanings in square brackets.
304 
305     \table
306     \row \li \b{^}
307 
308          \li The caret negates the character set if it occurs as the
309          first character (i.e. immediately after the opening square
310          bracket). \b{[abc]} matches 'a' or 'b' or 'c', but
311          \b{[^abc]} matches anything \e but 'a' or 'b' or 'c'.
312 
313     \row \li \b{-}
314 
315          \li The dash indicates a range of characters. \b{[W-Z]}
316          matches 'W' or 'X' or 'Y' or 'Z'.
317 
318     \endtable
319 
320     Using the predefined character set abbreviations is more portable
321     than using character ranges across platforms and languages. For
322     example, \b{[0-9]} matches a digit in Western alphabets but
323     \b{\\d} matches a digit in \e any alphabet.
324 
325     Note: In other regexp documentation, sets of characters are often
326     called "character classes".
327 
328     \target quantifiers
329     \section1 Quantifiers
330 
331     By default, an expression is automatically quantified by
332     \b{{1,1}}, i.e. it should occur exactly once. In the following
333     list, \b{\e {E}} stands for expression. An expression is a
334     character, or an abbreviation for a set of characters, or a set of
335     characters in square brackets, or an expression in parentheses.
336 
337     \table
338     \row \li \b{\e {E}?}
339 
340          \li Matches zero or one occurrences of \e E. This quantifier
341          means \e{The previous expression is optional}, because it
342          will match whether or not the expression is found. \b{\e
343          {E}?} is the same as \b{\e {E}{0,1}}. e.g., \b{dents?}
344          matches 'dent' or 'dents'.
345 
346     \row \li \b{\e {E}+}
347 
348          \li Matches one or more occurrences of \e E. \b{\e {E}+} is
349          the same as \b{\e {E}{1,}}. e.g., \b{0+} matches '0',
350          '00', '000', etc.
351 
352     \row \li \b{\e {E}*}
353 
354          \li Matches zero or more occurrences of \e E. It is the same
355          as \b{\e {E}{0,}}. The \b{*} quantifier is often used
356          in error where \b{+} should be used. For example, if
357          \b{\\s*$} is used in an expression to match strings that
358          end in whitespace, it will match every string because
359          \b{\\s*$} means \e{Match zero or more whitespaces followed
360          by end of string}. The correct regexp to match strings that
361          have at least one trailing whitespace character is
362          \b{\\s+$}.
363 
364     \row \li \b{\e {E}{n}}
365 
366          \li Matches exactly \e n occurrences of \e E. \b{\e {E}{n}}
367          is the same as repeating \e E \e n times. For example,
368          \b{x{5}} is the same as \b{xxxxx}. It is also the same
369          as \b{\e {E}{n,n}}, e.g. \b{x{5,5}}.
370 
371     \row \li \b{\e {E}{n,}}
372          \li Matches at least \e n occurrences of \e E.
373 
374     \row \li \b{\e {E}{,m}}
375          \li Matches at most \e m occurrences of \e E. \b{\e {E}{,m}}
376          is the same as \b{\e {E}{0,m}}.
377 
378     \row \li \b{\e {E}{n,m}}
379          \li Matches at least \e n and at most \e m occurrences of \e E.
380     \endtable
381 
382     To apply a quantifier to more than just the preceding character,
383     use parentheses to group characters together in an expression. For
384     example, \b{tag+} matches a 't' followed by an 'a' followed by
385     at least one 'g', whereas \b{(tag)+} matches at least one
386     occurrence of 'tag'.
387 
388     Note: Quantifiers are normally "greedy". They always match as much
389     text as they can. For example, \b{0+} matches the first zero it
390     finds and all the consecutive zeros after the first zero. Applied
391     to '20005', it matches '2\underline{000}5'. Quantifiers can be made
392     non-greedy, see setMinimal().
393 
394     \target capturing parentheses
395     \target backreferences
396     \section1 Capturing Text
397 
398     Parentheses allow us to group elements together so that we can
399     quantify and capture them. For example if we have the expression
400     \b{mail|letter|correspondence} that matches a string we know
401     that \e one of the words matched but not which one. Using
402     parentheses allows us to "capture" whatever is matched within
403     their bounds, so if we used \b{(mail|letter|correspondence)}
404     and matched this regexp against the string "I sent you some email"
405     we can use the cap() or capturedTexts() functions to extract the
406     matched characters, in this case 'mail'.
407 
408     We can use captured text within the regexp itself. To refer to the
409     captured text we use \e backreferences which are indexed from 1,
410     the same as for cap(). For example we could search for duplicate
411     words in a string using \b{\\b(\\w+)\\W+\\1\\b} which means match a
412     word boundary followed by one or more word characters followed by
413     one or more non-word characters followed by the same text as the
414     first parenthesized expression followed by a word boundary.
415 
416     If we want to use parentheses purely for grouping and not for
417     capturing we can use the non-capturing syntax, e.g.
418     \b{(?:green|blue)}. Non-capturing parentheses begin '(?:' and
419     end ')'. In this example we match either 'green' or 'blue' but we
420     do not capture the match so we only know whether or not we matched
421     but not which color we actually found. Using non-capturing
422     parentheses is more efficient than using capturing parentheses
423     since the regexp engine has to do less book-keeping.
424 
425     Both capturing and non-capturing parentheses may be nested.
426 
427     \target greedy quantifiers
428 
429     For historical reasons, quantifiers (e.g. \b{*}) that apply to
430     capturing parentheses are more "greedy" than other quantifiers.
431     For example, \b{a*(a*)} will match "aaa" with cap(1) == "aaa".
432     This behavior is different from what other regexp engines do
433     (notably, Perl). To obtain a more intuitive capturing behavior,
434     specify QRegExp::RegExp2 to the QRegExp constructor or call
435     setPatternSyntax(QRegExp::RegExp2).
436 
437     \target cap_in_a_loop
438 
439     When the number of matches cannot be determined in advance, a
440     common idiom is to use cap() in a loop. For example:
441 
442     \snippet code/src_corelib_tools_qregexp.cpp 0
443 
444     \target assertions
445     \section1 Assertions
446 
447     Assertions make some statement about the text at the point where
448     they occur in the regexp but they do not match any characters. In
449     the following list \b{\e {E}} stands for any expression.
450 
451     \table
452     \row \li \b{^}
453          \li The caret signifies the beginning of the string. If you
454          wish to match a literal \c{^} you must escape it by
455          writing \c{\\^}. For example, \b{^#include} will only
456          match strings which \e begin with the characters '#include'.
457          (When the caret is the first character of a character set it
458          has a special meaning, see \l{#sets-of-characters}{Sets of Characters}.)
459 
460     \row \li \b{$}
461          \li The dollar signifies the end of the string. For example
462          \b{\\d\\s*$} will match strings which end with a digit
463          optionally followed by whitespace. If you wish to match a
464          literal \c{$} you must escape it by writing
465          \c{\\$}.
466 
467     \row \li \b{\\b}
468          \li A word boundary. For example the regexp
469          \b{\\bOK\\b} means match immediately after a word
470          boundary (e.g. start of string or whitespace) the letter 'O'
471          then the letter 'K' immediately before another word boundary
472          (e.g. end of string or whitespace). But note that the
473          assertion does not actually match any whitespace so if we
474          write \b{(\\bOK\\b)} and we have a match it will only
475          contain 'OK' even if the string is "It's \underline{OK} now".
476 
477     \row \li \b{\\B}
478          \li A non-word boundary. This assertion is true wherever
479          \b{\\b} is false. For example if we searched for
480          \b{\\Bon\\B} in "Left on" the match would fail (space
481          and end of string aren't non-word boundaries), but it would
482          match in "t\underline{on}ne".
483 
484     \row \li \b{(?=\e E)}
485          \li Positive lookahead. This assertion is true if the
486          expression matches at this point in the regexp. For example,
487          \b{const(?=\\s+char)} matches 'const' whenever it is
488          followed by 'char', as in 'static \underline{const} char *'.
489          (Compare with \b{const\\s+char}, which matches 'static
490          \underline{const char} *'.)
491 
492     \row \li \b{(?!\e E)}
493          \li Negative lookahead. This assertion is true if the
494          expression does not match at this point in the regexp. For
495          example, \b{const(?!\\s+char)} matches 'const' \e except
496          when it is followed by 'char'.
497     \endtable
498 
499     \target QRegExp wildcard matching
500     \section1 Wildcard Matching
501 
502     Most command shells such as \e bash or \e cmd.exe support "file
503     globbing", the ability to identify a group of files by using
504     wildcards. The setPatternSyntax() function is used to switch
505     between regexp and wildcard mode. Wildcard matching is much
506     simpler than full regexps and has only four features:
507 
508     \table
509     \row \li \b{c}
510          \li Any character represents itself apart from those mentioned
511          below. Thus \b{c} matches the character \e c.
512     \row \li \b{?}
513          \li Matches any single character. It is the same as
514          \b{.} in full regexps.
515     \row \li \b{*}
516          \li Matches zero or more of any characters. It is the
517          same as \b{.*} in full regexps.
518     \row \li \b{[...]}
519          \li Sets of characters can be represented in square brackets,
520          similar to full regexps. Within the character class, like
521          outside, backslash has no special meaning.
522     \endtable
523 
524     In the mode Wildcard, the wildcard characters cannot be
525     escaped. In the mode WildcardUnix, the character '\\' escapes the
526     wildcard.
527 
528     For example if we are in wildcard mode and have strings which
529     contain filenames we could identify HTML files with \b{*.html}.
530     This will match zero or more characters followed by a dot followed
531     by 'h', 't', 'm' and 'l'.
532 
533     To test a string against a wildcard expression, use exactMatch().
534     For example:
535 
536     \snippet code/src_corelib_tools_qregexp.cpp 1
537 
538     \target perl-users
539     \section1 Notes for Perl Users
540 
541     Most of the character class abbreviations supported by Perl are
542     supported by QRegExp, see \l{#characters-and-abbreviations-for-sets-of-characters}
543     {characters and abbreviations for sets of characters}.
544 
545     In QRegExp, apart from within character classes, \c{^} always
546     signifies the start of the string, so carets must always be
547     escaped unless used for that purpose. In Perl the meaning of caret
548     varies automagically depending on where it occurs so escaping it
549     is rarely necessary. The same applies to \c{$} which in
550     QRegExp always signifies the end of the string.
551 
552     QRegExp's quantifiers are the same as Perl's greedy quantifiers
553     (but see the \l{greedy quantifiers}{note above}). Non-greedy
554     matching cannot be applied to individual quantifiers, but can be
555     applied to all the quantifiers in the pattern. For example, to
556     match the Perl regexp \b{ro+?m} requires:
557 
558     \snippet code/src_corelib_tools_qregexp.cpp 2
559 
560     The equivalent of Perl's \c{/i} option is
561     setCaseSensitivity(Qt::CaseInsensitive).
562 
563     Perl's \c{/g} option can be emulated using a \l{#cap_in_a_loop}{loop}.
564 
565     In QRegExp \b{.} matches any character, therefore all QRegExp
566     regexps have the equivalent of Perl's \c{/s} option. QRegExp
567     does not have an equivalent to Perl's \c{/m} option, but this
568     can be emulated in various ways for example by splitting the input
569     into lines or by looping with a regexp that searches for newlines.
570 
571     Because QRegExp is string oriented, there are no \\A, \\Z, or \\z
572     assertions. The \\G assertion is not supported but can be emulated
573     in a loop.
574 
575     Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp
576     equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,
577     ... correspond to cap(1) or capturedTexts()[1], cap(2) or
578     capturedTexts()[2], etc.
579 
580     To substitute a pattern use QString::replace().
581 
582     Perl's extended \c{/x} syntax is not supported, nor are
583     directives, e.g. (?i), or regexp comments, e.g. (?#comment). On
584     the other hand, C++'s rules for literal strings can be used to
585     achieve the same:
586 
587     \snippet code/src_corelib_tools_qregexp.cpp 3
588 
589     Both zero-width positive and zero-width negative lookahead
590     assertions (?=pattern) and (?!pattern) are supported with the same
591     syntax as Perl. Perl's lookbehind assertions, "independent"
592     subexpressions and conditional expressions are not supported.
593 
594     Non-capturing parentheses are also supported, with the same
595     (?:pattern) syntax.
596 
597     See QString::split() and QStringList::join() for equivalents
598     to Perl's split and join functions.
599 
600     Note: because C++ transforms \\'s they must be written \e twice in
601     code, e.g. \b{\\b} must be written \b{\\\\b}.
602 
603     \target code-examples
604     \section1 Code Examples
605 
606     \snippet code/src_corelib_tools_qregexp.cpp 4
607 
608     The third string matches '\underline{6}'. This is a simple validation
609     regexp for integers in the range 0 to 99.
610 
611     \snippet code/src_corelib_tools_qregexp.cpp 5
612 
613     The second string matches '\underline{This_is-OK}'. We've used the
614     character set abbreviation '\\S' (non-whitespace) and the anchors
615     to match strings which contain no whitespace.
616 
617     In the following example we match strings containing 'mail' or
618     'letter' or 'correspondence' but only match whole words i.e. not
619     'email'
620 
621     \snippet code/src_corelib_tools_qregexp.cpp 6
622 
623     The second string matches "Please write the \underline{letter}". The
624     word 'letter' is also captured (because of the parentheses). We
625     can see what text we've captured like this:
626 
627     \snippet code/src_corelib_tools_qregexp.cpp 7
628 
629     This will capture the text from the first set of capturing
630     parentheses (counting capturing left parentheses from left to
631     right). The parentheses are counted from 1 since cap(0) is the
632     whole matched regexp (equivalent to '&' in most regexp engines).
633 
634     \snippet code/src_corelib_tools_qregexp.cpp 8
635 
636     Here we've passed the QRegExp to QString's replace() function to
637     replace the matched text with new text.
638 
639     \snippet code/src_corelib_tools_qregexp.cpp 9
640 
641     We've used the indexIn() function to repeatedly match the regexp in
642     the string. Note that instead of moving forward by one character
643     at a time \c pos++ we could have written \c {pos +=
644     rx.matchedLength()} to skip over the already matched string. The
645     count will equal 3, matching 'One \underline{Eric} another
646     \underline{Eirik}, and an Ericsson. How many Eiriks, \underline{Eric}?'; it
647     doesn't match 'Ericsson' or 'Eiriks' because they are not bounded
648     by non-word boundaries.
649 
650     One common use of regexps is to split lines of delimited data into
651     their component fields.
652 
653     \snippet code/src_corelib_tools_qregexp.cpp 10
654 
655     In this example our input lines have the format company name, web
656     address and country. Unfortunately the regexp is rather long and
657     not very versatile -- the code will break if we add any more
658     fields. A simpler and better solution is to look for the
659     separator, '\\t' in this case, and take the surrounding text. The
660     QString::split() function can take a separator string or regexp
661     as an argument and split a string accordingly.
662 
663     \snippet code/src_corelib_tools_qregexp.cpp 11
664 
665     Here field[0] is the company, field[1] the web address and so on.
666 
667     To imitate the matching of a shell we can use wildcard mode.
668 
669     \snippet code/src_corelib_tools_qregexp.cpp 12
670 
671     Wildcard matching can be convenient because of its simplicity, but
672     any wildcard regexp can be defined using full regexps, e.g.
673     \b{.*\\.html$}. Notice that we can't match both \c .html and \c
674     .htm files with a wildcard unless we use \b{*.htm*} which will
675     also match 'test.html.bak'. A full regexp gives us the precision
676     we need, \b{.*\\.html?$}.
677 
678     QRegExp can match case insensitively using setCaseSensitivity(),
679     and can use non-greedy matching, see setMinimal(). By
680     default QRegExp uses full regexps but this can be changed with
681     setPatternSyntax(). Searching can be done forward with indexIn() or backward
682     with lastIndexIn(). Captured text can be accessed using
683     capturedTexts() which returns a string list of all captured
684     strings, or using cap() which returns the captured string for the
685     given index. The pos() function takes a match index and returns
686     the position in the string where the match was made (or -1 if
687     there was no match).
688 
689     \sa QString, QStringList, QRegExpValidator, QSortFilterProxyModel,
690         {tools/regexp}{Regular Expression Example}
691 */
692 
693 #if defined(Q_OS_VXWORKS) && defined(EOS)
694 #  undef EOS
695 #endif
696 
697 const int NumBadChars = 64;
698 #define BadChar(ch) ((ch).unicode() % NumBadChars)
699 
700 const int NoOccurrence = INT_MAX;
701 const int EmptyCapture = INT_MAX;
702 const int InftyLen = INT_MAX;
703 const int InftyRep = 1025;
704 const int EOS = -1;
705 
isWord(QChar ch)706 static bool isWord(QChar ch)
707 {
708     return ch.isLetterOrNumber() || ch.isMark() || ch == QLatin1Char('_');
709 }
710 
711 /*
712   Merges two vectors of ints and puts the result into the first
713   one.
714 */
mergeInto(QVector<int> * a,const QVector<int> & b)715 static void mergeInto(QVector<int> *a, const QVector<int> &b)
716 {
717     int asize = a->size();
718     int bsize = b.size();
719     if (asize == 0) {
720         *a = b;
721 #ifndef QT_NO_REGEXP_OPTIM
722     } else if (bsize == 1 && a->at(asize - 1) < b.at(0)) {
723         a->resize(asize + 1);
724         (*a)[asize] = b.at(0);
725 #endif
726     } else if (bsize >= 1) {
727         int csize = asize + bsize;
728         QVector<int> c(csize);
729         int i = 0, j = 0, k = 0;
730         while (i < asize) {
731             if (j < bsize) {
732                 if (a->at(i) == b.at(j)) {
733                     ++i;
734                     --csize;
735                 } else if (a->at(i) < b.at(j)) {
736                     c[k++] = a->at(i++);
737                 } else {
738                     c[k++] = b.at(j++);
739                 }
740             } else {
741                 memcpy(c.data() + k, a->constData() + i, (asize - i) * sizeof(int));
742                 break;
743             }
744         }
745         c.resize(csize);
746         if (j < bsize)
747             memcpy(c.data() + k, b.constData() + j, (bsize - j) * sizeof(int));
748         *a = c;
749     }
750 }
751 
752 #ifndef QT_NO_REGEXP_WILDCARD
753 /*
754   Translates a wildcard pattern to an equivalent regular expression
755   pattern (e.g., *.cpp to .*\.cpp).
756 
757   If enableEscaping is true, it is possible to escape the wildcard
758   characters with \
759 */
wc2rx(const QString & wc_str,const bool enableEscaping)760 static QString wc2rx(const QString &wc_str, const bool enableEscaping)
761 {
762     const int wclen = wc_str.length();
763     QString rx;
764     int i = 0;
765     bool isEscaping = false; // the previous character is '\'
766     const QChar *wc = wc_str.unicode();
767 
768     while (i < wclen) {
769         const QChar c = wc[i++];
770         switch (c.unicode()) {
771         case '\\':
772             if (enableEscaping) {
773                 if (isEscaping) {
774                     rx += QLatin1String("\\\\");
775                 } // we insert the \\ later if necessary
776                 if (i == wclen) { // the end
777                     rx += QLatin1String("\\\\");
778                 }
779             } else {
780                 rx += QLatin1String("\\\\");
781             }
782             isEscaping = true;
783             break;
784         case '*':
785             if (isEscaping) {
786                 rx += QLatin1String("\\*");
787                 isEscaping = false;
788             } else {
789                 rx += QLatin1String(".*");
790             }
791             break;
792         case '?':
793             if (isEscaping) {
794                 rx += QLatin1String("\\?");
795                 isEscaping = false;
796             } else {
797                 rx += QLatin1Char('.');
798             }
799 
800             break;
801         case '$':
802         case '(':
803         case ')':
804         case '+':
805         case '.':
806         case '^':
807         case '{':
808         case '|':
809         case '}':
810             if (isEscaping) {
811                 isEscaping = false;
812                 rx += QLatin1String("\\\\");
813             }
814             rx += QLatin1Char('\\');
815             rx += c;
816             break;
817          case '[':
818             if (isEscaping) {
819                 isEscaping = false;
820                 rx += QLatin1String("\\[");
821             } else {
822                 rx += c;
823                 if (wc[i] == QLatin1Char('^'))
824                     rx += wc[i++];
825                 if (i < wclen) {
826                     if (wc[i] == QLatin1Char(']'))
827                         rx += wc[i++];
828                     while (i < wclen && wc[i] != QLatin1Char(']')) {
829                         if (wc[i] == QLatin1Char('\\'))
830                             rx += QLatin1Char('\\');
831                         rx += wc[i++];
832                     }
833                 }
834             }
835              break;
836 
837         case ']':
838             if(isEscaping){
839                 isEscaping = false;
840                 rx += QLatin1String("\\");
841             }
842             rx += c;
843             break;
844 
845         default:
846             if(isEscaping){
847                 isEscaping = false;
848                 rx += QLatin1String("\\\\");
849             }
850             rx += c;
851         }
852     }
853     return rx;
854 }
855 #endif
856 
caretIndex(int offset,QRegExp::CaretMode caretMode)857 static int caretIndex(int offset, QRegExp::CaretMode caretMode)
858 {
859     if (caretMode == QRegExp::CaretAtZero) {
860         return 0;
861     } else if (caretMode == QRegExp::CaretAtOffset) {
862         return offset;
863     } else { // QRegExp::CaretWontMatch
864         return -1;
865     }
866 }
867 
868 /*
869     The QRegExpEngineKey struct uniquely identifies an engine.
870 */
871 struct QRegExpEngineKey
872 {
873     QString pattern;
874     QRegExp::PatternSyntax patternSyntax;
875     Qt::CaseSensitivity cs;
876 
QRegExpEngineKeyQRegExpEngineKey877     inline QRegExpEngineKey(const QString &pattern, QRegExp::PatternSyntax patternSyntax,
878                             Qt::CaseSensitivity cs)
879         : pattern(pattern), patternSyntax(patternSyntax), cs(cs) {}
880 
clearQRegExpEngineKey881     inline void clear() {
882         pattern.clear();
883         patternSyntax = QRegExp::RegExp;
884         cs = Qt::CaseSensitive;
885     }
886 };
887 
operator ==(const QRegExpEngineKey & key1,const QRegExpEngineKey & key2)888 static bool operator==(const QRegExpEngineKey &key1, const QRegExpEngineKey &key2)
889 {
890     return key1.pattern == key2.pattern && key1.patternSyntax == key2.patternSyntax
891            && key1.cs == key2.cs;
892 }
893 
qHash(const QRegExpEngineKey & key,uint seed=0)894 static uint qHash(const QRegExpEngineKey &key, uint seed = 0) noexcept
895 {
896     QtPrivate::QHashCombine hash;
897     seed = hash(seed, key.pattern);
898     seed = hash(seed, key.patternSyntax);
899     seed = hash(seed, key.cs);
900     return seed;
901 }
902 
903 class QRegExpEngine;
904 
905 //Q_DECLARE_TYPEINFO(QVector<int>, Q_MOVABLE_TYPE);
906 
907 /*
908   This is the engine state during matching.
909 */
910 struct QRegExpMatchState
911 {
912     const QChar *in; // a pointer to the input string data
913     int pos; // the current position in the string
914     int caretPos;
915     int len; // the length of the input string
916     bool minimal; // minimal matching?
917     int *bigArray; // big array holding the data for the next pointers
918     int *inNextStack; // is state is nextStack?
919     int *curStack; // stack of current states
920     int *nextStack; // stack of next states
921     int *curCapBegin; // start of current states' captures
922     int *nextCapBegin; // start of next states' captures
923     int *curCapEnd; // end of current states' captures
924     int *nextCapEnd; // end of next states' captures
925     int *tempCapBegin; // start of temporary captures
926     int *tempCapEnd; // end of temporary captures
927     int *capBegin; // start of captures for a next state
928     int *capEnd; // end of captures for a next state
929     int *slideTab; // bump-along slide table for bad-character heuristic
930     int *captured; // what match() returned last
931     int slideTabSize; // size of slide table
932     int capturedSize;
933 #ifndef QT_NO_REGEXP_BACKREF
934     QList<QVector<int> > sleeping; // list of back-reference sleepers
935 #endif
936     int matchLen; // length of match
937     int oneTestMatchedLen; // length of partial match
938 
939     const QRegExpEngine *eng;
940 
QRegExpMatchStateQRegExpMatchState941     inline QRegExpMatchState() : bigArray(nullptr), captured(nullptr) {}
~QRegExpMatchStateQRegExpMatchState942     inline ~QRegExpMatchState() { free(bigArray); }
943 
drainQRegExpMatchState944     void drain() { free(bigArray); bigArray = nullptr; captured = nullptr; } // to save memory
945     void prepareForMatch(QRegExpEngine *eng);
946     void match(const QChar *str, int len, int pos, bool minimal,
947         bool oneTest, int caretIndex);
948     bool matchHere();
949     bool testAnchor(int i, int a, const int *capBegin);
950 };
951 
952 /*
953   The struct QRegExpAutomatonState represents one state in a modified NFA. The
954   input characters matched are stored in the state instead of on
955   the transitions, something possible for an automaton
956   constructed from a regular expression.
957 */
958 struct QRegExpAutomatonState
959 {
960 #ifndef QT_NO_REGEXP_CAPTURE
961     int atom; // which atom does this state belong to?
962 #endif
963     int match; // what does it match? (see CharClassBit and BackRefBit)
964     QVector<int> outs; // out-transitions
965     QMap<int, int> reenter; // atoms reentered when transiting out
966     QMap<int, int> anchors; // anchors met when transiting out
967 
QRegExpAutomatonStateQRegExpAutomatonState968     inline QRegExpAutomatonState() { }
969 #ifndef QT_NO_REGEXP_CAPTURE
QRegExpAutomatonStateQRegExpAutomatonState970     inline QRegExpAutomatonState(int a, int m)
971         : atom(a), match(m) { }
972 #else
QRegExpAutomatonStateQRegExpAutomatonState973     inline QRegExpAutomatonState(int m)
974         : match(m) { }
975 #endif
976 };
977 
978 Q_DECLARE_TYPEINFO(QRegExpAutomatonState, Q_MOVABLE_TYPE);
979 
980 /*
981   The struct QRegExpCharClassRange represents a range of characters (e.g.,
982   [0-9] denotes range 48 to 57).
983 */
984 struct QRegExpCharClassRange
985 {
986     ushort from; // 48
987     ushort len; // 10
988 };
989 
990 Q_DECLARE_TYPEINFO(QRegExpCharClassRange, Q_PRIMITIVE_TYPE);
991 
992 #ifndef QT_NO_REGEXP_CAPTURE
993 /*
994   The struct QRegExpAtom represents one node in the hierarchy of regular
995   expression atoms.
996 */
997 struct QRegExpAtom
998 {
999     enum { NoCapture = -1, OfficialCapture = -2, UnofficialCapture = -3 };
1000 
1001     int parent; // index of parent in array of atoms
1002     int capture; // index of capture, from 1 to ncap - 1
1003 };
1004 
1005 Q_DECLARE_TYPEINFO(QRegExpAtom, Q_PRIMITIVE_TYPE);
1006 #endif
1007 
1008 struct QRegExpLookahead;
1009 
1010 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1011 /*
1012   The struct QRegExpAnchorAlternation represents a pair of anchors with
1013   OR semantics.
1014 */
1015 struct QRegExpAnchorAlternation
1016 {
1017     int a; // this anchor...
1018     int b; // ...or this one
1019 };
1020 
1021 Q_DECLARE_TYPEINFO(QRegExpAnchorAlternation, Q_PRIMITIVE_TYPE);
1022 #endif
1023 
1024 #ifndef QT_NO_REGEXP_CCLASS
1025 
1026 #define FLAG(x) (1 << (x))
1027 /*
1028   The class QRegExpCharClass represents a set of characters, such as can
1029   be found in regular expressions (e.g., [a-z] denotes the set
1030   {a, b, ..., z}).
1031 */
1032 class QRegExpCharClass
1033 {
1034 public:
1035     QRegExpCharClass();
1036 
1037     void clear();
negative() const1038     bool negative() const { return n; }
1039     void setNegative(bool negative);
1040     void addCategories(uint cats);
1041     void addRange(ushort from, ushort to);
addSingleton(ushort ch)1042     void addSingleton(ushort ch) { addRange(ch, ch); }
1043 
1044     bool in(QChar ch) const;
1045 #ifndef QT_NO_REGEXP_OPTIM
firstOccurrence() const1046     const QVector<int> &firstOccurrence() const { return occ1; }
1047 #endif
1048 
1049 #if defined(QT_DEBUG)
1050     void dump() const;
1051 #endif
1052 
1053 private:
1054     QVector<QRegExpCharClassRange> r; // character ranges
1055 #ifndef QT_NO_REGEXP_OPTIM
1056     QVector<int> occ1; // first-occurrence array
1057 #endif
1058     uint c; // character classes
1059     bool n; // negative?
1060 };
1061 #else
1062 struct QRegExpCharClass
1063 {
1064     int dummy;
1065 
1066 #ifndef QT_NO_REGEXP_OPTIM
QRegExpCharClassQRegExpCharClass1067     QRegExpCharClass() { occ1.fill(0, NumBadChars); }
1068 
firstOccurrenceQRegExpCharClass1069     const QVector<int> &firstOccurrence() const { return occ1; }
1070     QVector<int> occ1;
1071 #endif
1072 };
1073 #endif
1074 
1075 Q_DECLARE_TYPEINFO(QRegExpCharClass, Q_MOVABLE_TYPE);
1076 
1077 /*
1078   The QRegExpEngine class encapsulates a modified nondeterministic
1079   finite automaton (NFA).
1080 */
1081 class QRegExpEngine
1082 {
1083 public:
QRegExpEngine(Qt::CaseSensitivity cs,bool greedyQuantifiers)1084     QRegExpEngine(Qt::CaseSensitivity cs, bool greedyQuantifiers)
1085         : cs(cs), greedyQuantifiers(greedyQuantifiers) { setup(); }
1086 
1087     QRegExpEngine(const QRegExpEngineKey &key);
1088     ~QRegExpEngine();
1089 
isValid() const1090     bool isValid() const { return valid; }
errorString() const1091     const QString &errorString() const { return yyError; }
captureCount() const1092     int captureCount() const { return officialncap; }
1093 
1094     int createState(QChar ch);
1095     int createState(const QRegExpCharClass &cc);
1096 #ifndef QT_NO_REGEXP_BACKREF
1097     int createState(int bref);
1098 #endif
1099 
1100     void addCatTransitions(const QVector<int> &from, const QVector<int> &to);
1101 #ifndef QT_NO_REGEXP_CAPTURE
1102     void addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom);
1103 #endif
1104 
1105 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1106     int anchorAlternation(int a, int b);
1107     int anchorConcatenation(int a, int b);
1108 #else
anchorAlternation(int a,int b)1109     int anchorAlternation(int a, int b) { return a & b; }
anchorConcatenation(int a,int b)1110     int anchorConcatenation(int a, int b) { return a | b; }
1111 #endif
1112     void addAnchors(int from, int to, int a);
1113 
1114 #ifndef QT_NO_REGEXP_OPTIM
1115     void heuristicallyChooseHeuristic();
1116 #endif
1117 
1118 #if defined(QT_DEBUG)
1119     void dump() const;
1120 #endif
1121 
1122     QAtomicInt ref;
1123 
1124 private:
1125     enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };
1126     enum { InitialState = 0, FinalState = 1 };
1127 
1128     void setup();
1129     int setupState(int match);
1130 
1131     /*
1132       Let's hope that 13 lookaheads and 14 back-references are
1133       enough.
1134      */
1135     enum { MaxLookaheads = 13, MaxBackRefs = 14 };
1136     enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, Anchor_Word = 0x00000004,
1137            Anchor_NonWord = 0x00000008, Anchor_FirstLookahead = 0x00000010,
1138            Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
1139            Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,
1140            Anchor_Alternation = unsigned(Anchor_BackRef1Empty) << MaxBackRefs,
1141 
1142            Anchor_LookaheadMask = (Anchor_FirstLookahead - 1) ^
1143                    ((Anchor_FirstLookahead << MaxLookaheads) - 1) };
1144 #ifndef QT_NO_REGEXP_CAPTURE
1145     int startAtom(bool officialCapture);
1146     void finishAtom(int atom, bool needCapture);
1147 #endif
1148 
1149 #ifndef QT_NO_REGEXP_LOOKAHEAD
1150     int addLookahead(QRegExpEngine *eng, bool negative);
1151 #endif
1152 
1153 #ifndef QT_NO_REGEXP_OPTIM
1154     bool goodStringMatch(QRegExpMatchState &matchState) const;
1155     bool badCharMatch(QRegExpMatchState &matchState) const;
1156 #else
1157     bool bruteMatch(QRegExpMatchState &matchState) const;
1158 #endif
1159 
1160     QVector<QRegExpAutomatonState> s; // array of states
1161 #ifndef QT_NO_REGEXP_CAPTURE
1162     QVector<QRegExpAtom> f; // atom hierarchy
1163     int nf; // number of atoms
1164     int cf; // current atom
1165     QVector<int> captureForOfficialCapture;
1166 #endif
1167     int officialncap; // number of captures, seen from the outside
1168     int ncap; // number of captures, seen from the inside
1169 #ifndef QT_NO_REGEXP_CCLASS
1170     QVector<QRegExpCharClass> cl; // array of character classes
1171 #endif
1172 #ifndef QT_NO_REGEXP_LOOKAHEAD
1173     QVector<QRegExpLookahead *> ahead; // array of lookaheads
1174 #endif
1175 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1176     QVector<QRegExpAnchorAlternation> aa; // array of (a, b) pairs of anchors
1177 #endif
1178 #ifndef QT_NO_REGEXP_OPTIM
1179     bool caretAnchored; // does the regexp start with ^?
1180     bool trivial; // is the good-string all that needs to match?
1181 #endif
1182     bool valid; // is the regular expression valid?
1183     Qt::CaseSensitivity cs; // case sensitive?
1184     bool greedyQuantifiers; // RegExp2?
1185     bool xmlSchemaExtensions;
1186 #ifndef QT_NO_REGEXP_BACKREF
1187     int nbrefs; // number of back-references
1188 #endif
1189 
1190 #ifndef QT_NO_REGEXP_OPTIM
1191     bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch
1192 
1193     int goodEarlyStart; // the index where goodStr can first occur in a match
1194     int goodLateStart; // the index where goodStr can last occur in a match
1195     QString goodStr; // the string that any match has to contain
1196 
1197     int minl; // the minimum length of a match
1198     QVector<int> occ1; // first-occurrence array
1199 #endif
1200 
1201     /*
1202       The class Box is an abstraction for a regular expression
1203       fragment. It can also be seen as one node in the syntax tree of
1204       a regular expression with synthetized attributes.
1205 
1206       Its interface is ugly for performance reasons.
1207     */
1208     class Box
1209     {
1210     public:
1211         Box(QRegExpEngine *engine);
Box(const Box & b)1212         Box(const Box &b) { operator=(b); }
1213 
1214         Box &operator=(const Box &b);
1215 
clear()1216         void clear() { operator=(Box(eng)); }
1217         void set(QChar ch);
1218         void set(const QRegExpCharClass &cc);
1219 #ifndef QT_NO_REGEXP_BACKREF
1220         void set(int bref);
1221 #endif
1222 
1223         void cat(const Box &b);
1224         void orx(const Box &b);
1225         void plus(int atom);
1226         void opt();
1227         void catAnchor(int a);
1228 #ifndef QT_NO_REGEXP_OPTIM
1229         void setupHeuristics();
1230 #endif
1231 
1232 #if defined(QT_DEBUG)
1233         void dump() const;
1234 #endif
1235 
1236     private:
1237         void addAnchorsToEngine(const Box &to) const;
1238 
1239         QRegExpEngine *eng; // the automaton under construction
1240         QVector<int> ls; // the left states (firstpos)
1241         QVector<int> rs; // the right states (lastpos)
1242         QMap<int, int> lanchors; // the left anchors
1243         QMap<int, int> ranchors; // the right anchors
1244         int skipanchors; // the anchors to match if the box is skipped
1245 
1246 #ifndef QT_NO_REGEXP_OPTIM
1247         int earlyStart; // the index where str can first occur
1248         int lateStart; // the index where str can last occur
1249         QString str; // a string that has to occur in any match
1250         QString leftStr; // a string occurring at the left of this box
1251         QString rightStr; // a string occurring at the right of this box
1252         int maxl; // the maximum length of this box (possibly InftyLen)
1253 #endif
1254 
1255         int minl; // the minimum length of this box
1256 #ifndef QT_NO_REGEXP_OPTIM
1257         QVector<int> occ1; // first-occurrence array
1258 #endif
1259     };
1260 
1261     friend class Box;
1262 
1263     /*
1264       This is the lexical analyzer for regular expressions.
1265     */
1266     enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead,
1267            Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar,
1268            Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 };
1269     int getChar();
1270     int getEscape();
1271 #ifndef QT_NO_REGEXP_INTERVAL
1272     int getRep(int def);
1273 #endif
1274 #ifndef QT_NO_REGEXP_LOOKAHEAD
1275     void skipChars(int n);
1276 #endif
1277     void error(const char *msg);
1278     void startTokenizer(const QChar *rx, int len);
1279     int getToken();
1280 
1281     const QChar *yyIn; // a pointer to the input regular expression pattern
1282     int yyPos0; // the position of yyTok in the input pattern
1283     int yyPos; // the position of the next character to read
1284     int yyLen; // the length of yyIn
1285     int yyCh; // the last character read
1286     QScopedPointer<QRegExpCharClass> yyCharClass; // attribute for Tok_CharClass tokens
1287     int yyMinRep; // attribute for Tok_Quantifier
1288     int yyMaxRep; // ditto
1289     QString yyError; // syntax error or overflow during parsing?
1290 
1291     /*
1292       This is the syntactic analyzer for regular expressions.
1293     */
1294     int parse(const QChar *rx, int len);
1295     void parseAtom(Box *box);
1296     void parseFactor(Box *box);
1297     void parseTerm(Box *box);
1298     void parseExpression(Box *box);
1299 
1300     int yyTok; // the last token read
1301     bool yyMayCapture; // set this to false to disable capturing
1302 
1303     friend struct QRegExpMatchState;
1304 };
1305 
1306 #ifndef QT_NO_REGEXP_LOOKAHEAD
1307 /*
1308   The struct QRegExpLookahead represents a lookahead a la Perl (e.g.,
1309   (?=foo) and (?!bar)).
1310 */
1311 struct QRegExpLookahead
1312 {
1313     QRegExpEngine *eng; // NFA representing the embedded regular expression
1314     bool neg; // negative lookahead?
1315 
QRegExpLookaheadQRegExpLookahead1316     inline QRegExpLookahead(QRegExpEngine *eng0, bool neg0)
1317         : eng(eng0), neg(neg0) { }
~QRegExpLookaheadQRegExpLookahead1318     inline ~QRegExpLookahead() { delete eng; }
1319 };
1320 #endif
1321 
1322 /*!
1323     \internal
1324     convert the pattern string to the RegExp syntax.
1325 
1326     This is also used by QScriptEngine::newRegExp to convert to a pattern that JavaScriptCore can understan
1327  */
qt_regexp_toCanonical(const QString & pattern,QRegExp::PatternSyntax patternSyntax)1328 Q_CORE_EXPORT QString qt_regexp_toCanonical(const QString &pattern, QRegExp::PatternSyntax patternSyntax)
1329 {
1330     switch (patternSyntax) {
1331 #ifndef QT_NO_REGEXP_WILDCARD
1332     case QRegExp::Wildcard:
1333         return wc2rx(pattern, false);
1334     case QRegExp::WildcardUnix:
1335         return wc2rx(pattern, true);
1336 #endif
1337     case QRegExp::FixedString:
1338         return QRegExp::escape(pattern);
1339     case QRegExp::W3CXmlSchema11:
1340     default:
1341         return pattern;
1342     }
1343 }
1344 
QRegExpEngine(const QRegExpEngineKey & key)1345 QRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key)
1346     : cs(key.cs), greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2),
1347       xmlSchemaExtensions(key.patternSyntax == QRegExp::W3CXmlSchema11)
1348 {
1349     setup();
1350 
1351     QString rx = qt_regexp_toCanonical(key.pattern, key.patternSyntax);
1352 
1353     valid = (parse(rx.unicode(), rx.length()) == rx.length());
1354     if (!valid) {
1355 #ifndef QT_NO_REGEXP_OPTIM
1356         trivial = false;
1357 #endif
1358         error(RXERR_LEFTDELIM);
1359     }
1360 }
1361 
~QRegExpEngine()1362 QRegExpEngine::~QRegExpEngine()
1363 {
1364 #ifndef QT_NO_REGEXP_LOOKAHEAD
1365     qDeleteAll(ahead);
1366 #endif
1367 }
1368 
prepareForMatch(QRegExpEngine * eng)1369 void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng)
1370 {
1371     /*
1372       We use one QVector<int> for all the big data used a lot in
1373       matchHere() and friends.
1374     */
1375     int ns = eng->s.size(); // number of states
1376     int ncap = eng->ncap;
1377 #ifndef QT_NO_REGEXP_OPTIM
1378     int newSlideTabSize = qMax(eng->minl + 1, 16);
1379 #else
1380     int newSlideTabSize = 0;
1381 #endif
1382     int numCaptures = eng->captureCount();
1383     int newCapturedSize = 2 + 2 * numCaptures;
1384     bigArray = q_check_ptr((int *)realloc(bigArray, ((3 + 4 * ncap) * ns + 4 * ncap + newSlideTabSize + newCapturedSize)*sizeof(int)));
1385 
1386     // set all internal variables only _after_ bigArray is realloc'ed
1387     // to prevent a broken regexp in oom case
1388 
1389     slideTabSize = newSlideTabSize;
1390     capturedSize = newCapturedSize;
1391     inNextStack = bigArray;
1392     memset(inNextStack, -1, ns * sizeof(int));
1393     curStack = inNextStack + ns;
1394     nextStack = inNextStack + 2 * ns;
1395 
1396     curCapBegin = inNextStack + 3 * ns;
1397     nextCapBegin = curCapBegin + ncap * ns;
1398     curCapEnd = curCapBegin + 2 * ncap * ns;
1399     nextCapEnd = curCapBegin + 3 * ncap * ns;
1400 
1401     tempCapBegin = curCapBegin + 4 * ncap * ns;
1402     tempCapEnd = tempCapBegin + ncap;
1403     capBegin = tempCapBegin + 2 * ncap;
1404     capEnd = tempCapBegin + 3 * ncap;
1405 
1406     slideTab = tempCapBegin + 4 * ncap;
1407     captured = slideTab + slideTabSize;
1408     memset(captured, -1, capturedSize*sizeof(int));
1409     this->eng = eng;
1410 }
1411 
1412 /*
1413   Tries to match in str and returns an array of (begin, length) pairs
1414   for captured text. If there is no match, all pairs are (-1, -1).
1415 */
match(const QChar * str0,int len0,int pos0,bool minimal0,bool oneTest,int caretIndex)1416 void QRegExpMatchState::match(const QChar *str0, int len0, int pos0,
1417     bool minimal0, bool oneTest, int caretIndex)
1418 {
1419     bool matched = false;
1420     QChar char_null;
1421 
1422 #ifndef QT_NO_REGEXP_OPTIM
1423     if (eng->trivial && !oneTest) {
1424         // ### Qt6: qsizetype
1425         pos = int(QtPrivate::findString(QStringView(str0, len0), pos0, QStringView(eng->goodStr.unicode(), eng->goodStr.length()), eng->cs));
1426         matchLen = eng->goodStr.length();
1427         matched = (pos != -1);
1428     } else
1429 #endif
1430     {
1431         in = str0;
1432         if (in == nullptr)
1433             in = &char_null;
1434         pos = pos0;
1435         caretPos = caretIndex;
1436         len = len0;
1437         minimal = minimal0;
1438         matchLen = 0;
1439         oneTestMatchedLen = 0;
1440 
1441         if (eng->valid && pos >= 0 && pos <= len) {
1442 #ifndef QT_NO_REGEXP_OPTIM
1443             if (oneTest) {
1444                 matched = matchHere();
1445             } else {
1446                 if (pos <= len - eng->minl) {
1447                     if (eng->caretAnchored) {
1448                         matched = matchHere();
1449                     } else if (eng->useGoodStringHeuristic) {
1450                         matched = eng->goodStringMatch(*this);
1451                     } else {
1452                         matched = eng->badCharMatch(*this);
1453                     }
1454                 }
1455             }
1456 #else
1457             matched = oneTest ? matchHere() : eng->bruteMatch(*this);
1458 #endif
1459         }
1460     }
1461 
1462     if (matched) {
1463         int *c = captured;
1464         *c++ = pos;
1465         *c++ = matchLen;
1466 
1467         int numCaptures = (capturedSize - 2) >> 1;
1468 #ifndef QT_NO_REGEXP_CAPTURE
1469         for (int i = 0; i < numCaptures; ++i) {
1470             int j = eng->captureForOfficialCapture.at(i);
1471             if (capBegin[j] != EmptyCapture) {
1472                 int len = capEnd[j] - capBegin[j];
1473                 *c++ = (len > 0) ? pos + capBegin[j] : 0;
1474                 *c++ = len;
1475             } else {
1476                 *c++ = -1;
1477                 *c++ = -1;
1478             }
1479         }
1480 #endif
1481     } else {
1482         // we rely on 2's complement here
1483         memset(captured, -1, capturedSize * sizeof(int));
1484     }
1485 }
1486 
1487 /*
1488   The three following functions add one state to the automaton and
1489   return the number of the state.
1490 */
1491 
createState(QChar ch)1492 int QRegExpEngine::createState(QChar ch)
1493 {
1494     return setupState(ch.unicode());
1495 }
1496 
createState(const QRegExpCharClass & cc)1497 int QRegExpEngine::createState(const QRegExpCharClass &cc)
1498 {
1499 #ifndef QT_NO_REGEXP_CCLASS
1500     int n = cl.size();
1501     cl += QRegExpCharClass(cc);
1502     return setupState(CharClassBit | n);
1503 #else
1504     Q_UNUSED(cc);
1505     return setupState(CharClassBit);
1506 #endif
1507 }
1508 
1509 #ifndef QT_NO_REGEXP_BACKREF
createState(int bref)1510 int QRegExpEngine::createState(int bref)
1511 {
1512     if (bref > nbrefs) {
1513         nbrefs = bref;
1514         if (nbrefs > MaxBackRefs) {
1515             error(RXERR_LIMIT);
1516             return 0;
1517         }
1518     }
1519     return setupState(BackRefBit | bref);
1520 }
1521 #endif
1522 
1523 /*
1524   The two following functions add a transition between all pairs of
1525   states (i, j) where i is found in from, and j is found in to.
1526 
1527   Cat-transitions are distinguished from plus-transitions for
1528   capturing.
1529 */
1530 
addCatTransitions(const QVector<int> & from,const QVector<int> & to)1531 void QRegExpEngine::addCatTransitions(const QVector<int> &from, const QVector<int> &to)
1532 {
1533     for (int i = 0; i < from.size(); i++)
1534         mergeInto(&s[from.at(i)].outs, to);
1535 }
1536 
1537 #ifndef QT_NO_REGEXP_CAPTURE
addPlusTransitions(const QVector<int> & from,const QVector<int> & to,int atom)1538 void QRegExpEngine::addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom)
1539 {
1540     for (int i = 0; i < from.size(); i++) {
1541         QRegExpAutomatonState &st = s[from.at(i)];
1542         const QVector<int> oldOuts = st.outs;
1543         mergeInto(&st.outs, to);
1544         if (f.at(atom).capture != QRegExpAtom::NoCapture) {
1545             for (int j = 0; j < to.size(); j++) {
1546                 // ### st.reenter.contains(to.at(j)) check looks suspicious
1547                 if (!st.reenter.contains(to.at(j)) &&
1548                      !std::binary_search(oldOuts.constBegin(), oldOuts.constEnd(), to.at(j)))
1549                     st.reenter.insert(to.at(j), atom);
1550             }
1551         }
1552     }
1553 }
1554 #endif
1555 
1556 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1557 /*
1558   Returns an anchor that means a OR b.
1559 */
anchorAlternation(int a,int b)1560 int QRegExpEngine::anchorAlternation(int a, int b)
1561 {
1562     if (((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0)
1563         return a & b;
1564 
1565     int n = aa.size();
1566 #ifndef QT_NO_REGEXP_OPTIM
1567     if (n > 0 && aa.at(n - 1).a == a && aa.at(n - 1).b == b)
1568         return Anchor_Alternation | (n - 1);
1569 #endif
1570 
1571     QRegExpAnchorAlternation element = {a, b};
1572     aa.append(element);
1573     return Anchor_Alternation | n;
1574 }
1575 
1576 /*
1577   Returns an anchor that means a AND b.
1578 */
anchorConcatenation(int a,int b)1579 int QRegExpEngine::anchorConcatenation(int a, int b)
1580 {
1581     if (((a | b) & Anchor_Alternation) == 0)
1582         return a | b;
1583     if ((b & Anchor_Alternation) != 0)
1584         qSwap(a, b);
1585 
1586     int aprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).a, b);
1587     int bprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).b, b);
1588     return anchorAlternation(aprime, bprime);
1589 }
1590 #endif
1591 
1592 /*
1593   Adds anchor a on a transition caracterised by its from state and
1594   its to state.
1595 */
addAnchors(int from,int to,int a)1596 void QRegExpEngine::addAnchors(int from, int to, int a)
1597 {
1598     QRegExpAutomatonState &st = s[from];
1599     if (st.anchors.contains(to))
1600         a = anchorAlternation(st.anchors.value(to), a);
1601     st.anchors.insert(to, a);
1602 }
1603 
1604 #ifndef QT_NO_REGEXP_OPTIM
1605 /*
1606   This function chooses between the good-string and the bad-character
1607   heuristics. It computes two scores and chooses the heuristic with
1608   the highest score.
1609 
1610   Here are some common-sense constraints on the scores that should be
1611   respected if the formulas are ever modified: (1) If goodStr is
1612   empty, the good-string heuristic scores 0. (2) If the regular
1613   expression is trivial, the good-string heuristic should be used.
1614   (3) If the search is case insensitive, the good-string heuristic
1615   should be used, unless it scores 0. (Case insensitivity turns all
1616   entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is
1617   big, the good-string heuristic should score less.
1618 */
heuristicallyChooseHeuristic()1619 void QRegExpEngine::heuristicallyChooseHeuristic()
1620 {
1621     if (minl == 0) {
1622         useGoodStringHeuristic = false;
1623     } else if (trivial) {
1624         useGoodStringHeuristic = true;
1625     } else {
1626         /*
1627           Magic formula: The good string has to constitute a good
1628           proportion of the minimum-length string, and appear at a
1629           more-or-less known index.
1630         */
1631         int goodStringScore = (64 * goodStr.length() / minl) -
1632                               (goodLateStart - goodEarlyStart);
1633         /*
1634           Less magic formula: We pick some characters at random, and
1635           check whether they are good or bad.
1636         */
1637         int badCharScore = 0;
1638         int step = qMax(1, NumBadChars / 32);
1639         for (int i = 1; i < NumBadChars; i += step) {
1640             if (occ1.at(i) == NoOccurrence)
1641                 badCharScore += minl;
1642             else
1643                 badCharScore += occ1.at(i);
1644         }
1645         badCharScore /= minl;
1646         useGoodStringHeuristic = (goodStringScore > badCharScore);
1647     }
1648 }
1649 #endif
1650 
1651 #if defined(QT_DEBUG)
dump() const1652 void QRegExpEngine::dump() const
1653 {
1654     int i, j;
1655     qDebug("Case %ssensitive engine", cs ? "" : "in");
1656     qDebug("  States");
1657     for (i = 0; i < s.size(); i++) {
1658         qDebug("  %d%s", i, i == InitialState ? " (initial)" : i == FinalState ? " (final)" : "");
1659 #ifndef QT_NO_REGEXP_CAPTURE
1660         if (nf > 0)
1661             qDebug("    in atom %d", s[i].atom);
1662 #endif
1663         int m = s[i].match;
1664         if ((m & CharClassBit) != 0) {
1665             qDebug("    match character class %d", m ^ CharClassBit);
1666 #ifndef QT_NO_REGEXP_CCLASS
1667             cl[m ^ CharClassBit].dump();
1668 #else
1669             qDebug("    negative character class");
1670 #endif
1671         } else if ((m & BackRefBit) != 0) {
1672             qDebug("    match back-reference %d", m ^ BackRefBit);
1673         } else if (m >= 0x20 && m <= 0x7e) {
1674             qDebug("    match 0x%.4x (%c)", m, m);
1675         } else {
1676             qDebug("    match 0x%.4x", m);
1677         }
1678         for (j = 0; j < s[i].outs.size(); j++) {
1679             int next = s[i].outs[j];
1680             qDebug("    -> %d", next);
1681             if (s[i].reenter.contains(next))
1682                 qDebug("       [reenter %d]", s[i].reenter[next]);
1683             if (s[i].anchors.value(next) != 0)
1684                 qDebug("       [anchors 0x%.8x]", s[i].anchors[next]);
1685         }
1686     }
1687 #ifndef QT_NO_REGEXP_CAPTURE
1688     if (nf > 0) {
1689         qDebug("  Atom    Parent  Capture");
1690         for (i = 0; i < nf; i++) {
1691             if (f[i].capture == QRegExpAtom::NoCapture) {
1692                 qDebug("  %6d  %6d     nil", i, f[i].parent);
1693             } else {
1694                 int cap = f[i].capture;
1695                 bool official = captureForOfficialCapture.contains(cap);
1696                 qDebug("  %6d  %6d  %6d  %s", i, f[i].parent, f[i].capture,
1697                        official ? "official" : "");
1698             }
1699         }
1700     }
1701 #endif
1702 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1703     for (i = 0; i < aa.size(); i++)
1704         qDebug("  Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a, aa[i].b);
1705 #endif
1706 }
1707 #endif
1708 
setup()1709 void QRegExpEngine::setup()
1710 {
1711     ref.storeRelaxed(1);
1712 #ifndef QT_NO_REGEXP_CAPTURE
1713     f.resize(32);
1714     nf = 0;
1715     cf = -1;
1716 #endif
1717     officialncap = 0;
1718     ncap = 0;
1719 #ifndef QT_NO_REGEXP_OPTIM
1720     caretAnchored = true;
1721     trivial = true;
1722 #endif
1723     valid = false;
1724 #ifndef QT_NO_REGEXP_BACKREF
1725     nbrefs = 0;
1726 #endif
1727 #ifndef QT_NO_REGEXP_OPTIM
1728     useGoodStringHeuristic = true;
1729     minl = 0;
1730     occ1.fill(0, NumBadChars);
1731 #endif
1732 }
1733 
setupState(int match)1734 int QRegExpEngine::setupState(int match)
1735 {
1736 #ifndef QT_NO_REGEXP_CAPTURE
1737     s += QRegExpAutomatonState(cf, match);
1738 #else
1739     s += QRegExpAutomatonState(match);
1740 #endif
1741     return s.size() - 1;
1742 }
1743 
1744 #ifndef QT_NO_REGEXP_CAPTURE
1745 /*
1746   Functions startAtom() and finishAtom() should be called to delimit
1747   atoms. When a state is created, it is assigned to the current atom.
1748   The information is later used for capturing.
1749 */
startAtom(bool officialCapture)1750 int QRegExpEngine::startAtom(bool officialCapture)
1751 {
1752     if ((nf & (nf + 1)) == 0 && nf + 1 >= f.size())
1753         f.resize((nf + 1) << 1);
1754     f[nf].parent = cf;
1755     cf = nf++;
1756     f[cf].capture = officialCapture ? QRegExpAtom::OfficialCapture : QRegExpAtom::NoCapture;
1757     return cf;
1758 }
1759 
finishAtom(int atom,bool needCapture)1760 void QRegExpEngine::finishAtom(int atom, bool needCapture)
1761 {
1762     if (greedyQuantifiers && needCapture && f[atom].capture == QRegExpAtom::NoCapture)
1763         f[atom].capture = QRegExpAtom::UnofficialCapture;
1764     cf = f.at(atom).parent;
1765 }
1766 #endif
1767 
1768 #ifndef QT_NO_REGEXP_LOOKAHEAD
1769 /*
1770   Creates a lookahead anchor.
1771 */
addLookahead(QRegExpEngine * eng,bool negative)1772 int QRegExpEngine::addLookahead(QRegExpEngine *eng, bool negative)
1773 {
1774     int n = ahead.size();
1775     if (n == MaxLookaheads) {
1776         error(RXERR_LIMIT);
1777         return 0;
1778     }
1779     ahead += new QRegExpLookahead(eng, negative);
1780     return Anchor_FirstLookahead << n;
1781 }
1782 #endif
1783 
1784 #ifndef QT_NO_REGEXP_CAPTURE
1785 /*
1786   We want the longest leftmost captures.
1787 */
isBetterCapture(int ncap,const int * begin1,const int * end1,const int * begin2,const int * end2)1788 static bool isBetterCapture(int ncap, const int *begin1, const int *end1, const int *begin2,
1789                             const int *end2)
1790 {
1791     for (int i = 0; i < ncap; i++) {
1792         int delta = begin2[i] - begin1[i]; // it has to start early...
1793         if (delta == 0)
1794             delta = end1[i] - end2[i]; // ...and end late
1795 
1796         if (delta != 0)
1797             return delta > 0;
1798     }
1799     return false;
1800 }
1801 #endif
1802 
1803 /*
1804   Returns \c true if anchor a matches at position pos + i in the input
1805   string, otherwise false.
1806 */
testAnchor(int i,int a,const int * capBegin)1807 bool QRegExpMatchState::testAnchor(int i, int a, const int *capBegin)
1808 {
1809     int j;
1810 
1811 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1812     if ((a & QRegExpEngine::Anchor_Alternation) != 0)
1813         return testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).a, capBegin)
1814                || testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).b, capBegin);
1815 #endif
1816 
1817     if ((a & QRegExpEngine::Anchor_Caret) != 0) {
1818         if (pos + i != caretPos)
1819             return false;
1820     }
1821     if ((a & QRegExpEngine::Anchor_Dollar) != 0) {
1822         if (pos + i != len)
1823             return false;
1824     }
1825 #ifndef QT_NO_REGEXP_ESCAPE
1826     if ((a & (QRegExpEngine::Anchor_Word | QRegExpEngine::Anchor_NonWord)) != 0) {
1827         bool before = false;
1828         bool after = false;
1829         if (pos + i != 0)
1830             before = isWord(in[pos + i - 1]);
1831         if (pos + i != len)
1832             after = isWord(in[pos + i]);
1833         if ((a & QRegExpEngine::Anchor_Word) != 0 && (before == after))
1834             return false;
1835         if ((a & QRegExpEngine::Anchor_NonWord) != 0 && (before != after))
1836             return false;
1837     }
1838 #endif
1839 #ifndef QT_NO_REGEXP_LOOKAHEAD
1840     if ((a & QRegExpEngine::Anchor_LookaheadMask) != 0) {
1841         const QVector<QRegExpLookahead *> &ahead = eng->ahead;
1842         for (j = 0; j < ahead.size(); j++) {
1843             if ((a & (QRegExpEngine::Anchor_FirstLookahead << j)) != 0) {
1844                 QRegExpMatchState matchState;
1845                 matchState.prepareForMatch(ahead[j]->eng);
1846                 matchState.match(in + pos + i, len - pos - i, 0,
1847                     true, true, caretPos - pos - i);
1848                 if ((matchState.captured[0] == 0) == ahead[j]->neg)
1849                     return false;
1850             }
1851         }
1852     }
1853 #endif
1854 #ifndef QT_NO_REGEXP_CAPTURE
1855 #ifndef QT_NO_REGEXP_BACKREF
1856     for (j = 0; j < eng->nbrefs; j++) {
1857         if ((a & (QRegExpEngine::Anchor_BackRef1Empty << j)) != 0) {
1858             int i = eng->captureForOfficialCapture.at(j);
1859             if (capBegin[i] != EmptyCapture)
1860                 return false;
1861         }
1862     }
1863 #endif
1864 #endif
1865     return true;
1866 }
1867 
1868 #ifndef QT_NO_REGEXP_OPTIM
1869 /*
1870   The three following functions are what Jeffrey Friedl would call
1871   transmissions (or bump-alongs). Using one or the other should make
1872   no difference except in performance.
1873 */
1874 
goodStringMatch(QRegExpMatchState & matchState) const1875 bool QRegExpEngine::goodStringMatch(QRegExpMatchState &matchState) const
1876 {
1877     int k = matchState.pos + goodEarlyStart;
1878     QStringMatcher matcher(goodStr.unicode(), goodStr.length(), cs);
1879     while ((k = matcher.indexIn(matchState.in, matchState.len, k)) != -1) {
1880         int from = k - goodLateStart;
1881         int to = k - goodEarlyStart;
1882         if (from > matchState.pos)
1883             matchState.pos = from;
1884 
1885         while (matchState.pos <= to) {
1886             if (matchState.matchHere())
1887                 return true;
1888             ++matchState.pos;
1889         }
1890         ++k;
1891     }
1892     return false;
1893 }
1894 
badCharMatch(QRegExpMatchState & matchState) const1895 bool QRegExpEngine::badCharMatch(QRegExpMatchState &matchState) const
1896 {
1897     int slideHead = 0;
1898     int slideNext = 0;
1899     int i;
1900     int lastPos = matchState.len - minl;
1901     memset(matchState.slideTab, 0, matchState.slideTabSize * sizeof(int));
1902 
1903     /*
1904       Set up the slide table, used for the bad-character heuristic,
1905       using the table of first occurrence of each character.
1906     */
1907     for (i = 0; i < minl; i++) {
1908         int sk = occ1[BadChar(matchState.in[matchState.pos + i])];
1909         if (sk == NoOccurrence)
1910             sk = i + 1;
1911         if (sk > 0) {
1912             int k = i + 1 - sk;
1913             if (k < 0) {
1914                 sk = i + 1;
1915                 k = 0;
1916             }
1917             if (sk > matchState.slideTab[k])
1918                 matchState.slideTab[k] = sk;
1919         }
1920     }
1921 
1922     if (matchState.pos > lastPos)
1923         return false;
1924 
1925     for (;;) {
1926         if (++slideNext >= matchState.slideTabSize)
1927             slideNext = 0;
1928         if (matchState.slideTab[slideHead] > 0) {
1929             if (matchState.slideTab[slideHead] - 1 > matchState.slideTab[slideNext])
1930                 matchState.slideTab[slideNext] = matchState.slideTab[slideHead] - 1;
1931             matchState.slideTab[slideHead] = 0;
1932         } else {
1933             if (matchState.matchHere())
1934                 return true;
1935         }
1936 
1937         if (matchState.pos == lastPos)
1938             break;
1939 
1940         /*
1941           Update the slide table. This code has much in common with
1942           the initialization code.
1943         */
1944         int sk = occ1[BadChar(matchState.in[matchState.pos + minl])];
1945         if (sk == NoOccurrence) {
1946             matchState.slideTab[slideNext] = minl;
1947         } else if (sk > 0) {
1948             int k = slideNext + minl - sk;
1949             if (k >= matchState.slideTabSize)
1950                 k -= matchState.slideTabSize;
1951             if (sk > matchState.slideTab[k])
1952                 matchState.slideTab[k] = sk;
1953         }
1954         slideHead = slideNext;
1955         ++matchState.pos;
1956     }
1957     return false;
1958 }
1959 #else
bruteMatch(QRegExpMatchState & matchState) const1960 bool QRegExpEngine::bruteMatch(QRegExpMatchState &matchState) const
1961 {
1962     while (matchState.pos <= matchState.len) {
1963         if (matchState.matchHere())
1964             return true;
1965         ++matchState.pos;
1966     }
1967     return false;
1968 }
1969 #endif
1970 
1971 /*
1972   Here's the core of the engine. It tries to do a match here and now.
1973 */
matchHere()1974 bool QRegExpMatchState::matchHere()
1975 {
1976     int ncur = 1, nnext = 0;
1977     int i = 0, j, k, m;
1978     bool stop = false;
1979 
1980     matchLen = -1;
1981     oneTestMatchedLen = -1;
1982     curStack[0] = QRegExpEngine::InitialState;
1983 
1984     int ncap = eng->ncap;
1985 #ifndef QT_NO_REGEXP_CAPTURE
1986     if (ncap > 0) {
1987         for (j = 0; j < ncap; j++) {
1988             curCapBegin[j] = EmptyCapture;
1989             curCapEnd[j] = EmptyCapture;
1990         }
1991     }
1992 #endif
1993 
1994 #ifndef QT_NO_REGEXP_BACKREF
1995     while ((ncur > 0 || !sleeping.isEmpty()) && i <= len - pos && !stop)
1996 #else
1997     while (ncur > 0 && i <= len - pos && !stop)
1998 #endif
1999     {
2000         int ch = (i < len - pos) ? in[pos + i].unicode() : 0;
2001         for (j = 0; j < ncur; j++) {
2002             int cur = curStack[j];
2003             const QRegExpAutomatonState &scur = eng->s.at(cur);
2004             const QVector<int> &outs = scur.outs;
2005             for (k = 0; k < outs.size(); k++) {
2006                 int next = outs.at(k);
2007                 const QRegExpAutomatonState &snext = eng->s.at(next);
2008                 bool inside = true;
2009 #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2010                 int needSomeSleep = 0;
2011 #endif
2012 
2013                 /*
2014                   First, check if the anchors are anchored properly.
2015                 */
2016                 int a = scur.anchors.value(next);
2017                 if (a != 0 && !testAnchor(i, a, curCapBegin + j * ncap))
2018                     inside = false;
2019 
2020                 /*
2021                   If indeed they are, check if the input character is
2022                   correct for this transition.
2023                 */
2024                 if (inside) {
2025                     m = snext.match;
2026                     if ((m & (QRegExpEngine::CharClassBit | QRegExpEngine::BackRefBit)) == 0) {
2027                         if (eng->cs)
2028                             inside = (m == ch);
2029                         else
2030                             inside = (QChar(m).toLower() == QChar(ch).toLower());
2031                     } else if (next == QRegExpEngine::FinalState) {
2032                         matchLen = i;
2033                         stop = minimal;
2034                         inside = true;
2035                     } else if ((m & QRegExpEngine::CharClassBit) != 0) {
2036 #ifndef QT_NO_REGEXP_CCLASS
2037                         const QRegExpCharClass &cc = eng->cl.at(m ^ QRegExpEngine::CharClassBit);
2038                         if (eng->cs)
2039                             inside = cc.in(QChar(ch));
2040                         else if (cc.negative())
2041                             inside = cc.in(QChar(ch).toLower()) &&
2042                                      cc.in(QChar(ch).toUpper());
2043                         else
2044                             inside = cc.in(QChar(ch).toLower()) ||
2045                                      cc.in(QChar(ch).toUpper());
2046 #endif
2047 #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2048                     } else { /* ((m & QRegExpEngine::BackRefBit) != 0) */
2049                         int bref = m ^ QRegExpEngine::BackRefBit;
2050                         int ell = j * ncap + eng->captureForOfficialCapture.at(bref - 1);
2051 
2052                         inside = bref <= ncap && curCapBegin[ell] != EmptyCapture;
2053                         if (inside) {
2054                             if (eng->cs)
2055                                 inside = (in[pos + curCapBegin[ell]] == QChar(ch));
2056                             else
2057                                 inside = (in[pos + curCapBegin[ell]].toLower()
2058                                        == QChar(ch).toLower());
2059                         }
2060 
2061                         if (inside) {
2062                             int delta;
2063                             if (curCapEnd[ell] == EmptyCapture)
2064                                 delta = i - curCapBegin[ell];
2065                             else
2066                                 delta = curCapEnd[ell] - curCapBegin[ell];
2067 
2068                             inside = (delta <= len - (pos + i));
2069                             if (inside && delta > 1) {
2070                                 int n = 1;
2071                                 if (eng->cs) {
2072                                     while (n < delta) {
2073                                         if (in[pos + curCapBegin[ell] + n]
2074                                             != in[pos + i + n])
2075                                             break;
2076                                         ++n;
2077                                     }
2078                                 } else {
2079                                     while (n < delta) {
2080                                         QChar a = in[pos + curCapBegin[ell] + n];
2081                                         QChar b = in[pos + i + n];
2082                                         if (a.toLower() != b.toLower())
2083                                             break;
2084                                         ++n;
2085                                     }
2086                                 }
2087                                 inside = (n == delta);
2088                                 if (inside)
2089                                     needSomeSleep = delta - 1;
2090                             }
2091                         }
2092 #endif
2093                     }
2094                 }
2095 
2096                 /*
2097                   We must now update our data structures.
2098                 */
2099                 if (inside) {
2100 #ifndef QT_NO_REGEXP_CAPTURE
2101                     int *capBegin, *capEnd;
2102 #endif
2103                     /*
2104                       If the next state was not encountered yet, all
2105                       is fine.
2106                     */
2107                     if ((m = inNextStack[next]) == -1) {
2108                         m = nnext++;
2109                         nextStack[m] = next;
2110                         inNextStack[next] = m;
2111 #ifndef QT_NO_REGEXP_CAPTURE
2112                         capBegin = nextCapBegin + m * ncap;
2113                         capEnd = nextCapEnd + m * ncap;
2114 
2115                     /*
2116                       Otherwise, we'll first maintain captures in
2117                       temporary arrays, and decide at the end whether
2118                       it's best to keep the previous capture zones or
2119                       the new ones.
2120                     */
2121                     } else {
2122                         capBegin = tempCapBegin;
2123                         capEnd = tempCapEnd;
2124 #endif
2125                     }
2126 
2127 #ifndef QT_NO_REGEXP_CAPTURE
2128                     /*
2129                       Updating the capture zones is much of a task.
2130                     */
2131                     if (ncap > 0) {
2132                         memcpy(capBegin, curCapBegin + j * ncap, ncap * sizeof(int));
2133                         memcpy(capEnd, curCapEnd + j * ncap, ncap * sizeof(int));
2134                         int c = scur.atom, n = snext.atom;
2135                         int p = -1, q = -1;
2136                         int cap;
2137 
2138                         /*
2139                           Lemma 1. For any x in the range [0..nf), we
2140                           have f[x].parent < x.
2141 
2142                           Proof. By looking at startAtom(), it is
2143                           clear that cf < nf holds all the time, and
2144                           thus that f[nf].parent < nf.
2145                         */
2146 
2147                         /*
2148                           If we are reentering an atom, we empty all
2149                           capture zones inside it.
2150                         */
2151                         if ((q = scur.reenter.value(next)) != 0) {
2152                             QBitArray b(eng->nf, false);
2153                             b.setBit(q, true);
2154                             for (int ell = q + 1; ell < eng->nf; ell++) {
2155                                 if (b.testBit(eng->f.at(ell).parent)) {
2156                                     b.setBit(ell, true);
2157                                     cap = eng->f.at(ell).capture;
2158                                     if (cap >= 0) {
2159                                         capBegin[cap] = EmptyCapture;
2160                                         capEnd[cap] = EmptyCapture;
2161                                     }
2162                                 }
2163                             }
2164                             p = eng->f.at(q).parent;
2165 
2166                         /*
2167                           Otherwise, close the capture zones we are
2168                           leaving. We are leaving f[c].capture,
2169                           f[f[c].parent].capture,
2170                           f[f[f[c].parent].parent].capture, ...,
2171                           until f[x].capture, with x such that
2172                           f[x].parent is the youngest common ancestor
2173                           for c and n.
2174 
2175                           We go up along c's and n's ancestry until
2176                           we find x.
2177                         */
2178                         } else {
2179                             p = c;
2180                             q = n;
2181                             while (p != q) {
2182                                 if (p > q) {
2183                                     cap = eng->f.at(p).capture;
2184                                     if (cap >= 0) {
2185                                         if (capBegin[cap] == i) {
2186                                             capBegin[cap] = EmptyCapture;
2187                                             capEnd[cap] = EmptyCapture;
2188                                         } else {
2189                                             capEnd[cap] = i;
2190                                         }
2191                                     }
2192                                     p = eng->f.at(p).parent;
2193                                 } else {
2194                                     q = eng->f.at(q).parent;
2195                                 }
2196                             }
2197                         }
2198 
2199                         /*
2200                           In any case, we now open the capture zones
2201                           we are entering. We work upwards from n
2202                           until we reach p (the parent of the atom we
2203                           reenter or the youngest common ancestor).
2204                         */
2205                         while (n > p) {
2206                             cap = eng->f.at(n).capture;
2207                             if (cap >= 0) {
2208                                 capBegin[cap] = i;
2209                                 capEnd[cap] = EmptyCapture;
2210                             }
2211                             n = eng->f.at(n).parent;
2212                         }
2213                         /*
2214                           If the next state was already in
2215                           nextStack, we must choose carefully which
2216                           capture zones we want to keep.
2217                         */
2218                         if (capBegin == tempCapBegin &&
2219                                 isBetterCapture(ncap, capBegin, capEnd, nextCapBegin + m * ncap,
2220                                                 nextCapEnd + m * ncap)) {
2221                             memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
2222                             memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
2223                         }
2224                     }
2225 #ifndef QT_NO_REGEXP_BACKREF
2226                     /*
2227                       We are done with updating the capture zones.
2228                       It's now time to put the next state to sleep,
2229                       if it needs to, and to remove it from
2230                       nextStack.
2231                     */
2232                     if (needSomeSleep > 0) {
2233                         QVector<int> zzZ(2 + 2 * ncap);
2234                         zzZ[0] = i + needSomeSleep;
2235                         zzZ[1] = next;
2236                         if (ncap > 0) {
2237                             memcpy(zzZ.data() + 2, capBegin, ncap * sizeof(int));
2238                             memcpy(zzZ.data() + 2 + ncap, capEnd, ncap * sizeof(int));
2239                         }
2240                         inNextStack[nextStack[--nnext]] = -1;
2241                         sleeping.append(zzZ);
2242                     }
2243 #endif
2244 #endif
2245                 }
2246             }
2247         }
2248 #ifndef QT_NO_REGEXP_CAPTURE
2249         /*
2250           If we reached the final state, hurray! Copy the captured
2251           zone.
2252         */
2253         if (ncap > 0 && (m = inNextStack[QRegExpEngine::FinalState]) != -1) {
2254             memcpy(capBegin, nextCapBegin + m * ncap, ncap * sizeof(int));
2255             memcpy(capEnd, nextCapEnd + m * ncap, ncap * sizeof(int));
2256         }
2257 #ifndef QT_NO_REGEXP_BACKREF
2258         /*
2259           It's time to wake up the sleepers.
2260         */
2261         j = 0;
2262         while (j < sleeping.count()) {
2263             if (sleeping.at(j)[0] == i) {
2264                 const QVector<int> &zzZ = sleeping.at(j);
2265                 int next = zzZ[1];
2266                 const int *capBegin = zzZ.data() + 2;
2267                 const int *capEnd = zzZ.data() + 2 + ncap;
2268                 bool copyOver = true;
2269 
2270                 if ((m = inNextStack[next]) == -1) {
2271                     m = nnext++;
2272                     nextStack[m] = next;
2273                     inNextStack[next] = m;
2274                 } else {
2275                     copyOver = isBetterCapture(ncap, nextCapBegin + m * ncap, nextCapEnd + m * ncap,
2276                                                capBegin, capEnd);
2277                 }
2278                 if (copyOver) {
2279                     memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
2280                     memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
2281                 }
2282 
2283                 sleeping.removeAt(j);
2284             } else {
2285                 ++j;
2286             }
2287         }
2288 #endif
2289 #endif
2290         for (j = 0; j < nnext; j++)
2291             inNextStack[nextStack[j]] = -1;
2292 
2293         // avoid needless iteration that confuses oneTestMatchedLen
2294         if (nnext == 1 && nextStack[0] == QRegExpEngine::FinalState
2295 #ifndef QT_NO_REGEXP_BACKREF
2296              && sleeping.isEmpty()
2297 #endif
2298            )
2299             stop = true;
2300 
2301         qSwap(curStack, nextStack);
2302 #ifndef QT_NO_REGEXP_CAPTURE
2303         qSwap(curCapBegin, nextCapBegin);
2304         qSwap(curCapEnd, nextCapEnd);
2305 #endif
2306         ncur = nnext;
2307         nnext = 0;
2308         ++i;
2309     }
2310 
2311 #ifndef QT_NO_REGEXP_BACKREF
2312     /*
2313       If minimal matching is enabled, we might have some sleepers
2314       left.
2315     */
2316     if (!sleeping.isEmpty())
2317         sleeping.clear();
2318 #endif
2319 
2320     oneTestMatchedLen = i - 1;
2321     return (matchLen >= 0);
2322 }
2323 
2324 #ifndef QT_NO_REGEXP_CCLASS
2325 
QRegExpCharClass()2326 QRegExpCharClass::QRegExpCharClass()
2327     : c(0), n(false)
2328 {
2329 #ifndef QT_NO_REGEXP_OPTIM
2330     occ1.fill(NoOccurrence, NumBadChars);
2331 #endif
2332 }
2333 
clear()2334 void QRegExpCharClass::clear()
2335 {
2336     c = 0;
2337     r.clear();
2338     n = false;
2339 }
2340 
setNegative(bool negative)2341 void QRegExpCharClass::setNegative(bool negative)
2342 {
2343     n = negative;
2344 #ifndef QT_NO_REGEXP_OPTIM
2345     occ1.fill(0, NumBadChars);
2346 #endif
2347 }
2348 
addCategories(uint cats)2349 void QRegExpCharClass::addCategories(uint cats)
2350 {
2351     static const int all_cats = FLAG(QChar::Mark_NonSpacing) |
2352                                 FLAG(QChar::Mark_SpacingCombining) |
2353                                 FLAG(QChar::Mark_Enclosing) |
2354                                 FLAG(QChar::Number_DecimalDigit) |
2355                                 FLAG(QChar::Number_Letter) |
2356                                 FLAG(QChar::Number_Other) |
2357                                 FLAG(QChar::Separator_Space) |
2358                                 FLAG(QChar::Separator_Line) |
2359                                 FLAG(QChar::Separator_Paragraph) |
2360                                 FLAG(QChar::Other_Control) |
2361                                 FLAG(QChar::Other_Format) |
2362                                 FLAG(QChar::Other_Surrogate) |
2363                                 FLAG(QChar::Other_PrivateUse) |
2364                                 FLAG(QChar::Other_NotAssigned) |
2365                                 FLAG(QChar::Letter_Uppercase) |
2366                                 FLAG(QChar::Letter_Lowercase) |
2367                                 FLAG(QChar::Letter_Titlecase) |
2368                                 FLAG(QChar::Letter_Modifier) |
2369                                 FLAG(QChar::Letter_Other) |
2370                                 FLAG(QChar::Punctuation_Connector) |
2371                                 FLAG(QChar::Punctuation_Dash) |
2372                                 FLAG(QChar::Punctuation_Open) |
2373                                 FLAG(QChar::Punctuation_Close) |
2374                                 FLAG(QChar::Punctuation_InitialQuote) |
2375                                 FLAG(QChar::Punctuation_FinalQuote) |
2376                                 FLAG(QChar::Punctuation_Other) |
2377                                 FLAG(QChar::Symbol_Math) |
2378                                 FLAG(QChar::Symbol_Currency) |
2379                                 FLAG(QChar::Symbol_Modifier) |
2380                                 FLAG(QChar::Symbol_Other);
2381     c |= (all_cats & cats);
2382 #ifndef QT_NO_REGEXP_OPTIM
2383     occ1.fill(0, NumBadChars);
2384 #endif
2385 }
2386 
addRange(ushort from,ushort to)2387 void QRegExpCharClass::addRange(ushort from, ushort to)
2388 {
2389     if (from > to)
2390         qSwap(from, to);
2391     int m = r.size();
2392     r.resize(m + 1);
2393     r[m].from = from;
2394     r[m].len = to - from + 1;
2395 
2396 #ifndef QT_NO_REGEXP_OPTIM
2397     int i;
2398 
2399     if (to - from < NumBadChars) {
2400         if (from % NumBadChars <= to % NumBadChars) {
2401             for (i = from % NumBadChars; i <= to % NumBadChars; i++)
2402                 occ1[i] = 0;
2403         } else {
2404             for (i = 0; i <= to % NumBadChars; i++)
2405                 occ1[i] = 0;
2406             for (i = from % NumBadChars; i < NumBadChars; i++)
2407                 occ1[i] = 0;
2408         }
2409     } else {
2410         occ1.fill(0, NumBadChars);
2411     }
2412 #endif
2413 }
2414 
in(QChar ch) const2415 bool QRegExpCharClass::in(QChar ch) const
2416 {
2417 #ifndef QT_NO_REGEXP_OPTIM
2418     if (occ1.at(BadChar(ch)) == NoOccurrence)
2419         return n;
2420 #endif
2421 
2422     if (c != 0 && (c & FLAG(ch.category())) != 0)
2423         return !n;
2424 
2425     const int uc = ch.unicode();
2426     int size = r.size();
2427 
2428     for (int i = 0; i < size; ++i) {
2429         const QRegExpCharClassRange &range = r.at(i);
2430         if (uint(uc - range.from) < uint(r.at(i).len))
2431             return !n;
2432     }
2433     return n;
2434 }
2435 
2436 #if defined(QT_DEBUG)
dump() const2437 void QRegExpCharClass::dump() const
2438 {
2439     int i;
2440     qDebug("    %stive character class", n ? "nega" : "posi");
2441 #ifndef QT_NO_REGEXP_CCLASS
2442     if (c != 0)
2443         qDebug("      categories 0x%.8x", c);
2444 #endif
2445     for (i = 0; i < r.size(); i++)
2446         qDebug("      0x%.4x through 0x%.4x", r[i].from, r[i].from + r[i].len - 1);
2447 }
2448 #endif
2449 #endif
2450 
Box(QRegExpEngine * engine)2451 QRegExpEngine::Box::Box(QRegExpEngine *engine)
2452     : eng(engine), skipanchors(0)
2453 #ifndef QT_NO_REGEXP_OPTIM
2454       , earlyStart(0), lateStart(0), maxl(0)
2455 #endif
2456 {
2457 #ifndef QT_NO_REGEXP_OPTIM
2458     occ1.fill(NoOccurrence, NumBadChars);
2459 #endif
2460     minl = 0;
2461 }
2462 
operator =(const Box & b)2463 QRegExpEngine::Box &QRegExpEngine::Box::operator=(const Box &b)
2464 {
2465     eng = b.eng;
2466     ls = b.ls;
2467     rs = b.rs;
2468     lanchors = b.lanchors;
2469     ranchors = b.ranchors;
2470     skipanchors = b.skipanchors;
2471 #ifndef QT_NO_REGEXP_OPTIM
2472     earlyStart = b.earlyStart;
2473     lateStart = b.lateStart;
2474     str = b.str;
2475     leftStr = b.leftStr;
2476     rightStr = b.rightStr;
2477     maxl = b.maxl;
2478     occ1 = b.occ1;
2479 #endif
2480     minl = b.minl;
2481     return *this;
2482 }
2483 
set(QChar ch)2484 void QRegExpEngine::Box::set(QChar ch)
2485 {
2486     ls.resize(1);
2487     ls[0] = eng->createState(ch);
2488     rs = ls;
2489 #ifndef QT_NO_REGEXP_OPTIM
2490     str = ch;
2491     leftStr = ch;
2492     rightStr = ch;
2493     maxl = 1;
2494     occ1[BadChar(ch)] = 0;
2495 #endif
2496     minl = 1;
2497 }
2498 
set(const QRegExpCharClass & cc)2499 void QRegExpEngine::Box::set(const QRegExpCharClass &cc)
2500 {
2501     ls.resize(1);
2502     ls[0] = eng->createState(cc);
2503     rs = ls;
2504 #ifndef QT_NO_REGEXP_OPTIM
2505     maxl = 1;
2506     occ1 = cc.firstOccurrence();
2507 #endif
2508     minl = 1;
2509 }
2510 
2511 #ifndef QT_NO_REGEXP_BACKREF
set(int bref)2512 void QRegExpEngine::Box::set(int bref)
2513 {
2514     ls.resize(1);
2515     ls[0] = eng->createState(bref);
2516     rs = ls;
2517     if (bref >= 1 && bref <= MaxBackRefs)
2518         skipanchors = Anchor_BackRef0Empty << bref;
2519 #ifndef QT_NO_REGEXP_OPTIM
2520     maxl = InftyLen;
2521 #endif
2522     minl = 0;
2523 }
2524 #endif
2525 
cat(const Box & b)2526 void QRegExpEngine::Box::cat(const Box &b)
2527 {
2528     eng->addCatTransitions(rs, b.ls);
2529     addAnchorsToEngine(b);
2530     if (minl == 0) {
2531         lanchors.insert(b.lanchors);
2532         if (skipanchors != 0) {
2533             for (int i = 0; i < b.ls.size(); i++) {
2534                 int a = eng->anchorConcatenation(lanchors.value(b.ls.at(i), 0), skipanchors);
2535                 lanchors.insert(b.ls.at(i), a);
2536             }
2537         }
2538         mergeInto(&ls, b.ls);
2539     }
2540     if (b.minl == 0) {
2541         ranchors.insert(b.ranchors);
2542         if (b.skipanchors != 0) {
2543             for (int i = 0; i < rs.size(); i++) {
2544                 int a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), b.skipanchors);
2545                 ranchors.insert(rs.at(i), a);
2546             }
2547         }
2548         mergeInto(&rs, b.rs);
2549     } else {
2550         ranchors = b.ranchors;
2551         rs = b.rs;
2552     }
2553 
2554 #ifndef QT_NO_REGEXP_OPTIM
2555     if (maxl != InftyLen) {
2556         if (rightStr.length() + b.leftStr.length() >
2557              qMax(str.length(), b.str.length())) {
2558             earlyStart = minl - rightStr.length();
2559             lateStart = maxl - rightStr.length();
2560             str = rightStr + b.leftStr;
2561         } else if (b.str.length() > str.length()) {
2562             earlyStart = minl + b.earlyStart;
2563             lateStart = maxl + b.lateStart;
2564             str = b.str;
2565         }
2566     }
2567 
2568     if (leftStr.length() == maxl)
2569         leftStr += b.leftStr;
2570 
2571     if (b.rightStr.length() == b.maxl) {
2572         rightStr += b.rightStr;
2573     } else {
2574         rightStr = b.rightStr;
2575     }
2576 
2577     if (maxl == InftyLen || b.maxl == InftyLen) {
2578         maxl = InftyLen;
2579     } else {
2580         maxl += b.maxl;
2581     }
2582 
2583     for (int i = 0; i < NumBadChars; i++) {
2584         if (b.occ1.at(i) != NoOccurrence && minl + b.occ1.at(i) < occ1.at(i))
2585             occ1[i] = minl + b.occ1.at(i);
2586     }
2587 #endif
2588 
2589     minl += b.minl;
2590     if (minl == 0)
2591         skipanchors = eng->anchorConcatenation(skipanchors, b.skipanchors);
2592     else
2593         skipanchors = 0;
2594 }
2595 
orx(const Box & b)2596 void QRegExpEngine::Box::orx(const Box &b)
2597 {
2598     mergeInto(&ls, b.ls);
2599     lanchors.insert(b.lanchors);
2600     mergeInto(&rs, b.rs);
2601     ranchors.insert(b.ranchors);
2602 
2603     if (b.minl == 0) {
2604         if (minl == 0)
2605             skipanchors = eng->anchorAlternation(skipanchors, b.skipanchors);
2606         else
2607             skipanchors = b.skipanchors;
2608     }
2609 
2610 #ifndef QT_NO_REGEXP_OPTIM
2611     for (int i = 0; i < NumBadChars; i++) {
2612         if (occ1.at(i) > b.occ1.at(i))
2613             occ1[i] = b.occ1.at(i);
2614     }
2615     earlyStart = 0;
2616     lateStart = 0;
2617     str = QString();
2618     leftStr = QString();
2619     rightStr = QString();
2620     if (b.maxl > maxl)
2621         maxl = b.maxl;
2622 #endif
2623     if (b.minl < minl)
2624         minl = b.minl;
2625 }
2626 
plus(int atom)2627 void QRegExpEngine::Box::plus(int atom)
2628 {
2629 #ifndef QT_NO_REGEXP_CAPTURE
2630     eng->addPlusTransitions(rs, ls, atom);
2631 #else
2632     Q_UNUSED(atom);
2633     eng->addCatTransitions(rs, ls);
2634 #endif
2635     addAnchorsToEngine(*this);
2636 #ifndef QT_NO_REGEXP_OPTIM
2637     maxl = InftyLen;
2638 #endif
2639 }
2640 
opt()2641 void QRegExpEngine::Box::opt()
2642 {
2643 #ifndef QT_NO_REGEXP_OPTIM
2644     earlyStart = 0;
2645     lateStart = 0;
2646     str = QString();
2647     leftStr = QString();
2648     rightStr = QString();
2649 #endif
2650     skipanchors = 0;
2651     minl = 0;
2652 }
2653 
catAnchor(int a)2654 void QRegExpEngine::Box::catAnchor(int a)
2655 {
2656     if (a != 0) {
2657         for (int i = 0; i < rs.size(); i++) {
2658             a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), a);
2659             ranchors.insert(rs.at(i), a);
2660         }
2661         if (minl == 0)
2662             skipanchors = eng->anchorConcatenation(skipanchors, a);
2663     }
2664 }
2665 
2666 #ifndef QT_NO_REGEXP_OPTIM
setupHeuristics()2667 void QRegExpEngine::Box::setupHeuristics()
2668 {
2669     eng->goodEarlyStart = earlyStart;
2670     eng->goodLateStart = lateStart;
2671     eng->goodStr = eng->cs ? str : str.toLower();
2672 
2673     eng->minl = minl;
2674     if (eng->cs) {
2675         /*
2676           A regular expression such as 112|1 has occ1['2'] = 2 and minl =
2677           1 at this point. An entry of occ1 has to be at most minl or
2678           infinity for the rest of the algorithm to go well.
2679 
2680           We waited until here before normalizing these cases (instead of
2681           doing it in Box::orx()) because sometimes things improve by
2682           themselves. Consider for example (112|1)34.
2683         */
2684         for (int i = 0; i < NumBadChars; i++) {
2685             if (occ1.at(i) != NoOccurrence && occ1.at(i) >= minl)
2686                 occ1[i] = minl;
2687         }
2688         eng->occ1 = occ1;
2689     } else {
2690         eng->occ1.fill(0, NumBadChars);
2691     }
2692 
2693     eng->heuristicallyChooseHeuristic();
2694 }
2695 #endif
2696 
2697 #if defined(QT_DEBUG)
dump() const2698 void QRegExpEngine::Box::dump() const
2699 {
2700     int i;
2701     qDebug("Box of at least %d character%s", minl, minl == 1 ? "" : "s");
2702     qDebug("  Left states:");
2703     for (i = 0; i < ls.size(); i++) {
2704         if (lanchors.value(ls[i], 0) == 0)
2705             qDebug("    %d", ls[i]);
2706         else
2707             qDebug("    %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]]);
2708     }
2709     qDebug("  Right states:");
2710     for (i = 0; i < rs.size(); i++) {
2711         if (ranchors.value(rs[i], 0) == 0)
2712             qDebug("    %d", rs[i]);
2713         else
2714             qDebug("    %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]]);
2715     }
2716     qDebug("  Skip anchors: 0x%.8x", skipanchors);
2717 }
2718 #endif
2719 
addAnchorsToEngine(const Box & to) const2720 void QRegExpEngine::Box::addAnchorsToEngine(const Box &to) const
2721 {
2722     for (int i = 0; i < to.ls.size(); i++) {
2723         for (int j = 0; j < rs.size(); j++) {
2724             int a = eng->anchorConcatenation(ranchors.value(rs.at(j), 0),
2725                                              to.lanchors.value(to.ls.at(i), 0));
2726             eng->addAnchors(rs[j], to.ls[i], a);
2727         }
2728     }
2729 }
2730 
2731 #ifndef QT_NO_REGEXP_CCLASS
2732 // fast lookup hash for xml schema extensions
2733 // sorted by name for b-search
2734 static const struct CategoriesRangeMapEntry {
2735     const char name[40];
2736     uint first, second;
2737 } categoriesRangeMap[] = {
2738     { "AegeanNumbers",                        0x10100, 0x1013F },
2739     { "AlphabeticPresentationForms",          0xFB00, 0xFB4F },
2740     { "AncientGreekMusicalNotation",          0x1D200, 0x1D24F },
2741     { "AncientGreekNumbers",                  0x10140, 0x1018F },
2742     { "Arabic",                               0x0600, 0x06FF },
2743     { "ArabicPresentationForms-A",            0xFB50, 0xFDFF },
2744     { "ArabicPresentationForms-B",            0xFE70, 0xFEFF },
2745     { "ArabicSupplement",                     0x0750, 0x077F },
2746     { "Armenian",                             0x0530, 0x058F },
2747     { "Arrows",                               0x2190, 0x21FF },
2748     { "BasicLatin",                           0x0000, 0x007F },
2749     { "Bengali",                              0x0980, 0x09FF },
2750     { "BlockElements",                        0x2580, 0x259F },
2751     { "Bopomofo",                             0x3100, 0x312F },
2752     { "BopomofoExtended",                     0x31A0, 0x31BF },
2753     { "BoxDrawing",                           0x2500, 0x257F },
2754     { "BraillePatterns",                      0x2800, 0x28FF },
2755     { "Buginese",                             0x1A00, 0x1A1F },
2756     { "Buhid",                                0x1740, 0x175F },
2757     { "ByzantineMusicalSymbols",              0x1D000, 0x1D0FF },
2758     { "CJKCompatibility",                     0x3300, 0x33FF },
2759     { "CJKCompatibilityForms",                0xFE30, 0xFE4F },
2760     { "CJKCompatibilityIdeographs",           0xF900, 0xFAFF },
2761     { "CJKCompatibilityIdeographsSupplement", 0x2F800, 0x2FA1F },
2762     { "CJKRadicalsSupplement",                0x2E80, 0x2EFF },
2763     { "CJKStrokes",                           0x31C0, 0x31EF },
2764     { "CJKSymbolsandPunctuation",             0x3000, 0x303F },
2765     { "CJKUnifiedIdeographs",                 0x4E00, 0x9FFF },
2766     { "CJKUnifiedIdeographsExtensionA",       0x3400, 0x4DB5 },
2767     { "CJKUnifiedIdeographsExtensionB",       0x20000, 0x2A6DF },
2768     { "Cherokee",                             0x13A0, 0x13FF },
2769     { "CombiningDiacriticalMarks",            0x0300, 0x036F },
2770     { "CombiningDiacriticalMarksSupplement",  0x1DC0, 0x1DFF },
2771     { "CombiningHalfMarks",                   0xFE20, 0xFE2F },
2772     { "CombiningMarksforSymbols",             0x20D0, 0x20FF },
2773     { "ControlPictures",                      0x2400, 0x243F },
2774     { "Coptic",                               0x2C80, 0x2CFF },
2775     { "CurrencySymbols",                      0x20A0, 0x20CF },
2776     { "CypriotSyllabary",                     0x10800, 0x1083F },
2777     { "Cyrillic",                             0x0400, 0x04FF },
2778     { "CyrillicSupplement",                   0x0500, 0x052F },
2779     { "Deseret",                              0x10400, 0x1044F },
2780     { "Devanagari",                           0x0900, 0x097F },
2781     { "Dingbats",                             0x2700, 0x27BF },
2782     { "EnclosedAlphanumerics",                0x2460, 0x24FF },
2783     { "EnclosedCJKLettersandMonths",          0x3200, 0x32FF },
2784     { "Ethiopic",                             0x1200, 0x137F },
2785     { "EthiopicExtended",                     0x2D80, 0x2DDF },
2786     { "EthiopicSupplement",                   0x1380, 0x139F },
2787     { "GeneralPunctuation",                   0x2000, 0x206F },
2788     { "GeometricShapes",                      0x25A0, 0x25FF },
2789     { "Georgian",                             0x10A0, 0x10FF },
2790     { "GeorgianSupplement",                   0x2D00, 0x2D2F },
2791     { "Glagolitic",                           0x2C00, 0x2C5F },
2792     { "Gothic",                               0x10330, 0x1034F },
2793     { "Greek",                                0x0370, 0x03FF },
2794     { "GreekExtended",                        0x1F00, 0x1FFF },
2795     { "Gujarati",                             0x0A80, 0x0AFF },
2796     { "Gurmukhi",                             0x0A00, 0x0A7F },
2797     { "HalfwidthandFullwidthForms",           0xFF00, 0xFFEF },
2798     { "HangulCompatibilityJamo",              0x3130, 0x318F },
2799     { "HangulJamo",                           0x1100, 0x11FF },
2800     { "HangulSyllables",                      0xAC00, 0xD7A3 },
2801     { "Hanunoo",                              0x1720, 0x173F },
2802     { "Hebrew",                               0x0590, 0x05FF },
2803     { "Hiragana",                             0x3040, 0x309F },
2804     { "IPAExtensions",                        0x0250, 0x02AF },
2805     { "IdeographicDescriptionCharacters",     0x2FF0, 0x2FFF },
2806     { "Kanbun",                               0x3190, 0x319F },
2807     { "KangxiRadicals",                       0x2F00, 0x2FDF },
2808     { "Kannada",                              0x0C80, 0x0CFF },
2809     { "Katakana",                             0x30A0, 0x30FF },
2810     { "KatakanaPhoneticExtensions",           0x31F0, 0x31FF },
2811     { "Kharoshthi",                           0x10A00, 0x10A5F },
2812     { "Khmer",                                0x1780, 0x17FF },
2813     { "KhmerSymbols",                         0x19E0, 0x19FF },
2814     { "Lao",                                  0x0E80, 0x0EFF },
2815     { "Latin-1Supplement",                    0x0080, 0x00FF },
2816     { "LatinExtended-A",                      0x0100, 0x017F },
2817     { "LatinExtended-B",                      0x0180, 0x024F },
2818     { "LatinExtendedAdditional",              0x1E00, 0x1EFF },
2819     { "LetterlikeSymbols",                    0x2100, 0x214F },
2820     { "Limbu",                                0x1900, 0x194F },
2821     { "LinearBIdeograms",                     0x10080, 0x100FF },
2822     { "LinearBSyllabary",                     0x10000, 0x1007F },
2823     { "Malayalam",                            0x0D00, 0x0D7F },
2824     { "MathematicalAlphanumericSymbols",      0x1D400, 0x1D7FF },
2825     { "MathematicalOperators",                0x2200, 0x22FF },
2826     { "MiscellaneousMathematicalSymbols-A",   0x27C0, 0x27EF },
2827     { "MiscellaneousMathematicalSymbols-B",   0x2980, 0x29FF },
2828     { "MiscellaneousSymbols",                 0x2600, 0x26FF },
2829     { "MiscellaneousSymbolsandArrows",        0x2B00, 0x2BFF },
2830     { "MiscellaneousTechnical",               0x2300, 0x23FF },
2831     { "ModifierToneLetters",                  0xA700, 0xA71F },
2832     { "Mongolian",                            0x1800, 0x18AF },
2833     { "MusicalSymbols",                       0x1D100, 0x1D1FF },
2834     { "Myanmar",                              0x1000, 0x109F },
2835     { "NewTaiLue",                            0x1980, 0x19DF },
2836     { "NumberForms",                          0x2150, 0x218F },
2837     { "Ogham",                                0x1680, 0x169F },
2838     { "OldItalic",                            0x10300, 0x1032F },
2839     { "OldPersian",                           0x103A0, 0x103DF },
2840     { "OpticalCharacterRecognition",          0x2440, 0x245F },
2841     { "Oriya",                                0x0B00, 0x0B7F },
2842     { "Osmanya",                              0x10480, 0x104AF },
2843     { "PhoneticExtensions",                   0x1D00, 0x1D7F },
2844     { "PhoneticExtensionsSupplement",         0x1D80, 0x1DBF },
2845     { "PrivateUse",                           0xE000, 0xF8FF },
2846     { "Runic",                                0x16A0, 0x16FF },
2847     { "Shavian",                              0x10450, 0x1047F },
2848     { "Sinhala",                              0x0D80, 0x0DFF },
2849     { "SmallFormVariants",                    0xFE50, 0xFE6F },
2850     { "SpacingModifierLetters",               0x02B0, 0x02FF },
2851     { "Specials",                             0xFFF0, 0xFFFF },
2852     { "SuperscriptsandSubscripts",            0x2070, 0x209F },
2853     { "SupplementalArrows-A",                 0x27F0, 0x27FF },
2854     { "SupplementalArrows-B",                 0x2900, 0x297F },
2855     { "SupplementalMathematicalOperators",    0x2A00, 0x2AFF },
2856     { "SupplementalPunctuation",              0x2E00, 0x2E7F },
2857     { "SupplementaryPrivateUseArea-A",        0xF0000, 0xFFFFF },
2858     { "SupplementaryPrivateUseArea-B",        0x100000, 0x10FFFF },
2859     { "SylotiNagri",                          0xA800, 0xA82F },
2860     { "Syriac",                               0x0700, 0x074F },
2861     { "Tagalog",                              0x1700, 0x171F },
2862     { "Tagbanwa",                             0x1760, 0x177F },
2863     { "Tags",                                 0xE0000, 0xE007F },
2864     { "TaiLe",                                0x1950, 0x197F },
2865     { "TaiXuanJingSymbols",                   0x1D300, 0x1D35F },
2866     { "Tamil",                                0x0B80, 0x0BFF },
2867     { "Telugu",                               0x0C00, 0x0C7F },
2868     { "Thaana",                               0x0780, 0x07BF },
2869     { "Thai",                                 0x0E00, 0x0E7F },
2870     { "Tibetan",                              0x0F00, 0x0FFF },
2871     { "Tifinagh",                             0x2D30, 0x2D7F },
2872     { "Ugaritic",                             0x10380, 0x1039F },
2873     { "UnifiedCanadianAboriginalSyllabics",   0x1400, 0x167F },
2874     { "VariationSelectors",                   0xFE00, 0xFE0F },
2875     { "VariationSelectorsSupplement",         0xE0100, 0xE01EF },
2876     { "VerticalForms",                        0xFE10, 0xFE1F },
2877     { "YiRadicals",                           0xA490, 0xA4CF },
2878     { "YiSyllables",                          0xA000, 0xA48F },
2879     { "YijingHexagramSymbols",                0x4DC0, 0x4DFF }
2880 };
2881 
operator <(const CategoriesRangeMapEntry & entry1,const CategoriesRangeMapEntry & entry2)2882 inline bool operator<(const CategoriesRangeMapEntry &entry1, const CategoriesRangeMapEntry &entry2)
2883 { return qstrcmp(entry1.name, entry2.name) < 0; }
operator <(const char * name,const CategoriesRangeMapEntry & entry)2884 inline bool operator<(const char *name, const CategoriesRangeMapEntry &entry)
2885 { return qstrcmp(name, entry.name) < 0; }
operator <(const CategoriesRangeMapEntry & entry,const char * name)2886 inline bool operator<(const CategoriesRangeMapEntry &entry, const char *name)
2887 { return qstrcmp(entry.name, name) < 0; }
2888 #endif // QT_NO_REGEXP_CCLASS
2889 
getChar()2890 int QRegExpEngine::getChar()
2891 {
2892     return (yyPos == yyLen) ? EOS : yyIn[yyPos++].unicode();
2893 }
2894 
getEscape()2895 int QRegExpEngine::getEscape()
2896 {
2897 #ifndef QT_NO_REGEXP_ESCAPE
2898     const char tab[] = "afnrtv"; // no b, as \b means word boundary
2899     const char backTab[] = "\a\f\n\r\t\v";
2900     ushort low;
2901     int i;
2902 #endif
2903     ushort val;
2904     int prevCh = yyCh;
2905 
2906     if (prevCh == EOS) {
2907         error(RXERR_END);
2908         return Tok_Char | '\\';
2909     }
2910     yyCh = getChar();
2911 #ifndef QT_NO_REGEXP_ESCAPE
2912     if ((prevCh & ~0xff) == 0) {
2913         const char *p = strchr(tab, prevCh);
2914         if (p != nullptr)
2915             return Tok_Char | backTab[p - tab];
2916     }
2917 #endif
2918 
2919     switch (prevCh) {
2920 #ifndef QT_NO_REGEXP_ESCAPE
2921     case '0':
2922         val = 0;
2923         for (i = 0; i < 3; i++) {
2924             if (yyCh >= '0' && yyCh <= '7')
2925                 val = (val << 3) | (yyCh - '0');
2926             else
2927                 break;
2928             yyCh = getChar();
2929         }
2930         if ((val & ~0377) != 0)
2931             error(RXERR_OCTAL);
2932         return Tok_Char | val;
2933 #endif
2934 #ifndef QT_NO_REGEXP_ESCAPE
2935     case 'B':
2936         return Tok_NonWord;
2937 #endif
2938 #ifndef QT_NO_REGEXP_CCLASS
2939     case 'D':
2940         // see QChar::isDigit()
2941         yyCharClass->addCategories(uint(-1) ^ FLAG(QChar::Number_DecimalDigit));
2942         return Tok_CharClass;
2943     case 'S':
2944         // see QChar::isSpace()
2945         yyCharClass->addCategories(uint(-1) ^ (FLAG(QChar::Separator_Space) |
2946                                                FLAG(QChar::Separator_Line) |
2947                                                FLAG(QChar::Separator_Paragraph) |
2948                                                FLAG(QChar::Other_Control)));
2949         yyCharClass->addRange(0x0000, 0x0008);
2950         yyCharClass->addRange(0x000e, 0x001f);
2951         yyCharClass->addRange(0x007f, 0x0084);
2952         yyCharClass->addRange(0x0086, 0x009f);
2953         return Tok_CharClass;
2954     case 'W':
2955         // see QChar::isLetterOrNumber() and QChar::isMark()
2956         yyCharClass->addCategories(uint(-1) ^ (FLAG(QChar::Mark_NonSpacing) |
2957                                                FLAG(QChar::Mark_SpacingCombining) |
2958                                                FLAG(QChar::Mark_Enclosing) |
2959                                                FLAG(QChar::Number_DecimalDigit) |
2960                                                FLAG(QChar::Number_Letter) |
2961                                                FLAG(QChar::Number_Other) |
2962                                                FLAG(QChar::Letter_Uppercase) |
2963                                                FLAG(QChar::Letter_Lowercase) |
2964                                                FLAG(QChar::Letter_Titlecase) |
2965                                                FLAG(QChar::Letter_Modifier) |
2966                                                FLAG(QChar::Letter_Other) |
2967                                                FLAG(QChar::Punctuation_Connector)));
2968         yyCharClass->addRange(0x203f, 0x2040);
2969         yyCharClass->addSingleton(0x2040);
2970         yyCharClass->addSingleton(0x2054);
2971         yyCharClass->addSingleton(0x30fb);
2972         yyCharClass->addRange(0xfe33, 0xfe34);
2973         yyCharClass->addRange(0xfe4d, 0xfe4f);
2974         yyCharClass->addSingleton(0xff3f);
2975         yyCharClass->addSingleton(0xff65);
2976         return Tok_CharClass;
2977 #endif
2978 #ifndef QT_NO_REGEXP_ESCAPE
2979     case 'b':
2980         return Tok_Word;
2981 #endif
2982 #ifndef QT_NO_REGEXP_CCLASS
2983     case 'd':
2984         // see QChar::isDigit()
2985         yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit));
2986         return Tok_CharClass;
2987     case 's':
2988         // see QChar::isSpace()
2989         yyCharClass->addCategories(FLAG(QChar::Separator_Space) |
2990                                    FLAG(QChar::Separator_Line) |
2991                                    FLAG(QChar::Separator_Paragraph));
2992         yyCharClass->addRange(0x0009, 0x000d);
2993         yyCharClass->addSingleton(0x0085);
2994         return Tok_CharClass;
2995     case 'w':
2996         // see QChar::isLetterOrNumber() and QChar::isMark()
2997         yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
2998                                    FLAG(QChar::Mark_SpacingCombining) |
2999                                    FLAG(QChar::Mark_Enclosing) |
3000                                    FLAG(QChar::Number_DecimalDigit) |
3001                                    FLAG(QChar::Number_Letter) |
3002                                    FLAG(QChar::Number_Other) |
3003                                    FLAG(QChar::Letter_Uppercase) |
3004                                    FLAG(QChar::Letter_Lowercase) |
3005                                    FLAG(QChar::Letter_Titlecase) |
3006                                    FLAG(QChar::Letter_Modifier) |
3007                                    FLAG(QChar::Letter_Other));
3008         yyCharClass->addSingleton(0x005f); // '_'
3009         return Tok_CharClass;
3010     case 'I':
3011         if (!xmlSchemaExtensions)
3012             break;
3013         yyCharClass->setNegative(!yyCharClass->negative());
3014         Q_FALLTHROUGH();
3015     case 'i':
3016         if (xmlSchemaExtensions) {
3017             yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3018                                        FLAG(QChar::Mark_SpacingCombining) |
3019                                        FLAG(QChar::Mark_Enclosing) |
3020                                        FLAG(QChar::Number_DecimalDigit) |
3021                                        FLAG(QChar::Number_Letter) |
3022                                        FLAG(QChar::Number_Other) |
3023                                        FLAG(QChar::Letter_Uppercase) |
3024                                        FLAG(QChar::Letter_Lowercase) |
3025                                        FLAG(QChar::Letter_Titlecase) |
3026                                        FLAG(QChar::Letter_Modifier) |
3027                                        FLAG(QChar::Letter_Other));
3028             yyCharClass->addSingleton(0x003a); // ':'
3029             yyCharClass->addSingleton(0x005f); // '_'
3030             yyCharClass->addRange(0x0041, 0x005a); // [A-Z]
3031             yyCharClass->addRange(0x0061, 0x007a); // [a-z]
3032             yyCharClass->addRange(0xc0, 0xd6);
3033             yyCharClass->addRange(0xd8, 0xf6);
3034             yyCharClass->addRange(0xf8, 0x2ff);
3035             yyCharClass->addRange(0x370, 0x37d);
3036             yyCharClass->addRange(0x37f, 0x1fff);
3037             yyCharClass->addRange(0x200c, 0x200d);
3038             yyCharClass->addRange(0x2070, 0x218f);
3039             yyCharClass->addRange(0x2c00, 0x2fef);
3040             yyCharClass->addRange(0x3001, 0xd7ff);
3041             yyCharClass->addRange(0xf900, 0xfdcf);
3042             yyCharClass->addRange(0xfdf0, 0xfffd);
3043             yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);
3044             return Tok_CharClass;
3045         } else {
3046             break;
3047         }
3048     case 'C':
3049         if (!xmlSchemaExtensions)
3050             break;
3051         yyCharClass->setNegative(!yyCharClass->negative());
3052         Q_FALLTHROUGH();
3053     case 'c':
3054         if (xmlSchemaExtensions) {
3055             yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3056                                        FLAG(QChar::Mark_SpacingCombining) |
3057                                        FLAG(QChar::Mark_Enclosing) |
3058                                        FLAG(QChar::Number_DecimalDigit) |
3059                                        FLAG(QChar::Number_Letter) |
3060                                        FLAG(QChar::Number_Other) |
3061                                        FLAG(QChar::Letter_Uppercase) |
3062                                        FLAG(QChar::Letter_Lowercase) |
3063                                        FLAG(QChar::Letter_Titlecase) |
3064                                        FLAG(QChar::Letter_Modifier) |
3065                                        FLAG(QChar::Letter_Other));
3066             yyCharClass->addSingleton(0x002d); // '-'
3067             yyCharClass->addSingleton(0x002e); // '.'
3068             yyCharClass->addSingleton(0x003a); // ':'
3069             yyCharClass->addSingleton(0x005f); // '_'
3070             yyCharClass->addSingleton(0xb7);
3071             yyCharClass->addRange(0x0030, 0x0039); // [0-9]
3072             yyCharClass->addRange(0x0041, 0x005a); // [A-Z]
3073             yyCharClass->addRange(0x0061, 0x007a); // [a-z]
3074             yyCharClass->addRange(0xc0, 0xd6);
3075             yyCharClass->addRange(0xd8, 0xf6);
3076             yyCharClass->addRange(0xf8, 0x2ff);
3077             yyCharClass->addRange(0x370, 0x37d);
3078             yyCharClass->addRange(0x37f, 0x1fff);
3079             yyCharClass->addRange(0x200c, 0x200d);
3080             yyCharClass->addRange(0x2070, 0x218f);
3081             yyCharClass->addRange(0x2c00, 0x2fef);
3082             yyCharClass->addRange(0x3001, 0xd7ff);
3083             yyCharClass->addRange(0xf900, 0xfdcf);
3084             yyCharClass->addRange(0xfdf0, 0xfffd);
3085             yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);
3086             yyCharClass->addRange(0x0300, 0x036f);
3087             yyCharClass->addRange(0x203f, 0x2040);
3088             return Tok_CharClass;
3089         } else {
3090             break;
3091         }
3092     case 'P':
3093         if (!xmlSchemaExtensions)
3094             break;
3095         yyCharClass->setNegative(!yyCharClass->negative());
3096         Q_FALLTHROUGH();
3097     case 'p':
3098         if (xmlSchemaExtensions) {
3099             if (yyCh != '{') {
3100                 error(RXERR_CHARCLASS);
3101                 return Tok_CharClass;
3102             }
3103 
3104             QByteArray category;
3105             yyCh = getChar();
3106             while (yyCh != '}') {
3107                 if (yyCh == EOS) {
3108                     error(RXERR_END);
3109                     return Tok_CharClass;
3110                 }
3111                 category.append(yyCh);
3112                 yyCh = getChar();
3113             }
3114             yyCh = getChar(); // skip closing '}'
3115 
3116             int catlen = category.length();
3117             if (catlen == 1 || catlen == 2) {
3118                 switch (category.at(0)) {
3119                 case 'M':
3120                     if (catlen == 1) {
3121                         yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3122                                                    FLAG(QChar::Mark_SpacingCombining) |
3123                                                    FLAG(QChar::Mark_Enclosing));
3124                     } else {
3125                         switch (category.at(1)) {
3126                         case 'n': yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing)); break; // Mn
3127                         case 'c': yyCharClass->addCategories(FLAG(QChar::Mark_SpacingCombining)); break; // Mc
3128                         case 'e': yyCharClass->addCategories(FLAG(QChar::Mark_Enclosing)); break; // Me
3129                         default: error(RXERR_CATEGORY); break;
3130                         }
3131                     }
3132                     break;
3133                 case 'N':
3134                     if (catlen == 1) {
3135                         yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit) |
3136                                                    FLAG(QChar::Number_Letter) |
3137                                                    FLAG(QChar::Number_Other));
3138                     } else {
3139                         switch (category.at(1)) {
3140                         case 'd': yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit)); break; // Nd
3141                         case 'l': yyCharClass->addCategories(FLAG(QChar::Number_Letter)); break; // Hl
3142                         case 'o': yyCharClass->addCategories(FLAG(QChar::Number_Other)); break; // No
3143                         default: error(RXERR_CATEGORY); break;
3144                         }
3145                     }
3146                     break;
3147                 case 'Z':
3148                     if (catlen == 1) {
3149                         yyCharClass->addCategories(FLAG(QChar::Separator_Space) |
3150                                                    FLAG(QChar::Separator_Line) |
3151                                                    FLAG(QChar::Separator_Paragraph));
3152                     } else {
3153                         switch (category.at(1)) {
3154                         case 's': yyCharClass->addCategories(FLAG(QChar::Separator_Space)); break; // Zs
3155                         case 'l': yyCharClass->addCategories(FLAG(QChar::Separator_Line)); break; // Zl
3156                         case 'p': yyCharClass->addCategories(FLAG(QChar::Separator_Paragraph)); break; // Zp
3157                         default: error(RXERR_CATEGORY); break;
3158                         }
3159                     }
3160                     break;
3161                 case 'C':
3162                     if (catlen == 1) {
3163                         yyCharClass->addCategories(FLAG(QChar::Other_Control) |
3164                                                    FLAG(QChar::Other_Format) |
3165                                                    FLAG(QChar::Other_Surrogate) |
3166                                                    FLAG(QChar::Other_PrivateUse) |
3167                                                    FLAG(QChar::Other_NotAssigned));
3168                     } else {
3169                         switch (category.at(1)) {
3170                         case 'c': yyCharClass->addCategories(FLAG(QChar::Other_Control)); break; // Cc
3171                         case 'f': yyCharClass->addCategories(FLAG(QChar::Other_Format)); break; // Cf
3172                         case 's': yyCharClass->addCategories(FLAG(QChar::Other_Surrogate)); break; // Cs
3173                         case 'o': yyCharClass->addCategories(FLAG(QChar::Other_PrivateUse)); break; // Co
3174                         case 'n': yyCharClass->addCategories(FLAG(QChar::Other_NotAssigned)); break; // Cn
3175                         default: error(RXERR_CATEGORY); break;
3176                         }
3177                     }
3178                     break;
3179                 case 'L':
3180                     if (catlen == 1) {
3181                         yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase) |
3182                                                    FLAG(QChar::Letter_Lowercase) |
3183                                                    FLAG(QChar::Letter_Titlecase) |
3184                                                    FLAG(QChar::Letter_Modifier) |
3185                                                    FLAG(QChar::Letter_Other));
3186                     } else {
3187                         switch (category.at(1)) {
3188                         case 'u': yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase)); break; // Lu
3189                         case 'l': yyCharClass->addCategories(FLAG(QChar::Letter_Lowercase)); break; // Ll
3190                         case 't': yyCharClass->addCategories(FLAG(QChar::Letter_Titlecase)); break; // Lt
3191                         case 'm': yyCharClass->addCategories(FLAG(QChar::Letter_Modifier)); break; // Lm
3192                         case 'o': yyCharClass->addCategories(FLAG(QChar::Letter_Other)); break; // Lo
3193                         default: error(RXERR_CATEGORY); break;
3194                         }
3195                     }
3196                     break;
3197                 case 'P':
3198                     if (catlen == 1) {
3199                         yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector) |
3200                                                    FLAG(QChar::Punctuation_Dash) |
3201                                                    FLAG(QChar::Punctuation_Open) |
3202                                                    FLAG(QChar::Punctuation_Close) |
3203                                                    FLAG(QChar::Punctuation_InitialQuote) |
3204                                                    FLAG(QChar::Punctuation_FinalQuote) |
3205                                                    FLAG(QChar::Punctuation_Other));
3206                     } else {
3207                         switch (category.at(1)) {
3208                         case 'c': yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector)); break; // Pc
3209                         case 'd': yyCharClass->addCategories(FLAG(QChar::Punctuation_Dash)); break; // Pd
3210                         case 's': yyCharClass->addCategories(FLAG(QChar::Punctuation_Open)); break; // Ps
3211                         case 'e': yyCharClass->addCategories(FLAG(QChar::Punctuation_Close)); break; // Pe
3212                         case 'i': yyCharClass->addCategories(FLAG(QChar::Punctuation_InitialQuote)); break; // Pi
3213                         case 'f': yyCharClass->addCategories(FLAG(QChar::Punctuation_FinalQuote)); break; // Pf
3214                         case 'o': yyCharClass->addCategories(FLAG(QChar::Punctuation_Other)); break; // Po
3215                         default: error(RXERR_CATEGORY); break;
3216                         }
3217                     }
3218                     break;
3219                 case 'S':
3220                     if (catlen == 1) {
3221                         yyCharClass->addCategories(FLAG(QChar::Symbol_Math) |
3222                                                    FLAG(QChar::Symbol_Currency) |
3223                                                    FLAG(QChar::Symbol_Modifier) |
3224                                                    FLAG(QChar::Symbol_Other));
3225                     } else {
3226                         switch (category.at(1)) {
3227                         case 'm': yyCharClass->addCategories(FLAG(QChar::Symbol_Math)); break; // Sm
3228                         case 'c': yyCharClass->addCategories(FLAG(QChar::Symbol_Currency)); break; // Sc
3229                         case 'k': yyCharClass->addCategories(FLAG(QChar::Symbol_Modifier)); break; // Sk
3230                         case 'o': yyCharClass->addCategories(FLAG(QChar::Symbol_Other)); break; // So
3231                         default: error(RXERR_CATEGORY); break;
3232                         }
3233                     }
3234                     break;
3235                 default:
3236                     error(RXERR_CATEGORY);
3237                     break;
3238                 }
3239             } else if (catlen > 2 && category.at(0) == 'I' && category.at(1) == 's') {
3240                 static const int N = sizeof(categoriesRangeMap) / sizeof(categoriesRangeMap[0]);
3241                 const char * const categoryFamily = category.constData() + 2;
3242                 const CategoriesRangeMapEntry *r = std::lower_bound(categoriesRangeMap, categoriesRangeMap + N, categoryFamily);
3243                 if (r != categoriesRangeMap + N && qstrcmp(r->name, categoryFamily) == 0)
3244                     yyCharClass->addRange(r->first, r->second);
3245                 else
3246                     error(RXERR_CATEGORY);
3247             } else {
3248                 error(RXERR_CATEGORY);
3249             }
3250             return Tok_CharClass;
3251         } else {
3252             break;
3253         }
3254 #endif
3255 #ifndef QT_NO_REGEXP_ESCAPE
3256     case 'x':
3257         val = 0;
3258         for (i = 0; i < 4; i++) {
3259             low = QChar(yyCh).toLower().unicode();
3260             if (low >= '0' && low <= '9')
3261                 val = (val << 4) | (low - '0');
3262             else if (low >= 'a' && low <= 'f')
3263                 val = (val << 4) | (low - 'a' + 10);
3264             else
3265                 break;
3266             yyCh = getChar();
3267         }
3268         return Tok_Char | val;
3269 #endif
3270     default:
3271         break;
3272     }
3273     if (prevCh >= '1' && prevCh <= '9') {
3274 #ifndef QT_NO_REGEXP_BACKREF
3275         val = prevCh - '0';
3276         while (yyCh >= '0' && yyCh <= '9') {
3277             val = (val * 10) + (yyCh - '0');
3278             yyCh = getChar();
3279         }
3280         return Tok_BackRef | val;
3281 #else
3282         error(RXERR_DISABLED);
3283 #endif
3284     }
3285     return Tok_Char | prevCh;
3286 }
3287 
3288 #ifndef QT_NO_REGEXP_INTERVAL
getRep(int def)3289 int QRegExpEngine::getRep(int def)
3290 {
3291     if (yyCh >= '0' && yyCh <= '9') {
3292         int rep = 0;
3293         do {
3294             rep = 10 * rep + yyCh - '0';
3295             if (rep >= InftyRep) {
3296                 error(RXERR_REPETITION);
3297                 rep = def;
3298             }
3299             yyCh = getChar();
3300         } while (yyCh >= '0' && yyCh <= '9');
3301         return rep;
3302     } else {
3303         return def;
3304     }
3305 }
3306 #endif
3307 
3308 #ifndef QT_NO_REGEXP_LOOKAHEAD
skipChars(int n)3309 void QRegExpEngine::skipChars(int n)
3310 {
3311     if (n > 0) {
3312         yyPos += n - 1;
3313         yyCh = getChar();
3314     }
3315 }
3316 #endif
3317 
error(const char * msg)3318 void QRegExpEngine::error(const char *msg)
3319 {
3320     if (yyError.isEmpty())
3321         yyError = QLatin1String(msg);
3322 }
3323 
startTokenizer(const QChar * rx,int len)3324 void QRegExpEngine::startTokenizer(const QChar *rx, int len)
3325 {
3326     yyIn = rx;
3327     yyPos0 = 0;
3328     yyPos = 0;
3329     yyLen = len;
3330     yyCh = getChar();
3331     yyCharClass.reset(new QRegExpCharClass);
3332     yyMinRep = 0;
3333     yyMaxRep = 0;
3334     yyError = QString();
3335 }
3336 
getToken()3337 int QRegExpEngine::getToken()
3338 {
3339 #ifndef QT_NO_REGEXP_CCLASS
3340     ushort pendingCh = 0;
3341     bool charPending;
3342     bool rangePending;
3343     int tok;
3344 #endif
3345     int prevCh = yyCh;
3346 
3347     yyPos0 = yyPos - 1;
3348 #ifndef QT_NO_REGEXP_CCLASS
3349     yyCharClass->clear();
3350 #endif
3351     yyMinRep = 0;
3352     yyMaxRep = 0;
3353     yyCh = getChar();
3354 
3355     switch (prevCh) {
3356     case EOS:
3357         yyPos0 = yyPos;
3358         return Tok_Eos;
3359     case '$':
3360         return Tok_Dollar;
3361     case '(':
3362         if (yyCh == '?') {
3363             prevCh = getChar();
3364             yyCh = getChar();
3365             switch (prevCh) {
3366 #ifndef QT_NO_REGEXP_LOOKAHEAD
3367             case '!':
3368                 return Tok_NegLookahead;
3369             case '=':
3370                 return Tok_PosLookahead;
3371 #endif
3372             case ':':
3373                 return Tok_MagicLeftParen;
3374             case '<':
3375                 error(RXERR_LOOKBEHIND);
3376                 return Tok_MagicLeftParen;
3377             default:
3378                 error(RXERR_LOOKAHEAD);
3379                 return Tok_MagicLeftParen;
3380             }
3381         } else {
3382             return Tok_LeftParen;
3383         }
3384     case ')':
3385         return Tok_RightParen;
3386     case '*':
3387         yyMinRep = 0;
3388         yyMaxRep = InftyRep;
3389         return Tok_Quantifier;
3390     case '+':
3391         yyMinRep = 1;
3392         yyMaxRep = InftyRep;
3393         return Tok_Quantifier;
3394     case '.':
3395 #ifndef QT_NO_REGEXP_CCLASS
3396         yyCharClass->setNegative(true);
3397 #endif
3398         return Tok_CharClass;
3399     case '?':
3400         yyMinRep = 0;
3401         yyMaxRep = 1;
3402         return Tok_Quantifier;
3403     case '[':
3404 #ifndef QT_NO_REGEXP_CCLASS
3405         if (yyCh == '^') {
3406             yyCharClass->setNegative(true);
3407             yyCh = getChar();
3408         }
3409         charPending = false;
3410         rangePending = false;
3411         do {
3412             if (yyCh == '-' && charPending && !rangePending) {
3413                 rangePending = true;
3414                 yyCh = getChar();
3415             } else {
3416                 if (charPending && !rangePending) {
3417                     yyCharClass->addSingleton(pendingCh);
3418                     charPending = false;
3419                 }
3420                 if (yyCh == '\\') {
3421                     yyCh = getChar();
3422                     tok = getEscape();
3423                     if (tok == Tok_Word)
3424                         tok = '\b';
3425                 } else {
3426                     tok = Tok_Char | yyCh;
3427                     yyCh = getChar();
3428                 }
3429                 if (tok == Tok_CharClass) {
3430                     if (rangePending) {
3431                         yyCharClass->addSingleton('-');
3432                         yyCharClass->addSingleton(pendingCh);
3433                         charPending = false;
3434                         rangePending = false;
3435                     }
3436                 } else if ((tok & Tok_Char) != 0) {
3437                     if (rangePending) {
3438                         yyCharClass->addRange(pendingCh, tok ^ Tok_Char);
3439                         charPending = false;
3440                         rangePending = false;
3441                     } else {
3442                         pendingCh = tok ^ Tok_Char;
3443                         charPending = true;
3444                     }
3445                 } else {
3446                     error(RXERR_CHARCLASS);
3447                 }
3448             }
3449         }  while (yyCh != ']' && yyCh != EOS);
3450         if (rangePending)
3451             yyCharClass->addSingleton('-');
3452         if (charPending)
3453             yyCharClass->addSingleton(pendingCh);
3454         if (yyCh == EOS)
3455             error(RXERR_END);
3456         else
3457             yyCh = getChar();
3458         return Tok_CharClass;
3459 #else
3460         error(RXERR_END);
3461         return Tok_Char | '[';
3462 #endif
3463     case '\\':
3464         return getEscape();
3465     case ']':
3466         error(RXERR_LEFTDELIM);
3467         return Tok_Char | ']';
3468     case '^':
3469         return Tok_Caret;
3470     case '{':
3471 #ifndef QT_NO_REGEXP_INTERVAL
3472         yyMinRep = getRep(0);
3473         yyMaxRep = yyMinRep;
3474         if (yyCh == ',') {
3475             yyCh = getChar();
3476             yyMaxRep = getRep(InftyRep);
3477         }
3478         if (yyMaxRep < yyMinRep)
3479             error(RXERR_INTERVAL);
3480         if (yyCh != '}')
3481             error(RXERR_REPETITION);
3482         yyCh = getChar();
3483         return Tok_Quantifier;
3484 #else
3485         error(RXERR_DISABLED);
3486         return Tok_Char | '{';
3487 #endif
3488     case '|':
3489         return Tok_Bar;
3490     case '}':
3491         error(RXERR_LEFTDELIM);
3492         return Tok_Char | '}';
3493     default:
3494         return Tok_Char | prevCh;
3495     }
3496 }
3497 
parse(const QChar * pattern,int len)3498 int QRegExpEngine::parse(const QChar *pattern, int len)
3499 {
3500     valid = true;
3501     startTokenizer(pattern, len);
3502     yyTok = getToken();
3503 #ifndef QT_NO_REGEXP_CAPTURE
3504     yyMayCapture = true;
3505 #else
3506     yyMayCapture = false;
3507 #endif
3508 
3509 #ifndef QT_NO_REGEXP_CAPTURE
3510     int atom = startAtom(false);
3511 #endif
3512     QRegExpCharClass anything;
3513     Box box(this); // create InitialState
3514     box.set(anything);
3515     Box rightBox(this); // create FinalState
3516     rightBox.set(anything);
3517 
3518     Box middleBox(this);
3519     parseExpression(&middleBox);
3520 #ifndef QT_NO_REGEXP_CAPTURE
3521     finishAtom(atom, false);
3522 #endif
3523 #ifndef QT_NO_REGEXP_OPTIM
3524     middleBox.setupHeuristics();
3525 #endif
3526     box.cat(middleBox);
3527     box.cat(rightBox);
3528     yyCharClass.reset();
3529 
3530 #ifndef QT_NO_REGEXP_CAPTURE
3531     for (int i = 0; i < nf; ++i) {
3532         switch (f[i].capture) {
3533         case QRegExpAtom::NoCapture:
3534             break;
3535         case QRegExpAtom::OfficialCapture:
3536             f[i].capture = ncap;
3537             captureForOfficialCapture.append(ncap);
3538             ++ncap;
3539             ++officialncap;
3540             break;
3541         case QRegExpAtom::UnofficialCapture:
3542             f[i].capture = greedyQuantifiers ? ncap++ : QRegExpAtom::NoCapture;
3543         }
3544     }
3545 
3546 #ifndef QT_NO_REGEXP_BACKREF
3547 #ifndef QT_NO_REGEXP_OPTIM
3548     if (officialncap == 0 && nbrefs == 0) {
3549         ncap = nf = 0;
3550         f.clear();
3551     }
3552 #endif
3553     // handle the case where there's a \5 with no corresponding capture
3554     // (captureForOfficialCapture.size() != officialncap)
3555     for (int i = 0; i < nbrefs - officialncap; ++i) {
3556         captureForOfficialCapture.append(ncap);
3557         ++ncap;
3558     }
3559 #endif
3560 #endif
3561 
3562     if (!yyError.isEmpty())
3563         return -1;
3564 
3565 #ifndef QT_NO_REGEXP_OPTIM
3566     const QRegExpAutomatonState &sinit = s.at(InitialState);
3567     caretAnchored = !sinit.anchors.isEmpty();
3568     if (caretAnchored) {
3569         const QMap<int, int> &anchors = sinit.anchors;
3570         QMap<int, int>::const_iterator a;
3571         for (a = anchors.constBegin(); a != anchors.constEnd(); ++a) {
3572             if (
3573 #ifndef QT_NO_REGEXP_ANCHOR_ALT
3574                 (*a & Anchor_Alternation) != 0 ||
3575 #endif
3576                 (*a & Anchor_Caret) == 0)
3577             {
3578                 caretAnchored = false;
3579                 break;
3580             }
3581         }
3582     }
3583 #endif
3584 
3585     // cleanup anchors
3586     int numStates = s.count();
3587     for (int i = 0; i < numStates; ++i) {
3588         QRegExpAutomatonState &state = s[i];
3589         if (!state.anchors.isEmpty()) {
3590             QMap<int, int>::iterator a = state.anchors.begin();
3591             while (a != state.anchors.end()) {
3592                 if (a.value() == 0)
3593                     a = state.anchors.erase(a);
3594                 else
3595                     ++a;
3596             }
3597         }
3598     }
3599 
3600     return yyPos0;
3601 }
3602 
parseAtom(Box * box)3603 void QRegExpEngine::parseAtom(Box *box)
3604 {
3605 #ifndef QT_NO_REGEXP_LOOKAHEAD
3606     QRegExpEngine *eng = nullptr;
3607     bool neg;
3608     int len;
3609 #endif
3610 
3611     if ((yyTok & Tok_Char) != 0) {
3612         box->set(QChar(yyTok ^ Tok_Char));
3613     } else {
3614 #ifndef QT_NO_REGEXP_OPTIM
3615         trivial = false;
3616 #endif
3617         switch (yyTok) {
3618         case Tok_Dollar:
3619             box->catAnchor(Anchor_Dollar);
3620             break;
3621         case Tok_Caret:
3622             box->catAnchor(Anchor_Caret);
3623             break;
3624 #ifndef QT_NO_REGEXP_LOOKAHEAD
3625         case Tok_PosLookahead:
3626         case Tok_NegLookahead:
3627             neg = (yyTok == Tok_NegLookahead);
3628             eng = new QRegExpEngine(cs, greedyQuantifiers);
3629             len = eng->parse(yyIn + yyPos - 1, yyLen - yyPos + 1);
3630             if (len >= 0)
3631                 skipChars(len);
3632             else
3633                 error(RXERR_LOOKAHEAD);
3634             box->catAnchor(addLookahead(eng, neg));
3635             yyTok = getToken();
3636             if (yyTok != Tok_RightParen)
3637                 error(RXERR_LOOKAHEAD);
3638             break;
3639 #endif
3640 #ifndef QT_NO_REGEXP_ESCAPE
3641         case Tok_Word:
3642             box->catAnchor(Anchor_Word);
3643             break;
3644         case Tok_NonWord:
3645             box->catAnchor(Anchor_NonWord);
3646             break;
3647 #endif
3648         case Tok_LeftParen:
3649         case Tok_MagicLeftParen:
3650             yyTok = getToken();
3651             parseExpression(box);
3652             if (yyTok != Tok_RightParen)
3653                 error(RXERR_END);
3654             break;
3655         case Tok_CharClass:
3656             box->set(*yyCharClass);
3657             break;
3658         case Tok_Quantifier:
3659             error(RXERR_REPETITION);
3660             break;
3661         default:
3662 #ifndef QT_NO_REGEXP_BACKREF
3663             if ((yyTok & Tok_BackRef) != 0)
3664                 box->set(yyTok ^ Tok_BackRef);
3665             else
3666 #endif
3667                 error(RXERR_DISABLED);
3668         }
3669     }
3670     yyTok = getToken();
3671 }
3672 
parseFactor(Box * box)3673 void QRegExpEngine::parseFactor(Box *box)
3674 {
3675 #ifndef QT_NO_REGEXP_CAPTURE
3676     int outerAtom = greedyQuantifiers ? startAtom(false) : -1;
3677     int innerAtom = startAtom(yyMayCapture && yyTok == Tok_LeftParen);
3678     bool magicLeftParen = (yyTok == Tok_MagicLeftParen);
3679 #else
3680     const int innerAtom = -1;
3681 #endif
3682 
3683 #ifndef QT_NO_REGEXP_INTERVAL
3684 #define YYREDO() \
3685         yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \
3686         *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok
3687 
3688     const QChar *in = yyIn;
3689     int pos0 = yyPos0;
3690     int pos = yyPos;
3691     int len = yyLen;
3692     int ch = yyCh;
3693     QRegExpCharClass charClass;
3694     if (yyTok == Tok_CharClass)
3695         charClass = *yyCharClass;
3696     int tok = yyTok;
3697     bool mayCapture = yyMayCapture;
3698 #endif
3699 
3700     parseAtom(box);
3701 #ifndef QT_NO_REGEXP_CAPTURE
3702     finishAtom(innerAtom, magicLeftParen);
3703 #endif
3704 
3705     bool hasQuantifier = (yyTok == Tok_Quantifier);
3706     if (hasQuantifier) {
3707 #ifndef QT_NO_REGEXP_OPTIM
3708         trivial = false;
3709 #endif
3710         if (yyMaxRep == InftyRep) {
3711             box->plus(innerAtom);
3712 #ifndef QT_NO_REGEXP_INTERVAL
3713         } else if (yyMaxRep == 0) {
3714             box->clear();
3715 #endif
3716         }
3717         if (yyMinRep == 0)
3718             box->opt();
3719 
3720 #ifndef QT_NO_REGEXP_INTERVAL
3721         yyMayCapture = false;
3722         int alpha = (yyMinRep == 0) ? 0 : yyMinRep - 1;
3723         int beta = (yyMaxRep == InftyRep) ? 0 : yyMaxRep - (alpha + 1);
3724 
3725         Box rightBox(this);
3726         int i;
3727 
3728         for (i = 0; i < beta; i++) {
3729             YYREDO();
3730             Box leftBox(this);
3731             parseAtom(&leftBox);
3732             leftBox.cat(rightBox);
3733             leftBox.opt();
3734             rightBox = leftBox;
3735         }
3736         for (i = 0; i < alpha; i++) {
3737             YYREDO();
3738             Box leftBox(this);
3739             parseAtom(&leftBox);
3740             leftBox.cat(rightBox);
3741             rightBox = leftBox;
3742         }
3743         rightBox.cat(*box);
3744         *box = rightBox;
3745 #endif
3746         yyTok = getToken();
3747 #ifndef QT_NO_REGEXP_INTERVAL
3748         yyMayCapture = mayCapture;
3749 #endif
3750     }
3751 #undef YYREDO
3752 #ifndef QT_NO_REGEXP_CAPTURE
3753     if (greedyQuantifiers)
3754         finishAtom(outerAtom, hasQuantifier);
3755 #endif
3756 }
3757 
parseTerm(Box * box)3758 void QRegExpEngine::parseTerm(Box *box)
3759 {
3760 #ifndef QT_NO_REGEXP_OPTIM
3761     if (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar)
3762         parseFactor(box);
3763 #endif
3764     while (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) {
3765         Box rightBox(this);
3766         parseFactor(&rightBox);
3767         box->cat(rightBox);
3768     }
3769 }
3770 
parseExpression(Box * box)3771 void QRegExpEngine::parseExpression(Box *box)
3772 {
3773     parseTerm(box);
3774     while (yyTok == Tok_Bar) {
3775 #ifndef QT_NO_REGEXP_OPTIM
3776         trivial = false;
3777 #endif
3778         Box rightBox(this);
3779         yyTok = getToken();
3780         parseTerm(&rightBox);
3781         box->orx(rightBox);
3782     }
3783 }
3784 
3785 /*
3786   The struct QRegExpPrivate contains the private data of a regular
3787   expression other than the automaton. It makes it possible for many
3788   QRegExp objects to use the same QRegExpEngine object with different
3789   QRegExpPrivate objects.
3790 */
3791 struct QRegExpPrivate
3792 {
3793     QRegExpEngine *eng;
3794     QRegExpEngineKey engineKey;
3795     bool minimal;
3796 #ifndef QT_NO_REGEXP_CAPTURE
3797     QString t; // last string passed to QRegExp::indexIn() or lastIndexIn()
3798     QStringList capturedCache; // what QRegExp::capturedTexts() returned last
3799 #endif
3800     QRegExpMatchState matchState;
3801 
QRegExpPrivateQRegExpPrivate3802     inline QRegExpPrivate()
3803         : eng(nullptr), engineKey(QString(), QRegExp::RegExp, Qt::CaseSensitive), minimal(false) { }
QRegExpPrivateQRegExpPrivate3804     inline QRegExpPrivate(const QRegExpEngineKey &key)
3805         : eng(nullptr), engineKey(key), minimal(false) {}
3806 };
3807 
3808 #if !defined(QT_NO_REGEXP_OPTIM)
3809 struct QRECache
3810 {
3811     typedef QHash<QRegExpEngineKey, QRegExpEngine *> EngineCache;
3812     typedef QCache<QRegExpEngineKey, QRegExpEngine> UnusedEngineCache;
3813     EngineCache usedEngines;
3814     UnusedEngineCache unusedEngines;
3815 };
3816 Q_GLOBAL_STATIC(QRECache, engineCache)
3817 static QBasicMutex engineCacheMutex;
3818 #endif // QT_NO_REGEXP_OPTIM
3819 
derefEngine(QRegExpEngine * eng,const QRegExpEngineKey & key)3820 static void derefEngine(QRegExpEngine *eng, const QRegExpEngineKey &key)
3821 {
3822 #if !defined(QT_NO_REGEXP_OPTIM)
3823     const auto locker = qt_scoped_lock(engineCacheMutex);
3824     if (!eng->ref.deref()) {
3825         if (QRECache *c = engineCache()) {
3826             c->unusedEngines.insert(key, eng, 4 + key.pattern.length() / 4);
3827             c->usedEngines.remove(key);
3828         } else {
3829             delete eng;
3830         }
3831     }
3832 #else
3833     Q_UNUSED(key);
3834     if (!eng->ref.deref())
3835         delete eng;
3836 #endif
3837 }
3838 
prepareEngine_helper(QRegExpPrivate * priv)3839 static void prepareEngine_helper(QRegExpPrivate *priv)
3840 {
3841     Q_ASSERT(!priv->eng);
3842 
3843 #if !defined(QT_NO_REGEXP_OPTIM)
3844     const auto locker = qt_scoped_lock(engineCacheMutex);
3845     if (QRECache *c = engineCache()) {
3846         priv->eng = c->unusedEngines.take(priv->engineKey);
3847         if (!priv->eng)
3848             priv->eng = c->usedEngines.value(priv->engineKey);
3849         if (!priv->eng)
3850             priv->eng = new QRegExpEngine(priv->engineKey);
3851         else
3852             priv->eng->ref.ref();
3853 
3854         c->usedEngines.insert(priv->engineKey, priv->eng);
3855         return;
3856     }
3857 #endif // QT_NO_REGEXP_OPTIM
3858 
3859     priv->eng = new QRegExpEngine(priv->engineKey);
3860 }
3861 
prepareEngine(QRegExpPrivate * priv)3862 inline static void prepareEngine(QRegExpPrivate *priv)
3863 {
3864     if (priv->eng)
3865         return;
3866     prepareEngine_helper(priv);
3867     priv->matchState.prepareForMatch(priv->eng);
3868 }
3869 
prepareEngineForMatch(QRegExpPrivate * priv,const QString & str)3870 static void prepareEngineForMatch(QRegExpPrivate *priv, const QString &str)
3871 {
3872     prepareEngine(priv);
3873     priv->matchState.prepareForMatch(priv->eng);
3874 #ifndef QT_NO_REGEXP_CAPTURE
3875     priv->t = str;
3876     priv->capturedCache.clear();
3877 #else
3878     Q_UNUSED(str);
3879 #endif
3880 }
3881 
invalidateEngine(QRegExpPrivate * priv)3882 static void invalidateEngine(QRegExpPrivate *priv)
3883 {
3884     if (priv->eng) {
3885         derefEngine(priv->eng, priv->engineKey);
3886         priv->eng = nullptr;
3887         priv->matchState.drain();
3888     }
3889 }
3890 
3891 /*!
3892     \enum QRegExp::CaretMode
3893 
3894     The CaretMode enum defines the different meanings of the caret
3895     (\b{^}) in a regular expression. The possible values are:
3896 
3897     \value CaretAtZero
3898            The caret corresponds to index 0 in the searched string.
3899 
3900     \value CaretAtOffset
3901            The caret corresponds to the start offset of the search.
3902 
3903     \value CaretWontMatch
3904            The caret never matches.
3905 */
3906 
3907 /*!
3908     \enum QRegExp::PatternSyntax
3909 
3910     The syntax used to interpret the meaning of the pattern.
3911 
3912     \value RegExp A rich Perl-like pattern matching syntax. This is
3913     the default.
3914 
3915     \value RegExp2 Like RegExp, but with \l{greedy quantifiers}.
3916     (Introduced in Qt 4.2.)
3917 
3918     \value Wildcard This provides a simple pattern matching syntax
3919     similar to that used by shells (command interpreters) for "file
3920     globbing". See \l{QRegExp wildcard matching}.
3921 
3922     \value WildcardUnix This is similar to Wildcard but with the
3923     behavior of a Unix shell. The wildcard characters can be escaped
3924     with the character "\\".
3925 
3926     \value FixedString The pattern is a fixed string. This is
3927     equivalent to using the RegExp pattern on a string in
3928     which all metacharacters are escaped using escape().
3929 
3930     \value W3CXmlSchema11 The pattern is a regular expression as
3931     defined by the W3C XML Schema 1.1 specification.
3932 
3933     \sa setPatternSyntax()
3934 */
3935 
3936 /*!
3937     Constructs an empty regexp.
3938 
3939     \sa isValid(), errorString()
3940 */
QRegExp()3941 QRegExp::QRegExp()
3942 {
3943     priv = new QRegExpPrivate;
3944     prepareEngine(priv);
3945 }
3946 
3947 /*!
3948     Constructs a regular expression object for the given \a pattern
3949     string. The pattern must be given using wildcard notation if \a
3950     syntax is \l Wildcard; the default is \l RegExp. The pattern is
3951     case sensitive, unless \a cs is Qt::CaseInsensitive. Matching is
3952     greedy (maximal), but can be changed by calling
3953     setMinimal().
3954 
3955     \sa setPattern(), setCaseSensitivity(), setPatternSyntax()
3956 */
QRegExp(const QString & pattern,Qt::CaseSensitivity cs,PatternSyntax syntax)3957 QRegExp::QRegExp(const QString &pattern, Qt::CaseSensitivity cs, PatternSyntax syntax)
3958 {
3959     priv = new QRegExpPrivate(QRegExpEngineKey(pattern, syntax, cs));
3960     prepareEngine(priv);
3961 }
3962 
3963 /*!
3964     Constructs a regular expression as a copy of \a rx.
3965 
3966     \sa operator=()
3967 */
QRegExp(const QRegExp & rx)3968 QRegExp::QRegExp(const QRegExp &rx)
3969 {
3970     priv = new QRegExpPrivate;
3971     operator=(rx);
3972 }
3973 
3974 /*!
3975     Destroys the regular expression and cleans up its internal data.
3976 */
~QRegExp()3977 QRegExp::~QRegExp()
3978 {
3979     invalidateEngine(priv);
3980     delete priv;
3981 }
3982 
3983 /*!
3984     Copies the regular expression \a rx and returns a reference to the
3985     copy. The case sensitivity, wildcard, and minimal matching options
3986     are also copied.
3987 */
operator =(const QRegExp & rx)3988 QRegExp &QRegExp::operator=(const QRegExp &rx)
3989 {
3990     prepareEngine(rx.priv); // to allow sharing
3991     QRegExpEngine *otherEng = rx.priv->eng;
3992     if (otherEng)
3993         otherEng->ref.ref();
3994     invalidateEngine(priv);
3995     priv->eng = otherEng;
3996     priv->engineKey = rx.priv->engineKey;
3997     priv->minimal = rx.priv->minimal;
3998 #ifndef QT_NO_REGEXP_CAPTURE
3999     priv->t = rx.priv->t;
4000     priv->capturedCache = rx.priv->capturedCache;
4001 #endif
4002     if (priv->eng)
4003         priv->matchState.prepareForMatch(priv->eng);
4004     priv->matchState.captured = rx.priv->matchState.captured;
4005     return *this;
4006 }
4007 
4008 /*!
4009     \fn QRegExp &QRegExp::operator=(QRegExp &&other)
4010 
4011     Move-assigns \a other to this QRegExp instance.
4012 
4013     \since 5.2
4014 */
4015 
4016 /*!
4017     \fn void QRegExp::swap(QRegExp &other)
4018     \since 4.8
4019 
4020     Swaps regular expression \a other with this regular
4021     expression. This operation is very fast and never fails.
4022 */
4023 
4024 /*!
4025     Returns \c true if this regular expression is equal to \a rx;
4026     otherwise returns \c false.
4027 
4028     Two QRegExp objects are equal if they have the same pattern
4029     strings and the same settings for case sensitivity, wildcard and
4030     minimal matching.
4031 */
operator ==(const QRegExp & rx) const4032 bool QRegExp::operator==(const QRegExp &rx) const
4033 {
4034     return priv->engineKey == rx.priv->engineKey && priv->minimal == rx.priv->minimal;
4035 }
4036 
4037 /*!
4038     \since 5.6
4039     \relates QRegExp
4040 
4041     Returns the hash value for \a key, using
4042     \a seed to seed the calculation.
4043 */
qHash(const QRegExp & key,uint seed)4044 uint qHash(const QRegExp &key, uint seed) noexcept
4045 {
4046     QtPrivate::QHashCombine hash;
4047     seed = hash(seed, key.priv->engineKey);
4048     seed = hash(seed, key.priv->minimal);
4049     return seed;
4050 }
4051 
4052 /*!
4053     \fn bool QRegExp::operator!=(const QRegExp &rx) const
4054 
4055     Returns \c true if this regular expression is not equal to \a rx;
4056     otherwise returns \c false.
4057 
4058     \sa operator==()
4059 */
4060 
4061 /*!
4062     Returns \c true if the pattern string is empty; otherwise returns
4063     false.
4064 
4065     If you call exactMatch() with an empty pattern on an empty string
4066     it will return true; otherwise it returns \c false since it operates
4067     over the whole string. If you call indexIn() with an empty pattern
4068     on \e any string it will return the start offset (0 by default)
4069     because the empty pattern matches the 'emptiness' at the start of
4070     the string. In this case the length of the match returned by
4071     matchedLength() will be 0.
4072 
4073     See QString::isEmpty().
4074 */
4075 
isEmpty() const4076 bool QRegExp::isEmpty() const
4077 {
4078     return priv->engineKey.pattern.isEmpty();
4079 }
4080 
4081 /*!
4082     Returns \c true if the regular expression is valid; otherwise returns
4083     false. An invalid regular expression never matches.
4084 
4085     The pattern \b{[a-z} is an example of an invalid pattern, since
4086     it lacks a closing square bracket.
4087 
4088     Note that the validity of a regexp may also depend on the setting
4089     of the wildcard flag, for example \b{*.html} is a valid
4090     wildcard regexp but an invalid full regexp.
4091 
4092     \sa errorString()
4093 */
isValid() const4094 bool QRegExp::isValid() const
4095 {
4096     if (priv->engineKey.pattern.isEmpty()) {
4097         return true;
4098     } else {
4099         prepareEngine(priv);
4100         return priv->eng->isValid();
4101     }
4102 }
4103 
4104 /*!
4105     Returns the pattern string of the regular expression. The pattern
4106     has either regular expression syntax or wildcard syntax, depending
4107     on patternSyntax().
4108 
4109     \sa patternSyntax(), caseSensitivity()
4110 */
pattern() const4111 QString QRegExp::pattern() const
4112 {
4113     return priv->engineKey.pattern;
4114 }
4115 
4116 /*!
4117     Sets the pattern string to \a pattern. The case sensitivity,
4118     wildcard, and minimal matching options are not changed.
4119 
4120     \sa setPatternSyntax(), setCaseSensitivity()
4121 */
setPattern(const QString & pattern)4122 void QRegExp::setPattern(const QString &pattern)
4123 {
4124     if (priv->engineKey.pattern != pattern) {
4125         invalidateEngine(priv);
4126         priv->engineKey.pattern = pattern;
4127     }
4128 }
4129 
4130 /*!
4131     Returns Qt::CaseSensitive if the regexp is matched case
4132     sensitively; otherwise returns Qt::CaseInsensitive.
4133 
4134     \sa patternSyntax(), pattern(), isMinimal()
4135 */
caseSensitivity() const4136 Qt::CaseSensitivity QRegExp::caseSensitivity() const
4137 {
4138     return priv->engineKey.cs;
4139 }
4140 
4141 /*!
4142     Sets case sensitive matching to \a cs.
4143 
4144     If \a cs is Qt::CaseSensitive, \b{\\.txt$} matches
4145     \c{readme.txt} but not \c{README.TXT}.
4146 
4147     \sa setPatternSyntax(), setPattern(), setMinimal()
4148 */
setCaseSensitivity(Qt::CaseSensitivity cs)4149 void QRegExp::setCaseSensitivity(Qt::CaseSensitivity cs)
4150 {
4151     if ((bool)cs != (bool)priv->engineKey.cs) {
4152         invalidateEngine(priv);
4153         priv->engineKey.cs = cs;
4154     }
4155 }
4156 
4157 /*!
4158     Returns the syntax used by the regular expression. The default is
4159     QRegExp::RegExp.
4160 
4161     \sa pattern(), caseSensitivity()
4162 */
patternSyntax() const4163 QRegExp::PatternSyntax QRegExp::patternSyntax() const
4164 {
4165     return priv->engineKey.patternSyntax;
4166 }
4167 
4168 /*!
4169     Sets the syntax mode for the regular expression. The default is
4170     QRegExp::RegExp.
4171 
4172     Setting \a syntax to QRegExp::Wildcard enables simple shell-like
4173     \l{QRegExp wildcard matching}. For example, \b{r*.txt} matches the
4174     string \c{readme.txt} in wildcard mode, but does not match
4175     \c{readme}.
4176 
4177     Setting \a syntax to QRegExp::FixedString means that the pattern
4178     is interpreted as a plain string. Special characters (e.g.,
4179     backslash) don't need to be escaped then.
4180 
4181     \sa setPattern(), setCaseSensitivity(), escape()
4182 */
setPatternSyntax(PatternSyntax syntax)4183 void QRegExp::setPatternSyntax(PatternSyntax syntax)
4184 {
4185     if (syntax != priv->engineKey.patternSyntax) {
4186         invalidateEngine(priv);
4187         priv->engineKey.patternSyntax = syntax;
4188     }
4189 }
4190 
4191 /*!
4192     Returns \c true if minimal (non-greedy) matching is enabled;
4193     otherwise returns \c false.
4194 
4195     \sa caseSensitivity(), setMinimal()
4196 */
isMinimal() const4197 bool QRegExp::isMinimal() const
4198 {
4199     return priv->minimal;
4200 }
4201 
4202 /*!
4203     Enables or disables minimal matching. If \a minimal is false,
4204     matching is greedy (maximal) which is the default.
4205 
4206     For example, suppose we have the input string "We must be
4207     <b>bold</b>, very <b>bold</b>!" and the pattern
4208     \b{<b>.*</b>}. With the default greedy (maximal) matching,
4209     the match is "We must be \underline{<b>bold</b>, very
4210     <b>bold</b>}!". But with minimal (non-greedy) matching, the
4211     first match is: "We must be \underline{<b>bold</b>}, very
4212     <b>bold</b>!" and the second match is "We must be <b>bold</b>,
4213     very \underline{<b>bold</b>}!". In practice we might use the pattern
4214     \b{<b>[^<]*\</b>} instead, although this will still fail for
4215     nested tags.
4216 
4217     \sa setCaseSensitivity()
4218 */
setMinimal(bool minimal)4219 void QRegExp::setMinimal(bool minimal)
4220 {
4221     priv->minimal = minimal;
4222 }
4223 
4224 // ### Qt 5: make non-const
4225 /*!
4226     Returns \c true if \a str is matched exactly by this regular
4227     expression; otherwise returns \c false. You can determine how much of
4228     the string was matched by calling matchedLength().
4229 
4230     For a given regexp string R, exactMatch("R") is the equivalent of
4231     indexIn("^R$") since exactMatch() effectively encloses the regexp
4232     in the start of string and end of string anchors, except that it
4233     sets matchedLength() differently.
4234 
4235     For example, if the regular expression is \b{blue}, then
4236     exactMatch() returns \c true only for input \c blue. For inputs \c
4237     bluebell, \c blutak and \c lightblue, exactMatch() returns \c false
4238     and matchedLength() will return 4, 3 and 0 respectively.
4239 
4240     Although const, this function sets matchedLength(),
4241     capturedTexts(), and pos().
4242 
4243     \sa indexIn(), lastIndexIn()
4244 */
exactMatch(const QString & str) const4245 bool QRegExp::exactMatch(const QString &str) const
4246 {
4247     prepareEngineForMatch(priv, str);
4248     priv->matchState.match(str.unicode(), str.length(), 0, priv->minimal, true, 0);
4249     if (priv->matchState.captured[1] == str.length()) {
4250         return true;
4251     } else {
4252         priv->matchState.captured[0] = 0;
4253         priv->matchState.captured[1] = priv->matchState.oneTestMatchedLen;
4254         return false;
4255     }
4256 }
4257 
4258 // ### Qt 5: make non-const
4259 /*!
4260     Attempts to find a match in \a str from position \a offset (0 by
4261     default). If \a offset is -1, the search starts at the last
4262     character; if -2, at the next to last character; etc.
4263 
4264     Returns the position of the first match, or -1 if there was no
4265     match.
4266 
4267     The \a caretMode parameter can be used to instruct whether \b{^}
4268     should match at index 0 or at \a offset.
4269 
4270     You might prefer to use QString::indexOf(), QString::contains(),
4271     or even QStringList::filter(). To replace matches use
4272     QString::replace().
4273 
4274     Example:
4275     \snippet code/src_corelib_tools_qregexp.cpp 13
4276 
4277     Although const, this function sets matchedLength(),
4278     capturedTexts() and pos().
4279 
4280     If the QRegExp is a wildcard expression (see setPatternSyntax())
4281     and want to test a string against the whole wildcard expression,
4282     use exactMatch() instead of this function.
4283 
4284     \sa lastIndexIn(), exactMatch()
4285 */
4286 
indexIn(const QString & str,int offset,CaretMode caretMode) const4287 int QRegExp::indexIn(const QString &str, int offset, CaretMode caretMode) const
4288 {
4289     prepareEngineForMatch(priv, str);
4290     if (offset < 0)
4291         offset += str.length();
4292     priv->matchState.match(str.unicode(), str.length(), offset,
4293         priv->minimal, false, caretIndex(offset, caretMode));
4294     return priv->matchState.captured[0];
4295 }
4296 
4297 // ### Qt 5: make non-const
4298 /*!
4299     Attempts to find a match backwards in \a str from position \a
4300     offset. If \a offset is -1 (the default), the search starts at the
4301     last character; if -2, at the next to last character; etc.
4302 
4303     Returns the position of the first match, or -1 if there was no
4304     match.
4305 
4306     The \a caretMode parameter can be used to instruct whether \b{^}
4307     should match at index 0 or at \a offset.
4308 
4309     Although const, this function sets matchedLength(),
4310     capturedTexts() and pos().
4311 
4312     \warning Searching backwards is much slower than searching
4313     forwards.
4314 
4315     \sa indexIn(), exactMatch()
4316 */
4317 
lastIndexIn(const QString & str,int offset,CaretMode caretMode) const4318 int QRegExp::lastIndexIn(const QString &str, int offset, CaretMode caretMode) const
4319 {
4320     prepareEngineForMatch(priv, str);
4321     if (offset < 0)
4322         offset += str.length();
4323     if (offset < 0 || offset > str.length()) {
4324         memset(priv->matchState.captured, -1, priv->matchState.capturedSize*sizeof(int));
4325         return -1;
4326     }
4327 
4328     while (offset >= 0) {
4329         priv->matchState.match(str.unicode(), str.length(), offset,
4330             priv->minimal, true, caretIndex(offset, caretMode));
4331         if (priv->matchState.captured[0] == offset)
4332             return offset;
4333         --offset;
4334     }
4335     return -1;
4336 }
4337 
4338 /*!
4339     Returns the length of the last matched string, or -1 if there was
4340     no match.
4341 
4342     \sa exactMatch(), indexIn(), lastIndexIn()
4343 */
matchedLength() const4344 int QRegExp::matchedLength() const
4345 {
4346     return priv->matchState.captured[1];
4347 }
4348 
4349 #ifndef QT_NO_REGEXP_CAPTURE
4350 
4351 /*!
4352   \since 4.6
4353   Returns the number of captures contained in the regular expression.
4354  */
captureCount() const4355 int QRegExp::captureCount() const
4356 {
4357     prepareEngine(priv);
4358     return priv->eng->captureCount();
4359 }
4360 
4361 /*!
4362     Returns a list of the captured text strings.
4363 
4364     The first string in the list is the entire matched string. Each
4365     subsequent list element contains a string that matched a
4366     (capturing) subexpression of the regexp.
4367 
4368     For example:
4369     \snippet code/src_corelib_tools_qregexp.cpp 14
4370 
4371     The above example also captures elements that may be present but
4372     which we have no interest in. This problem can be solved by using
4373     non-capturing parentheses:
4374 
4375     \snippet code/src_corelib_tools_qregexp.cpp 15
4376 
4377     Note that if you want to iterate over the list, you should iterate
4378     over a copy, e.g.
4379     \snippet code/src_corelib_tools_qregexp.cpp 16
4380 
4381     Some regexps can match an indeterminate number of times. For
4382     example if the input string is "Offsets: 12 14 99 231 7" and the
4383     regexp, \c{rx}, is \b{(\\d+)+}, we would hope to get a list of
4384     all the numbers matched. However, after calling
4385     \c{rx.indexIn(str)}, capturedTexts() will return the list ("12",
4386     "12"), i.e. the entire match was "12" and the first subexpression
4387     matched was "12". The correct approach is to use cap() in a
4388     \l{QRegExp#cap_in_a_loop}{loop}.
4389 
4390     The order of elements in the string list is as follows. The first
4391     element is the entire matching string. Each subsequent element
4392     corresponds to the next capturing open left parentheses. Thus
4393     capturedTexts()[1] is the text of the first capturing parentheses,
4394     capturedTexts()[2] is the text of the second and so on
4395     (corresponding to $1, $2, etc., in some other regexp languages).
4396 
4397     \sa cap(), pos()
4398 */
capturedTexts() const4399 QStringList QRegExp::capturedTexts() const
4400 {
4401     if (priv->capturedCache.isEmpty()) {
4402         prepareEngine(priv);
4403         const int *captured = priv->matchState.captured;
4404         int n = priv->matchState.capturedSize;
4405 
4406         for (int i = 0; i < n; i += 2) {
4407             QString m;
4408             if (captured[i + 1] == 0)
4409                 m = QLatin1String(""); // ### Qt 5: don't distinguish between null and empty
4410             else if (captured[i] >= 0)
4411                 m = priv->t.mid(captured[i], captured[i + 1]);
4412             priv->capturedCache.append(m);
4413         }
4414         priv->t.clear();
4415     }
4416     return priv->capturedCache;
4417 }
4418 
4419 /*!
4420     \internal
4421 */
capturedTexts()4422 QStringList QRegExp::capturedTexts()
4423 {
4424     return const_cast<const QRegExp *>(this)->capturedTexts();
4425 }
4426 
4427 /*!
4428     Returns the text captured by the \a nth subexpression. The entire
4429     match has index 0 and the parenthesized subexpressions have
4430     indexes starting from 1 (excluding non-capturing parentheses).
4431 
4432     \snippet code/src_corelib_tools_qregexp.cpp 17
4433 
4434     The order of elements matched by cap() is as follows. The first
4435     element, cap(0), is the entire matching string. Each subsequent
4436     element corresponds to the next capturing open left parentheses.
4437     Thus cap(1) is the text of the first capturing parentheses, cap(2)
4438     is the text of the second, and so on.
4439 
4440     \sa capturedTexts(), pos()
4441 */
cap(int nth) const4442 QString QRegExp::cap(int nth) const
4443 {
4444     return capturedTexts().value(nth);
4445 }
4446 
4447 /*!
4448     \internal
4449 */
cap(int nth)4450 QString QRegExp::cap(int nth)
4451 {
4452     return const_cast<const QRegExp *>(this)->cap(nth);
4453 }
4454 
4455 /*!
4456     Returns the position of the \a nth captured text in the searched
4457     string. If \a nth is 0 (the default), pos() returns the position
4458     of the whole match.
4459 
4460     Example:
4461     \snippet code/src_corelib_tools_qregexp.cpp 18
4462 
4463     For zero-length matches, pos() always returns -1. (For example, if
4464     cap(4) would return an empty string, pos(4) returns -1.) This is
4465     a feature of the implementation.
4466 
4467     \sa cap(), capturedTexts()
4468 */
pos(int nth) const4469 int QRegExp::pos(int nth) const
4470 {
4471     if (nth < 0 || nth >= priv->matchState.capturedSize / 2)
4472         return -1;
4473     else
4474         return priv->matchState.captured[2 * nth];
4475 }
4476 
4477 /*!
4478     \internal
4479 */
pos(int nth)4480 int QRegExp::pos(int nth)
4481 {
4482     return const_cast<const QRegExp *>(this)->pos(nth);
4483 }
4484 
4485 /*!
4486   Returns a text string that explains why a regexp pattern is
4487   invalid the case being; otherwise returns "no error occurred".
4488 
4489   \sa isValid()
4490 */
errorString() const4491 QString QRegExp::errorString() const
4492 {
4493     if (isValid()) {
4494         return QString::fromLatin1(RXERR_OK);
4495     } else {
4496         return priv->eng->errorString();
4497     }
4498 }
4499 
4500 /*!
4501     \internal
4502 */
errorString()4503 QString QRegExp::errorString()
4504 {
4505     return const_cast<const QRegExp *>(this)->errorString();
4506 }
4507 #endif
4508 
4509 /*!
4510     Returns the string \a str with every regexp special character
4511     escaped with a backslash. The special characters are $, (,), *, +,
4512     ., ?, [, \,], ^, {, | and }.
4513 
4514     Example:
4515 
4516     \snippet code/src_corelib_tools_qregexp.cpp 19
4517 
4518     This function is useful to construct regexp patterns dynamically:
4519 
4520     \snippet code/src_corelib_tools_qregexp.cpp 20
4521 
4522     \sa setPatternSyntax()
4523 */
escape(const QString & str)4524 QString QRegExp::escape(const QString &str)
4525 {
4526     QString quoted;
4527     const int count = str.count();
4528     quoted.reserve(count * 2);
4529     const QLatin1Char backslash('\\');
4530     for (int i = 0; i < count; i++) {
4531         switch (str.at(i).toLatin1()) {
4532         case '$':
4533         case '(':
4534         case ')':
4535         case '*':
4536         case '+':
4537         case '.':
4538         case '?':
4539         case '[':
4540         case '\\':
4541         case ']':
4542         case '^':
4543         case '{':
4544         case '|':
4545         case '}':
4546             quoted.append(backslash);
4547         }
4548         quoted.append(str.at(i));
4549     }
4550     return quoted;
4551 }
4552 
4553 
4554 #ifndef QT_NO_DATASTREAM
4555 /*!
4556     \relates QRegExp
4557 
4558     Writes the regular expression \a regExp to stream \a out.
4559 
4560     \sa {Serializing Qt Data Types}
4561 */
operator <<(QDataStream & out,const QRegExp & regExp)4562 QDataStream &operator<<(QDataStream &out, const QRegExp &regExp)
4563 {
4564     return out << regExp.pattern() << (quint8)regExp.caseSensitivity()
4565                << (quint8)regExp.patternSyntax()
4566                << (quint8)!!regExp.isMinimal();
4567 }
4568 
4569 /*!
4570     \relates QRegExp
4571 
4572     Reads a regular expression from stream \a in into \a regExp.
4573 
4574     \sa {Serializing Qt Data Types}
4575 */
operator >>(QDataStream & in,QRegExp & regExp)4576 QDataStream &operator>>(QDataStream &in, QRegExp &regExp)
4577 {
4578     QString pattern;
4579     quint8 cs;
4580     quint8 patternSyntax;
4581     quint8 isMinimal;
4582 
4583     in >> pattern >> cs >> patternSyntax >> isMinimal;
4584 
4585     QRegExp newRegExp(pattern, Qt::CaseSensitivity(cs),
4586                       QRegExp::PatternSyntax(patternSyntax));
4587 
4588     newRegExp.setMinimal(isMinimal);
4589     regExp = newRegExp;
4590     return in;
4591 }
4592 #endif // QT_NO_DATASTREAM
4593 
4594 #ifndef QT_NO_DEBUG_STREAM
operator <<(QDebug dbg,const QRegExp & r)4595 QDebug operator<<(QDebug dbg, const QRegExp &r)
4596 {
4597     QDebugStateSaver saver(dbg);
4598     dbg.nospace() << "QRegExp(patternSyntax=" << r.patternSyntax()
4599                   << ", pattern='"<< r.pattern() << "')";
4600     return dbg;
4601 }
4602 #endif
4603 
4604 QT_END_NAMESPACE
4605