1 /****************************************************************************
2 **
3 ** Copyright (C) 2015 The Qt Company Ltd.
4 ** Contact: http://www.qt.io/licensing/
5 **
6 ** This file is part of the QtCore module of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and The Qt Company. For licensing terms
14 ** and conditions see http://www.qt.io/terms-conditions. For further
15 ** information use the contact form at http://www.qt.io/contact-us.
16 **
17 ** GNU Lesser General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU Lesser
19 ** General Public License version 2.1 or version 3 as published by the Free
20 ** Software Foundation and appearing in the file LICENSE.LGPLv21 and
21 ** LICENSE.LGPLv3 included in the packaging of this file. Please review the
22 ** following information to ensure the GNU Lesser General Public License
23 ** requirements will be met: https://www.gnu.org/licenses/lgpl.html and
24 ** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
25 **
26 ** As a special exception, The Qt Company gives you certain additional
27 ** rights. These rights are described in The Qt Company LGPL Exception
28 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
29 **
30 ** GNU General Public License Usage
31 ** Alternatively, this file may be used under the terms of the GNU
32 ** General Public License version 3.0 as published by the Free Software
33 ** Foundation and appearing in the file LICENSE.GPL included in the
34 ** packaging of this file. Please review the following information to
35 ** ensure the GNU General Public License version 3.0 requirements will be
36 ** met: http://www.gnu.org/copyleft/gpl.html.
37 **
38 ** $QT_END_LICENSE$
39 **
40 ****************************************************************************/
41
42 #include "qregexp.h"
43
44 #include "qalgorithms.h"
45 #include "qbitarray.h"
46 #include "qcache.h"
47 #include "qdatastream.h"
48 #include "qlist.h"
49 #include "qmap.h"
50 #include "qmutex.h"
51 #include "qstring.h"
52 #include "qstringlist.h"
53 #include "qstringmatcher.h"
54 #include "qvector.h"
55 #include "private/qfunctions_p.h"
56
57 #include <limits.h>
58
59 QT_BEGIN_NAMESPACE
60
61 int qFindString(const QChar *haystack, int haystackLen, int from,
62 const QChar *needle, int needleLen, Qt::CaseSensitivity cs);
63
64 // error strings for the regexp parser
65 #define RXERR_OK QT_TRANSLATE_NOOP("QRegExp", "no error occurred")
66 #define RXERR_DISABLED QT_TRANSLATE_NOOP("QRegExp", "disabled feature used")
67 #define RXERR_CHARCLASS QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax")
68 #define RXERR_LOOKAHEAD QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax")
69 #define RXERR_LOOKBEHIND QT_TRANSLATE_NOOP("QRegExp", "lookbehinds not supported, see QTBUG-2371")
70 #define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax")
71 #define RXERR_OCTAL QT_TRANSLATE_NOOP("QRegExp", "invalid octal value")
72 #define RXERR_LEFTDELIM QT_TRANSLATE_NOOP("QRegExp", "missing left delim")
73 #define RXERR_END QT_TRANSLATE_NOOP("QRegExp", "unexpected end")
74 #define RXERR_LIMIT QT_TRANSLATE_NOOP("QRegExp", "met internal limit")
75 #define RXERR_INTERVAL QT_TRANSLATE_NOOP("QRegExp", "invalid interval")
76 #define RXERR_CATEGORY QT_TRANSLATE_NOOP("QRegExp", "invalid category")
77
78 /*!
79 \class QRegExp
80 \reentrant
81 \brief The QRegExp class provides pattern matching using regular expressions.
82
83 \ingroup tools
84 \ingroup shared
85
86 \keyword regular expression
87
88 A regular expression, or "regexp", is a pattern for matching
89 substrings in a text. This is useful in many contexts, e.g.,
90
91 \table
92 \row \i Validation
93 \i A regexp can test whether a substring meets some criteria,
94 e.g. is an integer or contains no whitespace.
95 \row \i Searching
96 \i A regexp provides more powerful pattern matching than
97 simple substring matching, e.g., match one of the words
98 \e{mail}, \e{letter} or \e{correspondence}, but none of the
99 words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc.
100 \row \i Search and Replace
101 \i A regexp can replace all occurrences of a substring with a
102 different substring, e.g., replace all occurrences of \e{&}
103 with \e{\&} except where the \e{&} is already followed by
104 an \e{amp;}.
105 \row \i String Splitting
106 \i A regexp can be used to identify where a string should be
107 split apart, e.g. splitting tab-delimited strings.
108 \endtable
109
110 A brief introduction to regexps is presented, a description of
111 Qt's regexp language, some examples, and the function
112 documentation itself. QRegExp is modeled on Perl's regexp
113 language. It fully supports Unicode. QRegExp can also be used in a
114 simpler, \e{wildcard mode} that is similar to the functionality
115 found in command shells. The syntax rules used by QRegExp can be
116 changed with setPatternSyntax(). In particular, the pattern syntax
117 can be set to QRegExp::FixedString, which means the pattern to be
118 matched is interpreted as a plain string, i.e., special characters
119 (e.g., backslash) are not escaped.
120
121 A good text on regexps is \e {Mastering Regular Expressions}
122 (Third Edition) by Jeffrey E. F. Friedl, ISBN 0-596-52812-4.
123
124 \tableofcontents
125
126 \section1 Introduction
127
128 Regexps are built up from expressions, quantifiers, and
129 assertions. The simplest expression is a character, e.g. \bold{x}
130 or \bold{5}. An expression can also be a set of characters
131 enclosed in square brackets. \bold{[ABCD]} will match an \bold{A}
132 or a \bold{B} or a \bold{C} or a \bold{D}. We can write this same
133 expression as \bold{[A-D]}, and an experession to match any
134 captital letter in the English alphabet is written as
135 \bold{[A-Z]}.
136
137 A quantifier specifies the number of occurrences of an expression
138 that must be matched. \bold{x{1,1}} means match one and only one
139 \bold{x}. \bold{x{1,5}} means match a sequence of \bold{x}
140 characters that contains at least one \bold{x} but no more than
141 five.
142
143 Note that in general regexps cannot be used to check for balanced
144 brackets or tags. For example, a regexp can be written to match an
145 opening html \c{<b>} and its closing \c{</b>}, if the \c{<b>} tags
146 are not nested, but if the \c{<b>} tags are nested, that same
147 regexp will match an opening \c{<b>} tag with the wrong closing
148 \c{</b>}. For the fragment \c{<b>bold <b>bolder</b></b>}, the
149 first \c{<b>} would be matched with the first \c{</b>}, which is
150 not correct. However, it is possible to write a regexp that will
151 match nested brackets or tags correctly, but only if the number of
152 nesting levels is fixed and known. If the number of nesting levels
153 is not fixed and known, it is impossible to write a regexp that
154 will not fail.
155
156 Suppose we want a regexp to match integers in the range 0 to 99.
157 At least one digit is required, so we start with the expression
158 \bold{[0-9]{1,1}}, which matches a single digit exactly once. This
159 regexp matches integers in the range 0 to 9. To match integers up
160 to 99, increase the maximum number of occurrences to 2, so the
161 regexp becomes \bold{[0-9]{1,2}}. This regexp satisfies the
162 original requirement to match integers from 0 to 99, but it will
163 also match integers that occur in the middle of strings. If we
164 want the matched integer to be the whole string, we must use the
165 anchor assertions, \bold{^} (caret) and \bold{$} (dollar). When
166 \bold{^} is the first character in a regexp, it means the regexp
167 must match from the beginning of the string. When \bold{$} is the
168 last character of the regexp, it means the regexp must match to
169 the end of the string. The regexp becomes \bold{^[0-9]{1,2}$}.
170 Note that assertions, e.g. \bold{^} and \bold{$}, do not match
171 characters but locations in the string.
172
173 If you have seen regexps described elsewhere, they may have looked
174 different from the ones shown here. This is because some sets of
175 characters and some quantifiers are so common that they have been
176 given special symbols to represent them. \bold{[0-9]} can be
177 replaced with the symbol \bold{\\d}. The quantifier to match
178 exactly one occurrence, \bold{{1,1}}, can be replaced with the
179 expression itself, i.e. \bold{x{1,1}} is the same as \bold{x}. So
180 our 0 to 99 matcher could be written as \bold{^\\d{1,2}$}. It can
181 also be written \bold{^\\d\\d{0,1}$}, i.e. \e{From the start of
182 the string, match a digit, followed immediately by 0 or 1 digits}.
183 In practice, it would be written as \bold{^\\d\\d?$}. The \bold{?}
184 is shorthand for the quantifier \bold{{0,1}}, i.e. 0 or 1
185 occurrences. \bold{?} makes an expression optional. The regexp
186 \bold{^\\d\\d?$} means \e{From the beginning of the string, match
187 one digit, followed immediately by 0 or 1 more digit, followed
188 immediately by end of string}.
189
190 To write a regexp that matches one of the words 'mail' \e or
191 'letter' \e or 'correspondence' but does not match words that
192 contain these words, e.g., 'email', 'mailman', 'mailer', and
193 'letterbox', start with a regexp that matches 'mail'. Expressed
194 fully, the regexp is \bold{m{1,1}a{1,1}i{1,1}l{1,1}}, but because
195 a character expression is automatically quantified by
196 \bold{{1,1}}, we can simplify the regexp to \bold{mail}, i.e., an
197 'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now
198 we can use the vertical bar \bold{|}, which means \bold{or}, to
199 include the other two words, so our regexp for matching any of the
200 three words becomes \bold{mail|letter|correspondence}. Match
201 'mail' \bold{or} 'letter' \bold{or} 'correspondence'. While this
202 regexp will match one of the three words we want to match, it will
203 also match words we don't want to match, e.g., 'email'. To
204 prevent the regexp from matching unwanted words, we must tell it
205 to begin and end the match at word boundaries. First we enclose
206 our regexp in parentheses, \bold{(mail|letter|correspondence)}.
207 Parentheses group expressions together, and they identify a part
208 of the regexp that we wish to \l{capturing text}{capture}.
209 Enclosing the expression in parentheses allows us to use it as a
210 component in more complex regexps. It also allows us to examine
211 which of the three words was actually matched. To force the match
212 to begin and end on word boundaries, we enclose the regexp in
213 \bold{\\b} \e{word boundary} assertions:
214 \bold{\\b(mail|letter|correspondence)\\b}. Now the regexp means:
215 \e{Match a word boundary, followed by the regexp in parentheses,
216 followed by a word boundary}. The \bold{\\b} assertion matches a
217 \e position in the regexp, not a \e character. A word boundary is
218 any non-word character, e.g., a space, newline, or the beginning
219 or ending of a string.
220
221 If we want to replace ampersand characters with the HTML entity
222 \bold{\&}, the regexp to match is simply \bold{\&}. But this
223 regexp will also match ampersands that have already been converted
224 to HTML entities. We want to replace only ampersands that are not
225 already followed by \bold{amp;}. For this, we need the negative
226 lookahead assertion, \bold{(?!}__\bold{)}. The regexp can then be
227 written as \bold{\&(?!amp;)}, i.e. \e{Match an ampersand that is}
228 \bold{not} \e{followed by} \bold{amp;}.
229
230 If we want to count all the occurrences of 'Eric' and 'Eirik' in a
231 string, two valid solutions are \bold{\\b(Eric|Eirik)\\b} and
232 \bold{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is
233 required to avoid matching words that contain either name,
234 e.g. 'Ericsson'. Note that the second regexp matches more
235 spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'.
236
237 Some of the examples discussed above are implemented in the
238 \link #code-examples code examples \endlink section.
239
240 \target characters-and-abbreviations-for-sets-of-characters
241 \section1 Characters and Abbreviations for Sets of Characters
242
243 \table
244 \header \i Element \i Meaning
245 \row \i \bold{c}
246 \i A character represents itself unless it has a special
247 regexp meaning. e.g. \bold{c} matches the character \e c.
248 \row \i \bold{\\c}
249 \i A character that follows a backslash matches the character
250 itself, except as specified below. e.g., To match a literal
251 caret at the beginning of a string, write \bold{\\^}.
252 \row \i \bold{\\a}
253 \i Matches the ASCII bell (BEL, 0x07).
254 \row \i \bold{\\f}
255 \i Matches the ASCII form feed (FF, 0x0C).
256 \row \i \bold{\\n}
257 \i Matches the ASCII line feed (LF, 0x0A, Unix newline).
258 \row \i \bold{\\r}
259 \i Matches the ASCII carriage return (CR, 0x0D).
260 \row \i \bold{\\t}
261 \i Matches the ASCII horizontal tab (HT, 0x09).
262 \row \i \bold{\\v}
263 \i Matches the ASCII vertical tab (VT, 0x0B).
264 \row \i \bold{\\x\e{hhhh}}
265 \i Matches the Unicode character corresponding to the
266 hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF).
267 \row \i \bold{\\0\e{ooo}} (i.e., \\zero \e{ooo})
268 \i matches the ASCII/Latin1 character for the octal number
269 \e{ooo} (between 0 and 0377).
270 \row \i \bold{. (dot)}
271 \i Matches any character (including newline).
272 \row \i \bold{\\d}
273 \i Matches a digit (QChar::isDigit()).
274 \row \i \bold{\\D}
275 \i Matches a non-digit.
276 \row \i \bold{\\s}
277 \i Matches a whitespace character (QChar::isSpace()).
278 \row \i \bold{\\S}
279 \i Matches a non-whitespace character.
280 \row \i \bold{\\w}
281 \i Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_').
282 \row \i \bold{\\W}
283 \i Matches a non-word character.
284 \row \i \bold{\\\e{n}}
285 \i The \e{n}-th \l backreference, e.g. \\1, \\2, etc.
286 \endtable
287
288 \bold{Note:} The C++ compiler transforms backslashes in strings.
289 To include a \bold{\\} in a regexp, enter it twice, i.e. \c{\\}.
290 To match the backslash character itself, enter it four times, i.e.
291 \c{\\\\}.
292
293 \target sets-of-characters
294 \section1 Sets of Characters
295
296 Square brackets mean match any character contained in the square
297 brackets. The character set abbreviations described above can
298 appear in a character set in square brackets. Except for the
299 character set abbreviations and the following two exceptions,
300 characters do not have special meanings in square brackets.
301
302 \table
303 \row \i \bold{^}
304
305 \i The caret negates the character set if it occurs as the
306 first character (i.e. immediately after the opening square
307 bracket). \bold{[abc]} matches 'a' or 'b' or 'c', but
308 \bold{[^abc]} matches anything \e but 'a' or 'b' or 'c'.
309
310 \row \i \bold{-}
311
312 \i The dash indicates a range of characters. \bold{[W-Z]}
313 matches 'W' or 'X' or 'Y' or 'Z'.
314
315 \endtable
316
317 Using the predefined character set abbreviations is more portable
318 than using character ranges across platforms and languages. For
319 example, \bold{[0-9]} matches a digit in Western alphabets but
320 \bold{\\d} matches a digit in \e any alphabet.
321
322 Note: In other regexp documentation, sets of characters are often
323 called "character classes".
324
325 \target quantifiers
326 \section1 Quantifiers
327
328 By default, an expression is automatically quantified by
329 \bold{{1,1}}, i.e. it should occur exactly once. In the following
330 list, \bold{\e {E}} stands for expression. An expression is a
331 character, or an abbreviation for a set of characters, or a set of
332 characters in square brackets, or an expression in parentheses.
333
334 \table
335 \row \i \bold{\e {E}?}
336
337 \i Matches zero or one occurrences of \e E. This quantifier
338 means \e{The previous expression is optional}, because it
339 will match whether or not the expression is found. \bold{\e
340 {E}?} is the same as \bold{\e {E}{0,1}}. e.g., \bold{dents?}
341 matches 'dent' or 'dents'.
342
343 \row \i \bold{\e {E}+}
344
345 \i Matches one or more occurrences of \e E. \bold{\e {E}+} is
346 the same as \bold{\e {E}{1,}}. e.g., \bold{0+} matches '0',
347 '00', '000', etc.
348
349 \row \i \bold{\e {E}*}
350
351 \i Matches zero or more occurrences of \e E. It is the same
352 as \bold{\e {E}{0,}}. The \bold{*} quantifier is often used
353 in error where \bold{+} should be used. For example, if
354 \bold{\\s*$} is used in an expression to match strings that
355 end in whitespace, it will match every string because
356 \bold{\\s*$} means \e{Match zero or more whitespaces followed
357 by end of string}. The correct regexp to match strings that
358 have at least one trailing whitespace character is
359 \bold{\\s+$}.
360
361 \row \i \bold{\e {E}{n}}
362
363 \i Matches exactly \e n occurrences of \e E. \bold{\e {E}{n}}
364 is the same as repeating \e E \e n times. For example,
365 \bold{x{5}} is the same as \bold{xxxxx}. It is also the same
366 as \bold{\e {E}{n,n}}, e.g. \bold{x{5,5}}.
367
368 \row \i \bold{\e {E}{n,}}
369 \i Matches at least \e n occurrences of \e E.
370
371 \row \i \bold{\e {E}{,m}}
372 \i Matches at most \e m occurrences of \e E. \bold{\e {E}{,m}}
373 is the same as \bold{\e {E}{0,m}}.
374
375 \row \i \bold{\e {E}{n,m}}
376 \i Matches at least \e n and at most \e m occurrences of \e E.
377 \endtable
378
379 To apply a quantifier to more than just the preceding character,
380 use parentheses to group characters together in an expression. For
381 example, \bold{tag+} matches a 't' followed by an 'a' followed by
382 at least one 'g', whereas \bold{(tag)+} matches at least one
383 occurrence of 'tag'.
384
385 Note: Quantifiers are normally "greedy". They always match as much
386 text as they can. For example, \bold{0+} matches the first zero it
387 finds and all the consecutive zeros after the first zero. Applied
388 to '20005', it matches'2\underline{000}5'. Quantifiers can be made
389 non-greedy, see setMinimal().
390
391 \target capturing parentheses
392 \target backreferences
393 \section1 Capturing Text
394
395 Parentheses allow us to group elements together so that we can
396 quantify and capture them. For example if we have the expression
397 \bold{mail|letter|correspondence} that matches a string we know
398 that \e one of the words matched but not which one. Using
399 parentheses allows us to "capture" whatever is matched within
400 their bounds, so if we used \bold{(mail|letter|correspondence)}
401 and matched this regexp against the string "I sent you some email"
402 we can use the cap() or capturedTexts() functions to extract the
403 matched characters, in this case 'mail'.
404
405 We can use captured text within the regexp itself. To refer to the
406 captured text we use \e backreferences which are indexed from 1,
407 the same as for cap(). For example we could search for duplicate
408 words in a string using \bold{\\b(\\w+)\\W+\\1\\b} which means match a
409 word boundary followed by one or more word characters followed by
410 one or more non-word characters followed by the same text as the
411 first parenthesized expression followed by a word boundary.
412
413 If we want to use parentheses purely for grouping and not for
414 capturing we can use the non-capturing syntax, e.g.
415 \bold{(?:green|blue)}. Non-capturing parentheses begin '(?:' and
416 end ')'. In this example we match either 'green' or 'blue' but we
417 do not capture the match so we only know whether or not we matched
418 but not which color we actually found. Using non-capturing
419 parentheses is more efficient than using capturing parentheses
420 since the regexp engine has to do less book-keeping.
421
422 Both capturing and non-capturing parentheses may be nested.
423
424 \target greedy quantifiers
425
426 For historical reasons, quantifiers (e.g. \bold{*}) that apply to
427 capturing parentheses are more "greedy" than other quantifiers.
428 For example, \bold{a*(a*)} will match "aaa" with cap(1) == "aaa".
429 This behavior is different from what other regexp engines do
430 (notably, Perl). To obtain a more intuitive capturing behavior,
431 specify QRegExp::RegExp2 to the QRegExp constructor or call
432 setPatternSyntax(QRegExp::RegExp2).
433
434 \target cap_in_a_loop
435
436 When the number of matches cannot be determined in advance, a
437 common idiom is to use cap() in a loop. For example:
438
439 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 0
440
441 \target assertions
442 \section1 Assertions
443
444 Assertions make some statement about the text at the point where
445 they occur in the regexp but they do not match any characters. In
446 the following list \bold{\e {E}} stands for any expression.
447
448 \table
449 \row \i \bold{^}
450 \i The caret signifies the beginning of the string. If you
451 wish to match a literal \c{^} you must escape it by
452 writing \c{\\^}. For example, \bold{^#include} will only
453 match strings which \e begin with the characters '#include'.
454 (When the caret is the first character of a character set it
455 has a special meaning, see \link #sets-of-characters Sets of
456 Characters \endlink.)
457
458 \row \i \bold{$}
459 \i The dollar signifies the end of the string. For example
460 \bold{\\d\\s*$} will match strings which end with a digit
461 optionally followed by whitespace. If you wish to match a
462 literal \c{$} you must escape it by writing
463 \c{\\$}.
464
465 \row \i \bold{\\b}
466 \i A word boundary. For example the regexp
467 \bold{\\bOK\\b} means match immediately after a word
468 boundary (e.g. start of string or whitespace) the letter 'O'
469 then the letter 'K' immediately before another word boundary
470 (e.g. end of string or whitespace). But note that the
471 assertion does not actually match any whitespace so if we
472 write \bold{(\\bOK\\b)} and we have a match it will only
473 contain 'OK' even if the string is "It's \underline{OK} now".
474
475 \row \i \bold{\\B}
476 \i A non-word boundary. This assertion is true wherever
477 \bold{\\b} is false. For example if we searched for
478 \bold{\\Bon\\B} in "Left on" the match would fail (space
479 and end of string aren't non-word boundaries), but it would
480 match in "t\underline{on}ne".
481
482 \row \i \bold{(?=\e E)}
483 \i Positive lookahead. This assertion is true if the
484 expression matches at this point in the regexp. For example,
485 \bold{const(?=\\s+char)} matches 'const' whenever it is
486 followed by 'char', as in 'static \underline{const} char *'.
487 (Compare with \bold{const\\s+char}, which matches 'static
488 \underline{const char} *'.)
489
490 \row \i \bold{(?!\e E)}
491 \i Negative lookahead. This assertion is true if the
492 expression does not match at this point in the regexp. For
493 example, \bold{const(?!\\s+char)} matches 'const' \e except
494 when it is followed by 'char'.
495 \endtable
496
497 \keyword QRegExp wildcard matching
498 \section1 Wildcard Matching
499
500 Most command shells such as \e bash or \e cmd.exe support "file
501 globbing", the ability to identify a group of files by using
502 wildcards. The setPatternSyntax() function is used to switch
503 between regexp and wildcard mode. Wildcard matching is much
504 simpler than full regexps and has only four features:
505
506 \table
507 \row \i \bold{c}
508 \i Any character represents itself apart from those mentioned
509 below. Thus \bold{c} matches the character \e c.
510 \row \i \bold{?}
511 \i Matches any single character. It is the same as
512 \bold{.} in full regexps.
513 \row \i \bold{*}
514 \i Matches zero or more of any characters. It is the
515 same as \bold{.*} in full regexps.
516 \row \i \bold{[...]}
517 \i Sets of characters can be represented in square brackets,
518 similar to full regexps. Within the character class, like
519 outside, backslash has no special meaning.
520 \endtable
521
522 In the mode Wildcard, the wildcard characters cannot be
523 escaped. In the mode WildcardUnix, the character '\\' escapes the
524 wildcard.
525
526 For example if we are in wildcard mode and have strings which
527 contain filenames we could identify HTML files with \bold{*.html}.
528 This will match zero or more characters followed by a dot followed
529 by 'h', 't', 'm' and 'l'.
530
531 To test a string against a wildcard expression, use exactMatch().
532 For example:
533
534 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 1
535
536 \target perl-users
537 \section1 Notes for Perl Users
538
539 Most of the character class abbreviations supported by Perl are
540 supported by QRegExp, see \link
541 #characters-and-abbreviations-for-sets-of-characters characters
542 and abbreviations for sets of characters \endlink.
543
544 In QRegExp, apart from within character classes, \c{^} always
545 signifies the start of the string, so carets must always be
546 escaped unless used for that purpose. In Perl the meaning of caret
547 varies automagically depending on where it occurs so escaping it
548 is rarely necessary. The same applies to \c{$} which in
549 QRegExp always signifies the end of the string.
550
551 QRegExp's quantifiers are the same as Perl's greedy quantifiers
552 (but see the \l{greedy quantifiers}{note above}). Non-greedy
553 matching cannot be applied to individual quantifiers, but can be
554 applied to all the quantifiers in the pattern. For example, to
555 match the Perl regexp \bold{ro+?m} requires:
556
557 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 2
558
559 The equivalent of Perl's \c{/i} option is
560 setCaseSensitivity(Qt::CaseInsensitive).
561
562 Perl's \c{/g} option can be emulated using a \l{#cap_in_a_loop}{loop}.
563
564 In QRegExp \bold{.} matches any character, therefore all QRegExp
565 regexps have the equivalent of Perl's \c{/s} option. QRegExp
566 does not have an equivalent to Perl's \c{/m} option, but this
567 can be emulated in various ways for example by splitting the input
568 into lines or by looping with a regexp that searches for newlines.
569
570 Because QRegExp is string oriented, there are no \\A, \\Z, or \\z
571 assertions. The \\G assertion is not supported but can be emulated
572 in a loop.
573
574 Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp
575 equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,
576 ... correspond to cap(1) or capturedTexts()[1], cap(2) or
577 capturedTexts()[2], etc.
578
579 To substitute a pattern use QString::replace().
580
581 Perl's extended \c{/x} syntax is not supported, nor are
582 directives, e.g. (?i), or regexp comments, e.g. (?#comment). On
583 the other hand, C++'s rules for literal strings can be used to
584 achieve the same:
585
586 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 3
587
588 Both zero-width positive and zero-width negative lookahead
589 assertions (?=pattern) and (?!pattern) are supported with the same
590 syntax as Perl. Perl's lookbehind assertions, "independent"
591 subexpressions and conditional expressions are not supported.
592
593 Non-capturing parentheses are also supported, with the same
594 (?:pattern) syntax.
595
596 See QString::split() and QStringList::join() for equivalents
597 to Perl's split and join functions.
598
599 Note: because C++ transforms \\'s they must be written \e twice in
600 code, e.g. \bold{\\b} must be written \bold{\\\\b}.
601
602 \target code-examples
603 \section1 Code Examples
604
605 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 4
606
607 The third string matches '\underline{6}'. This is a simple validation
608 regexp for integers in the range 0 to 99.
609
610 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 5
611
612 The second string matches '\underline{This_is-OK}'. We've used the
613 character set abbreviation '\\S' (non-whitespace) and the anchors
614 to match strings which contain no whitespace.
615
616 In the following example we match strings containing 'mail' or
617 'letter' or 'correspondence' but only match whole words i.e. not
618 'email'
619
620 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 6
621
622 The second string matches "Please write the \underline{letter}". The
623 word 'letter' is also captured (because of the parentheses). We
624 can see what text we've captured like this:
625
626 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 7
627
628 This will capture the text from the first set of capturing
629 parentheses (counting capturing left parentheses from left to
630 right). The parentheses are counted from 1 since cap(0) is the
631 whole matched regexp (equivalent to '&' in most regexp engines).
632
633 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 8
634
635 Here we've passed the QRegExp to QString's replace() function to
636 replace the matched text with new text.
637
638 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 9
639
640 We've used the indexIn() function to repeatedly match the regexp in
641 the string. Note that instead of moving forward by one character
642 at a time \c pos++ we could have written \c {pos +=
643 rx.matchedLength()} to skip over the already matched string. The
644 count will equal 3, matching 'One \underline{Eric} another
645 \underline{Eirik}, and an Ericsson. How many Eiriks, \underline{Eric}?'; it
646 doesn't match 'Ericsson' or 'Eiriks' because they are not bounded
647 by non-word boundaries.
648
649 One common use of regexps is to split lines of delimited data into
650 their component fields.
651
652 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 10
653
654 In this example our input lines have the format company name, web
655 address and country. Unfortunately the regexp is rather long and
656 not very versatile -- the code will break if we add any more
657 fields. A simpler and better solution is to look for the
658 separator, '\\t' in this case, and take the surrounding text. The
659 QString::split() function can take a separator string or regexp
660 as an argument and split a string accordingly.
661
662 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 11
663
664 Here field[0] is the company, field[1] the web address and so on.
665
666 To imitate the matching of a shell we can use wildcard mode.
667
668 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 12
669
670 Wildcard matching can be convenient because of its simplicity, but
671 any wildcard regexp can be defined using full regexps, e.g.
672 \bold{.*\\.html$}. Notice that we can't match both \c .html and \c
673 .htm files with a wildcard unless we use \bold{*.htm*} which will
674 also match 'test.html.bak'. A full regexp gives us the precision
675 we need, \bold{.*\\.html?$}.
676
677 QRegExp can match case insensitively using setCaseSensitivity(),
678 and can use non-greedy matching, see setMinimal(). By
679 default QRegExp uses full regexps but this can be changed with
680 setWildcard(). Searching can be forward with indexIn() or backward
681 with lastIndexIn(). Captured text can be accessed using
682 capturedTexts() which returns a string list of all captured
683 strings, or using cap() which returns the captured string for the
684 given index. The pos() function takes a match index and returns
685 the position in the string where the match was made (or -1 if
686 there was no match).
687
688 \sa QString, QStringList, QRegExpValidator, QSortFilterProxyModel,
689 {tools/regexp}{Regular Expression Example}
690 */
691
692 #if defined(Q_OS_VXWORKS) && defined(EOS)
693 # undef EOS
694 #endif
695
696 const int NumBadChars = 64;
697 #define BadChar(ch) ((ch).unicode() % NumBadChars)
698
699 const int NoOccurrence = INT_MAX;
700 const int EmptyCapture = INT_MAX;
701 const int InftyLen = INT_MAX;
702 const int InftyRep = 1025;
703 const int EOS = -1;
704
isWord(QChar ch)705 static bool isWord(QChar ch)
706 {
707 return ch.isLetterOrNumber() || ch.isMark() || ch == QLatin1Char('_');
708 }
709
710 /*
711 Merges two vectors of ints and puts the result into the first
712 one.
713 */
mergeInto(QVector<int> * a,const QVector<int> & b)714 static void mergeInto(QVector<int> *a, const QVector<int> &b)
715 {
716 int asize = a->size();
717 int bsize = b.size();
718 if (asize == 0) {
719 *a = b;
720 #ifndef QT_NO_REGEXP_OPTIM
721 } else if (bsize == 1 && a->at(asize - 1) < b.at(0)) {
722 a->resize(asize + 1);
723 (*a)[asize] = b.at(0);
724 #endif
725 } else if (bsize >= 1) {
726 int csize = asize + bsize;
727 QVector<int> c(csize);
728 int i = 0, j = 0, k = 0;
729 while (i < asize) {
730 if (j < bsize) {
731 if (a->at(i) == b.at(j)) {
732 ++i;
733 --csize;
734 } else if (a->at(i) < b.at(j)) {
735 c[k++] = a->at(i++);
736 } else {
737 c[k++] = b.at(j++);
738 }
739 } else {
740 memcpy(c.data() + k, a->constData() + i, (asize - i) * sizeof(int));
741 break;
742 }
743 }
744 c.resize(csize);
745 if (j < bsize)
746 memcpy(c.data() + k, b.constData() + j, (bsize - j) * sizeof(int));
747 *a = c;
748 }
749 }
750
751 #ifndef QT_NO_REGEXP_WILDCARD
752 /*
753 Translates a wildcard pattern to an equivalent regular expression
754 pattern (e.g., *.cpp to .*\.cpp).
755
756 If enableEscaping is true, it is possible to escape the wildcard
757 characters with \
758 */
wc2rx(const QString & wc_str,const bool enableEscaping)759 static QString wc2rx(const QString &wc_str, const bool enableEscaping)
760 {
761 const int wclen = wc_str.length();
762 QString rx;
763 int i = 0;
764 bool isEscaping = false; // the previous character is '\'
765 const QChar *wc = wc_str.unicode();
766
767 while (i < wclen) {
768 const QChar c = wc[i++];
769 switch (c.unicode()) {
770 case '\\':
771 if (enableEscaping) {
772 if (isEscaping) {
773 rx += QLatin1String("\\\\");
774 } // we insert the \\ later if necessary
775 if (i == wclen) { // the end
776 rx += QLatin1String("\\\\");
777 }
778 } else {
779 rx += QLatin1String("\\\\");
780 }
781 isEscaping = true;
782 break;
783 case '*':
784 if (isEscaping) {
785 rx += QLatin1String("\\*");
786 isEscaping = false;
787 } else {
788 rx += QLatin1String(".*");
789 }
790 break;
791 case '?':
792 if (isEscaping) {
793 rx += QLatin1String("\\?");
794 isEscaping = false;
795 } else {
796 rx += QLatin1Char('.');
797 }
798
799 break;
800 case '$':
801 case '(':
802 case ')':
803 case '+':
804 case '.':
805 case '^':
806 case '{':
807 case '|':
808 case '}':
809 if (isEscaping) {
810 isEscaping = false;
811 rx += QLatin1String("\\\\");
812 }
813 rx += QLatin1Char('\\');
814 rx += c;
815 break;
816 case '[':
817 if (isEscaping) {
818 isEscaping = false;
819 rx += QLatin1String("\\[");
820 } else {
821 rx += c;
822 if (wc[i] == QLatin1Char('^'))
823 rx += wc[i++];
824 if (i < wclen) {
825 if (rx[i] == QLatin1Char(']'))
826 rx += wc[i++];
827 while (i < wclen && wc[i] != QLatin1Char(']')) {
828 if (wc[i] == QLatin1Char('\\'))
829 rx += QLatin1Char('\\');
830 rx += wc[i++];
831 }
832 }
833 }
834 break;
835
836 case ']':
837 if(isEscaping){
838 isEscaping = false;
839 rx += QLatin1String("\\");
840 }
841 rx += c;
842 break;
843
844 default:
845 if(isEscaping){
846 isEscaping = false;
847 rx += QLatin1String("\\\\");
848 }
849 rx += c;
850 }
851 }
852 return rx;
853 }
854 #endif
855
caretIndex(int offset,QRegExp::CaretMode caretMode)856 static int caretIndex(int offset, QRegExp::CaretMode caretMode)
857 {
858 if (caretMode == QRegExp::CaretAtZero) {
859 return 0;
860 } else if (caretMode == QRegExp::CaretAtOffset) {
861 return offset;
862 } else { // QRegExp::CaretWontMatch
863 return -1;
864 }
865 }
866
867 /*
868 The QRegExpEngineKey struct uniquely identifies an engine.
869 */
870 struct QRegExpEngineKey
871 {
872 QString pattern;
873 QRegExp::PatternSyntax patternSyntax;
874 Qt::CaseSensitivity cs;
875
QRegExpEngineKeyQRegExpEngineKey876 inline QRegExpEngineKey(const QString &pattern, QRegExp::PatternSyntax patternSyntax,
877 Qt::CaseSensitivity cs)
878 : pattern(pattern), patternSyntax(patternSyntax), cs(cs) {}
879
clearQRegExpEngineKey880 inline void clear() {
881 pattern.clear();
882 patternSyntax = QRegExp::RegExp;
883 cs = Qt::CaseSensitive;
884 }
885 };
886
operator ==(const QRegExpEngineKey & key1,const QRegExpEngineKey & key2)887 Q_STATIC_GLOBAL_OPERATOR bool operator==(const QRegExpEngineKey &key1, const QRegExpEngineKey &key2)
888 {
889 return key1.pattern == key2.pattern && key1.patternSyntax == key2.patternSyntax
890 && key1.cs == key2.cs;
891 }
892
893 class QRegExpEngine;
894
895 //Q_DECLARE_TYPEINFO(QVector<int>, Q_MOVABLE_TYPE);
896
897 /*
898 This is the engine state during matching.
899 */
900 struct QRegExpMatchState
901 {
902 const QChar *in; // a pointer to the input string data
903 int pos; // the current position in the string
904 int caretPos;
905 int len; // the length of the input string
906 bool minimal; // minimal matching?
907 int *bigArray; // big array holding the data for the next pointers
908 int *inNextStack; // is state is nextStack?
909 int *curStack; // stack of current states
910 int *nextStack; // stack of next states
911 int *curCapBegin; // start of current states' captures
912 int *nextCapBegin; // start of next states' captures
913 int *curCapEnd; // end of current states' captures
914 int *nextCapEnd; // end of next states' captures
915 int *tempCapBegin; // start of temporary captures
916 int *tempCapEnd; // end of temporary captures
917 int *capBegin; // start of captures for a next state
918 int *capEnd; // end of captures for a next state
919 int *slideTab; // bump-along slide table for bad-character heuristic
920 int *captured; // what match() returned last
921 int slideTabSize; // size of slide table
922 int capturedSize;
923 #ifndef QT_NO_REGEXP_BACKREF
924 QList<QVector<int> > sleeping; // list of back-reference sleepers
925 #endif
926 int matchLen; // length of match
927 int oneTestMatchedLen; // length of partial match
928
929 const QRegExpEngine *eng;
930
QRegExpMatchStateQRegExpMatchState931 inline QRegExpMatchState() : bigArray(0), captured(0) {}
~QRegExpMatchStateQRegExpMatchState932 inline ~QRegExpMatchState() { free(bigArray); }
933
drainQRegExpMatchState934 void drain() { free(bigArray); bigArray = 0; captured = 0; } // to save memory
935 void prepareForMatch(QRegExpEngine *eng);
936 void match(const QChar *str, int len, int pos, bool minimal,
937 bool oneTest, int caretIndex);
938 bool matchHere();
939 bool testAnchor(int i, int a, const int *capBegin);
940 };
941
942 /*
943 The struct QRegExpAutomatonState represents one state in a modified NFA. The
944 input characters matched are stored in the state instead of on
945 the transitions, something possible for an automaton
946 constructed from a regular expression.
947 */
948 struct QRegExpAutomatonState
949 {
950 #ifndef QT_NO_REGEXP_CAPTURE
951 int atom; // which atom does this state belong to?
952 #endif
953 int match; // what does it match? (see CharClassBit and BackRefBit)
954 QVector<int> outs; // out-transitions
955 QMap<int, int> reenter; // atoms reentered when transiting out
956 QMap<int, int> anchors; // anchors met when transiting out
957
QRegExpAutomatonStateQRegExpAutomatonState958 inline QRegExpAutomatonState() { }
959 #ifndef QT_NO_REGEXP_CAPTURE
QRegExpAutomatonStateQRegExpAutomatonState960 inline QRegExpAutomatonState(int a, int m)
961 : atom(a), match(m) { }
962 #else
QRegExpAutomatonStateQRegExpAutomatonState963 inline QRegExpAutomatonState(int m)
964 : match(m) { }
965 #endif
966 };
967
968 Q_DECLARE_TYPEINFO(QRegExpAutomatonState, Q_MOVABLE_TYPE);
969
970 /*
971 The struct QRegExpCharClassRange represents a range of characters (e.g.,
972 [0-9] denotes range 48 to 57).
973 */
974 struct QRegExpCharClassRange
975 {
976 ushort from; // 48
977 ushort len; // 10
978 };
979
980 Q_DECLARE_TYPEINFO(QRegExpCharClassRange, Q_PRIMITIVE_TYPE);
981
982 #ifndef QT_NO_REGEXP_CAPTURE
983 /*
984 The struct QRegExpAtom represents one node in the hierarchy of regular
985 expression atoms.
986 */
987 struct QRegExpAtom
988 {
989 enum { NoCapture = -1, OfficialCapture = -2, UnofficialCapture = -3 };
990
991 int parent; // index of parent in array of atoms
992 int capture; // index of capture, from 1 to ncap - 1
993 };
994
995 Q_DECLARE_TYPEINFO(QRegExpAtom, Q_PRIMITIVE_TYPE);
996 #endif
997
998 struct QRegExpLookahead;
999
1000 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1001 /*
1002 The struct QRegExpAnchorAlternation represents a pair of anchors with
1003 OR semantics.
1004 */
1005 struct QRegExpAnchorAlternation
1006 {
1007 int a; // this anchor...
1008 int b; // ...or this one
1009 };
1010
1011 Q_DECLARE_TYPEINFO(QRegExpAnchorAlternation, Q_PRIMITIVE_TYPE);
1012 #endif
1013
1014 #ifndef QT_NO_REGEXP_CCLASS
1015 /*
1016 The class QRegExpCharClass represents a set of characters, such as can
1017 be found in regular expressions (e.g., [a-z] denotes the set
1018 {a, b, ..., z}).
1019 */
1020 class QRegExpCharClass
1021 {
1022 public:
1023 QRegExpCharClass();
QRegExpCharClass(const QRegExpCharClass & cc)1024 inline QRegExpCharClass(const QRegExpCharClass &cc) { operator=(cc); }
1025
1026 QRegExpCharClass &operator=(const QRegExpCharClass &cc);
1027
1028 void clear();
negative() const1029 bool negative() const { return n; }
1030 void setNegative(bool negative);
1031 void addCategories(int cats);
1032 void addRange(ushort from, ushort to);
addSingleton(ushort ch)1033 void addSingleton(ushort ch) { addRange(ch, ch); }
1034
1035 bool in(QChar ch) const;
1036 #ifndef QT_NO_REGEXP_OPTIM
firstOccurrence() const1037 const QVector<int> &firstOccurrence() const { return occ1; }
1038 #endif
1039
1040 #if defined(QT_DEBUG)
1041 void dump() const;
1042 #endif
1043
1044 private:
1045 int c; // character classes
1046 QVector<QRegExpCharClassRange> r; // character ranges
1047 bool n; // negative?
1048 #ifndef QT_NO_REGEXP_OPTIM
1049 QVector<int> occ1; // first-occurrence array
1050 #endif
1051 };
1052 #else
1053 struct QRegExpCharClass
1054 {
1055 int dummy;
1056
1057 #ifndef QT_NO_REGEXP_OPTIM
QRegExpCharClassQRegExpCharClass1058 QRegExpCharClass() { occ1.fill(0, NumBadChars); }
1059
firstOccurrenceQRegExpCharClass1060 const QVector<int> &firstOccurrence() const { return occ1; }
1061 QVector<int> occ1;
1062 #endif
1063 };
1064 #endif
1065
1066 Q_DECLARE_TYPEINFO(QRegExpCharClass, Q_MOVABLE_TYPE);
1067
1068 /*
1069 The QRegExpEngine class encapsulates a modified nondeterministic
1070 finite automaton (NFA).
1071 */
1072 class QRegExpEngine
1073 {
1074 public:
QRegExpEngine(Qt::CaseSensitivity cs,bool greedyQuantifiers)1075 QRegExpEngine(Qt::CaseSensitivity cs, bool greedyQuantifiers)
1076 : cs(cs), greedyQuantifiers(greedyQuantifiers) { setup(); }
1077
1078 QRegExpEngine(const QRegExpEngineKey &key);
1079 ~QRegExpEngine();
1080
isValid() const1081 bool isValid() const { return valid; }
errorString() const1082 const QString &errorString() const { return yyError; }
captureCount() const1083 int captureCount() const { return officialncap; }
1084
1085 int createState(QChar ch);
1086 int createState(const QRegExpCharClass &cc);
1087 #ifndef QT_NO_REGEXP_BACKREF
1088 int createState(int bref);
1089 #endif
1090
1091 void addCatTransitions(const QVector<int> &from, const QVector<int> &to);
1092 #ifndef QT_NO_REGEXP_CAPTURE
1093 void addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom);
1094 #endif
1095
1096 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1097 int anchorAlternation(int a, int b);
1098 int anchorConcatenation(int a, int b);
1099 #else
anchorAlternation(int a,int b)1100 int anchorAlternation(int a, int b) { return a & b; }
anchorConcatenation(int a,int b)1101 int anchorConcatenation(int a, int b) { return a | b; }
1102 #endif
1103 void addAnchors(int from, int to, int a);
1104
1105 #ifndef QT_NO_REGEXP_OPTIM
1106 void heuristicallyChooseHeuristic();
1107 #endif
1108
1109 #if defined(QT_DEBUG)
1110 void dump() const;
1111 #endif
1112
1113 QAtomicInt ref;
1114
1115 private:
1116 enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };
1117 enum { InitialState = 0, FinalState = 1 };
1118
1119 void setup();
1120 int setupState(int match);
1121
1122 /*
1123 Let's hope that 13 lookaheads and 14 back-references are
1124 enough.
1125 */
1126 enum { MaxLookaheads = 13, MaxBackRefs = 14 };
1127 enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, Anchor_Word = 0x00000004,
1128 Anchor_NonWord = 0x00000008, Anchor_FirstLookahead = 0x00000010,
1129 Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
1130 Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,
1131 Anchor_Alternation = unsigned(Anchor_BackRef1Empty) << MaxBackRefs,
1132
1133 Anchor_LookaheadMask = (Anchor_FirstLookahead - 1) ^
1134 ((Anchor_FirstLookahead << MaxLookaheads) - 1) };
1135 #ifndef QT_NO_REGEXP_CAPTURE
1136 int startAtom(bool officialCapture);
1137 void finishAtom(int atom, bool needCapture);
1138 #endif
1139
1140 #ifndef QT_NO_REGEXP_LOOKAHEAD
1141 int addLookahead(QRegExpEngine *eng, bool negative);
1142 #endif
1143
1144 #ifndef QT_NO_REGEXP_OPTIM
1145 bool goodStringMatch(QRegExpMatchState &matchState) const;
1146 bool badCharMatch(QRegExpMatchState &matchState) const;
1147 #else
1148 bool bruteMatch(QRegExpMatchState &matchState) const;
1149 #endif
1150
1151 QVector<QRegExpAutomatonState> s; // array of states
1152 #ifndef QT_NO_REGEXP_CAPTURE
1153 QVector<QRegExpAtom> f; // atom hierarchy
1154 int nf; // number of atoms
1155 int cf; // current atom
1156 QVector<int> captureForOfficialCapture;
1157 #endif
1158 int officialncap; // number of captures, seen from the outside
1159 int ncap; // number of captures, seen from the inside
1160 #ifndef QT_NO_REGEXP_CCLASS
1161 QVector<QRegExpCharClass> cl; // array of character classes
1162 #endif
1163 #ifndef QT_NO_REGEXP_LOOKAHEAD
1164 QVector<QRegExpLookahead *> ahead; // array of lookaheads
1165 #endif
1166 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1167 QVector<QRegExpAnchorAlternation> aa; // array of (a, b) pairs of anchors
1168 #endif
1169 #ifndef QT_NO_REGEXP_OPTIM
1170 bool caretAnchored; // does the regexp start with ^?
1171 bool trivial; // is the good-string all that needs to match?
1172 #endif
1173 bool valid; // is the regular expression valid?
1174 Qt::CaseSensitivity cs; // case sensitive?
1175 bool greedyQuantifiers; // RegExp2?
1176 bool xmlSchemaExtensions;
1177 #ifndef QT_NO_REGEXP_BACKREF
1178 int nbrefs; // number of back-references
1179 #endif
1180
1181 #ifndef QT_NO_REGEXP_OPTIM
1182 bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch
1183
1184 int goodEarlyStart; // the index where goodStr can first occur in a match
1185 int goodLateStart; // the index where goodStr can last occur in a match
1186 QString goodStr; // the string that any match has to contain
1187
1188 int minl; // the minimum length of a match
1189 QVector<int> occ1; // first-occurrence array
1190 #endif
1191
1192 /*
1193 The class Box is an abstraction for a regular expression
1194 fragment. It can also be seen as one node in the syntax tree of
1195 a regular expression with synthetized attributes.
1196
1197 Its interface is ugly for performance reasons.
1198 */
1199 class Box
1200 {
1201 public:
1202 Box(QRegExpEngine *engine);
Box(const Box & b)1203 Box(const Box &b) { operator=(b); }
1204
1205 Box &operator=(const Box &b);
1206
clear()1207 void clear() { operator=(Box(eng)); }
1208 void set(QChar ch);
1209 void set(const QRegExpCharClass &cc);
1210 #ifndef QT_NO_REGEXP_BACKREF
1211 void set(int bref);
1212 #endif
1213
1214 void cat(const Box &b);
1215 void orx(const Box &b);
1216 void plus(int atom);
1217 void opt();
1218 void catAnchor(int a);
1219 #ifndef QT_NO_REGEXP_OPTIM
1220 void setupHeuristics();
1221 #endif
1222
1223 #if defined(QT_DEBUG)
1224 void dump() const;
1225 #endif
1226
1227 private:
1228 void addAnchorsToEngine(const Box &to) const;
1229
1230 QRegExpEngine *eng; // the automaton under construction
1231 QVector<int> ls; // the left states (firstpos)
1232 QVector<int> rs; // the right states (lastpos)
1233 QMap<int, int> lanchors; // the left anchors
1234 QMap<int, int> ranchors; // the right anchors
1235 int skipanchors; // the anchors to match if the box is skipped
1236
1237 #ifndef QT_NO_REGEXP_OPTIM
1238 int earlyStart; // the index where str can first occur
1239 int lateStart; // the index where str can last occur
1240 QString str; // a string that has to occur in any match
1241 QString leftStr; // a string occurring at the left of this box
1242 QString rightStr; // a string occurring at the right of this box
1243 int maxl; // the maximum length of this box (possibly InftyLen)
1244 #endif
1245
1246 int minl; // the minimum length of this box
1247 #ifndef QT_NO_REGEXP_OPTIM
1248 QVector<int> occ1; // first-occurrence array
1249 #endif
1250 };
1251
1252 friend class Box;
1253
1254 void setupCategoriesRangeMap();
1255
1256 /*
1257 This is the lexical analyzer for regular expressions.
1258 */
1259 enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead,
1260 Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar,
1261 Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 };
1262 int getChar();
1263 int getEscape();
1264 #ifndef QT_NO_REGEXP_INTERVAL
1265 int getRep(int def);
1266 #endif
1267 #ifndef QT_NO_REGEXP_LOOKAHEAD
1268 void skipChars(int n);
1269 #endif
1270 void error(const char *msg);
1271 void startTokenizer(const QChar *rx, int len);
1272 int getToken();
1273
1274 const QChar *yyIn; // a pointer to the input regular expression pattern
1275 int yyPos0; // the position of yyTok in the input pattern
1276 int yyPos; // the position of the next character to read
1277 int yyLen; // the length of yyIn
1278 int yyCh; // the last character read
1279 QScopedPointer<QRegExpCharClass> yyCharClass; // attribute for Tok_CharClass tokens
1280 int yyMinRep; // attribute for Tok_Quantifier
1281 int yyMaxRep; // ditto
1282 QString yyError; // syntax error or overflow during parsing?
1283
1284 /*
1285 This is the syntactic analyzer for regular expressions.
1286 */
1287 int parse(const QChar *rx, int len);
1288 void parseAtom(Box *box);
1289 void parseFactor(Box *box);
1290 void parseTerm(Box *box);
1291 void parseExpression(Box *box);
1292
1293 int yyTok; // the last token read
1294 bool yyMayCapture; // set this to false to disable capturing
1295 QHash<QByteArray, QPair<int, int> > categoriesRangeMap; // fast lookup hash for xml schema extensions
1296
1297 friend struct QRegExpMatchState;
1298 };
1299
1300 #ifndef QT_NO_REGEXP_LOOKAHEAD
1301 /*
1302 The struct QRegExpLookahead represents a lookahead a la Perl (e.g.,
1303 (?=foo) and (?!bar)).
1304 */
1305 struct QRegExpLookahead
1306 {
1307 QRegExpEngine *eng; // NFA representing the embedded regular expression
1308 bool neg; // negative lookahead?
1309
QRegExpLookaheadQRegExpLookahead1310 inline QRegExpLookahead(QRegExpEngine *eng0, bool neg0)
1311 : eng(eng0), neg(neg0) { }
~QRegExpLookaheadQRegExpLookahead1312 inline ~QRegExpLookahead() { delete eng; }
1313 };
1314 #endif
1315
1316 /*! \internal
1317 convert the pattern string to the RegExp syntax.
1318
1319 This is also used by QScriptEngine::newRegExp to convert to a pattern that JavaScriptCore can understan
1320 */
qt_regexp_toCanonical(const QString & pattern,QRegExp::PatternSyntax patternSyntax)1321 Q_CORE_EXPORT QString qt_regexp_toCanonical(const QString &pattern, QRegExp::PatternSyntax patternSyntax)
1322 {
1323 switch (patternSyntax) {
1324 #ifndef QT_NO_REGEXP_WILDCARD
1325 case QRegExp::Wildcard:
1326 return wc2rx(pattern, false);
1327 break;
1328 case QRegExp::WildcardUnix:
1329 return wc2rx(pattern, true);
1330 break;
1331 #endif
1332 case QRegExp::FixedString:
1333 return QRegExp::escape(pattern);
1334 break;
1335 case QRegExp::W3CXmlSchema11:
1336 default:
1337 return pattern;
1338 }
1339 }
1340
QRegExpEngine(const QRegExpEngineKey & key)1341 QRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key)
1342 : cs(key.cs), greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2),
1343 xmlSchemaExtensions(key.patternSyntax == QRegExp::W3CXmlSchema11)
1344 {
1345 setup();
1346
1347 QString rx = qt_regexp_toCanonical(key.pattern, key.patternSyntax);
1348
1349 valid = (parse(rx.unicode(), rx.length()) == rx.length());
1350 if (!valid) {
1351 #ifndef QT_NO_REGEXP_OPTIM
1352 trivial = false;
1353 #endif
1354 error(RXERR_LEFTDELIM);
1355 }
1356 }
1357
~QRegExpEngine()1358 QRegExpEngine::~QRegExpEngine()
1359 {
1360 #ifndef QT_NO_REGEXP_LOOKAHEAD
1361 qDeleteAll(ahead);
1362 #endif
1363 }
1364
prepareForMatch(QRegExpEngine * eng)1365 void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng)
1366 {
1367 /*
1368 We use one QVector<int> for all the big data used a lot in
1369 matchHere() and friends.
1370 */
1371 int ns = eng->s.size(); // number of states
1372 int ncap = eng->ncap;
1373 #ifndef QT_NO_REGEXP_OPTIM
1374 int newSlideTabSize = qMax(eng->minl + 1, 16);
1375 #else
1376 int newSlideTabSize = 0;
1377 #endif
1378 int numCaptures = eng->captureCount();
1379 int newCapturedSize = 2 + 2 * numCaptures;
1380 bigArray = q_check_ptr((int *)realloc(bigArray, ((3 + 4 * ncap) * ns + 4 * ncap + newSlideTabSize + newCapturedSize)*sizeof(int)));
1381
1382 // set all internal variables only _after_ bigArray is realloc'ed
1383 // to prevent a broken regexp in oom case
1384
1385 slideTabSize = newSlideTabSize;
1386 capturedSize = newCapturedSize;
1387 inNextStack = bigArray;
1388 memset(inNextStack, -1, ns * sizeof(int));
1389 curStack = inNextStack + ns;
1390 nextStack = inNextStack + 2 * ns;
1391
1392 curCapBegin = inNextStack + 3 * ns;
1393 nextCapBegin = curCapBegin + ncap * ns;
1394 curCapEnd = curCapBegin + 2 * ncap * ns;
1395 nextCapEnd = curCapBegin + 3 * ncap * ns;
1396
1397 tempCapBegin = curCapBegin + 4 * ncap * ns;
1398 tempCapEnd = tempCapBegin + ncap;
1399 capBegin = tempCapBegin + 2 * ncap;
1400 capEnd = tempCapBegin + 3 * ncap;
1401
1402 slideTab = tempCapBegin + 4 * ncap;
1403 captured = slideTab + slideTabSize;
1404 memset(captured, -1, capturedSize*sizeof(int));
1405 this->eng = eng;
1406 }
1407
1408 /*
1409 Tries to match in str and returns an array of (begin, length) pairs
1410 for captured text. If there is no match, all pairs are (-1, -1).
1411 */
match(const QChar * str0,int len0,int pos0,bool minimal0,bool oneTest,int caretIndex)1412 void QRegExpMatchState::match(const QChar *str0, int len0, int pos0,
1413 bool minimal0, bool oneTest, int caretIndex)
1414 {
1415 bool matched = false;
1416 QChar char_null;
1417
1418 #ifndef QT_NO_REGEXP_OPTIM
1419 if (eng->trivial && !oneTest) {
1420 pos = qFindString(str0, len0, pos0, eng->goodStr.unicode(), eng->goodStr.length(), eng->cs);
1421 matchLen = eng->goodStr.length();
1422 matched = (pos != -1);
1423 } else
1424 #endif
1425 {
1426 in = str0;
1427 if (in == 0)
1428 in = &char_null;
1429 pos = pos0;
1430 caretPos = caretIndex;
1431 len = len0;
1432 minimal = minimal0;
1433 matchLen = 0;
1434 oneTestMatchedLen = 0;
1435
1436 if (eng->valid && pos >= 0 && pos <= len) {
1437 #ifndef QT_NO_REGEXP_OPTIM
1438 if (oneTest) {
1439 matched = matchHere();
1440 } else {
1441 if (pos <= len - eng->minl) {
1442 if (eng->caretAnchored) {
1443 matched = matchHere();
1444 } else if (eng->useGoodStringHeuristic) {
1445 matched = eng->goodStringMatch(*this);
1446 } else {
1447 matched = eng->badCharMatch(*this);
1448 }
1449 }
1450 }
1451 #else
1452 matched = oneTest ? matchHere() : eng->bruteMatch(*this);
1453 #endif
1454 }
1455 }
1456
1457 if (matched) {
1458 int *c = captured;
1459 *c++ = pos;
1460 *c++ = matchLen;
1461
1462 int numCaptures = (capturedSize - 2) >> 1;
1463 #ifndef QT_NO_REGEXP_CAPTURE
1464 for (int i = 0; i < numCaptures; ++i) {
1465 int j = eng->captureForOfficialCapture.at(i);
1466 if (capBegin[j] != EmptyCapture) {
1467 int len = capEnd[j] - capBegin[j];
1468 *c++ = (len > 0) ? pos + capBegin[j] : 0;
1469 *c++ = len;
1470 } else {
1471 *c++ = -1;
1472 *c++ = -1;
1473 }
1474 }
1475 #endif
1476 } else {
1477 // we rely on 2's complement here
1478 memset(captured, -1, capturedSize * sizeof(int));
1479 }
1480 }
1481
1482 /*
1483 The three following functions add one state to the automaton and
1484 return the number of the state.
1485 */
1486
createState(QChar ch)1487 int QRegExpEngine::createState(QChar ch)
1488 {
1489 return setupState(ch.unicode());
1490 }
1491
createState(const QRegExpCharClass & cc)1492 int QRegExpEngine::createState(const QRegExpCharClass &cc)
1493 {
1494 #ifndef QT_NO_REGEXP_CCLASS
1495 int n = cl.size();
1496 cl += QRegExpCharClass(cc);
1497 return setupState(CharClassBit | n);
1498 #else
1499 Q_UNUSED(cc);
1500 return setupState(CharClassBit);
1501 #endif
1502 }
1503
1504 #ifndef QT_NO_REGEXP_BACKREF
createState(int bref)1505 int QRegExpEngine::createState(int bref)
1506 {
1507 if (bref > nbrefs) {
1508 nbrefs = bref;
1509 if (nbrefs > MaxBackRefs) {
1510 error(RXERR_LIMIT);
1511 return 0;
1512 }
1513 }
1514 return setupState(BackRefBit | bref);
1515 }
1516 #endif
1517
1518 /*
1519 The two following functions add a transition between all pairs of
1520 states (i, j) where i is found in from, and j is found in to.
1521
1522 Cat-transitions are distinguished from plus-transitions for
1523 capturing.
1524 */
1525
addCatTransitions(const QVector<int> & from,const QVector<int> & to)1526 void QRegExpEngine::addCatTransitions(const QVector<int> &from, const QVector<int> &to)
1527 {
1528 for (int i = 0; i < from.size(); i++)
1529 mergeInto(&s[from.at(i)].outs, to);
1530 }
1531
1532 #ifndef QT_NO_REGEXP_CAPTURE
addPlusTransitions(const QVector<int> & from,const QVector<int> & to,int atom)1533 void QRegExpEngine::addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom)
1534 {
1535 for (int i = 0; i < from.size(); i++) {
1536 QRegExpAutomatonState &st = s[from.at(i)];
1537 const QVector<int> oldOuts = st.outs;
1538 mergeInto(&st.outs, to);
1539 if (f.at(atom).capture != QRegExpAtom::NoCapture) {
1540 for (int j = 0; j < to.size(); j++) {
1541 // ### st.reenter.contains(to.at(j)) check looks suspicious
1542 if (!st.reenter.contains(to.at(j)) &&
1543 qBinaryFind(oldOuts.constBegin(), oldOuts.constEnd(), to.at(j)) == oldOuts.end())
1544 st.reenter.insert(to.at(j), atom);
1545 }
1546 }
1547 }
1548 }
1549 #endif
1550
1551 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1552 /*
1553 Returns an anchor that means a OR b.
1554 */
anchorAlternation(int a,int b)1555 int QRegExpEngine::anchorAlternation(int a, int b)
1556 {
1557 if (((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0)
1558 return a & b;
1559
1560 int n = aa.size();
1561 #ifndef QT_NO_REGEXP_OPTIM
1562 if (n > 0 && aa.at(n - 1).a == a && aa.at(n - 1).b == b)
1563 return Anchor_Alternation | (n - 1);
1564 #endif
1565
1566 QRegExpAnchorAlternation element = {a, b};
1567 aa.append(element);
1568 return Anchor_Alternation | n;
1569 }
1570
1571 /*
1572 Returns an anchor that means a AND b.
1573 */
anchorConcatenation(int a,int b)1574 int QRegExpEngine::anchorConcatenation(int a, int b)
1575 {
1576 if (((a | b) & Anchor_Alternation) == 0)
1577 return a | b;
1578 if ((b & Anchor_Alternation) != 0)
1579 qSwap(a, b);
1580
1581 int aprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).a, b);
1582 int bprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).b, b);
1583 return anchorAlternation(aprime, bprime);
1584 }
1585 #endif
1586
1587 /*
1588 Adds anchor a on a transition caracterised by its from state and
1589 its to state.
1590 */
addAnchors(int from,int to,int a)1591 void QRegExpEngine::addAnchors(int from, int to, int a)
1592 {
1593 QRegExpAutomatonState &st = s[from];
1594 if (st.anchors.contains(to))
1595 a = anchorAlternation(st.anchors.value(to), a);
1596 st.anchors.insert(to, a);
1597 }
1598
1599 #ifndef QT_NO_REGEXP_OPTIM
1600 /*
1601 This function chooses between the good-string and the bad-character
1602 heuristics. It computes two scores and chooses the heuristic with
1603 the highest score.
1604
1605 Here are some common-sense constraints on the scores that should be
1606 respected if the formulas are ever modified: (1) If goodStr is
1607 empty, the good-string heuristic scores 0. (2) If the regular
1608 expression is trivial, the good-string heuristic should be used.
1609 (3) If the search is case insensitive, the good-string heuristic
1610 should be used, unless it scores 0. (Case insensitivity turns all
1611 entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is
1612 big, the good-string heuristic should score less.
1613 */
heuristicallyChooseHeuristic()1614 void QRegExpEngine::heuristicallyChooseHeuristic()
1615 {
1616 if (minl == 0) {
1617 useGoodStringHeuristic = false;
1618 } else if (trivial) {
1619 useGoodStringHeuristic = true;
1620 } else {
1621 /*
1622 Magic formula: The good string has to constitute a good
1623 proportion of the minimum-length string, and appear at a
1624 more-or-less known index.
1625 */
1626 int goodStringScore = (64 * goodStr.length() / minl) -
1627 (goodLateStart - goodEarlyStart);
1628 /*
1629 Less magic formula: We pick some characters at random, and
1630 check whether they are good or bad.
1631 */
1632 int badCharScore = 0;
1633 int step = qMax(1, NumBadChars / 32);
1634 for (int i = 1; i < NumBadChars; i += step) {
1635 if (occ1.at(i) == NoOccurrence)
1636 badCharScore += minl;
1637 else
1638 badCharScore += occ1.at(i);
1639 }
1640 badCharScore /= minl;
1641 useGoodStringHeuristic = (goodStringScore > badCharScore);
1642 }
1643 }
1644 #endif
1645
1646 #if defined(QT_DEBUG)
dump() const1647 void QRegExpEngine::dump() const
1648 {
1649 int i, j;
1650 qDebug("Case %ssensitive engine", cs ? "" : "in");
1651 qDebug(" States");
1652 for (i = 0; i < s.size(); i++) {
1653 qDebug(" %d%s", i, i == InitialState ? " (initial)" : i == FinalState ? " (final)" : "");
1654 #ifndef QT_NO_REGEXP_CAPTURE
1655 if (nf > 0)
1656 qDebug(" in atom %d", s[i].atom);
1657 #endif
1658 int m = s[i].match;
1659 if ((m & CharClassBit) != 0) {
1660 qDebug(" match character class %d", m ^ CharClassBit);
1661 #ifndef QT_NO_REGEXP_CCLASS
1662 cl[m ^ CharClassBit].dump();
1663 #else
1664 qDebug(" negative character class");
1665 #endif
1666 } else if ((m & BackRefBit) != 0) {
1667 qDebug(" match back-reference %d", m ^ BackRefBit);
1668 } else if (m >= 0x20 && m <= 0x7e) {
1669 qDebug(" match 0x%.4x (%c)", m, m);
1670 } else {
1671 qDebug(" match 0x%.4x", m);
1672 }
1673 for (j = 0; j < s[i].outs.size(); j++) {
1674 int next = s[i].outs[j];
1675 qDebug(" -> %d", next);
1676 if (s[i].reenter.contains(next))
1677 qDebug(" [reenter %d]", s[i].reenter[next]);
1678 if (s[i].anchors.value(next) != 0)
1679 qDebug(" [anchors 0x%.8x]", s[i].anchors[next]);
1680 }
1681 }
1682 #ifndef QT_NO_REGEXP_CAPTURE
1683 if (nf > 0) {
1684 qDebug(" Atom Parent Capture");
1685 for (i = 0; i < nf; i++) {
1686 if (f[i].capture == QRegExpAtom::NoCapture) {
1687 qDebug(" %6d %6d nil", i, f[i].parent);
1688 } else {
1689 int cap = f[i].capture;
1690 bool official = captureForOfficialCapture.contains(cap);
1691 qDebug(" %6d %6d %6d %s", i, f[i].parent, f[i].capture,
1692 official ? "official" : "");
1693 }
1694 }
1695 }
1696 #endif
1697 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1698 for (i = 0; i < aa.size(); i++)
1699 qDebug(" Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a, aa[i].b);
1700 #endif
1701 }
1702 #endif
1703
setup()1704 void QRegExpEngine::setup()
1705 {
1706 ref = 1;
1707 #ifndef QT_NO_REGEXP_CAPTURE
1708 f.resize(32);
1709 nf = 0;
1710 cf = -1;
1711 #endif
1712 officialncap = 0;
1713 ncap = 0;
1714 #ifndef QT_NO_REGEXP_OPTIM
1715 caretAnchored = true;
1716 trivial = true;
1717 #endif
1718 valid = false;
1719 #ifndef QT_NO_REGEXP_BACKREF
1720 nbrefs = 0;
1721 #endif
1722 #ifndef QT_NO_REGEXP_OPTIM
1723 useGoodStringHeuristic = true;
1724 minl = 0;
1725 occ1.fill(0, NumBadChars);
1726 #endif
1727 }
1728
setupState(int match)1729 int QRegExpEngine::setupState(int match)
1730 {
1731 #ifndef QT_NO_REGEXP_CAPTURE
1732 s += QRegExpAutomatonState(cf, match);
1733 #else
1734 s += QRegExpAutomatonState(match);
1735 #endif
1736 return s.size() - 1;
1737 }
1738
1739 #ifndef QT_NO_REGEXP_CAPTURE
1740 /*
1741 Functions startAtom() and finishAtom() should be called to delimit
1742 atoms. When a state is created, it is assigned to the current atom.
1743 The information is later used for capturing.
1744 */
startAtom(bool officialCapture)1745 int QRegExpEngine::startAtom(bool officialCapture)
1746 {
1747 if ((nf & (nf + 1)) == 0 && nf + 1 >= f.size())
1748 f.resize((nf + 1) << 1);
1749 f[nf].parent = cf;
1750 cf = nf++;
1751 f[cf].capture = officialCapture ? QRegExpAtom::OfficialCapture : QRegExpAtom::NoCapture;
1752 return cf;
1753 }
1754
finishAtom(int atom,bool needCapture)1755 void QRegExpEngine::finishAtom(int atom, bool needCapture)
1756 {
1757 if (greedyQuantifiers && needCapture && f[atom].capture == QRegExpAtom::NoCapture)
1758 f[atom].capture = QRegExpAtom::UnofficialCapture;
1759 cf = f.at(atom).parent;
1760 }
1761 #endif
1762
1763 #ifndef QT_NO_REGEXP_LOOKAHEAD
1764 /*
1765 Creates a lookahead anchor.
1766 */
addLookahead(QRegExpEngine * eng,bool negative)1767 int QRegExpEngine::addLookahead(QRegExpEngine *eng, bool negative)
1768 {
1769 int n = ahead.size();
1770 if (n == MaxLookaheads) {
1771 error(RXERR_LIMIT);
1772 return 0;
1773 }
1774 ahead += new QRegExpLookahead(eng, negative);
1775 return Anchor_FirstLookahead << n;
1776 }
1777 #endif
1778
1779 #ifndef QT_NO_REGEXP_CAPTURE
1780 /*
1781 We want the longest leftmost captures.
1782 */
isBetterCapture(int ncap,const int * begin1,const int * end1,const int * begin2,const int * end2)1783 static bool isBetterCapture(int ncap, const int *begin1, const int *end1, const int *begin2,
1784 const int *end2)
1785 {
1786 for (int i = 0; i < ncap; i++) {
1787 int delta = begin2[i] - begin1[i]; // it has to start early...
1788 if (delta == 0)
1789 delta = end1[i] - end2[i]; // ...and end late
1790
1791 if (delta != 0)
1792 return delta > 0;
1793 }
1794 return false;
1795 }
1796 #endif
1797
1798 /*
1799 Returns true if anchor a matches at position pos + i in the input
1800 string, otherwise false.
1801 */
testAnchor(int i,int a,const int * capBegin)1802 bool QRegExpMatchState::testAnchor(int i, int a, const int *capBegin)
1803 {
1804 int j;
1805
1806 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1807 if ((a & QRegExpEngine::Anchor_Alternation) != 0)
1808 return testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).a, capBegin)
1809 || testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).b, capBegin);
1810 #endif
1811
1812 if ((a & QRegExpEngine::Anchor_Caret) != 0) {
1813 if (pos + i != caretPos)
1814 return false;
1815 }
1816 if ((a & QRegExpEngine::Anchor_Dollar) != 0) {
1817 if (pos + i != len)
1818 return false;
1819 }
1820 #ifndef QT_NO_REGEXP_ESCAPE
1821 if ((a & (QRegExpEngine::Anchor_Word | QRegExpEngine::Anchor_NonWord)) != 0) {
1822 bool before = false;
1823 bool after = false;
1824 if (pos + i != 0)
1825 before = isWord(in[pos + i - 1]);
1826 if (pos + i != len)
1827 after = isWord(in[pos + i]);
1828 if ((a & QRegExpEngine::Anchor_Word) != 0 && (before == after))
1829 return false;
1830 if ((a & QRegExpEngine::Anchor_NonWord) != 0 && (before != after))
1831 return false;
1832 }
1833 #endif
1834 #ifndef QT_NO_REGEXP_LOOKAHEAD
1835 if ((a & QRegExpEngine::Anchor_LookaheadMask) != 0) {
1836 const QVector<QRegExpLookahead *> &ahead = eng->ahead;
1837 for (j = 0; j < ahead.size(); j++) {
1838 if ((a & (QRegExpEngine::Anchor_FirstLookahead << j)) != 0) {
1839 QRegExpMatchState matchState;
1840 matchState.prepareForMatch(ahead[j]->eng);
1841 matchState.match(in + pos + i, len - pos - i, 0,
1842 true, true, caretPos - pos - i);
1843 if ((matchState.captured[0] == 0) == ahead[j]->neg)
1844 return false;
1845 }
1846 }
1847 }
1848 #endif
1849 #ifndef QT_NO_REGEXP_CAPTURE
1850 #ifndef QT_NO_REGEXP_BACKREF
1851 for (j = 0; j < eng->nbrefs; j++) {
1852 if ((a & (QRegExpEngine::Anchor_BackRef1Empty << j)) != 0) {
1853 int i = eng->captureForOfficialCapture.at(j);
1854 if (capBegin[i] != EmptyCapture)
1855 return false;
1856 }
1857 }
1858 #endif
1859 #endif
1860 return true;
1861 }
1862
1863 #ifndef QT_NO_REGEXP_OPTIM
1864 /*
1865 The three following functions are what Jeffrey Friedl would call
1866 transmissions (or bump-alongs). Using one or the other should make
1867 no difference except in performance.
1868 */
1869
goodStringMatch(QRegExpMatchState & matchState) const1870 bool QRegExpEngine::goodStringMatch(QRegExpMatchState &matchState) const
1871 {
1872 int k = matchState.pos + goodEarlyStart;
1873 QStringMatcher matcher(goodStr.unicode(), goodStr.length(), cs);
1874 while ((k = matcher.indexIn(matchState.in, matchState.len, k)) != -1) {
1875 int from = k - goodLateStart;
1876 int to = k - goodEarlyStart;
1877 if (from > matchState.pos)
1878 matchState.pos = from;
1879
1880 while (matchState.pos <= to) {
1881 if (matchState.matchHere())
1882 return true;
1883 ++matchState.pos;
1884 }
1885 ++k;
1886 }
1887 return false;
1888 }
1889
badCharMatch(QRegExpMatchState & matchState) const1890 bool QRegExpEngine::badCharMatch(QRegExpMatchState &matchState) const
1891 {
1892 int slideHead = 0;
1893 int slideNext = 0;
1894 int i;
1895 int lastPos = matchState.len - minl;
1896 memset(matchState.slideTab, 0, matchState.slideTabSize * sizeof(int));
1897
1898 /*
1899 Set up the slide table, used for the bad-character heuristic,
1900 using the table of first occurrence of each character.
1901 */
1902 for (i = 0; i < minl; i++) {
1903 int sk = occ1[BadChar(matchState.in[matchState.pos + i])];
1904 if (sk == NoOccurrence)
1905 sk = i + 1;
1906 if (sk > 0) {
1907 int k = i + 1 - sk;
1908 if (k < 0) {
1909 sk = i + 1;
1910 k = 0;
1911 }
1912 if (sk > matchState.slideTab[k])
1913 matchState.slideTab[k] = sk;
1914 }
1915 }
1916
1917 if (matchState.pos > lastPos)
1918 return false;
1919
1920 for (;;) {
1921 if (++slideNext >= matchState.slideTabSize)
1922 slideNext = 0;
1923 if (matchState.slideTab[slideHead] > 0) {
1924 if (matchState.slideTab[slideHead] - 1 > matchState.slideTab[slideNext])
1925 matchState.slideTab[slideNext] = matchState.slideTab[slideHead] - 1;
1926 matchState.slideTab[slideHead] = 0;
1927 } else {
1928 if (matchState.matchHere())
1929 return true;
1930 }
1931
1932 if (matchState.pos == lastPos)
1933 break;
1934
1935 /*
1936 Update the slide table. This code has much in common with
1937 the initialization code.
1938 */
1939 int sk = occ1[BadChar(matchState.in[matchState.pos + minl])];
1940 if (sk == NoOccurrence) {
1941 matchState.slideTab[slideNext] = minl;
1942 } else if (sk > 0) {
1943 int k = slideNext + minl - sk;
1944 if (k >= matchState.slideTabSize)
1945 k -= matchState.slideTabSize;
1946 if (sk > matchState.slideTab[k])
1947 matchState.slideTab[k] = sk;
1948 }
1949 slideHead = slideNext;
1950 ++matchState.pos;
1951 }
1952 return false;
1953 }
1954 #else
bruteMatch(QRegExpMatchState & matchState) const1955 bool QRegExpEngine::bruteMatch(QRegExpMatchState &matchState) const
1956 {
1957 while (matchState.pos <= matchState.len) {
1958 if (matchState.matchHere())
1959 return true;
1960 ++matchState.pos;
1961 }
1962 return false;
1963 }
1964 #endif
1965
1966 /*
1967 Here's the core of the engine. It tries to do a match here and now.
1968 */
matchHere()1969 bool QRegExpMatchState::matchHere()
1970 {
1971 int ncur = 1, nnext = 0;
1972 int i = 0, j, k, m;
1973 bool stop = false;
1974
1975 matchLen = -1;
1976 oneTestMatchedLen = -1;
1977 curStack[0] = QRegExpEngine::InitialState;
1978
1979 int ncap = eng->ncap;
1980 #ifndef QT_NO_REGEXP_CAPTURE
1981 if (ncap > 0) {
1982 for (j = 0; j < ncap; j++) {
1983 curCapBegin[j] = EmptyCapture;
1984 curCapEnd[j] = EmptyCapture;
1985 }
1986 }
1987 #endif
1988
1989 #ifndef QT_NO_REGEXP_BACKREF
1990 while ((ncur > 0 || !sleeping.isEmpty()) && i <= len - pos && !stop)
1991 #else
1992 while (ncur > 0 && i <= len - pos && !stop)
1993 #endif
1994 {
1995 int ch = (i < len - pos) ? in[pos + i].unicode() : 0;
1996 for (j = 0; j < ncur; j++) {
1997 int cur = curStack[j];
1998 const QRegExpAutomatonState &scur = eng->s.at(cur);
1999 const QVector<int> &outs = scur.outs;
2000 for (k = 0; k < outs.size(); k++) {
2001 int next = outs.at(k);
2002 const QRegExpAutomatonState &snext = eng->s.at(next);
2003 bool inside = true;
2004 #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2005 int needSomeSleep = 0;
2006 #endif
2007
2008 /*
2009 First, check if the anchors are anchored properly.
2010 */
2011 int a = scur.anchors.value(next);
2012 if (a != 0 && !testAnchor(i, a, curCapBegin + j * ncap))
2013 inside = false;
2014
2015 /*
2016 If indeed they are, check if the input character is
2017 correct for this transition.
2018 */
2019 if (inside) {
2020 m = snext.match;
2021 if ((m & (QRegExpEngine::CharClassBit | QRegExpEngine::BackRefBit)) == 0) {
2022 if (eng->cs)
2023 inside = (m == ch);
2024 else
2025 inside = (QChar(m).toLower() == QChar(ch).toLower());
2026 } else if (next == QRegExpEngine::FinalState) {
2027 matchLen = i;
2028 stop = minimal;
2029 inside = true;
2030 } else if ((m & QRegExpEngine::CharClassBit) != 0) {
2031 #ifndef QT_NO_REGEXP_CCLASS
2032 const QRegExpCharClass &cc = eng->cl.at(m ^ QRegExpEngine::CharClassBit);
2033 if (eng->cs)
2034 inside = cc.in(ch);
2035 else if (cc.negative())
2036 inside = cc.in(QChar(ch).toLower()) &&
2037 cc.in(QChar(ch).toUpper());
2038 else
2039 inside = cc.in(QChar(ch).toLower()) ||
2040 cc.in(QChar(ch).toUpper());
2041 #endif
2042 #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2043 } else { /* ((m & QRegExpEngine::BackRefBit) != 0) */
2044 int bref = m ^ QRegExpEngine::BackRefBit;
2045 int ell = j * ncap + eng->captureForOfficialCapture.at(bref - 1);
2046
2047 inside = bref <= ncap && curCapBegin[ell] != EmptyCapture;
2048 if (inside) {
2049 if (eng->cs)
2050 inside = (in[pos + curCapBegin[ell]] == QChar(ch));
2051 else
2052 inside = (in[pos + curCapBegin[ell]].toLower()
2053 == QChar(ch).toLower());
2054 }
2055
2056 if (inside) {
2057 int delta;
2058 if (curCapEnd[ell] == EmptyCapture)
2059 delta = i - curCapBegin[ell];
2060 else
2061 delta = curCapEnd[ell] - curCapBegin[ell];
2062
2063 inside = (delta <= len - (pos + i));
2064 if (inside && delta > 1) {
2065 int n = 1;
2066 if (eng->cs) {
2067 while (n < delta) {
2068 if (in[pos + curCapBegin[ell] + n]
2069 != in[pos + i + n])
2070 break;
2071 ++n;
2072 }
2073 } else {
2074 while (n < delta) {
2075 QChar a = in[pos + curCapBegin[ell] + n];
2076 QChar b = in[pos + i + n];
2077 if (a.toLower() != b.toLower())
2078 break;
2079 ++n;
2080 }
2081 }
2082 inside = (n == delta);
2083 if (inside)
2084 needSomeSleep = delta - 1;
2085 }
2086 }
2087 #endif
2088 }
2089 }
2090
2091 /*
2092 We must now update our data structures.
2093 */
2094 if (inside) {
2095 #ifndef QT_NO_REGEXP_CAPTURE
2096 int *capBegin, *capEnd;
2097 #endif
2098 /*
2099 If the next state was not encountered yet, all
2100 is fine.
2101 */
2102 if ((m = inNextStack[next]) == -1) {
2103 m = nnext++;
2104 nextStack[m] = next;
2105 inNextStack[next] = m;
2106 #ifndef QT_NO_REGEXP_CAPTURE
2107 capBegin = nextCapBegin + m * ncap;
2108 capEnd = nextCapEnd + m * ncap;
2109
2110 /*
2111 Otherwise, we'll first maintain captures in
2112 temporary arrays, and decide at the end whether
2113 it's best to keep the previous capture zones or
2114 the new ones.
2115 */
2116 } else {
2117 capBegin = tempCapBegin;
2118 capEnd = tempCapEnd;
2119 #endif
2120 }
2121
2122 #ifndef QT_NO_REGEXP_CAPTURE
2123 /*
2124 Updating the capture zones is much of a task.
2125 */
2126 if (ncap > 0) {
2127 memcpy(capBegin, curCapBegin + j * ncap, ncap * sizeof(int));
2128 memcpy(capEnd, curCapEnd + j * ncap, ncap * sizeof(int));
2129 int c = scur.atom, n = snext.atom;
2130 int p = -1, q = -1;
2131 int cap;
2132
2133 /*
2134 Lemma 1. For any x in the range [0..nf), we
2135 have f[x].parent < x.
2136
2137 Proof. By looking at startAtom(), it is
2138 clear that cf < nf holds all the time, and
2139 thus that f[nf].parent < nf.
2140 */
2141
2142 /*
2143 If we are reentering an atom, we empty all
2144 capture zones inside it.
2145 */
2146 if ((q = scur.reenter.value(next)) != 0) {
2147 QBitArray b(eng->nf, false);
2148 b.setBit(q, true);
2149 for (int ell = q + 1; ell < eng->nf; ell++) {
2150 if (b.testBit(eng->f.at(ell).parent)) {
2151 b.setBit(ell, true);
2152 cap = eng->f.at(ell).capture;
2153 if (cap >= 0) {
2154 capBegin[cap] = EmptyCapture;
2155 capEnd[cap] = EmptyCapture;
2156 }
2157 }
2158 }
2159 p = eng->f.at(q).parent;
2160
2161 /*
2162 Otherwise, close the capture zones we are
2163 leaving. We are leaving f[c].capture,
2164 f[f[c].parent].capture,
2165 f[f[f[c].parent].parent].capture, ...,
2166 until f[x].capture, with x such that
2167 f[x].parent is the youngest common ancestor
2168 for c and n.
2169
2170 We go up along c's and n's ancestry until
2171 we find x.
2172 */
2173 } else {
2174 p = c;
2175 q = n;
2176 while (p != q) {
2177 if (p > q) {
2178 cap = eng->f.at(p).capture;
2179 if (cap >= 0) {
2180 if (capBegin[cap] == i) {
2181 capBegin[cap] = EmptyCapture;
2182 capEnd[cap] = EmptyCapture;
2183 } else {
2184 capEnd[cap] = i;
2185 }
2186 }
2187 p = eng->f.at(p).parent;
2188 } else {
2189 q = eng->f.at(q).parent;
2190 }
2191 }
2192 }
2193
2194 /*
2195 In any case, we now open the capture zones
2196 we are entering. We work upwards from n
2197 until we reach p (the parent of the atom we
2198 reenter or the youngest common ancestor).
2199 */
2200 while (n > p) {
2201 cap = eng->f.at(n).capture;
2202 if (cap >= 0) {
2203 capBegin[cap] = i;
2204 capEnd[cap] = EmptyCapture;
2205 }
2206 n = eng->f.at(n).parent;
2207 }
2208 /*
2209 If the next state was already in
2210 nextStack, we must choose carefully which
2211 capture zones we want to keep.
2212 */
2213 if (capBegin == tempCapBegin &&
2214 isBetterCapture(ncap, capBegin, capEnd, nextCapBegin + m * ncap,
2215 nextCapEnd + m * ncap)) {
2216 memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
2217 memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
2218 }
2219 }
2220 #ifndef QT_NO_REGEXP_BACKREF
2221 /*
2222 We are done with updating the capture zones.
2223 It's now time to put the next state to sleep,
2224 if it needs to, and to remove it from
2225 nextStack.
2226 */
2227 if (needSomeSleep > 0) {
2228 QVector<int> zzZ(2 + 2 * ncap);
2229 zzZ[0] = i + needSomeSleep;
2230 zzZ[1] = next;
2231 if (ncap > 0) {
2232 memcpy(zzZ.data() + 2, capBegin, ncap * sizeof(int));
2233 memcpy(zzZ.data() + 2 + ncap, capEnd, ncap * sizeof(int));
2234 }
2235 inNextStack[nextStack[--nnext]] = -1;
2236 sleeping.append(zzZ);
2237 }
2238 #endif
2239 #endif
2240 }
2241 }
2242 }
2243 #ifndef QT_NO_REGEXP_CAPTURE
2244 /*
2245 If we reached the final state, hurray! Copy the captured
2246 zone.
2247 */
2248 if (ncap > 0 && (m = inNextStack[QRegExpEngine::FinalState]) != -1) {
2249 memcpy(capBegin, nextCapBegin + m * ncap, ncap * sizeof(int));
2250 memcpy(capEnd, nextCapEnd + m * ncap, ncap * sizeof(int));
2251 }
2252 #ifndef QT_NO_REGEXP_BACKREF
2253 /*
2254 It's time to wake up the sleepers.
2255 */
2256 j = 0;
2257 while (j < sleeping.count()) {
2258 if (sleeping.at(j)[0] == i) {
2259 const QVector<int> &zzZ = sleeping.at(j);
2260 int next = zzZ[1];
2261 const int *capBegin = zzZ.data() + 2;
2262 const int *capEnd = zzZ.data() + 2 + ncap;
2263 bool copyOver = true;
2264
2265 if ((m = inNextStack[next]) == -1) {
2266 m = nnext++;
2267 nextStack[m] = next;
2268 inNextStack[next] = m;
2269 } else {
2270 copyOver = isBetterCapture(ncap, nextCapBegin + m * ncap, nextCapEnd + m * ncap,
2271 capBegin, capEnd);
2272 }
2273 if (copyOver) {
2274 memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
2275 memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
2276 }
2277
2278 sleeping.removeAt(j);
2279 } else {
2280 ++j;
2281 }
2282 }
2283 #endif
2284 #endif
2285 for (j = 0; j < nnext; j++)
2286 inNextStack[nextStack[j]] = -1;
2287
2288 // avoid needless iteration that confuses oneTestMatchedLen
2289 if (nnext == 1 && nextStack[0] == QRegExpEngine::FinalState
2290 #ifndef QT_NO_REGEXP_BACKREF
2291 && sleeping.isEmpty()
2292 #endif
2293 )
2294 stop = true;
2295
2296 qSwap(curStack, nextStack);
2297 #ifndef QT_NO_REGEXP_CAPTURE
2298 qSwap(curCapBegin, nextCapBegin);
2299 qSwap(curCapEnd, nextCapEnd);
2300 #endif
2301 ncur = nnext;
2302 nnext = 0;
2303 ++i;
2304 }
2305
2306 #ifndef QT_NO_REGEXP_BACKREF
2307 /*
2308 If minimal matching is enabled, we might have some sleepers
2309 left.
2310 */
2311 if (!sleeping.isEmpty())
2312 sleeping.clear();
2313 #endif
2314
2315 oneTestMatchedLen = i - 1;
2316 return (matchLen >= 0);
2317 }
2318
2319 #ifndef QT_NO_REGEXP_CCLASS
2320
QRegExpCharClass()2321 QRegExpCharClass::QRegExpCharClass()
2322 : c(0), n(false)
2323 {
2324 #ifndef QT_NO_REGEXP_OPTIM
2325 occ1.fill(NoOccurrence, NumBadChars);
2326 #endif
2327 }
2328
operator =(const QRegExpCharClass & cc)2329 QRegExpCharClass &QRegExpCharClass::operator=(const QRegExpCharClass &cc)
2330 {
2331 c = cc.c;
2332 r = cc.r;
2333 n = cc.n;
2334 #ifndef QT_NO_REGEXP_OPTIM
2335 occ1 = cc.occ1;
2336 #endif
2337 return *this;
2338 }
2339
clear()2340 void QRegExpCharClass::clear()
2341 {
2342 c = 0;
2343 r.resize(0);
2344 n = false;
2345 }
2346
setNegative(bool negative)2347 void QRegExpCharClass::setNegative(bool negative)
2348 {
2349 n = negative;
2350 #ifndef QT_NO_REGEXP_OPTIM
2351 occ1.fill(0, NumBadChars);
2352 #endif
2353 }
2354
addCategories(int cats)2355 void QRegExpCharClass::addCategories(int cats)
2356 {
2357 c |= cats;
2358 #ifndef QT_NO_REGEXP_OPTIM
2359 occ1.fill(0, NumBadChars);
2360 #endif
2361 }
2362
addRange(ushort from,ushort to)2363 void QRegExpCharClass::addRange(ushort from, ushort to)
2364 {
2365 if (from > to)
2366 qSwap(from, to);
2367 int m = r.size();
2368 r.resize(m + 1);
2369 r[m].from = from;
2370 r[m].len = to - from + 1;
2371
2372 #ifndef QT_NO_REGEXP_OPTIM
2373 int i;
2374
2375 if (to - from < NumBadChars) {
2376 if (from % NumBadChars <= to % NumBadChars) {
2377 for (i = from % NumBadChars; i <= to % NumBadChars; i++)
2378 occ1[i] = 0;
2379 } else {
2380 for (i = 0; i <= to % NumBadChars; i++)
2381 occ1[i] = 0;
2382 for (i = from % NumBadChars; i < NumBadChars; i++)
2383 occ1[i] = 0;
2384 }
2385 } else {
2386 occ1.fill(0, NumBadChars);
2387 }
2388 #endif
2389 }
2390
in(QChar ch) const2391 bool QRegExpCharClass::in(QChar ch) const
2392 {
2393 #ifndef QT_NO_REGEXP_OPTIM
2394 if (occ1.at(BadChar(ch)) == NoOccurrence)
2395 return n;
2396 #endif
2397
2398 if (c != 0 && (c & (1 << (int)ch.category())) != 0)
2399 return !n;
2400
2401 const int uc = ch.unicode();
2402 int size = r.size();
2403
2404 for (int i = 0; i < size; ++i) {
2405 const QRegExpCharClassRange &range = r.at(i);
2406 if (uint(uc - range.from) < uint(r.at(i).len))
2407 return !n;
2408 }
2409 return n;
2410 }
2411
2412 #if defined(QT_DEBUG)
dump() const2413 void QRegExpCharClass::dump() const
2414 {
2415 int i;
2416 qDebug(" %stive character class", n ? "nega" : "posi");
2417 #ifndef QT_NO_REGEXP_CCLASS
2418 if (c != 0)
2419 qDebug(" categories 0x%.8x", c);
2420 #endif
2421 for (i = 0; i < r.size(); i++)
2422 qDebug(" 0x%.4x through 0x%.4x", r[i].from, r[i].from + r[i].len - 1);
2423 }
2424 #endif
2425 #endif
2426
Box(QRegExpEngine * engine)2427 QRegExpEngine::Box::Box(QRegExpEngine *engine)
2428 : eng(engine), skipanchors(0)
2429 #ifndef QT_NO_REGEXP_OPTIM
2430 , earlyStart(0), lateStart(0), maxl(0)
2431 #endif
2432 {
2433 #ifndef QT_NO_REGEXP_OPTIM
2434 occ1.fill(NoOccurrence, NumBadChars);
2435 #endif
2436 minl = 0;
2437 }
2438
operator =(const Box & b)2439 QRegExpEngine::Box &QRegExpEngine::Box::operator=(const Box &b)
2440 {
2441 eng = b.eng;
2442 ls = b.ls;
2443 rs = b.rs;
2444 lanchors = b.lanchors;
2445 ranchors = b.ranchors;
2446 skipanchors = b.skipanchors;
2447 #ifndef QT_NO_REGEXP_OPTIM
2448 earlyStart = b.earlyStart;
2449 lateStart = b.lateStart;
2450 str = b.str;
2451 leftStr = b.leftStr;
2452 rightStr = b.rightStr;
2453 maxl = b.maxl;
2454 occ1 = b.occ1;
2455 #endif
2456 minl = b.minl;
2457 return *this;
2458 }
2459
set(QChar ch)2460 void QRegExpEngine::Box::set(QChar ch)
2461 {
2462 ls.resize(1);
2463 ls[0] = eng->createState(ch);
2464 rs = ls;
2465 #ifndef QT_NO_REGEXP_OPTIM
2466 str = ch;
2467 leftStr = ch;
2468 rightStr = ch;
2469 maxl = 1;
2470 occ1[BadChar(ch)] = 0;
2471 #endif
2472 minl = 1;
2473 }
2474
set(const QRegExpCharClass & cc)2475 void QRegExpEngine::Box::set(const QRegExpCharClass &cc)
2476 {
2477 ls.resize(1);
2478 ls[0] = eng->createState(cc);
2479 rs = ls;
2480 #ifndef QT_NO_REGEXP_OPTIM
2481 maxl = 1;
2482 occ1 = cc.firstOccurrence();
2483 #endif
2484 minl = 1;
2485 }
2486
2487 #ifndef QT_NO_REGEXP_BACKREF
set(int bref)2488 void QRegExpEngine::Box::set(int bref)
2489 {
2490 ls.resize(1);
2491 ls[0] = eng->createState(bref);
2492 rs = ls;
2493 if (bref >= 1 && bref <= MaxBackRefs)
2494 skipanchors = Anchor_BackRef0Empty << bref;
2495 #ifndef QT_NO_REGEXP_OPTIM
2496 maxl = InftyLen;
2497 #endif
2498 minl = 0;
2499 }
2500 #endif
2501
cat(const Box & b)2502 void QRegExpEngine::Box::cat(const Box &b)
2503 {
2504 eng->addCatTransitions(rs, b.ls);
2505 addAnchorsToEngine(b);
2506 if (minl == 0) {
2507 lanchors.unite(b.lanchors);
2508 if (skipanchors != 0) {
2509 for (int i = 0; i < b.ls.size(); i++) {
2510 int a = eng->anchorConcatenation(lanchors.value(b.ls.at(i), 0), skipanchors);
2511 lanchors.insert(b.ls.at(i), a);
2512 }
2513 }
2514 mergeInto(&ls, b.ls);
2515 }
2516 if (b.minl == 0) {
2517 ranchors.unite(b.ranchors);
2518 if (b.skipanchors != 0) {
2519 for (int i = 0; i < rs.size(); i++) {
2520 int a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), b.skipanchors);
2521 ranchors.insert(rs.at(i), a);
2522 }
2523 }
2524 mergeInto(&rs, b.rs);
2525 } else {
2526 ranchors = b.ranchors;
2527 rs = b.rs;
2528 }
2529
2530 #ifndef QT_NO_REGEXP_OPTIM
2531 if (maxl != InftyLen) {
2532 if (rightStr.length() + b.leftStr.length() >
2533 qMax(str.length(), b.str.length())) {
2534 earlyStart = minl - rightStr.length();
2535 lateStart = maxl - rightStr.length();
2536 str = rightStr + b.leftStr;
2537 } else if (b.str.length() > str.length()) {
2538 earlyStart = minl + b.earlyStart;
2539 lateStart = maxl + b.lateStart;
2540 str = b.str;
2541 }
2542 }
2543
2544 if (leftStr.length() == maxl)
2545 leftStr += b.leftStr;
2546
2547 if (b.rightStr.length() == b.maxl) {
2548 rightStr += b.rightStr;
2549 } else {
2550 rightStr = b.rightStr;
2551 }
2552
2553 if (maxl == InftyLen || b.maxl == InftyLen) {
2554 maxl = InftyLen;
2555 } else {
2556 maxl += b.maxl;
2557 }
2558
2559 for (int i = 0; i < NumBadChars; i++) {
2560 if (b.occ1.at(i) != NoOccurrence && minl + b.occ1.at(i) < occ1.at(i))
2561 occ1[i] = minl + b.occ1.at(i);
2562 }
2563 #endif
2564
2565 minl += b.minl;
2566 if (minl == 0)
2567 skipanchors = eng->anchorConcatenation(skipanchors, b.skipanchors);
2568 else
2569 skipanchors = 0;
2570 }
2571
orx(const Box & b)2572 void QRegExpEngine::Box::orx(const Box &b)
2573 {
2574 mergeInto(&ls, b.ls);
2575 lanchors.unite(b.lanchors);
2576 mergeInto(&rs, b.rs);
2577 ranchors.unite(b.ranchors);
2578
2579 if (b.minl == 0) {
2580 if (minl == 0)
2581 skipanchors = eng->anchorAlternation(skipanchors, b.skipanchors);
2582 else
2583 skipanchors = b.skipanchors;
2584 }
2585
2586 #ifndef QT_NO_REGEXP_OPTIM
2587 for (int i = 0; i < NumBadChars; i++) {
2588 if (occ1.at(i) > b.occ1.at(i))
2589 occ1[i] = b.occ1.at(i);
2590 }
2591 earlyStart = 0;
2592 lateStart = 0;
2593 str = QString();
2594 leftStr = QString();
2595 rightStr = QString();
2596 if (b.maxl > maxl)
2597 maxl = b.maxl;
2598 #endif
2599 if (b.minl < minl)
2600 minl = b.minl;
2601 }
2602
plus(int atom)2603 void QRegExpEngine::Box::plus(int atom)
2604 {
2605 #ifndef QT_NO_REGEXP_CAPTURE
2606 eng->addPlusTransitions(rs, ls, atom);
2607 #else
2608 Q_UNUSED(atom);
2609 eng->addCatTransitions(rs, ls);
2610 #endif
2611 addAnchorsToEngine(*this);
2612 #ifndef QT_NO_REGEXP_OPTIM
2613 maxl = InftyLen;
2614 #endif
2615 }
2616
opt()2617 void QRegExpEngine::Box::opt()
2618 {
2619 #ifndef QT_NO_REGEXP_OPTIM
2620 earlyStart = 0;
2621 lateStart = 0;
2622 str = QString();
2623 leftStr = QString();
2624 rightStr = QString();
2625 #endif
2626 skipanchors = 0;
2627 minl = 0;
2628 }
2629
catAnchor(int a)2630 void QRegExpEngine::Box::catAnchor(int a)
2631 {
2632 if (a != 0) {
2633 for (int i = 0; i < rs.size(); i++) {
2634 a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), a);
2635 ranchors.insert(rs.at(i), a);
2636 }
2637 if (minl == 0)
2638 skipanchors = eng->anchorConcatenation(skipanchors, a);
2639 }
2640 }
2641
2642 #ifndef QT_NO_REGEXP_OPTIM
setupHeuristics()2643 void QRegExpEngine::Box::setupHeuristics()
2644 {
2645 eng->goodEarlyStart = earlyStart;
2646 eng->goodLateStart = lateStart;
2647 eng->goodStr = eng->cs ? str : str.toLower();
2648
2649 eng->minl = minl;
2650 if (eng->cs) {
2651 /*
2652 A regular expression such as 112|1 has occ1['2'] = 2 and minl =
2653 1 at this point. An entry of occ1 has to be at most minl or
2654 infinity for the rest of the algorithm to go well.
2655
2656 We waited until here before normalizing these cases (instead of
2657 doing it in Box::orx()) because sometimes things improve by
2658 themselves. Consider for example (112|1)34.
2659 */
2660 for (int i = 0; i < NumBadChars; i++) {
2661 if (occ1.at(i) != NoOccurrence && occ1.at(i) >= minl)
2662 occ1[i] = minl;
2663 }
2664 eng->occ1 = occ1;
2665 } else {
2666 eng->occ1.fill(0, NumBadChars);
2667 }
2668
2669 eng->heuristicallyChooseHeuristic();
2670 }
2671 #endif
2672
2673 #if defined(QT_DEBUG)
dump() const2674 void QRegExpEngine::Box::dump() const
2675 {
2676 int i;
2677 qDebug("Box of at least %d character%s", minl, minl == 1 ? "" : "s");
2678 qDebug(" Left states:");
2679 for (i = 0; i < ls.size(); i++) {
2680 if (lanchors.value(ls[i], 0) == 0)
2681 qDebug(" %d", ls[i]);
2682 else
2683 qDebug(" %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]]);
2684 }
2685 qDebug(" Right states:");
2686 for (i = 0; i < rs.size(); i++) {
2687 if (ranchors.value(rs[i], 0) == 0)
2688 qDebug(" %d", rs[i]);
2689 else
2690 qDebug(" %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]]);
2691 }
2692 qDebug(" Skip anchors: 0x%.8x", skipanchors);
2693 }
2694 #endif
2695
addAnchorsToEngine(const Box & to) const2696 void QRegExpEngine::Box::addAnchorsToEngine(const Box &to) const
2697 {
2698 for (int i = 0; i < to.ls.size(); i++) {
2699 for (int j = 0; j < rs.size(); j++) {
2700 int a = eng->anchorConcatenation(ranchors.value(rs.at(j), 0),
2701 to.lanchors.value(to.ls.at(i), 0));
2702 eng->addAnchors(rs[j], to.ls[i], a);
2703 }
2704 }
2705 }
2706
setupCategoriesRangeMap()2707 void QRegExpEngine::setupCategoriesRangeMap()
2708 {
2709 categoriesRangeMap.insert("IsBasicLatin", qMakePair(0x0000, 0x007F));
2710 categoriesRangeMap.insert("IsLatin-1Supplement", qMakePair(0x0080, 0x00FF));
2711 categoriesRangeMap.insert("IsLatinExtended-A", qMakePair(0x0100, 0x017F));
2712 categoriesRangeMap.insert("IsLatinExtended-B", qMakePair(0x0180, 0x024F));
2713 categoriesRangeMap.insert("IsIPAExtensions", qMakePair(0x0250, 0x02AF));
2714 categoriesRangeMap.insert("IsSpacingModifierLetters", qMakePair(0x02B0, 0x02FF));
2715 categoriesRangeMap.insert("IsCombiningDiacriticalMarks", qMakePair(0x0300, 0x036F));
2716 categoriesRangeMap.insert("IsGreek", qMakePair(0x0370, 0x03FF));
2717 categoriesRangeMap.insert("IsCyrillic", qMakePair(0x0400, 0x04FF));
2718 categoriesRangeMap.insert("IsCyrillicSupplement", qMakePair(0x0500, 0x052F));
2719 categoriesRangeMap.insert("IsArmenian", qMakePair(0x0530, 0x058F));
2720 categoriesRangeMap.insert("IsHebrew", qMakePair(0x0590, 0x05FF));
2721 categoriesRangeMap.insert("IsArabic", qMakePair(0x0600, 0x06FF));
2722 categoriesRangeMap.insert("IsSyriac", qMakePair(0x0700, 0x074F));
2723 categoriesRangeMap.insert("IsArabicSupplement", qMakePair(0x0750, 0x077F));
2724 categoriesRangeMap.insert("IsThaana", qMakePair(0x0780, 0x07BF));
2725 categoriesRangeMap.insert("IsDevanagari", qMakePair(0x0900, 0x097F));
2726 categoriesRangeMap.insert("IsBengali", qMakePair(0x0980, 0x09FF));
2727 categoriesRangeMap.insert("IsGurmukhi", qMakePair(0x0A00, 0x0A7F));
2728 categoriesRangeMap.insert("IsGujarati", qMakePair(0x0A80, 0x0AFF));
2729 categoriesRangeMap.insert("IsOriya", qMakePair(0x0B00, 0x0B7F));
2730 categoriesRangeMap.insert("IsTamil", qMakePair(0x0B80, 0x0BFF));
2731 categoriesRangeMap.insert("IsTelugu", qMakePair(0x0C00, 0x0C7F));
2732 categoriesRangeMap.insert("IsKannada", qMakePair(0x0C80, 0x0CFF));
2733 categoriesRangeMap.insert("IsMalayalam", qMakePair(0x0D00, 0x0D7F));
2734 categoriesRangeMap.insert("IsSinhala", qMakePair(0x0D80, 0x0DFF));
2735 categoriesRangeMap.insert("IsThai", qMakePair(0x0E00, 0x0E7F));
2736 categoriesRangeMap.insert("IsLao", qMakePair(0x0E80, 0x0EFF));
2737 categoriesRangeMap.insert("IsTibetan", qMakePair(0x0F00, 0x0FFF));
2738 categoriesRangeMap.insert("IsMyanmar", qMakePair(0x1000, 0x109F));
2739 categoriesRangeMap.insert("IsGeorgian", qMakePair(0x10A0, 0x10FF));
2740 categoriesRangeMap.insert("IsHangulJamo", qMakePair(0x1100, 0x11FF));
2741 categoriesRangeMap.insert("IsEthiopic", qMakePair(0x1200, 0x137F));
2742 categoriesRangeMap.insert("IsEthiopicSupplement", qMakePair(0x1380, 0x139F));
2743 categoriesRangeMap.insert("IsCherokee", qMakePair(0x13A0, 0x13FF));
2744 categoriesRangeMap.insert("IsUnifiedCanadianAboriginalSyllabics", qMakePair(0x1400, 0x167F));
2745 categoriesRangeMap.insert("IsOgham", qMakePair(0x1680, 0x169F));
2746 categoriesRangeMap.insert("IsRunic", qMakePair(0x16A0, 0x16FF));
2747 categoriesRangeMap.insert("IsTagalog", qMakePair(0x1700, 0x171F));
2748 categoriesRangeMap.insert("IsHanunoo", qMakePair(0x1720, 0x173F));
2749 categoriesRangeMap.insert("IsBuhid", qMakePair(0x1740, 0x175F));
2750 categoriesRangeMap.insert("IsTagbanwa", qMakePair(0x1760, 0x177F));
2751 categoriesRangeMap.insert("IsKhmer", qMakePair(0x1780, 0x17FF));
2752 categoriesRangeMap.insert("IsMongolian", qMakePair(0x1800, 0x18AF));
2753 categoriesRangeMap.insert("IsLimbu", qMakePair(0x1900, 0x194F));
2754 categoriesRangeMap.insert("IsTaiLe", qMakePair(0x1950, 0x197F));
2755 categoriesRangeMap.insert("IsNewTaiLue", qMakePair(0x1980, 0x19DF));
2756 categoriesRangeMap.insert("IsKhmerSymbols", qMakePair(0x19E0, 0x19FF));
2757 categoriesRangeMap.insert("IsBuginese", qMakePair(0x1A00, 0x1A1F));
2758 categoriesRangeMap.insert("IsPhoneticExtensions", qMakePair(0x1D00, 0x1D7F));
2759 categoriesRangeMap.insert("IsPhoneticExtensionsSupplement", qMakePair(0x1D80, 0x1DBF));
2760 categoriesRangeMap.insert("IsCombiningDiacriticalMarksSupplement", qMakePair(0x1DC0, 0x1DFF));
2761 categoriesRangeMap.insert("IsLatinExtendedAdditional", qMakePair(0x1E00, 0x1EFF));
2762 categoriesRangeMap.insert("IsGreekExtended", qMakePair(0x1F00, 0x1FFF));
2763 categoriesRangeMap.insert("IsGeneralPunctuation", qMakePair(0x2000, 0x206F));
2764 categoriesRangeMap.insert("IsSuperscriptsandSubscripts", qMakePair(0x2070, 0x209F));
2765 categoriesRangeMap.insert("IsCurrencySymbols", qMakePair(0x20A0, 0x20CF));
2766 categoriesRangeMap.insert("IsCombiningMarksforSymbols", qMakePair(0x20D0, 0x20FF));
2767 categoriesRangeMap.insert("IsLetterlikeSymbols", qMakePair(0x2100, 0x214F));
2768 categoriesRangeMap.insert("IsNumberForms", qMakePair(0x2150, 0x218F));
2769 categoriesRangeMap.insert("IsArrows", qMakePair(0x2190, 0x21FF));
2770 categoriesRangeMap.insert("IsMathematicalOperators", qMakePair(0x2200, 0x22FF));
2771 categoriesRangeMap.insert("IsMiscellaneousTechnical", qMakePair(0x2300, 0x23FF));
2772 categoriesRangeMap.insert("IsControlPictures", qMakePair(0x2400, 0x243F));
2773 categoriesRangeMap.insert("IsOpticalCharacterRecognition", qMakePair(0x2440, 0x245F));
2774 categoriesRangeMap.insert("IsEnclosedAlphanumerics", qMakePair(0x2460, 0x24FF));
2775 categoriesRangeMap.insert("IsBoxDrawing", qMakePair(0x2500, 0x257F));
2776 categoriesRangeMap.insert("IsBlockElements", qMakePair(0x2580, 0x259F));
2777 categoriesRangeMap.insert("IsGeometricShapes", qMakePair(0x25A0, 0x25FF));
2778 categoriesRangeMap.insert("IsMiscellaneousSymbols", qMakePair(0x2600, 0x26FF));
2779 categoriesRangeMap.insert("IsDingbats", qMakePair(0x2700, 0x27BF));
2780 categoriesRangeMap.insert("IsMiscellaneousMathematicalSymbols-A", qMakePair(0x27C0, 0x27EF));
2781 categoriesRangeMap.insert("IsSupplementalArrows-A", qMakePair(0x27F0, 0x27FF));
2782 categoriesRangeMap.insert("IsBraillePatterns", qMakePair(0x2800, 0x28FF));
2783 categoriesRangeMap.insert("IsSupplementalArrows-B", qMakePair(0x2900, 0x297F));
2784 categoriesRangeMap.insert("IsMiscellaneousMathematicalSymbols-B", qMakePair(0x2980, 0x29FF));
2785 categoriesRangeMap.insert("IsSupplementalMathematicalOperators", qMakePair(0x2A00, 0x2AFF));
2786 categoriesRangeMap.insert("IsMiscellaneousSymbolsandArrows", qMakePair(0x2B00, 0x2BFF));
2787 categoriesRangeMap.insert("IsGlagolitic", qMakePair(0x2C00, 0x2C5F));
2788 categoriesRangeMap.insert("IsCoptic", qMakePair(0x2C80, 0x2CFF));
2789 categoriesRangeMap.insert("IsGeorgianSupplement", qMakePair(0x2D00, 0x2D2F));
2790 categoriesRangeMap.insert("IsTifinagh", qMakePair(0x2D30, 0x2D7F));
2791 categoriesRangeMap.insert("IsEthiopicExtended", qMakePair(0x2D80, 0x2DDF));
2792 categoriesRangeMap.insert("IsSupplementalPunctuation", qMakePair(0x2E00, 0x2E7F));
2793 categoriesRangeMap.insert("IsCJKRadicalsSupplement", qMakePair(0x2E80, 0x2EFF));
2794 categoriesRangeMap.insert("IsKangxiRadicals", qMakePair(0x2F00, 0x2FDF));
2795 categoriesRangeMap.insert("IsIdeographicDescriptionCharacters", qMakePair(0x2FF0, 0x2FFF));
2796 categoriesRangeMap.insert("IsCJKSymbolsandPunctuation", qMakePair(0x3000, 0x303F));
2797 categoriesRangeMap.insert("IsHiragana", qMakePair(0x3040, 0x309F));
2798 categoriesRangeMap.insert("IsKatakana", qMakePair(0x30A0, 0x30FF));
2799 categoriesRangeMap.insert("IsBopomofo", qMakePair(0x3100, 0x312F));
2800 categoriesRangeMap.insert("IsHangulCompatibilityJamo", qMakePair(0x3130, 0x318F));
2801 categoriesRangeMap.insert("IsKanbun", qMakePair(0x3190, 0x319F));
2802 categoriesRangeMap.insert("IsBopomofoExtended", qMakePair(0x31A0, 0x31BF));
2803 categoriesRangeMap.insert("IsCJKStrokes", qMakePair(0x31C0, 0x31EF));
2804 categoriesRangeMap.insert("IsKatakanaPhoneticExtensions", qMakePair(0x31F0, 0x31FF));
2805 categoriesRangeMap.insert("IsEnclosedCJKLettersandMonths", qMakePair(0x3200, 0x32FF));
2806 categoriesRangeMap.insert("IsCJKCompatibility", qMakePair(0x3300, 0x33FF));
2807 categoriesRangeMap.insert("IsCJKUnifiedIdeographsExtensionA", qMakePair(0x3400, 0x4DB5));
2808 categoriesRangeMap.insert("IsYijingHexagramSymbols", qMakePair(0x4DC0, 0x4DFF));
2809 categoriesRangeMap.insert("IsCJKUnifiedIdeographs", qMakePair(0x4E00, 0x9FFF));
2810 categoriesRangeMap.insert("IsYiSyllables", qMakePair(0xA000, 0xA48F));
2811 categoriesRangeMap.insert("IsYiRadicals", qMakePair(0xA490, 0xA4CF));
2812 categoriesRangeMap.insert("IsModifierToneLetters", qMakePair(0xA700, 0xA71F));
2813 categoriesRangeMap.insert("IsSylotiNagri", qMakePair(0xA800, 0xA82F));
2814 categoriesRangeMap.insert("IsHangulSyllables", qMakePair(0xAC00, 0xD7A3));
2815 categoriesRangeMap.insert("IsPrivateUse", qMakePair(0xE000, 0xF8FF));
2816 categoriesRangeMap.insert("IsCJKCompatibilityIdeographs", qMakePair(0xF900, 0xFAFF));
2817 categoriesRangeMap.insert("IsAlphabeticPresentationForms", qMakePair(0xFB00, 0xFB4F));
2818 categoriesRangeMap.insert("IsArabicPresentationForms-A", qMakePair(0xFB50, 0xFDFF));
2819 categoriesRangeMap.insert("IsVariationSelectors", qMakePair(0xFE00, 0xFE0F));
2820 categoriesRangeMap.insert("IsVerticalForms", qMakePair(0xFE10, 0xFE1F));
2821 categoriesRangeMap.insert("IsCombiningHalfMarks", qMakePair(0xFE20, 0xFE2F));
2822 categoriesRangeMap.insert("IsCJKCompatibilityForms", qMakePair(0xFE30, 0xFE4F));
2823 categoriesRangeMap.insert("IsSmallFormVariants", qMakePair(0xFE50, 0xFE6F));
2824 categoriesRangeMap.insert("IsArabicPresentationForms-B", qMakePair(0xFE70, 0xFEFF));
2825 categoriesRangeMap.insert("IsHalfwidthandFullwidthForms", qMakePair(0xFF00, 0xFFEF));
2826 categoriesRangeMap.insert("IsSpecials", qMakePair(0xFFF0, 0xFFFF));
2827 categoriesRangeMap.insert("IsLinearBSyllabary", qMakePair(0x10000, 0x1007F));
2828 categoriesRangeMap.insert("IsLinearBIdeograms", qMakePair(0x10080, 0x100FF));
2829 categoriesRangeMap.insert("IsAegeanNumbers", qMakePair(0x10100, 0x1013F));
2830 categoriesRangeMap.insert("IsAncientGreekNumbers", qMakePair(0x10140, 0x1018F));
2831 categoriesRangeMap.insert("IsOldItalic", qMakePair(0x10300, 0x1032F));
2832 categoriesRangeMap.insert("IsGothic", qMakePair(0x10330, 0x1034F));
2833 categoriesRangeMap.insert("IsUgaritic", qMakePair(0x10380, 0x1039F));
2834 categoriesRangeMap.insert("IsOldPersian", qMakePair(0x103A0, 0x103DF));
2835 categoriesRangeMap.insert("IsDeseret", qMakePair(0x10400, 0x1044F));
2836 categoriesRangeMap.insert("IsShavian", qMakePair(0x10450, 0x1047F));
2837 categoriesRangeMap.insert("IsOsmanya", qMakePair(0x10480, 0x104AF));
2838 categoriesRangeMap.insert("IsCypriotSyllabary", qMakePair(0x10800, 0x1083F));
2839 categoriesRangeMap.insert("IsKharoshthi", qMakePair(0x10A00, 0x10A5F));
2840 categoriesRangeMap.insert("IsByzantineMusicalSymbols", qMakePair(0x1D000, 0x1D0FF));
2841 categoriesRangeMap.insert("IsMusicalSymbols", qMakePair(0x1D100, 0x1D1FF));
2842 categoriesRangeMap.insert("IsAncientGreekMusicalNotation", qMakePair(0x1D200, 0x1D24F));
2843 categoriesRangeMap.insert("IsTaiXuanJingSymbols", qMakePair(0x1D300, 0x1D35F));
2844 categoriesRangeMap.insert("IsMathematicalAlphanumericSymbols", qMakePair(0x1D400, 0x1D7FF));
2845 categoriesRangeMap.insert("IsCJKUnifiedIdeographsExtensionB", qMakePair(0x20000, 0x2A6DF));
2846 categoriesRangeMap.insert("IsCJKCompatibilityIdeographsSupplement", qMakePair(0x2F800, 0x2FA1F));
2847 categoriesRangeMap.insert("IsTags", qMakePair(0xE0000, 0xE007F));
2848 categoriesRangeMap.insert("IsVariationSelectorsSupplement", qMakePair(0xE0100, 0xE01EF));
2849 categoriesRangeMap.insert("IsSupplementaryPrivateUseArea-A", qMakePair(0xF0000, 0xFFFFF));
2850 categoriesRangeMap.insert("IsSupplementaryPrivateUseArea-B", qMakePair(0x100000, 0x10FFFF));
2851 }
2852
getChar()2853 int QRegExpEngine::getChar()
2854 {
2855 return (yyPos == yyLen) ? EOS : yyIn[yyPos++].unicode();
2856 }
2857
getEscape()2858 int QRegExpEngine::getEscape()
2859 {
2860 #ifndef QT_NO_REGEXP_ESCAPE
2861 const char tab[] = "afnrtv"; // no b, as \b means word boundary
2862 const char backTab[] = "\a\f\n\r\t\v";
2863 ushort low;
2864 int i;
2865 #endif
2866 ushort val;
2867 int prevCh = yyCh;
2868
2869 if (prevCh == EOS) {
2870 error(RXERR_END);
2871 return Tok_Char | '\\';
2872 }
2873 yyCh = getChar();
2874 #ifndef QT_NO_REGEXP_ESCAPE
2875 if ((prevCh & ~0xff) == 0) {
2876 const char *p = strchr(tab, prevCh);
2877 if (p != 0)
2878 return Tok_Char | backTab[p - tab];
2879 }
2880 #endif
2881
2882 switch (prevCh) {
2883 #ifndef QT_NO_REGEXP_ESCAPE
2884 case '0':
2885 val = 0;
2886 for (i = 0; i < 3; i++) {
2887 if (yyCh >= '0' && yyCh <= '7')
2888 val = (val << 3) | (yyCh - '0');
2889 else
2890 break;
2891 yyCh = getChar();
2892 }
2893 if ((val & ~0377) != 0)
2894 error(RXERR_OCTAL);
2895 return Tok_Char | val;
2896 #endif
2897 #ifndef QT_NO_REGEXP_ESCAPE
2898 case 'B':
2899 return Tok_NonWord;
2900 #endif
2901 #ifndef QT_NO_REGEXP_CCLASS
2902 case 'D':
2903 // see QChar::isDigit()
2904 yyCharClass->addCategories(0x7fffffef);
2905 return Tok_CharClass;
2906 case 'S':
2907 // see QChar::isSpace()
2908 yyCharClass->addCategories(0x7ffff87f);
2909 yyCharClass->addRange(0x0000, 0x0008);
2910 yyCharClass->addRange(0x000e, 0x001f);
2911 yyCharClass->addRange(0x007f, 0x009f);
2912 return Tok_CharClass;
2913 case 'W':
2914 // see QChar::isLetterOrNumber() and QChar::isMark()
2915 yyCharClass->addCategories(0x7fe07f81);
2916 yyCharClass->addRange(0x203f, 0x2040);
2917 yyCharClass->addSingleton(0x2040);
2918 yyCharClass->addSingleton(0x2054);
2919 yyCharClass->addSingleton(0x30fb);
2920 yyCharClass->addRange(0xfe33, 0xfe34);
2921 yyCharClass->addRange(0xfe4d, 0xfe4f);
2922 yyCharClass->addSingleton(0xff3f);
2923 yyCharClass->addSingleton(0xff65);
2924 return Tok_CharClass;
2925 #endif
2926 #ifndef QT_NO_REGEXP_ESCAPE
2927 case 'b':
2928 return Tok_Word;
2929 #endif
2930 #ifndef QT_NO_REGEXP_CCLASS
2931 case 'd':
2932 // see QChar::isDigit()
2933 yyCharClass->addCategories(0x00000010);
2934 return Tok_CharClass;
2935 case 's':
2936 // see QChar::isSpace()
2937 yyCharClass->addCategories(0x00000380);
2938 yyCharClass->addRange(0x0009, 0x000d);
2939 return Tok_CharClass;
2940 case 'w':
2941 // see QChar::isLetterOrNumber() and QChar::isMark()
2942 yyCharClass->addCategories(0x000f807e);
2943 yyCharClass->addSingleton(0x005f); // '_'
2944 return Tok_CharClass;
2945 case 'I':
2946 if (xmlSchemaExtensions) {
2947 yyCharClass->setNegative(!yyCharClass->negative());
2948 // fall through
2949 }
2950 case 'i':
2951 if (xmlSchemaExtensions) {
2952 yyCharClass->addCategories(0x000f807e);
2953 yyCharClass->addSingleton(0x003a); // ':'
2954 yyCharClass->addSingleton(0x005f); // '_'
2955 yyCharClass->addRange(0x0041, 0x005a); // [A-Z]
2956 yyCharClass->addRange(0x0061, 0x007a); // [a-z]
2957 yyCharClass->addRange(0xc0, 0xd6);
2958 yyCharClass->addRange(0xd8, 0xf6);
2959 yyCharClass->addRange(0xf8, 0x2ff);
2960 yyCharClass->addRange(0x370, 0x37d);
2961 yyCharClass->addRange(0x37f, 0x1fff);
2962 yyCharClass->addRange(0x200c, 0x200d);
2963 yyCharClass->addRange(0x2070, 0x218f);
2964 yyCharClass->addRange(0x2c00, 0x2fef);
2965 yyCharClass->addRange(0x3001, 0xd7ff);
2966 yyCharClass->addRange(0xf900, 0xfdcf);
2967 yyCharClass->addRange(0xfdf0, 0xfffd);
2968 yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);
2969 }
2970 return Tok_CharClass;
2971 case 'C':
2972 if (xmlSchemaExtensions) {
2973 yyCharClass->setNegative(!yyCharClass->negative());
2974 // fall through
2975 }
2976 case 'c':
2977 if (xmlSchemaExtensions) {
2978 yyCharClass->addCategories(0x000f807e);
2979 yyCharClass->addSingleton(0x002d); // '-'
2980 yyCharClass->addSingleton(0x002e); // '.'
2981 yyCharClass->addSingleton(0x003a); // ':'
2982 yyCharClass->addSingleton(0x005f); // '_'
2983 yyCharClass->addSingleton(0xb7);
2984 yyCharClass->addRange(0x0030, 0x0039); // [0-9]
2985 yyCharClass->addRange(0x0041, 0x005a); // [A-Z]
2986 yyCharClass->addRange(0x0061, 0x007a); // [a-z]
2987 yyCharClass->addRange(0xc0, 0xd6);
2988 yyCharClass->addRange(0xd8, 0xf6);
2989 yyCharClass->addRange(0xf8, 0x2ff);
2990 yyCharClass->addRange(0x370, 0x37d);
2991 yyCharClass->addRange(0x37f, 0x1fff);
2992 yyCharClass->addRange(0x200c, 0x200d);
2993 yyCharClass->addRange(0x2070, 0x218f);
2994 yyCharClass->addRange(0x2c00, 0x2fef);
2995 yyCharClass->addRange(0x3001, 0xd7ff);
2996 yyCharClass->addRange(0xf900, 0xfdcf);
2997 yyCharClass->addRange(0xfdf0, 0xfffd);
2998 yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);
2999 yyCharClass->addRange(0x0300, 0x036f);
3000 yyCharClass->addRange(0x203f, 0x2040);
3001 }
3002 return Tok_CharClass;
3003 case 'P':
3004 if (xmlSchemaExtensions) {
3005 yyCharClass->setNegative(!yyCharClass->negative());
3006 // fall through
3007 }
3008 case 'p':
3009 if (xmlSchemaExtensions) {
3010 if (yyCh != '{') {
3011 error(RXERR_CHARCLASS);
3012 return Tok_CharClass;
3013 }
3014
3015 QByteArray category;
3016 yyCh = getChar();
3017 while (yyCh != '}') {
3018 if (yyCh == EOS) {
3019 error(RXERR_END);
3020 return Tok_CharClass;
3021 }
3022 category.append(yyCh);
3023 yyCh = getChar();
3024 }
3025 yyCh = getChar(); // skip closing '}'
3026
3027 if (category == "M") {
3028 yyCharClass->addCategories(0x0000000e);
3029 } else if (category == "Mn") {
3030 yyCharClass->addCategories(0x00000002);
3031 } else if (category == "Mc") {
3032 yyCharClass->addCategories(0x00000004);
3033 } else if (category == "Me") {
3034 yyCharClass->addCategories(0x00000008);
3035 } else if (category == "N") {
3036 yyCharClass->addCategories(0x00000070);
3037 } else if (category == "Nd") {
3038 yyCharClass->addCategories(0x00000010);
3039 } else if (category == "Nl") {
3040 yyCharClass->addCategories(0x00000020);
3041 } else if (category == "No") {
3042 yyCharClass->addCategories(0x00000040);
3043 } else if (category == "Z") {
3044 yyCharClass->addCategories(0x00000380);
3045 } else if (category == "Zs") {
3046 yyCharClass->addCategories(0x00000080);
3047 } else if (category == "Zl") {
3048 yyCharClass->addCategories(0x00000100);
3049 } else if (category == "Zp") {
3050 yyCharClass->addCategories(0x00000200);
3051 } else if (category == "C") {
3052 yyCharClass->addCategories(0x00006c00);
3053 } else if (category == "Cc") {
3054 yyCharClass->addCategories(0x00000400);
3055 } else if (category == "Cf") {
3056 yyCharClass->addCategories(0x00000800);
3057 } else if (category == "Cs") {
3058 yyCharClass->addCategories(0x00001000);
3059 } else if (category == "Co") {
3060 yyCharClass->addCategories(0x00002000);
3061 } else if (category == "Cn") {
3062 yyCharClass->addCategories(0x00004000);
3063 } else if (category == "L") {
3064 yyCharClass->addCategories(0x000f8000);
3065 } else if (category == "Lu") {
3066 yyCharClass->addCategories(0x00008000);
3067 } else if (category == "Ll") {
3068 yyCharClass->addCategories(0x00010000);
3069 } else if (category == "Lt") {
3070 yyCharClass->addCategories(0x00020000);
3071 } else if (category == "Lm") {
3072 yyCharClass->addCategories(0x00040000);
3073 } else if (category == "Lo") {
3074 yyCharClass->addCategories(0x00080000);
3075 } else if (category == "P") {
3076 yyCharClass->addCategories(0x4f580780);
3077 } else if (category == "Pc") {
3078 yyCharClass->addCategories(0x00100000);
3079 } else if (category == "Pd") {
3080 yyCharClass->addCategories(0x00200000);
3081 } else if (category == "Ps") {
3082 yyCharClass->addCategories(0x00400000);
3083 } else if (category == "Pe") {
3084 yyCharClass->addCategories(0x00800000);
3085 } else if (category == "Pi") {
3086 yyCharClass->addCategories(0x01000000);
3087 } else if (category == "Pf") {
3088 yyCharClass->addCategories(0x02000000);
3089 } else if (category == "Po") {
3090 yyCharClass->addCategories(0x04000000);
3091 } else if (category == "S") {
3092 yyCharClass->addCategories(0x78000000);
3093 } else if (category == "Sm") {
3094 yyCharClass->addCategories(0x08000000);
3095 } else if (category == "Sc") {
3096 yyCharClass->addCategories(0x10000000);
3097 } else if (category == "Sk") {
3098 yyCharClass->addCategories(0x20000000);
3099 } else if (category == "So") {
3100 yyCharClass->addCategories(0x40000000);
3101 } else if (category.startsWith("Is")) {
3102 if (categoriesRangeMap.isEmpty())
3103 setupCategoriesRangeMap();
3104
3105 if (categoriesRangeMap.contains(category)) {
3106 const QPair<int, int> range = categoriesRangeMap.value(category);
3107 yyCharClass->addRange(range.first, range.second);
3108 } else {
3109 error(RXERR_CATEGORY);
3110 }
3111 } else {
3112 error(RXERR_CATEGORY);
3113 }
3114 }
3115 return Tok_CharClass;
3116 #endif
3117 #ifndef QT_NO_REGEXP_ESCAPE
3118 case 'x':
3119 val = 0;
3120 for (i = 0; i < 4; i++) {
3121 low = QChar(yyCh).toLower().unicode();
3122 if (low >= '0' && low <= '9')
3123 val = (val << 4) | (low - '0');
3124 else if (low >= 'a' && low <= 'f')
3125 val = (val << 4) | (low - 'a' + 10);
3126 else
3127 break;
3128 yyCh = getChar();
3129 }
3130 return Tok_Char | val;
3131 #endif
3132 default:
3133 if (prevCh >= '1' && prevCh <= '9') {
3134 #ifndef QT_NO_REGEXP_BACKREF
3135 val = prevCh - '0';
3136 while (yyCh >= '0' && yyCh <= '9') {
3137 val = (val * 10) + (yyCh - '0');
3138 yyCh = getChar();
3139 }
3140 return Tok_BackRef | val;
3141 #else
3142 error(RXERR_DISABLED);
3143 #endif
3144 }
3145 return Tok_Char | prevCh;
3146 }
3147 }
3148
3149 #ifndef QT_NO_REGEXP_INTERVAL
getRep(int def)3150 int QRegExpEngine::getRep(int def)
3151 {
3152 if (yyCh >= '0' && yyCh <= '9') {
3153 int rep = 0;
3154 do {
3155 rep = 10 * rep + yyCh - '0';
3156 if (rep >= InftyRep) {
3157 error(RXERR_REPETITION);
3158 rep = def;
3159 }
3160 yyCh = getChar();
3161 } while (yyCh >= '0' && yyCh <= '9');
3162 return rep;
3163 } else {
3164 return def;
3165 }
3166 }
3167 #endif
3168
3169 #ifndef QT_NO_REGEXP_LOOKAHEAD
skipChars(int n)3170 void QRegExpEngine::skipChars(int n)
3171 {
3172 if (n > 0) {
3173 yyPos += n - 1;
3174 yyCh = getChar();
3175 }
3176 }
3177 #endif
3178
error(const char * msg)3179 void QRegExpEngine::error(const char *msg)
3180 {
3181 if (yyError.isEmpty())
3182 yyError = QLatin1String(msg);
3183 }
3184
startTokenizer(const QChar * rx,int len)3185 void QRegExpEngine::startTokenizer(const QChar *rx, int len)
3186 {
3187 yyIn = rx;
3188 yyPos0 = 0;
3189 yyPos = 0;
3190 yyLen = len;
3191 yyCh = getChar();
3192 yyCharClass.reset(new QRegExpCharClass);
3193 yyMinRep = 0;
3194 yyMaxRep = 0;
3195 yyError = QString();
3196 }
3197
getToken()3198 int QRegExpEngine::getToken()
3199 {
3200 #ifndef QT_NO_REGEXP_CCLASS
3201 ushort pendingCh = 0;
3202 bool charPending;
3203 bool rangePending;
3204 int tok;
3205 #endif
3206 int prevCh = yyCh;
3207
3208 yyPos0 = yyPos - 1;
3209 #ifndef QT_NO_REGEXP_CCLASS
3210 yyCharClass->clear();
3211 #endif
3212 yyMinRep = 0;
3213 yyMaxRep = 0;
3214 yyCh = getChar();
3215
3216 switch (prevCh) {
3217 case EOS:
3218 yyPos0 = yyPos;
3219 return Tok_Eos;
3220 case '$':
3221 return Tok_Dollar;
3222 case '(':
3223 if (yyCh == '?') {
3224 prevCh = getChar();
3225 yyCh = getChar();
3226 switch (prevCh) {
3227 #ifndef QT_NO_REGEXP_LOOKAHEAD
3228 case '!':
3229 return Tok_NegLookahead;
3230 case '=':
3231 return Tok_PosLookahead;
3232 #endif
3233 case ':':
3234 return Tok_MagicLeftParen;
3235 case '<':
3236 error(RXERR_LOOKBEHIND);
3237 return Tok_MagicLeftParen;
3238 default:
3239 error(RXERR_LOOKAHEAD);
3240 return Tok_MagicLeftParen;
3241 }
3242 } else {
3243 return Tok_LeftParen;
3244 }
3245 case ')':
3246 return Tok_RightParen;
3247 case '*':
3248 yyMinRep = 0;
3249 yyMaxRep = InftyRep;
3250 return Tok_Quantifier;
3251 case '+':
3252 yyMinRep = 1;
3253 yyMaxRep = InftyRep;
3254 return Tok_Quantifier;
3255 case '.':
3256 #ifndef QT_NO_REGEXP_CCLASS
3257 yyCharClass->setNegative(true);
3258 #endif
3259 return Tok_CharClass;
3260 case '?':
3261 yyMinRep = 0;
3262 yyMaxRep = 1;
3263 return Tok_Quantifier;
3264 case '[':
3265 #ifndef QT_NO_REGEXP_CCLASS
3266 if (yyCh == '^') {
3267 yyCharClass->setNegative(true);
3268 yyCh = getChar();
3269 }
3270 charPending = false;
3271 rangePending = false;
3272 do {
3273 if (yyCh == '-' && charPending && !rangePending) {
3274 rangePending = true;
3275 yyCh = getChar();
3276 } else {
3277 if (charPending && !rangePending) {
3278 yyCharClass->addSingleton(pendingCh);
3279 charPending = false;
3280 }
3281 if (yyCh == '\\') {
3282 yyCh = getChar();
3283 tok = getEscape();
3284 if (tok == Tok_Word)
3285 tok = '\b';
3286 } else {
3287 tok = Tok_Char | yyCh;
3288 yyCh = getChar();
3289 }
3290 if (tok == Tok_CharClass) {
3291 if (rangePending) {
3292 yyCharClass->addSingleton('-');
3293 yyCharClass->addSingleton(pendingCh);
3294 charPending = false;
3295 rangePending = false;
3296 }
3297 } else if ((tok & Tok_Char) != 0) {
3298 if (rangePending) {
3299 yyCharClass->addRange(pendingCh, tok ^ Tok_Char);
3300 charPending = false;
3301 rangePending = false;
3302 } else {
3303 pendingCh = tok ^ Tok_Char;
3304 charPending = true;
3305 }
3306 } else {
3307 error(RXERR_CHARCLASS);
3308 }
3309 }
3310 } while (yyCh != ']' && yyCh != EOS);
3311 if (rangePending)
3312 yyCharClass->addSingleton('-');
3313 if (charPending)
3314 yyCharClass->addSingleton(pendingCh);
3315 if (yyCh == EOS)
3316 error(RXERR_END);
3317 else
3318 yyCh = getChar();
3319 return Tok_CharClass;
3320 #else
3321 error(RXERR_END);
3322 return Tok_Char | '[';
3323 #endif
3324 case '\\':
3325 return getEscape();
3326 case ']':
3327 error(RXERR_LEFTDELIM);
3328 return Tok_Char | ']';
3329 case '^':
3330 return Tok_Caret;
3331 case '{':
3332 #ifndef QT_NO_REGEXP_INTERVAL
3333 yyMinRep = getRep(0);
3334 yyMaxRep = yyMinRep;
3335 if (yyCh == ',') {
3336 yyCh = getChar();
3337 yyMaxRep = getRep(InftyRep);
3338 }
3339 if (yyMaxRep < yyMinRep)
3340 error(RXERR_INTERVAL);
3341 if (yyCh != '}')
3342 error(RXERR_REPETITION);
3343 yyCh = getChar();
3344 return Tok_Quantifier;
3345 #else
3346 error(RXERR_DISABLED);
3347 return Tok_Char | '{';
3348 #endif
3349 case '|':
3350 return Tok_Bar;
3351 case '}':
3352 error(RXERR_LEFTDELIM);
3353 return Tok_Char | '}';
3354 default:
3355 return Tok_Char | prevCh;
3356 }
3357 }
3358
parse(const QChar * pattern,int len)3359 int QRegExpEngine::parse(const QChar *pattern, int len)
3360 {
3361 valid = true;
3362 startTokenizer(pattern, len);
3363 yyTok = getToken();
3364 #ifndef QT_NO_REGEXP_CAPTURE
3365 yyMayCapture = true;
3366 #else
3367 yyMayCapture = false;
3368 #endif
3369
3370 #ifndef QT_NO_REGEXP_CAPTURE
3371 int atom = startAtom(false);
3372 #endif
3373 QRegExpCharClass anything;
3374 Box box(this); // create InitialState
3375 box.set(anything);
3376 Box rightBox(this); // create FinalState
3377 rightBox.set(anything);
3378
3379 Box middleBox(this);
3380 parseExpression(&middleBox);
3381 #ifndef QT_NO_REGEXP_CAPTURE
3382 finishAtom(atom, false);
3383 #endif
3384 #ifndef QT_NO_REGEXP_OPTIM
3385 middleBox.setupHeuristics();
3386 #endif
3387 box.cat(middleBox);
3388 box.cat(rightBox);
3389 yyCharClass.reset(0);
3390
3391 #ifndef QT_NO_REGEXP_CAPTURE
3392 for (int i = 0; i < nf; ++i) {
3393 switch (f[i].capture) {
3394 case QRegExpAtom::NoCapture:
3395 break;
3396 case QRegExpAtom::OfficialCapture:
3397 f[i].capture = ncap;
3398 captureForOfficialCapture.append(ncap);
3399 ++ncap;
3400 ++officialncap;
3401 break;
3402 case QRegExpAtom::UnofficialCapture:
3403 f[i].capture = greedyQuantifiers ? ncap++ : QRegExpAtom::NoCapture;
3404 }
3405 }
3406
3407 #ifndef QT_NO_REGEXP_BACKREF
3408 #ifndef QT_NO_REGEXP_OPTIM
3409 if (officialncap == 0 && nbrefs == 0) {
3410 ncap = nf = 0;
3411 f.clear();
3412 }
3413 #endif
3414 // handle the case where there's a \5 with no corresponding capture
3415 // (captureForOfficialCapture.size() != officialncap)
3416 for (int i = 0; i < nbrefs - officialncap; ++i) {
3417 captureForOfficialCapture.append(ncap);
3418 ++ncap;
3419 }
3420 #endif
3421 #endif
3422
3423 if (!yyError.isEmpty())
3424 return -1;
3425
3426 #ifndef QT_NO_REGEXP_OPTIM
3427 const QRegExpAutomatonState &sinit = s.at(InitialState);
3428 caretAnchored = !sinit.anchors.isEmpty();
3429 if (caretAnchored) {
3430 const QMap<int, int> &anchors = sinit.anchors;
3431 QMap<int, int>::const_iterator a;
3432 for (a = anchors.constBegin(); a != anchors.constEnd(); ++a) {
3433 if (
3434 #ifndef QT_NO_REGEXP_ANCHOR_ALT
3435 (*a & Anchor_Alternation) != 0 ||
3436 #endif
3437 (*a & Anchor_Caret) == 0)
3438 {
3439 caretAnchored = false;
3440 break;
3441 }
3442 }
3443 }
3444 #endif
3445
3446 // cleanup anchors
3447 int numStates = s.count();
3448 for (int i = 0; i < numStates; ++i) {
3449 QRegExpAutomatonState &state = s[i];
3450 if (!state.anchors.isEmpty()) {
3451 QMap<int, int>::iterator a = state.anchors.begin();
3452 while (a != state.anchors.end()) {
3453 if (a.value() == 0)
3454 a = state.anchors.erase(a);
3455 else
3456 ++a;
3457 }
3458 }
3459 }
3460
3461 return yyPos0;
3462 }
3463
parseAtom(Box * box)3464 void QRegExpEngine::parseAtom(Box *box)
3465 {
3466 #ifndef QT_NO_REGEXP_LOOKAHEAD
3467 QRegExpEngine *eng = 0;
3468 bool neg;
3469 int len;
3470 #endif
3471
3472 if ((yyTok & Tok_Char) != 0) {
3473 box->set(QChar(yyTok ^ Tok_Char));
3474 } else {
3475 #ifndef QT_NO_REGEXP_OPTIM
3476 trivial = false;
3477 #endif
3478 switch (yyTok) {
3479 case Tok_Dollar:
3480 box->catAnchor(Anchor_Dollar);
3481 break;
3482 case Tok_Caret:
3483 box->catAnchor(Anchor_Caret);
3484 break;
3485 #ifndef QT_NO_REGEXP_LOOKAHEAD
3486 case Tok_PosLookahead:
3487 case Tok_NegLookahead:
3488 neg = (yyTok == Tok_NegLookahead);
3489 eng = new QRegExpEngine(cs, greedyQuantifiers);
3490 len = eng->parse(yyIn + yyPos - 1, yyLen - yyPos + 1);
3491 if (len >= 0)
3492 skipChars(len);
3493 else
3494 error(RXERR_LOOKAHEAD);
3495 box->catAnchor(addLookahead(eng, neg));
3496 yyTok = getToken();
3497 if (yyTok != Tok_RightParen)
3498 error(RXERR_LOOKAHEAD);
3499 break;
3500 #endif
3501 #ifndef QT_NO_REGEXP_ESCAPE
3502 case Tok_Word:
3503 box->catAnchor(Anchor_Word);
3504 break;
3505 case Tok_NonWord:
3506 box->catAnchor(Anchor_NonWord);
3507 break;
3508 #endif
3509 case Tok_LeftParen:
3510 case Tok_MagicLeftParen:
3511 yyTok = getToken();
3512 parseExpression(box);
3513 if (yyTok != Tok_RightParen)
3514 error(RXERR_END);
3515 break;
3516 case Tok_CharClass:
3517 box->set(*yyCharClass);
3518 break;
3519 case Tok_Quantifier:
3520 error(RXERR_REPETITION);
3521 break;
3522 default:
3523 #ifndef QT_NO_REGEXP_BACKREF
3524 if ((yyTok & Tok_BackRef) != 0)
3525 box->set(yyTok ^ Tok_BackRef);
3526 else
3527 #endif
3528 error(RXERR_DISABLED);
3529 }
3530 }
3531 yyTok = getToken();
3532 }
3533
parseFactor(Box * box)3534 void QRegExpEngine::parseFactor(Box *box)
3535 {
3536 #ifndef QT_NO_REGEXP_CAPTURE
3537 int outerAtom = greedyQuantifiers ? startAtom(false) : -1;
3538 int innerAtom = startAtom(yyMayCapture && yyTok == Tok_LeftParen);
3539 bool magicLeftParen = (yyTok == Tok_MagicLeftParen);
3540 #else
3541 const int innerAtom = -1;
3542 #endif
3543
3544 #ifndef QT_NO_REGEXP_INTERVAL
3545 #define YYREDO() \
3546 yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \
3547 *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok
3548
3549 const QChar *in = yyIn;
3550 int pos0 = yyPos0;
3551 int pos = yyPos;
3552 int len = yyLen;
3553 int ch = yyCh;
3554 QRegExpCharClass charClass;
3555 if (yyTok == Tok_CharClass)
3556 charClass = *yyCharClass;
3557 int tok = yyTok;
3558 bool mayCapture = yyMayCapture;
3559 #endif
3560
3561 parseAtom(box);
3562 #ifndef QT_NO_REGEXP_CAPTURE
3563 finishAtom(innerAtom, magicLeftParen);
3564 #endif
3565
3566 bool hasQuantifier = (yyTok == Tok_Quantifier);
3567 if (hasQuantifier) {
3568 #ifndef QT_NO_REGEXP_OPTIM
3569 trivial = false;
3570 #endif
3571 if (yyMaxRep == InftyRep) {
3572 box->plus(innerAtom);
3573 #ifndef QT_NO_REGEXP_INTERVAL
3574 } else if (yyMaxRep == 0) {
3575 box->clear();
3576 #endif
3577 }
3578 if (yyMinRep == 0)
3579 box->opt();
3580
3581 #ifndef QT_NO_REGEXP_INTERVAL
3582 yyMayCapture = false;
3583 int alpha = (yyMinRep == 0) ? 0 : yyMinRep - 1;
3584 int beta = (yyMaxRep == InftyRep) ? 0 : yyMaxRep - (alpha + 1);
3585
3586 Box rightBox(this);
3587 int i;
3588
3589 for (i = 0; i < beta; i++) {
3590 YYREDO();
3591 Box leftBox(this);
3592 parseAtom(&leftBox);
3593 leftBox.cat(rightBox);
3594 leftBox.opt();
3595 rightBox = leftBox;
3596 }
3597 for (i = 0; i < alpha; i++) {
3598 YYREDO();
3599 Box leftBox(this);
3600 parseAtom(&leftBox);
3601 leftBox.cat(rightBox);
3602 rightBox = leftBox;
3603 }
3604 rightBox.cat(*box);
3605 *box = rightBox;
3606 #endif
3607 yyTok = getToken();
3608 #ifndef QT_NO_REGEXP_INTERVAL
3609 yyMayCapture = mayCapture;
3610 #endif
3611 }
3612 #undef YYREDO
3613 #ifndef QT_NO_REGEXP_CAPTURE
3614 if (greedyQuantifiers)
3615 finishAtom(outerAtom, hasQuantifier);
3616 #endif
3617 }
3618
parseTerm(Box * box)3619 void QRegExpEngine::parseTerm(Box *box)
3620 {
3621 #ifndef QT_NO_REGEXP_OPTIM
3622 if (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar)
3623 parseFactor(box);
3624 #endif
3625 while (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) {
3626 Box rightBox(this);
3627 parseFactor(&rightBox);
3628 box->cat(rightBox);
3629 }
3630 }
3631
parseExpression(Box * box)3632 void QRegExpEngine::parseExpression(Box *box)
3633 {
3634 parseTerm(box);
3635 while (yyTok == Tok_Bar) {
3636 #ifndef QT_NO_REGEXP_OPTIM
3637 trivial = false;
3638 #endif
3639 Box rightBox(this);
3640 yyTok = getToken();
3641 parseTerm(&rightBox);
3642 box->orx(rightBox);
3643 }
3644 }
3645
3646 /*
3647 The struct QRegExpPrivate contains the private data of a regular
3648 expression other than the automaton. It makes it possible for many
3649 QRegExp objects to use the same QRegExpEngine object with different
3650 QRegExpPrivate objects.
3651 */
3652 struct QRegExpPrivate
3653 {
3654 QRegExpEngine *eng;
3655 QRegExpEngineKey engineKey;
3656 bool minimal;
3657 #ifndef QT_NO_REGEXP_CAPTURE
3658 QString t; // last string passed to QRegExp::indexIn() or lastIndexIn()
3659 QStringList capturedCache; // what QRegExp::capturedTexts() returned last
3660 #endif
3661 QRegExpMatchState matchState;
3662
QRegExpPrivateQRegExpPrivate3663 inline QRegExpPrivate()
3664 : eng(0), engineKey(QString(), QRegExp::RegExp, Qt::CaseSensitive), minimal(false) { }
QRegExpPrivateQRegExpPrivate3665 inline QRegExpPrivate(const QRegExpEngineKey &key)
3666 : eng(0), engineKey(key), minimal(false) {}
3667 };
3668
3669 #if !defined(QT_NO_REGEXP_OPTIM)
qHash(const QRegExpEngineKey & key)3670 uint qHash(const QRegExpEngineKey &key)
3671 {
3672 return qHash(key.pattern);
3673 }
3674
3675 typedef QCache<QRegExpEngineKey, QRegExpEngine> EngineCache;
Q_GLOBAL_STATIC(EngineCache,globalEngineCache)3676 Q_GLOBAL_STATIC(EngineCache, globalEngineCache)
3677 Q_GLOBAL_STATIC(QMutex, mutex)
3678 #endif // QT_NO_REGEXP_OPTIM
3679
3680 static void derefEngine(QRegExpEngine *eng, const QRegExpEngineKey &key)
3681 {
3682 if (!eng->ref.deref()) {
3683 #if !defined(QT_NO_REGEXP_OPTIM)
3684 if (globalEngineCache()) {
3685 QMutexLocker locker(mutex());
3686 QT_TRY {
3687 globalEngineCache()->insert(key, eng, 4 + key.pattern.length() / 4);
3688 } QT_CATCH(const std::bad_alloc &) {
3689 // in case of an exception (e.g. oom), just delete the engine
3690 delete eng;
3691 }
3692 } else {
3693 delete eng;
3694 }
3695 #else
3696 Q_UNUSED(key);
3697 delete eng;
3698 #endif
3699 }
3700 }
3701
prepareEngine_helper(QRegExpPrivate * priv)3702 static void prepareEngine_helper(QRegExpPrivate *priv)
3703 {
3704 bool initMatchState = !priv->eng;
3705 #if !defined(QT_NO_REGEXP_OPTIM)
3706 if (!priv->eng && globalEngineCache()) {
3707 QMutexLocker locker(mutex());
3708 priv->eng = globalEngineCache()->take(priv->engineKey);
3709 if (priv->eng != 0)
3710 priv->eng->ref.ref();
3711 }
3712 #endif // QT_NO_REGEXP_OPTIM
3713
3714 if (!priv->eng)
3715 priv->eng = new QRegExpEngine(priv->engineKey);
3716
3717 if (initMatchState)
3718 priv->matchState.prepareForMatch(priv->eng);
3719 }
3720
prepareEngine(QRegExpPrivate * priv)3721 inline static void prepareEngine(QRegExpPrivate *priv)
3722 {
3723 if (priv->eng)
3724 return;
3725 prepareEngine_helper(priv);
3726 }
3727
prepareEngineForMatch(QRegExpPrivate * priv,const QString & str)3728 static void prepareEngineForMatch(QRegExpPrivate *priv, const QString &str)
3729 {
3730 prepareEngine(priv);
3731 priv->matchState.prepareForMatch(priv->eng);
3732 #ifndef QT_NO_REGEXP_CAPTURE
3733 priv->t = str;
3734 priv->capturedCache.clear();
3735 #else
3736 Q_UNUSED(str);
3737 #endif
3738 }
3739
invalidateEngine(QRegExpPrivate * priv)3740 static void invalidateEngine(QRegExpPrivate *priv)
3741 {
3742 if (priv->eng != 0) {
3743 derefEngine(priv->eng, priv->engineKey);
3744 priv->eng = 0;
3745 priv->matchState.drain();
3746 }
3747 }
3748
3749 /*!
3750 \enum QRegExp::CaretMode
3751
3752 The CaretMode enum defines the different meanings of the caret
3753 (\bold{^}) in a regular expression. The possible values are:
3754
3755 \value CaretAtZero
3756 The caret corresponds to index 0 in the searched string.
3757
3758 \value CaretAtOffset
3759 The caret corresponds to the start offset of the search.
3760
3761 \value CaretWontMatch
3762 The caret never matches.
3763 */
3764
3765 /*!
3766 \enum QRegExp::PatternSyntax
3767
3768 The syntax used to interpret the meaning of the pattern.
3769
3770 \value RegExp A rich Perl-like pattern matching syntax. This is
3771 the default.
3772
3773 \value RegExp2 Like RegExp, but with \l{greedy quantifiers}. This
3774 will be the default in Qt 5. (Introduced in Qt 4.2.)
3775
3776 \value Wildcard This provides a simple pattern matching syntax
3777 similar to that used by shells (command interpreters) for "file
3778 globbing". See \l{Wildcard Matching}.
3779
3780 \value WildcardUnix This is similar to Wildcard but with the
3781 behavior of a Unix shell. The wildcard characters can be escaped
3782 with the character "\\".
3783
3784 \value FixedString The pattern is a fixed string. This is
3785 equivalent to using the RegExp pattern on a string in
3786 which all metacharacters are escaped using escape().
3787
3788 \value W3CXmlSchema11 The pattern is a regular expression as
3789 defined by the W3C XML Schema 1.1 specification.
3790
3791 \sa setPatternSyntax()
3792 */
3793
3794 /*!
3795 Constructs an empty regexp.
3796
3797 \sa isValid(), errorString()
3798 */
QRegExp()3799 QRegExp::QRegExp()
3800 {
3801 priv = new QRegExpPrivate;
3802 prepareEngine(priv);
3803 }
3804
3805 /*!
3806 Constructs a regular expression object for the given \a pattern
3807 string. The pattern must be given using wildcard notation if \a
3808 syntax is \l Wildcard; the default is \l RegExp. The pattern is
3809 case sensitive, unless \a cs is Qt::CaseInsensitive. Matching is
3810 greedy (maximal), but can be changed by calling
3811 setMinimal().
3812
3813 \sa setPattern(), setCaseSensitivity(), setPatternSyntax()
3814 */
QRegExp(const QString & pattern,Qt::CaseSensitivity cs,PatternSyntax syntax)3815 QRegExp::QRegExp(const QString &pattern, Qt::CaseSensitivity cs, PatternSyntax syntax)
3816 {
3817 priv = new QRegExpPrivate(QRegExpEngineKey(pattern, syntax, cs));
3818 prepareEngine(priv);
3819 }
3820
3821 /*!
3822 Constructs a regular expression as a copy of \a rx.
3823
3824 \sa operator=()
3825 */
QRegExp(const QRegExp & rx)3826 QRegExp::QRegExp(const QRegExp &rx)
3827 {
3828 priv = new QRegExpPrivate;
3829 operator=(rx);
3830 }
3831
3832 /*!
3833 Destroys the regular expression and cleans up its internal data.
3834 */
~QRegExp()3835 QRegExp::~QRegExp()
3836 {
3837 invalidateEngine(priv);
3838 delete priv;
3839 }
3840
3841 /*!
3842 Copies the regular expression \a rx and returns a reference to the
3843 copy. The case sensitivity, wildcard, and minimal matching options
3844 are also copied.
3845 */
operator =(const QRegExp & rx)3846 QRegExp &QRegExp::operator=(const QRegExp &rx)
3847 {
3848 prepareEngine(rx.priv); // to allow sharing
3849 QRegExpEngine *otherEng = rx.priv->eng;
3850 if (otherEng)
3851 otherEng->ref.ref();
3852 invalidateEngine(priv);
3853 priv->eng = otherEng;
3854 priv->engineKey = rx.priv->engineKey;
3855 priv->minimal = rx.priv->minimal;
3856 #ifndef QT_NO_REGEXP_CAPTURE
3857 priv->t = rx.priv->t;
3858 priv->capturedCache = rx.priv->capturedCache;
3859 #endif
3860 if (priv->eng)
3861 priv->matchState.prepareForMatch(priv->eng);
3862 priv->matchState.captured = rx.priv->matchState.captured;
3863 return *this;
3864 }
3865
3866 /*!
3867 \fn void QRegExp::swap(QRegExp &other)
3868 \since 4.8
3869
3870 Swaps regular expression \a other with this regular
3871 expression. This operation is very fast and never fails.
3872 */
3873
3874 /*!
3875 Returns true if this regular expression is equal to \a rx;
3876 otherwise returns false.
3877
3878 Two QRegExp objects are equal if they have the same pattern
3879 strings and the same settings for case sensitivity, wildcard and
3880 minimal matching.
3881 */
operator ==(const QRegExp & rx) const3882 bool QRegExp::operator==(const QRegExp &rx) const
3883 {
3884 return priv->engineKey == rx.priv->engineKey && priv->minimal == rx.priv->minimal;
3885 }
3886
3887 /*!
3888 \fn bool QRegExp::operator!=(const QRegExp &rx) const
3889
3890 Returns true if this regular expression is not equal to \a rx;
3891 otherwise returns false.
3892
3893 \sa operator==()
3894 */
3895
3896 /*!
3897 Returns true if the pattern string is empty; otherwise returns
3898 false.
3899
3900 If you call exactMatch() with an empty pattern on an empty string
3901 it will return true; otherwise it returns false since it operates
3902 over the whole string. If you call indexIn() with an empty pattern
3903 on \e any string it will return the start offset (0 by default)
3904 because the empty pattern matches the 'emptiness' at the start of
3905 the string. In this case the length of the match returned by
3906 matchedLength() will be 0.
3907
3908 See QString::isEmpty().
3909 */
3910
isEmpty() const3911 bool QRegExp::isEmpty() const
3912 {
3913 return priv->engineKey.pattern.isEmpty();
3914 }
3915
3916 /*!
3917 Returns true if the regular expression is valid; otherwise returns
3918 false. An invalid regular expression never matches.
3919
3920 The pattern \bold{[a-z} is an example of an invalid pattern, since
3921 it lacks a closing square bracket.
3922
3923 Note that the validity of a regexp may also depend on the setting
3924 of the wildcard flag, for example \bold{*.html} is a valid
3925 wildcard regexp but an invalid full regexp.
3926
3927 \sa errorString()
3928 */
isValid() const3929 bool QRegExp::isValid() const
3930 {
3931 if (priv->engineKey.pattern.isEmpty()) {
3932 return true;
3933 } else {
3934 prepareEngine(priv);
3935 return priv->eng->isValid();
3936 }
3937 }
3938
3939 /*!
3940 Returns the pattern string of the regular expression. The pattern
3941 has either regular expression syntax or wildcard syntax, depending
3942 on patternSyntax().
3943
3944 \sa patternSyntax(), caseSensitivity()
3945 */
pattern() const3946 QString QRegExp::pattern() const
3947 {
3948 return priv->engineKey.pattern;
3949 }
3950
3951 /*!
3952 Sets the pattern string to \a pattern. The case sensitivity,
3953 wildcard, and minimal matching options are not changed.
3954
3955 \sa setPatternSyntax(), setCaseSensitivity()
3956 */
setPattern(const QString & pattern)3957 void QRegExp::setPattern(const QString &pattern)
3958 {
3959 if (priv->engineKey.pattern != pattern) {
3960 invalidateEngine(priv);
3961 priv->engineKey.pattern = pattern;
3962 }
3963 }
3964
3965 /*!
3966 Returns Qt::CaseSensitive if the regexp is matched case
3967 sensitively; otherwise returns Qt::CaseInsensitive.
3968
3969 \sa patternSyntax(), pattern(), isMinimal()
3970 */
caseSensitivity() const3971 Qt::CaseSensitivity QRegExp::caseSensitivity() const
3972 {
3973 return priv->engineKey.cs;
3974 }
3975
3976 /*!
3977 Sets case sensitive matching to \a cs.
3978
3979 If \a cs is Qt::CaseSensitive, \bold{\\.txt$} matches
3980 \c{readme.txt} but not \c{README.TXT}.
3981
3982 \sa setPatternSyntax(), setPattern(), setMinimal()
3983 */
setCaseSensitivity(Qt::CaseSensitivity cs)3984 void QRegExp::setCaseSensitivity(Qt::CaseSensitivity cs)
3985 {
3986 if ((bool)cs != (bool)priv->engineKey.cs) {
3987 invalidateEngine(priv);
3988 priv->engineKey.cs = cs;
3989 }
3990 }
3991
3992 /*!
3993 Returns the syntax used by the regular expression. The default is
3994 QRegExp::RegExp.
3995
3996 \sa pattern(), caseSensitivity()
3997 */
patternSyntax() const3998 QRegExp::PatternSyntax QRegExp::patternSyntax() const
3999 {
4000 return priv->engineKey.patternSyntax;
4001 }
4002
4003 /*!
4004 Sets the syntax mode for the regular expression. The default is
4005 QRegExp::RegExp.
4006
4007 Setting \a syntax to QRegExp::Wildcard enables simple shell-like
4008 \l{wildcard matching}. For example, \bold{r*.txt} matches the
4009 string \c{readme.txt} in wildcard mode, but does not match
4010 \c{readme}.
4011
4012 Setting \a syntax to QRegExp::FixedString means that the pattern
4013 is interpreted as a plain string. Special characters (e.g.,
4014 backslash) don't need to be escaped then.
4015
4016 \sa setPattern(), setCaseSensitivity(), escape()
4017 */
setPatternSyntax(PatternSyntax syntax)4018 void QRegExp::setPatternSyntax(PatternSyntax syntax)
4019 {
4020 if (syntax != priv->engineKey.patternSyntax) {
4021 invalidateEngine(priv);
4022 priv->engineKey.patternSyntax = syntax;
4023 }
4024 }
4025
4026 /*!
4027 Returns true if minimal (non-greedy) matching is enabled;
4028 otherwise returns false.
4029
4030 \sa caseSensitivity(), setMinimal()
4031 */
isMinimal() const4032 bool QRegExp::isMinimal() const
4033 {
4034 return priv->minimal;
4035 }
4036
4037 /*!
4038 Enables or disables minimal matching. If \a minimal is false,
4039 matching is greedy (maximal) which is the default.
4040
4041 For example, suppose we have the input string "We must be
4042 <b>bold</b>, very <b>bold</b>!" and the pattern
4043 \bold{<b>.*</b>}. With the default greedy (maximal) matching,
4044 the match is "We must be \underline{<b>bold</b>, very
4045 <b>bold</b>}!". But with minimal (non-greedy) matching, the
4046 first match is: "We must be \underline{<b>bold</b>}, very
4047 <b>bold</b>!" and the second match is "We must be <b>bold</b>,
4048 very \underline{<b>bold</b>}!". In practice we might use the pattern
4049 \bold{<b>[^<]*\</b>} instead, although this will still fail for
4050 nested tags.
4051
4052 \sa setCaseSensitivity()
4053 */
setMinimal(bool minimal)4054 void QRegExp::setMinimal(bool minimal)
4055 {
4056 priv->minimal = minimal;
4057 }
4058
4059 // ### Qt 5: make non-const
4060 /*!
4061 Returns true if \a str is matched exactly by this regular
4062 expression; otherwise returns false. You can determine how much of
4063 the string was matched by calling matchedLength().
4064
4065 For a given regexp string R, exactMatch("R") is the equivalent of
4066 indexIn("^R$") since exactMatch() effectively encloses the regexp
4067 in the start of string and end of string anchors, except that it
4068 sets matchedLength() differently.
4069
4070 For example, if the regular expression is \bold{blue}, then
4071 exactMatch() returns true only for input \c blue. For inputs \c
4072 bluebell, \c blutak and \c lightblue, exactMatch() returns false
4073 and matchedLength() will return 4, 3 and 0 respectively.
4074
4075 Although const, this function sets matchedLength(),
4076 capturedTexts(), and pos().
4077
4078 \sa indexIn(), lastIndexIn()
4079 */
exactMatch(const QString & str) const4080 bool QRegExp::exactMatch(const QString &str) const
4081 {
4082 prepareEngineForMatch(priv, str);
4083 priv->matchState.match(str.unicode(), str.length(), 0, priv->minimal, true, 0);
4084 if (priv->matchState.captured[1] == str.length()) {
4085 return true;
4086 } else {
4087 priv->matchState.captured[0] = 0;
4088 priv->matchState.captured[1] = priv->matchState.oneTestMatchedLen;
4089 return false;
4090 }
4091 }
4092
4093 // ### Qt 5: make non-const
4094 /*!
4095 Attempts to find a match in \a str from position \a offset (0 by
4096 default). If \a offset is -1, the search starts at the last
4097 character; if -2, at the next to last character; etc.
4098
4099 Returns the position of the first match, or -1 if there was no
4100 match.
4101
4102 The \a caretMode parameter can be used to instruct whether \bold{^}
4103 should match at index 0 or at \a offset.
4104
4105 You might prefer to use QString::indexOf(), QString::contains(),
4106 or even QStringList::filter(). To replace matches use
4107 QString::replace().
4108
4109 Example:
4110 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 13
4111
4112 Although const, this function sets matchedLength(),
4113 capturedTexts() and pos().
4114
4115 If the QRegExp is a wildcard expression (see setPatternSyntax())
4116 and want to test a string against the whole wildcard expression,
4117 use exactMatch() instead of this function.
4118
4119 \sa lastIndexIn(), exactMatch()
4120 */
4121
indexIn(const QString & str,int offset,CaretMode caretMode) const4122 int QRegExp::indexIn(const QString &str, int offset, CaretMode caretMode) const
4123 {
4124 prepareEngineForMatch(priv, str);
4125 if (offset < 0)
4126 offset += str.length();
4127 priv->matchState.match(str.unicode(), str.length(), offset,
4128 priv->minimal, false, caretIndex(offset, caretMode));
4129 return priv->matchState.captured[0];
4130 }
4131
4132 // ### Qt 5: make non-const
4133 /*!
4134 Attempts to find a match backwards in \a str from position \a
4135 offset. If \a offset is -1 (the default), the search starts at the
4136 last character; if -2, at the next to last character; etc.
4137
4138 Returns the position of the first match, or -1 if there was no
4139 match.
4140
4141 The \a caretMode parameter can be used to instruct whether \bold{^}
4142 should match at index 0 or at \a offset.
4143
4144 Although const, this function sets matchedLength(),
4145 capturedTexts() and pos().
4146
4147 \warning Searching backwards is much slower than searching
4148 forwards.
4149
4150 \sa indexIn(), exactMatch()
4151 */
4152
lastIndexIn(const QString & str,int offset,CaretMode caretMode) const4153 int QRegExp::lastIndexIn(const QString &str, int offset, CaretMode caretMode) const
4154 {
4155 prepareEngineForMatch(priv, str);
4156 if (offset < 0)
4157 offset += str.length();
4158 if (offset < 0 || offset > str.length()) {
4159 memset(priv->matchState.captured, -1, priv->matchState.capturedSize*sizeof(int));
4160 return -1;
4161 }
4162
4163 while (offset >= 0) {
4164 priv->matchState.match(str.unicode(), str.length(), offset,
4165 priv->minimal, true, caretIndex(offset, caretMode));
4166 if (priv->matchState.captured[0] == offset)
4167 return offset;
4168 --offset;
4169 }
4170 return -1;
4171 }
4172
4173 /*!
4174 Returns the length of the last matched string, or -1 if there was
4175 no match.
4176
4177 \sa exactMatch(), indexIn(), lastIndexIn()
4178 */
matchedLength() const4179 int QRegExp::matchedLength() const
4180 {
4181 return priv->matchState.captured[1];
4182 }
4183
4184 #ifndef QT_NO_REGEXP_CAPTURE
4185
4186 #ifndef QT_NO_DEPRECATED
4187 /*!
4188 \obsolete
4189 Returns the number of captures contained in the regular expression.
4190
4191 \sa captureCount()
4192 */
numCaptures() const4193 int QRegExp::numCaptures() const
4194 {
4195 return captureCount();
4196 }
4197 #endif
4198
4199 /*!
4200 \since 4.6
4201 Returns the number of captures contained in the regular expression.
4202 */
captureCount() const4203 int QRegExp::captureCount() const
4204 {
4205 prepareEngine(priv);
4206 return priv->eng->captureCount();
4207 }
4208
4209 /*!
4210 Returns a list of the captured text strings.
4211
4212 The first string in the list is the entire matched string. Each
4213 subsequent list element contains a string that matched a
4214 (capturing) subexpression of the regexp.
4215
4216 For example:
4217 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 14
4218
4219 The above example also captures elements that may be present but
4220 which we have no interest in. This problem can be solved by using
4221 non-capturing parentheses:
4222
4223 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 15
4224
4225 Note that if you want to iterate over the list, you should iterate
4226 over a copy, e.g.
4227 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 16
4228
4229 Some regexps can match an indeterminate number of times. For
4230 example if the input string is "Offsets: 12 14 99 231 7" and the
4231 regexp, \c{rx}, is \bold{(\\d+)+}, we would hope to get a list of
4232 all the numbers matched. However, after calling
4233 \c{rx.indexIn(str)}, capturedTexts() will return the list ("12",
4234 "12"), i.e. the entire match was "12" and the first subexpression
4235 matched was "12". The correct approach is to use cap() in a
4236 \l{QRegExp#cap_in_a_loop}{loop}.
4237
4238 The order of elements in the string list is as follows. The first
4239 element is the entire matching string. Each subsequent element
4240 corresponds to the next capturing open left parentheses. Thus
4241 capturedTexts()[1] is the text of the first capturing parentheses,
4242 capturedTexts()[2] is the text of the second and so on
4243 (corresponding to $1, $2, etc., in some other regexp languages).
4244
4245 \sa cap(), pos()
4246 */
capturedTexts() const4247 QStringList QRegExp::capturedTexts() const
4248 {
4249 if (priv->capturedCache.isEmpty()) {
4250 prepareEngine(priv);
4251 const int *captured = priv->matchState.captured;
4252 int n = priv->matchState.capturedSize;
4253
4254 for (int i = 0; i < n; i += 2) {
4255 QString m;
4256 if (captured[i + 1] == 0)
4257 m = QLatin1String(""); // ### Qt 5: don't distinguish between null and empty
4258 else if (captured[i] >= 0)
4259 m = priv->t.mid(captured[i], captured[i + 1]);
4260 priv->capturedCache.append(m);
4261 }
4262 priv->t.clear();
4263 }
4264 return priv->capturedCache;
4265 }
4266
4267 /*!
4268 \internal
4269 */
capturedTexts()4270 QStringList QRegExp::capturedTexts()
4271 {
4272 return const_cast<const QRegExp *>(this)->capturedTexts();
4273 }
4274
4275 /*!
4276 Returns the text captured by the \a nth subexpression. The entire
4277 match has index 0 and the parenthesized subexpressions have
4278 indexes starting from 1 (excluding non-capturing parentheses).
4279
4280 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 17
4281
4282 The order of elements matched by cap() is as follows. The first
4283 element, cap(0), is the entire matching string. Each subsequent
4284 element corresponds to the next capturing open left parentheses.
4285 Thus cap(1) is the text of the first capturing parentheses, cap(2)
4286 is the text of the second, and so on.
4287
4288 \sa capturedTexts(), pos()
4289 */
cap(int nth) const4290 QString QRegExp::cap(int nth) const
4291 {
4292 return capturedTexts().value(nth);
4293 }
4294
4295 /*!
4296 \internal
4297 */
cap(int nth)4298 QString QRegExp::cap(int nth)
4299 {
4300 return const_cast<const QRegExp *>(this)->cap(nth);
4301 }
4302
4303 /*!
4304 Returns the position of the \a nth captured text in the searched
4305 string. If \a nth is 0 (the default), pos() returns the position
4306 of the whole match.
4307
4308 Example:
4309 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 18
4310
4311 For zero-length matches, pos() always returns -1. (For example, if
4312 cap(4) would return an empty string, pos(4) returns -1.) This is
4313 a feature of the implementation.
4314
4315 \sa cap(), capturedTexts()
4316 */
pos(int nth) const4317 int QRegExp::pos(int nth) const
4318 {
4319 if (nth < 0 || nth >= priv->matchState.capturedSize / 2)
4320 return -1;
4321 else
4322 return priv->matchState.captured[2 * nth];
4323 }
4324
4325 /*!
4326 \internal
4327 */
pos(int nth)4328 int QRegExp::pos(int nth)
4329 {
4330 return const_cast<const QRegExp *>(this)->pos(nth);
4331 }
4332
4333 /*!
4334 Returns a text string that explains why a regexp pattern is
4335 invalid the case being; otherwise returns "no error occurred".
4336
4337 \sa isValid()
4338 */
errorString() const4339 QString QRegExp::errorString() const
4340 {
4341 if (isValid()) {
4342 return QString::fromLatin1(RXERR_OK);
4343 } else {
4344 return priv->eng->errorString();
4345 }
4346 }
4347
4348 /*!
4349 \internal
4350 */
errorString()4351 QString QRegExp::errorString()
4352 {
4353 return const_cast<const QRegExp *>(this)->errorString();
4354 }
4355 #endif
4356
4357 /*!
4358 Returns the string \a str with every regexp special character
4359 escaped with a backslash. The special characters are $, (,), *, +,
4360 ., ?, [, \,], ^, {, | and }.
4361
4362 Example:
4363
4364 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 19
4365
4366 This function is useful to construct regexp patterns dynamically:
4367
4368 \snippet doc/src/snippets/code/src_corelib_tools_qregexp.cpp 20
4369
4370 \sa setPatternSyntax()
4371 */
escape(const QString & str)4372 QString QRegExp::escape(const QString &str)
4373 {
4374 QString quoted;
4375 const int count = str.count();
4376 quoted.reserve(count * 2);
4377 const QLatin1Char backslash('\\');
4378 for (int i = 0; i < count; i++) {
4379 switch (str.at(i).toLatin1()) {
4380 case '$':
4381 case '(':
4382 case ')':
4383 case '*':
4384 case '+':
4385 case '.':
4386 case '?':
4387 case '[':
4388 case '\\':
4389 case ']':
4390 case '^':
4391 case '{':
4392 case '|':
4393 case '}':
4394 quoted.append(backslash);
4395 }
4396 quoted.append(str.at(i));
4397 }
4398 return quoted;
4399 }
4400
4401 /*!
4402 \fn bool QRegExp::caseSensitive() const
4403
4404 Use \l caseSensitivity() instead.
4405 */
4406
4407 /*!
4408 \fn void QRegExp::setCaseSensitive(bool sensitive)
4409
4410 Use \l setCaseSensitivity() instead.
4411 */
4412
4413 /*!
4414 \fn bool QRegExp::wildcard() const
4415
4416 Use \l patternSyntax() instead.
4417
4418 \oldcode
4419 bool wc = rx.wildcard();
4420 \newcode
4421 bool wc = (rx.patternSyntax() == QRegExp::Wildcard);
4422 \endcode
4423 */
4424
4425 /*!
4426 \fn void QRegExp::setWildcard(bool wildcard)
4427
4428 Use \l setPatternSyntax() instead.
4429
4430 \oldcode
4431 rx.setWildcard(wc);
4432 \newcode
4433 rx.setPatternSyntax(wc ? QRegExp::Wildcard : QRegExp::RegExp);
4434 \endcode
4435 */
4436
4437 /*!
4438 \fn bool QRegExp::minimal() const
4439
4440 Use \l isMinimal() instead.
4441 */
4442
4443 /*!
4444 \fn int QRegExp::search(const QString &str, int from = 0,
4445 CaretMode caretMode = CaretAtZero) const
4446
4447 Use \l indexIn() instead.
4448 */
4449
4450 /*!
4451 \fn int QRegExp::searchRev(const QString &str, int from = -1, \
4452 CaretMode caretMode = CaretAtZero) const
4453
4454 Use \l lastIndexIn() instead.
4455 */
4456
4457 /*!
4458 \fn QRegExp::QRegExp(const QString &pattern, bool cs, bool wildcard = false)
4459
4460 Use another constructor instead.
4461
4462 \oldcode
4463 QRegExp rx("*.txt", false, true);
4464 \newcode
4465 QRegExp rx("*.txt", Qt::CaseInsensitive, QRegExp::Wildcard);
4466 \endcode
4467 */
4468
4469 #ifndef QT_NO_DATASTREAM
4470 /*!
4471 \relates QRegExp
4472
4473 Writes the regular expression \a regExp to stream \a out.
4474
4475 \sa {Serializing Qt Data Types}
4476 */
operator <<(QDataStream & out,const QRegExp & regExp)4477 QDataStream &operator<<(QDataStream &out, const QRegExp ®Exp)
4478 {
4479 return out << regExp.pattern() << (quint8)regExp.caseSensitivity()
4480 << (quint8)regExp.patternSyntax()
4481 << (quint8)!!regExp.isMinimal();
4482 }
4483
4484 /*!
4485 \relates QRegExp
4486
4487 Reads a regular expression from stream \a in into \a regExp.
4488
4489 \sa {Serializing Qt Data Types}
4490 */
operator >>(QDataStream & in,QRegExp & regExp)4491 QDataStream &operator>>(QDataStream &in, QRegExp ®Exp)
4492 {
4493 QString pattern;
4494 quint8 cs;
4495 quint8 patternSyntax;
4496 quint8 isMinimal;
4497
4498 in >> pattern >> cs >> patternSyntax >> isMinimal;
4499
4500 QRegExp newRegExp(pattern, Qt::CaseSensitivity(cs),
4501 QRegExp::PatternSyntax(patternSyntax));
4502
4503 newRegExp.setMinimal(isMinimal);
4504 regExp = newRegExp;
4505 return in;
4506 }
4507 #endif // QT_NO_DATASTREAM
4508
4509 QT_END_NAMESPACE
4510