1 /****************************************************************************
2 ** $Id: qt/qregexp.cpp 3.3.8 edited Jan 11 14:38 $
3 **
4 ** Implementation of QRegExp class
5 **
6 ** Created : 950126
7 **
8 ** Copyright (C) 1992-2007 Trolltech ASA. All rights reserved.
9 **
10 ** This file is part of the tools module of the Qt GUI Toolkit.
11 **
12 ** This file may be distributed under the terms of the Q Public License
13 ** as defined by Trolltech ASA of Norway and appearing in the file
14 ** LICENSE.QPL included in the packaging of this file.
15 **
16 ** This file may be distributed and/or modified under the terms of the
17 ** GNU General Public License version 2 as published by the Free Software
18 ** Foundation and appearing in the file LICENSE.GPL included in the
19 ** packaging of this file.
20 **
21 ** Licensees holding valid Qt Enterprise Edition or Qt Professional Edition
22 ** licenses may use this file in accordance with the Qt Commercial License
23 ** Agreement provided with the Software.
24 **
25 ** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
26 ** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
27 **
28 ** See http://www.trolltech.com/pricing.html or email sales@trolltech.com for
29 ** information about Qt Commercial License Agreements.
30 ** See http://www.trolltech.com/qpl/ for QPL licensing information.
31 ** See http://www.trolltech.com/gpl/ for GPL licensing information.
32 **
33 ** Contact info@trolltech.com if any conditions of this licensing are
34 ** not clear to you.
35 **
36 **********************************************************************/
37
38 #include "qregexp.h"
39
40 #ifndef QT_NO_REGEXP
41
42 #include "qmemarray.h"
43 #include "qbitarray.h"
44 #include "qcache.h"
45 #include "qcleanuphandler.h"
46 #include "qintdict.h"
47 #include "qmap.h"
48 #include "qptrvector.h"
49 #include "qstring.h"
50 #include "qtl.h"
51
52 #ifdef QT_THREAD_SUPPORT
53 #include "qthreadstorage.h"
54 #include <private/qthreadinstance_p.h>
55 #endif // QT_THREAD_SUPPORT
56
57 #undef QT_TRANSLATE_NOOP
58 #define QT_TRANSLATE_NOOP( context, sourceText ) sourceText
59
60 #include <limits.h>
61
62 // error strings for the regexp parser
63 #define RXERR_OK QT_TRANSLATE_NOOP( "QRegExp", "no error occurred" )
64 #define RXERR_DISABLED QT_TRANSLATE_NOOP( "QRegExp", "disabled feature used" )
65 #define RXERR_CHARCLASS QT_TRANSLATE_NOOP( "QRegExp", "bad char class syntax" )
66 #define RXERR_LOOKAHEAD QT_TRANSLATE_NOOP( "QRegExp", "bad lookahead syntax" )
67 #define RXERR_REPETITION QT_TRANSLATE_NOOP( "QRegExp", "bad repetition syntax" )
68 #define RXERR_OCTAL QT_TRANSLATE_NOOP( "QRegExp", "invalid octal value" )
69 #define RXERR_LEFTDELIM QT_TRANSLATE_NOOP( "QRegExp", "missing left delim" )
70 #define RXERR_END QT_TRANSLATE_NOOP( "QRegExp", "unexpected end" )
71 #define RXERR_LIMIT QT_TRANSLATE_NOOP( "QRegExp", "met internal limit" )
72
73 /*
74 WARNING! Be sure to read qregexp.tex before modifying this file.
75 */
76
77 /*!
78 \class QRegExp qregexp.h
79 \reentrant
80 \brief The QRegExp class provides pattern matching using regular expressions.
81
82 \ingroup tools
83 \ingroup misc
84 \ingroup shared
85 \mainclass
86 \keyword regular expression
87
88 Regular expressions, or "regexps", provide a way to find patterns
89 within text. This is useful in many contexts, for example:
90
91 \table
92 \row \i Validation
93 \i A regexp can be used to check whether a piece of text
94 meets some criteria, e.g. is an integer or contains no
95 whitespace.
96 \row \i Searching
97 \i Regexps provide a much more powerful means of searching
98 text than simple string matching does. For example we can
99 create a regexp which says "find one of the words 'mail',
100 'letter' or 'correspondence' but not any of the words
101 'email', 'mailman' 'mailer', 'letterbox' etc."
102 \row \i Search and Replace
103 \i A regexp can be used to replace a pattern with a piece of
104 text, for example replace all occurrences of '&' with
105 '\&' except where the '&' is already followed by 'amp;'.
106 \row \i String Splitting
107 \i A regexp can be used to identify where a string should be
108 split into its component fields, e.g. splitting tab-delimited
109 strings.
110 \endtable
111
112 We present a very brief introduction to regexps, a description of
113 Qt's regexp language, some code examples, and finally the function
114 documentation itself. QRegExp is modeled on Perl's regexp
115 language, and also fully supports Unicode. QRegExp can also be
116 used in the weaker 'wildcard' (globbing) mode which works in a
117 similar way to command shells. A good text on regexps is \e
118 {Mastering Regular Expressions: Powerful Techniques for Perl and
119 Other Tools} by Jeffrey E. Friedl, ISBN 1565922573.
120
121 Experienced regexp users may prefer to skip the introduction and
122 go directly to the relevant information.
123
124 In case of multi-threaded programming, note that QRegExp depends on
125 QThreadStorage internally. For that reason, QRegExp should only be
126 used with threads started with QThread, i.e. not with threads
127 started with platform-specific APIs.
128
129 \tableofcontents
130
131 \section1 Introduction
132
133 Regexps are built up from expressions, quantifiers, and assertions.
134 The simplest form of expression is simply a character, e.g.
135 <b>x</b> or <b>5</b>. An expression can also be a set of
136 characters. For example, <b>[ABCD]</b>, will match an <b>A</b> or
137 a <b>B</b> or a <b>C</b> or a <b>D</b>. As a shorthand we could
138 write this as <b>[A-D]</b>. If we want to match any of the
139 captital letters in the English alphabet we can write
140 <b>[A-Z]</b>. A quantifier tells the regexp engine how many
141 occurrences of the expression we want, e.g. <b>x{1,1}</b> means
142 match an <b>x</b> which occurs at least once and at most once.
143 We'll look at assertions and more complex expressions later.
144
145 Note that in general regexps cannot be used to check for balanced
146 brackets or tags. For example if you want to match an opening html
147 \c <b> and its closing \c </b> you can only use a regexp if you
148 know that these tags are not nested; the html fragment, \c{<b>bold
149 <b>bolder</b></b>} will not match as expected. If you know the
150 maximum level of nesting it is possible to create a regexp that
151 will match correctly, but for an unknown level of nesting, regexps
152 will fail.
153
154 We'll start by writing a regexp to match integers in the range 0
155 to 99. We will require at least one digit so we will start with
156 <b>[0-9]{1,1}</b> which means match a digit exactly once. This
157 regexp alone will match integers in the range 0 to 9. To match one
158 or two digits we can increase the maximum number of occurrences so
159 the regexp becomes <b>[0-9]{1,2}</b> meaning match a digit at
160 least once and at most twice. However, this regexp as it stands
161 will not match correctly. This regexp will match one or two digits
162 \e within a string. To ensure that we match against the whole
163 string we must use the anchor assertions. We need <b>^</b> (caret)
164 which when it is the first character in the regexp means that the
165 regexp must match from the beginning of the string. And we also
166 need <b>$</b> (dollar) which when it is the last character in the
167 regexp means that the regexp must match until the end of the
168 string. So now our regexp is <b>^[0-9]{1,2}$</b>. Note that
169 assertions, such as <b>^</b> and <b>$</b>, do not match any
170 characters.
171
172 If you've seen regexps elsewhere they may have looked different from
173 the ones above. This is because some sets of characters and some
174 quantifiers are so common that they have special symbols to
175 represent them. <b>[0-9]</b> can be replaced with the symbol
176 <b>\d</b>. The quantifier to match exactly one occurrence,
177 <b>{1,1}</b>, can be replaced with the expression itself. This means
178 that <b>x{1,1}</b> is exactly the same as <b>x</b> alone. So our 0
179 to 99 matcher could be written <b>^\d{1,2}$</b>. Another way of
180 writing it would be <b>^\d\d{0,1}$</b>, i.e. from the start of the
181 string match a digit followed by zero or one digits. In practice
182 most people would write it <b>^\d\d?$</b>. The <b>?</b> is a
183 shorthand for the quantifier <b>{0,1}</b>, i.e. a minimum of no
184 occurrences a maximum of one occurrence. This is used to make an
185 expression optional. The regexp <b>^\d\d?$</b> means "from the
186 beginning of the string match one digit followed by zero or one
187 digits and then the end of the string".
188
189 Our second example is matching the words 'mail', 'letter' or
190 'correspondence' but without matching 'email', 'mailman',
191 'mailer', 'letterbox' etc. We'll start by just matching 'mail'. In
192 full the regexp is, <b>m{1,1}a{1,1}i{1,1}l{1,1}</b>, but since
193 each expression itself is automatically quantified by <b>{1,1}</b>
194 we can simply write this as <b>mail</b>; an 'm' followed by an 'a'
195 followed by an 'i' followed by an 'l'. The symbol '|' (bar) is
196 used for \e alternation, so our regexp now becomes
197 <b>mail|letter|correspondence</b> which means match 'mail' \e or
198 'letter' \e or 'correspondence'. Whilst this regexp will find the
199 words we want it will also find words we don't want such as
200 'email'. We will start by putting our regexp in parentheses,
201 <b>(mail|letter|correspondence)</b>. Parentheses have two effects,
202 firstly they group expressions together and secondly they identify
203 parts of the regexp that we wish to \link #capturing-text capture
204 \endlink. Our regexp still matches any of the three words but now
205 they are grouped together as a unit. This is useful for building
206 up more complex regexps. It is also useful because it allows us to
207 examine which of the words actually matched. We need to use
208 another assertion, this time <b>\b</b> "word boundary":
209 <b>\b(mail|letter|correspondence)\b</b>. This regexp means "match
210 a word boundary followed by the expression in parentheses followed
211 by another word boundary". The <b>\b</b> assertion matches at a \e
212 position in the regexp not a \e character in the regexp. A word
213 boundary is any non-word character such as a space a newline or
214 the beginning or end of the string.
215
216 For our third example we want to replace ampersands with the HTML
217 entity '\&'. The regexp to match is simple: <b>\&</b>, i.e.
218 match one ampersand. Unfortunately this will mess up our text if
219 some of the ampersands have already been turned into HTML
220 entities. So what we really want to say is replace an ampersand
221 providing it is not followed by 'amp;'. For this we need the
222 negative lookahead assertion and our regexp becomes:
223 <b>\&(?!amp;)</b>. The negative lookahead assertion is introduced
224 with '(?!' and finishes at the ')'. It means that the text it
225 contains, 'amp;' in our example, must \e not follow the expression
226 that preceeds it.
227
228 Regexps provide a rich language that can be used in a variety of
229 ways. For example suppose we want to count all the occurrences of
230 'Eric' and 'Eirik' in a string. Two valid regexps to match these
231 are <b>\\b(Eric|Eirik)\\b</b> and <b>\\bEi?ri[ck]\\b</b>. We need
232 the word boundary '\b' so we don't get 'Ericsson' etc. The second
233 regexp actually matches more than we want, 'Eric', 'Erik', 'Eiric'
234 and 'Eirik'.
235
236 We will implement some the examples above in the
237 \link #code-examples code examples \endlink section.
238
239 \target characters-and-abbreviations-for-sets-of-characters
240 \section1 Characters and Abbreviations for Sets of Characters
241
242 \table
243 \header \i Element \i Meaning
244 \row \i <b>c</b>
245 \i Any character represents itself unless it has a special
246 regexp meaning. Thus <b>c</b> matches the character \e c.
247 \row \i <b>\\c</b>
248 \i A character that follows a backslash matches the character
249 itself except where mentioned below. For example if you
250 wished to match a literal caret at the beginning of a string
251 you would write <b>\^</b>.
252 \row \i <b>\\a</b>
253 \i This matches the ASCII bell character (BEL, 0x07).
254 \row \i <b>\\f</b>
255 \i This matches the ASCII form feed character (FF, 0x0C).
256 \row \i <b>\\n</b>
257 \i This matches the ASCII line feed character (LF, 0x0A, Unix newline).
258 \row \i <b>\\r</b>
259 \i This matches the ASCII carriage return character (CR, 0x0D).
260 \row \i <b>\\t</b>
261 \i This matches the ASCII horizontal tab character (HT, 0x09).
262 \row \i <b>\\v</b>
263 \i This matches the ASCII vertical tab character (VT, 0x0B).
264 \row \i <b>\\xhhhh</b>
265 \i This matches the Unicode character corresponding to the
266 hexadecimal number hhhh (between 0x0000 and 0xFFFF). \0ooo
267 (i.e., \zero ooo) matches the ASCII/Latin-1 character
268 corresponding to the octal number ooo (between 0 and 0377).
269 \row \i <b>. (dot)</b>
270 \i This matches any character (including newline).
271 \row \i <b>\\d</b>
272 \i This matches a digit (QChar::isDigit()).
273 \row \i <b>\\D</b>
274 \i This matches a non-digit.
275 \row \i <b>\\s</b>
276 \i This matches a whitespace (QChar::isSpace()).
277 \row \i <b>\\S</b>
278 \i This matches a non-whitespace.
279 \row \i <b>\\w</b>
280 \i This matches a word character (QChar::isLetterOrNumber() or '_').
281 \row \i <b>\\W</b>
282 \i This matches a non-word character.
283 \row \i <b>\\n</b>
284 \i The n-th \link #capturing-text backreference \endlink,
285 e.g. \1, \2, etc.
286 \endtable
287
288 \e {Note that the C++ compiler transforms backslashes in strings
289 so to include a <b>\\</b> in a regexp you will need to enter it
290 twice, i.e. <b>\\\\</b>.}
291
292 \target sets-of-characters
293 \section1 Sets of Characters
294
295 Square brackets are used to match any character in the set of
296 characters contained within the square brackets. All the character
297 set abbreviations described above can be used within square
298 brackets. Apart from the character set abbreviations and the
299 following two exceptions no characters have special meanings in
300 square brackets.
301
302 \table
303 \row \i <b>^</b>
304 \i The caret negates the character set if it occurs as the
305 first character, i.e. immediately after the opening square
306 bracket. For example, <b>[abc]</b> matches 'a' or 'b' or 'c',
307 but <b>[^abc]</b> matches anything \e except 'a' or 'b' or
308 'c'.
309 \row \i <b>-</b>
310 \i The dash is used to indicate a range of characters, for
311 example <b>[W-Z]</b> matches 'W' or 'X' or 'Y' or 'Z'.
312 \endtable
313
314 Using the predefined character set abbreviations is more portable
315 than using character ranges across platforms and languages. For
316 example, <b>[0-9]</b> matches a digit in Western alphabets but
317 <b>\d</b> matches a digit in \e any alphabet.
318
319 Note that in most regexp literature sets of characters are called
320 "character classes".
321
322 \target quantifiers
323 \section1 Quantifiers
324
325 By default an expression is automatically quantified by
326 <b>{1,1}</b>, i.e. it should occur exactly once. In the following
327 list <b>\e {E}</b> stands for any expression. An expression is a
328 character or an abbreviation for a set of characters or a set of
329 characters in square brackets or any parenthesised expression.
330
331 \table
332 \row \i <b>\e {E}?</b>
333 \i Matches zero or one occurrence of \e E. This quantifier
334 means "the previous expression is optional" since it will
335 match whether or not the expression occurs in the string. It
336 is the same as <b>\e {E}{0,1}</b>. For example <b>dents?</b>
337 will match 'dent' and 'dents'.
338
339 \row \i <b>\e {E}+</b>
340 \i Matches one or more occurrences of \e E. This is the same
341 as <b>\e {E}{1,MAXINT}</b>. For example, <b>0+</b> will match
342 '0', '00', '000', etc.
343
344 \row \i <b>\e {E}*</b>
345 \i Matches zero or more occurrences of \e E. This is the same
346 as <b>\e {E}{0,MAXINT}</b>. The <b>*</b> quantifier is often
347 used by a mistake. Since it matches \e zero or more
348 occurrences it will match no occurrences at all. For example
349 if we want to match strings that end in whitespace and use
350 the regexp <b>\s*$</b> we would get a match on every string.
351 This is because we have said find zero or more whitespace
352 followed by the end of string, so even strings that don't end
353 in whitespace will match. The regexp we want in this case is
354 <b>\s+$</b> to match strings that have at least one
355 whitespace at the end.
356
357 \row \i <b>\e {E}{n}</b>
358 \i Matches exactly \e n occurrences of the expression. This
359 is the same as repeating the expression \e n times. For
360 example, <b>x{5}</b> is the same as <b>xxxxx</b>. It is also
361 the same as <b>\e {E}{n,n}</b>, e.g. <b>x{5,5}</b>.
362
363 \row \i <b>\e {E}{n,}</b>
364 \i Matches at least \e n occurrences of the expression. This
365 is the same as <b>\e {E}{n,MAXINT}</b>.
366
367 \row \i <b>\e {E}{,m}</b>
368 \i Matches at most \e m occurrences of the expression. This
369 is the same as <b>\e {E}{0,m}</b>.
370
371 \row \i <b>\e {E}{n,m}</b>
372 \i Matches at least \e n occurrences of the expression and at
373 most \e m occurrences of the expression.
374 \endtable
375
376 (MAXINT is implementation dependent but will not be smaller than
377 1024.)
378
379 If we wish to apply a quantifier to more than just the preceding
380 character we can use parentheses to group characters together in
381 an expression. For example, <b>tag+</b> matches a 't' followed by
382 an 'a' followed by at least one 'g', whereas <b>(tag)+</b> matches
383 at least one occurrence of 'tag'.
384
385 Note that quantifiers are "greedy". They will match as much text
386 as they can. For example, <b>0+</b> will match as many zeros as it
387 can from the first zero it finds, e.g. '2.<u>000</u>5'.
388 Quantifiers can be made non-greedy, see setMinimal().
389
390 \target capturing-text
391 \section1 Capturing Text
392
393 Parentheses allow us to group elements together so that we can
394 quantify and capture them. For example if we have the expression
395 <b>mail|letter|correspondence</b> that matches a string we know
396 that \e one of the words matched but not which one. Using
397 parentheses allows us to "capture" whatever is matched within
398 their bounds, so if we used <b>(mail|letter|correspondence)</b>
399 and matched this regexp against the string "I sent you some email"
400 we can use the cap() or capturedTexts() functions to extract the
401 matched characters, in this case 'mail'.
402
403 We can use captured text within the regexp itself. To refer to the
404 captured text we use \e backreferences which are indexed from 1,
405 the same as for cap(). For example we could search for duplicate
406 words in a string using <b>\b(\w+)\W+\1\b</b> which means match a
407 word boundary followed by one or more word characters followed by
408 one or more non-word characters followed by the same text as the
409 first parenthesised expression followed by a word boundary.
410
411 If we want to use parentheses purely for grouping and not for
412 capturing we can use the non-capturing syntax, e.g.
413 <b>(?:green|blue)</b>. Non-capturing parentheses begin '(?:' and
414 end ')'. In this example we match either 'green' or 'blue' but we
415 do not capture the match so we only know whether or not we matched
416 but not which color we actually found. Using non-capturing
417 parentheses is more efficient than using capturing parentheses
418 since the regexp engine has to do less book-keeping.
419
420 Both capturing and non-capturing parentheses may be nested.
421
422 \target assertions
423 \section1 Assertions
424
425 Assertions make some statement about the text at the point where
426 they occur in the regexp but they do not match any characters. In
427 the following list <b>\e {E}</b> stands for any expression.
428
429 \table
430 \row \i <b>^</b>
431 \i The caret signifies the beginning of the string. If you
432 wish to match a literal \c{^} you must escape it by
433 writing <b>\^</b>. For example, <b>^#include</b> will only
434 match strings which \e begin with the characters '#include'.
435 (When the caret is the first character of a character set it
436 has a special meaning, see \link #sets-of-characters Sets of
437 Characters \endlink.)
438
439 \row \i <b>$</b>
440 \i The dollar signifies the end of the string. For example
441 <b>\d\s*$</b> will match strings which end with a digit
442 optionally followed by whitespace. If you wish to match a
443 literal \c{$} you must escape it by writing
444 <b>\$</b>.
445
446 \row \i <b>\\b</b>
447 \i A word boundary. For example the regexp
448 <b>\\bOK\\b</b> means match immediately after a word
449 boundary (e.g. start of string or whitespace) the letter 'O'
450 then the letter 'K' immediately before another word boundary
451 (e.g. end of string or whitespace). But note that the
452 assertion does not actually match any whitespace so if we
453 write <b>(\\bOK\\b)</b> and we have a match it will only
454 contain 'OK' even if the string is "Its <u>OK</u> now".
455
456 \row \i <b>\\B</b>
457 \i A non-word boundary. This assertion is true wherever
458 <b>\\b</b> is false. For example if we searched for
459 <b>\\Bon\\B</b> in "Left on" the match would fail (space
460 and end of string aren't non-word boundaries), but it would
461 match in "t<u>on</u>ne".
462
463 \row \i <b>(?=\e E)</b>
464 \i Positive lookahead. This assertion is true if the
465 expression matches at this point in the regexp. For example,
466 <b>const(?=\\s+char)</b> matches 'const' whenever it is
467 followed by 'char', as in 'static <u>const</u> char *'.
468 (Compare with <b>const\\s+char</b>, which matches 'static
469 <u>const char</u> *'.)
470
471 \row \i <b>(?!\e E)</b>
472 \i Negative lookahead. This assertion is true if the
473 expression does not match at this point in the regexp. For
474 example, <b>const(?!\\s+char)</b> matches 'const' \e except
475 when it is followed by 'char'.
476 \endtable
477
478 \target wildcard-matching
479 \section1 Wildcard Matching (globbing)
480
481 Most command shells such as \e bash or \e cmd.exe support "file
482 globbing", the ability to identify a group of files by using
483 wildcards. The setWildcard() function is used to switch between
484 regexp and wildcard mode. Wildcard matching is much simpler than
485 full regexps and has only four features:
486
487 \table
488 \row \i <b>c</b>
489 \i Any character represents itself apart from those mentioned
490 below. Thus <b>c</b> matches the character \e c.
491 \row \i <b>?</b>
492 \i This matches any single character. It is the same as
493 <b>.</b> in full regexps.
494 \row \i <b>*</b>
495 \i This matches zero or more of any characters. It is the
496 same as <b>.*</b> in full regexps.
497 \row \i <b>[...]</b>
498 \i Sets of characters can be represented in square brackets,
499 similar to full regexps. Within the character class, like
500 outside, backslash has no special meaning.
501 \endtable
502
503 For example if we are in wildcard mode and have strings which
504 contain filenames we could identify HTML files with <b>*.html</b>.
505 This will match zero or more characters followed by a dot followed
506 by 'h', 't', 'm' and 'l'.
507
508 \target perl-users
509 \section1 Notes for Perl Users
510
511 Most of the character class abbreviations supported by Perl are
512 supported by QRegExp, see \link
513 #characters-and-abbreviations-for-sets-of-characters characters
514 and abbreviations for sets of characters \endlink.
515
516 In QRegExp, apart from within character classes, \c{^} always
517 signifies the start of the string, so carets must always be
518 escaped unless used for that purpose. In Perl the meaning of caret
519 varies automagically depending on where it occurs so escaping it
520 is rarely necessary. The same applies to \c{$} which in
521 QRegExp always signifies the end of the string.
522
523 QRegExp's quantifiers are the same as Perl's greedy quantifiers.
524 Non-greedy matching cannot be applied to individual quantifiers,
525 but can be applied to all the quantifiers in the pattern. For
526 example, to match the Perl regexp <b>ro+?m</b> requires:
527 \code
528 QRegExp rx( "ro+m" );
529 rx.setMinimal( TRUE );
530 \endcode
531
532 The equivalent of Perl's \c{/i} option is
533 setCaseSensitive(FALSE).
534
535 Perl's \c{/g} option can be emulated using a \link
536 #cap_in_a_loop loop \endlink.
537
538 In QRegExp <b>.</b> matches any character, therefore all QRegExp
539 regexps have the equivalent of Perl's \c{/s} option. QRegExp
540 does not have an equivalent to Perl's \c{/m} option, but this
541 can be emulated in various ways for example by splitting the input
542 into lines or by looping with a regexp that searches for newlines.
543
544 Because QRegExp is string oriented there are no \A, \Z or \z
545 assertions. The \G assertion is not supported but can be emulated
546 in a loop.
547
548 Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp
549 equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,
550 ... correspond to cap(1) or capturedTexts()[1], cap(2) or
551 capturedTexts()[2], etc.
552
553 To substitute a pattern use QString::replace().
554
555 Perl's extended \c{/x} syntax is not supported, nor are
556 directives, e.g. (?i), or regexp comments, e.g. (?#comment). On
557 the other hand, C++'s rules for literal strings can be used to
558 achieve the same:
559 \code
560 QRegExp mark( "\\b" // word boundary
561 "[Mm]ark" // the word we want to match
562 );
563 \endcode
564
565 Both zero-width positive and zero-width negative lookahead
566 assertions (?=pattern) and (?!pattern) are supported with the same
567 syntax as Perl. Perl's lookbehind assertions, "independent"
568 subexpressions and conditional expressions are not supported.
569
570 Non-capturing parentheses are also supported, with the same
571 (?:pattern) syntax.
572
573 See QStringList::split() and QStringList::join() for equivalents
574 to Perl's split and join functions.
575
576 Note: because C++ transforms \\'s they must be written \e twice in
577 code, e.g. <b>\\b</b> must be written <b>\\\\b</b>.
578
579 \target code-examples
580 \section1 Code Examples
581
582 \code
583 QRegExp rx( "^\\d\\d?$" ); // match integers 0 to 99
584 rx.search( "123" ); // returns -1 (no match)
585 rx.search( "-6" ); // returns -1 (no match)
586 rx.search( "6" ); // returns 0 (matched as position 0)
587 \endcode
588
589 The third string matches '<u>6</u>'. This is a simple validation
590 regexp for integers in the range 0 to 99.
591
592 \code
593 QRegExp rx( "^\\S+$" ); // match strings without whitespace
594 rx.search( "Hello world" ); // returns -1 (no match)
595 rx.search( "This_is-OK" ); // returns 0 (matched at position 0)
596 \endcode
597
598 The second string matches '<u>This_is-OK</u>'. We've used the
599 character set abbreviation '\S' (non-whitespace) and the anchors
600 to match strings which contain no whitespace.
601
602 In the following example we match strings containing 'mail' or
603 'letter' or 'correspondence' but only match whole words i.e. not
604 'email'
605
606 \code
607 QRegExp rx( "\\b(mail|letter|correspondence)\\b" );
608 rx.search( "I sent you an email" ); // returns -1 (no match)
609 rx.search( "Please write the letter" ); // returns 17
610 \endcode
611
612 The second string matches "Please write the <u>letter</u>". The
613 word 'letter' is also captured (because of the parentheses). We
614 can see what text we've captured like this:
615
616 \code
617 QString captured = rx.cap( 1 ); // captured == "letter"
618 \endcode
619
620 This will capture the text from the first set of capturing
621 parentheses (counting capturing left parentheses from left to
622 right). The parentheses are counted from 1 since cap( 0 ) is the
623 whole matched regexp (equivalent to '&' in most regexp engines).
624
625 \code
626 QRegExp rx( "&(?!amp;)" ); // match ampersands but not &
627 QString line1 = "This & that";
628 line1.replace( rx, "&" );
629 // line1 == "This & that"
630 QString line2 = "His & hers & theirs";
631 line2.replace( rx, "&" );
632 // line2 == "His & hers & theirs"
633 \endcode
634
635 Here we've passed the QRegExp to QString's replace() function to
636 replace the matched text with new text.
637
638 \code
639 QString str = "One Eric another Eirik, and an Ericsson."
640 " How many Eiriks, Eric?";
641 QRegExp rx( "\\b(Eric|Eirik)\\b" ); // match Eric or Eirik
642 int pos = 0; // where we are in the string
643 int count = 0; // how many Eric and Eirik's we've counted
644 while ( pos >= 0 ) {
645 pos = rx.search( str, pos );
646 if ( pos >= 0 ) {
647 pos++; // move along in str
648 count++; // count our Eric or Eirik
649 }
650 }
651 \endcode
652
653 We've used the search() function to repeatedly match the regexp in
654 the string. Note that instead of moving forward by one character
655 at a time \c pos++ we could have written \c {pos +=
656 rx.matchedLength()} to skip over the already matched string. The
657 count will equal 3, matching 'One <u>Eric</u> another
658 <u>Eirik</u>, and an Ericsson. How many Eiriks, <u>Eric</u>?'; it
659 doesn't match 'Ericsson' or 'Eiriks' because they are not bounded
660 by non-word boundaries.
661
662 One common use of regexps is to split lines of delimited data into
663 their component fields.
664
665 \code
666 str = "Trolltech AS\twww.trolltech.com\tNorway";
667 QString company, web, country;
668 rx.setPattern( "^([^\t]+)\t([^\t]+)\t([^\t]+)$" );
669 if ( rx.search( str ) != -1 ) {
670 company = rx.cap( 1 );
671 web = rx.cap( 2 );
672 country = rx.cap( 3 );
673 }
674 \endcode
675
676 In this example our input lines have the format company name, web
677 address and country. Unfortunately the regexp is rather long and
678 not very versatile -- the code will break if we add any more
679 fields. A simpler and better solution is to look for the
680 separator, '\t' in this case, and take the surrounding text. The
681 QStringList split() function can take a separator string or regexp
682 as an argument and split a string accordingly.
683
684 \code
685 QStringList field = QStringList::split( "\t", str );
686 \endcode
687
688 Here field[0] is the company, field[1] the web address and so on.
689
690 To imitate the matching of a shell we can use wildcard mode.
691
692 \code
693 QRegExp rx( "*.html" ); // invalid regexp: * doesn't quantify anything
694 rx.setWildcard( TRUE ); // now it's a valid wildcard regexp
695 rx.exactMatch( "index.html" ); // returns TRUE
696 rx.exactMatch( "default.htm" ); // returns FALSE
697 rx.exactMatch( "readme.txt" ); // returns FALSE
698 \endcode
699
700 Wildcard matching can be convenient because of its simplicity, but
701 any wildcard regexp can be defined using full regexps, e.g.
702 <b>.*\.html$</b>. Notice that we can't match both \c .html and \c
703 .htm files with a wildcard unless we use <b>*.htm*</b> which will
704 also match 'test.html.bak'. A full regexp gives us the precision
705 we need, <b>.*\\.html?$</b>.
706
707 QRegExp can match case insensitively using setCaseSensitive(), and
708 can use non-greedy matching, see setMinimal(). By default QRegExp
709 uses full regexps but this can be changed with setWildcard().
710 Searching can be forward with search() or backward with
711 searchRev(). Captured text can be accessed using capturedTexts()
712 which returns a string list of all captured strings, or using
713 cap() which returns the captured string for the given index. The
714 pos() function takes a match index and returns the position in the
715 string where the match was made (or -1 if there was no match).
716
717 \sa QRegExpValidator QString QStringList
718
719 \target member-function-documentation
720 */
721
722 const int NumBadChars = 64;
723 #define BadChar( ch ) ( (ch).unicode() % NumBadChars )
724
725 const int NoOccurrence = INT_MAX;
726 const int EmptyCapture = INT_MAX;
727 const int InftyLen = INT_MAX;
728 const int InftyRep = 1025;
729 const int EOS = -1;
730
isWord(QChar ch)731 static bool isWord( QChar ch )
732 {
733 return ch.isLetterOrNumber() || ch == QChar( '_' );
734 }
735
736 /*
737 Merges two QMemArrays of ints and puts the result into the first
738 one.
739 */
mergeInto(QMemArray<int> * a,const QMemArray<int> & b)740 static void mergeInto( QMemArray<int> *a, const QMemArray<int>& b )
741 {
742 int asize = a->size();
743 int bsize = b.size();
744 if ( asize == 0 ) {
745 *a = b.copy();
746 #ifndef QT_NO_REGEXP_OPTIM
747 } else if ( bsize == 1 && (*a)[asize - 1] < b[0] ) {
748 a->resize( asize + 1 );
749 (*a)[asize] = b[0];
750 #endif
751 } else if ( bsize >= 1 ) {
752 int csize = asize + bsize;
753 QMemArray<int> c( csize );
754 int i = 0, j = 0, k = 0;
755 while ( i < asize ) {
756 if ( j < bsize ) {
757 if ( (*a)[i] == b[j] ) {
758 i++;
759 csize--;
760 } else if ( (*a)[i] < b[j] ) {
761 c[k++] = (*a)[i++];
762 } else {
763 c[k++] = b[j++];
764 }
765 } else {
766 memcpy( c.data() + k, (*a).data() + i,
767 (asize - i) * sizeof(int) );
768 break;
769 }
770 }
771 c.resize( csize );
772 if ( j < bsize )
773 memcpy( c.data() + k, b.data() + j, (bsize - j) * sizeof(int) );
774 *a = c;
775 }
776 }
777
778 /*
779 Merges two disjoint QMaps of (int, int) pairs and puts the result
780 into the first one.
781 */
mergeInto(QMap<int,int> * a,const QMap<int,int> & b)782 static void mergeInto( QMap<int, int> *a, const QMap<int, int>& b )
783 {
784 QMap<int, int>::ConstIterator it;
785 for ( it = b.begin(); it != b.end(); ++it )
786 a->insert( it.key(), *it );
787 }
788
789 /*
790 Returns the value associated to key k in QMap m of (int, int)
791 pairs, or 0 if no such value is explicitly present.
792 */
at(const QMap<int,int> & m,int k)793 static int at( const QMap<int, int>& m, int k )
794 {
795 QMap<int, int>::ConstIterator it = m.find( k );
796 if ( it == m.end() )
797 return 0;
798 else
799 return *it;
800 }
801
802 #ifndef QT_NO_REGEXP_WILDCARD
803 /*
804 Translates a wildcard pattern to an equivalent regular expression
805 pattern (e.g., *.cpp to .*\.cpp).
806 */
wc2rx(const QString & wc_str)807 static QString wc2rx( const QString& wc_str )
808 {
809 int wclen = wc_str.length();
810 QString rx = QString::fromLatin1( "" );
811 int i = 0;
812 const QChar *wc = wc_str.unicode();
813 while ( i < wclen ) {
814 QChar c = wc[i++];
815 switch ( c.unicode() ) {
816 case '*':
817 rx += QString::fromLatin1( ".*" );
818 break;
819 case '?':
820 rx += QChar( '.' );
821 break;
822 case '$':
823 case '(':
824 case ')':
825 case '+':
826 case '.':
827 case '\\':
828 case '^':
829 case '{':
830 case '|':
831 case '}':
832 rx += QChar( '\\' );
833 rx += c;
834 break;
835 case '[':
836 rx += c;
837 if ( wc[i] == QChar('^') )
838 rx += wc[i++];
839 if ( i < wclen ) {
840 if ( rx[i] == ']' )
841 rx += wc[i++];
842 while ( i < wclen && wc[i] != QChar(']') ) {
843 if ( wc[i] == '\\' )
844 rx += QChar( '\\' );
845 rx += wc[i++];
846 }
847 }
848 break;
849 default:
850 rx += c;
851 }
852 }
853 return rx;
854 }
855 #endif
856
857 /*
858 The class QRegExpEngine encapsulates a modified nondeterministic
859 finite automaton (NFA).
860 */
861 class QRegExpEngine : public QShared
862 {
863 public:
864 #ifndef QT_NO_REGEXP_CCLASS
865 /*
866 The class CharClass represents a set of characters, such as can
867 be found in regular expressions (e.g., [a-z] denotes the set
868 {a, b, ..., z}).
869 */
870 class CharClass
871 {
872 public:
873 CharClass();
CharClass(const CharClass & cc)874 CharClass( const CharClass& cc ) { operator=( cc ); }
875
876 CharClass& operator=( const CharClass& cc );
877
878 void clear();
negative() const879 bool negative() const { return n; }
880 void setNegative( bool negative );
881 void addCategories( int cats );
882 void addRange( ushort from, ushort to );
addSingleton(ushort ch)883 void addSingleton( ushort ch ) { addRange( ch, ch ); }
884
885 bool in( QChar ch ) const;
886 #ifndef QT_NO_REGEXP_OPTIM
firstOccurrence() const887 const QMemArray<int>& firstOccurrence() const { return occ1; }
888 #endif
889
890 #if defined(QT_DEBUG)
891 void dump() const;
892 #endif
893
894 private:
895 /*
896 The struct Range represents a range of characters (e.g.,
897 [0-9] denotes range 48 to 57).
898 */
899 struct Range
900 {
901 ushort from; // 48
902 ushort to; // 57
903 };
904
905 int c; // character classes
906 QMemArray<Range> r; // character ranges
907 bool n; // negative?
908 #ifndef QT_NO_REGEXP_OPTIM
909 QMemArray<int> occ1; // first-occurrence array
910 #endif
911 };
912 #else
913 struct CharClass
914 {
915 int dummy;
916
917 #ifndef QT_NO_REGEXP_OPTIM
918 CharClass() { occ1.fill( 0, NumBadChars ); }
919
920 const QMemArray<int>& firstOccurrence() const { return occ1; }
921 QMemArray<int> occ1;
922 #endif
923 };
924 #endif
925
QRegExpEngine(bool caseSensitive)926 QRegExpEngine( bool caseSensitive ) { setup( caseSensitive ); }
927 QRegExpEngine( const QString& rx, bool caseSensitive );
928 #ifndef QT_NO_REGEXP_OPTIM
929 ~QRegExpEngine();
930 #endif
931
isValid() const932 bool isValid() const { return valid; }
caseSensitive() const933 bool caseSensitive() const { return cs; }
errorString() const934 const QString& errorString() const { return yyError; }
numCaptures() const935 int numCaptures() const { return officialncap; }
936 void match( const QString& str, int pos, bool minimal, bool oneTest,
937 int caretIndex, QMemArray<int>& captured );
partialMatchLength() const938 int partialMatchLength() const { return mmOneTestMatchedLen; }
939
940 int createState( QChar ch );
941 int createState( const CharClass& cc );
942 #ifndef QT_NO_REGEXP_BACKREF
943 int createState( int bref );
944 #endif
945
946 void addCatTransitions( const QMemArray<int>& from,
947 const QMemArray<int>& to );
948 #ifndef QT_NO_REGEXP_CAPTURE
949 void addPlusTransitions( const QMemArray<int>& from,
950 const QMemArray<int>& to, int atom );
951 #endif
952
953 #ifndef QT_NO_REGEXP_ANCHOR_ALT
954 int anchorAlternation( int a, int b );
955 int anchorConcatenation( int a, int b );
956 #else
anchorAlternation(int a,int b)957 int anchorAlternation( int a, int b ) { return a & b; }
anchorConcatenation(int a,int b)958 int anchorConcatenation( int a, int b ) { return a | b; }
959 #endif
960 void addAnchors( int from, int to, int a );
961
962 #ifndef QT_NO_REGEXP_OPTIM
963 void heuristicallyChooseHeuristic();
964 #endif
965
966 #if defined(QT_DEBUG)
967 void dump() const;
968 #endif
969
970 private:
971 enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };
972
973 /*
974 The struct State represents one state in a modified NFA. The
975 input characters matched are stored in the state instead of on
976 the transitions, something possible for an automaton
977 constructed from a regular expression.
978 */
979 struct State
980 {
981 #ifndef QT_NO_REGEXP_CAPTURE
982 int atom; // which atom does this state belong to?
983 #endif
984 int match; // what does it match? (see CharClassBit and BackRefBit)
985 QMemArray<int> outs; // out-transitions
986 QMap<int, int> *reenter; // atoms reentered when transiting out
987 QMap<int, int> *anchors; // anchors met when transiting out
988
989 #ifndef QT_NO_REGEXP_CAPTURE
StateQRegExpEngine::State990 State( int a, int m )
991 : atom( a ), match( m ), reenter( 0 ), anchors( 0 ) { }
992 #else
StateQRegExpEngine::State993 State( int m )
994 : match( m ), reenter( 0 ), anchors( 0 ) { }
995 #endif
~StateQRegExpEngine::State996 ~State() { delete reenter; delete anchors; }
997 };
998
999 #ifndef QT_NO_REGEXP_LOOKAHEAD
1000 /*
1001 The struct Lookahead represents a lookahead a la Perl (e.g.,
1002 (?=foo) and (?!bar)).
1003 */
1004 struct Lookahead
1005 {
1006 QRegExpEngine *eng; // NFA representing the embedded regular expression
1007 bool neg; // negative lookahead?
1008
LookaheadQRegExpEngine::Lookahead1009 Lookahead( QRegExpEngine *eng0, bool neg0 )
1010 : eng( eng0 ), neg( neg0 ) { }
~LookaheadQRegExpEngine::Lookahead1011 ~Lookahead() { delete eng; }
1012 };
1013 #endif
1014
1015 #ifndef QT_NO_REGEXP_CAPTURE
1016 /*
1017 The struct Atom represents one node in the hierarchy of regular
1018 expression atoms.
1019 */
1020 struct Atom
1021 {
1022 int parent; // index of parent in array of atoms
1023 int capture; // index of capture, from 1 to ncap
1024 };
1025 #endif
1026
1027 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1028 /*
1029 The struct AnchorAlternation represents a pair of anchors with
1030 OR semantics.
1031 */
1032 struct AnchorAlternation
1033 {
1034 int a; // this anchor...
1035 int b; // ...or this one
1036 };
1037 #endif
1038
1039 enum { InitialState = 0, FinalState = 1 };
1040 void setup( bool caseSensitive );
1041 int setupState( int match );
1042
1043 /*
1044 Let's hope that 13 lookaheads and 14 back-references are
1045 enough.
1046 */
1047 enum { MaxLookaheads = 13, MaxBackRefs = 14 };
1048 enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002,
1049 Anchor_Word = 0x00000004, Anchor_NonWord = 0x00000008,
1050 Anchor_FirstLookahead = 0x00000010,
1051 Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
1052 Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,
1053 Anchor_Alternation = Anchor_BackRef1Empty << MaxBackRefs,
1054
1055 Anchor_LookaheadMask = ( Anchor_FirstLookahead - 1 ) ^
1056 ( (Anchor_FirstLookahead << MaxLookaheads) - 1 ) };
1057 #ifndef QT_NO_REGEXP_CAPTURE
1058 int startAtom( bool capture );
finishAtom(int atom)1059 void finishAtom( int atom ) { cf = f[atom].parent; }
1060 #endif
1061
1062 #ifndef QT_NO_REGEXP_LOOKAHEAD
1063 int addLookahead( QRegExpEngine *eng, bool negative );
1064 #endif
1065
1066 #ifndef QT_NO_REGEXP_CAPTURE
1067 bool isBetterCapture( const int *begin1, const int *end1, const int *begin2,
1068 const int *end2 );
1069 #endif
1070 bool testAnchor( int i, int a, const int *capBegin );
1071
1072 #ifndef QT_NO_REGEXP_OPTIM
1073 bool goodStringMatch();
1074 bool badCharMatch();
1075 #else
1076 bool bruteMatch();
1077 #endif
1078 bool matchHere();
1079
1080 QPtrVector<State> s; // array of states
1081 int ns; // number of states
1082 #ifndef QT_NO_REGEXP_CAPTURE
1083 QMemArray<Atom> f; // atom hierarchy
1084 int nf; // number of atoms
1085 int cf; // current atom
1086 #endif
1087 int officialncap; // number of captures, seen from the outside
1088 int ncap; // number of captures, seen from the inside
1089 #ifndef QT_NO_REGEXP_CCLASS
1090 QPtrVector<CharClass> cl; // array of character classes
1091 #endif
1092 #ifndef QT_NO_REGEXP_LOOKAHEAD
1093 QPtrVector<Lookahead> ahead; // array of lookaheads
1094 #endif
1095 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1096 QMemArray<AnchorAlternation> aa; // array of (a, b) pairs of anchors
1097 #endif
1098 #ifndef QT_NO_REGEXP_OPTIM
1099 bool caretAnchored; // does the regexp start with ^?
1100 bool trivial; // is the good-string all that needs to match?
1101 #endif
1102 bool valid; // is the regular expression valid?
1103 bool cs; // case sensitive?
1104 #ifndef QT_NO_REGEXP_BACKREF
1105 int nbrefs; // number of back-references
1106 #endif
1107
1108 #ifndef QT_NO_REGEXP_OPTIM
1109 bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch
1110
1111 int goodEarlyStart; // the index where goodStr can first occur in a match
1112 int goodLateStart; // the index where goodStr can last occur in a match
1113 QString goodStr; // the string that any match has to contain
1114
1115 int minl; // the minimum length of a match
1116 QMemArray<int> occ1; // first-occurrence array
1117 #endif
1118
1119 /*
1120 The class Box is an abstraction for a regular expression
1121 fragment. It can also be seen as one node in the syntax tree of
1122 a regular expression with synthetized attributes.
1123
1124 Its interface is ugly for performance reasons.
1125 */
1126 class Box
1127 {
1128 public:
1129 Box( QRegExpEngine *engine );
Box(const Box & b)1130 Box( const Box& b ) { operator=( b ); }
1131
1132 Box& operator=( const Box& b );
1133
clear()1134 void clear() { operator=( Box(eng) ); }
1135 void set( QChar ch );
1136 void set( const CharClass& cc );
1137 #ifndef QT_NO_REGEXP_BACKREF
1138 void set( int bref );
1139 #endif
1140
1141 void cat( const Box& b );
1142 void orx( const Box& b );
1143 void plus( int atom );
1144 void opt();
1145 void catAnchor( int a );
1146 #ifndef QT_NO_REGEXP_OPTIM
1147 void setupHeuristics();
1148 #endif
1149
1150 #if defined(QT_DEBUG)
1151 void dump() const;
1152 #endif
1153
1154 private:
1155 void addAnchorsToEngine( const Box& to ) const;
1156
1157 QRegExpEngine *eng; // the automaton under construction
1158 QMemArray<int> ls; // the left states (firstpos)
1159 QMemArray<int> rs; // the right states (lastpos)
1160 QMap<int, int> lanchors; // the left anchors
1161 QMap<int, int> ranchors; // the right anchors
1162 int skipanchors; // the anchors to match if the box is skipped
1163
1164 #ifndef QT_NO_REGEXP_OPTIM
1165 int earlyStart; // the index where str can first occur
1166 int lateStart; // the index where str can last occur
1167 QString str; // a string that has to occur in any match
1168 QString leftStr; // a string occurring at the left of this box
1169 QString rightStr; // a string occurring at the right of this box
1170 int maxl; // the maximum length of this box (possibly InftyLen)
1171 #endif
1172
1173 int minl; // the minimum length of this box
1174 #ifndef QT_NO_REGEXP_OPTIM
1175 QMemArray<int> occ1; // first-occurrence array
1176 #endif
1177 };
1178 friend class Box;
1179
1180 /*
1181 This is the lexical analyzer for regular expressions.
1182 */
1183 enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen,
1184 Tok_PosLookahead, Tok_NegLookahead, Tok_RightParen, Tok_CharClass,
1185 Tok_Caret, Tok_Quantifier, Tok_Bar, Tok_Word, Tok_NonWord,
1186 Tok_Char = 0x10000, Tok_BackRef = 0x20000 };
1187 int getChar();
1188 int getEscape();
1189 #ifndef QT_NO_REGEXP_INTERVAL
1190 int getRep( int def );
1191 #endif
1192 #ifndef QT_NO_REGEXP_LOOKAHEAD
1193 void skipChars( int n );
1194 #endif
1195 void error( const char *msg );
1196 void startTokenizer( const QChar *rx, int len );
1197 int getToken();
1198
1199 const QChar *yyIn; // a pointer to the input regular expression pattern
1200 int yyPos0; // the position of yyTok in the input pattern
1201 int yyPos; // the position of the next character to read
1202 int yyLen; // the length of yyIn
1203 int yyCh; // the last character read
1204 CharClass *yyCharClass; // attribute for Tok_CharClass tokens
1205 int yyMinRep; // attribute for Tok_Quantifier
1206 int yyMaxRep; // ditto
1207 QString yyError; // syntax error or overflow during parsing?
1208
1209 /*
1210 This is the syntactic analyzer for regular expressions.
1211 */
1212 int parse( const QChar *rx, int len );
1213 void parseAtom( Box *box );
1214 void parseFactor( Box *box );
1215 void parseTerm( Box *box );
1216 void parseExpression( Box *box );
1217
1218 int yyTok; // the last token read
1219 bool yyMayCapture; // set this to FALSE to disable capturing
1220
1221 /*
1222 This is the engine state during matching.
1223 */
1224 const QString *mmStr; // a pointer to the input QString
1225 const QChar *mmIn; // a pointer to the input string data
1226 int mmPos; // the current position in the string
1227 int mmCaretPos;
1228 int mmLen; // the length of the input string
1229 bool mmMinimal; // minimal matching?
1230 QMemArray<int> mmBigArray; // big QMemArray<int> array
1231 int *mmInNextStack; // is state is mmNextStack?
1232 int *mmCurStack; // stack of current states
1233 int *mmNextStack; // stack of next states
1234 int *mmCurCapBegin; // start of current states' captures
1235 int *mmNextCapBegin; // start of next states' captures
1236 int *mmCurCapEnd; // end of current states' captures
1237 int *mmNextCapEnd; // end of next states' captures
1238 int *mmTempCapBegin; // start of temporary captures
1239 int *mmTempCapEnd; // end of temporary captures
1240 int *mmCapBegin; // start of captures for a next state
1241 int *mmCapEnd; // end of captures for a next state
1242 int *mmSlideTab; // bump-along slide table for bad-character heuristic
1243 int mmSlideTabSize; // size of slide table
1244 #ifndef QT_NO_REGEXP_BACKREF
1245 QIntDict<int> mmSleeping; // dictionary of back-reference sleepers
1246 #endif
1247 int mmMatchLen; // length of match
1248 int mmOneTestMatchedLen; // length of partial match
1249 };
1250
QRegExpEngine(const QString & rx,bool caseSensitive)1251 QRegExpEngine::QRegExpEngine( const QString& rx, bool caseSensitive )
1252 #ifndef QT_NO_REGEXP_BACKREF
1253 : mmSleeping( 101 )
1254 #endif
1255 {
1256 setup( caseSensitive );
1257 valid = ( parse(rx.unicode(), rx.length()) == (int) rx.length() );
1258 if ( !valid ) {
1259 #ifndef QT_NO_REGEXP_OPTIM
1260 trivial = FALSE;
1261 #endif
1262 error( RXERR_LEFTDELIM );
1263 }
1264 }
1265
1266 #ifndef QT_NO_REGEXP_OPTIM
~QRegExpEngine()1267 QRegExpEngine::~QRegExpEngine()
1268 {
1269 }
1270 #endif
1271
1272 /*
1273 Tries to match in str and returns an array of (begin, length) pairs
1274 for captured text. If there is no match, all pairs are (-1, -1).
1275 */
match(const QString & str,int pos,bool minimal,bool oneTest,int caretIndex,QMemArray<int> & captured)1276 void QRegExpEngine::match( const QString& str, int pos, bool minimal,
1277 bool oneTest, int caretIndex,
1278 QMemArray<int>& captured )
1279 {
1280 bool matched = FALSE;
1281
1282 #ifndef QT_NO_REGEXP_OPTIM
1283 if ( trivial && !oneTest ) {
1284 mmPos = str.find( goodStr, pos, cs );
1285 mmMatchLen = goodStr.length();
1286 matched = ( mmPos != -1 );
1287 } else
1288 #endif
1289 {
1290 mmStr = &str;
1291 mmIn = str.unicode();
1292 if ( mmIn == 0 )
1293 mmIn = &QChar::null;
1294 mmPos = pos;
1295 mmCaretPos = caretIndex;
1296 mmLen = str.length();
1297 mmMinimal = minimal;
1298 mmMatchLen = 0;
1299 mmOneTestMatchedLen = 0;
1300
1301 if ( valid && mmPos >= 0 && mmPos <= mmLen ) {
1302 #ifndef QT_NO_REGEXP_OPTIM
1303 if ( oneTest ) {
1304 matched = matchHere();
1305 } else {
1306 if ( mmPos <= mmLen - minl ) {
1307 if ( caretAnchored ) {
1308 matched = matchHere();
1309 } else if ( useGoodStringHeuristic ) {
1310 matched = goodStringMatch();
1311 } else {
1312 matched = badCharMatch();
1313 }
1314 }
1315 }
1316 #else
1317 matched = oneTest ? matchHere() : bruteMatch();
1318 #endif
1319 }
1320 }
1321
1322 int capturedSize = 2 + 2 * officialncap;
1323 captured.detach();
1324 captured.resize( capturedSize );
1325 if ( matched ) {
1326 captured[0] = mmPos;
1327 captured[1] = mmMatchLen;
1328 for ( int j = 0; j < officialncap; j++ ) {
1329 int len = mmCapEnd[j] - mmCapBegin[j];
1330 captured[2 + 2 * j] = len > 0 ? mmPos + mmCapBegin[j] : 0;
1331 captured[2 + 2 * j + 1] = len;
1332 }
1333 } else {
1334 // we rely on 2's complement here
1335 memset( captured.data(), -1, capturedSize * sizeof(int) );
1336 }
1337 }
1338
1339 /*
1340 The three following functions add one state to the automaton and
1341 return the number of the state.
1342 */
1343
createState(QChar ch)1344 int QRegExpEngine::createState( QChar ch )
1345 {
1346 return setupState( ch.unicode() );
1347 }
1348
createState(const CharClass & cc)1349 int QRegExpEngine::createState( const CharClass& cc )
1350 {
1351 #ifndef QT_NO_REGEXP_CCLASS
1352 int n = cl.size();
1353 cl.resize( n + 1 );
1354 cl.insert( n, new CharClass(cc) );
1355 return setupState( CharClassBit | n );
1356 #else
1357 Q_UNUSED( cc );
1358 return setupState( CharClassBit );
1359 #endif
1360 }
1361
1362 #ifndef QT_NO_REGEXP_BACKREF
createState(int bref)1363 int QRegExpEngine::createState( int bref )
1364 {
1365 if ( bref > nbrefs ) {
1366 nbrefs = bref;
1367 if ( nbrefs > MaxBackRefs ) {
1368 error( RXERR_LIMIT );
1369 return 0;
1370 }
1371 }
1372 return setupState( BackRefBit | bref );
1373 }
1374 #endif
1375
1376 /*
1377 The two following functions add a transition between all pairs of
1378 states (i, j) where i is fond in from, and j is found in to.
1379
1380 Cat-transitions are distinguished from plus-transitions for
1381 capturing.
1382 */
1383
addCatTransitions(const QMemArray<int> & from,const QMemArray<int> & to)1384 void QRegExpEngine::addCatTransitions( const QMemArray<int>& from,
1385 const QMemArray<int>& to )
1386 {
1387 for ( int i = 0; i < (int) from.size(); i++ ) {
1388 State *st = s[from[i]];
1389 mergeInto( &st->outs, to );
1390 }
1391 }
1392
1393 #ifndef QT_NO_REGEXP_CAPTURE
addPlusTransitions(const QMemArray<int> & from,const QMemArray<int> & to,int atom)1394 void QRegExpEngine::addPlusTransitions( const QMemArray<int>& from,
1395 const QMemArray<int>& to, int atom )
1396 {
1397 for ( int i = 0; i < (int) from.size(); i++ ) {
1398 State *st = s[from[i]];
1399 QMemArray<int> oldOuts = st->outs.copy();
1400 mergeInto( &st->outs, to );
1401 if ( f[atom].capture >= 0 ) {
1402 if ( st->reenter == 0 )
1403 st->reenter = new QMap<int, int>;
1404 for ( int j = 0; j < (int) to.size(); j++ ) {
1405 if ( !st->reenter->contains(to[j]) &&
1406 oldOuts.bsearch(to[j]) < 0 )
1407 st->reenter->insert( to[j], atom );
1408 }
1409 }
1410 }
1411 }
1412 #endif
1413
1414 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1415 /*
1416 Returns an anchor that means a OR b.
1417 */
anchorAlternation(int a,int b)1418 int QRegExpEngine::anchorAlternation( int a, int b )
1419 {
1420 if ( ((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0 )
1421 return a & b;
1422
1423 int n = aa.size();
1424 #ifndef QT_NO_REGEXP_OPTIM
1425 if ( n > 0 && aa[n - 1].a == a && aa[n - 1].b == b )
1426 return Anchor_Alternation | ( n - 1 );
1427 #endif
1428
1429 aa.resize( n + 1 );
1430 aa[n].a = a;
1431 aa[n].b = b;
1432 return Anchor_Alternation | n;
1433 }
1434
1435 /*
1436 Returns an anchor that means a AND b.
1437 */
anchorConcatenation(int a,int b)1438 int QRegExpEngine::anchorConcatenation( int a, int b )
1439 {
1440 if ( ((a | b) & Anchor_Alternation) == 0 )
1441 return a | b;
1442 if ( (b & Anchor_Alternation) != 0 )
1443 qSwap( a, b );
1444
1445 int aprime = anchorConcatenation( aa[a ^ Anchor_Alternation].a, b );
1446 int bprime = anchorConcatenation( aa[a ^ Anchor_Alternation].b, b );
1447 return anchorAlternation( aprime, bprime );
1448 }
1449 #endif
1450
1451 /*
1452 Adds anchor a on a transition caracterised by its from state and
1453 its to state.
1454 */
addAnchors(int from,int to,int a)1455 void QRegExpEngine::addAnchors( int from, int to, int a )
1456 {
1457 State *st = s[from];
1458 if ( st->anchors == 0 )
1459 st->anchors = new QMap<int, int>;
1460 if ( st->anchors->contains(to) )
1461 a = anchorAlternation( (*st->anchors)[to], a );
1462 st->anchors->insert( to, a );
1463 }
1464
1465 #ifndef QT_NO_REGEXP_OPTIM
1466 /*
1467 This function chooses between the good-string and the bad-character
1468 heuristics. It computes two scores and chooses the heuristic with
1469 the highest score.
1470
1471 Here are some common-sense constraints on the scores that should be
1472 respected if the formulas are ever modified: (1) If goodStr is
1473 empty, the good-string heuristic scores 0. (2) If the regular
1474 expression is trivial, the good-string heuristic should be used.
1475 (3) If the search is case insensitive, the good-string heuristic
1476 should be used, unless it scores 0. (Case insensitivity turns all
1477 entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is
1478 big, the good-string heuristic should score less.
1479 */
heuristicallyChooseHeuristic()1480 void QRegExpEngine::heuristicallyChooseHeuristic()
1481 {
1482 if ( minl == 0 ) {
1483 useGoodStringHeuristic = FALSE;
1484 } else if ( trivial ) {
1485 useGoodStringHeuristic = TRUE;
1486 } else {
1487 /*
1488 Magic formula: The good string has to constitute a good
1489 proportion of the minimum-length string, and appear at a
1490 more-or-less known index.
1491 */
1492 int goodStringScore = ( 64 * goodStr.length() / minl ) -
1493 ( goodLateStart - goodEarlyStart );
1494 /*
1495 Less magic formula: We pick some characters at random, and
1496 check whether they are good or bad.
1497 */
1498 int badCharScore = 0;
1499 int step = QMAX( 1, NumBadChars / 32 );
1500 for ( int i = 1; i < NumBadChars; i += step ) {
1501 if ( occ1[i] == NoOccurrence )
1502 badCharScore += minl;
1503 else
1504 badCharScore += occ1[i];
1505 }
1506 badCharScore /= minl;
1507 useGoodStringHeuristic = ( goodStringScore > badCharScore );
1508 }
1509 }
1510 #endif
1511
1512 #if defined(QT_DEBUG)
dump() const1513 void QRegExpEngine::dump() const
1514 {
1515 int i, j;
1516 qDebug( "Case %ssensitive engine", cs ? "" : "in" );
1517 qDebug( " States" );
1518 for ( i = 0; i < ns; i++ ) {
1519 qDebug( " %d%s", i,
1520 i == InitialState ? " (initial)" :
1521 i == FinalState ? " (final)" : "" );
1522 #ifndef QT_NO_REGEXP_CAPTURE
1523 qDebug( " in atom %d", s[i]->atom );
1524 #endif
1525 int m = s[i]->match;
1526 if ( (m & CharClassBit) != 0 ) {
1527 qDebug( " match character class %d", m ^ CharClassBit );
1528 #ifndef QT_NO_REGEXP_CCLASS
1529 cl[m ^ CharClassBit]->dump();
1530 #else
1531 qDebug( " negative character class" );
1532 #endif
1533 } else if ( (m & BackRefBit) != 0 ) {
1534 qDebug( " match back-reference %d", m ^ BackRefBit );
1535 } else if ( m >= 0x20 && m <= 0x7e ) {
1536 qDebug( " match 0x%.4x (%c)", m, m );
1537 } else {
1538 qDebug( " match 0x%.4x", m );
1539 }
1540 for ( j = 0; j < (int) s[i]->outs.size(); j++ ) {
1541 int next = s[i]->outs[j];
1542 qDebug( " -> %d", next );
1543 if ( s[i]->reenter != 0 && s[i]->reenter->contains(next) )
1544 qDebug( " [reenter %d]", (*s[i]->reenter)[next] );
1545 if ( s[i]->anchors != 0 && at(*s[i]->anchors, next) != 0 )
1546 qDebug( " [anchors 0x%.8x]", (*s[i]->anchors)[next] );
1547 }
1548 }
1549 #ifndef QT_NO_REGEXP_CAPTURE
1550 if ( nf > 0 ) {
1551 qDebug( " Atom Parent Capture" );
1552 for ( i = 0; i < nf; i++ )
1553 qDebug( " %6d %6d %6d", i, f[i].parent, f[i].capture );
1554 }
1555 #endif
1556 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1557 for ( i = 0; i < (int) aa.size(); i++ )
1558 qDebug( " Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a,
1559 aa[i].b );
1560 #endif
1561 }
1562 #endif
1563
setup(bool caseSensitive)1564 void QRegExpEngine::setup( bool caseSensitive )
1565 {
1566 s.setAutoDelete( TRUE );
1567 s.resize( 32 );
1568 ns = 0;
1569 #ifndef QT_NO_REGEXP_CAPTURE
1570 f.resize( 32 );
1571 nf = 0;
1572 cf = -1;
1573 #endif
1574 officialncap = 0;
1575 ncap = 0;
1576 #ifndef QT_NO_REGEXP_CCLASS
1577 cl.setAutoDelete( TRUE );
1578 #endif
1579 #ifndef QT_NO_REGEXP_LOOKAHEAD
1580 ahead.setAutoDelete( TRUE );
1581 #endif
1582 #ifndef QT_NO_REGEXP_OPTIM
1583 caretAnchored = TRUE;
1584 trivial = TRUE;
1585 #endif
1586 valid = FALSE;
1587 cs = caseSensitive;
1588 #ifndef QT_NO_REGEXP_BACKREF
1589 nbrefs = 0;
1590 #endif
1591 #ifndef QT_NO_REGEXP_OPTIM
1592 useGoodStringHeuristic = TRUE;
1593 minl = 0;
1594 occ1.fill( 0, NumBadChars );
1595 #endif
1596 }
1597
setupState(int match)1598 int QRegExpEngine::setupState( int match )
1599 {
1600 if ( (ns & (ns + 1)) == 0 && ns + 1 >= (int) s.size() )
1601 s.resize( (ns + 1) << 1 );
1602 #ifndef QT_NO_REGEXP_CAPTURE
1603 s.insert( ns, new State(cf, match) );
1604 #else
1605 s.insert( ns, new State(match) );
1606 #endif
1607 return ns++;
1608 }
1609
1610 #ifndef QT_NO_REGEXP_CAPTURE
1611 /*
1612 Functions startAtom() and finishAtom() should be called to delimit
1613 atoms. When a state is created, it is assigned to the current atom.
1614 The information is later used for capturing.
1615 */
startAtom(bool capture)1616 int QRegExpEngine::startAtom( bool capture )
1617 {
1618 if ( (nf & (nf + 1)) == 0 && nf + 1 >= (int) f.size() )
1619 f.resize( (nf + 1) << 1 );
1620 f[nf].parent = cf;
1621 cf = nf++;
1622 f[cf].capture = capture ? ncap++ : -1;
1623 return cf;
1624 }
1625 #endif
1626
1627 #ifndef QT_NO_REGEXP_LOOKAHEAD
1628 /*
1629 Creates a lookahead anchor.
1630 */
addLookahead(QRegExpEngine * eng,bool negative)1631 int QRegExpEngine::addLookahead( QRegExpEngine *eng, bool negative )
1632 {
1633 int n = ahead.size();
1634 if ( n == MaxLookaheads ) {
1635 error( RXERR_LIMIT );
1636 return 0;
1637 }
1638 ahead.resize( n + 1 );
1639 ahead.insert( n, new Lookahead(eng, negative) );
1640 return Anchor_FirstLookahead << n;
1641 }
1642 #endif
1643
1644 #ifndef QT_NO_REGEXP_CAPTURE
1645 /*
1646 We want the longest leftmost captures.
1647 */
isBetterCapture(const int * begin1,const int * end1,const int * begin2,const int * end2)1648 bool QRegExpEngine::isBetterCapture( const int *begin1, const int *end1,
1649 const int *begin2, const int *end2 )
1650 {
1651 for ( int i = 0; i < ncap; i++ ) {
1652 int delta = begin2[i] - begin1[i]; // it has to start early...
1653 if ( delta == 0 )
1654 delta = end1[i] - end2[i]; // ...and end late (like a party)
1655
1656 if ( delta != 0 )
1657 return delta > 0;
1658 }
1659 return FALSE;
1660 }
1661 #endif
1662
1663 /*
1664 Returns TRUE if anchor a matches at position mmPos + i in the input
1665 string, otherwise FALSE.
1666 */
testAnchor(int i,int a,const int * capBegin)1667 bool QRegExpEngine::testAnchor( int i, int a, const int *capBegin )
1668 {
1669 int j;
1670
1671 #ifndef QT_NO_REGEXP_ANCHOR_ALT
1672 if ( (a & Anchor_Alternation) != 0 ) {
1673 return testAnchor( i, aa[a ^ Anchor_Alternation].a, capBegin ) ||
1674 testAnchor( i, aa[a ^ Anchor_Alternation].b, capBegin );
1675 }
1676 #endif
1677
1678 if ( (a & Anchor_Caret) != 0 ) {
1679 if ( mmPos + i != mmCaretPos )
1680 return FALSE;
1681 }
1682 if ( (a & Anchor_Dollar) != 0 ) {
1683 if ( mmPos + i != mmLen )
1684 return FALSE;
1685 }
1686 #ifndef QT_NO_REGEXP_ESCAPE
1687 if ( (a & (Anchor_Word | Anchor_NonWord)) != 0 ) {
1688 bool before = FALSE;
1689 bool after = FALSE;
1690 if ( mmPos + i != 0 )
1691 before = isWord( mmIn[mmPos + i - 1] );
1692 if ( mmPos + i != mmLen )
1693 after = isWord( mmIn[mmPos + i] );
1694 if ( (a & Anchor_Word) != 0 && (before == after) )
1695 return FALSE;
1696 if ( (a & Anchor_NonWord) != 0 && (before != after) )
1697 return FALSE;
1698 }
1699 #endif
1700 #ifndef QT_NO_REGEXP_LOOKAHEAD
1701 if ( (a & Anchor_LookaheadMask) != 0 ) {
1702 QConstString cstr = QConstString( (QChar *) mmIn + mmPos + i,
1703 mmLen - mmPos - i );
1704 for ( j = 0; j < (int) ahead.size(); j++ ) {
1705 if ( (a & (Anchor_FirstLookahead << j)) != 0 ) {
1706 QMemArray<int> captured;
1707 ahead[j]->eng->match( cstr.string(), 0, TRUE, TRUE,
1708 mmCaretPos - mmPos - i, captured );
1709 if ( (captured[0] == 0) == ahead[j]->neg )
1710 return FALSE;
1711 }
1712 }
1713 }
1714 #endif
1715 #ifndef QT_NO_REGEXP_CAPTURE
1716 #ifndef QT_NO_REGEXP_BACKREF
1717 for ( j = 0; j < nbrefs; j++ ) {
1718 if ( (a & (Anchor_BackRef1Empty << j)) != 0 ) {
1719 if ( capBegin[j] != EmptyCapture )
1720 return FALSE;
1721 }
1722 }
1723 #endif
1724 #endif
1725 return TRUE;
1726 }
1727
1728 #ifndef QT_NO_REGEXP_OPTIM
1729 /*
1730 The three following functions are what Jeffrey Friedl would call
1731 transmissions (or bump-alongs). Using one or the other should make
1732 no difference except in performance.
1733 */
1734
goodStringMatch()1735 bool QRegExpEngine::goodStringMatch()
1736 {
1737 int k = mmPos + goodEarlyStart;
1738 while ( (k = mmStr->find(goodStr, k, cs)) != -1 ) {
1739 int from = k - goodLateStart;
1740 int to = k - goodEarlyStart;
1741 if ( from > mmPos )
1742 mmPos = from;
1743
1744 while ( mmPos <= to ) {
1745 if ( matchHere() )
1746 return TRUE;
1747 mmPos++;
1748 }
1749 k++;
1750 }
1751 return FALSE;
1752 }
1753
badCharMatch()1754 bool QRegExpEngine::badCharMatch()
1755 {
1756 int slideHead = 0;
1757 int slideNext = 0;
1758 int i;
1759 int lastPos = mmLen - minl;
1760 memset( mmSlideTab, 0, mmSlideTabSize * sizeof(int) );
1761
1762 /*
1763 Set up the slide table, used for the bad-character heuristic,
1764 using the table of first occurrence of each character.
1765 */
1766 for ( i = 0; i < minl; i++ ) {
1767 int sk = occ1[BadChar(mmIn[mmPos + i])];
1768 if ( sk == NoOccurrence )
1769 sk = i + 1;
1770 if ( sk > 0 ) {
1771 int k = i + 1 - sk;
1772 if ( k < 0 ) {
1773 sk = i + 1;
1774 k = 0;
1775 }
1776 if ( sk > mmSlideTab[k] )
1777 mmSlideTab[k] = sk;
1778 }
1779 }
1780
1781 if ( mmPos > lastPos )
1782 return FALSE;
1783
1784 for ( ;; ) {
1785 if ( ++slideNext >= mmSlideTabSize )
1786 slideNext = 0;
1787 if ( mmSlideTab[slideHead] > 0 ) {
1788 if ( mmSlideTab[slideHead] - 1 > mmSlideTab[slideNext] )
1789 mmSlideTab[slideNext] = mmSlideTab[slideHead] - 1;
1790 mmSlideTab[slideHead] = 0;
1791 } else {
1792 if ( matchHere() )
1793 return TRUE;
1794 }
1795
1796 if ( mmPos == lastPos )
1797 break;
1798
1799 /*
1800 Update the slide table. This code has much in common with
1801 the initialization code.
1802 */
1803 int sk = occ1[BadChar(mmIn[mmPos + minl])];
1804 if ( sk == NoOccurrence ) {
1805 mmSlideTab[slideNext] = minl;
1806 } else if ( sk > 0 ) {
1807 int k = slideNext + minl - sk;
1808 if ( k >= mmSlideTabSize )
1809 k -= mmSlideTabSize;
1810 if ( sk > mmSlideTab[k] )
1811 mmSlideTab[k] = sk;
1812 }
1813 slideHead = slideNext;
1814 mmPos++;
1815 }
1816 return FALSE;
1817 }
1818 #else
bruteMatch()1819 bool QRegExpEngine::bruteMatch()
1820 {
1821 while ( mmPos <= mmLen ) {
1822 if ( matchHere() )
1823 return TRUE;
1824 mmPos++;
1825 }
1826 return FALSE;
1827 }
1828 #endif
1829
1830 /*
1831 Here's the core of the engine. It tries to do a match here and now.
1832 */
matchHere()1833 bool QRegExpEngine::matchHere()
1834 {
1835 int ncur = 1, nnext = 0;
1836 int i = 0, j, k, m;
1837 bool stop = FALSE;
1838
1839 mmMatchLen = -1;
1840 mmOneTestMatchedLen = -1;
1841 mmCurStack[0] = InitialState;
1842
1843 #ifndef QT_NO_REGEXP_CAPTURE
1844 if ( ncap > 0 ) {
1845 for ( j = 0; j < ncap; j++ ) {
1846 mmCurCapBegin[j] = EmptyCapture;
1847 mmCurCapEnd[j] = EmptyCapture;
1848 }
1849 }
1850 #endif
1851
1852 #ifndef QT_NO_REGEXP_BACKREF
1853 int *zzZ = 0;
1854
1855 while ( (ncur > 0 || !mmSleeping.isEmpty()) && i <= mmLen - mmPos &&
1856 !stop )
1857 #else
1858 while ( ncur > 0 && i <= mmLen - mmPos && !stop )
1859 #endif
1860 {
1861 int ch = ( i < mmLen - mmPos ) ? mmIn[mmPos + i].unicode() : 0;
1862 for ( j = 0; j < ncur; j++ ) {
1863 int cur = mmCurStack[j];
1864 State *scur = s[cur];
1865 QMemArray<int>& outs = scur->outs;
1866 for ( k = 0; k < (int) outs.size(); k++ ) {
1867 int next = outs[k];
1868 State *snext = s[next];
1869 bool in = TRUE;
1870 #ifndef QT_NO_REGEXP_BACKREF
1871 int needSomeSleep = 0;
1872 #endif
1873
1874 /*
1875 First, check if the anchors are anchored properly.
1876 */
1877 if ( scur->anchors != 0 ) {
1878 int a = at( *scur->anchors, next );
1879 if ( a != 0 && !testAnchor(i, a, mmCurCapBegin + j * ncap) )
1880 in = FALSE;
1881 }
1882 /*
1883 If indeed they are, check if the input character is
1884 correct for this transition.
1885 */
1886 if ( in ) {
1887 m = snext->match;
1888 if ( (m & (CharClassBit | BackRefBit)) == 0 ) {
1889 if ( cs )
1890 in = ( m == ch );
1891 else
1892 in = ( QChar(m).lower() == QChar(ch).lower() );
1893 } else if ( next == FinalState ) {
1894 mmMatchLen = i;
1895 stop = mmMinimal;
1896 in = TRUE;
1897 } else if ( (m & CharClassBit) != 0 ) {
1898 #ifndef QT_NO_REGEXP_CCLASS
1899 const CharClass *cc = cl[m ^ CharClassBit];
1900 if ( cs )
1901 in = cc->in( ch );
1902 else if ( cc->negative() )
1903 in = cc->in( QChar(ch).lower() ) &&
1904 cc->in( QChar(ch).upper() );
1905 else
1906 in = cc->in( QChar(ch).lower() ) ||
1907 cc->in( QChar(ch).upper() );
1908 #endif
1909 #ifndef QT_NO_REGEXP_BACKREF
1910 } else { /* ( (m & BackRefBit) != 0 ) */
1911 int bref = m ^ BackRefBit;
1912 int ell = j * ncap + ( bref - 1 );
1913
1914 in = bref <= ncap && mmCurCapBegin[ell] != EmptyCapture;
1915 if ( in ) {
1916 if ( cs )
1917 in = ( mmIn[mmPos + mmCurCapBegin[ell]]
1918 == QChar(ch) );
1919 else
1920 in = ( mmIn[mmPos + mmCurCapBegin[ell]].lower()
1921 == QChar(ch).lower() );
1922 }
1923
1924 if ( in ) {
1925 int delta;
1926 if ( mmCurCapEnd[ell] == EmptyCapture )
1927 delta = i - mmCurCapBegin[ell];
1928 else
1929 delta = mmCurCapEnd[ell] - mmCurCapBegin[ell];
1930
1931 in = ( delta <= mmLen - (mmPos + i) );
1932 if ( in && delta > 1 ) {
1933 int n = 1;
1934 if ( cs ) {
1935 while ( n < delta ) {
1936 if ( mmIn[mmPos +
1937 mmCurCapBegin[ell] + n] !=
1938 mmIn[mmPos + i + n] )
1939 break;
1940 n++;
1941 }
1942 } else {
1943 while ( n < delta ) {
1944 QChar a = mmIn[mmPos +
1945 mmCurCapBegin[ell] + n];
1946 QChar b = mmIn[mmPos + i + n];
1947 if ( a.lower() != b.lower() )
1948 break;
1949 n++;
1950 }
1951 }
1952 in = ( n == delta );
1953 if ( in )
1954 needSomeSleep = delta - 1;
1955 }
1956 }
1957 #endif
1958 }
1959 }
1960
1961 /*
1962 We must now update our data structures.
1963 */
1964 if ( in ) {
1965 #ifndef QT_NO_REGEXP_CAPTURE
1966 int *capBegin, *capEnd;
1967 #endif
1968 /*
1969 If the next state was not encountered yet, all
1970 is fine.
1971 */
1972 if ( (m = mmInNextStack[next]) == -1 ) {
1973 m = nnext++;
1974 mmNextStack[m] = next;
1975 mmInNextStack[next] = m;
1976 #ifndef QT_NO_REGEXP_CAPTURE
1977 capBegin = mmNextCapBegin + m * ncap;
1978 capEnd = mmNextCapEnd + m * ncap;
1979
1980 /*
1981 Otherwise, we'll first maintain captures in
1982 temporary arrays, and decide at the end whether
1983 it's best to keep the previous capture zones or
1984 the new ones.
1985 */
1986 } else {
1987 capBegin = mmTempCapBegin;
1988 capEnd = mmTempCapEnd;
1989 #endif
1990 }
1991
1992 #ifndef QT_NO_REGEXP_CAPTURE
1993 /*
1994 Updating the capture zones is much of a task.
1995 */
1996 if ( ncap > 0 ) {
1997 memcpy( capBegin, mmCurCapBegin + j * ncap,
1998 ncap * sizeof(int) );
1999 memcpy( capEnd, mmCurCapEnd + j * ncap,
2000 ncap * sizeof(int) );
2001 int c = scur->atom, n = snext->atom;
2002 int p = -1, q = -1;
2003 int cap;
2004
2005 /*
2006 Lemma 1. For any x in the range [0..nf), we
2007 have f[x].parent < x.
2008
2009 Proof. By looking at startAtom(), it is
2010 clear that cf < nf holds all the time, and
2011 thus that f[nf].parent < nf.
2012 */
2013
2014 /*
2015 If we are reentering an atom, we empty all
2016 capture zones inside it.
2017 */
2018 if ( scur->reenter != 0 &&
2019 (q = at(*scur->reenter, next)) != 0 ) {
2020 QBitArray b;
2021 b.fill( FALSE, nf );
2022 b.setBit( q, TRUE );
2023 for ( int ell = q + 1; ell < nf; ell++ ) {
2024 if ( b.testBit(f[ell].parent) ) {
2025 b.setBit( ell, TRUE );
2026 cap = f[ell].capture;
2027 if ( cap >= 0 ) {
2028 capBegin[cap] = EmptyCapture;
2029 capEnd[cap] = EmptyCapture;
2030 }
2031 }
2032 }
2033 p = f[q].parent;
2034
2035 /*
2036 Otherwise, close the capture zones we are
2037 leaving. We are leaving f[c].capture,
2038 f[f[c].parent].capture,
2039 f[f[f[c].parent].parent].capture, ...,
2040 until f[x].capture, with x such that
2041 f[x].parent is the youngest common ancestor
2042 for c and n.
2043
2044 We go up along c's and n's ancestry until
2045 we find x.
2046 */
2047 } else {
2048 p = c;
2049 q = n;
2050 while ( p != q ) {
2051 if ( p > q ) {
2052 cap = f[p].capture;
2053 if ( cap >= 0 ) {
2054 if ( capBegin[cap] == i ) {
2055 capBegin[cap] = EmptyCapture;
2056 capEnd[cap] = EmptyCapture;
2057 } else {
2058 capEnd[cap] = i;
2059 }
2060 }
2061 p = f[p].parent;
2062 } else {
2063 q = f[q].parent;
2064 }
2065 }
2066 }
2067
2068 /*
2069 In any case, we now open the capture zones
2070 we are entering. We work upwards from n
2071 until we reach p (the parent of the atom we
2072 reenter or the youngest common ancestor).
2073 */
2074 while ( n > p ) {
2075 cap = f[n].capture;
2076 if ( cap >= 0 ) {
2077 capBegin[cap] = i;
2078 capEnd[cap] = EmptyCapture;
2079 }
2080 n = f[n].parent;
2081 }
2082 /*
2083 If the next state was already in
2084 mmNextStack, we must choose carefully which
2085 capture zones we want to keep.
2086 */
2087 if ( capBegin == mmTempCapBegin &&
2088 isBetterCapture(capBegin, capEnd,
2089 mmNextCapBegin + m * ncap,
2090 mmNextCapEnd + m * ncap) ) {
2091 memcpy( mmNextCapBegin + m * ncap, capBegin,
2092 ncap * sizeof(int) );
2093 memcpy( mmNextCapEnd + m * ncap, capEnd,
2094 ncap * sizeof(int) );
2095 }
2096 }
2097 #ifndef QT_NO_REGEXP_BACKREF
2098 /*
2099 We are done with updating the capture zones.
2100 It's now time to put the next state to sleep,
2101 if it needs to, and to remove it from
2102 mmNextStack.
2103 */
2104 if ( needSomeSleep > 0 ) {
2105 zzZ = new int[1 + 2 * ncap];
2106 zzZ[0] = next;
2107 if ( ncap > 0 ) {
2108 memcpy( zzZ + 1, capBegin, ncap * sizeof(int) );
2109 memcpy( zzZ + 1 + ncap, capEnd,
2110 ncap * sizeof(int) );
2111 }
2112 mmInNextStack[mmNextStack[--nnext]] = -1;
2113 mmSleeping.insert( i + needSomeSleep, zzZ );
2114 }
2115 #endif
2116 #endif
2117 }
2118 }
2119 }
2120 #ifndef QT_NO_REGEXP_CAPTURE
2121 /*
2122 If we reached the final state, hurray! Copy the captured
2123 zone.
2124 */
2125 if ( ncap > 0 && (m = mmInNextStack[FinalState]) != -1 ) {
2126 memcpy( mmCapBegin, mmNextCapBegin + m * ncap, ncap * sizeof(int) );
2127 memcpy( mmCapEnd, mmNextCapEnd + m * ncap, ncap * sizeof(int) );
2128 }
2129 #ifndef QT_NO_REGEXP_BACKREF
2130 /*
2131 It's time to wake up the sleepers.
2132 */
2133 if ( !mmSleeping.isEmpty() ) {
2134 while ( (zzZ = mmSleeping.take(i)) != 0 ) {
2135 int next = zzZ[0];
2136 int *capBegin = zzZ + 1;
2137 int *capEnd = zzZ + 1 + ncap;
2138 bool copyOver = TRUE;
2139
2140 if ( (m = mmInNextStack[zzZ[0]]) == -1 ) {
2141 m = nnext++;
2142 mmNextStack[m] = next;
2143 mmInNextStack[next] = m;
2144 } else {
2145 copyOver = isBetterCapture( mmNextCapBegin + m * ncap,
2146 mmNextCapEnd + m * ncap,
2147 capBegin, capEnd );
2148 }
2149 if ( copyOver ) {
2150 memcpy( mmNextCapBegin + m * ncap, capBegin,
2151 ncap * sizeof(int) );
2152 memcpy( mmNextCapEnd + m * ncap, capEnd,
2153 ncap * sizeof(int) );
2154 }
2155 delete[] zzZ;
2156 }
2157 }
2158 #endif
2159 #endif
2160 for ( j = 0; j < nnext; j++ )
2161 mmInNextStack[mmNextStack[j]] = -1;
2162
2163 // avoid needless iteration that confuses mmOneTestMatchedLen
2164 if ( nnext == 1 && mmNextStack[0] == FinalState
2165 #ifndef QT_NO_REGEXP_BACKREF
2166 && mmSleeping.isEmpty()
2167 #endif
2168 )
2169 stop = TRUE;
2170
2171 qSwap( mmCurStack, mmNextStack );
2172 #ifndef QT_NO_REGEXP_CAPTURE
2173 qSwap( mmCurCapBegin, mmNextCapBegin );
2174 qSwap( mmCurCapEnd, mmNextCapEnd );
2175 #endif
2176 ncur = nnext;
2177 nnext = 0;
2178 i++;
2179 }
2180
2181 #ifndef QT_NO_REGEXP_BACKREF
2182 /*
2183 If minimal matching is enabled, we might have some sleepers
2184 left.
2185 */
2186 while ( !mmSleeping.isEmpty() ) {
2187 zzZ = mmSleeping.take( *QIntDictIterator<int>(mmSleeping) );
2188 delete[] zzZ;
2189 }
2190 #endif
2191
2192 mmOneTestMatchedLen = i - 1;
2193 return ( mmMatchLen >= 0 );
2194 }
2195
2196 #ifndef QT_NO_REGEXP_CCLASS
2197
CharClass()2198 QRegExpEngine::CharClass::CharClass()
2199 : c( 0 ), n( FALSE )
2200 {
2201 #ifndef QT_NO_REGEXP_OPTIM
2202 occ1.fill( NoOccurrence, NumBadChars );
2203 #endif
2204 }
2205
operator =(const CharClass & cc)2206 QRegExpEngine::CharClass& QRegExpEngine::CharClass::operator=(
2207 const CharClass& cc )
2208 {
2209 c = cc.c;
2210 r = cc.r.copy();
2211 n = cc.n;
2212 #ifndef QT_NO_REGEXP_OPTIM
2213 occ1 = cc.occ1;
2214 #endif
2215 return *this;
2216 }
2217
clear()2218 void QRegExpEngine::CharClass::clear()
2219 {
2220 c = 0;
2221 r.resize( 0 );
2222 n = FALSE;
2223 }
2224
setNegative(bool negative)2225 void QRegExpEngine::CharClass::setNegative( bool negative )
2226 {
2227 n = negative;
2228 #ifndef QT_NO_REGEXP_OPTIM
2229 occ1.fill( 0, NumBadChars );
2230 #endif
2231 }
2232
addCategories(int cats)2233 void QRegExpEngine::CharClass::addCategories( int cats )
2234 {
2235 c |= cats;
2236 #ifndef QT_NO_REGEXP_OPTIM
2237 occ1.fill( 0, NumBadChars );
2238 #endif
2239 }
2240
addRange(ushort from,ushort to)2241 void QRegExpEngine::CharClass::addRange( ushort from, ushort to )
2242 {
2243 if ( from > to )
2244 qSwap( from, to );
2245 int m = r.size();
2246 r.resize( m + 1 );
2247 r[m].from = from;
2248 r[m].to = to;
2249
2250 #ifndef QT_NO_REGEXP_OPTIM
2251 int i;
2252
2253 if ( to - from < NumBadChars ) {
2254 occ1.detach();
2255 if ( from % NumBadChars <= to % NumBadChars ) {
2256 for ( i = from % NumBadChars; i <= to % NumBadChars; i++ )
2257 occ1[i] = 0;
2258 } else {
2259 for ( i = 0; i <= to % NumBadChars; i++ )
2260 occ1[i] = 0;
2261 for ( i = from % NumBadChars; i < NumBadChars; i++ )
2262 occ1[i] = 0;
2263 }
2264 } else {
2265 occ1.fill( 0, NumBadChars );
2266 }
2267 #endif
2268 }
2269
in(QChar ch) const2270 bool QRegExpEngine::CharClass::in( QChar ch ) const
2271 {
2272 #ifndef QT_NO_REGEXP_OPTIM
2273 if ( occ1[BadChar(ch)] == NoOccurrence )
2274 return n;
2275 #endif
2276
2277 if ( c != 0 && (c & (1 << (int) ch.category())) != 0 )
2278 return !n;
2279 for ( int i = 0; i < (int) r.size(); i++ ) {
2280 if ( ch.unicode() >= r[i].from && ch.unicode() <= r[i].to )
2281 return !n;
2282 }
2283 return n;
2284 }
2285
2286 #if defined(QT_DEBUG)
dump() const2287 void QRegExpEngine::CharClass::dump() const
2288 {
2289 int i;
2290 qDebug( " %stive character class", n ? "nega" : "posi" );
2291 #ifndef QT_NO_REGEXP_CCLASS
2292 if ( c != 0 )
2293 qDebug( " categories 0x%.8x", c );
2294 #endif
2295 for ( i = 0; i < (int) r.size(); i++ )
2296 qDebug( " 0x%.4x through 0x%.4x", r[i].from, r[i].to );
2297 }
2298 #endif
2299 #endif
2300
Box(QRegExpEngine * engine)2301 QRegExpEngine::Box::Box( QRegExpEngine *engine )
2302 : eng( engine ), skipanchors( 0 )
2303 #ifndef QT_NO_REGEXP_OPTIM
2304 , earlyStart( 0 ), lateStart( 0 ), maxl( 0 )
2305 #endif
2306 {
2307 #ifndef QT_NO_REGEXP_OPTIM
2308 occ1.fill( NoOccurrence, NumBadChars );
2309 #endif
2310 minl = 0;
2311 }
2312
operator =(const Box & b)2313 QRegExpEngine::Box& QRegExpEngine::Box::operator=( const Box& b )
2314 {
2315 eng = b.eng;
2316 ls = b.ls;
2317 rs = b.rs;
2318 lanchors = b.lanchors;
2319 ranchors = b.ranchors;
2320 skipanchors = b.skipanchors;
2321 #ifndef QT_NO_REGEXP_OPTIM
2322 earlyStart = b.earlyStart;
2323 lateStart = b.lateStart;
2324 str = b.str;
2325 leftStr = b.leftStr;
2326 rightStr = b.rightStr;
2327 maxl = b.maxl;
2328 occ1 = b.occ1;
2329 #endif
2330 minl = b.minl;
2331 return *this;
2332 }
2333
set(QChar ch)2334 void QRegExpEngine::Box::set( QChar ch )
2335 {
2336 ls.resize( 1 );
2337 ls[0] = eng->createState( ch );
2338 rs = ls;
2339 rs.detach();
2340 #ifndef QT_NO_REGEXP_OPTIM
2341 str = ch;
2342 leftStr = ch;
2343 rightStr = ch;
2344 maxl = 1;
2345 occ1.detach();
2346 occ1[BadChar(ch)] = 0;
2347 #endif
2348 minl = 1;
2349 }
2350
set(const CharClass & cc)2351 void QRegExpEngine::Box::set( const CharClass& cc )
2352 {
2353 ls.resize( 1 );
2354 ls[0] = eng->createState( cc );
2355 rs = ls;
2356 rs.detach();
2357 #ifndef QT_NO_REGEXP_OPTIM
2358 maxl = 1;
2359 occ1 = cc.firstOccurrence();
2360 #endif
2361 minl = 1;
2362 }
2363
2364 #ifndef QT_NO_REGEXP_BACKREF
set(int bref)2365 void QRegExpEngine::Box::set( int bref )
2366 {
2367 ls.resize( 1 );
2368 ls[0] = eng->createState( bref );
2369 rs = ls;
2370 rs.detach();
2371 if ( bref >= 1 && bref <= MaxBackRefs )
2372 skipanchors = Anchor_BackRef0Empty << bref;
2373 #ifndef QT_NO_REGEXP_OPTIM
2374 maxl = InftyLen;
2375 #endif
2376 minl = 0;
2377 }
2378 #endif
2379
cat(const Box & b)2380 void QRegExpEngine::Box::cat( const Box& b )
2381 {
2382 eng->addCatTransitions( rs, b.ls );
2383 addAnchorsToEngine( b );
2384 if ( minl == 0 ) {
2385 mergeInto( &lanchors, b.lanchors );
2386 if ( skipanchors != 0 ) {
2387 for ( int i = 0; i < (int) b.ls.size(); i++ ) {
2388 int a = eng->anchorConcatenation( at(lanchors, b.ls[i]),
2389 skipanchors );
2390 lanchors.insert( b.ls[i], a );
2391 }
2392 }
2393 mergeInto( &ls, b.ls );
2394 }
2395 if ( b.minl == 0 ) {
2396 mergeInto( &ranchors, b.ranchors );
2397 if ( b.skipanchors != 0 ) {
2398 for ( int i = 0; i < (int) rs.size(); i++ ) {
2399 int a = eng->anchorConcatenation( at(ranchors, rs[i]),
2400 b.skipanchors );
2401 ranchors.insert( rs[i], a );
2402 }
2403 }
2404 mergeInto( &rs, b.rs );
2405 } else {
2406 ranchors = b.ranchors;
2407 rs = b.rs;
2408 }
2409
2410 #ifndef QT_NO_REGEXP_OPTIM
2411 if ( maxl != InftyLen ) {
2412 if ( rightStr.length() + b.leftStr.length() >
2413 QMAX(str.length(), b.str.length()) ) {
2414 earlyStart = minl - rightStr.length();
2415 lateStart = maxl - rightStr.length();
2416 str = rightStr + b.leftStr;
2417 } else if ( b.str.length() > str.length() ) {
2418 earlyStart = minl + b.earlyStart;
2419 lateStart = maxl + b.lateStart;
2420 str = b.str;
2421 }
2422 }
2423
2424 if ( (int) leftStr.length() == maxl )
2425 leftStr += b.leftStr;
2426
2427 if ( (int) b.rightStr.length() == b.maxl ) {
2428 rightStr += b.rightStr;
2429 } else {
2430 rightStr = b.rightStr;
2431 }
2432
2433 if ( maxl == InftyLen || b.maxl == InftyLen ) {
2434 maxl = InftyLen;
2435 } else {
2436 maxl += b.maxl;
2437 }
2438
2439 occ1.detach();
2440 for ( int i = 0; i < NumBadChars; i++ ) {
2441 if ( b.occ1[i] != NoOccurrence && minl + b.occ1[i] < occ1[i] )
2442 occ1[i] = minl + b.occ1[i];
2443 }
2444 #endif
2445
2446 minl += b.minl;
2447 if ( minl == 0 )
2448 skipanchors = eng->anchorConcatenation( skipanchors, b.skipanchors );
2449 else
2450 skipanchors = 0;
2451 }
2452
orx(const Box & b)2453 void QRegExpEngine::Box::orx( const Box& b )
2454 {
2455 mergeInto( &ls, b.ls );
2456 mergeInto( &lanchors, b.lanchors );
2457 mergeInto( &rs, b.rs );
2458 mergeInto( &ranchors, b.ranchors );
2459
2460 if ( b.minl == 0 ) {
2461 if ( minl == 0 )
2462 skipanchors = eng->anchorAlternation( skipanchors, b.skipanchors );
2463 else
2464 skipanchors = b.skipanchors;
2465 }
2466
2467 #ifndef QT_NO_REGEXP_OPTIM
2468 occ1.detach();
2469 for ( int i = 0; i < NumBadChars; i++ ) {
2470 if ( occ1[i] > b.occ1[i] )
2471 occ1[i] = b.occ1[i];
2472 }
2473 earlyStart = 0;
2474 lateStart = 0;
2475 str = QString();
2476 leftStr = QString();
2477 rightStr = QString();
2478 if ( b.maxl > maxl )
2479 maxl = b.maxl;
2480 #endif
2481 if ( b.minl < minl )
2482 minl = b.minl;
2483 }
2484
plus(int atom)2485 void QRegExpEngine::Box::plus( int atom )
2486 {
2487 #ifndef QT_NO_REGEXP_CAPTURE
2488 eng->addPlusTransitions( rs, ls, atom );
2489 #else
2490 Q_UNUSED( atom );
2491 eng->addCatTransitions( rs, ls );
2492 #endif
2493 addAnchorsToEngine( *this );
2494 #ifndef QT_NO_REGEXP_OPTIM
2495 maxl = InftyLen;
2496 #endif
2497 }
2498
opt()2499 void QRegExpEngine::Box::opt()
2500 {
2501 #ifndef QT_NO_REGEXP_OPTIM
2502 earlyStart = 0;
2503 lateStart = 0;
2504 str = QString();
2505 leftStr = QString();
2506 rightStr = QString();
2507 #endif
2508 skipanchors = 0;
2509 minl = 0;
2510 }
2511
catAnchor(int a)2512 void QRegExpEngine::Box::catAnchor( int a )
2513 {
2514 if ( a != 0 ) {
2515 for ( int i = 0; i < (int) rs.size(); i++ ) {
2516 a = eng->anchorConcatenation( at(ranchors, rs[i]), a );
2517 ranchors.insert( rs[i], a );
2518 }
2519 if ( minl == 0 )
2520 skipanchors = eng->anchorConcatenation( skipanchors, a );
2521 }
2522 }
2523
2524 #ifndef QT_NO_REGEXP_OPTIM
setupHeuristics()2525 void QRegExpEngine::Box::setupHeuristics()
2526 {
2527 eng->goodEarlyStart = earlyStart;
2528 eng->goodLateStart = lateStart;
2529 eng->goodStr = eng->cs ? str : str.lower();
2530
2531 eng->minl = minl;
2532 if ( eng->cs ) {
2533 /*
2534 A regular expression such as 112|1 has occ1['2'] = 2 and minl =
2535 1 at this point. An entry of occ1 has to be at most minl or
2536 infinity for the rest of the algorithm to go well.
2537
2538 We waited until here before normalizing these cases (instead of
2539 doing it in Box::orx()) because sometimes things improve by
2540 themselves. Consider for example (112|1)34.
2541 */
2542 for ( int i = 0; i < NumBadChars; i++ ) {
2543 if ( occ1[i] != NoOccurrence && occ1[i] >= minl )
2544 occ1[i] = minl;
2545 }
2546 eng->occ1 = occ1;
2547 } else {
2548 eng->occ1.fill( 0, NumBadChars );
2549 }
2550
2551 eng->heuristicallyChooseHeuristic();
2552 }
2553 #endif
2554
2555 #if defined(QT_DEBUG)
dump() const2556 void QRegExpEngine::Box::dump() const
2557 {
2558 int i;
2559 qDebug( "Box of at least %d character%s", minl, minl == 1 ? "" : "s" );
2560 qDebug( " Left states:" );
2561 for ( i = 0; i < (int) ls.size(); i++ ) {
2562 if ( at(lanchors, ls[i]) == 0 )
2563 qDebug( " %d", ls[i] );
2564 else
2565 qDebug( " %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]] );
2566 }
2567 qDebug( " Right states:" );
2568 for ( i = 0; i < (int) rs.size(); i++ ) {
2569 if ( at(ranchors, rs[i]) == 0 )
2570 qDebug( " %d", rs[i] );
2571 else
2572 qDebug( " %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]] );
2573 }
2574 qDebug( " Skip anchors: 0x%.8x", skipanchors );
2575 }
2576 #endif
2577
addAnchorsToEngine(const Box & to) const2578 void QRegExpEngine::Box::addAnchorsToEngine( const Box& to ) const
2579 {
2580 for ( int i = 0; i < (int) to.ls.size(); i++ ) {
2581 for ( int j = 0; j < (int) rs.size(); j++ ) {
2582 int a = eng->anchorConcatenation( at(ranchors, rs[j]),
2583 at(to.lanchors, to.ls[i]) );
2584 eng->addAnchors( rs[j], to.ls[i], a );
2585 }
2586 }
2587 }
2588
getChar()2589 int QRegExpEngine::getChar()
2590 {
2591 return ( yyPos == yyLen ) ? EOS : yyIn[yyPos++].unicode();
2592 }
2593
getEscape()2594 int QRegExpEngine::getEscape()
2595 {
2596 #ifndef QT_NO_REGEXP_ESCAPE
2597 const char tab[] = "afnrtv"; // no b, as \b means word boundary
2598 const char backTab[] = "\a\f\n\r\t\v";
2599 ushort low;
2600 int i;
2601 #endif
2602 ushort val;
2603 int prevCh = yyCh;
2604
2605 if ( prevCh == EOS ) {
2606 error( RXERR_END );
2607 return Tok_Char | '\\';
2608 }
2609 yyCh = getChar();
2610 #ifndef QT_NO_REGEXP_ESCAPE
2611 if ( (prevCh & ~0xff) == 0 ) {
2612 const char *p = strchr( tab, prevCh );
2613 if ( p != 0 )
2614 return Tok_Char | backTab[p - tab];
2615 }
2616 #endif
2617
2618 switch ( prevCh ) {
2619 #ifndef QT_NO_REGEXP_ESCAPE
2620 case '0':
2621 val = 0;
2622 for ( i = 0; i < 3; i++ ) {
2623 if ( yyCh >= '0' && yyCh <= '7' )
2624 val = ( val << 3 ) | ( yyCh - '0' );
2625 else
2626 break;
2627 yyCh = getChar();
2628 }
2629 if ( (val & ~0377) != 0 )
2630 error( RXERR_OCTAL );
2631 return Tok_Char | val;
2632 #endif
2633 #ifndef QT_NO_REGEXP_ESCAPE
2634 case 'B':
2635 return Tok_NonWord;
2636 #endif
2637 #ifndef QT_NO_REGEXP_CCLASS
2638 case 'D':
2639 // see QChar::isDigit()
2640 yyCharClass->addCategories( 0x7fffffef );
2641 return Tok_CharClass;
2642 case 'S':
2643 // see QChar::isSpace()
2644 yyCharClass->addCategories( 0x7ffff87f );
2645 yyCharClass->addRange( 0x0000, 0x0008 );
2646 yyCharClass->addRange( 0x000e, 0x001f );
2647 yyCharClass->addRange( 0x007f, 0x009f );
2648 return Tok_CharClass;
2649 case 'W':
2650 // see QChar::isLetterOrNumber()
2651 yyCharClass->addCategories( 0x7fe07f8f );
2652 yyCharClass->addRange( 0x203f, 0x2040 );
2653 yyCharClass->addSingleton( 0x2040 );
2654 yyCharClass->addSingleton( 0x30fb );
2655 yyCharClass->addRange( 0xfe33, 0xfe34 );
2656 yyCharClass->addRange( 0xfe4d, 0xfe4f );
2657 yyCharClass->addSingleton( 0xff3f );
2658 yyCharClass->addSingleton( 0xff65 );
2659 return Tok_CharClass;
2660 #endif
2661 #ifndef QT_NO_REGEXP_ESCAPE
2662 case 'b':
2663 return Tok_Word;
2664 #endif
2665 #ifndef QT_NO_REGEXP_CCLASS
2666 case 'd':
2667 // see QChar::isDigit()
2668 yyCharClass->addCategories( 0x00000010 );
2669 return Tok_CharClass;
2670 case 's':
2671 // see QChar::isSpace()
2672 yyCharClass->addCategories( 0x00000380 );
2673 yyCharClass->addRange( 0x0009, 0x000d );
2674 return Tok_CharClass;
2675 case 'w':
2676 // see QChar::isLetterOrNumber()
2677 yyCharClass->addCategories( 0x000f8070 );
2678 yyCharClass->addSingleton( 0x005f ); // '_'
2679 return Tok_CharClass;
2680 #endif
2681 #ifndef QT_NO_REGEXP_ESCAPE
2682 case 'x':
2683 val = 0;
2684 for ( i = 0; i < 4; i++ ) {
2685 low = QChar( yyCh ).lower();
2686 if ( low >= '0' && low <= '9' )
2687 val = ( val << 4 ) | ( low - '0' );
2688 else if ( low >= 'a' && low <= 'f' )
2689 val = ( val << 4 ) | ( low - 'a' + 10 );
2690 else
2691 break;
2692 yyCh = getChar();
2693 }
2694 return Tok_Char | val;
2695 #endif
2696 default:
2697 if ( prevCh >= '1' && prevCh <= '9' ) {
2698 #ifndef QT_NO_REGEXP_BACKREF
2699 val = prevCh - '0';
2700 while ( yyCh >= '0' && yyCh <= '9' ) {
2701 val = ( val * 10 ) + ( yyCh - '0' );
2702 yyCh = getChar();
2703 }
2704 return Tok_BackRef | val;
2705 #else
2706 error( RXERR_DISABLED );
2707 #endif
2708 }
2709 return Tok_Char | prevCh;
2710 }
2711 }
2712
2713 #ifndef QT_NO_REGEXP_INTERVAL
getRep(int def)2714 int QRegExpEngine::getRep( int def )
2715 {
2716 if ( yyCh >= '0' && yyCh <= '9' ) {
2717 int rep = 0;
2718 do {
2719 rep = 10 * rep + yyCh - '0';
2720 if ( rep >= InftyRep ) {
2721 error( RXERR_REPETITION );
2722 rep = def;
2723 }
2724 yyCh = getChar();
2725 } while ( yyCh >= '0' && yyCh <= '9' );
2726 return rep;
2727 } else {
2728 return def;
2729 }
2730 }
2731 #endif
2732
2733 #ifndef QT_NO_REGEXP_LOOKAHEAD
skipChars(int n)2734 void QRegExpEngine::skipChars( int n )
2735 {
2736 if ( n > 0 ) {
2737 yyPos += n - 1;
2738 yyCh = getChar();
2739 }
2740 }
2741 #endif
2742
error(const char * msg)2743 void QRegExpEngine::error( const char *msg )
2744 {
2745 if ( yyError.isEmpty() )
2746 yyError = QString::fromLatin1( msg );
2747 }
2748
startTokenizer(const QChar * rx,int len)2749 void QRegExpEngine::startTokenizer( const QChar *rx, int len )
2750 {
2751 yyIn = rx;
2752 yyPos0 = 0;
2753 yyPos = 0;
2754 yyLen = len;
2755 yyCh = getChar();
2756 yyCharClass = new CharClass;
2757 yyMinRep = 0;
2758 yyMaxRep = 0;
2759 yyError = QString();
2760 }
2761
getToken()2762 int QRegExpEngine::getToken()
2763 {
2764 #ifndef QT_NO_REGEXP_CCLASS
2765 ushort pendingCh = 0;
2766 bool charPending;
2767 bool rangePending;
2768 int tok;
2769 #endif
2770 int prevCh = yyCh;
2771
2772 yyPos0 = yyPos - 1;
2773 #ifndef QT_NO_REGEXP_CCLASS
2774 yyCharClass->clear();
2775 #endif
2776 yyMinRep = 0;
2777 yyMaxRep = 0;
2778 yyCh = getChar();
2779
2780 switch ( prevCh ) {
2781 case EOS:
2782 yyPos0 = yyPos;
2783 return Tok_Eos;
2784 case '$':
2785 return Tok_Dollar;
2786 case '(':
2787 if ( yyCh == '?' ) {
2788 prevCh = getChar();
2789 yyCh = getChar();
2790 switch ( prevCh ) {
2791 #ifndef QT_NO_REGEXP_LOOKAHEAD
2792 case '!':
2793 return Tok_NegLookahead;
2794 case '=':
2795 return Tok_PosLookahead;
2796 #endif
2797 case ':':
2798 return Tok_MagicLeftParen;
2799 default:
2800 error( RXERR_LOOKAHEAD );
2801 return Tok_MagicLeftParen;
2802 }
2803 } else {
2804 return Tok_LeftParen;
2805 }
2806 case ')':
2807 return Tok_RightParen;
2808 case '*':
2809 yyMinRep = 0;
2810 yyMaxRep = InftyRep;
2811 return Tok_Quantifier;
2812 case '+':
2813 yyMinRep = 1;
2814 yyMaxRep = InftyRep;
2815 return Tok_Quantifier;
2816 case '.':
2817 #ifndef QT_NO_REGEXP_CCLASS
2818 yyCharClass->setNegative( TRUE );
2819 #endif
2820 return Tok_CharClass;
2821 case '?':
2822 yyMinRep = 0;
2823 yyMaxRep = 1;
2824 return Tok_Quantifier;
2825 case '[':
2826 #ifndef QT_NO_REGEXP_CCLASS
2827 if ( yyCh == '^' ) {
2828 yyCharClass->setNegative( TRUE );
2829 yyCh = getChar();
2830 }
2831 charPending = FALSE;
2832 rangePending = FALSE;
2833 do {
2834 if ( yyCh == '-' && charPending && !rangePending ) {
2835 rangePending = TRUE;
2836 yyCh = getChar();
2837 } else {
2838 if ( charPending && !rangePending ) {
2839 yyCharClass->addSingleton( pendingCh );
2840 charPending = FALSE;
2841 }
2842 if ( yyCh == '\\' ) {
2843 yyCh = getChar();
2844 tok = getEscape();
2845 if ( tok == Tok_Word )
2846 tok = '\b';
2847 } else {
2848 tok = Tok_Char | yyCh;
2849 yyCh = getChar();
2850 }
2851 if ( tok == Tok_CharClass ) {
2852 if ( rangePending ) {
2853 yyCharClass->addSingleton( '-' );
2854 yyCharClass->addSingleton( pendingCh );
2855 charPending = FALSE;
2856 rangePending = FALSE;
2857 }
2858 } else if ( (tok & Tok_Char) != 0 ) {
2859 if ( rangePending ) {
2860 yyCharClass->addRange( pendingCh, tok ^ Tok_Char );
2861 charPending = FALSE;
2862 rangePending = FALSE;
2863 } else {
2864 pendingCh = tok ^ Tok_Char;
2865 charPending = TRUE;
2866 }
2867 } else {
2868 error( RXERR_CHARCLASS );
2869 }
2870 }
2871 } while ( yyCh != ']' && yyCh != EOS );
2872 if ( rangePending )
2873 yyCharClass->addSingleton( '-' );
2874 if ( charPending )
2875 yyCharClass->addSingleton( pendingCh );
2876 if ( yyCh == EOS )
2877 error( RXERR_END );
2878 else
2879 yyCh = getChar();
2880 return Tok_CharClass;
2881 #else
2882 error( RXERR_END );
2883 return Tok_Char | '[';
2884 #endif
2885 case '\\':
2886 return getEscape();
2887 case ']':
2888 error( RXERR_LEFTDELIM );
2889 return Tok_Char | ']';
2890 case '^':
2891 return Tok_Caret;
2892 case '{':
2893 #ifndef QT_NO_REGEXP_INTERVAL
2894 yyMinRep = getRep( 0 );
2895 yyMaxRep = yyMinRep;
2896 if ( yyCh == ',' ) {
2897 yyCh = getChar();
2898 yyMaxRep = getRep( InftyRep );
2899 }
2900 if ( yyMaxRep < yyMinRep )
2901 qSwap( yyMinRep, yyMaxRep );
2902 if ( yyCh != '}' )
2903 error( RXERR_REPETITION );
2904 yyCh = getChar();
2905 return Tok_Quantifier;
2906 #else
2907 error( RXERR_DISABLED );
2908 return Tok_Char | '{';
2909 #endif
2910 case '|':
2911 return Tok_Bar;
2912 case '}':
2913 error( RXERR_LEFTDELIM );
2914 return Tok_Char | '}';
2915 default:
2916 return Tok_Char | prevCh;
2917 }
2918 }
2919
parse(const QChar * pattern,int len)2920 int QRegExpEngine::parse( const QChar *pattern, int len )
2921 {
2922 valid = TRUE;
2923 startTokenizer( pattern, len );
2924 yyTok = getToken();
2925 #ifndef QT_NO_REGEXP_CAPTURE
2926 yyMayCapture = TRUE;
2927 #else
2928 yyMayCapture = FALSE;
2929 #endif
2930
2931 #ifndef QT_NO_REGEXP_CAPTURE
2932 int atom = startAtom( FALSE );
2933 #endif
2934 CharClass anything;
2935 Box box( this ); // create InitialState
2936 box.set( anything );
2937 Box rightBox( this ); // create FinalState
2938 rightBox.set( anything );
2939
2940 Box middleBox( this );
2941 parseExpression( &middleBox );
2942 #ifndef QT_NO_REGEXP_CAPTURE
2943 finishAtom( atom );
2944 #endif
2945 #ifndef QT_NO_REGEXP_OPTIM
2946 middleBox.setupHeuristics();
2947 #endif
2948 box.cat( middleBox );
2949 box.cat( rightBox );
2950 delete yyCharClass;
2951 yyCharClass = 0;
2952
2953 officialncap = ncap;
2954 #ifndef QT_NO_REGEXP_BACKREF
2955 if ( nbrefs > ncap )
2956 ncap = nbrefs;
2957 #endif
2958
2959 /*
2960 We use one QMemArray<int> for all the big data used a lot in
2961 matchHere() and friends.
2962 */
2963 #ifndef QT_NO_REGEXP_OPTIM
2964 mmSlideTabSize = QMAX( minl + 1, 16 );
2965 #else
2966 mmSlideTabSize = 0;
2967 #endif
2968 mmBigArray.resize( (3 + 4 * ncap) * ns + 4 * ncap + mmSlideTabSize );
2969
2970 mmInNextStack = mmBigArray.data();
2971 memset( mmInNextStack, -1, ns * sizeof(int) );
2972 mmCurStack = mmInNextStack + ns;
2973 mmNextStack = mmInNextStack + 2 * ns;
2974
2975 mmCurCapBegin = mmInNextStack + 3 * ns;
2976 mmNextCapBegin = mmCurCapBegin + ncap * ns;
2977 mmCurCapEnd = mmCurCapBegin + 2 * ncap * ns;
2978 mmNextCapEnd = mmCurCapBegin + 3 * ncap * ns;
2979
2980 mmTempCapBegin = mmCurCapBegin + 4 * ncap * ns;
2981 mmTempCapEnd = mmTempCapBegin + ncap;
2982 mmCapBegin = mmTempCapBegin + 2 * ncap;
2983 mmCapEnd = mmTempCapBegin + 3 * ncap;
2984
2985 mmSlideTab = mmTempCapBegin + 4 * ncap;
2986
2987 if ( !yyError.isEmpty() )
2988 return -1;
2989
2990 #ifndef QT_NO_REGEXP_OPTIM
2991 State *sinit = s[InitialState];
2992 caretAnchored = ( sinit->anchors != 0 );
2993 if ( caretAnchored ) {
2994 QMap<int, int>& anchors = *sinit->anchors;
2995 QMap<int, int>::ConstIterator a;
2996 for ( a = anchors.begin(); a != anchors.end(); ++a ) {
2997 if (
2998 #ifndef QT_NO_REGEXP_ANCHOR_ALT
2999 (*a & Anchor_Alternation) != 0 ||
3000 #endif
3001 (*a & Anchor_Caret) == 0 ) {
3002 caretAnchored = FALSE;
3003 break;
3004 }
3005 }
3006 }
3007 #endif
3008 return yyPos0;
3009 }
3010
parseAtom(Box * box)3011 void QRegExpEngine::parseAtom( Box *box )
3012 {
3013 #ifndef QT_NO_REGEXP_LOOKAHEAD
3014 QRegExpEngine *eng = 0;
3015 bool neg;
3016 int len;
3017 #endif
3018
3019 if ( (yyTok & Tok_Char) != 0 ) {
3020 box->set( QChar(yyTok ^ Tok_Char) );
3021 } else {
3022 #ifndef QT_NO_REGEXP_OPTIM
3023 trivial = FALSE;
3024 #endif
3025 switch ( yyTok ) {
3026 case Tok_Dollar:
3027 box->catAnchor( Anchor_Dollar );
3028 break;
3029 case Tok_Caret:
3030 box->catAnchor( Anchor_Caret );
3031 break;
3032 #ifndef QT_NO_REGEXP_LOOKAHEAD
3033 case Tok_PosLookahead:
3034 case Tok_NegLookahead:
3035 neg = ( yyTok == Tok_NegLookahead );
3036 eng = new QRegExpEngine( cs );
3037 len = eng->parse( yyIn + yyPos - 1, yyLen - yyPos + 1 );
3038 if ( len >= 0 )
3039 skipChars( len );
3040 else
3041 error( RXERR_LOOKAHEAD );
3042 box->catAnchor( addLookahead(eng, neg) );
3043 yyTok = getToken();
3044 if ( yyTok != Tok_RightParen )
3045 error( RXERR_LOOKAHEAD );
3046 break;
3047 #endif
3048 #ifndef QT_NO_REGEXP_ESCAPE
3049 case Tok_Word:
3050 box->catAnchor( Anchor_Word );
3051 break;
3052 case Tok_NonWord:
3053 box->catAnchor( Anchor_NonWord );
3054 break;
3055 #endif
3056 case Tok_LeftParen:
3057 case Tok_MagicLeftParen:
3058 yyTok = getToken();
3059 parseExpression( box );
3060 if ( yyTok != Tok_RightParen )
3061 error( RXERR_END );
3062 break;
3063 case Tok_CharClass:
3064 box->set( *yyCharClass );
3065 break;
3066 case Tok_Quantifier:
3067 error( RXERR_REPETITION );
3068 break;
3069 default:
3070 #ifndef QT_NO_REGEXP_BACKREF
3071 if ( (yyTok & Tok_BackRef) != 0 )
3072 box->set( yyTok ^ Tok_BackRef );
3073 else
3074 #endif
3075 error( RXERR_DISABLED );
3076 }
3077 }
3078 yyTok = getToken();
3079 }
3080
parseFactor(Box * box)3081 void QRegExpEngine::parseFactor( Box *box )
3082 {
3083 #ifndef QT_NO_REGEXP_CAPTURE
3084 int atom = startAtom( yyMayCapture && yyTok == Tok_LeftParen );
3085 #else
3086 static const int atom = 0;
3087 #endif
3088
3089 #ifndef QT_NO_REGEXP_INTERVAL
3090 #define YYREDO() \
3091 yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \
3092 *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok
3093
3094 const QChar *in = yyIn;
3095 int pos0 = yyPos0;
3096 int pos = yyPos;
3097 int len = yyLen;
3098 int ch = yyCh;
3099 CharClass charClass;
3100 if ( yyTok == Tok_CharClass )
3101 charClass = *yyCharClass;
3102 int tok = yyTok;
3103 bool mayCapture = yyMayCapture;
3104 #endif
3105
3106 parseAtom( box );
3107 #ifndef QT_NO_REGEXP_CAPTURE
3108 finishAtom( atom );
3109 #endif
3110
3111 if ( yyTok == Tok_Quantifier ) {
3112 #ifndef QT_NO_REGEXP_OPTIM
3113 trivial = FALSE;
3114 #endif
3115 if ( yyMaxRep == InftyRep ) {
3116 box->plus( atom );
3117 #ifndef QT_NO_REGEXP_INTERVAL
3118 } else if ( yyMaxRep == 0 ) {
3119 box->clear();
3120 #endif
3121 }
3122 if ( yyMinRep == 0 )
3123 box->opt();
3124
3125 #ifndef QT_NO_REGEXP_INTERVAL
3126 yyMayCapture = FALSE;
3127 int alpha = ( yyMinRep == 0 ) ? 0 : yyMinRep - 1;
3128 int beta = ( yyMaxRep == InftyRep ) ? 0 : yyMaxRep - ( alpha + 1 );
3129
3130 Box rightBox( this );
3131 int i;
3132
3133 for ( i = 0; i < beta; i++ ) {
3134 YYREDO();
3135 Box leftBox( this );
3136 parseAtom( &leftBox );
3137 leftBox.cat( rightBox );
3138 leftBox.opt();
3139 rightBox = leftBox;
3140 }
3141 for ( i = 0; i < alpha; i++ ) {
3142 YYREDO();
3143 Box leftBox( this );
3144 parseAtom( &leftBox );
3145 leftBox.cat( rightBox );
3146 rightBox = leftBox;
3147 }
3148 rightBox.cat( *box );
3149 *box = rightBox;
3150 #endif
3151 yyTok = getToken();
3152 #ifndef QT_NO_REGEXP_INTERVAL
3153 yyMayCapture = mayCapture;
3154 #endif
3155 }
3156 #undef YYREDO
3157 }
3158
parseTerm(Box * box)3159 void QRegExpEngine::parseTerm( Box *box )
3160 {
3161 #ifndef QT_NO_REGEXP_OPTIM
3162 if ( yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar )
3163 parseFactor( box );
3164 #endif
3165 while ( yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar ) {
3166 Box rightBox( this );
3167 parseFactor( &rightBox );
3168 box->cat( rightBox );
3169 }
3170 }
3171
parseExpression(Box * box)3172 void QRegExpEngine::parseExpression( Box *box )
3173 {
3174 parseTerm( box );
3175 while ( yyTok == Tok_Bar ) {
3176 #ifndef QT_NO_REGEXP_OPTIM
3177 trivial = FALSE;
3178 #endif
3179 Box rightBox( this );
3180 yyTok = getToken();
3181 parseTerm( &rightBox );
3182 box->orx( rightBox );
3183 }
3184 }
3185
3186 /*
3187 The struct QRegExpPrivate contains the private data of a regular
3188 expression other than the automaton. It makes it possible for many
3189 QRegExp objects to use the same QRegExpEngine object with different
3190 QRegExpPrivate objects.
3191 */
3192 struct QRegExpPrivate
3193 {
3194 QString pattern; // regular-expression or wildcard pattern
3195 QString rxpattern; // regular-expression pattern
3196 #ifndef QT_NO_REGEXP_WILDCARD
3197 bool wc : 1; // wildcard mode?
3198 #endif
3199 bool min : 1; // minimal matching? (instead of maximal)
3200 bool cs : 1; // case sensitive?
3201 #ifndef QT_NO_REGEXP_CAPTURE
3202 QString t; // last string passed to QRegExp::search() or searchRev()
3203 QStringList capturedCache; // what QRegExp::capturedTexts() returned last
3204 #endif
3205 QMemArray<int> captured; // what QRegExpEngine::search() returned last
3206
QRegExpPrivateQRegExpPrivate3207 QRegExpPrivate() { captured.fill( -1, 2 ); }
3208 };
3209
3210 #ifndef QT_NO_REGEXP_OPTIM
3211 static QSingleCleanupHandler<QCache<QRegExpEngine> > cleanup_cache;
3212 # ifndef QT_THREAD_SUPPORT
3213 static QCache<QRegExpEngine> *engineCache = 0;
3214 # endif // QT_THREAD_SUPPORT
3215 #endif // QT_NO_REGEXP_OPTIM
3216
regexpEngine(QRegExpEngine * & eng,const QString & pattern,bool caseSensitive,bool deref)3217 static void regexpEngine( QRegExpEngine *&eng, const QString &pattern,
3218 bool caseSensitive, bool deref )
3219 {
3220 # ifdef QT_THREAD_SUPPORT
3221 static QThreadStorage<QCache<QRegExpEngine> *> engineCaches;
3222 QCache<QRegExpEngine> *engineCache = 0;
3223 QThreadInstance *currentThread = QThreadInstance::current();
3224 if (currentThread)
3225 engineCache = engineCaches.localData();
3226 #endif // QT_THREAD_SUPPORT
3227
3228 if ( !deref ) {
3229 #ifndef QT_NO_REGEXP_OPTIM
3230 # ifdef QT_THREAD_SUPPORT
3231 if ( currentThread )
3232 # endif
3233 {
3234 if ( engineCache != 0 ) {
3235 eng = engineCache->take( pattern );
3236 if ( eng == 0 || eng->caseSensitive() != caseSensitive ) {
3237 delete eng;
3238 } else {
3239 eng->ref();
3240 return;
3241 }
3242 }
3243 }
3244 #endif // QT_NO_REGEXP_OPTIM
3245 eng = new QRegExpEngine( pattern, caseSensitive );
3246 return;
3247 }
3248
3249 if ( eng->deref() ) {
3250 #ifndef QT_NO_REGEXP_OPTIM
3251 # ifdef QT_THREAD_SUPPORT
3252 if ( currentThread )
3253 # endif
3254 {
3255 if ( engineCache == 0 ) {
3256 engineCache = new QCache<QRegExpEngine>;
3257 engineCache->setAutoDelete( TRUE );
3258 # ifdef QT_THREAD_SUPPORT
3259 engineCaches.setLocalData(engineCache);
3260 # else
3261 cleanup_cache.set( &engineCache );
3262 # endif // !QT_THREAD_SUPPORT
3263 }
3264 if ( !pattern.isNull() &&
3265 engineCache->insert(pattern, eng, 4 + pattern.length() / 4) )
3266 return;
3267 }
3268 #else
3269 Q_UNUSED( pattern );
3270 #endif // QT_NO_REGEXP_OPTIM
3271 delete eng;
3272 eng = 0;
3273 }
3274 }
3275
3276 /*!
3277 \enum QRegExp::CaretMode
3278
3279 The CaretMode enum defines the different meanings of the caret
3280 (<b>^</b>) in a regular expression. The possible values are:
3281
3282 \value CaretAtZero
3283 The caret corresponds to index 0 in the searched string.
3284
3285 \value CaretAtOffset
3286 The caret corresponds to the start offset of the search.
3287
3288 \value CaretWontMatch
3289 The caret never matches.
3290 */
3291
3292 /*!
3293 Constructs an empty regexp.
3294
3295 \sa isValid() errorString()
3296 */
QRegExp()3297 QRegExp::QRegExp()
3298 : eng( 0 )
3299 {
3300 priv = new QRegExpPrivate;
3301 #ifndef QT_NO_REGEXP_WILDCARD
3302 priv->wc = FALSE;
3303 #endif
3304 priv->min = FALSE;
3305 priv->cs = TRUE;
3306 }
3307
3308 /*!
3309 Constructs a regular expression object for the given \a pattern
3310 string. The pattern must be given using wildcard notation if \a
3311 wildcard is TRUE (default is FALSE). The pattern is case
3312 sensitive, unless \a caseSensitive is FALSE. Matching is greedy
3313 (maximal), but can be changed by calling setMinimal().
3314
3315 \sa setPattern() setCaseSensitive() setWildcard() setMinimal()
3316 */
QRegExp(const QString & pattern,bool caseSensitive,bool wildcard)3317 QRegExp::QRegExp( const QString& pattern, bool caseSensitive, bool wildcard )
3318 : eng( 0 )
3319 {
3320 priv = new QRegExpPrivate;
3321 priv->pattern = pattern;
3322 #ifndef QT_NO_REGEXP_WILDCARD
3323 priv->wc = wildcard;
3324 #endif
3325 priv->min = FALSE;
3326 priv->cs = caseSensitive;
3327 }
3328
3329 /*!
3330 Constructs a regular expression as a copy of \a rx.
3331
3332 \sa operator=()
3333 */
QRegExp(const QRegExp & rx)3334 QRegExp::QRegExp( const QRegExp& rx )
3335 : eng( 0 )
3336 {
3337 priv = new QRegExpPrivate;
3338 operator=( rx );
3339 }
3340
3341 /*!
3342 Destroys the regular expression and cleans up its internal data.
3343 */
~QRegExp()3344 QRegExp::~QRegExp()
3345 {
3346 invalidateEngine();
3347 delete priv;
3348 }
3349
3350 /*!
3351 Copies the regular expression \a rx and returns a reference to the
3352 copy. The case sensitivity, wildcard and minimal matching options
3353 are also copied.
3354 */
operator =(const QRegExp & rx)3355 QRegExp& QRegExp::operator=( const QRegExp& rx )
3356 {
3357 QRegExpEngine *otherEng = rx.eng;
3358 if ( otherEng != 0 )
3359 otherEng->ref();
3360 invalidateEngine();
3361 eng = otherEng;
3362 priv->pattern = rx.priv->pattern;
3363 priv->rxpattern = rx.priv->rxpattern;
3364 #ifndef QT_NO_REGEXP_WILDCARD
3365 priv->wc = rx.priv->wc;
3366 #endif
3367 priv->min = rx.priv->min;
3368 priv->cs = rx.priv->cs;
3369 #ifndef QT_NO_REGEXP_CAPTURE
3370 priv->t = rx.priv->t;
3371 priv->capturedCache = rx.priv->capturedCache;
3372 #endif
3373 priv->captured = rx.priv->captured;
3374 return *this;
3375 }
3376
3377 /*!
3378 Returns TRUE if this regular expression is equal to \a rx;
3379 otherwise returns FALSE.
3380
3381 Two QRegExp objects are equal if they have the same pattern
3382 strings and the same settings for case sensitivity, wildcard and
3383 minimal matching.
3384 */
operator ==(const QRegExp & rx) const3385 bool QRegExp::operator==( const QRegExp& rx ) const
3386 {
3387 return priv->pattern == rx.priv->pattern &&
3388 #ifndef QT_NO_REGEXP_WILDCARD
3389 priv->wc == rx.priv->wc &&
3390 #endif
3391 priv->min == rx.priv->min &&
3392 priv->cs == rx.priv->cs;
3393 }
3394
3395 /*!
3396 \fn bool QRegExp::operator!=( const QRegExp& rx ) const
3397
3398 Returns TRUE if this regular expression is not equal to \a rx;
3399 otherwise returns FALSE.
3400
3401 \sa operator==()
3402 */
3403
3404 /*!
3405 Returns TRUE if the pattern string is empty; otherwise returns
3406 FALSE.
3407
3408 If you call exactMatch() with an empty pattern on an empty string
3409 it will return TRUE; otherwise it returns FALSE since it operates
3410 over the whole string. If you call search() with an empty pattern
3411 on \e any string it will return the start offset (0 by default)
3412 because the empty pattern matches the 'emptiness' at the start of
3413 the string. In this case the length of the match returned by
3414 matchedLength() will be 0.
3415
3416 See QString::isEmpty().
3417 */
3418
isEmpty() const3419 bool QRegExp::isEmpty() const
3420 {
3421 return priv->pattern.isEmpty();
3422 }
3423
3424 /*!
3425 Returns TRUE if the regular expression is valid; otherwise returns
3426 FALSE. An invalid regular expression never matches.
3427
3428 The pattern <b>[a-z</b> is an example of an invalid pattern, since
3429 it lacks a closing square bracket.
3430
3431 Note that the validity of a regexp may also depend on the setting
3432 of the wildcard flag, for example <b>*.html</b> is a valid
3433 wildcard regexp but an invalid full regexp.
3434
3435 \sa errorString()
3436 */
isValid() const3437 bool QRegExp::isValid() const
3438 {
3439 if ( priv->pattern.isEmpty() ) {
3440 return TRUE;
3441 } else {
3442 prepareEngine();
3443 return eng->isValid();
3444 }
3445 }
3446
3447 /*!
3448 Returns the pattern string of the regular expression. The pattern
3449 has either regular expression syntax or wildcard syntax, depending
3450 on wildcard().
3451
3452 \sa setPattern()
3453 */
pattern() const3454 QString QRegExp::pattern() const
3455 {
3456 return priv->pattern;
3457 }
3458
3459 /*!
3460 Sets the pattern string to \a pattern. The case sensitivity,
3461 wildcard and minimal matching options are not changed.
3462
3463 \sa pattern()
3464 */
setPattern(const QString & pattern)3465 void QRegExp::setPattern( const QString& pattern )
3466 {
3467 if ( priv->pattern != pattern ) {
3468 priv->pattern = pattern;
3469 invalidateEngine();
3470 }
3471 }
3472
3473 /*!
3474 Returns TRUE if case sensitivity is enabled; otherwise returns
3475 FALSE. The default is TRUE.
3476
3477 \sa setCaseSensitive()
3478 */
caseSensitive() const3479 bool QRegExp::caseSensitive() const
3480 {
3481 return priv->cs;
3482 }
3483
3484 /*!
3485 Sets case sensitive matching to \a sensitive.
3486
3487 If \a sensitive is TRUE, <b>\\.txt$</b> matches \c{readme.txt} but
3488 not \c{README.TXT}.
3489
3490 \sa caseSensitive()
3491 */
setCaseSensitive(bool sensitive)3492 void QRegExp::setCaseSensitive( bool sensitive )
3493 {
3494 if ( sensitive != priv->cs ) {
3495 priv->cs = sensitive;
3496 invalidateEngine();
3497 }
3498 }
3499
3500 #ifndef QT_NO_REGEXP_WILDCARD
3501 /*!
3502 Returns TRUE if wildcard mode is enabled; otherwise returns FALSE.
3503 The default is FALSE.
3504
3505 \sa setWildcard()
3506 */
wildcard() const3507 bool QRegExp::wildcard() const
3508 {
3509 return priv->wc;
3510 }
3511
3512 /*!
3513 Sets the wildcard mode for the regular expression. The default is
3514 FALSE.
3515
3516 Setting \a wildcard to TRUE enables simple shell-like wildcard
3517 matching. (See \link #wildcard-matching wildcard matching
3518 (globbing) \endlink.)
3519
3520 For example, <b>r*.txt</b> matches the string \c{readme.txt} in
3521 wildcard mode, but does not match \c{readme}.
3522
3523 \sa wildcard()
3524 */
setWildcard(bool wildcard)3525 void QRegExp::setWildcard( bool wildcard )
3526 {
3527 if ( wildcard != priv->wc ) {
3528 priv->wc = wildcard;
3529 invalidateEngine();
3530 }
3531 }
3532 #endif
3533
3534 /*!
3535 Returns TRUE if minimal (non-greedy) matching is enabled;
3536 otherwise returns FALSE.
3537
3538 \sa setMinimal()
3539 */
minimal() const3540 bool QRegExp::minimal() const
3541 {
3542 return priv->min;
3543 }
3544
3545 /*!
3546 Enables or disables minimal matching. If \a minimal is FALSE,
3547 matching is greedy (maximal) which is the default.
3548
3549 For example, suppose we have the input string "We must be
3550 \<b>bold\</b>, very \<b>bold\</b>!" and the pattern
3551 <b>\<b>.*\</b></b>. With the default greedy (maximal) matching,
3552 the match is "We must be <u>\<b>bold\</b>, very
3553 \<b>bold\</b></u>!". But with minimal (non-greedy) matching the
3554 first match is: "We must be <u>\<b>bold\</b></u>, very
3555 \<b>bold\</b>!" and the second match is "We must be \<b>bold\</b>,
3556 very <u>\<b>bold\</b></u>!". In practice we might use the pattern
3557 <b>\<b>[^\<]+\</b></b> instead, although this will still fail for
3558 nested tags.
3559
3560 \sa minimal()
3561 */
setMinimal(bool minimal)3562 void QRegExp::setMinimal( bool minimal )
3563 {
3564 priv->min = minimal;
3565 }
3566
3567 /*!
3568 Returns TRUE if \a str is matched exactly by this regular
3569 expression; otherwise returns FALSE. You can determine how much of
3570 the string was matched by calling matchedLength().
3571
3572 For a given regexp string, R, exactMatch("R") is the equivalent of
3573 search("^R$") since exactMatch() effectively encloses the regexp
3574 in the start of string and end of string anchors, except that it
3575 sets matchedLength() differently.
3576
3577 For example, if the regular expression is <b>blue</b>, then
3578 exactMatch() returns TRUE only for input \c blue. For inputs \c
3579 bluebell, \c blutak and \c lightblue, exactMatch() returns FALSE
3580 and matchedLength() will return 4, 3 and 0 respectively.
3581
3582 Although const, this function sets matchedLength(),
3583 capturedTexts() and pos().
3584
3585 \sa search() searchRev() QRegExpValidator
3586 */
exactMatch(const QString & str) const3587 bool QRegExp::exactMatch( const QString& str ) const
3588 {
3589 prepareEngineForMatch( str );
3590 eng->match( str, 0, priv->min, TRUE, 0, priv->captured );
3591 if ( priv->captured[1] == (int) str.length() ) {
3592 return TRUE;
3593 } else {
3594 priv->captured[0] = 0;
3595 priv->captured[1] = eng->partialMatchLength();
3596 return FALSE;
3597 }
3598 }
3599
3600 #ifndef QT_NO_COMPAT
3601 /*! \obsolete
3602
3603 Attempts to match in \a str, starting from position \a index.
3604 Returns the position of the match, or -1 if there was no match.
3605
3606 The length of the match is stored in \a *len, unless \a len is a
3607 null pointer.
3608
3609 If \a indexIsStart is TRUE (the default), the position \a index in
3610 the string will match the start of string anchor, <b>^</b>, in the
3611 regexp, if present. Otherwise, position 0 in \a str will match.
3612
3613 Use search() and matchedLength() instead of this function.
3614
3615 \sa QString::mid() QConstString
3616 */
match(const QString & str,int index,int * len,bool indexIsStart) const3617 int QRegExp::match( const QString& str, int index, int *len,
3618 bool indexIsStart ) const
3619 {
3620 int pos = search( str, index, indexIsStart ? CaretAtOffset : CaretAtZero );
3621 if ( len != 0 )
3622 *len = matchedLength();
3623 return pos;
3624 }
3625 #endif // QT_NO_COMPAT
3626
search(const QString & str,int offset) const3627 int QRegExp::search( const QString& str, int offset ) const
3628 {
3629 return search( str, offset, CaretAtZero );
3630 }
3631
3632 /*!
3633 Attempts to find a match in \a str from position \a offset (0 by
3634 default). If \a offset is -1, the search starts at the last
3635 character; if -2, at the next to last character; etc.
3636
3637 Returns the position of the first match, or -1 if there was no
3638 match.
3639
3640 The \a caretMode parameter can be used to instruct whether <b>^</b>
3641 should match at index 0 or at \a offset.
3642
3643 You might prefer to use QString::find(), QString::contains() or
3644 even QStringList::grep(). To replace matches use
3645 QString::replace().
3646
3647 Example:
3648 \code
3649 QString str = "offsets: 1.23 .50 71.00 6.00";
3650 QRegExp rx( "\\d*\\.\\d+" ); // primitive floating point matching
3651 int count = 0;
3652 int pos = 0;
3653 while ( (pos = rx.search(str, pos)) != -1 ) {
3654 count++;
3655 pos += rx.matchedLength();
3656 }
3657 // pos will be 9, 14, 18 and finally 24; count will end up as 4
3658 \endcode
3659
3660 Although const, this function sets matchedLength(),
3661 capturedTexts() and pos().
3662
3663 \sa searchRev() exactMatch()
3664 */
3665
search(const QString & str,int offset,CaretMode caretMode) const3666 int QRegExp::search( const QString& str, int offset, CaretMode caretMode ) const
3667 {
3668 prepareEngineForMatch( str );
3669 if ( offset < 0 )
3670 offset += str.length();
3671 eng->match( str, offset, priv->min, FALSE, caretIndex(offset, caretMode),
3672 priv->captured );
3673 return priv->captured[0];
3674 }
3675
3676
searchRev(const QString & str,int offset) const3677 int QRegExp::searchRev( const QString& str, int offset ) const
3678 {
3679 return searchRev( str, offset, CaretAtZero );
3680 }
3681
3682 /*!
3683 Attempts to find a match backwards in \a str from position \a
3684 offset. If \a offset is -1 (the default), the search starts at the
3685 last character; if -2, at the next to last character; etc.
3686
3687 Returns the position of the first match, or -1 if there was no
3688 match.
3689
3690 The \a caretMode parameter can be used to instruct whether <b>^</b>
3691 should match at index 0 or at \a offset.
3692
3693 Although const, this function sets matchedLength(),
3694 capturedTexts() and pos().
3695
3696 \warning Searching backwards is much slower than searching
3697 forwards.
3698
3699 \sa search() exactMatch()
3700 */
3701
searchRev(const QString & str,int offset,CaretMode caretMode) const3702 int QRegExp::searchRev( const QString& str, int offset,
3703 CaretMode caretMode ) const
3704 {
3705 prepareEngineForMatch( str );
3706 if ( offset < 0 )
3707 offset += str.length();
3708 if ( offset < 0 || offset > (int) str.length() ) {
3709 priv->captured.detach();
3710 priv->captured.fill( -1 );
3711 return -1;
3712 }
3713
3714 while ( offset >= 0 ) {
3715 eng->match( str, offset, priv->min, TRUE, caretIndex(offset, caretMode),
3716 priv->captured );
3717 if ( priv->captured[0] == offset )
3718 return offset;
3719 offset--;
3720 }
3721 return -1;
3722 }
3723
3724 /*!
3725 Returns the length of the last matched string, or -1 if there was
3726 no match.
3727
3728 \sa exactMatch() search() searchRev()
3729 */
matchedLength() const3730 int QRegExp::matchedLength() const
3731 {
3732 return priv->captured[1];
3733 }
3734
3735 #ifndef QT_NO_REGEXP_CAPTURE
3736 /*!
3737 Returns the number of captures contained in the regular expression.
3738 */
numCaptures() const3739 int QRegExp::numCaptures() const
3740 {
3741 prepareEngine();
3742 return eng->numCaptures();
3743 }
3744
3745 /*!
3746 Returns a list of the captured text strings.
3747
3748 The first string in the list is the entire matched string. Each
3749 subsequent list element contains a string that matched a
3750 (capturing) subexpression of the regexp.
3751
3752 For example:
3753 \code
3754 QRegExp rx( "(\\d+)(\\s*)(cm|inch(es)?)" );
3755 int pos = rx.search( "Length: 36 inches" );
3756 QStringList list = rx.capturedTexts();
3757 // list is now ( "36 inches", "36", " ", "inches", "es" )
3758 \endcode
3759
3760 The above example also captures elements that may be present but
3761 which we have no interest in. This problem can be solved by using
3762 non-capturing parentheses:
3763
3764 \code
3765 QRegExp rx( "(\\d+)(?:\\s*)(cm|inch(?:es)?)" );
3766 int pos = rx.search( "Length: 36 inches" );
3767 QStringList list = rx.capturedTexts();
3768 // list is now ( "36 inches", "36", "inches" )
3769 \endcode
3770
3771 Note that if you want to iterate over the list, you should iterate
3772 over a copy, e.g.
3773 \code
3774 QStringList list = rx.capturedTexts();
3775 QStringList::Iterator it = list.begin();
3776 while( it != list.end() ) {
3777 myProcessing( *it );
3778 ++it;
3779 }
3780 \endcode
3781
3782 Some regexps can match an indeterminate number of times. For
3783 example if the input string is "Offsets: 12 14 99 231 7" and the
3784 regexp, \c{rx}, is <b>(\\d+)+</b>, we would hope to get a list of
3785 all the numbers matched. However, after calling
3786 \c{rx.search(str)}, capturedTexts() will return the list ( "12",
3787 "12" ), i.e. the entire match was "12" and the first subexpression
3788 matched was "12". The correct approach is to use cap() in a \link
3789 #cap_in_a_loop loop \endlink.
3790
3791 The order of elements in the string list is as follows. The first
3792 element is the entire matching string. Each subsequent element
3793 corresponds to the next capturing open left parentheses. Thus
3794 capturedTexts()[1] is the text of the first capturing parentheses,
3795 capturedTexts()[2] is the text of the second and so on
3796 (corresponding to $1, $2, etc., in some other regexp languages).
3797
3798 \sa cap() pos() exactMatch() search() searchRev()
3799 */
capturedTexts()3800 QStringList QRegExp::capturedTexts()
3801 {
3802 if ( priv->capturedCache.isEmpty() ) {
3803 for ( int i = 0; i < (int) priv->captured.size(); i += 2 ) {
3804 QString m;
3805 if ( priv->captured[i + 1] == 0 )
3806 m = QString::fromLatin1( "" );
3807 else if ( priv->captured[i] >= 0 )
3808 m = priv->t.mid( priv->captured[i],
3809 priv->captured[i + 1] );
3810 priv->capturedCache.append( m );
3811 }
3812 priv->t = QString::null;
3813 }
3814 return priv->capturedCache;
3815 }
3816
3817 /*!
3818 Returns the text captured by the \a nth subexpression. The entire
3819 match has index 0 and the parenthesized subexpressions have
3820 indices starting from 1 (excluding non-capturing parentheses).
3821
3822 \code
3823 QRegExp rxlen( "(\\d+)(?:\\s*)(cm|inch)" );
3824 int pos = rxlen.search( "Length: 189cm" );
3825 if ( pos > -1 ) {
3826 QString value = rxlen.cap( 1 ); // "189"
3827 QString unit = rxlen.cap( 2 ); // "cm"
3828 // ...
3829 }
3830 \endcode
3831
3832 The order of elements matched by cap() is as follows. The first
3833 element, cap(0), is the entire matching string. Each subsequent
3834 element corresponds to the next capturing open left parentheses.
3835 Thus cap(1) is the text of the first capturing parentheses, cap(2)
3836 is the text of the second, and so on.
3837
3838 \target cap_in_a_loop
3839 Some patterns may lead to a number of matches which cannot be
3840 determined in advance, for example:
3841
3842 \code
3843 QRegExp rx( "(\\d+)" );
3844 str = "Offsets: 12 14 99 231 7";
3845 QStringList list;
3846 pos = 0;
3847 while ( pos >= 0 ) {
3848 pos = rx.search( str, pos );
3849 if ( pos > -1 ) {
3850 list += rx.cap( 1 );
3851 pos += rx.matchedLength();
3852 }
3853 }
3854 // list contains "12", "14", "99", "231", "7"
3855 \endcode
3856
3857 \sa capturedTexts() pos() exactMatch() search() searchRev()
3858 */
cap(int nth)3859 QString QRegExp::cap( int nth )
3860 {
3861 if ( nth < 0 || nth >= (int) priv->captured.size() / 2 ) {
3862 return QString::null;
3863 } else {
3864 return capturedTexts()[nth];
3865 }
3866 }
3867
3868 /*!
3869 Returns the position of the \a nth captured text in the searched
3870 string. If \a nth is 0 (the default), pos() returns the position
3871 of the whole match.
3872
3873 Example:
3874 \code
3875 QRegExp rx( "/([a-z]+)/([a-z]+)" );
3876 rx.search( "Output /dev/null" ); // returns 7 (position of /dev/null)
3877 rx.pos( 0 ); // returns 7 (position of /dev/null)
3878 rx.pos( 1 ); // returns 8 (position of dev)
3879 rx.pos( 2 ); // returns 12 (position of null)
3880 \endcode
3881
3882 For zero-length matches, pos() always returns -1. (For example, if
3883 cap(4) would return an empty string, pos(4) returns -1.) This is
3884 due to an implementation tradeoff.
3885
3886 \sa capturedTexts() exactMatch() search() searchRev()
3887 */
pos(int nth)3888 int QRegExp::pos( int nth )
3889 {
3890 if ( nth < 0 || nth >= (int) priv->captured.size() / 2 )
3891 return -1;
3892 else
3893 return priv->captured[2 * nth];
3894 }
3895
3896 /*!
3897 Returns a text string that explains why a regexp pattern is
3898 invalid the case being; otherwise returns "no error occurred".
3899
3900 \sa isValid()
3901 */
errorString()3902 QString QRegExp::errorString()
3903 {
3904 if ( isValid() ) {
3905 return QString( RXERR_OK );
3906 } else {
3907 return eng->errorString();
3908 }
3909 }
3910 #endif
3911
3912 /*!
3913 Returns the string \a str with every regexp special character
3914 escaped with a backslash. The special characters are $, (, ), *, +,
3915 ., ?, [, \, ], ^, {, | and }.
3916
3917 Example:
3918 \code
3919 s1 = QRegExp::escape( "bingo" ); // s1 == "bingo"
3920 s2 = QRegExp::escape( "f(x)" ); // s2 == "f\\(x\\)"
3921 \endcode
3922
3923 This function is useful to construct regexp patterns dynamically:
3924
3925 \code
3926 QRegExp rx( "(" + QRegExp::escape(name) +
3927 "|" + QRegExp::escape(alias) + ")" );
3928 \endcode
3929 */
escape(const QString & str)3930 QString QRegExp::escape( const QString& str )
3931 {
3932 static const char meta[] = "$()*+.?[\\]^{|}";
3933 QString quoted = str;
3934 int i = 0;
3935
3936 while ( i < (int) quoted.length() ) {
3937 if ( strchr(meta, quoted[i].latin1()) != 0 )
3938 quoted.insert( i++, "\\" );
3939 i++;
3940 }
3941 return quoted;
3942 }
3943
prepareEngine() const3944 void QRegExp::prepareEngine() const
3945 {
3946 if ( eng == 0 ) {
3947 #ifndef QT_NO_REGEXP_WILDCARD
3948 if ( priv->wc )
3949 priv->rxpattern = wc2rx( priv->pattern );
3950 else
3951 #endif
3952 priv->rxpattern = priv->pattern.isNull() ? QString::fromLatin1( "" )
3953 : priv->pattern;
3954 QRegExp *that = (QRegExp *) this;
3955 // that->eng = newEngine( priv->rxpattern, priv->cs );
3956 regexpEngine( that->eng, priv->rxpattern, priv->cs, FALSE );
3957 priv->captured.detach();
3958 priv->captured.fill( -1, 2 + 2 * eng->numCaptures() );
3959 }
3960 }
3961
prepareEngineForMatch(const QString & str) const3962 void QRegExp::prepareEngineForMatch( const QString& str ) const
3963 {
3964 prepareEngine();
3965 #ifndef QT_NO_REGEXP_CAPTURE
3966 priv->t = str;
3967 priv->capturedCache.clear();
3968 #else
3969 Q_UNUSED( str );
3970 #endif
3971 }
3972
invalidateEngine()3973 void QRegExp::invalidateEngine()
3974 {
3975 if ( eng != 0 ) {
3976 regexpEngine( eng, priv->rxpattern, priv->cs, TRUE );
3977 priv->rxpattern = QString();
3978 eng = 0;
3979 }
3980 }
3981
caretIndex(int offset,CaretMode caretMode)3982 int QRegExp::caretIndex( int offset, CaretMode caretMode )
3983 {
3984 if ( caretMode == CaretAtZero ) {
3985 return 0;
3986 } else if ( caretMode == CaretAtOffset ) {
3987 return offset;
3988 } else { // CaretWontMatch
3989 return -1;
3990 }
3991 }
3992
3993 #endif // QT_NO_REGEXP
3994