1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 // file: regexcmp.cpp
5 //
6 // Copyright (C) 2002-2016 International Business Machines Corporation and others.
7 // All Rights Reserved.
8 //
9 // This file contains the ICU regular expression compiler, which is responsible
10 // for processing a regular expression pattern into the compiled form that
11 // is used by the match finding engine.
12 //
13
14 #include "unicode/utypes.h"
15
16 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
17
18 #include "unicode/ustring.h"
19 #include "unicode/unistr.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/uchriter.h"
23 #include "unicode/parsepos.h"
24 #include "unicode/parseerr.h"
25 #include "unicode/regex.h"
26 #include "unicode/utf.h"
27 #include "unicode/utf16.h"
28 #include "patternprops.h"
29 #include "putilimp.h"
30 #include "cmemory.h"
31 #include "cstr.h"
32 #include "cstring.h"
33 #include "uvectr32.h"
34 #include "uvectr64.h"
35 #include "uassert.h"
36 #include "uinvchar.h"
37
38 #include "regeximp.h"
39 #include "regexcst.h" // Contains state table for the regex pattern parser.
40 // generated by a Perl script.
41 #include "regexcmp.h"
42 #include "regexst.h"
43 #include "regextxt.h"
44
45
46
47 U_NAMESPACE_BEGIN
48
49
50 //------------------------------------------------------------------------------
51 //
52 // Constructor.
53 //
54 //------------------------------------------------------------------------------
RegexCompile(RegexPattern * rxp,UErrorCode & status)55 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
56 fParenStack(status), fSetStack(status), fSetOpStack(status)
57 {
58 // Lazy init of all shared global sets (needed for init()'s empty text)
59 RegexStaticSets::initGlobals(&status);
60
61 fStatus = &status;
62
63 fRXPat = rxp;
64 fScanIndex = 0;
65 fLastChar = -1;
66 fPeekChar = -1;
67 fLineNum = 1;
68 fCharNum = 0;
69 fQuoteMode = FALSE;
70 fInBackslashQuote = FALSE;
71 fModeFlags = fRXPat->fFlags | 0x80000000;
72 fEOLComments = TRUE;
73
74 fMatchOpenParen = -1;
75 fMatchCloseParen = -1;
76 fCaptureName = NULL;
77 fLastSetLiteral = U_SENTINEL;
78
79 if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
80 status = rxp->fDeferredStatus;
81 }
82 }
83
84 static const UChar chAmp = 0x26; // '&'
85 static const UChar chDash = 0x2d; // '-'
86
87
88 //------------------------------------------------------------------------------
89 //
90 // Destructor
91 //
92 //------------------------------------------------------------------------------
~RegexCompile()93 RegexCompile::~RegexCompile() {
94 delete fCaptureName; // Normally will be NULL, but can exist if pattern
95 // compilation stops with a syntax error.
96 }
97
addCategory(UnicodeSet * set,int32_t value,UErrorCode & ec)98 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
99 set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
100 }
101
102 //------------------------------------------------------------------------------
103 //
104 // Compile regex pattern. The state machine for rexexp pattern parsing is here.
105 // The state tables are hand-written in the file regexcst.txt,
106 // and converted to the form used here by a perl
107 // script regexcst.pl
108 //
109 //------------------------------------------------------------------------------
compile(const UnicodeString & pat,UParseError & pp,UErrorCode & e)110 void RegexCompile::compile(
111 const UnicodeString &pat, // Source pat to be compiled.
112 UParseError &pp, // Error position info
113 UErrorCode &e) // Error Code
114 {
115 fRXPat->fPatternString = new UnicodeString(pat);
116 UText patternText = UTEXT_INITIALIZER;
117 utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
118
119 if (U_SUCCESS(e)) {
120 compile(&patternText, pp, e);
121 utext_close(&patternText);
122 }
123 }
124
125 //
126 // compile, UText mode
127 // All the work is actually done here.
128 //
compile(UText * pat,UParseError & pp,UErrorCode & e)129 void RegexCompile::compile(
130 UText *pat, // Source pat to be compiled.
131 UParseError &pp, // Error position info
132 UErrorCode &e) // Error Code
133 {
134 fStatus = &e;
135 fParseErr = &pp;
136 fStackPtr = 0;
137 fStack[fStackPtr] = 0;
138
139 if (U_FAILURE(*fStatus)) {
140 return;
141 }
142
143 // There should be no pattern stuff in the RegexPattern object. They can not be reused.
144 U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
145
146 // Prepare the RegexPattern object to receive the compiled pattern.
147 fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
148 if (U_FAILURE(*fStatus)) {
149 return;
150 }
151
152 // Initialize the pattern scanning state machine
153 fPatternLength = utext_nativeLength(pat);
154 uint16_t state = 1;
155 const RegexTableEl *tableEl;
156
157 // UREGEX_LITERAL force entire pattern to be treated as a literal string.
158 if (fModeFlags & UREGEX_LITERAL) {
159 fQuoteMode = TRUE;
160 }
161
162 nextChar(fC); // Fetch the first char from the pattern string.
163
164 //
165 // Main loop for the regex pattern parsing state machine.
166 // Runs once per state transition.
167 // Each time through optionally performs, depending on the state table,
168 // - an advance to the the next pattern char
169 // - an action to be performed.
170 // - pushing or popping a state to/from the local state return stack.
171 // file regexcst.txt is the source for the state table. The logic behind
172 // recongizing the pattern syntax is there, not here.
173 //
174 for (;;) {
175 // Bail out if anything has gone wrong.
176 // Regex pattern parsing stops on the first error encountered.
177 if (U_FAILURE(*fStatus)) {
178 break;
179 }
180
181 U_ASSERT(state != 0);
182
183 // Find the state table element that matches the input char from the pattern, or the
184 // class of the input character. Start with the first table row for this
185 // state, then linearly scan forward until we find a row that matches the
186 // character. The last row for each state always matches all characters, so
187 // the search will stop there, if not before.
188 //
189 tableEl = &gRuleParseStateTable[state];
190 REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ",
191 fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
192
193 for (;;) { // loop through table rows belonging to this state, looking for one
194 // that matches the current input char.
195 REGEX_SCAN_DEBUG_PRINTF(("."));
196 if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) {
197 // Table row specified an individual character, not a set, and
198 // the input character is not quoted, and
199 // the input character matched it.
200 break;
201 }
202 if (tableEl->fCharClass == 255) {
203 // Table row specified default, match anything character class.
204 break;
205 }
206 if (tableEl->fCharClass == 254 && fC.fQuoted) {
207 // Table row specified "quoted" and the char was quoted.
208 break;
209 }
210 if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1) {
211 // Table row specified eof and we hit eof on the input.
212 break;
213 }
214
215 if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class &&
216 fC.fQuoted == FALSE && // char is not escaped &&
217 fC.fChar != (UChar32)-1) { // char is not EOF
218 U_ASSERT(tableEl->fCharClass <= 137);
219 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
220 // Table row specified a character class, or set of characters,
221 // and the current char matches it.
222 break;
223 }
224 }
225
226 // No match on this row, advance to the next row for this state,
227 tableEl++;
228 }
229 REGEX_SCAN_DEBUG_PRINTF(("\n"));
230
231 //
232 // We've found the row of the state table that matches the current input
233 // character from the rules string.
234 // Perform any action specified by this row in the state table.
235 if (doParseActions(tableEl->fAction) == FALSE) {
236 // Break out of the state machine loop if the
237 // the action signalled some kind of error, or
238 // the action was to exit, occurs on normal end-of-rules-input.
239 break;
240 }
241
242 if (tableEl->fPushState != 0) {
243 fStackPtr++;
244 if (fStackPtr >= kStackSize) {
245 error(U_REGEX_INTERNAL_ERROR);
246 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
247 fStackPtr--;
248 }
249 fStack[fStackPtr] = tableEl->fPushState;
250 }
251
252 //
253 // NextChar. This is where characters are actually fetched from the pattern.
254 // Happens under control of the 'n' tag in the state table.
255 //
256 if (tableEl->fNextChar) {
257 nextChar(fC);
258 }
259
260 // Get the next state from the table entry, or from the
261 // state stack if the next state was specified as "pop".
262 if (tableEl->fNextState != 255) {
263 state = tableEl->fNextState;
264 } else {
265 state = fStack[fStackPtr];
266 fStackPtr--;
267 if (fStackPtr < 0) {
268 // state stack underflow
269 // This will occur if the user pattern has mis-matched parentheses,
270 // with extra close parens.
271 //
272 fStackPtr++;
273 error(U_REGEX_MISMATCHED_PAREN);
274 }
275 }
276
277 }
278
279 if (U_FAILURE(*fStatus)) {
280 // Bail out if the pattern had errors.
281 // Set stack cleanup: a successful compile would have left it empty,
282 // but errors can leave temporary sets hanging around.
283 while (!fSetStack.empty()) {
284 delete (UnicodeSet *)fSetStack.pop();
285 }
286 return;
287 }
288
289 //
290 // The pattern has now been read and processed, and the compiled code generated.
291 //
292
293 //
294 // The pattern's fFrameSize so far has accumulated the requirements for
295 // storage for capture parentheses, counters, etc. that are encountered
296 // in the pattern. Add space for the two variables that are always
297 // present in the saved state: the input string position (int64_t) and
298 // the position in the compiled pattern.
299 //
300 allocateStackData(RESTACKFRAME_HDRCOUNT);
301
302 //
303 // Optimization pass 1: NOPs, back-references, and case-folding
304 //
305 stripNOPs();
306
307 //
308 // Get bounds for the minimum and maximum length of a string that this
309 // pattern can match. Used to avoid looking for matches in strings that
310 // are too short.
311 //
312 fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
313
314 //
315 // Optimization pass 2: match start type
316 //
317 matchStartType();
318
319 //
320 // Set up fast latin-1 range sets
321 //
322 int32_t numSets = fRXPat->fSets->size();
323 fRXPat->fSets8 = new Regex8BitSet[numSets];
324 // Null pointer check.
325 if (fRXPat->fSets8 == NULL) {
326 e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
327 return;
328 }
329 int32_t i;
330 for (i=0; i<numSets; i++) {
331 UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
332 fRXPat->fSets8[i].init(s);
333 }
334
335 }
336
337
338
339
340
341 //------------------------------------------------------------------------------
342 //
343 // doParseAction Do some action during regex pattern parsing.
344 // Called by the parse state machine.
345 //
346 // Generation of the match engine PCode happens here, or
347 // in functions called from the parse actions defined here.
348 //
349 //
350 //------------------------------------------------------------------------------
doParseActions(int32_t action)351 UBool RegexCompile::doParseActions(int32_t action)
352 {
353 UBool returnVal = TRUE;
354
355 switch ((Regex_PatternParseAction)action) {
356
357 case doPatStart:
358 // Start of pattern compiles to:
359 //0 SAVE 2 Fall back to position of FAIL
360 //1 jmp 3
361 //2 FAIL Stop if we ever reach here.
362 //3 NOP Dummy, so start of pattern looks the same as
363 // the start of an ( grouping.
364 //4 NOP Resreved, will be replaced by a save if there are
365 // OR | operators at the top level
366 appendOp(URX_STATE_SAVE, 2);
367 appendOp(URX_JMP, 3);
368 appendOp(URX_FAIL, 0);
369
370 // Standard open nonCapture paren action emits the two NOPs and
371 // sets up the paren stack frame.
372 doParseActions(doOpenNonCaptureParen);
373 break;
374
375 case doPatFinish:
376 // We've scanned to the end of the pattern
377 // The end of pattern compiles to:
378 // URX_END
379 // which will stop the runtime match engine.
380 // Encountering end of pattern also behaves like a close paren,
381 // and forces fixups of the State Save at the beginning of the compiled pattern
382 // and of any OR operations at the top level.
383 //
384 handleCloseParen();
385 if (fParenStack.size() > 0) {
386 // Missing close paren in pattern.
387 error(U_REGEX_MISMATCHED_PAREN);
388 }
389
390 // add the END operation to the compiled pattern.
391 appendOp(URX_END, 0);
392
393 // Terminate the pattern compilation state machine.
394 returnVal = FALSE;
395 break;
396
397
398
399 case doOrOperator:
400 // Scanning a '|', as in (A|B)
401 {
402 // Generate code for any pending literals preceding the '|'
403 fixLiterals(FALSE);
404
405 // Insert a SAVE operation at the start of the pattern section preceding
406 // this OR at this level. This SAVE will branch the match forward
407 // to the right hand side of the OR in the event that the left hand
408 // side fails to match and backtracks. Locate the position for the
409 // save from the location on the top of the parentheses stack.
410 int32_t savePosition = fParenStack.popi();
411 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
412 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location
413 op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
414 fRXPat->fCompiledPat->setElementAt(op, savePosition);
415
416 // Append an JMP operation into the compiled pattern. The operand for
417 // the JMP will eventually be the location following the ')' for the
418 // group. This will be patched in later, when the ')' is encountered.
419 appendOp(URX_JMP, 0);
420
421 // Push the position of the newly added JMP op onto the parentheses stack.
422 // This registers if for fixup when this block's close paren is encountered.
423 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
424
425 // Append a NOP to the compiled pattern. This is the slot reserved
426 // for a SAVE in the event that there is yet another '|' following
427 // this one.
428 appendOp(URX_NOP, 0);
429 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
430 }
431 break;
432
433
434 case doBeginNamedCapture:
435 // Scanning (?<letter.
436 // The first letter of the name will come through again under doConinueNamedCapture.
437 fCaptureName = new UnicodeString();
438 if (fCaptureName == NULL) {
439 error(U_MEMORY_ALLOCATION_ERROR);
440 }
441 break;
442
443 case doContinueNamedCapture:
444 fCaptureName->append(fC.fChar);
445 break;
446
447 case doBadNamedCapture:
448 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
449 break;
450
451 case doOpenCaptureParen:
452 // Open Capturing Paren, possibly named.
453 // Compile to a
454 // - NOP, which later may be replaced by a save-state if the
455 // parenthesized group gets a * quantifier, followed by
456 // - START_CAPTURE n where n is stack frame offset to the capture group variables.
457 // - NOP, which may later be replaced by a save-state if there
458 // is an '|' alternation within the parens.
459 //
460 // Each capture group gets three slots in the save stack frame:
461 // 0: Capture Group start position (in input string being matched.)
462 // 1: Capture Group end position.
463 // 2: Start of Match-in-progress.
464 // The first two locations are for a completed capture group, and are
465 // referred to by back references and the like.
466 // The third location stores the capture start position when an START_CAPTURE is
467 // encountered. This will be promoted to a completed capture when (and if) the corresponding
468 // END_CAPTURE is encountered.
469 {
470 fixLiterals();
471 appendOp(URX_NOP, 0);
472 int32_t varsLoc = allocateStackData(3); // Reserve three slots in match stack frame.
473 appendOp(URX_START_CAPTURE, varsLoc);
474 appendOp(URX_NOP, 0);
475
476 // On the Parentheses stack, start a new frame and add the postions
477 // of the two NOPs. Depending on what follows in the pattern, the
478 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
479 // address of the end of the parenthesized group.
480 fParenStack.push(fModeFlags, *fStatus); // Match mode state
481 fParenStack.push(capturing, *fStatus); // Frame type.
482 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location
483 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
484
485 // Save the mapping from group number to stack frame variable position.
486 fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
487
488 // If this is a named capture group, add the name->group number mapping.
489 if (fCaptureName != NULL) {
490 if (!fRXPat->initNamedCaptureMap()) {
491 if (U_SUCCESS(*fStatus)) {
492 error(fRXPat->fDeferredStatus);
493 }
494 break;
495 }
496 int32_t groupNumber = fRXPat->fGroupMap->size();
497 int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
498 fCaptureName = NULL; // hash table takes ownership of the name (key) string.
499 if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
500 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
501 }
502 }
503 }
504 break;
505
506 case doOpenNonCaptureParen:
507 // Open non-caputuring (grouping only) Paren.
508 // Compile to a
509 // - NOP, which later may be replaced by a save-state if the
510 // parenthesized group gets a * quantifier, followed by
511 // - NOP, which may later be replaced by a save-state if there
512 // is an '|' alternation within the parens.
513 {
514 fixLiterals();
515 appendOp(URX_NOP, 0);
516 appendOp(URX_NOP, 0);
517
518 // On the Parentheses stack, start a new frame and add the postions
519 // of the two NOPs.
520 fParenStack.push(fModeFlags, *fStatus); // Match mode state
521 fParenStack.push(plain, *fStatus); // Begin a new frame.
522 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
523 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
524 }
525 break;
526
527
528 case doOpenAtomicParen:
529 // Open Atomic Paren. (?>
530 // Compile to a
531 // - NOP, which later may be replaced if the parenthesized group
532 // has a quantifier, followed by
533 // - STO_SP save state stack position, so it can be restored at the ")"
534 // - NOP, which may later be replaced by a save-state if there
535 // is an '|' alternation within the parens.
536 {
537 fixLiterals();
538 appendOp(URX_NOP, 0);
539 int32_t varLoc = allocateData(1); // Reserve a data location for saving the state stack ptr.
540 appendOp(URX_STO_SP, varLoc);
541 appendOp(URX_NOP, 0);
542
543 // On the Parentheses stack, start a new frame and add the postions
544 // of the two NOPs. Depending on what follows in the pattern, the
545 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
546 // address of the end of the parenthesized group.
547 fParenStack.push(fModeFlags, *fStatus); // Match mode state
548 fParenStack.push(atomic, *fStatus); // Frame type.
549 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP
550 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
551 }
552 break;
553
554
555 case doOpenLookAhead:
556 // Positive Look-ahead (?= stuff )
557 //
558 // Note: Addition of transparent input regions, with the need to
559 // restore the original regions when failing out of a lookahead
560 // block, complicated this sequence. Some combined opcodes
561 // might make sense - or might not, lookahead aren't that common.
562 //
563 // Caution: min match length optimization knows about this
564 // sequence; don't change without making updates there too.
565 //
566 // Compiles to
567 // 1 LA_START dataLoc Saves SP, Input Pos, Active input region.
568 // 2. STATE_SAVE 4 on failure of lookahead, goto 4
569 // 3 JMP 6 continue ...
570 //
571 // 4. LA_END Look Ahead failed. Restore regions.
572 // 5. BACKTRACK and back track again.
573 //
574 // 6. NOP reserved for use by quantifiers on the block.
575 // Look-ahead can't have quantifiers, but paren stack
576 // compile time conventions require the slot anyhow.
577 // 7. NOP may be replaced if there is are '|' ops in the block.
578 // 8. code for parenthesized stuff.
579 // 9. LA_END
580 //
581 // Four data slots are reserved, for saving state on entry to the look-around
582 // 0: stack pointer on entry.
583 // 1: input position on entry.
584 // 2: fActiveStart, the active bounds start on entry.
585 // 3: fActiveLimit, the active bounds limit on entry.
586 {
587 fixLiterals();
588 int32_t dataLoc = allocateData(4);
589 appendOp(URX_LA_START, dataLoc);
590 appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
591 appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
592 appendOp(URX_LA_END, dataLoc);
593 appendOp(URX_BACKTRACK, 0);
594 appendOp(URX_NOP, 0);
595 appendOp(URX_NOP, 0);
596
597 // On the Parentheses stack, start a new frame and add the postions
598 // of the NOPs.
599 fParenStack.push(fModeFlags, *fStatus); // Match mode state
600 fParenStack.push(lookAhead, *fStatus); // Frame type.
601 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
602 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
603 }
604 break;
605
606 case doOpenLookAheadNeg:
607 // Negated Lookahead. (?! stuff )
608 // Compiles to
609 // 1. LA_START dataloc
610 // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state,
611 // // which continues with the match.
612 // 3. NOP // Std. Open Paren sequence, for possible '|'
613 // 4. code for parenthesized stuff.
614 // 5. LA_END // Cut back stack, remove saved state from step 2.
615 // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails.
616 // 7. END_LA // Restore match region, in case look-ahead was using
617 // an alternate (transparent) region.
618 // Four data slots are reserved, for saving state on entry to the look-around
619 // 0: stack pointer on entry.
620 // 1: input position on entry.
621 // 2: fActiveStart, the active bounds start on entry.
622 // 3: fActiveLimit, the active bounds limit on entry.
623 {
624 fixLiterals();
625 int32_t dataLoc = allocateData(4);
626 appendOp(URX_LA_START, dataLoc);
627 appendOp(URX_STATE_SAVE, 0); // dest address will be patched later.
628 appendOp(URX_NOP, 0);
629
630 // On the Parentheses stack, start a new frame and add the postions
631 // of the StateSave and NOP.
632 fParenStack.push(fModeFlags, *fStatus); // Match mode state
633 fParenStack.push(negLookAhead, *fStatus); // Frame type
634 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location
635 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
636
637 // Instructions #5 - #7 will be added when the ')' is encountered.
638 }
639 break;
640
641 case doOpenLookBehind:
642 {
643 // Compile a (?<= look-behind open paren.
644 //
645 // Compiles to
646 // 0 URX_LB_START dataLoc
647 // 1 URX_LB_CONT dataLoc
648 // 2 MinMatchLen
649 // 3 MaxMatchLen
650 // 4 URX_NOP Standard '(' boilerplate.
651 // 5 URX_NOP Reserved slot for use with '|' ops within (block).
652 // 6 <code for LookBehind expression>
653 // 7 URX_LB_END dataLoc # Check match len, restore input len
654 // 8 URX_LA_END dataLoc # Restore stack, input pos
655 //
656 // Allocate a block of matcher data, to contain (when running a match)
657 // 0: Stack ptr on entry
658 // 1: Input Index on entry
659 // 2: fActiveStart, the active bounds start on entry.
660 // 3: fActiveLimit, the active bounds limit on entry.
661 // 4: Start index of match current match attempt.
662 // The first four items must match the layout of data for LA_START / LA_END
663
664 // Generate match code for any pending literals.
665 fixLiterals();
666
667 // Allocate data space
668 int32_t dataLoc = allocateData(5);
669
670 // Emit URX_LB_START
671 appendOp(URX_LB_START, dataLoc);
672
673 // Emit URX_LB_CONT
674 appendOp(URX_LB_CONT, dataLoc);
675 appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later.
676 appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later.
677
678 // Emit the NOPs
679 appendOp(URX_NOP, 0);
680 appendOp(URX_NOP, 0);
681
682 // On the Parentheses stack, start a new frame and add the postions
683 // of the URX_LB_CONT and the NOP.
684 fParenStack.push(fModeFlags, *fStatus); // Match mode state
685 fParenStack.push(lookBehind, *fStatus); // Frame type
686 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
687 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
688
689 // The final two instructions will be added when the ')' is encountered.
690 }
691
692 break;
693
694 case doOpenLookBehindNeg:
695 {
696 // Compile a (?<! negated look-behind open paren.
697 //
698 // Compiles to
699 // 0 URX_LB_START dataLoc # Save entry stack, input len
700 // 1 URX_LBN_CONT dataLoc # Iterate possible match positions
701 // 2 MinMatchLen
702 // 3 MaxMatchLen
703 // 4 continueLoc (9)
704 // 5 URX_NOP Standard '(' boilerplate.
705 // 6 URX_NOP Reserved slot for use with '|' ops within (block).
706 // 7 <code for LookBehind expression>
707 // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL
708 // 9 ...
709 //
710 // Allocate a block of matcher data, to contain (when running a match)
711 // 0: Stack ptr on entry
712 // 1: Input Index on entry
713 // 2: fActiveStart, the active bounds start on entry.
714 // 3: fActiveLimit, the active bounds limit on entry.
715 // 4: Start index of match current match attempt.
716 // The first four items must match the layout of data for LA_START / LA_END
717
718 // Generate match code for any pending literals.
719 fixLiterals();
720
721 // Allocate data space
722 int32_t dataLoc = allocateData(5);
723
724 // Emit URX_LB_START
725 appendOp(URX_LB_START, dataLoc);
726
727 // Emit URX_LBN_CONT
728 appendOp(URX_LBN_CONT, dataLoc);
729 appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later.
730 appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later.
731 appendOp(URX_RESERVED_OP, 0); // Continue Loc. To be filled later.
732
733 // Emit the NOPs
734 appendOp(URX_NOP, 0);
735 appendOp(URX_NOP, 0);
736
737 // On the Parentheses stack, start a new frame and add the postions
738 // of the URX_LB_CONT and the NOP.
739 fParenStack.push(fModeFlags, *fStatus); // Match mode state
740 fParenStack.push(lookBehindN, *fStatus); // Frame type
741 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
742 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
743
744 // The final two instructions will be added when the ')' is encountered.
745 }
746 break;
747
748 case doConditionalExpr:
749 // Conditionals such as (?(1)a:b)
750 case doPerlInline:
751 // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them.
752 error(U_REGEX_UNIMPLEMENTED);
753 break;
754
755
756 case doCloseParen:
757 handleCloseParen();
758 if (fParenStack.size() <= 0) {
759 // Extra close paren, or missing open paren.
760 error(U_REGEX_MISMATCHED_PAREN);
761 }
762 break;
763
764 case doNOP:
765 break;
766
767
768 case doBadOpenParenType:
769 case doRuleError:
770 error(U_REGEX_RULE_SYNTAX);
771 break;
772
773
774 case doMismatchedParenErr:
775 error(U_REGEX_MISMATCHED_PAREN);
776 break;
777
778 case doPlus:
779 // Normal '+' compiles to
780 // 1. stuff to be repeated (already built)
781 // 2. jmp-sav 1
782 // 3. ...
783 //
784 // Or, if the item to be repeated can match a zero length string,
785 // 1. STO_INP_LOC data-loc
786 // 2. body of stuff to be repeated
787 // 3. JMP_SAV_X 2
788 // 4. ...
789
790 //
791 // Or, if the item to be repeated is simple
792 // 1. Item to be repeated.
793 // 2. LOOP_SR_I set number (assuming repeated item is a set ref)
794 // 3. LOOP_C stack location
795 {
796 int32_t topLoc = blockTopLoc(FALSE); // location of item #1
797 int32_t frameLoc;
798
799 // Check for simple constructs, which may get special optimized code.
800 if (topLoc == fRXPat->fCompiledPat->size() - 1) {
801 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
802
803 if (URX_TYPE(repeatedOp) == URX_SETREF) {
804 // Emit optimized code for [char set]+
805 appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
806 frameLoc = allocateStackData(1);
807 appendOp(URX_LOOP_C, frameLoc);
808 break;
809 }
810
811 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
812 URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
813 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
814 // Emit Optimized code for .+ operations.
815 int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
816 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
817 // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
818 loopOpI |= 1;
819 }
820 if (fModeFlags & UREGEX_UNIX_LINES) {
821 loopOpI |= 2;
822 }
823 appendOp(loopOpI);
824 frameLoc = allocateStackData(1);
825 appendOp(URX_LOOP_C, frameLoc);
826 break;
827 }
828
829 }
830
831 // General case.
832
833 // Check for minimum match length of zero, which requires
834 // extra loop-breaking code.
835 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
836 // Zero length match is possible.
837 // Emit the code sequence that can handle it.
838 insertOp(topLoc);
839 frameLoc = allocateStackData(1);
840
841 int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
842 fRXPat->fCompiledPat->setElementAt(op, topLoc);
843
844 appendOp(URX_JMP_SAV_X, topLoc+1);
845 } else {
846 // Simpler code when the repeated body must match something non-empty
847 appendOp(URX_JMP_SAV, topLoc);
848 }
849 }
850 break;
851
852 case doNGPlus:
853 // Non-greedy '+?' compiles to
854 // 1. stuff to be repeated (already built)
855 // 2. state-save 1
856 // 3. ...
857 {
858 int32_t topLoc = blockTopLoc(FALSE);
859 appendOp(URX_STATE_SAVE, topLoc);
860 }
861 break;
862
863
864 case doOpt:
865 // Normal (greedy) ? quantifier.
866 // Compiles to
867 // 1. state save 3
868 // 2. body of optional block
869 // 3. ...
870 // Insert the state save into the compiled pattern, and we're done.
871 {
872 int32_t saveStateLoc = blockTopLoc(TRUE);
873 int32_t saveStateOp = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
874 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
875 }
876 break;
877
878 case doNGOpt:
879 // Non-greedy ?? quantifier
880 // compiles to
881 // 1. jmp 4
882 // 2. body of optional block
883 // 3 jmp 5
884 // 4. state save 2
885 // 5 ...
886 // This code is less than ideal, with two jmps instead of one, because we can only
887 // insert one instruction at the top of the block being iterated.
888 {
889 int32_t jmp1_loc = blockTopLoc(TRUE);
890 int32_t jmp2_loc = fRXPat->fCompiledPat->size();
891
892 int32_t jmp1_op = buildOp(URX_JMP, jmp2_loc+1);
893 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
894
895 appendOp(URX_JMP, jmp2_loc+2);
896
897 appendOp(URX_STATE_SAVE, jmp1_loc+1);
898 }
899 break;
900
901
902 case doStar:
903 // Normal (greedy) * quantifier.
904 // Compiles to
905 // 1. STATE_SAVE 4
906 // 2. body of stuff being iterated over
907 // 3. JMP_SAV 2
908 // 4. ...
909 //
910 // Or, if the body is a simple [Set],
911 // 1. LOOP_SR_I set number
912 // 2. LOOP_C stack location
913 // ...
914 //
915 // Or if this is a .*
916 // 1. LOOP_DOT_I (. matches all mode flag)
917 // 2. LOOP_C stack location
918 //
919 // Or, if the body can match a zero-length string, to inhibit infinite loops,
920 // 1. STATE_SAVE 5
921 // 2. STO_INP_LOC data-loc
922 // 3. body of stuff
923 // 4. JMP_SAV_X 2
924 // 5. ...
925 {
926 // location of item #1, the STATE_SAVE
927 int32_t topLoc = blockTopLoc(FALSE);
928 int32_t dataLoc = -1;
929
930 // Check for simple *, where the construct being repeated
931 // compiled to single opcode, and might be optimizable.
932 if (topLoc == fRXPat->fCompiledPat->size() - 1) {
933 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
934
935 if (URX_TYPE(repeatedOp) == URX_SETREF) {
936 // Emit optimized code for a [char set]*
937 int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
938 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
939 dataLoc = allocateStackData(1);
940 appendOp(URX_LOOP_C, dataLoc);
941 break;
942 }
943
944 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
945 URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
946 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
947 // Emit Optimized code for .* operations.
948 int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
949 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
950 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
951 loopOpI |= 1;
952 }
953 if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
954 loopOpI |= 2;
955 }
956 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
957 dataLoc = allocateStackData(1);
958 appendOp(URX_LOOP_C, dataLoc);
959 break;
960 }
961 }
962
963 // Emit general case code for this *
964 // The optimizations did not apply.
965
966 int32_t saveStateLoc = blockTopLoc(TRUE);
967 int32_t jmpOp = buildOp(URX_JMP_SAV, saveStateLoc+1);
968
969 // Check for minimum match length of zero, which requires
970 // extra loop-breaking code.
971 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
972 insertOp(saveStateLoc);
973 dataLoc = allocateStackData(1);
974
975 int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
976 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
977 jmpOp = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
978 }
979
980 // Locate the position in the compiled pattern where the match will continue
981 // after completing the *. (4 or 5 in the comment above)
982 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
983
984 // Put together the save state op and store it into the compiled code.
985 int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
986 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
987
988 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
989 appendOp(jmpOp);
990 }
991 break;
992
993 case doNGStar:
994 // Non-greedy *? quantifier
995 // compiles to
996 // 1. JMP 3
997 // 2. body of stuff being iterated over
998 // 3. STATE_SAVE 2
999 // 4 ...
1000 {
1001 int32_t jmpLoc = blockTopLoc(TRUE); // loc 1.
1002 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3.
1003 int32_t jmpOp = buildOp(URX_JMP, saveLoc);
1004 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
1005 appendOp(URX_STATE_SAVE, jmpLoc+1);
1006 }
1007 break;
1008
1009
1010 case doIntervalInit:
1011 // The '{' opening an interval quantifier was just scanned.
1012 // Init the counter varaiables that will accumulate the values as the digits
1013 // are scanned.
1014 fIntervalLow = 0;
1015 fIntervalUpper = -1;
1016 break;
1017
1018 case doIntevalLowerDigit:
1019 // Scanned a digit from the lower value of an {lower,upper} interval
1020 {
1021 int32_t digitValue = u_charDigitValue(fC.fChar);
1022 U_ASSERT(digitValue >= 0);
1023 int64_t val = (int64_t)fIntervalLow*10 + digitValue;
1024 if (val > INT32_MAX) {
1025 error(U_REGEX_NUMBER_TOO_BIG);
1026 } else {
1027 fIntervalLow = (int32_t)val;
1028 }
1029 }
1030 break;
1031
1032 case doIntervalUpperDigit:
1033 // Scanned a digit from the upper value of an {lower,upper} interval
1034 {
1035 if (fIntervalUpper < 0) {
1036 fIntervalUpper = 0;
1037 }
1038 int32_t digitValue = u_charDigitValue(fC.fChar);
1039 U_ASSERT(digitValue >= 0);
1040 int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
1041 if (val > INT32_MAX) {
1042 error(U_REGEX_NUMBER_TOO_BIG);
1043 } else {
1044 fIntervalUpper = (int32_t)val;
1045 }
1046 }
1047 break;
1048
1049 case doIntervalSame:
1050 // Scanned a single value interval like {27}. Upper = Lower.
1051 fIntervalUpper = fIntervalLow;
1052 break;
1053
1054 case doInterval:
1055 // Finished scanning a normal {lower,upper} interval. Generate the code for it.
1056 if (compileInlineInterval() == FALSE) {
1057 compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1058 }
1059 break;
1060
1061 case doPossessiveInterval:
1062 // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
1063 {
1064 // Remember the loc for the top of the block being looped over.
1065 // (Can not reserve a slot in the compiled pattern at this time, because
1066 // compileInterval needs to reserve also, and blockTopLoc can only reserve
1067 // once per block.)
1068 int32_t topLoc = blockTopLoc(FALSE);
1069
1070 // Produce normal looping code.
1071 compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1072
1073 // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1074 // just as if the loop was inclosed in atomic parentheses.
1075
1076 // First the STO_SP before the start of the loop
1077 insertOp(topLoc);
1078
1079 int32_t varLoc = allocateData(1); // Reserve a data location for saving the
1080 int32_t op = buildOp(URX_STO_SP, varLoc);
1081 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1082
1083 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1084 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1085 loopOp++; // point LoopOp after the just-inserted STO_SP
1086 fRXPat->fCompiledPat->push(loopOp, *fStatus);
1087
1088 // Then the LD_SP after the end of the loop
1089 appendOp(URX_LD_SP, varLoc);
1090 }
1091
1092 break;
1093
1094 case doNGInterval:
1095 // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it.
1096 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1097 break;
1098
1099 case doIntervalError:
1100 error(U_REGEX_BAD_INTERVAL);
1101 break;
1102
1103 case doLiteralChar:
1104 // We've just scanned a "normal" character from the pattern,
1105 literalChar(fC.fChar);
1106 break;
1107
1108
1109 case doEscapedLiteralChar:
1110 // We've just scanned an backslashed escaped character with no
1111 // special meaning. It represents itself.
1112 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1113 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z]
1114 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z]
1115 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1116 }
1117 literalChar(fC.fChar);
1118 break;
1119
1120
1121 case doDotAny:
1122 // scanned a ".", match any single character.
1123 {
1124 fixLiterals(FALSE);
1125 if (fModeFlags & UREGEX_DOTALL) {
1126 appendOp(URX_DOTANY_ALL, 0);
1127 } else if (fModeFlags & UREGEX_UNIX_LINES) {
1128 appendOp(URX_DOTANY_UNIX, 0);
1129 } else {
1130 appendOp(URX_DOTANY, 0);
1131 }
1132 }
1133 break;
1134
1135 case doCaret:
1136 {
1137 fixLiterals(FALSE);
1138 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1139 appendOp(URX_CARET, 0);
1140 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1141 appendOp(URX_CARET_M, 0);
1142 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1143 appendOp(URX_CARET, 0); // Only testing true start of input.
1144 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1145 appendOp(URX_CARET_M_UNIX, 0);
1146 }
1147 }
1148 break;
1149
1150 case doDollar:
1151 {
1152 fixLiterals(FALSE);
1153 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1154 appendOp(URX_DOLLAR, 0);
1155 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1156 appendOp(URX_DOLLAR_M, 0);
1157 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1158 appendOp(URX_DOLLAR_D, 0);
1159 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1160 appendOp(URX_DOLLAR_MD, 0);
1161 }
1162 }
1163 break;
1164
1165 case doBackslashA:
1166 fixLiterals(FALSE);
1167 appendOp(URX_CARET, 0);
1168 break;
1169
1170 case doBackslashB:
1171 {
1172 #if UCONFIG_NO_BREAK_ITERATION==1
1173 if (fModeFlags & UREGEX_UWORD) {
1174 error(U_UNSUPPORTED_ERROR);
1175 }
1176 #endif
1177 fixLiterals(FALSE);
1178 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1179 appendOp(op, 1);
1180 }
1181 break;
1182
1183 case doBackslashb:
1184 {
1185 #if UCONFIG_NO_BREAK_ITERATION==1
1186 if (fModeFlags & UREGEX_UWORD) {
1187 error(U_UNSUPPORTED_ERROR);
1188 }
1189 #endif
1190 fixLiterals(FALSE);
1191 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1192 appendOp(op, 0);
1193 }
1194 break;
1195
1196 case doBackslashD:
1197 fixLiterals(FALSE);
1198 appendOp(URX_BACKSLASH_D, 1);
1199 break;
1200
1201 case doBackslashd:
1202 fixLiterals(FALSE);
1203 appendOp(URX_BACKSLASH_D, 0);
1204 break;
1205
1206 case doBackslashG:
1207 fixLiterals(FALSE);
1208 appendOp(URX_BACKSLASH_G, 0);
1209 break;
1210
1211 case doBackslashH:
1212 fixLiterals(FALSE);
1213 appendOp(URX_BACKSLASH_H, 1);
1214 break;
1215
1216 case doBackslashh:
1217 fixLiterals(FALSE);
1218 appendOp(URX_BACKSLASH_H, 0);
1219 break;
1220
1221 case doBackslashR:
1222 fixLiterals(FALSE);
1223 appendOp(URX_BACKSLASH_R, 0);
1224 break;
1225
1226 case doBackslashS:
1227 fixLiterals(FALSE);
1228 appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1229 break;
1230
1231 case doBackslashs:
1232 fixLiterals(FALSE);
1233 appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1234 break;
1235
1236 case doBackslashV:
1237 fixLiterals(FALSE);
1238 appendOp(URX_BACKSLASH_V, 1);
1239 break;
1240
1241 case doBackslashv:
1242 fixLiterals(FALSE);
1243 appendOp(URX_BACKSLASH_V, 0);
1244 break;
1245
1246 case doBackslashW:
1247 fixLiterals(FALSE);
1248 appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1249 break;
1250
1251 case doBackslashw:
1252 fixLiterals(FALSE);
1253 appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1254 break;
1255
1256 case doBackslashX:
1257 #if UCONFIG_NO_BREAK_ITERATION==1
1258 // Grapheme Cluster Boundary requires ICU break iteration.
1259 error(U_UNSUPPORTED_ERROR);
1260 #endif
1261 fixLiterals(FALSE);
1262 appendOp(URX_BACKSLASH_X, 0);
1263 break;
1264
1265 case doBackslashZ:
1266 fixLiterals(FALSE);
1267 appendOp(URX_DOLLAR, 0);
1268 break;
1269
1270 case doBackslashz:
1271 fixLiterals(FALSE);
1272 appendOp(URX_BACKSLASH_Z, 0);
1273 break;
1274
1275 case doEscapeError:
1276 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1277 break;
1278
1279 case doExit:
1280 fixLiterals(FALSE);
1281 returnVal = FALSE;
1282 break;
1283
1284 case doProperty:
1285 {
1286 fixLiterals(FALSE);
1287 UnicodeSet *theSet = scanProp();
1288 compileSet(theSet);
1289 }
1290 break;
1291
1292 case doNamedChar:
1293 {
1294 UChar32 c = scanNamedChar();
1295 literalChar(c);
1296 }
1297 break;
1298
1299
1300 case doBackRef:
1301 // BackReference. Somewhat unusual in that the front-end can not completely parse
1302 // the regular expression, because the number of digits to be consumed
1303 // depends on the number of capture groups that have been defined. So
1304 // we have to do it here instead.
1305 {
1306 int32_t numCaptureGroups = fRXPat->fGroupMap->size();
1307 int32_t groupNum = 0;
1308 UChar32 c = fC.fChar;
1309
1310 for (;;) {
1311 // Loop once per digit, for max allowed number of digits in a back reference.
1312 int32_t digit = u_charDigitValue(c);
1313 groupNum = groupNum * 10 + digit;
1314 if (groupNum >= numCaptureGroups) {
1315 break;
1316 }
1317 c = peekCharLL();
1318 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
1319 break;
1320 }
1321 nextCharLL();
1322 }
1323
1324 // Scan of the back reference in the source regexp is complete. Now generate
1325 // the compiled code for it.
1326 // Because capture groups can be forward-referenced by back-references,
1327 // we fill the operand with the capture group number. At the end
1328 // of compilation, it will be changed to the variable's location.
1329 U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal escape sequence,
1330 // and shouldn't enter this code path at all.
1331 fixLiterals(FALSE);
1332 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1333 appendOp(URX_BACKREF_I, groupNum);
1334 } else {
1335 appendOp(URX_BACKREF, groupNum);
1336 }
1337 }
1338 break;
1339
1340 case doBeginNamedBackRef:
1341 U_ASSERT(fCaptureName == NULL);
1342 fCaptureName = new UnicodeString;
1343 if (fCaptureName == NULL) {
1344 error(U_MEMORY_ALLOCATION_ERROR);
1345 }
1346 break;
1347
1348 case doContinueNamedBackRef:
1349 fCaptureName->append(fC.fChar);
1350 break;
1351
1352 case doCompleteNamedBackRef:
1353 {
1354 int32_t groupNumber =
1355 fRXPat->fNamedCaptureMap ? uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName) : 0;
1356 if (groupNumber == 0) {
1357 // Group name has not been defined.
1358 // Could be a forward reference. If we choose to support them at some
1359 // future time, extra mechanism will be required at this point.
1360 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1361 } else {
1362 // Given the number, handle identically to a \n numbered back reference.
1363 // See comments above, under doBackRef
1364 fixLiterals(FALSE);
1365 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1366 appendOp(URX_BACKREF_I, groupNumber);
1367 } else {
1368 appendOp(URX_BACKREF, groupNumber);
1369 }
1370 }
1371 delete fCaptureName;
1372 fCaptureName = NULL;
1373 break;
1374 }
1375
1376 case doPossessivePlus:
1377 // Possessive ++ quantifier.
1378 // Compiles to
1379 // 1. STO_SP
1380 // 2. body of stuff being iterated over
1381 // 3. STATE_SAVE 5
1382 // 4. JMP 2
1383 // 5. LD_SP
1384 // 6. ...
1385 //
1386 // Note: TODO: This is pretty inefficient. A mass of saved state is built up
1387 // then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056
1388 //
1389 {
1390 // Emit the STO_SP
1391 int32_t topLoc = blockTopLoc(TRUE);
1392 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
1393 int32_t op = buildOp(URX_STO_SP, stoLoc);
1394 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1395
1396 // Emit the STATE_SAVE
1397 appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1398
1399 // Emit the JMP
1400 appendOp(URX_JMP, topLoc+1);
1401
1402 // Emit the LD_SP
1403 appendOp(URX_LD_SP, stoLoc);
1404 }
1405 break;
1406
1407 case doPossessiveStar:
1408 // Possessive *+ quantifier.
1409 // Compiles to
1410 // 1. STO_SP loc
1411 // 2. STATE_SAVE 5
1412 // 3. body of stuff being iterated over
1413 // 4. JMP 2
1414 // 5. LD_SP loc
1415 // 6 ...
1416 // TODO: do something to cut back the state stack each time through the loop.
1417 {
1418 // Reserve two slots at the top of the block.
1419 int32_t topLoc = blockTopLoc(TRUE);
1420 insertOp(topLoc);
1421
1422 // emit STO_SP loc
1423 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
1424 int32_t op = buildOp(URX_STO_SP, stoLoc);
1425 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1426
1427 // Emit the SAVE_STATE 5
1428 int32_t L7 = fRXPat->fCompiledPat->size()+1;
1429 op = buildOp(URX_STATE_SAVE, L7);
1430 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1431
1432 // Append the JMP operation.
1433 appendOp(URX_JMP, topLoc+1);
1434
1435 // Emit the LD_SP loc
1436 appendOp(URX_LD_SP, stoLoc);
1437 }
1438 break;
1439
1440 case doPossessiveOpt:
1441 // Possessive ?+ quantifier.
1442 // Compiles to
1443 // 1. STO_SP loc
1444 // 2. SAVE_STATE 5
1445 // 3. body of optional block
1446 // 4. LD_SP loc
1447 // 5. ...
1448 //
1449 {
1450 // Reserve two slots at the top of the block.
1451 int32_t topLoc = blockTopLoc(TRUE);
1452 insertOp(topLoc);
1453
1454 // Emit the STO_SP
1455 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
1456 int32_t op = buildOp(URX_STO_SP, stoLoc);
1457 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1458
1459 // Emit the SAVE_STATE
1460 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
1461 op = buildOp(URX_STATE_SAVE, continueLoc);
1462 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1463
1464 // Emit the LD_SP
1465 appendOp(URX_LD_SP, stoLoc);
1466 }
1467 break;
1468
1469
1470 case doBeginMatchMode:
1471 fNewModeFlags = fModeFlags;
1472 fSetModeFlag = TRUE;
1473 break;
1474
1475 case doMatchMode: // (?i) and similar
1476 {
1477 int32_t bit = 0;
1478 switch (fC.fChar) {
1479 case 0x69: /* 'i' */ bit = UREGEX_CASE_INSENSITIVE; break;
1480 case 0x64: /* 'd' */ bit = UREGEX_UNIX_LINES; break;
1481 case 0x6d: /* 'm' */ bit = UREGEX_MULTILINE; break;
1482 case 0x73: /* 's' */ bit = UREGEX_DOTALL; break;
1483 case 0x75: /* 'u' */ bit = 0; /* Unicode casing */ break;
1484 case 0x77: /* 'w' */ bit = UREGEX_UWORD; break;
1485 case 0x78: /* 'x' */ bit = UREGEX_COMMENTS; break;
1486 case 0x2d: /* '-' */ fSetModeFlag = FALSE; break;
1487 default:
1488 UPRV_UNREACHABLE; // Should never happen. Other chars are filtered out
1489 // by the scanner.
1490 }
1491 if (fSetModeFlag) {
1492 fNewModeFlags |= bit;
1493 } else {
1494 fNewModeFlags &= ~bit;
1495 }
1496 }
1497 break;
1498
1499 case doSetMatchMode:
1500 // Emit code to match any pending literals, using the not-yet changed match mode.
1501 fixLiterals();
1502
1503 // We've got a (?i) or similar. The match mode is being changed, but
1504 // the change is not scoped to a parenthesized block.
1505 U_ASSERT(fNewModeFlags < 0);
1506 fModeFlags = fNewModeFlags;
1507
1508 break;
1509
1510
1511 case doMatchModeParen:
1512 // We've got a (?i: or similar. Begin a parenthesized block, save old
1513 // mode flags so they can be restored at the close of the block.
1514 //
1515 // Compile to a
1516 // - NOP, which later may be replaced by a save-state if the
1517 // parenthesized group gets a * quantifier, followed by
1518 // - NOP, which may later be replaced by a save-state if there
1519 // is an '|' alternation within the parens.
1520 {
1521 fixLiterals(FALSE);
1522 appendOp(URX_NOP, 0);
1523 appendOp(URX_NOP, 0);
1524
1525 // On the Parentheses stack, start a new frame and add the postions
1526 // of the two NOPs (a normal non-capturing () frame, except for the
1527 // saving of the orignal mode flags.)
1528 fParenStack.push(fModeFlags, *fStatus);
1529 fParenStack.push(flags, *fStatus); // Frame Marker
1530 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP
1531 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
1532
1533 // Set the current mode flags to the new values.
1534 U_ASSERT(fNewModeFlags < 0);
1535 fModeFlags = fNewModeFlags;
1536 }
1537 break;
1538
1539 case doBadModeFlag:
1540 error(U_REGEX_INVALID_FLAG);
1541 break;
1542
1543 case doSuppressComments:
1544 // We have just scanned a '(?'. We now need to prevent the character scanner from
1545 // treating a '#' as a to-the-end-of-line comment.
1546 // (This Perl compatibility just gets uglier and uglier to do...)
1547 fEOLComments = FALSE;
1548 break;
1549
1550
1551 case doSetAddAmp:
1552 {
1553 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1554 set->add(chAmp);
1555 }
1556 break;
1557
1558 case doSetAddDash:
1559 {
1560 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1561 set->add(chDash);
1562 }
1563 break;
1564
1565 case doSetBackslash_s:
1566 {
1567 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1568 set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1569 break;
1570 }
1571
1572 case doSetBackslash_S:
1573 {
1574 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1575 UnicodeSet SSet;
1576 SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]).complement();
1577 set->addAll(SSet);
1578 break;
1579 }
1580
1581 case doSetBackslash_d:
1582 {
1583 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1584 // TODO - make a static set, ticket 6058.
1585 addCategory(set, U_GC_ND_MASK, *fStatus);
1586 break;
1587 }
1588
1589 case doSetBackslash_D:
1590 {
1591 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1592 UnicodeSet digits;
1593 // TODO - make a static set, ticket 6058.
1594 digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1595 digits.complement();
1596 set->addAll(digits);
1597 break;
1598 }
1599
1600 case doSetBackslash_h:
1601 {
1602 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1603 UnicodeSet h;
1604 h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1605 h.add((UChar32)9); // Tab
1606 set->addAll(h);
1607 break;
1608 }
1609
1610 case doSetBackslash_H:
1611 {
1612 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1613 UnicodeSet h;
1614 h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1615 h.add((UChar32)9); // Tab
1616 h.complement();
1617 set->addAll(h);
1618 break;
1619 }
1620
1621 case doSetBackslash_v:
1622 {
1623 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1624 set->add((UChar32)0x0a, (UChar32)0x0d); // add range
1625 set->add((UChar32)0x85);
1626 set->add((UChar32)0x2028, (UChar32)0x2029);
1627 break;
1628 }
1629
1630 case doSetBackslash_V:
1631 {
1632 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1633 UnicodeSet v;
1634 v.add((UChar32)0x0a, (UChar32)0x0d); // add range
1635 v.add((UChar32)0x85);
1636 v.add((UChar32)0x2028, (UChar32)0x2029);
1637 v.complement();
1638 set->addAll(v);
1639 break;
1640 }
1641
1642 case doSetBackslash_w:
1643 {
1644 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1645 set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1646 break;
1647 }
1648
1649 case doSetBackslash_W:
1650 {
1651 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1652 UnicodeSet SSet;
1653 SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]).complement();
1654 set->addAll(SSet);
1655 break;
1656 }
1657
1658 case doSetBegin:
1659 fixLiterals(FALSE);
1660 fSetStack.push(new UnicodeSet(), *fStatus);
1661 fSetOpStack.push(setStart, *fStatus);
1662 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1663 fSetOpStack.push(setCaseClose, *fStatus);
1664 }
1665 break;
1666
1667 case doSetBeginDifference1:
1668 // We have scanned something like [[abc]-[
1669 // Set up a new UnicodeSet for the set beginning with the just-scanned '['
1670 // Push a Difference operator, which will cause the new set to be subtracted from what
1671 // went before once it is created.
1672 setPushOp(setDifference1);
1673 fSetOpStack.push(setStart, *fStatus);
1674 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1675 fSetOpStack.push(setCaseClose, *fStatus);
1676 }
1677 break;
1678
1679 case doSetBeginIntersection1:
1680 // We have scanned something like [[abc]&[
1681 // Need both the '&' operator and the open '[' operator.
1682 setPushOp(setIntersection1);
1683 fSetOpStack.push(setStart, *fStatus);
1684 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1685 fSetOpStack.push(setCaseClose, *fStatus);
1686 }
1687 break;
1688
1689 case doSetBeginUnion:
1690 // We have scanned something like [[abc][
1691 // Need to handle the union operation explicitly [[abc] | [
1692 setPushOp(setUnion);
1693 fSetOpStack.push(setStart, *fStatus);
1694 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1695 fSetOpStack.push(setCaseClose, *fStatus);
1696 }
1697 break;
1698
1699 case doSetDifference2:
1700 // We have scanned something like [abc--
1701 // Consider this to unambiguously be a set difference operator.
1702 setPushOp(setDifference2);
1703 break;
1704
1705 case doSetEnd:
1706 // Have encountered the ']' that closes a set.
1707 // Force the evaluation of any pending operations within this set,
1708 // leave the completed set on the top of the set stack.
1709 setEval(setEnd);
1710 U_ASSERT(fSetOpStack.peeki()==setStart);
1711 fSetOpStack.popi();
1712 break;
1713
1714 case doSetFinish:
1715 {
1716 // Finished a complete set expression, including all nested sets.
1717 // The close bracket has already triggered clearing out pending set operators,
1718 // the operator stack should be empty and the operand stack should have just
1719 // one entry, the result set.
1720 U_ASSERT(fSetOpStack.empty());
1721 UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
1722 U_ASSERT(fSetStack.empty());
1723 compileSet(theSet);
1724 break;
1725 }
1726
1727 case doSetIntersection2:
1728 // Have scanned something like [abc&&
1729 setPushOp(setIntersection2);
1730 break;
1731
1732 case doSetLiteral:
1733 // Union the just-scanned literal character into the set being built.
1734 // This operation is the highest precedence set operation, so we can always do
1735 // it immediately, without waiting to see what follows. It is necessary to perform
1736 // any pending '-' or '&' operation first, because these have the same precedence
1737 // as union-ing in a literal'
1738 {
1739 setEval(setUnion);
1740 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1741 s->add(fC.fChar);
1742 fLastSetLiteral = fC.fChar;
1743 break;
1744 }
1745
1746 case doSetLiteralEscaped:
1747 // A back-slash escaped literal character was encountered.
1748 // Processing is the same as with setLiteral, above, with the addition of
1749 // the optional check for errors on escaped ASCII letters.
1750 {
1751 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1752 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z]
1753 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z]
1754 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1755 }
1756 setEval(setUnion);
1757 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1758 s->add(fC.fChar);
1759 fLastSetLiteral = fC.fChar;
1760 break;
1761 }
1762
1763 case doSetNamedChar:
1764 // Scanning a \N{UNICODE CHARACTER NAME}
1765 // Aside from the source of the character, the processing is identical to doSetLiteral,
1766 // above.
1767 {
1768 UChar32 c = scanNamedChar();
1769 setEval(setUnion);
1770 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1771 s->add(c);
1772 fLastSetLiteral = c;
1773 break;
1774 }
1775
1776 case doSetNamedRange:
1777 // We have scanned literal-\N{CHAR NAME}. Add the range to the set.
1778 // The left character is already in the set, and is saved in fLastSetLiteral.
1779 // The right side needs to be picked up, the scan is at the 'N'.
1780 // Lower Limit > Upper limit being an error matches both Java
1781 // and ICU UnicodeSet behavior.
1782 {
1783 UChar32 c = scanNamedChar();
1784 if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) {
1785 error(U_REGEX_INVALID_RANGE);
1786 }
1787 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1788 s->add(fLastSetLiteral, c);
1789 fLastSetLiteral = c;
1790 break;
1791 }
1792
1793
1794 case doSetNegate:
1795 // Scanned a '^' at the start of a set.
1796 // Push the negation operator onto the set op stack.
1797 // A twist for case-insensitive matching:
1798 // the case closure operation must happen _before_ negation.
1799 // But the case closure operation will already be on the stack if it's required.
1800 // This requires checking for case closure, and swapping the stack order
1801 // if it is present.
1802 {
1803 int32_t tosOp = fSetOpStack.peeki();
1804 if (tosOp == setCaseClose) {
1805 fSetOpStack.popi();
1806 fSetOpStack.push(setNegation, *fStatus);
1807 fSetOpStack.push(setCaseClose, *fStatus);
1808 } else {
1809 fSetOpStack.push(setNegation, *fStatus);
1810 }
1811 }
1812 break;
1813
1814 case doSetNoCloseError:
1815 error(U_REGEX_MISSING_CLOSE_BRACKET);
1816 break;
1817
1818 case doSetOpError:
1819 error(U_REGEX_RULE_SYNTAX); // -- or && at the end of a set. Illegal.
1820 break;
1821
1822 case doSetPosixProp:
1823 {
1824 UnicodeSet *s = scanPosixProp();
1825 if (s != NULL) {
1826 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1827 tos->addAll(*s);
1828 delete s;
1829 } // else error. scanProp() reported the error status already.
1830 }
1831 break;
1832
1833 case doSetProp:
1834 // Scanned a \p \P within [brackets].
1835 {
1836 UnicodeSet *s = scanProp();
1837 if (s != NULL) {
1838 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1839 tos->addAll(*s);
1840 delete s;
1841 } // else error. scanProp() reported the error status already.
1842 }
1843 break;
1844
1845
1846 case doSetRange:
1847 // We have scanned literal-literal. Add the range to the set.
1848 // The left character is already in the set, and is saved in fLastSetLiteral.
1849 // The right side is the current character.
1850 // Lower Limit > Upper limit being an error matches both Java
1851 // and ICU UnicodeSet behavior.
1852 {
1853
1854 if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) {
1855 error(U_REGEX_INVALID_RANGE);
1856 }
1857 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1858 s->add(fLastSetLiteral, fC.fChar);
1859 break;
1860 }
1861
1862 default:
1863 UPRV_UNREACHABLE;
1864 }
1865
1866 if (U_FAILURE(*fStatus)) {
1867 returnVal = FALSE;
1868 }
1869
1870 return returnVal;
1871 }
1872
1873
1874
1875 //------------------------------------------------------------------------------
1876 //
1877 // literalChar We've encountered a literal character from the pattern,
1878 // or an escape sequence that reduces to a character.
1879 // Add it to the string containing all literal chars/strings from
1880 // the pattern.
1881 //
1882 //------------------------------------------------------------------------------
literalChar(UChar32 c)1883 void RegexCompile::literalChar(UChar32 c) {
1884 fLiteralChars.append(c);
1885 }
1886
1887
1888 //------------------------------------------------------------------------------
1889 //
1890 // fixLiterals When compiling something that can follow a literal
1891 // string in a pattern, emit the code to match the
1892 // accumulated literal string.
1893 //
1894 // Optionally, split the last char of the string off into
1895 // a single "ONE_CHAR" operation, so that quantifiers can
1896 // apply to that char alone. Example: abc*
1897 // The * must apply to the 'c' only.
1898 //
1899 //------------------------------------------------------------------------------
fixLiterals(UBool split)1900 void RegexCompile::fixLiterals(UBool split) {
1901
1902 // If no literal characters have been scanned but not yet had code generated
1903 // for them, nothing needs to be done.
1904 if (fLiteralChars.length() == 0) {
1905 return;
1906 }
1907
1908 int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1909 UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1910
1911 // Split: We need to ensure that the last item in the compiled pattern
1912 // refers only to the last literal scanned in the pattern, so that
1913 // quantifiers (*, +, etc.) affect only it, and not a longer string.
1914 // Split before case folding for case insensitive matches.
1915
1916 if (split) {
1917 fLiteralChars.truncate(indexOfLastCodePoint);
1918 fixLiterals(FALSE); // Recursive call, emit code to match the first part of the string.
1919 // Note that the truncated literal string may be empty, in which case
1920 // nothing will be emitted.
1921
1922 literalChar(lastCodePoint); // Re-add the last code point as if it were a new literal.
1923 fixLiterals(FALSE); // Second recursive call, code for the final code point.
1924 return;
1925 }
1926
1927 // If we are doing case-insensitive matching, case fold the string. This may expand
1928 // the string, e.g. the German sharp-s turns into "ss"
1929 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1930 fLiteralChars.foldCase();
1931 indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1932 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1933 }
1934
1935 if (indexOfLastCodePoint == 0) {
1936 // Single character, emit a URX_ONECHAR op to match it.
1937 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1938 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1939 appendOp(URX_ONECHAR_I, lastCodePoint);
1940 } else {
1941 appendOp(URX_ONECHAR, lastCodePoint);
1942 }
1943 } else {
1944 // Two or more chars, emit a URX_STRING to match them.
1945 if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
1946 error(U_REGEX_PATTERN_TOO_BIG);
1947 }
1948 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1949 appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
1950 } else {
1951 // TODO here: add optimization to split case sensitive strings of length two
1952 // into two single char ops, for efficiency.
1953 appendOp(URX_STRING, fRXPat->fLiteralText.length());
1954 }
1955 appendOp(URX_STRING_LEN, fLiteralChars.length());
1956
1957 // Add this string into the accumulated strings of the compiled pattern.
1958 fRXPat->fLiteralText.append(fLiteralChars);
1959 }
1960
1961 fLiteralChars.remove();
1962 }
1963
1964
buildOp(int32_t type,int32_t val)1965 int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
1966 if (U_FAILURE(*fStatus)) {
1967 return 0;
1968 }
1969 if (type < 0 || type > 255) {
1970 UPRV_UNREACHABLE;
1971 }
1972 if (val > 0x00ffffff) {
1973 UPRV_UNREACHABLE;
1974 }
1975 if (val < 0) {
1976 if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
1977 UPRV_UNREACHABLE;
1978 }
1979 if (URX_TYPE(val) != 0xff) {
1980 UPRV_UNREACHABLE;
1981 }
1982 type = URX_RESERVED_OP_N;
1983 }
1984 return (type << 24) | val;
1985 }
1986
1987
1988 //------------------------------------------------------------------------------
1989 //
1990 // appendOp() Append a new instruction onto the compiled pattern
1991 // Includes error checking, limiting the size of the
1992 // pattern to lengths that can be represented in the
1993 // 24 bit operand field of an instruction.
1994 //
1995 //------------------------------------------------------------------------------
appendOp(int32_t op)1996 void RegexCompile::appendOp(int32_t op) {
1997 if (U_FAILURE(*fStatus)) {
1998 return;
1999 }
2000 fRXPat->fCompiledPat->addElement(op, *fStatus);
2001 if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
2002 error(U_REGEX_PATTERN_TOO_BIG);
2003 }
2004 }
2005
appendOp(int32_t type,int32_t val)2006 void RegexCompile::appendOp(int32_t type, int32_t val) {
2007 appendOp(buildOp(type, val));
2008 }
2009
2010
2011 //------------------------------------------------------------------------------
2012 //
2013 // insertOp() Insert a slot for a new opcode into the already
2014 // compiled pattern code.
2015 //
2016 // Fill the slot with a NOP. Our caller will replace it
2017 // with what they really wanted.
2018 //
2019 //------------------------------------------------------------------------------
insertOp(int32_t where)2020 void RegexCompile::insertOp(int32_t where) {
2021 UVector64 *code = fRXPat->fCompiledPat;
2022 U_ASSERT(where>0 && where < code->size());
2023
2024 int32_t nop = buildOp(URX_NOP, 0);
2025 code->insertElementAt(nop, where, *fStatus);
2026
2027 // Walk through the pattern, looking for any ops with targets that
2028 // were moved down by the insert. Fix them.
2029 int32_t loc;
2030 for (loc=0; loc<code->size(); loc++) {
2031 int32_t op = (int32_t)code->elementAti(loc);
2032 int32_t opType = URX_TYPE(op);
2033 int32_t opValue = URX_VAL(op);
2034 if ((opType == URX_JMP ||
2035 opType == URX_JMPX ||
2036 opType == URX_STATE_SAVE ||
2037 opType == URX_CTR_LOOP ||
2038 opType == URX_CTR_LOOP_NG ||
2039 opType == URX_JMP_SAV ||
2040 opType == URX_JMP_SAV_X ||
2041 opType == URX_RELOC_OPRND) && opValue > where) {
2042 // Target location for this opcode is after the insertion point and
2043 // needs to be incremented to adjust for the insertion.
2044 opValue++;
2045 op = buildOp(opType, opValue);
2046 code->setElementAt(op, loc);
2047 }
2048 }
2049
2050 // Now fix up the parentheses stack. All positive values in it are locations in
2051 // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.)
2052 for (loc=0; loc<fParenStack.size(); loc++) {
2053 int32_t x = fParenStack.elementAti(loc);
2054 U_ASSERT(x < code->size());
2055 if (x>where) {
2056 x++;
2057 fParenStack.setElementAt(x, loc);
2058 }
2059 }
2060
2061 if (fMatchCloseParen > where) {
2062 fMatchCloseParen++;
2063 }
2064 if (fMatchOpenParen > where) {
2065 fMatchOpenParen++;
2066 }
2067 }
2068
2069
2070 //------------------------------------------------------------------------------
2071 //
2072 // allocateData() Allocate storage in the matcher's static data area.
2073 // Return the index for the newly allocated data.
2074 // The storage won't actually exist until we are running a match
2075 // operation, but the storage indexes are inserted into various
2076 // opcodes while compiling the pattern.
2077 //
2078 //------------------------------------------------------------------------------
allocateData(int32_t size)2079 int32_t RegexCompile::allocateData(int32_t size) {
2080 if (U_FAILURE(*fStatus)) {
2081 return 0;
2082 }
2083 if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
2084 error(U_REGEX_INTERNAL_ERROR);
2085 return 0;
2086 }
2087 int32_t dataIndex = fRXPat->fDataSize;
2088 fRXPat->fDataSize += size;
2089 if (fRXPat->fDataSize >= 0x00fffff0) {
2090 error(U_REGEX_INTERNAL_ERROR);
2091 }
2092 return dataIndex;
2093 }
2094
2095
2096 //------------------------------------------------------------------------------
2097 //
2098 // allocateStackData() Allocate space in the back-tracking stack frame.
2099 // Return the index for the newly allocated data.
2100 // The frame indexes are inserted into various
2101 // opcodes while compiling the pattern, meaning that frame
2102 // size must be restricted to the size that will fit
2103 // as an operand (24 bits).
2104 //
2105 //------------------------------------------------------------------------------
allocateStackData(int32_t size)2106 int32_t RegexCompile::allocateStackData(int32_t size) {
2107 if (U_FAILURE(*fStatus)) {
2108 return 0;
2109 }
2110 if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
2111 error(U_REGEX_INTERNAL_ERROR);
2112 return 0;
2113 }
2114 int32_t dataIndex = fRXPat->fFrameSize;
2115 fRXPat->fFrameSize += size;
2116 if (fRXPat->fFrameSize >= 0x00fffff0) {
2117 error(U_REGEX_PATTERN_TOO_BIG);
2118 }
2119 return dataIndex;
2120 }
2121
2122
2123 //------------------------------------------------------------------------------
2124 //
2125 // blockTopLoc() Find or create a location in the compiled pattern
2126 // at the start of the operation or block that has
2127 // just been compiled. Needed when a quantifier (* or
2128 // whatever) appears, and we need to add an operation
2129 // at the start of the thing being quantified.
2130 //
2131 // (Parenthesized Blocks) have a slot with a NOP that
2132 // is reserved for this purpose. .* or similar don't
2133 // and a slot needs to be added.
2134 //
2135 // parameter reserveLoc : TRUE - ensure that there is space to add an opcode
2136 // at the returned location.
2137 // FALSE - just return the address,
2138 // do not reserve a location there.
2139 //
2140 //------------------------------------------------------------------------------
blockTopLoc(UBool reserveLoc)2141 int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
2142 int32_t theLoc;
2143 fixLiterals(TRUE); // Emit code for any pending literals.
2144 // If last item was a string, emit separate op for the its last char.
2145 if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2146 {
2147 // The item just processed is a parenthesized block.
2148 theLoc = fMatchOpenParen; // A slot is already reserved for us.
2149 U_ASSERT(theLoc > 0);
2150 U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
2151 }
2152 else {
2153 // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2154 // No slot for STATE_SAVE was pre-reserved in the compiled code.
2155 // We need to make space now.
2156 theLoc = fRXPat->fCompiledPat->size()-1;
2157 int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
2158 if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
2159 // Strings take two opcode, we want the position of the first one.
2160 // We can have a string at this point if a single character case-folded to two.
2161 theLoc--;
2162 }
2163 if (reserveLoc) {
2164 int32_t nop = buildOp(URX_NOP, 0);
2165 fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2166 }
2167 }
2168 return theLoc;
2169 }
2170
2171
2172
2173 //------------------------------------------------------------------------------
2174 //
2175 // handleCloseParen When compiling a close paren, we need to go back
2176 // and fix up any JMP or SAVE operations within the
2177 // parenthesized block that need to target the end
2178 // of the block. The locations of these are kept on
2179 // the paretheses stack.
2180 //
2181 // This function is called both when encountering a
2182 // real ) and at the end of the pattern.
2183 //
2184 //------------------------------------------------------------------------------
handleCloseParen()2185 void RegexCompile::handleCloseParen() {
2186 int32_t patIdx;
2187 int32_t patOp;
2188 if (fParenStack.size() <= 0) {
2189 error(U_REGEX_MISMATCHED_PAREN);
2190 return;
2191 }
2192
2193 // Emit code for any pending literals.
2194 fixLiterals(FALSE);
2195
2196 // Fixup any operations within the just-closed parenthesized group
2197 // that need to reference the end of the (block).
2198 // (The first one popped from the stack is an unused slot for
2199 // alternation (OR) state save, but applying the fixup to it does no harm.)
2200 for (;;) {
2201 patIdx = fParenStack.popi();
2202 if (patIdx < 0) {
2203 // value < 0 flags the start of the frame on the paren stack.
2204 break;
2205 }
2206 U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
2207 patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
2208 U_ASSERT(URX_VAL(patOp) == 0); // Branch target for JMP should not be set.
2209 patOp |= fRXPat->fCompiledPat->size(); // Set it now.
2210 fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2211 fMatchOpenParen = patIdx;
2212 }
2213
2214 // At the close of any parenthesized block, restore the match mode flags to
2215 // the value they had at the open paren. Saved value is
2216 // at the top of the paren stack.
2217 fModeFlags = fParenStack.popi();
2218 U_ASSERT(fModeFlags < 0);
2219
2220 // DO any additional fixups, depending on the specific kind of
2221 // parentesized grouping this is
2222
2223 switch (patIdx) {
2224 case plain:
2225 case flags:
2226 // No additional fixups required.
2227 // (Grouping-only parentheses)
2228 break;
2229 case capturing:
2230 // Capturing Parentheses.
2231 // Insert a End Capture op into the pattern.
2232 // The frame offset of the variables for this cg is obtained from the
2233 // start capture op and put it into the end-capture op.
2234 {
2235 int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2236 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2237
2238 int32_t frameVarLocation = URX_VAL(captureOp);
2239 appendOp(URX_END_CAPTURE, frameVarLocation);
2240 }
2241 break;
2242 case atomic:
2243 // Atomic Parenthesis.
2244 // Insert a LD_SP operation to restore the state stack to the position
2245 // it was when the atomic parens were entered.
2246 {
2247 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2248 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2249 int32_t stoLoc = URX_VAL(stoOp);
2250 appendOp(URX_LD_SP, stoLoc);
2251 }
2252 break;
2253
2254 case lookAhead:
2255 {
2256 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2257 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2258 int32_t dataLoc = URX_VAL(startOp);
2259 appendOp(URX_LA_END, dataLoc);
2260 }
2261 break;
2262
2263 case negLookAhead:
2264 {
2265 // See comment at doOpenLookAheadNeg
2266 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
2267 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2268 int32_t dataLoc = URX_VAL(startOp);
2269 appendOp(URX_LA_END, dataLoc);
2270 appendOp(URX_BACKTRACK, 0);
2271 appendOp(URX_LA_END, dataLoc);
2272
2273 // Patch the URX_SAVE near the top of the block.
2274 // The destination of the SAVE is the final LA_END that was just added.
2275 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
2276 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2277 int32_t dest = fRXPat->fCompiledPat->size()-1;
2278 saveOp = buildOp(URX_STATE_SAVE, dest);
2279 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2280 }
2281 break;
2282
2283 case lookBehind:
2284 {
2285 // See comment at doOpenLookBehind.
2286
2287 // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2288 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
2289 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2290 int32_t dataLoc = URX_VAL(startOp);
2291 appendOp(URX_LB_END, dataLoc);
2292 appendOp(URX_LA_END, dataLoc);
2293
2294 // Determine the min and max bounds for the length of the
2295 // string that the pattern can match.
2296 // An unbounded upper limit is an error.
2297 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
2298 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2299 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2300 if (URX_TYPE(maxML) != 0) {
2301 error(U_REGEX_LOOK_BEHIND_LIMIT);
2302 break;
2303 }
2304 if (maxML == INT32_MAX) {
2305 error(U_REGEX_LOOK_BEHIND_LIMIT);
2306 break;
2307 }
2308 if (minML == INT32_MAX) {
2309 // This condition happens when no match is possible, such as with a
2310 // [set] expression containing no elements.
2311 // In principle, the generated code to evaluate the expression could be deleted,
2312 // but it's probably not worth the complication.
2313 minML = 0;
2314 }
2315 U_ASSERT(minML <= maxML);
2316
2317 // Insert the min and max match len bounds into the URX_LB_CONT op that
2318 // appears at the top of the look-behind block, at location fMatchOpenParen+1
2319 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2);
2320 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1);
2321
2322 }
2323 break;
2324
2325
2326
2327 case lookBehindN:
2328 {
2329 // See comment at doOpenLookBehindNeg.
2330
2331 // Append the URX_LBN_END to the compiled pattern.
2332 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2333 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2334 int32_t dataLoc = URX_VAL(startOp);
2335 appendOp(URX_LBN_END, dataLoc);
2336
2337 // Determine the min and max bounds for the length of the
2338 // string that the pattern can match.
2339 // An unbounded upper limit is an error.
2340 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
2341 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2342 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2343 if (URX_TYPE(maxML) != 0) {
2344 error(U_REGEX_LOOK_BEHIND_LIMIT);
2345 break;
2346 }
2347 if (maxML == INT32_MAX) {
2348 error(U_REGEX_LOOK_BEHIND_LIMIT);
2349 break;
2350 }
2351 if (minML == INT32_MAX) {
2352 // This condition happens when no match is possible, such as with a
2353 // [set] expression containing no elements.
2354 // In principle, the generated code to evaluate the expression could be deleted,
2355 // but it's probably not worth the complication.
2356 minML = 0;
2357 }
2358
2359 U_ASSERT(minML <= maxML);
2360
2361 // Insert the min and max match len bounds into the URX_LB_CONT op that
2362 // appears at the top of the look-behind block, at location fMatchOpenParen+1
2363 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3);
2364 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2);
2365
2366 // Insert the pattern location to continue at after a successful match
2367 // as the last operand of the URX_LBN_CONT
2368 int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2369 fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1);
2370 }
2371 break;
2372
2373
2374
2375 default:
2376 UPRV_UNREACHABLE;
2377 }
2378
2379 // remember the next location in the compiled pattern.
2380 // The compilation of Quantifiers will look at this to see whether its looping
2381 // over a parenthesized block or a single item
2382 fMatchCloseParen = fRXPat->fCompiledPat->size();
2383 }
2384
2385
2386
2387 //------------------------------------------------------------------------------
2388 //
2389 // compileSet Compile the pattern operations for a reference to a
2390 // UnicodeSet.
2391 //
2392 //------------------------------------------------------------------------------
compileSet(UnicodeSet * theSet)2393 void RegexCompile::compileSet(UnicodeSet *theSet)
2394 {
2395 if (theSet == NULL) {
2396 return;
2397 }
2398 // Remove any strings from the set.
2399 // There shoudn't be any, but just in case.
2400 // (Case Closure can add them; if we had a simple case closure available that
2401 // ignored strings, that would be better.)
2402 theSet->removeAllStrings();
2403 int32_t setSize = theSet->size();
2404
2405 switch (setSize) {
2406 case 0:
2407 {
2408 // Set of no elements. Always fails to match.
2409 appendOp(URX_BACKTRACK, 0);
2410 delete theSet;
2411 }
2412 break;
2413
2414 case 1:
2415 {
2416 // The set contains only a single code point. Put it into
2417 // the compiled pattern as a single char operation rather
2418 // than a set, and discard the set itself.
2419 literalChar(theSet->charAt(0));
2420 delete theSet;
2421 }
2422 break;
2423
2424 default:
2425 {
2426 // The set contains two or more chars. (the normal case)
2427 // Put it into the compiled pattern as a set.
2428 theSet->freeze();
2429 int32_t setNumber = fRXPat->fSets->size();
2430 fRXPat->fSets->addElement(theSet, *fStatus);
2431 appendOp(URX_SETREF, setNumber);
2432 }
2433 }
2434 }
2435
2436
2437 //------------------------------------------------------------------------------
2438 //
2439 // compileInterval Generate the code for a {min, max} style interval quantifier.
2440 // Except for the specific opcodes used, the code is the same
2441 // for all three types (greedy, non-greedy, possessive) of
2442 // intervals. The opcodes are supplied as parameters.
2443 // (There are two sets of opcodes - greedy & possessive use the
2444 // same ones, while non-greedy has it's own.)
2445 //
2446 // The code for interval loops has this form:
2447 // 0 CTR_INIT counter loc (in stack frame)
2448 // 1 5 patt address of CTR_LOOP at bottom of block
2449 // 2 min count
2450 // 3 max count (-1 for unbounded)
2451 // 4 ... block to be iterated over
2452 // 5 CTR_LOOP
2453 //
2454 // In
2455 //------------------------------------------------------------------------------
compileInterval(int32_t InitOp,int32_t LoopOp)2456 void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
2457 {
2458 // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2459 // four slots in the compiled code. Reserve them.
2460 int32_t topOfBlock = blockTopLoc(TRUE);
2461 insertOp(topOfBlock);
2462 insertOp(topOfBlock);
2463 insertOp(topOfBlock);
2464
2465 // The operands for the CTR_INIT opcode include the index in the matcher data
2466 // of the counter. Allocate it now. There are two data items
2467 // counterLoc --> Loop counter
2468 // +1 --> Input index (for breaking non-progressing loops)
2469 // (Only present if unbounded upper limit on loop)
2470 int32_t dataSize = fIntervalUpper < 0 ? 2 : 1;
2471 int32_t counterLoc = allocateStackData(dataSize);
2472
2473 int32_t op = buildOp(InitOp, counterLoc);
2474 fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2475
2476 // The second operand of CTR_INIT is the location following the end of the loop.
2477 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2478 // compilation of something later on causes the code to grow and the target
2479 // position to move.
2480 int32_t loopEnd = fRXPat->fCompiledPat->size();
2481 op = buildOp(URX_RELOC_OPRND, loopEnd);
2482 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2483
2484 // Followed by the min and max counts.
2485 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2486 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2487
2488 // Append the CTR_LOOP op. The operand is the location of the CTR_INIT op.
2489 // Goes at end of the block being looped over, so just append to the code so far.
2490 appendOp(LoopOp, topOfBlock);
2491
2492 if ((fIntervalLow & 0xff000000) != 0 ||
2493 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
2494 error(U_REGEX_NUMBER_TOO_BIG);
2495 }
2496
2497 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2498 error(U_REGEX_MAX_LT_MIN);
2499 }
2500 }
2501
2502
2503
compileInlineInterval()2504 UBool RegexCompile::compileInlineInterval() {
2505 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2506 // Too big to inline. Fail, which will cause looping code to be generated.
2507 // (Upper < Lower picks up unbounded upper and errors, both.)
2508 return FALSE;
2509 }
2510
2511 int32_t topOfBlock = blockTopLoc(FALSE);
2512 if (fIntervalUpper == 0) {
2513 // Pathological case. Attempt no matches, as if the block doesn't exist.
2514 // Discard the generated code for the block.
2515 // If the block included parens, discard the info pertaining to them as well.
2516 fRXPat->fCompiledPat->setSize(topOfBlock);
2517 if (fMatchOpenParen >= topOfBlock) {
2518 fMatchOpenParen = -1;
2519 }
2520 if (fMatchCloseParen >= topOfBlock) {
2521 fMatchCloseParen = -1;
2522 }
2523 return TRUE;
2524 }
2525
2526 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2527 // The thing being repeated is not a single op, but some
2528 // more complex block. Do it as a loop, not inlines.
2529 // Note that things "repeated" a max of once are handled as inline, because
2530 // the one copy of the code already generated is just fine.
2531 return FALSE;
2532 }
2533
2534 // Pick up the opcode that is to be repeated
2535 //
2536 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
2537
2538 // Compute the pattern location where the inline sequence
2539 // will end, and set up the state save op that will be needed.
2540 //
2541 int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2542 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2543 int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
2544 if (fIntervalLow == 0) {
2545 insertOp(topOfBlock);
2546 fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2547 }
2548
2549
2550
2551 // Loop, emitting the op for the thing being repeated each time.
2552 // Loop starts at 1 because one instance of the op already exists in the pattern,
2553 // it was put there when it was originally encountered.
2554 int32_t i;
2555 for (i=1; i<fIntervalUpper; i++ ) {
2556 if (i >= fIntervalLow) {
2557 appendOp(saveOp);
2558 }
2559 appendOp(op);
2560 }
2561 return TRUE;
2562 }
2563
2564
2565
2566 //------------------------------------------------------------------------------
2567 //
2568 // caseInsensitiveStart given a single code point from a pattern string, determine the
2569 // set of characters that could potentially begin a case-insensitive
2570 // match of a string beginning with that character, using full Unicode
2571 // case insensitive matching.
2572 //
2573 // This is used in optimizing find().
2574 //
2575 // closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2576 // misses cases like this:
2577 // A string from the pattern begins with 'ss' (although all we know
2578 // in this context is that it begins with 's')
2579 // The pattern could match a string beginning with a German sharp-s
2580 //
2581 // To the ordinary case closure for a character c, we add all other
2582 // characters cx where the case closure of cx incudes a string form that begins
2583 // with the original character c.
2584 //
2585 // This function could be made smarter. The full pattern string is available
2586 // and it would be possible to verify that the extra characters being added
2587 // to the starting set fully match, rather than having just a first-char of the
2588 // folded form match.
2589 //
2590 //------------------------------------------------------------------------------
findCaseInsensitiveStarters(UChar32 c,UnicodeSet * starterChars)2591 void RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
2592
2593 // Machine Generated below.
2594 // It may need updating with new versions of Unicode.
2595 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2596 // The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
2597
2598 // Machine Generated Data. Do not hand edit.
2599 static const UChar32 RECaseFixCodePoints[] = {
2600 0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
2601 0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
2602 0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
2603 0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
2604 0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2605
2606 static const int16_t RECaseFixStringOffsets[] = {
2607 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
2608 0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
2609 0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
2610 0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
2611 0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2612
2613 static const int16_t RECaseFixCounts[] = {
2614 0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
2615 0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
2616 0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2617 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2618 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2619
2620 static const UChar RECaseFixData[] = {
2621 0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
2622 0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
2623 0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
2624 0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
2625 0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
2626 0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
2627 0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
2628 0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
2629 0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
2630 0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
2631 0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2632
2633 // End of machine generated data.
2634
2635 if (c < UCHAR_MIN_VALUE || c > UCHAR_MAX_VALUE) {
2636 // This function should never be called with an invalid input character.
2637 UPRV_UNREACHABLE;
2638 } else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2639 UChar32 caseFoldedC = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2640 starterChars->set(caseFoldedC, caseFoldedC);
2641
2642 int32_t i;
2643 for (i=0; RECaseFixCodePoints[i]<c ; i++) {
2644 // Simple linear search through the sorted list of interesting code points.
2645 }
2646
2647 if (RECaseFixCodePoints[i] == c) {
2648 int32_t dataIndex = RECaseFixStringOffsets[i];
2649 int32_t numCharsToAdd = RECaseFixCounts[i];
2650 UChar32 cpToAdd = 0;
2651 for (int32_t j=0; j<numCharsToAdd; j++) {
2652 U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
2653 starterChars->add(cpToAdd);
2654 }
2655 }
2656
2657 starterChars->closeOver(USET_CASE_INSENSITIVE);
2658 starterChars->removeAllStrings();
2659 } else {
2660 // Not a cased character. Just return it alone.
2661 starterChars->set(c, c);
2662 }
2663 }
2664
2665
2666 // Increment with overflow check.
2667 // val and delta will both be positive.
2668
safeIncrement(int32_t val,int32_t delta)2669 static int32_t safeIncrement(int32_t val, int32_t delta) {
2670 if (INT32_MAX - val > delta) {
2671 return val + delta;
2672 } else {
2673 return INT32_MAX;
2674 }
2675 }
2676
2677
2678 //------------------------------------------------------------------------------
2679 //
2680 // matchStartType Determine how a match can start.
2681 // Used to optimize find() operations.
2682 //
2683 // Operation is very similar to minMatchLength(). Walk the compiled
2684 // pattern, keeping an on-going minimum-match-length. For any
2685 // op where the min match coming in is zero, add that ops possible
2686 // starting matches to the possible starts for the overall pattern.
2687 //
2688 //------------------------------------------------------------------------------
matchStartType()2689 void RegexCompile::matchStartType() {
2690 if (U_FAILURE(*fStatus)) {
2691 return;
2692 }
2693
2694
2695 int32_t loc; // Location in the pattern of the current op being processed.
2696 int32_t op; // The op being processed
2697 int32_t opType; // The opcode type of the op
2698 int32_t currentLen = 0; // Minimum length of a match to this point (loc) in the pattern
2699 int32_t numInitialStrings = 0; // Number of strings encountered that could match at start.
2700
2701 UBool atStart = TRUE; // True if no part of the pattern yet encountered
2702 // could have advanced the position in a match.
2703 // (Maximum match length so far == 0)
2704
2705 // forwardedLength is a vector holding minimum-match-length values that
2706 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
2707 // It must be one longer than the pattern being checked because some ops
2708 // will jmp to a end-of-block+1 location from within a block, and we must
2709 // count those when checking the block.
2710 int32_t end = fRXPat->fCompiledPat->size();
2711 UVector32 forwardedLength(end+1, *fStatus);
2712 forwardedLength.setSize(end+1);
2713 for (loc=3; loc<end; loc++) {
2714 forwardedLength.setElementAt(INT32_MAX, loc);
2715 }
2716
2717 for (loc = 3; loc<end; loc++) {
2718 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2719 opType = URX_TYPE(op);
2720
2721 // The loop is advancing linearly through the pattern.
2722 // If the op we are now at was the destination of a branch in the pattern,
2723 // and that path has a shorter minimum length than the current accumulated value,
2724 // replace the current accumulated value.
2725 if (forwardedLength.elementAti(loc) < currentLen) {
2726 currentLen = forwardedLength.elementAti(loc);
2727 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2728 }
2729
2730 switch (opType) {
2731 // Ops that don't change the total length matched
2732 case URX_RESERVED_OP:
2733 case URX_END:
2734 case URX_FAIL:
2735 case URX_STRING_LEN:
2736 case URX_NOP:
2737 case URX_START_CAPTURE:
2738 case URX_END_CAPTURE:
2739 case URX_BACKSLASH_B:
2740 case URX_BACKSLASH_BU:
2741 case URX_BACKSLASH_G:
2742 case URX_BACKSLASH_Z:
2743 case URX_DOLLAR:
2744 case URX_DOLLAR_M:
2745 case URX_DOLLAR_D:
2746 case URX_DOLLAR_MD:
2747 case URX_RELOC_OPRND:
2748 case URX_STO_INP_LOC:
2749 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
2750 case URX_BACKREF_I:
2751
2752 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
2753 case URX_LD_SP:
2754 break;
2755
2756 case URX_CARET:
2757 if (atStart) {
2758 fRXPat->fStartType = START_START;
2759 }
2760 break;
2761
2762 case URX_CARET_M:
2763 case URX_CARET_M_UNIX:
2764 if (atStart) {
2765 fRXPat->fStartType = START_LINE;
2766 }
2767 break;
2768
2769 case URX_ONECHAR:
2770 if (currentLen == 0) {
2771 // This character could appear at the start of a match.
2772 // Add it to the set of possible starting characters.
2773 fRXPat->fInitialChars->add(URX_VAL(op));
2774 numInitialStrings += 2;
2775 }
2776 currentLen = safeIncrement(currentLen, 1);
2777 atStart = FALSE;
2778 break;
2779
2780
2781 case URX_SETREF:
2782 if (currentLen == 0) {
2783 int32_t sn = URX_VAL(op);
2784 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2785 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2786 fRXPat->fInitialChars->addAll(*s);
2787 numInitialStrings += 2;
2788 }
2789 currentLen = safeIncrement(currentLen, 1);
2790 atStart = FALSE;
2791 break;
2792
2793 case URX_LOOP_SR_I:
2794 // [Set]*, like a SETREF, above, in what it can match,
2795 // but may not match at all, so currentLen is not incremented.
2796 if (currentLen == 0) {
2797 int32_t sn = URX_VAL(op);
2798 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2799 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2800 fRXPat->fInitialChars->addAll(*s);
2801 numInitialStrings += 2;
2802 }
2803 atStart = FALSE;
2804 break;
2805
2806 case URX_LOOP_DOT_I:
2807 if (currentLen == 0) {
2808 // .* at the start of a pattern.
2809 // Any character can begin the match.
2810 fRXPat->fInitialChars->clear();
2811 fRXPat->fInitialChars->complement();
2812 numInitialStrings += 2;
2813 }
2814 atStart = FALSE;
2815 break;
2816
2817
2818 case URX_STATIC_SETREF:
2819 if (currentLen == 0) {
2820 int32_t sn = URX_VAL(op);
2821 U_ASSERT(sn>0 && sn<URX_LAST_SET);
2822 const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[sn];
2823 fRXPat->fInitialChars->addAll(s);
2824 numInitialStrings += 2;
2825 }
2826 currentLen = safeIncrement(currentLen, 1);
2827 atStart = FALSE;
2828 break;
2829
2830
2831
2832 case URX_STAT_SETREF_N:
2833 if (currentLen == 0) {
2834 int32_t sn = URX_VAL(op);
2835 UnicodeSet sc;
2836 sc.addAll(RegexStaticSets::gStaticSets->fPropSets[sn]).complement();
2837 fRXPat->fInitialChars->addAll(sc);
2838 numInitialStrings += 2;
2839 }
2840 currentLen = safeIncrement(currentLen, 1);
2841 atStart = FALSE;
2842 break;
2843
2844
2845
2846 case URX_BACKSLASH_D:
2847 // Digit Char
2848 if (currentLen == 0) {
2849 UnicodeSet s;
2850 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2851 if (URX_VAL(op) != 0) {
2852 s.complement();
2853 }
2854 fRXPat->fInitialChars->addAll(s);
2855 numInitialStrings += 2;
2856 }
2857 currentLen = safeIncrement(currentLen, 1);
2858 atStart = FALSE;
2859 break;
2860
2861
2862 case URX_BACKSLASH_H:
2863 // Horiz white space
2864 if (currentLen == 0) {
2865 UnicodeSet s;
2866 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
2867 s.add((UChar32)9); // Tab
2868 if (URX_VAL(op) != 0) {
2869 s.complement();
2870 }
2871 fRXPat->fInitialChars->addAll(s);
2872 numInitialStrings += 2;
2873 }
2874 currentLen = safeIncrement(currentLen, 1);
2875 atStart = FALSE;
2876 break;
2877
2878
2879 case URX_BACKSLASH_R: // Any line ending sequence
2880 case URX_BACKSLASH_V: // Any line ending code point, with optional negation
2881 if (currentLen == 0) {
2882 UnicodeSet s;
2883 s.add((UChar32)0x0a, (UChar32)0x0d); // add range
2884 s.add((UChar32)0x85);
2885 s.add((UChar32)0x2028, (UChar32)0x2029);
2886 if (URX_VAL(op) != 0) {
2887 // Complement option applies to URX_BACKSLASH_V only.
2888 s.complement();
2889 }
2890 fRXPat->fInitialChars->addAll(s);
2891 numInitialStrings += 2;
2892 }
2893 currentLen = safeIncrement(currentLen, 1);
2894 atStart = FALSE;
2895 break;
2896
2897
2898
2899 case URX_ONECHAR_I:
2900 // Case Insensitive Single Character.
2901 if (currentLen == 0) {
2902 UChar32 c = URX_VAL(op);
2903 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2904 UnicodeSet starters(c, c);
2905 starters.closeOver(USET_CASE_INSENSITIVE);
2906 // findCaseInsensitiveStarters(c, &starters);
2907 // For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2908 // The expanded folding can't match the pattern.
2909 fRXPat->fInitialChars->addAll(starters);
2910 } else {
2911 // Char has no case variants. Just add it as-is to the
2912 // set of possible starting chars.
2913 fRXPat->fInitialChars->add(c);
2914 }
2915 numInitialStrings += 2;
2916 }
2917 currentLen = safeIncrement(currentLen, 1);
2918 atStart = FALSE;
2919 break;
2920
2921
2922 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
2923 case URX_DOTANY_ALL: // . matches one or two.
2924 case URX_DOTANY:
2925 case URX_DOTANY_UNIX:
2926 if (currentLen == 0) {
2927 // These constructs are all bad news when they appear at the start
2928 // of a match. Any character can begin the match.
2929 fRXPat->fInitialChars->clear();
2930 fRXPat->fInitialChars->complement();
2931 numInitialStrings += 2;
2932 }
2933 currentLen = safeIncrement(currentLen, 1);
2934 atStart = FALSE;
2935 break;
2936
2937
2938 case URX_JMPX:
2939 loc++; // Except for extra operand on URX_JMPX, same as URX_JMP.
2940 U_FALLTHROUGH;
2941 case URX_JMP:
2942 {
2943 int32_t jmpDest = URX_VAL(op);
2944 if (jmpDest < loc) {
2945 // Loop of some kind. Can safely ignore, the worst that will happen
2946 // is that we understate the true minimum length
2947 currentLen = forwardedLength.elementAti(loc+1);
2948
2949 } else {
2950 // Forward jump. Propagate the current min length to the target loc of the jump.
2951 U_ASSERT(jmpDest <= end+1);
2952 if (forwardedLength.elementAti(jmpDest) > currentLen) {
2953 forwardedLength.setElementAt(currentLen, jmpDest);
2954 }
2955 }
2956 }
2957 atStart = FALSE;
2958 break;
2959
2960 case URX_JMP_SAV:
2961 case URX_JMP_SAV_X:
2962 // Combo of state save to the next loc, + jmp backwards.
2963 // Net effect on min. length computation is nothing.
2964 atStart = FALSE;
2965 break;
2966
2967 case URX_BACKTRACK:
2968 // Fails are kind of like a branch, except that the min length was
2969 // propagated already, by the state save.
2970 currentLen = forwardedLength.elementAti(loc+1);
2971 atStart = FALSE;
2972 break;
2973
2974
2975 case URX_STATE_SAVE:
2976 {
2977 // State Save, for forward jumps, propagate the current minimum.
2978 // of the state save.
2979 int32_t jmpDest = URX_VAL(op);
2980 if (jmpDest > loc) {
2981 if (currentLen < forwardedLength.elementAti(jmpDest)) {
2982 forwardedLength.setElementAt(currentLen, jmpDest);
2983 }
2984 }
2985 }
2986 atStart = FALSE;
2987 break;
2988
2989
2990
2991
2992 case URX_STRING:
2993 {
2994 loc++;
2995 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2996 int32_t stringLen = URX_VAL(stringLenOp);
2997 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2998 U_ASSERT(stringLenOp >= 2);
2999 if (currentLen == 0) {
3000 // Add the starting character of this string to the set of possible starting
3001 // characters for this pattern.
3002 int32_t stringStartIdx = URX_VAL(op);
3003 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
3004 fRXPat->fInitialChars->add(c);
3005
3006 // Remember this string. After the entire pattern has been checked,
3007 // if nothing else is identified that can start a match, we'll use it.
3008 numInitialStrings++;
3009 fRXPat->fInitialStringIdx = stringStartIdx;
3010 fRXPat->fInitialStringLen = stringLen;
3011 }
3012
3013 currentLen = safeIncrement(currentLen, stringLen);
3014 atStart = FALSE;
3015 }
3016 break;
3017
3018 case URX_STRING_I:
3019 {
3020 // Case-insensitive string. Unlike exact-match strings, we won't
3021 // attempt a string search for possible match positions. But we
3022 // do update the set of possible starting characters.
3023 loc++;
3024 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3025 int32_t stringLen = URX_VAL(stringLenOp);
3026 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
3027 U_ASSERT(stringLenOp >= 2);
3028 if (currentLen == 0) {
3029 // Add the starting character of this string to the set of possible starting
3030 // characters for this pattern.
3031 int32_t stringStartIdx = URX_VAL(op);
3032 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
3033 UnicodeSet s;
3034 findCaseInsensitiveStarters(c, &s);
3035 fRXPat->fInitialChars->addAll(s);
3036 numInitialStrings += 2; // Matching on an initial string not possible.
3037 }
3038 currentLen = safeIncrement(currentLen, stringLen);
3039 atStart = FALSE;
3040 }
3041 break;
3042
3043 case URX_CTR_INIT:
3044 case URX_CTR_INIT_NG:
3045 {
3046 // Loop Init Ops. These don't change the min length, but they are 4 word ops
3047 // so location must be updated accordingly.
3048 // Loop Init Ops.
3049 // If the min loop count == 0
3050 // move loc forwards to the end of the loop, skipping over the body.
3051 // If the min count is > 0,
3052 // continue normal processing of the body of the loop.
3053 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3054 loopEndLoc = URX_VAL(loopEndLoc);
3055 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3056 if (minLoopCount == 0) {
3057 // Min Loop Count of 0, treat like a forward branch and
3058 // move the current minimum length up to the target
3059 // (end of loop) location.
3060 U_ASSERT(loopEndLoc <= end+1);
3061 if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
3062 forwardedLength.setElementAt(currentLen, loopEndLoc);
3063 }
3064 }
3065 loc+=3; // Skips over operands of CTR_INIT
3066 }
3067 atStart = FALSE;
3068 break;
3069
3070
3071 case URX_CTR_LOOP:
3072 case URX_CTR_LOOP_NG:
3073 // Loop ops.
3074 // The jump is conditional, backwards only.
3075 atStart = FALSE;
3076 break;
3077
3078 case URX_LOOP_C:
3079 // More loop ops. These state-save to themselves.
3080 // don't change the minimum match
3081 atStart = FALSE;
3082 break;
3083
3084
3085 case URX_LA_START:
3086 case URX_LB_START:
3087 {
3088 // Look-around. Scan forward until the matching look-ahead end,
3089 // without processing the look-around block. This is overly pessimistic.
3090
3091 // Keep track of the nesting depth of look-around blocks. Boilerplate code for
3092 // lookahead contains two LA_END instructions, so count goes up by two
3093 // for each LA_START.
3094 int32_t depth = (opType == URX_LA_START? 2: 1);
3095 for (;;) {
3096 loc++;
3097 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3098 if (URX_TYPE(op) == URX_LA_START) {
3099 depth+=2;
3100 }
3101 if (URX_TYPE(op) == URX_LB_START) {
3102 depth++;
3103 }
3104 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3105 depth--;
3106 if (depth == 0) {
3107 break;
3108 }
3109 }
3110 if (URX_TYPE(op) == URX_STATE_SAVE) {
3111 // Need this because neg lookahead blocks will FAIL to outside
3112 // of the block.
3113 int32_t jmpDest = URX_VAL(op);
3114 if (jmpDest > loc) {
3115 if (currentLen < forwardedLength.elementAti(jmpDest)) {
3116 forwardedLength.setElementAt(currentLen, jmpDest);
3117 }
3118 }
3119 }
3120 U_ASSERT(loc <= end);
3121 }
3122 }
3123 break;
3124
3125 case URX_LA_END:
3126 case URX_LB_CONT:
3127 case URX_LB_END:
3128 case URX_LBN_CONT:
3129 case URX_LBN_END:
3130 UPRV_UNREACHABLE; // Shouldn't get here. These ops should be
3131 // consumed by the scan in URX_LA_START and LB_START
3132 default:
3133 UPRV_UNREACHABLE;
3134 }
3135
3136 }
3137
3138
3139 // We have finished walking through the ops. Check whether some forward jump
3140 // propagated a shorter length to location end+1.
3141 if (forwardedLength.elementAti(end+1) < currentLen) {
3142 currentLen = forwardedLength.elementAti(end+1);
3143 }
3144
3145
3146 fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
3147
3148
3149 // Sort out what we should check for when looking for candidate match start positions.
3150 // In order of preference,
3151 // 1. Start of input text buffer.
3152 // 2. A literal string.
3153 // 3. Start of line in multi-line mode.
3154 // 4. A single literal character.
3155 // 5. A character from a set of characters.
3156 //
3157 if (fRXPat->fStartType == START_START) {
3158 // Match only at the start of an input text string.
3159 // start type is already set. We're done.
3160 } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
3161 // Match beginning only with a literal string.
3162 UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
3163 U_ASSERT(fRXPat->fInitialChars->contains(c));
3164 fRXPat->fStartType = START_STRING;
3165 fRXPat->fInitialChar = c;
3166 } else if (fRXPat->fStartType == START_LINE) {
3167 // Match at start of line in Multi-Line mode.
3168 // Nothing to do here; everything is already set.
3169 } else if (fRXPat->fMinMatchLen == 0) {
3170 // Zero length match possible. We could start anywhere.
3171 fRXPat->fStartType = START_NO_INFO;
3172 } else if (fRXPat->fInitialChars->size() == 1) {
3173 // All matches begin with the same char.
3174 fRXPat->fStartType = START_CHAR;
3175 fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
3176 U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
3177 } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
3178 fRXPat->fMinMatchLen > 0) {
3179 // Matches start with a set of character smaller than the set of all chars.
3180 fRXPat->fStartType = START_SET;
3181 } else {
3182 // Matches can start with anything
3183 fRXPat->fStartType = START_NO_INFO;
3184 }
3185
3186 return;
3187 }
3188
3189
3190
3191 //------------------------------------------------------------------------------
3192 //
3193 // minMatchLength Calculate the length of the shortest string that could
3194 // match the specified pattern.
3195 // Length is in 16 bit code units, not code points.
3196 //
3197 // The calculated length may not be exact. The returned
3198 // value may be shorter than the actual minimum; it must
3199 // never be longer.
3200 //
3201 // start and end are the range of p-code operations to be
3202 // examined. The endpoints are included in the range.
3203 //
3204 //------------------------------------------------------------------------------
minMatchLength(int32_t start,int32_t end)3205 int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
3206 if (U_FAILURE(*fStatus)) {
3207 return 0;
3208 }
3209
3210 U_ASSERT(start <= end);
3211 U_ASSERT(end < fRXPat->fCompiledPat->size());
3212
3213
3214 int32_t loc;
3215 int32_t op;
3216 int32_t opType;
3217 int32_t currentLen = 0;
3218
3219
3220 // forwardedLength is a vector holding minimum-match-length values that
3221 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
3222 // It must be one longer than the pattern being checked because some ops
3223 // will jmp to a end-of-block+1 location from within a block, and we must
3224 // count those when checking the block.
3225 UVector32 forwardedLength(end+2, *fStatus);
3226 forwardedLength.setSize(end+2);
3227 for (loc=start; loc<=end+1; loc++) {
3228 forwardedLength.setElementAt(INT32_MAX, loc);
3229 }
3230
3231 for (loc = start; loc<=end; loc++) {
3232 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3233 opType = URX_TYPE(op);
3234
3235 // The loop is advancing linearly through the pattern.
3236 // If the op we are now at was the destination of a branch in the pattern,
3237 // and that path has a shorter minimum length than the current accumulated value,
3238 // replace the current accumulated value.
3239 // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some
3240 // no-match-possible cases.
3241 if (forwardedLength.elementAti(loc) < currentLen) {
3242 currentLen = forwardedLength.elementAti(loc);
3243 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3244 }
3245
3246 switch (opType) {
3247 // Ops that don't change the total length matched
3248 case URX_RESERVED_OP:
3249 case URX_END:
3250 case URX_STRING_LEN:
3251 case URX_NOP:
3252 case URX_START_CAPTURE:
3253 case URX_END_CAPTURE:
3254 case URX_BACKSLASH_B:
3255 case URX_BACKSLASH_BU:
3256 case URX_BACKSLASH_G:
3257 case URX_BACKSLASH_Z:
3258 case URX_CARET:
3259 case URX_DOLLAR:
3260 case URX_DOLLAR_M:
3261 case URX_DOLLAR_D:
3262 case URX_DOLLAR_MD:
3263 case URX_RELOC_OPRND:
3264 case URX_STO_INP_LOC:
3265 case URX_CARET_M:
3266 case URX_CARET_M_UNIX:
3267 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
3268 case URX_BACKREF_I:
3269
3270 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
3271 case URX_LD_SP:
3272
3273 case URX_JMP_SAV:
3274 case URX_JMP_SAV_X:
3275 break;
3276
3277
3278 // Ops that match a minimum of one character (one or two 16 bit code units.)
3279 //
3280 case URX_ONECHAR:
3281 case URX_STATIC_SETREF:
3282 case URX_STAT_SETREF_N:
3283 case URX_SETREF:
3284 case URX_BACKSLASH_D:
3285 case URX_BACKSLASH_H:
3286 case URX_BACKSLASH_R:
3287 case URX_BACKSLASH_V:
3288 case URX_ONECHAR_I:
3289 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
3290 case URX_DOTANY_ALL: // . matches one or two.
3291 case URX_DOTANY:
3292 case URX_DOTANY_UNIX:
3293 currentLen = safeIncrement(currentLen, 1);
3294 break;
3295
3296
3297 case URX_JMPX:
3298 loc++; // URX_JMPX has an extra operand, ignored here,
3299 // otherwise processed identically to URX_JMP.
3300 U_FALLTHROUGH;
3301 case URX_JMP:
3302 {
3303 int32_t jmpDest = URX_VAL(op);
3304 if (jmpDest < loc) {
3305 // Loop of some kind. Can safely ignore, the worst that will happen
3306 // is that we understate the true minimum length
3307 currentLen = forwardedLength.elementAti(loc+1);
3308 } else {
3309 // Forward jump. Propagate the current min length to the target loc of the jump.
3310 U_ASSERT(jmpDest <= end+1);
3311 if (forwardedLength.elementAti(jmpDest) > currentLen) {
3312 forwardedLength.setElementAt(currentLen, jmpDest);
3313 }
3314 }
3315 }
3316 break;
3317
3318 case URX_BACKTRACK:
3319 {
3320 // Back-tracks are kind of like a branch, except that the min length was
3321 // propagated already, by the state save.
3322 currentLen = forwardedLength.elementAti(loc+1);
3323 }
3324 break;
3325
3326
3327 case URX_STATE_SAVE:
3328 {
3329 // State Save, for forward jumps, propagate the current minimum.
3330 // of the state save.
3331 int32_t jmpDest = URX_VAL(op);
3332 if (jmpDest > loc) {
3333 if (currentLen < forwardedLength.elementAti(jmpDest)) {
3334 forwardedLength.setElementAt(currentLen, jmpDest);
3335 }
3336 }
3337 }
3338 break;
3339
3340
3341 case URX_STRING:
3342 {
3343 loc++;
3344 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3345 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3346 }
3347 break;
3348
3349
3350 case URX_STRING_I:
3351 {
3352 loc++;
3353 // TODO: with full case folding, matching input text may be shorter than
3354 // the string we have here. More smarts could put some bounds on it.
3355 // Assume a min length of one for now. A min length of zero causes
3356 // optimization failures for a pattern like "string"+
3357 // currentLen += URX_VAL(stringLenOp);
3358 currentLen = safeIncrement(currentLen, 1);
3359 }
3360 break;
3361
3362 case URX_CTR_INIT:
3363 case URX_CTR_INIT_NG:
3364 {
3365 // Loop Init Ops.
3366 // If the min loop count == 0
3367 // move loc forwards to the end of the loop, skipping over the body.
3368 // If the min count is > 0,
3369 // continue normal processing of the body of the loop.
3370 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3371 loopEndLoc = URX_VAL(loopEndLoc);
3372 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3373 if (minLoopCount == 0) {
3374 loc = loopEndLoc;
3375 } else {
3376 loc+=3; // Skips over operands of CTR_INIT
3377 }
3378 }
3379 break;
3380
3381
3382 case URX_CTR_LOOP:
3383 case URX_CTR_LOOP_NG:
3384 // Loop ops.
3385 // The jump is conditional, backwards only.
3386 break;
3387
3388 case URX_LOOP_SR_I:
3389 case URX_LOOP_DOT_I:
3390 case URX_LOOP_C:
3391 // More loop ops. These state-save to themselves.
3392 // don't change the minimum match - could match nothing at all.
3393 break;
3394
3395
3396 case URX_LA_START:
3397 case URX_LB_START:
3398 {
3399 // Look-around. Scan forward until the matching look-ahead end,
3400 // without processing the look-around block. This is overly pessimistic for look-ahead,
3401 // it assumes that the look-ahead match might be zero-length.
3402 // TODO: Positive lookahead could recursively do the block, then continue
3403 // with the longer of the block or the value coming in. Ticket 6060
3404 int32_t depth = (opType == URX_LA_START? 2: 1);
3405 for (;;) {
3406 loc++;
3407 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3408 if (URX_TYPE(op) == URX_LA_START) {
3409 // The boilerplate for look-ahead includes two LA_END insturctions,
3410 // Depth will be decremented by each one when it is seen.
3411 depth += 2;
3412 }
3413 if (URX_TYPE(op) == URX_LB_START) {
3414 depth++;
3415 }
3416 if (URX_TYPE(op) == URX_LA_END) {
3417 depth--;
3418 if (depth == 0) {
3419 break;
3420 }
3421 }
3422 if (URX_TYPE(op)==URX_LBN_END) {
3423 depth--;
3424 if (depth == 0) {
3425 break;
3426 }
3427 }
3428 if (URX_TYPE(op) == URX_STATE_SAVE) {
3429 // Need this because neg lookahead blocks will FAIL to outside
3430 // of the block.
3431 int32_t jmpDest = URX_VAL(op);
3432 if (jmpDest > loc) {
3433 if (currentLen < forwardedLength.elementAti(jmpDest)) {
3434 forwardedLength.setElementAt(currentLen, jmpDest);
3435 }
3436 }
3437 }
3438 U_ASSERT(loc <= end);
3439 }
3440 }
3441 break;
3442
3443 case URX_LA_END:
3444 case URX_LB_CONT:
3445 case URX_LB_END:
3446 case URX_LBN_CONT:
3447 case URX_LBN_END:
3448 // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3449 // range being sized, which happens when measuring size of look-behind blocks.
3450 break;
3451
3452 default:
3453 UPRV_UNREACHABLE;
3454 }
3455
3456 }
3457
3458 // We have finished walking through the ops. Check whether some forward jump
3459 // propagated a shorter length to location end+1.
3460 if (forwardedLength.elementAti(end+1) < currentLen) {
3461 currentLen = forwardedLength.elementAti(end+1);
3462 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3463 }
3464
3465 return currentLen;
3466 }
3467
3468 //------------------------------------------------------------------------------
3469 //
3470 // maxMatchLength Calculate the length of the longest string that could
3471 // match the specified pattern.
3472 // Length is in 16 bit code units, not code points.
3473 //
3474 // The calculated length may not be exact. The returned
3475 // value may be longer than the actual maximum; it must
3476 // never be shorter.
3477 //
3478 // start, end: the range of the pattern to check.
3479 // end is inclusive.
3480 //
3481 //------------------------------------------------------------------------------
maxMatchLength(int32_t start,int32_t end)3482 int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3483 if (U_FAILURE(*fStatus)) {
3484 return 0;
3485 }
3486 U_ASSERT(start <= end);
3487 U_ASSERT(end < fRXPat->fCompiledPat->size());
3488
3489 int32_t loc;
3490 int32_t op;
3491 int32_t opType;
3492 int32_t currentLen = 0;
3493 UVector32 forwardedLength(end+1, *fStatus);
3494 forwardedLength.setSize(end+1);
3495
3496 for (loc=start; loc<=end; loc++) {
3497 forwardedLength.setElementAt(0, loc);
3498 }
3499
3500 for (loc = start; loc<=end; loc++) {
3501 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3502 opType = URX_TYPE(op);
3503
3504 // The loop is advancing linearly through the pattern.
3505 // If the op we are now at was the destination of a branch in the pattern,
3506 // and that path has a longer maximum length than the current accumulated value,
3507 // replace the current accumulated value.
3508 if (forwardedLength.elementAti(loc) > currentLen) {
3509 currentLen = forwardedLength.elementAti(loc);
3510 }
3511
3512 switch (opType) {
3513 // Ops that don't change the total length matched
3514 case URX_RESERVED_OP:
3515 case URX_END:
3516 case URX_STRING_LEN:
3517 case URX_NOP:
3518 case URX_START_CAPTURE:
3519 case URX_END_CAPTURE:
3520 case URX_BACKSLASH_B:
3521 case URX_BACKSLASH_BU:
3522 case URX_BACKSLASH_G:
3523 case URX_BACKSLASH_Z:
3524 case URX_CARET:
3525 case URX_DOLLAR:
3526 case URX_DOLLAR_M:
3527 case URX_DOLLAR_D:
3528 case URX_DOLLAR_MD:
3529 case URX_RELOC_OPRND:
3530 case URX_STO_INP_LOC:
3531 case URX_CARET_M:
3532 case URX_CARET_M_UNIX:
3533
3534 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
3535 case URX_LD_SP:
3536
3537 case URX_LB_END:
3538 case URX_LB_CONT:
3539 case URX_LBN_CONT:
3540 case URX_LBN_END:
3541 break;
3542
3543
3544 // Ops that increase that cause an unbounded increase in the length
3545 // of a matched string, or that increase it a hard to characterize way.
3546 // Call the max length unbounded, and stop further checking.
3547 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
3548 case URX_BACKREF_I:
3549 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
3550 currentLen = INT32_MAX;
3551 break;
3552
3553
3554 // Ops that match a max of one character (possibly two 16 bit code units.)
3555 //
3556 case URX_STATIC_SETREF:
3557 case URX_STAT_SETREF_N:
3558 case URX_SETREF:
3559 case URX_BACKSLASH_D:
3560 case URX_BACKSLASH_H:
3561 case URX_BACKSLASH_R:
3562 case URX_BACKSLASH_V:
3563 case URX_ONECHAR_I:
3564 case URX_DOTANY_ALL:
3565 case URX_DOTANY:
3566 case URX_DOTANY_UNIX:
3567 currentLen = safeIncrement(currentLen, 2);
3568 break;
3569
3570 // Single literal character. Increase current max length by one or two,
3571 // depending on whether the char is in the supplementary range.
3572 case URX_ONECHAR:
3573 currentLen = safeIncrement(currentLen, 1);
3574 if (URX_VAL(op) > 0x10000) {
3575 currentLen = safeIncrement(currentLen, 1);
3576 }
3577 break;
3578
3579 // Jumps.
3580 //
3581 case URX_JMP:
3582 case URX_JMPX:
3583 case URX_JMP_SAV:
3584 case URX_JMP_SAV_X:
3585 {
3586 int32_t jmpDest = URX_VAL(op);
3587 if (jmpDest < loc) {
3588 // Loop of some kind. Max match length is unbounded.
3589 currentLen = INT32_MAX;
3590 } else {
3591 // Forward jump. Propagate the current min length to the target loc of the jump.
3592 if (forwardedLength.elementAti(jmpDest) < currentLen) {
3593 forwardedLength.setElementAt(currentLen, jmpDest);
3594 }
3595 currentLen = 0;
3596 }
3597 }
3598 break;
3599
3600 case URX_BACKTRACK:
3601 // back-tracks are kind of like a branch, except that the max length was
3602 // propagated already, by the state save.
3603 currentLen = forwardedLength.elementAti(loc+1);
3604 break;
3605
3606
3607 case URX_STATE_SAVE:
3608 {
3609 // State Save, for forward jumps, propagate the current minimum.
3610 // of the state save.
3611 // For backwards jumps, they create a loop, maximum
3612 // match length is unbounded.
3613 int32_t jmpDest = URX_VAL(op);
3614 if (jmpDest > loc) {
3615 if (currentLen > forwardedLength.elementAti(jmpDest)) {
3616 forwardedLength.setElementAt(currentLen, jmpDest);
3617 }
3618 } else {
3619 currentLen = INT32_MAX;
3620 }
3621 }
3622 break;
3623
3624
3625
3626
3627 case URX_STRING:
3628 {
3629 loc++;
3630 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3631 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3632 break;
3633 }
3634
3635 case URX_STRING_I:
3636 // TODO: This code assumes that any user string that matches will be no longer
3637 // than our compiled string, with case insensitive matching.
3638 // Our compiled string has been case-folded already.
3639 //
3640 // Any matching user string will have no more code points than our
3641 // compiled (folded) string. Folding may add code points, but
3642 // not remove them.
3643 //
3644 // There is a potential problem if a supplemental code point
3645 // case-folds to a BMP code point. In this case our compiled string
3646 // could be shorter (in code units) than a matching user string.
3647 //
3648 // At this time (Unicode 6.1) there are no such characters, and this case
3649 // is not being handled. A test, intltest regex/Bug9283, will fail if
3650 // any problematic characters are added to Unicode.
3651 //
3652 // If this happens, we can make a set of the BMP chars that the
3653 // troublesome supplementals fold to, scan our string, and bump the
3654 // currentLen one extra for each that is found.
3655 //
3656 {
3657 loc++;
3658 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3659 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3660 }
3661 break;
3662
3663 case URX_CTR_INIT:
3664 case URX_CTR_INIT_NG:
3665 // For Loops, recursively call this function on the pattern for the loop body,
3666 // then multiply the result by the maximum loop count.
3667 {
3668 int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
3669 if (loopEndLoc == loc+4) {
3670 // Loop has an empty body. No affect on max match length.
3671 // Continue processing with code after the loop end.
3672 loc = loopEndLoc;
3673 break;
3674 }
3675
3676 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
3677 if (maxLoopCount == -1) {
3678 // Unbounded Loop. No upper bound on match length.
3679 currentLen = INT32_MAX;
3680 break;
3681 }
3682
3683 U_ASSERT(loopEndLoc >= loc+4);
3684 int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recursive call.
3685 int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
3686 if (updatedLen >= INT32_MAX) {
3687 currentLen = INT32_MAX;
3688 break;
3689 }
3690 currentLen = (int32_t)updatedLen;
3691 loc = loopEndLoc;
3692 break;
3693 }
3694
3695 case URX_CTR_LOOP:
3696 case URX_CTR_LOOP_NG:
3697 // These opcodes will be skipped over by code for URX_CTR_INIT.
3698 // We shouldn't encounter them here.
3699 UPRV_UNREACHABLE;
3700
3701 case URX_LOOP_SR_I:
3702 case URX_LOOP_DOT_I:
3703 case URX_LOOP_C:
3704 // For anything to do with loops, make the match length unbounded.
3705 currentLen = INT32_MAX;
3706 break;
3707
3708
3709
3710 case URX_LA_START:
3711 case URX_LA_END:
3712 // Look-ahead. Just ignore, treat the look-ahead block as if
3713 // it were normal pattern. Gives a too-long match length,
3714 // but good enough for now.
3715 break;
3716
3717 // End of look-ahead ops should always be consumed by the processing at
3718 // the URX_LA_START op.
3719 // UPRV_UNREACHABLE;
3720
3721 case URX_LB_START:
3722 {
3723 // Look-behind. Scan forward until the matching look-around end,
3724 // without processing the look-behind block.
3725 int32_t dataLoc = URX_VAL(op);
3726 for (loc = loc + 1; loc <= end; ++loc) {
3727 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3728 int32_t opType = URX_TYPE(op);
3729 if ((opType == URX_LA_END || opType == URX_LBN_END) && (URX_VAL(op) == dataLoc)) {
3730 break;
3731 }
3732 }
3733 U_ASSERT(loc <= end);
3734 }
3735 break;
3736
3737 default:
3738 UPRV_UNREACHABLE;
3739 }
3740
3741
3742 if (currentLen == INT32_MAX) {
3743 // The maximum length is unbounded.
3744 // Stop further processing of the pattern.
3745 break;
3746 }
3747
3748 }
3749 return currentLen;
3750
3751 }
3752
3753
3754 //------------------------------------------------------------------------------
3755 //
3756 // stripNOPs Remove any NOP operations from the compiled pattern code.
3757 // Extra NOPs are inserted for some constructs during the initial
3758 // code generation to provide locations that may be patched later.
3759 // Many end up unneeded, and are removed by this function.
3760 //
3761 // In order to minimize the number of passes through the pattern,
3762 // back-reference fixup is also performed here (adjusting
3763 // back-reference operands to point to the correct frame offsets).
3764 //
3765 //------------------------------------------------------------------------------
stripNOPs()3766 void RegexCompile::stripNOPs() {
3767
3768 if (U_FAILURE(*fStatus)) {
3769 return;
3770 }
3771
3772 int32_t end = fRXPat->fCompiledPat->size();
3773 UVector32 deltas(end, *fStatus);
3774
3775 // Make a first pass over the code, computing the amount that things
3776 // will be offset at each location in the original code.
3777 int32_t loc;
3778 int32_t d = 0;
3779 for (loc=0; loc<end; loc++) {
3780 deltas.addElement(d, *fStatus);
3781 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3782 if (URX_TYPE(op) == URX_NOP) {
3783 d++;
3784 }
3785 }
3786
3787 UnicodeString caseStringBuffer;
3788
3789 // Make a second pass over the code, removing the NOPs by moving following
3790 // code up, and patching operands that refer to code locations that
3791 // are being moved. The array of offsets from the first step is used
3792 // to compute the new operand values.
3793 int32_t src;
3794 int32_t dst = 0;
3795 for (src=0; src<end; src++) {
3796 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
3797 int32_t opType = URX_TYPE(op);
3798 switch (opType) {
3799 case URX_NOP:
3800 break;
3801
3802 case URX_STATE_SAVE:
3803 case URX_JMP:
3804 case URX_CTR_LOOP:
3805 case URX_CTR_LOOP_NG:
3806 case URX_RELOC_OPRND:
3807 case URX_JMPX:
3808 case URX_JMP_SAV:
3809 case URX_JMP_SAV_X:
3810 // These are instructions with operands that refer to code locations.
3811 {
3812 int32_t operandAddress = URX_VAL(op);
3813 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
3814 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3815 op = buildOp(opType, fixedOperandAddress);
3816 fRXPat->fCompiledPat->setElementAt(op, dst);
3817 dst++;
3818 break;
3819 }
3820
3821 case URX_BACKREF:
3822 case URX_BACKREF_I:
3823 {
3824 int32_t where = URX_VAL(op);
3825 if (where > fRXPat->fGroupMap->size()) {
3826 error(U_REGEX_INVALID_BACK_REF);
3827 break;
3828 }
3829 where = fRXPat->fGroupMap->elementAti(where-1);
3830 op = buildOp(opType, where);
3831 fRXPat->fCompiledPat->setElementAt(op, dst);
3832 dst++;
3833
3834 fRXPat->fNeedsAltInput = TRUE;
3835 break;
3836 }
3837 case URX_RESERVED_OP:
3838 case URX_RESERVED_OP_N:
3839 case URX_BACKTRACK:
3840 case URX_END:
3841 case URX_ONECHAR:
3842 case URX_STRING:
3843 case URX_STRING_LEN:
3844 case URX_START_CAPTURE:
3845 case URX_END_CAPTURE:
3846 case URX_STATIC_SETREF:
3847 case URX_STAT_SETREF_N:
3848 case URX_SETREF:
3849 case URX_DOTANY:
3850 case URX_FAIL:
3851 case URX_BACKSLASH_B:
3852 case URX_BACKSLASH_BU:
3853 case URX_BACKSLASH_G:
3854 case URX_BACKSLASH_X:
3855 case URX_BACKSLASH_Z:
3856 case URX_DOTANY_ALL:
3857 case URX_BACKSLASH_D:
3858 case URX_CARET:
3859 case URX_DOLLAR:
3860 case URX_CTR_INIT:
3861 case URX_CTR_INIT_NG:
3862 case URX_DOTANY_UNIX:
3863 case URX_STO_SP:
3864 case URX_LD_SP:
3865 case URX_STO_INP_LOC:
3866 case URX_LA_START:
3867 case URX_LA_END:
3868 case URX_ONECHAR_I:
3869 case URX_STRING_I:
3870 case URX_DOLLAR_M:
3871 case URX_CARET_M:
3872 case URX_CARET_M_UNIX:
3873 case URX_LB_START:
3874 case URX_LB_CONT:
3875 case URX_LB_END:
3876 case URX_LBN_CONT:
3877 case URX_LBN_END:
3878 case URX_LOOP_SR_I:
3879 case URX_LOOP_DOT_I:
3880 case URX_LOOP_C:
3881 case URX_DOLLAR_D:
3882 case URX_DOLLAR_MD:
3883 case URX_BACKSLASH_H:
3884 case URX_BACKSLASH_R:
3885 case URX_BACKSLASH_V:
3886 // These instructions are unaltered by the relocation.
3887 fRXPat->fCompiledPat->setElementAt(op, dst);
3888 dst++;
3889 break;
3890
3891 default:
3892 // Some op is unaccounted for.
3893 UPRV_UNREACHABLE;
3894 }
3895 }
3896
3897 fRXPat->fCompiledPat->setSize(dst);
3898 }
3899
3900
3901
3902
3903 //------------------------------------------------------------------------------
3904 //
3905 // Error Report a rule parse error.
3906 // Only report it if no previous error has been recorded.
3907 //
3908 //------------------------------------------------------------------------------
error(UErrorCode e)3909 void RegexCompile::error(UErrorCode e) {
3910 if (U_SUCCESS(*fStatus) || e == U_MEMORY_ALLOCATION_ERROR) {
3911 *fStatus = e;
3912 // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3913 // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3914 // int64_t. If the values of the latter are out of range for the former,
3915 // set them to the appropriate "field not supported" values.
3916 if (fLineNum > 0x7FFFFFFF) {
3917 fParseErr->line = 0;
3918 fParseErr->offset = -1;
3919 } else if (fCharNum > 0x7FFFFFFF) {
3920 fParseErr->line = (int32_t)fLineNum;
3921 fParseErr->offset = -1;
3922 } else {
3923 fParseErr->line = (int32_t)fLineNum;
3924 fParseErr->offset = (int32_t)fCharNum;
3925 }
3926
3927 UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3928
3929 // Fill in the context.
3930 // Note: extractBetween() pins supplied indicies to the string bounds.
3931 uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext));
3932 uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3933 utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3934 utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3935 }
3936 }
3937
3938
3939 //
3940 // Assorted Unicode character constants.
3941 // Numeric because there is no portable way to enter them as literals.
3942 // (Think EBCDIC).
3943 //
3944 static const UChar chCR = 0x0d; // New lines, for terminating comments.
3945 static const UChar chLF = 0x0a; // Line Feed
3946 static const UChar chPound = 0x23; // '#', introduces a comment.
3947 static const UChar chDigit0 = 0x30; // '0'
3948 static const UChar chDigit7 = 0x37; // '9'
3949 static const UChar chColon = 0x3A; // ':'
3950 static const UChar chE = 0x45; // 'E'
3951 static const UChar chQ = 0x51; // 'Q'
3952 //static const UChar chN = 0x4E; // 'N'
3953 static const UChar chP = 0x50; // 'P'
3954 static const UChar chBackSlash = 0x5c; // '\' introduces a char escape
3955 //static const UChar chLBracket = 0x5b; // '['
3956 static const UChar chRBracket = 0x5d; // ']'
3957 static const UChar chUp = 0x5e; // '^'
3958 static const UChar chLowerP = 0x70;
3959 static const UChar chLBrace = 0x7b; // '{'
3960 static const UChar chRBrace = 0x7d; // '}'
3961 static const UChar chNEL = 0x85; // NEL newline variant
3962 static const UChar chLS = 0x2028; // Unicode Line Separator
3963
3964
3965 //------------------------------------------------------------------------------
3966 //
3967 // nextCharLL Low Level Next Char from the regex pattern.
3968 // Get a char from the string, keep track of input position
3969 // for error reporting.
3970 //
3971 //------------------------------------------------------------------------------
nextCharLL()3972 UChar32 RegexCompile::nextCharLL() {
3973 UChar32 ch;
3974
3975 if (fPeekChar != -1) {
3976 ch = fPeekChar;
3977 fPeekChar = -1;
3978 return ch;
3979 }
3980
3981 // assume we're already in the right place
3982 ch = UTEXT_NEXT32(fRXPat->fPattern);
3983 if (ch == U_SENTINEL) {
3984 return ch;
3985 }
3986
3987 if (ch == chCR ||
3988 ch == chNEL ||
3989 ch == chLS ||
3990 (ch == chLF && fLastChar != chCR)) {
3991 // Character is starting a new line. Bump up the line number, and
3992 // reset the column to 0.
3993 fLineNum++;
3994 fCharNum=0;
3995 }
3996 else {
3997 // Character is not starting a new line. Except in the case of a
3998 // LF following a CR, increment the column position.
3999 if (ch != chLF) {
4000 fCharNum++;
4001 }
4002 }
4003 fLastChar = ch;
4004 return ch;
4005 }
4006
4007 //------------------------------------------------------------------------------
4008 //
4009 // peekCharLL Low Level Character Scanning, sneak a peek at the next
4010 // character without actually getting it.
4011 //
4012 //------------------------------------------------------------------------------
peekCharLL()4013 UChar32 RegexCompile::peekCharLL() {
4014 if (fPeekChar == -1) {
4015 fPeekChar = nextCharLL();
4016 }
4017 return fPeekChar;
4018 }
4019
4020
4021 //------------------------------------------------------------------------------
4022 //
4023 // nextChar for pattern scanning. At this level, we handle stripping
4024 // out comments and processing some backslash character escapes.
4025 // The rest of the pattern grammar is handled at the next level up.
4026 //
4027 //------------------------------------------------------------------------------
nextChar(RegexPatternChar & c)4028 void RegexCompile::nextChar(RegexPatternChar &c) {
4029 tailRecursion:
4030 fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4031 c.fChar = nextCharLL();
4032 c.fQuoted = FALSE;
4033
4034 if (fQuoteMode) {
4035 c.fQuoted = TRUE;
4036 if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
4037 c.fChar == (UChar32)-1) {
4038 fQuoteMode = FALSE; // Exit quote mode,
4039 nextCharLL(); // discard the E
4040 // nextChar(c); // recurse to get the real next char
4041 goto tailRecursion; // Note: fuzz testing produced testcases that
4042 // resulted in stack overflow here.
4043 }
4044 }
4045 else if (fInBackslashQuote) {
4046 // The current character immediately follows a '\'
4047 // Don't check for any further escapes, just return it as-is.
4048 // Don't set c.fQuoted, because that would prevent the state machine from
4049 // dispatching on the character.
4050 fInBackslashQuote = FALSE;
4051 }
4052 else
4053 {
4054 // We are not in a \Q quoted region \E of the source.
4055 //
4056 if (fModeFlags & UREGEX_COMMENTS) {
4057 //
4058 // We are in free-spacing and comments mode.
4059 // Scan through any white space and comments, until we
4060 // reach a significant character or the end of inut.
4061 for (;;) {
4062 if (c.fChar == (UChar32)-1) {
4063 break; // End of Input
4064 }
4065 if (c.fChar == chPound && fEOLComments == TRUE) {
4066 // Start of a comment. Consume the rest of it, until EOF or a new line
4067 for (;;) {
4068 c.fChar = nextCharLL();
4069 if (c.fChar == (UChar32)-1 || // EOF
4070 c.fChar == chCR ||
4071 c.fChar == chLF ||
4072 c.fChar == chNEL ||
4073 c.fChar == chLS) {
4074 break;
4075 }
4076 }
4077 }
4078 // TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061.
4079 if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
4080 break;
4081 }
4082 c.fChar = nextCharLL();
4083 }
4084 }
4085
4086 //
4087 // check for backslash escaped characters.
4088 //
4089 if (c.fChar == chBackSlash) {
4090 int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4091 if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
4092 //
4093 // A '\' sequence that is handled by ICU's standard unescapeAt function.
4094 // Includes \uxxxx, \n, \r, many others.
4095 // Return the single equivalent character.
4096 //
4097 nextCharLL(); // get & discard the peeked char.
4098 c.fQuoted = TRUE;
4099
4100 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
4101 int32_t endIndex = (int32_t)pos;
4102 c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
4103
4104 if (endIndex == pos) {
4105 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4106 }
4107 fCharNum += endIndex - pos;
4108 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
4109 } else {
4110 int32_t offset = 0;
4111 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
4112
4113 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
4114 c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
4115
4116 if (offset == 0) {
4117 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4118 } else if (context.lastOffset == offset) {
4119 UTEXT_PREVIOUS32(fRXPat->fPattern);
4120 } else if (context.lastOffset != offset-1) {
4121 utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
4122 }
4123 fCharNum += offset;
4124 }
4125 }
4126 else if (peekCharLL() == chDigit0) {
4127 // Octal Escape, using Java Regexp Conventions
4128 // which are \0 followed by 1-3 octal digits.
4129 // Different from ICU Unescape handling of Octal, which does not
4130 // require the leading 0.
4131 // Java also has the convention of only consuming 2 octal digits if
4132 // the three digit number would be > 0xff
4133 //
4134 c.fChar = 0;
4135 nextCharLL(); // Consume the initial 0.
4136 int index;
4137 for (index=0; index<3; index++) {
4138 int32_t ch = peekCharLL();
4139 if (ch<chDigit0 || ch>chDigit7) {
4140 if (index==0) {
4141 // \0 is not followed by any octal digits.
4142 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4143 }
4144 break;
4145 }
4146 c.fChar <<= 3;
4147 c.fChar += ch&7;
4148 if (c.fChar <= 255) {
4149 nextCharLL();
4150 } else {
4151 // The last digit made the number too big. Forget we saw it.
4152 c.fChar >>= 3;
4153 }
4154 }
4155 c.fQuoted = TRUE;
4156 }
4157 else if (peekCharLL() == chQ) {
4158 // "\Q" enter quote mode, which will continue until "\E"
4159 fQuoteMode = TRUE;
4160 nextCharLL(); // discard the 'Q'.
4161 // nextChar(c); // recurse to get the real next char.
4162 goto tailRecursion; // Note: fuzz testing produced test cases that
4163 // resulted in stack overflow here.
4164 }
4165 else
4166 {
4167 // We are in a '\' escape that will be handled by the state table scanner.
4168 // Just return the backslash, but remember that the following char is to
4169 // be taken literally.
4170 fInBackslashQuote = TRUE;
4171 }
4172 }
4173 }
4174
4175 // re-enable # to end-of-line comments, in case they were disabled.
4176 // They are disabled by the parser upon seeing '(?', but this lasts for
4177 // the fetching of the next character only.
4178 fEOLComments = TRUE;
4179
4180 // putc(c.fChar, stdout);
4181 }
4182
4183
4184
4185 //------------------------------------------------------------------------------
4186 //
4187 // scanNamedChar
4188 // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4189 //
4190 // The scan position will be at the 'N'. On return
4191 // the scan position should be just after the '}'
4192 //
4193 // Return the UChar32
4194 //
4195 //------------------------------------------------------------------------------
scanNamedChar()4196 UChar32 RegexCompile::scanNamedChar() {
4197 if (U_FAILURE(*fStatus)) {
4198 return 0;
4199 }
4200
4201 nextChar(fC);
4202 if (fC.fChar != chLBrace) {
4203 error(U_REGEX_PROPERTY_SYNTAX);
4204 return 0;
4205 }
4206
4207 UnicodeString charName;
4208 for (;;) {
4209 nextChar(fC);
4210 if (fC.fChar == chRBrace) {
4211 break;
4212 }
4213 if (fC.fChar == -1) {
4214 error(U_REGEX_PROPERTY_SYNTAX);
4215 return 0;
4216 }
4217 charName.append(fC.fChar);
4218 }
4219
4220 char name[100];
4221 if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
4222 (uint32_t)charName.length()>=sizeof(name)) {
4223 // All Unicode character names have only invariant characters.
4224 // The API to get a character, given a name, accepts only char *, forcing us to convert,
4225 // which requires this error check
4226 error(U_REGEX_PROPERTY_SYNTAX);
4227 return 0;
4228 }
4229 charName.extract(0, charName.length(), name, sizeof(name), US_INV);
4230
4231 UChar32 theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
4232 if (U_FAILURE(*fStatus)) {
4233 error(U_REGEX_PROPERTY_SYNTAX);
4234 }
4235
4236 nextChar(fC); // Continue overall regex pattern processing with char after the '}'
4237 return theChar;
4238 }
4239
4240 //------------------------------------------------------------------------------
4241 //
4242 // scanProp Construct a UnicodeSet from the text at the current scan
4243 // position, which will be of the form \p{whaterver}
4244 //
4245 // The scan position will be at the 'p' or 'P'. On return
4246 // the scan position should be just after the '}'
4247 //
4248 // Return a UnicodeSet, constructed from the \P pattern,
4249 // or NULL if the pattern is invalid.
4250 //
4251 //------------------------------------------------------------------------------
scanProp()4252 UnicodeSet *RegexCompile::scanProp() {
4253 UnicodeSet *uset = NULL;
4254
4255 if (U_FAILURE(*fStatus)) {
4256 return NULL;
4257 }
4258 (void)chLowerP; // Suppress compiler unused variable warning.
4259 U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
4260 UBool negated = (fC.fChar == chP);
4261
4262 UnicodeString propertyName;
4263 nextChar(fC);
4264 if (fC.fChar != chLBrace) {
4265 error(U_REGEX_PROPERTY_SYNTAX);
4266 return NULL;
4267 }
4268 for (;;) {
4269 nextChar(fC);
4270 if (fC.fChar == chRBrace) {
4271 break;
4272 }
4273 if (fC.fChar == -1) {
4274 // Hit the end of the input string without finding the closing '}'
4275 error(U_REGEX_PROPERTY_SYNTAX);
4276 return NULL;
4277 }
4278 propertyName.append(fC.fChar);
4279 }
4280 uset = createSetForProperty(propertyName, negated);
4281 nextChar(fC); // Move input scan to position following the closing '}'
4282 return uset;
4283 }
4284
4285 //------------------------------------------------------------------------------
4286 //
4287 // scanPosixProp Construct a UnicodeSet from the text at the current scan
4288 // position, which is expected be of the form [:property expression:]
4289 //
4290 // The scan position will be at the opening ':'. On return
4291 // the scan position must be on the closing ']'
4292 //
4293 // Return a UnicodeSet constructed from the pattern,
4294 // or NULL if this is not a valid POSIX-style set expression.
4295 // If not a property expression, restore the initial scan position
4296 // (to the opening ':')
4297 //
4298 // Note: the opening '[:' is not sufficient to guarantee that
4299 // this is a [:property:] expression.
4300 // [:'+=,] is a perfectly good ordinary set expression that
4301 // happens to include ':' as one of its characters.
4302 //
4303 //------------------------------------------------------------------------------
scanPosixProp()4304 UnicodeSet *RegexCompile::scanPosixProp() {
4305 UnicodeSet *uset = NULL;
4306
4307 if (U_FAILURE(*fStatus)) {
4308 return NULL;
4309 }
4310
4311 U_ASSERT(fC.fChar == chColon);
4312
4313 // Save the scanner state.
4314 // TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062
4315 int64_t savedScanIndex = fScanIndex;
4316 int64_t savedNextIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4317 UBool savedQuoteMode = fQuoteMode;
4318 UBool savedInBackslashQuote = fInBackslashQuote;
4319 UBool savedEOLComments = fEOLComments;
4320 int64_t savedLineNum = fLineNum;
4321 int64_t savedCharNum = fCharNum;
4322 UChar32 savedLastChar = fLastChar;
4323 UChar32 savedPeekChar = fPeekChar;
4324 RegexPatternChar savedfC = fC;
4325
4326 // Scan for a closing ]. A little tricky because there are some perverse
4327 // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression,
4328 // ending on the second closing ].
4329
4330 UnicodeString propName;
4331 UBool negated = FALSE;
4332
4333 // Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:]
4334 nextChar(fC);
4335 if (fC.fChar == chUp) {
4336 negated = TRUE;
4337 nextChar(fC);
4338 }
4339
4340 // Scan for the closing ":]", collecting the property name along the way.
4341 UBool sawPropSetTerminator = FALSE;
4342 for (;;) {
4343 propName.append(fC.fChar);
4344 nextChar(fC);
4345 if (fC.fQuoted || fC.fChar == -1) {
4346 // Escaped characters or end of input - either says this isn't a [:Property:]
4347 break;
4348 }
4349 if (fC.fChar == chColon) {
4350 nextChar(fC);
4351 if (fC.fChar == chRBracket) {
4352 sawPropSetTerminator = TRUE;
4353 }
4354 break;
4355 }
4356 }
4357
4358 if (sawPropSetTerminator) {
4359 uset = createSetForProperty(propName, negated);
4360 }
4361 else
4362 {
4363 // No closing ":]".
4364 // Restore the original scan position.
4365 // The main scanner will retry the input as a normal set expression,
4366 // not a [:Property:] expression.
4367 fScanIndex = savedScanIndex;
4368 fQuoteMode = savedQuoteMode;
4369 fInBackslashQuote = savedInBackslashQuote;
4370 fEOLComments = savedEOLComments;
4371 fLineNum = savedLineNum;
4372 fCharNum = savedCharNum;
4373 fLastChar = savedLastChar;
4374 fPeekChar = savedPeekChar;
4375 fC = savedfC;
4376 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
4377 }
4378 return uset;
4379 }
4380
addIdentifierIgnorable(UnicodeSet * set,UErrorCode & ec)4381 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4382 set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4383 addCategory(set, U_GC_CF_MASK, ec);
4384 }
4385
4386 //
4387 // Create a Unicode Set from a Unicode Property expression.
4388 // This is common code underlying both \p{...} ane [:...:] expressions.
4389 // Includes trying the Java "properties" that aren't supported as
4390 // normal ICU UnicodeSet properties
4391 //
createSetForProperty(const UnicodeString & propName,UBool negated)4392 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
4393
4394 if (U_FAILURE(*fStatus)) {
4395 return nullptr;
4396 }
4397 LocalPointer<UnicodeSet> set;
4398 UErrorCode status = U_ZERO_ERROR;
4399
4400 do { // non-loop, exists to allow breaks from the block.
4401 //
4402 // First try the property as we received it
4403 //
4404 UnicodeString setExpr;
4405 uint32_t usetFlags = 0;
4406 setExpr.append(u"[\\p{", -1);
4407 setExpr.append(propName);
4408 setExpr.append(u"}]", -1);
4409 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4410 usetFlags |= USET_CASE_INSENSITIVE;
4411 }
4412 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(setExpr, usetFlags, NULL, status), status);
4413 if (U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR) {
4414 break;
4415 }
4416
4417 //
4418 // The incoming property wasn't directly recognized by ICU.
4419
4420 // Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet.
4421 // Java accepts 'word' with mixed case.
4422 // Java accepts 'all' only in all lower case.
4423
4424 status = U_ZERO_ERROR;
4425 if (propName.caseCompare(u"word", -1, 0) == 0) {
4426 set.adoptInsteadAndCheckErrorCode(
4427 RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].cloneAsThawed(), status);
4428 break;
4429 }
4430 if (propName.compare(u"all", -1) == 0) {
4431 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(0, 0x10ffff), status);
4432 break;
4433 }
4434
4435
4436 // Do Java InBlock expressions
4437 //
4438 UnicodeString mPropName = propName;
4439 if (mPropName.startsWith(u"In", 2) && mPropName.length() >= 3) {
4440 status = U_ZERO_ERROR;
4441 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4442 if (U_FAILURE(status)) {
4443 break;
4444 }
4445 UnicodeString blockName(mPropName, 2); // Property with the leading "In" removed.
4446 set->applyPropertyAlias(UnicodeString(u"Block"), blockName, status);
4447 break;
4448 }
4449
4450 // Check for the Java form "IsBooleanPropertyValue", which we will recast
4451 // as "BooleanPropertyValue". The property value can be either a
4452 // a General Category or a Script Name.
4453
4454 if (propName.startsWith(u"Is", 2) && propName.length()>=3) {
4455 mPropName.remove(0, 2); // Strip the "Is"
4456 if (mPropName.indexOf(u'=') >= 0) {
4457 // Reject any "Is..." property expression containing an '=', that is,
4458 // any non-binary property expression.
4459 status = U_REGEX_PROPERTY_SYNTAX;
4460 break;
4461 }
4462
4463 if (mPropName.caseCompare(u"assigned", -1, 0) == 0) {
4464 mPropName.setTo(u"unassigned", -1);
4465 negated = !negated;
4466 } else if (mPropName.caseCompare(u"TitleCase", -1, 0) == 0) {
4467 mPropName.setTo(u"Titlecase_Letter", -1);
4468 }
4469
4470 mPropName.insert(0, u"[\\p{", -1);
4471 mPropName.append(u"}]", -1);
4472 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(mPropName, *fStatus), status);
4473
4474 if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4475 set->closeOver(USET_CASE_INSENSITIVE);
4476 }
4477 break;
4478
4479 }
4480
4481 if (propName.startsWith(u"java", -1)) {
4482 status = U_ZERO_ERROR;
4483 set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4484 if (U_FAILURE(status)) {
4485 break;
4486 }
4487 //
4488 // Try the various Java specific properties.
4489 // These all begin with "java"
4490 //
4491 if (propName.compare(u"javaDefined", -1) == 0) {
4492 addCategory(set.getAlias(), U_GC_CN_MASK, status);
4493 set->complement();
4494 }
4495 else if (propName.compare(u"javaDigit", -1) == 0) {
4496 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4497 }
4498 else if (propName.compare(u"javaIdentifierIgnorable", -1) == 0) {
4499 addIdentifierIgnorable(set.getAlias(), status);
4500 }
4501 else if (propName.compare(u"javaISOControl", -1) == 0) {
4502 set->add(0, 0x1F).add(0x7F, 0x9F);
4503 }
4504 else if (propName.compare(u"javaJavaIdentifierPart", -1) == 0) {
4505 addCategory(set.getAlias(), U_GC_L_MASK, status);
4506 addCategory(set.getAlias(), U_GC_SC_MASK, status);
4507 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4508 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4509 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4510 addCategory(set.getAlias(), U_GC_MC_MASK, status);
4511 addCategory(set.getAlias(), U_GC_MN_MASK, status);
4512 addIdentifierIgnorable(set.getAlias(), status);
4513 }
4514 else if (propName.compare(u"javaJavaIdentifierStart", -1) == 0) {
4515 addCategory(set.getAlias(), U_GC_L_MASK, status);
4516 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4517 addCategory(set.getAlias(), U_GC_SC_MASK, status);
4518 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4519 }
4520 else if (propName.compare(u"javaLetter", -1) == 0) {
4521 addCategory(set.getAlias(), U_GC_L_MASK, status);
4522 }
4523 else if (propName.compare(u"javaLetterOrDigit", -1) == 0) {
4524 addCategory(set.getAlias(), U_GC_L_MASK, status);
4525 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4526 }
4527 else if (propName.compare(u"javaLowerCase", -1) == 0) {
4528 addCategory(set.getAlias(), U_GC_LL_MASK, status);
4529 }
4530 else if (propName.compare(u"javaMirrored", -1) == 0) {
4531 set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, status);
4532 }
4533 else if (propName.compare(u"javaSpaceChar", -1) == 0) {
4534 addCategory(set.getAlias(), U_GC_Z_MASK, status);
4535 }
4536 else if (propName.compare(u"javaSupplementaryCodePoint", -1) == 0) {
4537 set->add(0x10000, UnicodeSet::MAX_VALUE);
4538 }
4539 else if (propName.compare(u"javaTitleCase", -1) == 0) {
4540 addCategory(set.getAlias(), U_GC_LT_MASK, status);
4541 }
4542 else if (propName.compare(u"javaUnicodeIdentifierStart", -1) == 0) {
4543 addCategory(set.getAlias(), U_GC_L_MASK, status);
4544 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4545 }
4546 else if (propName.compare(u"javaUnicodeIdentifierPart", -1) == 0) {
4547 addCategory(set.getAlias(), U_GC_L_MASK, status);
4548 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4549 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4550 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4551 addCategory(set.getAlias(), U_GC_MC_MASK, status);
4552 addCategory(set.getAlias(), U_GC_MN_MASK, status);
4553 addIdentifierIgnorable(set.getAlias(), status);
4554 }
4555 else if (propName.compare(u"javaUpperCase", -1) == 0) {
4556 addCategory(set.getAlias(), U_GC_LU_MASK, status);
4557 }
4558 else if (propName.compare(u"javaValidCodePoint", -1) == 0) {
4559 set->add(0, UnicodeSet::MAX_VALUE);
4560 }
4561 else if (propName.compare(u"javaWhitespace", -1) == 0) {
4562 addCategory(set.getAlias(), U_GC_Z_MASK, status);
4563 set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4564 set->add(9, 0x0d).add(0x1c, 0x1f);
4565 } else {
4566 status = U_REGEX_PROPERTY_SYNTAX;
4567 }
4568
4569 if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4570 set->closeOver(USET_CASE_INSENSITIVE);
4571 }
4572 break;
4573 }
4574
4575 // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility
4576 // extensions matched it.
4577 status = U_REGEX_PROPERTY_SYNTAX;
4578 } while (false); // End of do loop block. Code above breaks out of the block on success or hard failure.
4579
4580 if (U_SUCCESS(status)) {
4581 U_ASSERT(set.isValid());
4582 if (negated) {
4583 set->complement();
4584 }
4585 return set.orphan();
4586 } else {
4587 if (status == U_ILLEGAL_ARGUMENT_ERROR) {
4588 status = U_REGEX_PROPERTY_SYNTAX;
4589 }
4590 error(status);
4591 return nullptr;
4592 }
4593 }
4594
4595
4596 //
4597 // SetEval Part of the evaluation of [set expressions].
4598 // Perform any pending (stacked) operations with precedence
4599 // equal or greater to that of the next operator encountered
4600 // in the expression.
4601 //
setEval(int32_t nextOp)4602 void RegexCompile::setEval(int32_t nextOp) {
4603 UnicodeSet *rightOperand = NULL;
4604 UnicodeSet *leftOperand = NULL;
4605 for (;;) {
4606 U_ASSERT(fSetOpStack.empty()==FALSE);
4607 int32_t pendingSetOperation = fSetOpStack.peeki();
4608 if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
4609 break;
4610 }
4611 fSetOpStack.popi();
4612 U_ASSERT(fSetStack.empty() == FALSE);
4613 rightOperand = (UnicodeSet *)fSetStack.peek();
4614 switch (pendingSetOperation) {
4615 case setNegation:
4616 rightOperand->complement();
4617 break;
4618 case setCaseClose:
4619 // TODO: need a simple close function. Ticket 6065
4620 rightOperand->closeOver(USET_CASE_INSENSITIVE);
4621 rightOperand->removeAllStrings();
4622 break;
4623 case setDifference1:
4624 case setDifference2:
4625 fSetStack.pop();
4626 leftOperand = (UnicodeSet *)fSetStack.peek();
4627 leftOperand->removeAll(*rightOperand);
4628 delete rightOperand;
4629 break;
4630 case setIntersection1:
4631 case setIntersection2:
4632 fSetStack.pop();
4633 leftOperand = (UnicodeSet *)fSetStack.peek();
4634 leftOperand->retainAll(*rightOperand);
4635 delete rightOperand;
4636 break;
4637 case setUnion:
4638 fSetStack.pop();
4639 leftOperand = (UnicodeSet *)fSetStack.peek();
4640 leftOperand->addAll(*rightOperand);
4641 delete rightOperand;
4642 break;
4643 default:
4644 UPRV_UNREACHABLE;
4645 }
4646 }
4647 }
4648
setPushOp(int32_t op)4649 void RegexCompile::setPushOp(int32_t op) {
4650 setEval(op);
4651 fSetOpStack.push(op, *fStatus);
4652 fSetStack.push(new UnicodeSet(), *fStatus);
4653 }
4654
4655 U_NAMESPACE_END
4656 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
4657
4658