1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 //  file:  repattrn.cpp
5 //
6 /*
7 ***************************************************************************
8 *   Copyright (C) 2002-2016 International Business Machines Corporation
9 *   and others. All rights reserved.
10 ***************************************************************************
11 */
12 
13 #include "unicode/utypes.h"
14 
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
16 
17 #include "unicode/regex.h"
18 #include "unicode/uclean.h"
19 #include "cmemory.h"
20 #include "cstr.h"
21 #include "uassert.h"
22 #include "uhash.h"
23 #include "uvector.h"
24 #include "uvectr32.h"
25 #include "uvectr64.h"
26 #include "regexcmp.h"
27 #include "regeximp.h"
28 #include "regexst.h"
29 
30 U_NAMESPACE_BEGIN
31 
32 //--------------------------------------------------------------------------
33 //
34 //    RegexPattern    Default Constructor
35 //
36 //--------------------------------------------------------------------------
RegexPattern()37 RegexPattern::RegexPattern() {
38     // Init all of this instances data.
39     init();
40 }
41 
42 
43 //--------------------------------------------------------------------------
44 //
45 //   Copy Constructor        Note:  This is a rather inefficient implementation,
46 //                                  but it probably doesn't matter.
47 //
48 //--------------------------------------------------------------------------
// Copy constructor.  The object is first given a valid default state by
// init(), and the actual copying is then performed by the assignment operator.
RegexPattern::RegexPattern(const RegexPattern &other)
    : UObject(other)
{
    init();
    operator=(other);
}
53 
54 
55 
56 //--------------------------------------------------------------------------
57 //
58 //    Assignment Operator
59 //
60 //--------------------------------------------------------------------------
operator =(const RegexPattern & other)61 RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
62     if (this == &other) {
63         // Source and destination are the same.  Don't do anything.
64         return *this;
65     }
66 
67     // Clean out any previous contents of object being assigned to.
68     zap();
69 
70     // Give target object a default initialization
71     init();
72 
73     // Copy simple fields
74     fDeferredStatus   = other.fDeferredStatus;
75 
76     if (U_FAILURE(fDeferredStatus)) {
77         return *this;
78     }
79 
80     if (other.fPatternString == NULL) {
81         fPatternString = NULL;
82         fPattern = utext_clone(fPattern, other.fPattern, FALSE, TRUE, &fDeferredStatus);
83     } else {
84         fPatternString = new UnicodeString(*(other.fPatternString));
85         if (fPatternString == NULL) {
86             fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
87         } else {
88             fPattern = utext_openConstUnicodeString(NULL, fPatternString, &fDeferredStatus);
89         }
90     }
91     if (U_FAILURE(fDeferredStatus)) {
92         return *this;
93     }
94 
95     fFlags            = other.fFlags;
96     fLiteralText      = other.fLiteralText;
97     fMinMatchLen      = other.fMinMatchLen;
98     fFrameSize        = other.fFrameSize;
99     fDataSize         = other.fDataSize;
100 
101     fStartType        = other.fStartType;
102     fInitialStringIdx = other.fInitialStringIdx;
103     fInitialStringLen = other.fInitialStringLen;
104     *fInitialChars    = *other.fInitialChars;
105     fInitialChar      = other.fInitialChar;
106     *fInitialChars8   = *other.fInitialChars8;
107     fNeedsAltInput    = other.fNeedsAltInput;
108 
109     //  Copy the pattern.  It's just values, nothing deep to copy.
110     fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus);
111     fGroupMap->assign(*other.fGroupMap, fDeferredStatus);
112 
113     //  Copy the Unicode Sets.
114     //    Could be made more efficient if the sets were reference counted and shared,
115     //    but I doubt that pattern copying will be particularly common.
116     //    Note:  init() already added an empty element zero to fSets
117     int32_t i;
118     int32_t  numSets = other.fSets->size();
119     fSets8 = new Regex8BitSet[numSets];
120     if (fSets8 == NULL) {
121     	fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
122     	return *this;
123     }
124     for (i=1; i<numSets; i++) {
125         if (U_FAILURE(fDeferredStatus)) {
126             return *this;
127         }
128         UnicodeSet *sourceSet = (UnicodeSet *)other.fSets->elementAt(i);
129         UnicodeSet *newSet    = new UnicodeSet(*sourceSet);
130         if (newSet == NULL) {
131             fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
132             break;
133         }
134         fSets->addElement(newSet, fDeferredStatus);
135         fSets8[i] = other.fSets8[i];
136     }
137 
138     // Copy the named capture group hash map.
139     if (other.fNamedCaptureMap != nullptr && initNamedCaptureMap()) {
140         int32_t hashPos = UHASH_FIRST;
141         while (const UHashElement *hashEl = uhash_nextElement(other.fNamedCaptureMap, &hashPos)) {
142             if (U_FAILURE(fDeferredStatus)) {
143                 break;
144             }
145             const UnicodeString *name = (const UnicodeString *)hashEl->key.pointer;
146             UnicodeString *key = new UnicodeString(*name);
147             int32_t val = hashEl->value.integer;
148             if (key == NULL) {
149                 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
150             } else {
151                 uhash_puti(fNamedCaptureMap, key, val, &fDeferredStatus);
152             }
153         }
154     }
155     return *this;
156 }
157 
158 
159 //--------------------------------------------------------------------------
160 //
161 //    init        Shared initialization for use by constructors.
162 //                Bring an uninitialized RegexPattern up to a default state.
163 //
164 //--------------------------------------------------------------------------
init()165 void RegexPattern::init() {
166     fFlags            = 0;
167     fCompiledPat      = 0;
168     fLiteralText.remove();
169     fSets             = NULL;
170     fSets8            = NULL;
171     fDeferredStatus   = U_ZERO_ERROR;
172     fMinMatchLen      = 0;
173     fFrameSize        = 0;
174     fDataSize         = 0;
175     fGroupMap         = NULL;
176     fStartType        = START_NO_INFO;
177     fInitialStringIdx = 0;
178     fInitialStringLen = 0;
179     fInitialChars     = NULL;
180     fInitialChar      = 0;
181     fInitialChars8    = NULL;
182     fNeedsAltInput    = FALSE;
183     fNamedCaptureMap  = NULL;
184 
185     fPattern          = NULL; // will be set later
186     fPatternString    = NULL; // may be set later
187     fCompiledPat      = new UVector64(fDeferredStatus);
188     fGroupMap         = new UVector32(fDeferredStatus);
189     fSets             = new UVector(fDeferredStatus);
190     fInitialChars     = new UnicodeSet;
191     fInitialChars8    = new Regex8BitSet;
192     if (U_FAILURE(fDeferredStatus)) {
193         return;
194     }
195     if (fCompiledPat == NULL  || fGroupMap == NULL || fSets == NULL ||
196             fInitialChars == NULL || fInitialChars8 == NULL) {
197         fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
198         return;
199     }
200 
201     // Slot zero of the vector of sets is reserved.  Fill it here.
202     fSets->addElement((int32_t)0, fDeferredStatus);
203 }
204 
205 
initNamedCaptureMap()206 bool RegexPattern::initNamedCaptureMap() {
207     if (fNamedCaptureMap) {
208         return true;
209     }
210     fNamedCaptureMap  = uhash_openSize(uhash_hashUnicodeString,     // Key hash function
211                                        uhash_compareUnicodeString,  // Key comparator function
212                                        uhash_compareLong,           // Value comparator function
213                                        7,                           // Initial table capacity
214                                        &fDeferredStatus);
215     if (U_FAILURE(fDeferredStatus)) {
216         return false;
217     }
218 
219     // fNamedCaptureMap owns its key strings, type (UnicodeString *)
220     uhash_setKeyDeleter(fNamedCaptureMap, uprv_deleteUObject);
221     return true;
222 }
223 
224 //--------------------------------------------------------------------------
225 //
226 //   zap            Delete everything owned by this RegexPattern.
227 //
228 //--------------------------------------------------------------------------
zap()229 void RegexPattern::zap() {
230     delete fCompiledPat;
231     fCompiledPat = NULL;
232     int i;
233     for (i=1; i<fSets->size(); i++) {
234         UnicodeSet *s;
235         s = (UnicodeSet *)fSets->elementAt(i);
236         if (s != NULL) {
237             delete s;
238         }
239     }
240     delete fSets;
241     fSets = NULL;
242     delete[] fSets8;
243     fSets8 = NULL;
244     delete fGroupMap;
245     fGroupMap = NULL;
246     delete fInitialChars;
247     fInitialChars = NULL;
248     delete fInitialChars8;
249     fInitialChars8 = NULL;
250     if (fPattern != NULL) {
251         utext_close(fPattern);
252         fPattern = NULL;
253     }
254     if (fPatternString != NULL) {
255         delete fPatternString;
256         fPatternString = NULL;
257     }
258     if (fNamedCaptureMap != NULL) {
259         uhash_close(fNamedCaptureMap);
260         fNamedCaptureMap = NULL;
261     }
262 }
263 
264 
265 //--------------------------------------------------------------------------
266 //
267 //   Destructor
268 //
269 //--------------------------------------------------------------------------
~RegexPattern()270 RegexPattern::~RegexPattern() {
271     zap();
272 }
273 
274 
275 //--------------------------------------------------------------------------
276 //
277 //   Clone
278 //
279 //--------------------------------------------------------------------------
clone() const280 RegexPattern  *RegexPattern::clone() const {
281     RegexPattern  *copy = new RegexPattern(*this);
282     return copy;
283 }
284 
285 
286 //--------------------------------------------------------------------------
287 //
//   operator ==   (comparison)    Consider two patterns to be == if the
289 //                                 pattern strings and the flags are the same.
290 //                                 Note that pattern strings with the same
291 //                                 characters can still be considered different.
292 //
293 //--------------------------------------------------------------------------
operator ==(const RegexPattern & other) const294 UBool   RegexPattern::operator ==(const RegexPattern &other) const {
295     if (this->fFlags == other.fFlags && this->fDeferredStatus == other.fDeferredStatus) {
296         if (this->fPatternString != NULL && other.fPatternString != NULL) {
297             return *(this->fPatternString) == *(other.fPatternString);
298         } else if (this->fPattern == NULL) {
299             if (other.fPattern == NULL) {
300                 return TRUE;
301             }
302         } else if (other.fPattern != NULL) {
303             UTEXT_SETNATIVEINDEX(this->fPattern, 0);
304             UTEXT_SETNATIVEINDEX(other.fPattern, 0);
305             return utext_equals(this->fPattern, other.fPattern);
306         }
307     }
308     return FALSE;
309 }
310 
311 //---------------------------------------------------------------------
312 //
313 //   compile
314 //
315 //---------------------------------------------------------------------
316 RegexPattern * U_EXPORT2
compile(const UnicodeString & regex,uint32_t flags,UParseError & pe,UErrorCode & status)317 RegexPattern::compile(const UnicodeString &regex,
318                       uint32_t             flags,
319                       UParseError          &pe,
320                       UErrorCode           &status)
321 {
322     if (U_FAILURE(status)) {
323         return NULL;
324     }
325 
326     const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
327     UREGEX_DOTALL   | UREGEX_MULTILINE        | UREGEX_UWORD |
328     UREGEX_ERROR_ON_UNKNOWN_ESCAPES           | UREGEX_UNIX_LINES | UREGEX_LITERAL;
329 
330     if ((flags & ~allFlags) != 0) {
331         status = U_REGEX_INVALID_FLAG;
332         return NULL;
333     }
334 
335     if ((flags & UREGEX_CANON_EQ) != 0) {
336         status = U_REGEX_UNIMPLEMENTED;
337         return NULL;
338     }
339 
340     RegexPattern *This = new RegexPattern;
341     if (This == NULL) {
342         status = U_MEMORY_ALLOCATION_ERROR;
343         return NULL;
344     }
345     if (U_FAILURE(This->fDeferredStatus)) {
346         status = This->fDeferredStatus;
347         delete This;
348         return NULL;
349     }
350     This->fFlags = flags;
351 
352     RegexCompile     compiler(This, status);
353     compiler.compile(regex, pe, status);
354 
355     if (U_FAILURE(status)) {
356         delete This;
357         This = NULL;
358     }
359 
360     return This;
361 }
362 
363 
364 //
365 //   compile, UText mode
366 //
367 RegexPattern * U_EXPORT2
compile(UText * regex,uint32_t flags,UParseError & pe,UErrorCode & status)368 RegexPattern::compile(UText                *regex,
369                       uint32_t             flags,
370                       UParseError          &pe,
371                       UErrorCode           &status)
372 {
373     if (U_FAILURE(status)) {
374         return NULL;
375     }
376 
377     const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
378                               UREGEX_DOTALL   | UREGEX_MULTILINE        | UREGEX_UWORD |
379                               UREGEX_ERROR_ON_UNKNOWN_ESCAPES           | UREGEX_UNIX_LINES | UREGEX_LITERAL;
380 
381     if ((flags & ~allFlags) != 0) {
382         status = U_REGEX_INVALID_FLAG;
383         return NULL;
384     }
385 
386     if ((flags & UREGEX_CANON_EQ) != 0) {
387         status = U_REGEX_UNIMPLEMENTED;
388         return NULL;
389     }
390 
391     RegexPattern *This = new RegexPattern;
392     if (This == NULL) {
393         status = U_MEMORY_ALLOCATION_ERROR;
394         return NULL;
395     }
396     if (U_FAILURE(This->fDeferredStatus)) {
397         status = This->fDeferredStatus;
398         delete This;
399         return NULL;
400     }
401     This->fFlags = flags;
402 
403     RegexCompile     compiler(This, status);
404     compiler.compile(regex, pe, status);
405 
406     if (U_FAILURE(status)) {
407         delete This;
408         This = NULL;
409     }
410 
411     return This;
412 }
413 
414 //
415 //   compile with default flags.
416 //
417 RegexPattern * U_EXPORT2
compile(const UnicodeString & regex,UParseError & pe,UErrorCode & err)418 RegexPattern::compile(const UnicodeString &regex,
419                       UParseError         &pe,
420                       UErrorCode          &err)
421 {
422     return compile(regex, 0, pe, err);
423 }
424 
425 
426 //
427 //   compile with default flags, UText mode
428 //
429 RegexPattern * U_EXPORT2
compile(UText * regex,UParseError & pe,UErrorCode & err)430 RegexPattern::compile(UText               *regex,
431                       UParseError         &pe,
432                       UErrorCode          &err)
433 {
434     return compile(regex, 0, pe, err);
435 }
436 
437 
438 //
439 //   compile with no UParseErr parameter.
440 //
441 RegexPattern * U_EXPORT2
compile(const UnicodeString & regex,uint32_t flags,UErrorCode & err)442 RegexPattern::compile(const UnicodeString &regex,
443                       uint32_t             flags,
444                       UErrorCode          &err)
445 {
446     UParseError pe;
447     return compile(regex, flags, pe, err);
448 }
449 
450 
451 //
452 //   compile with no UParseErr parameter, UText mode
453 //
454 RegexPattern * U_EXPORT2
compile(UText * regex,uint32_t flags,UErrorCode & err)455 RegexPattern::compile(UText                *regex,
456                       uint32_t             flags,
457                       UErrorCode           &err)
458 {
459     UParseError pe;
460     return compile(regex, flags, pe, err);
461 }
462 
463 
464 //---------------------------------------------------------------------
465 //
466 //   flags
467 //
468 //---------------------------------------------------------------------
flags() const469 uint32_t RegexPattern::flags() const {
470     return fFlags;
471 }
472 
473 
474 //---------------------------------------------------------------------
475 //
476 //   matcher(UnicodeString, err)
477 //
478 //---------------------------------------------------------------------
matcher(const UnicodeString & input,UErrorCode & status) const479 RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
480                                     UErrorCode          &status)  const {
481     RegexMatcher    *retMatcher = matcher(status);
482     if (retMatcher != NULL) {
483         retMatcher->fDeferredStatus = status;
484         retMatcher->reset(input);
485     }
486     return retMatcher;
487 }
488 
489 
490 //---------------------------------------------------------------------
491 //
492 //   matcher(status)
493 //
494 //---------------------------------------------------------------------
matcher(UErrorCode & status) const495 RegexMatcher *RegexPattern::matcher(UErrorCode &status)  const {
496     RegexMatcher    *retMatcher = NULL;
497 
498     if (U_FAILURE(status)) {
499         return NULL;
500     }
501     if (U_FAILURE(fDeferredStatus)) {
502         status = fDeferredStatus;
503         return NULL;
504     }
505 
506     retMatcher = new RegexMatcher(this);
507     if (retMatcher == NULL) {
508         status = U_MEMORY_ALLOCATION_ERROR;
509         return NULL;
510     }
511     return retMatcher;
512 }
513 
514 
515 
516 //---------------------------------------------------------------------
517 //
518 //   matches        Convenience function to test for a match, starting
519 //                  with a pattern string and a data string.
520 //
521 //---------------------------------------------------------------------
matches(const UnicodeString & regex,const UnicodeString & input,UParseError & pe,UErrorCode & status)522 UBool U_EXPORT2 RegexPattern::matches(const UnicodeString   &regex,
523               const UnicodeString   &input,
524                     UParseError     &pe,
525                     UErrorCode      &status) {
526 
527     if (U_FAILURE(status)) {return FALSE;}
528 
529     UBool         retVal;
530     RegexPattern *pat     = NULL;
531     RegexMatcher *matcher = NULL;
532 
533     pat     = RegexPattern::compile(regex, 0, pe, status);
534     matcher = pat->matcher(input, status);
535     retVal  = matcher->matches(status);
536 
537     delete matcher;
538     delete pat;
539     return retVal;
540 }
541 
542 
543 //
544 //   matches, UText mode
545 //
matches(UText * regex,UText * input,UParseError & pe,UErrorCode & status)546 UBool U_EXPORT2 RegexPattern::matches(UText                *regex,
547                     UText           *input,
548                     UParseError     &pe,
549                     UErrorCode      &status) {
550 
551     if (U_FAILURE(status)) {return FALSE;}
552 
553     UBool         retVal  = FALSE;
554     RegexPattern *pat     = NULL;
555     RegexMatcher *matcher = NULL;
556 
557     pat     = RegexPattern::compile(regex, 0, pe, status);
558     matcher = pat->matcher(status);
559     if (U_SUCCESS(status)) {
560         matcher->reset(input);
561         retVal  = matcher->matches(status);
562     }
563 
564     delete matcher;
565     delete pat;
566     return retVal;
567 }
568 
569 
570 
571 
572 
573 //---------------------------------------------------------------------
574 //
575 //   pattern
576 //
577 //---------------------------------------------------------------------
pattern() const578 UnicodeString RegexPattern::pattern() const {
579     if (fPatternString != NULL) {
580         return *fPatternString;
581     } else if (fPattern == NULL) {
582         return UnicodeString();
583     } else {
584         UErrorCode status = U_ZERO_ERROR;
585         int64_t nativeLen = utext_nativeLength(fPattern);
586         int32_t len16 = utext_extract(fPattern, 0, nativeLen, NULL, 0, &status); // buffer overflow error
587         UnicodeString result;
588 
589         status = U_ZERO_ERROR;
590         UChar *resultChars = result.getBuffer(len16);
591         utext_extract(fPattern, 0, nativeLen, resultChars, len16, &status); // unterminated warning
592         result.releaseBuffer(len16);
593 
594         return result;
595     }
596 }
597 
598 
599 
600 
601 //---------------------------------------------------------------------
602 //
603 //   patternText
604 //
605 //---------------------------------------------------------------------
patternText(UErrorCode & status) const606 UText *RegexPattern::patternText(UErrorCode      &status) const {
607     if (U_FAILURE(status)) {return NULL;}
608     status = U_ZERO_ERROR;
609 
610     if (fPattern != NULL) {
611         return fPattern;
612     } else {
613         RegexStaticSets::initGlobals(&status);
614         return RegexStaticSets::gStaticSets->fEmptyText;
615     }
616 }
617 
618 
619 //--------------------------------------------------------------------------------
620 //
621 //  groupNumberFromName()
622 //
623 //--------------------------------------------------------------------------------
groupNumberFromName(const UnicodeString & groupName,UErrorCode & status) const624 int32_t RegexPattern::groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const {
625     if (U_FAILURE(status)) {
626         return 0;
627     }
628 
629     // No need to explicitly check for syntactically valid names.
630     // Invalid ones will never be in the map, and the lookup will fail.
631 
632     int32_t number = fNamedCaptureMap ? uhash_geti(fNamedCaptureMap, &groupName) : 0;
633     if (number == 0) {
634         status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
635     }
636     return number;
637 }
638 
groupNumberFromName(const char * groupName,int32_t nameLength,UErrorCode & status) const639 int32_t RegexPattern::groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const {
640     if (U_FAILURE(status)) {
641         return 0;
642     }
643     UnicodeString name(groupName, nameLength, US_INV);
644     return groupNumberFromName(name, status);
645 }
646 
647 
648 //---------------------------------------------------------------------
649 //
650 //   split
651 //
652 //---------------------------------------------------------------------
split(const UnicodeString & input,UnicodeString dest[],int32_t destCapacity,UErrorCode & status) const653 int32_t  RegexPattern::split(const UnicodeString &input,
654         UnicodeString    dest[],
655         int32_t          destCapacity,
656         UErrorCode      &status) const
657 {
658     if (U_FAILURE(status)) {
659         return 0;
660     }
661 
662     RegexMatcher  m(this);
663     int32_t r = 0;
664     // Check m's status to make sure all is ok.
665     if (U_SUCCESS(m.fDeferredStatus)) {
666     	r = m.split(input, dest, destCapacity, status);
667     }
668     return r;
669 }
670 
671 //
672 //   split, UText mode
673 //
split(UText * input,UText * dest[],int32_t destCapacity,UErrorCode & status) const674 int32_t  RegexPattern::split(UText *input,
675         UText           *dest[],
676         int32_t          destCapacity,
677         UErrorCode      &status) const
678 {
679     if (U_FAILURE(status)) {
680         return 0;
681     }
682 
683     RegexMatcher  m(this);
684     int32_t r = 0;
685     // Check m's status to make sure all is ok.
686     if (U_SUCCESS(m.fDeferredStatus)) {
687     	r = m.split(input, dest, destCapacity, status);
688     }
689     return r;
690 }
691 
692 
693 //---------------------------------------------------------------------
694 //
695 //   dump    Output the compiled form of the pattern.
696 //           Debugging function only.
697 //
698 //---------------------------------------------------------------------
dumpOp(int32_t index) const699 void   RegexPattern::dumpOp(int32_t index) const {
700     (void)index;  // Suppress warnings in non-debug build.
701 #if defined(REGEX_DEBUG)
702     static const char * const opNames[] = {URX_OPCODE_NAMES};
703     int32_t op          = fCompiledPat->elementAti(index);
704     int32_t val         = URX_VAL(op);
705     int32_t type        = URX_TYPE(op);
706     int32_t pinnedType  = type;
707     if ((uint32_t)pinnedType >= UPRV_LENGTHOF(opNames)) {
708         pinnedType = 0;
709     }
710 
711     printf("%4d   %08x    %-15s  ", index, op, opNames[pinnedType]);
712     switch (type) {
713     case URX_NOP:
714     case URX_DOTANY:
715     case URX_DOTANY_ALL:
716     case URX_FAIL:
717     case URX_CARET:
718     case URX_DOLLAR:
719     case URX_BACKSLASH_G:
720     case URX_BACKSLASH_X:
721     case URX_END:
722     case URX_DOLLAR_M:
723     case URX_CARET_M:
724         // Types with no operand field of interest.
725         break;
726 
727     case URX_RESERVED_OP:
728     case URX_START_CAPTURE:
729     case URX_END_CAPTURE:
730     case URX_STATE_SAVE:
731     case URX_JMP:
732     case URX_JMP_SAV:
733     case URX_JMP_SAV_X:
734     case URX_BACKSLASH_B:
735     case URX_BACKSLASH_BU:
736     case URX_BACKSLASH_D:
737     case URX_BACKSLASH_Z:
738     case URX_STRING_LEN:
739     case URX_CTR_INIT:
740     case URX_CTR_INIT_NG:
741     case URX_CTR_LOOP:
742     case URX_CTR_LOOP_NG:
743     case URX_RELOC_OPRND:
744     case URX_STO_SP:
745     case URX_LD_SP:
746     case URX_BACKREF:
747     case URX_STO_INP_LOC:
748     case URX_JMPX:
749     case URX_LA_START:
750     case URX_LA_END:
751     case URX_BACKREF_I:
752     case URX_LB_START:
753     case URX_LB_CONT:
754     case URX_LB_END:
755     case URX_LBN_CONT:
756     case URX_LBN_END:
757     case URX_LOOP_C:
758     case URX_LOOP_DOT_I:
759     case URX_BACKSLASH_H:
760     case URX_BACKSLASH_R:
761     case URX_BACKSLASH_V:
762         // types with an integer operand field.
763         printf("%d", val);
764         break;
765 
766     case URX_ONECHAR:
767     case URX_ONECHAR_I:
768         if (val < 0x20) {
769             printf("%#x", val);
770         } else {
771             printf("'%s'", CStr(UnicodeString(val))());
772         }
773         break;
774 
775     case URX_STRING:
776     case URX_STRING_I:
777         {
778             int32_t lengthOp       = fCompiledPat->elementAti(index+1);
779             U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
780             int32_t length = URX_VAL(lengthOp);
781             UnicodeString str(fLiteralText, val, length);
782             printf("%s", CStr(str)());
783         }
784         break;
785 
786     case URX_SETREF:
787     case URX_LOOP_SR_I:
788         {
789             UnicodeString s;
790             UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
791             set->toPattern(s, TRUE);
792             printf("%s", CStr(s)());
793         }
794         break;
795 
796     case URX_STATIC_SETREF:
797     case URX_STAT_SETREF_N:
798         {
799             UnicodeString s;
800             if (val & URX_NEG_SET) {
801                 printf("NOT ");
802                 val &= ~URX_NEG_SET;
803             }
804             UnicodeSet &set = RegexStaticSets::gStaticSets->fPropSets[val];
805             set.toPattern(s, TRUE);
806             printf("%s", CStr(s)());
807         }
808         break;
809 
810 
811     default:
812         printf("??????");
813         break;
814     }
815     printf("\n");
816 #endif
817 }
818 
819 
dumpPattern() const820 void RegexPattern::dumpPattern() const {
821 #if defined(REGEX_DEBUG)
822     int      index;
823 
824     UnicodeString patStr;
825     for (UChar32 c = utext_next32From(fPattern, 0); c != U_SENTINEL; c = utext_next32(fPattern)) {
826         patStr.append(c);
827     }
828     printf("Original Pattern:  \"%s\"\n", CStr(patStr)());
829     printf("   Min Match Length:  %d\n", fMinMatchLen);
830     printf("   Match Start Type:  %s\n", START_OF_MATCH_STR(fStartType));
831     if (fStartType == START_STRING) {
832         UnicodeString initialString(fLiteralText,fInitialStringIdx, fInitialStringLen);
833         printf("   Initial match string: \"%s\"\n", CStr(initialString)());
834     } else if (fStartType == START_SET) {
835         UnicodeString s;
836         fInitialChars->toPattern(s, TRUE);
837         printf("    Match First Chars: %s\n", CStr(s)());
838 
839     } else if (fStartType == START_CHAR) {
840         printf("    First char of Match: ");
841         if (fInitialChar > 0x20) {
842                 printf("'%s'\n", CStr(UnicodeString(fInitialChar))());
843             } else {
844                 printf("%#x\n", fInitialChar);
845             }
846     }
847 
848     printf("Named Capture Groups:\n");
849     if (!fNamedCaptureMap || uhash_count(fNamedCaptureMap) == 0) {
850         printf("   None\n");
851     } else {
852         int32_t pos = UHASH_FIRST;
853         const UHashElement *el = NULL;
854         while ((el = uhash_nextElement(fNamedCaptureMap, &pos))) {
855             const UnicodeString *name = (const UnicodeString *)el->key.pointer;
856             int32_t number = el->value.integer;
857             printf("   %d\t%s\n", number, CStr(*name)());
858         }
859     }
860 
861     printf("\nIndex   Binary     Type             Operand\n" \
862            "-------------------------------------------\n");
863     for (index = 0; index<fCompiledPat->size(); index++) {
864         dumpOp(index);
865     }
866     printf("\n\n");
867 #endif
868 }
869 
870 
871 
872 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern)
873 
874 U_NAMESPACE_END
875 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
876