1 // Scintilla source code edit control
2 /** @file RESearch.cxx
3  ** Regular expression search library.
4  **/
5 
6 /*
7  * regex - Regular expression pattern matching and replacement
8  *
9  * By:  Ozan S. Yigit (oz)
10  *      Dept. of Computer Science
11  *      York University
12  *
13  * Original code available from http://www.cs.yorku.ca/~oz/
14  * Translation to C++ by Neil Hodgson neilh@scintilla.org
15  * Removed all use of register.
16  * Converted to modern function prototypes.
17  * Put all global/static variables into an object so this code can be
18  * used from multiple threads, etc.
19  *
20  * These routines are the PUBLIC DOMAIN equivalents of regex
21  * routines as found in 4.nBSD UN*X, with minor extensions.
22  *
23  * These routines are derived from various implementations found
24  * in software tools books, and Conroy's grep. They are NOT derived
25  * from licensed/restricted software.
26  * For more interesting/academic/complicated implementations,
27  * see Henry Spencer's regexp routines, or GNU Emacs pattern
28  * matching module.
29  *
30  * Modification history removed.
31  *
32  * Interfaces:
33  *  RESearch::Compile:      compile a regular expression into a NFA.
34  *
35  *          const char *RESearch::Compile(const char *pat, int length,
36  *                                        bool caseSensitive, bool posix)
37  *
38  * Returns a short error string if they fail.
39  *
40  *  RESearch::Execute:      execute the NFA to match a pattern.
41  *
42  *          int RESearch::Execute(characterIndexer &ci, int lp, int endp)
43  *
44  *  RESearch::Substitute:   substitute the matched portions in a new string.
45  *
46  *          int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst)
47  *
48  *  re_fail:                failure routine for RESearch::Execute. (no longer used)
49  *
50  *          void re_fail(char *msg, char op)
51  *
52  * Regular Expressions:
53  *
54  *      [1]     char    matches itself, unless it is a special
55  *                      character (metachar): . \ [ ] * + ^ $
56  *                      and ( ) if posix option.
57  *
58  *      [2]     .       matches any character.
59  *
60  *      [3]     \       matches the character following it, except:
61  *                      - \a, \b, \f, \n, \t, \v match the
62  *                      corresponding C escape char;
63  *                      - if not in posix mode, when followed by a
64  *                      left or right round bracket (see [7]);
65  *                      - when followed by a digit 1 to 9 (see [8]);
66  *                      - when followed by a left or right angle bracket
67  *                      (see [9]).
68  *                      It is used as an escape character for all
69  *                      other meta-characters, and itself. When used
70  *                      in a set ([4]), it is treated as an ordinary
71  *                      character (except for escape chars).
72  *
73  *      [4]     [set]   matches one of the characters in the set.
74  *                      If the first character in the set is "^",
75  *                      it matches a character NOT in the set, i.e.
76  *                      complements the set. A shorthand S-E (start-end)
77  *                      is used to specify a set of characters S upto
78  *                      E, inclusive. The special characters "]" and
79  *                      "-" have no special meaning if they appear
80  *                      as the first chars in the set. To include both,
81  *                      put - first: [-]A-Z]:
82  *                      [-]|] matches these 2 chars,
83  *                      []-|] matches from ] to | chars.
84  *                      examples:        match:
85  *
86  *                              [a-z]    any lowercase alpha
87  *
88  *                              [^-]]    any char except - and ]
89  *
90  *                              [^A-Z]   any char except uppercase
91  *                                       alpha
92  *
93  *                              [a-zA-Z] any alpha
94  *
95  *      [5]     *       any regular expression form [1] to [4], followed by
96  *                      closure char (*) matches zero or more matches of
97  *                      that form.
98  *
99  *      [6]     +       same as [5], except it matches one or more.
100  *
101  *      [7]             a regular expression in the form [1] to [10], enclosed
102  *                      as \(form\) (or (form) with posix flag) matches what
103  *                      form matches. The enclosure creates a set of tags,
104  *                      used for [8] and for pattern substitution.
105  *                      The tagged forms are numbered starting from 1.
106  *
107  *      [8]             a \ followed by a digit 1 to 9 matches whatever a
108  *                      previously tagged regular expression ([7]) matched.
109  *
110  *      [9]     \<      a regular expression starting with a \< construct
111  *              \>      and/or ending with a \> construct, restricts the
112  *                      pattern matching to the beginning of a word, and/or
113  *                      the end of a word. A word is defined to be a character
114  *                      string beginning and/or ending with the characters
115  *                      A-Z a-z 0-9 and _. It must also be preceded and/or
116  *                      followed by any character outside those mentioned.
117  *
118  *      [10]            a composite regular expression xy where x and y
119  *                      are in the form [1] to [10] matches the longest
120  *                      match of x followed by a match for y.
121  *
122  *      [11]    ^       a regular expression starting with a ^ character
123  *              $       and/or ending with a $ character, restricts the
124  *                      pattern matching to the beginning of the line,
125  *                      or the end of line. [anchors] Elsewhere in the
126  *                      pattern, ^ and $ are treated as ordinary characters.
127  *
128  *
129  * Acknowledgements:
130  *
131  *  HCR's Hugh Redelmeier has been most helpful in various
132  *  stages of development. He convinced me to include BOW
133  *  and EOW constructs, originally invented by Rob Pike at
134  *  the University of Toronto.
135  *
136  * References:
137  *              Software tools                  Kernighan & Plauger
138  *              Software tools in Pascal        Kernighan & Plauger
139  *              Grep [rsx-11 C dist]            David Conroy
140  *              ed - text editor                Un*x Programmer's Manual
141  *              Advanced editing on Un*x        B. W. Kernighan
142  *              RegExp routines                 Henry Spencer
143  *
144  * Notes:
145  *
146  *	This implementation uses a bit-set representation for character
147  *	classes for speed and compactness. Each character is represented
148  *	by one bit in a 256-bit block. Thus, CCL always takes a
149  *	constant 32 bytes in the internal nfa, and RESearch::Execute does a single
150  *	bit comparison to locate the character in the set.
151  *
152  * Examples:
153  *
154  *  pattern:    foo*.*
155  *  compile:    CHR f CHR o CLO CHR o END CLO ANY END END
156  *  matches:    fo foo fooo foobar fobar foxx ...
157  *
158  *  pattern:    fo[ob]a[rz]
159  *  compile:    CHR f CHR o CCL bitset CHR a CCL bitset END
160  *  matches:    fobar fooar fobaz fooaz
161  *
162  *  pattern:    foo\\+
163  *  compile:    CHR f CHR o CHR o CHR \ CLO CHR \ END END
164  *  matches:    foo\ foo\\ foo\\\  ...
165  *
166  *  pattern:    \(foo\)[1-3]\1  (same as foo[1-3]foo)
167  *  compile:    BOT 1 CHR f CHR o CHR o EOT 1 CCL bitset REF 1 END
168  *  matches:    foo1foo foo2foo foo3foo
169  *
170  *  pattern:    \(fo.*\)-\1
171  *  compile:    BOT 1 CHR f CHR o CLO ANY END EOT 1 CHR - REF 1 END
172  *  matches:    foo-foo fo-fo fob-fob foobar-foobar ...
173  */
174 
175 #include "CharClassify.h"
176 #include "RESearch.h"
177 
178 // Shut up annoying Visual C++ warnings:
179 #ifdef _MSC_VER
180 #pragma warning(disable: 4514)
181 #endif
182 
183 #define OKP     1
184 #define NOP     0
185 
186 #define CHR     1
187 #define ANY     2
188 #define CCL     3
189 #define BOL     4
190 #define EOL     5
191 #define BOT     6
192 #define EOT     7
193 #define BOW     8
194 #define EOW     9
195 #define REF     10
196 #define CLO     11
197 
198 #define END     0
199 
200 /*
201  * The following defines are not meant to be changeable.
202  * They are for readability only.
203  */
204 #define BLKIND  0370
205 #define BITIND  07
206 
207 const char bitarr[] = {1,2,4,8,16,32,64,'\200'};
208 
209 #define badpat(x)	(*nfa = END, x)
210 
211 /*
212  * Character classification table for word boundary operators BOW
213  * and EOW is passed in by the creator of this object (Scintilla
214  * Document). The Document default state is that word chars are:
215  * 0-9,a-z, A-Z and _
216  */
217 
RESearch(CharClassify * charClassTable)218 RESearch::RESearch(CharClassify *charClassTable) {
219 	charClass = charClassTable;
220 	Init();
221 }
222 
~RESearch()223 RESearch::~RESearch() {
224 	Clear();
225 }
226 
Init()227 void RESearch::Init() {
228 	sta = NOP;                  /* status of lastpat */
229 	bol = 0;
230 	for (int i=0; i<MAXTAG; i++)
231 		pat[i] = 0;
232 	for (int j=0; j<BITBLK; j++)
233 		bittab[j] = 0;
234 }
235 
Clear()236 void RESearch::Clear() {
237 	for (int i=0; i<MAXTAG; i++) {
238 		delete []pat[i];
239 		pat[i] = 0;
240 		bopat[i] = NOTFOUND;
241 		eopat[i] = NOTFOUND;
242 	}
243 }
244 
GrabMatches(CharacterIndexer & ci)245 bool RESearch::GrabMatches(CharacterIndexer &ci) {
246 	bool success = true;
247 	for (unsigned int i=0; i<MAXTAG; i++) {
248 		if ((bopat[i] != NOTFOUND) && (eopat[i] != NOTFOUND)) {
249 			unsigned int len = eopat[i] - bopat[i];
250 			pat[i] = new char[len + 1];
251 			if (pat[i]) {
252 				for (unsigned int j=0; j<len; j++)
253 					pat[i][j] = ci.CharAt(bopat[i] + j);
254 				pat[i][len] = '\0';
255 			} else {
256 				success = false;
257 			}
258 		}
259 	}
260 	return success;
261 }
262 
ChSet(char c)263 void RESearch::ChSet(char c) {
264 	bittab[((c) & BLKIND) >> 3] |= bitarr[(c) & BITIND];
265 }
266 
ChSetWithCase(char c,bool caseSensitive)267 void RESearch::ChSetWithCase(char c, bool caseSensitive) {
268 	if (caseSensitive) {
269 		ChSet(c);
270 	} else {
271 		if ((c >= 'a') && (c <= 'z')) {
272 			ChSet(c);
273 			ChSet(static_cast<char>(c - 'a' + 'A'));
274 		} else if ((c >= 'A') && (c <= 'Z')) {
275 			ChSet(c);
276 			ChSet(static_cast<char>(c - 'A' + 'a'));
277 		} else {
278 			ChSet(c);
279 		}
280 	}
281 }
282 
escapeValue(char ch)283 const char escapeValue(char ch) {
284 	switch (ch) {
285 	case 'a':	return '\a';
286 	case 'b':	return '\b';
287 	case 'f':	return '\f';
288 	case 'n':	return '\n';
289 	case 'r':	return '\r';
290 	case 't':	return '\t';
291 	case 'v':	return '\v';
292 	}
293 	return 0;
294 }
295 
Compile(const char * pat,int length,bool caseSensitive,bool posix)296 const char *RESearch::Compile(const char *pat, int length, bool caseSensitive, bool posix) {
297 	char *mp=nfa;          /* nfa pointer       */
298 	char *lp;              /* saved pointer     */
299 	char *sp=nfa;          /* another one       */
300 	char *mpMax = mp + MAXNFA - BITBLK - 10;
301 
302 	int tagi = 0;          /* tag stack index   */
303 	int tagc = 1;          /* actual tag count  */
304 
305 	int n;
306 	char mask;             /* xor mask -CCL/NCL */
307 	int c1, c2;
308 
309 	if (!pat || !length)
310 		if (sta)
311 			return 0;
312 		else
313 			return badpat("No previous regular expression");
314 	sta = NOP;
315 
316 	const char *p=pat;     /* pattern pointer   */
317 	for (int i=0; i<length; i++, p++) {
318 		if (mp > mpMax)
319 			return badpat("Pattern too long");
320 		lp = mp;
321 		switch(*p) {
322 
323 		case '.':               /* match any char  */
324 			*mp++ = ANY;
325 			break;
326 
327 		case '^':               /* match beginning */
328 			if (p == pat)
329 				*mp++ = BOL;
330 			else {
331 				*mp++ = CHR;
332 				*mp++ = *p;
333 			}
334 			break;
335 
336 		case '$':               /* match endofline */
337 			if (!*(p+1))
338 				*mp++ = EOL;
339 			else {
340 				*mp++ = CHR;
341 				*mp++ = *p;
342 			}
343 			break;
344 
345 		case '[':               /* match char class */
346 			*mp++ = CCL;
347 
348 			i++;
349 			if (*++p == '^') {
350 				mask = '\377';
351 				i++;
352 				p++;
353 			} else
354 				mask = 0;
355 
356 			if (*p == '-') {	/* real dash */
357 				i++;
358 				ChSet(*p++);
359 			}
360 			if (*p == ']') {	/* real brace */
361 				i++;
362 				ChSet(*p++);
363 			}
364 			while (*p && *p != ']') {
365 				if (*p == '-' && *(p+1) && *(p+1) != ']') {
366 					i++;
367 					p++;
368 					c1 = *(p-2) + 1;
369 					i++;
370 					c2 = *p++;
371 					while (c1 <= c2) {
372 						ChSetWithCase(static_cast<char>(c1++), caseSensitive);
373 					}
374 				} else if (*p == '\\' && *(p+1)) {
375 					i++;
376 					p++;
377 					char escape = escapeValue(*p);
378 					if (escape)
379 						ChSetWithCase(escape, caseSensitive);
380 					else
381 						ChSetWithCase(*p, caseSensitive);
382 					i++;
383 					p++;
384 				} else {
385 					i++;
386 					ChSetWithCase(*p++, caseSensitive);
387 				}
388 			}
389 			if (!*p)
390 				return badpat("Missing ]");
391 
392 			for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
393 				*mp++ = static_cast<char>(mask ^ bittab[n]);
394 
395 			break;
396 
397 		case '*':               /* match 0 or more... */
398 		case '+':               /* match 1 or more... */
399 			if (p == pat)
400 				return badpat("Empty closure");
401 			lp = sp;		/* previous opcode */
402 			if (*lp == CLO)		/* equivalence... */
403 				break;
404 			switch(*lp) {
405 
406 			case BOL:
407 			case BOT:
408 			case EOT:
409 			case BOW:
410 			case EOW:
411 			case REF:
412 				return badpat("Illegal closure");
413 			default:
414 				break;
415 			}
416 
417 			if (*p == '+')
418 				for (sp = mp; lp < sp; lp++)
419 					*mp++ = *lp;
420 
421 			*mp++ = END;
422 			*mp++ = END;
423 			sp = mp;
424 			while (--mp > lp)
425 				*mp = mp[-1];
426 			*mp = CLO;
427 			mp = sp;
428 			break;
429 
430 		case '\\':              /* tags, backrefs... */
431 			i++;
432 			switch(*++p) {
433 
434 			case '<':
435 				*mp++ = BOW;
436 				break;
437 			case '>':
438 				if (*sp == BOW)
439 					return badpat("Null pattern inside \\<\\>");
440 				*mp++ = EOW;
441 				break;
442 			case '1':
443 			case '2':
444 			case '3':
445 			case '4':
446 			case '5':
447 			case '6':
448 			case '7':
449 			case '8':
450 			case '9':
451 				n = *p-'0';
452 				if (tagi > 0 && tagstk[tagi] == n)
453 					return badpat("Cyclical reference");
454 				if (tagc > n) {
455 					*mp++ = static_cast<char>(REF);
456 					*mp++ = static_cast<char>(n);
457 				}
458 				else
459 					return badpat("Undetermined reference");
460 				break;
461 			case 'a':
462 			case 'b':
463 			case 'n':
464 			case 'f':
465 			case 'r':
466 			case 't':
467 			case 'v':
468 				*mp++ = CHR;
469 				*mp++ = escapeValue(*p);
470 				break;
471 			default:
472 				if (!posix && *p == '(') {
473 					if (tagc < MAXTAG) {
474 						tagstk[++tagi] = tagc;
475 						*mp++ = BOT;
476 						*mp++ = static_cast<char>(tagc++);
477 					}
478 					else
479 						return badpat("Too many \\(\\) pairs");
480 				} else if (!posix && *p == ')') {
481 					if (*sp == BOT)
482 						return badpat("Null pattern inside \\(\\)");
483 					if (tagi > 0) {
484 						*mp++ = static_cast<char>(EOT);
485 						*mp++ = static_cast<char>(tagstk[tagi--]);
486 					}
487 					else
488 						return badpat("Unmatched \\)");
489 				} else {
490 					*mp++ = CHR;
491 					*mp++ = *p;
492 				}
493 			}
494 			break;
495 
496 		default :               /* an ordinary char */
497 			if (posix && *p == '(') {
498 				if (tagc < MAXTAG) {
499 					tagstk[++tagi] = tagc;
500 					*mp++ = BOT;
501 					*mp++ = static_cast<char>(tagc++);
502 				}
503 				else
504 					return badpat("Too many () pairs");
505 			} else if (posix && *p == ')') {
506 				if (*sp == BOT)
507 					return badpat("Null pattern inside ()");
508 				if (tagi > 0) {
509 					*mp++ = static_cast<char>(EOT);
510 					*mp++ = static_cast<char>(tagstk[tagi--]);
511 				}
512 				else
513 					return badpat("Unmatched )");
514 			} else if (caseSensitive) {
515 				*mp++ = CHR;
516 				*mp++ = *p;
517 			} else {
518 				*mp++ = CCL;
519 				mask = 0;
520 				ChSetWithCase(*p, false);
521 				for (n = 0; n < BITBLK; bittab[n++] = (char) 0)
522 					*mp++ = static_cast<char>(mask ^ bittab[n]);
523 			}
524 			break;
525 		}
526 		sp = lp;
527 	}
528 	if (tagi > 0)
529 		return badpat((posix ? "Unmatched (" : "Unmatched \\("));
530 	*mp = END;
531 	sta = OKP;
532 	return 0;
533 }
534 
535 /*
536  * RESearch::Execute:
537  *   execute nfa to find a match.
538  *
539  *  special cases: (nfa[0])
540  *      BOL
541  *          Match only once, starting from the
542  *          beginning.
543  *      CHR
544  *          First locate the character without
545  *          calling PMatch, and if found, call
546  *          PMatch for the remaining string.
547  *      END
548  *          RESearch::Compile failed, poor luser did not
549  *          check for it. Fail fast.
550  *
551  *  If a match is found, bopat[0] and eopat[0] are set
552  *  to the beginning and the end of the matched fragment,
553  *  respectively.
554  *
555  */
556 
Execute(CharacterIndexer & ci,int lp,int endp)557 int RESearch::Execute(CharacterIndexer &ci, int lp, int endp) {
558 	char c;
559 	int ep = NOTFOUND;
560 	char *ap = nfa;
561 
562 	bol = lp;
563 	failure = 0;
564 
565 	Clear();
566 
567 	switch(*ap) {
568 
569 	case BOL:			/* anchored: match from BOL only */
570 		ep = PMatch(ci, lp, endp, ap);
571 		break;
572 	case EOL:			/* just searching for end of line normal path doesn't work */
573 		if (*(ap+1) == END) {
574 			lp = endp;
575 			ep = lp;
576 			break;
577 		} else {
578 			return 0;
579 		}
580 	case CHR:			/* ordinary char: locate it fast */
581 		c = *(ap+1);
582 		while ((lp < endp) && (ci.CharAt(lp) != c))
583 			lp++;
584 		if (lp >= endp)	/* if EOS, fail, else fall thru. */
585 			return 0;
586 	default:			/* regular matching all the way. */
587 		while (lp < endp) {
588 			ep = PMatch(ci, lp, endp, ap);
589 			if (ep != NOTFOUND)
590 				break;
591 			lp++;
592 		}
593 		break;
594 	case END:			/* munged automaton. fail always */
595 		return 0;
596 	}
597 	if (ep == NOTFOUND)
598 		return 0;
599 
600 	bopat[0] = lp;
601 	eopat[0] = ep;
602 	return 1;
603 }
604 
605 /*
606  * PMatch: internal routine for the hard part
607  *
608  *  This code is partly snarfed from an early grep written by
609  *  David Conroy. The backref and tag stuff, and various other
610  *  innovations are by oz.
611  *
612  *  special case optimizations: (nfa[n], nfa[n+1])
613  *      CLO ANY
614  *          We KNOW .* will match everything upto the
615  *          end of line. Thus, directly go to the end of
616  *          line, without recursive PMatch calls. As in
617  *          the other closure cases, the remaining pattern
618  *          must be matched by moving backwards on the
619  *          string recursively, to find a match for xy
620  *          (x is ".*" and y is the remaining pattern)
621  *          where the match satisfies the LONGEST match for
622  *          x followed by a match for y.
623  *      CLO CHR
624  *          We can again scan the string forward for the
625  *          single char and at the point of failure, we
626  *          execute the remaining nfa recursively, same as
627  *          above.
628  *
629  *  At the end of a successful match, bopat[n] and eopat[n]
630  *  are set to the beginning and end of subpatterns matched
631  *  by tagged expressions (n = 1 to 9).
632  */
633 
634 extern void re_fail(char *,char);
635 
636 #define isinset(x,y)	((x)[((y)&BLKIND)>>3] & bitarr[(y)&BITIND])
637 
638 /*
639  * skip values for CLO XXX to skip past the closure
640  */
641 
642 #define ANYSKIP 2 	/* [CLO] ANY END          */
643 #define CHRSKIP 3	/* [CLO] CHR chr END      */
644 #define CCLSKIP 34	/* [CLO] CCL 32 bytes END */
645 
PMatch(CharacterIndexer & ci,int lp,int endp,char * ap)646 int RESearch::PMatch(CharacterIndexer &ci, int lp, int endp, char *ap) {
647 	int op, c, n;
648 	int e;		/* extra pointer for CLO  */
649 	int bp;		/* beginning of subpat... */
650 	int ep;		/* ending of subpat...    */
651 	int are;	/* to save the line ptr.  */
652 
653 	while ((op = *ap++) != END)
654 		switch(op) {
655 
656 		case CHR:
657 			if (ci.CharAt(lp++) != *ap++)
658 				return NOTFOUND;
659 			break;
660 		case ANY:
661 			if (lp++ >= endp)
662 				return NOTFOUND;
663 			break;
664 		case CCL:
665 			c = ci.CharAt(lp++);
666 			if (!isinset(ap,c))
667 				return NOTFOUND;
668 			ap += BITBLK;
669 			break;
670 		case BOL:
671 			if (lp != bol)
672 				return NOTFOUND;
673 			break;
674 		case EOL:
675 			if (lp < endp)
676 				return NOTFOUND;
677 			break;
678 		case BOT:
679 			bopat[*ap++] = lp;
680 			break;
681 		case EOT:
682 			eopat[*ap++] = lp;
683 			break;
684  		case BOW:
685 			if (lp!=bol && iswordc(ci.CharAt(lp-1)) || !iswordc(ci.CharAt(lp)))
686 				return NOTFOUND;
687 			break;
688 		case EOW:
689 			if (lp==bol || !iswordc(ci.CharAt(lp-1)) || iswordc(ci.CharAt(lp)))
690 				return NOTFOUND;
691 			break;
692 		case REF:
693 			n = *ap++;
694 			bp = bopat[n];
695 			ep = eopat[n];
696 			while (bp < ep)
697 				if (ci.CharAt(bp++) != ci.CharAt(lp++))
698 					return NOTFOUND;
699 			break;
700 		case CLO:
701 			are = lp;
702 			switch(*ap) {
703 
704 			case ANY:
705 				while (lp < endp)
706 					lp++;
707 				n = ANYSKIP;
708 				break;
709 			case CHR:
710 				c = *(ap+1);
711 				while ((lp < endp) && (c == ci.CharAt(lp)))
712 					lp++;
713 				n = CHRSKIP;
714 				break;
715 			case CCL:
716 				while ((lp < endp) && isinset(ap+1,ci.CharAt(lp)))
717 					lp++;
718 				n = CCLSKIP;
719 				break;
720 			default:
721 				failure = true;
722 				//re_fail("closure: bad nfa.", *ap);
723 				return NOTFOUND;
724 			}
725 
726 			ap += n;
727 
728 			while (lp >= are) {
729 				if ((e = PMatch(ci, lp, endp, ap)) != NOTFOUND)
730 					return e;
731 				--lp;
732 			}
733 			return NOTFOUND;
734 		default:
735 			//re_fail("RESearch::Execute: bad nfa.", static_cast<char>(op));
736 			return NOTFOUND;
737 		}
738 	return lp;
739 }
740 
741 /*
742  * RESearch::Substitute:
743  *  substitute the matched portions of the src in dst.
744  *
745  *  &    substitute the entire matched pattern.
746  *
747  *  \digit  substitute a subpattern, with the given tag number.
748  *      Tags are numbered from 1 to 9. If the particular
749  *      tagged subpattern does not exist, null is substituted.
750  */
Substitute(CharacterIndexer & ci,char * src,char * dst)751 int RESearch::Substitute(CharacterIndexer &ci, char *src, char *dst) {
752 	char c;
753 	int  pin;
754 	int bp;
755 	int ep;
756 
757 	if (!*src || !bopat[0])
758 		return 0;
759 
760 	while ((c = *src++) != 0) {
761 		switch(c) {
762 
763 		case '&':
764 			pin = 0;
765 			break;
766 
767 		case '\\':
768 			c = *src++;
769 			if (c >= '0' && c <= '9') {
770 				pin = c - '0';
771 				break;
772 			}
773 
774 		default:
775 			*dst++ = c;
776 			continue;
777 		}
778 
779 		if ((bp = bopat[pin]) != 0 && (ep = eopat[pin]) != 0) {
780 			while (ci.CharAt(bp) && bp < ep)
781 				*dst++ = ci.CharAt(bp++);
782 			if (bp < ep)
783 				return 0;
784 		}
785 	}
786 	*dst = (char) 0;
787 	return 1;
788 }
789