1 /* Distributed under the OSI-approved BSD 3-Clause License. See accompanying
2 file Copyright.txt or https://cmake.org/licensing#kwsys for details. */
3 //
4 // Copyright (C) 1991 Texas Instruments Incorporated.
5 //
6 // Permission is granted to any individual or institution to use, copy, modify
7 // and distribute this software, provided that this complete copyright and
8 // permission notice is maintained, intact, in all copies and supporting
9 // documentation.
10 //
11 // Texas Instruments Incorporated provides this software "as is" without
12 // express or implied warranty.
13 //
14 //
15 // Created: MNF 06/13/89 Initial Design and Implementation
16 // Updated: LGO 08/09/89 Inherit from Generic
17 // Updated: MBN 09/07/89 Added conditional exception handling
18 // Updated: MBN 12/15/89 Sprinkled "const" qualifiers all over the place!
19 // Updated: DLS 03/22/91 New lite version
20 //
21
22 #include "kwsysPrivate.h"
23 #include KWSYS_HEADER(RegularExpression.hxx)
24
25 // Work-around CMake dependency scanning limitation. This must
26 // duplicate the above list of headers.
27 #if 0
28 # include "RegularExpression.hxx.in"
29 #endif
30
31 #include <stdio.h>
32 #include <string.h>
33
34 namespace KWSYS_NAMESPACE {
35
36 // RegularExpression -- Copies the given regular expression.
RegularExpression(const RegularExpression & rxp)37 RegularExpression::RegularExpression(const RegularExpression& rxp)
38 {
39 if (!rxp.program) {
40 this->program = KWSYS_NULLPTR;
41 return;
42 }
43 int ind;
44 this->progsize = rxp.progsize; // Copy regular expression size
45 this->program = new char[this->progsize]; // Allocate storage
46 for (ind = this->progsize; ind-- != 0;) // Copy regular expression
47 this->program[ind] = rxp.program[ind];
48 // Copy pointers into last successful "find" operation
49 this->regmatch = rxp.regmatch;
50 this->regmust = rxp.regmust; // Copy field
51 if (rxp.regmust != KWSYS_NULLPTR) {
52 char* dum = rxp.program;
53 ind = 0;
54 while (dum != rxp.regmust) {
55 ++dum;
56 ++ind;
57 }
58 this->regmust = this->program + ind;
59 }
60 this->regstart = rxp.regstart; // Copy starting index
61 this->reganch = rxp.reganch; // Copy remaining private data
62 this->regmlen = rxp.regmlen; // Copy remaining private data
63 }
64
65 // operator= -- Copies the given regular expression.
operator =(const RegularExpression & rxp)66 RegularExpression& RegularExpression::operator=(const RegularExpression& rxp)
67 {
68 if (this == &rxp) {
69 return *this;
70 }
71 if (!rxp.program) {
72 this->program = KWSYS_NULLPTR;
73 return *this;
74 }
75 int ind;
76 this->progsize = rxp.progsize; // Copy regular expression size
77 delete[] this->program;
78 this->program = new char[this->progsize]; // Allocate storage
79 for (ind = this->progsize; ind-- != 0;) // Copy regular expression
80 this->program[ind] = rxp.program[ind];
81 // Copy pointers into last successful "find" operation
82 this->regmatch = rxp.regmatch;
83 this->regmust = rxp.regmust; // Copy field
84 if (rxp.regmust != KWSYS_NULLPTR) {
85 char* dum = rxp.program;
86 ind = 0;
87 while (dum != rxp.regmust) {
88 ++dum;
89 ++ind;
90 }
91 this->regmust = this->program + ind;
92 }
93 this->regstart = rxp.regstart; // Copy starting index
94 this->reganch = rxp.reganch; // Copy remaining private data
95 this->regmlen = rxp.regmlen; // Copy remaining private data
96
97 return *this;
98 }
99
100 // operator== -- Returns true if two regular expressions have the same
101 // compiled program for pattern matching.
operator ==(const RegularExpression & rxp) const102 bool RegularExpression::operator==(const RegularExpression& rxp) const
103 {
104 if (this != &rxp) { // Same address?
105 int ind = this->progsize; // Get regular expression size
106 if (ind != rxp.progsize) // If different size regexp
107 return false; // Return failure
108 while (ind-- != 0) // Else while still characters
109 if (this->program[ind] != rxp.program[ind]) // If regexp are different
110 return false; // Return failure
111 }
112 return true; // Else same, return success
113 }
114
115 // deep_equal -- Returns true if have the same compiled regular expressions
116 // and the same start and end pointers.
117
deep_equal(const RegularExpression & rxp) const118 bool RegularExpression::deep_equal(const RegularExpression& rxp) const
119 {
120 int ind = this->progsize; // Get regular expression size
121 if (ind != rxp.progsize) // If different size regexp
122 return false; // Return failure
123 while (ind-- != 0) // Else while still characters
124 if (this->program[ind] != rxp.program[ind]) // If regexp are different
125 return false; // Return failure
126 // Else if same start/end ptrs, return true
127 return (this->regmatch.start() == rxp.regmatch.start() &&
128 this->regmatch.end() == rxp.regmatch.end());
129 }
130
131 // The remaining code in this file is derived from the regular expression code
132 // whose copyright statement appears below. It has been changed to work
133 // with the class concepts of C++ and COOL.
134
135 /*
136 * compile and find
137 *
138 * Copyright (c) 1986 by University of Toronto.
139 * Written by Henry Spencer. Not derived from licensed software.
140 *
141 * Permission is granted to anyone to use this software for any
142 * purpose on any computer system, and to redistribute it freely,
143 * subject to the following restrictions:
144 *
145 * 1. The author is not responsible for the consequences of use of
146 * this software, no matter how awful, even if they arise
147 * from defects in it.
148 *
149 * 2. The origin of this software must not be misrepresented, either
150 * by explicit claim or by omission.
151 *
152 * 3. Altered versions must be plainly marked as such, and must not
153 * be misrepresented as being the original software.
154 *
155 * Beware that some of this code is subtly aware of the way operator
156 * precedence is structured in regular expressions. Serious changes in
157 * regular-expression syntax might require a total rethink.
158 */
159
160 /*
161 * The "internal use only" fields in regexp.h are present to pass info from
162 * compile to execute that permits the execute phase to run lots faster on
163 * simple cases. They are:
164 *
165 * regstart char that must begin a match; '\0' if none obvious
166 * reganch is the match anchored (at beginning-of-line only)?
167 * regmust string (pointer into program) that match must include, or NULL
168 * regmlen length of regmust string
169 *
170 * Regstart and reganch permit very fast decisions on suitable starting points
171 * for a match, cutting down the work a lot. Regmust permits fast rejection
172 * of lines that cannot possibly match. The regmust tests are costly enough
173 * that compile() supplies a regmust only if the r.e. contains something
174 * potentially expensive (at present, the only such thing detected is * or +
175 * at the start of the r.e., which can involve a lot of backup). Regmlen is
176 * supplied because the test in find() needs it and compile() is computing
177 * it anyway.
178 */
179
180 /*
181 * Structure for regexp "program". This is essentially a linear encoding
182 * of a nondeterministic finite-state machine (aka syntax charts or
183 * "railroad normal form" in parsing technology). Each node is an opcode
184 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
185 * all nodes except BRANCH implement concatenation; a "next" pointer with
186 * a BRANCH on both ends of it is connecting two alternatives. (Here we
187 * have one of the subtle syntax dependencies: an individual BRANCH (as
188 * opposed to a collection of them) is never concatenated with anything
189 * because of operator precedence.) The operand of some types of node is
190 * a literal string; for others, it is a node leading into a sub-FSM. In
191 * particular, the operand of a BRANCH node is the first node of the branch.
192 * (NB this is *not* a tree structure: the tail of the branch connects
193 * to the thing following the set of BRANCHes.) The opcodes are:
194 */
195
196 // definition number opnd? meaning
197 #define END 0 // no End of program.
198 #define BOL 1 // no Match "" at beginning of line.
199 #define EOL 2 // no Match "" at end of line.
200 #define ANY 3 // no Match any one character.
201 #define ANYOF 4 // str Match any character in this string.
202 #define ANYBUT \
203 5 // str Match any character not in this
204 // string.
205 #define BRANCH \
206 6 // node Match this alternative, or the
207 // next...
208 #define BACK 7 // no Match "", "next" ptr points backward.
209 #define EXACTLY 8 // str Match this string.
210 #define NOTHING 9 // no Match empty string.
211 #define STAR \
212 10 // node Match this (simple) thing 0 or more
213 // times.
214 #define PLUS \
215 11 // node Match this (simple) thing 1 or more
216 // times.
217 #define OPEN \
218 20 // no Mark this point in input as start of
219 // #n.
220 // OPEN+1 is number 1, etc.
221 #define CLOSE 30 // no Analogous to OPEN.
222
223 /*
224 * Opcode notes:
225 *
226 * BRANCH The set of branches constituting a single choice are hooked
227 * together with their "next" pointers, since precedence prevents
228 * anything being concatenated to any individual branch. The
229 * "next" pointer of the last BRANCH in a choice points to the
230 * thing following the whole choice. This is also where the
231 * final "next" pointer of each individual branch points; each
232 * branch starts with the operand node of a BRANCH node.
233 *
234 * BACK Normal "next" pointers all implicitly point forward; BACK
235 * exists to make loop structures possible.
236 *
237 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
238 * BRANCH structures using BACK. Simple cases (one character
239 * per match) are implemented with STAR and PLUS for speed
240 * and to minimize recursive plunges.
241 *
242 * OPEN,CLOSE ...are numbered at compile time.
243 */
244
245 /*
246 * A node is one char of opcode followed by two chars of "next" pointer.
247 * "Next" pointers are stored as two 8-bit pieces, high order first. The
248 * value is a positive offset from the opcode of the node containing it.
249 * An operand, if any, simply follows the node. (Note that much of the
250 * code generation knows about this implicit relationship.)
251 *
252 * Using two bytes for the "next" pointer is vast overkill for most things,
253 * but allows patterns to get big without disasters.
254 */
255
256 #define OP(p) (*(p))
257 #define NEXT(p) (((*((p) + 1) & 0377) << 8) + (*((p) + 2) & 0377))
258 #define OPERAND(p) ((p) + 3)
259
260 const unsigned char MAGIC = 0234;
261 /*
262 * Utility definitions.
263 */
264
265 #define UCHARAT(p) (reinterpret_cast<const unsigned char*>(p))[0]
266
267 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
268 #define META "^$.[()|?+*\\"
269
270 /*
271 * Flags to be passed up and down.
272 */
273 #define HASWIDTH 01 // Known never to match null string.
274 #define SIMPLE 02 // Simple enough to be STAR/PLUS operand.
275 #define SPSTART 04 // Starts with * or +.
276 #define WORST 0 // Worst case.
277
278 /////////////////////////////////////////////////////////////////////////
279 //
280 // COMPILE AND ASSOCIATED FUNCTIONS
281 //
282 /////////////////////////////////////////////////////////////////////////
283
284 /*
285 * Read only utility variables.
286 */
287 static char regdummy;
288 static char* const regdummyptr = ®dummy;
289
290 /*
291 * Utility class for RegularExpression::compile().
292 */
293 class RegExpCompile
294 {
295 public:
296 const char* regparse; // Input-scan pointer.
297 int regnpar; // () count.
298 char* regcode; // Code-emit pointer; regdummyptr = don't.
299 long regsize; // Code size.
300
301 char* reg(int, int*);
302 char* regbranch(int*);
303 char* regpiece(int*);
304 char* regatom(int*);
305 char* regnode(char);
306 void regc(char);
307 void reginsert(char, char*);
308 static void regtail(char*, const char*);
309 static void regoptail(char*, const char*);
310 };
311
312 static const char* regnext(const char*);
313 static char* regnext(char*);
314
315 #ifdef STRCSPN
316 static int strcspn();
317 #endif
318
319 /*
320 * We can't allocate space until we know how big the compiled form will be,
321 * but we can't compile it (and thus know how big it is) until we've got a
322 * place to put the code. So we cheat: we compile it twice, once with code
323 * generation turned off and size counting turned on, and once "for real".
324 * This also means that we don't allocate space until we are sure that the
325 * thing really will compile successfully, and we never have to move the
326 * code and thus invalidate pointers into it. (Note that it has to be in
327 * one piece because free() must be able to free it all.)
328 *
329 * Beware that the optimization-preparation code in here knows about some
330 * of the structure of the compiled regexp.
331 */
332
333 // compile -- compile a regular expression into internal code
334 // for later pattern matching.
335
compile(const char * exp)336 bool RegularExpression::compile(const char* exp)
337 {
338 const char* scan;
339 const char* longest;
340 size_t len;
341 int flags;
342
343 if (exp == KWSYS_NULLPTR) {
344 // RAISE Error, SYM(RegularExpression), SYM(No_Expr),
345 printf("RegularExpression::compile(): No expression supplied.\n");
346 return false;
347 }
348
349 // First pass: determine size, legality.
350 RegExpCompile comp;
351 comp.regparse = exp;
352 comp.regnpar = 1;
353 comp.regsize = 0L;
354 comp.regcode = regdummyptr;
355 comp.regc(static_cast<char>(MAGIC));
356 if (!comp.reg(0, &flags)) {
357 printf("RegularExpression::compile(): Error in compile.\n");
358 return false;
359 }
360 this->regmatch.clear();
361
362 // Small enough for pointer-storage convention?
363 if (comp.regsize >= 32767L) { // Probably could be 65535L.
364 // RAISE Error, SYM(RegularExpression), SYM(Expr_Too_Big),
365 printf("RegularExpression::compile(): Expression too big.\n");
366 return false;
367 }
368
369 // Allocate space.
370 //#ifndef _WIN32
371 if (this->program != KWSYS_NULLPTR)
372 delete[] this->program;
373 //#endif
374 this->program = new char[comp.regsize];
375 this->progsize = static_cast<int>(comp.regsize);
376
377 if (this->program == KWSYS_NULLPTR) {
378 // RAISE Error, SYM(RegularExpression), SYM(Out_Of_Memory),
379 printf("RegularExpression::compile(): Out of memory.\n");
380 return false;
381 }
382
383 // Second pass: emit code.
384 comp.regparse = exp;
385 comp.regnpar = 1;
386 comp.regcode = this->program;
387 comp.regc(static_cast<char>(MAGIC));
388 comp.reg(0, &flags);
389
390 // Dig out information for optimizations.
391 this->regstart = '\0'; // Worst-case defaults.
392 this->reganch = 0;
393 this->regmust = KWSYS_NULLPTR;
394 this->regmlen = 0;
395 scan = this->program + 1; // First BRANCH.
396 if (OP(regnext(scan)) == END) { // Only one top-level choice.
397 scan = OPERAND(scan);
398
399 // Starting-point info.
400 if (OP(scan) == EXACTLY)
401 this->regstart = *OPERAND(scan);
402 else if (OP(scan) == BOL)
403 this->reganch++;
404
405 //
406 // If there's something expensive in the r.e., find the longest
407 // literal string that must appear and make it the regmust. Resolve
408 // ties in favor of later strings, since the regstart check works
409 // with the beginning of the r.e. and avoiding duplication
410 // strengthens checking. Not a strong reason, but sufficient in the
411 // absence of others.
412 //
413 if (flags & SPSTART) {
414 longest = KWSYS_NULLPTR;
415 len = 0;
416 for (; scan != KWSYS_NULLPTR; scan = regnext(scan))
417 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
418 longest = OPERAND(scan);
419 len = strlen(OPERAND(scan));
420 }
421 this->regmust = longest;
422 this->regmlen = len;
423 }
424 }
425 return true;
426 }
427
428 /*
429 - reg - regular expression, i.e. main body or parenthesized thing
430 *
431 * Caller must absorb opening parenthesis.
432 *
433 * Combining parenthesis handling with the base level of regular expression
434 * is a trifle forced, but the need to tie the tails of the branches to what
435 * follows makes it hard to avoid.
436 */
reg(int paren,int * flagp)437 char* RegExpCompile::reg(int paren, int* flagp)
438 {
439 char* ret;
440 char* br;
441 char* ender;
442 int parno = 0;
443 int flags;
444
445 *flagp = HASWIDTH; // Tentatively.
446
447 // Make an OPEN node, if parenthesized.
448 if (paren) {
449 if (regnpar >= RegularExpressionMatch::NSUBEXP) {
450 // RAISE Error, SYM(RegularExpression), SYM(Too_Many_Parens),
451 printf("RegularExpression::compile(): Too many parentheses.\n");
452 return KWSYS_NULLPTR;
453 }
454 parno = regnpar;
455 regnpar++;
456 ret = regnode(static_cast<char>(OPEN + parno));
457 } else
458 ret = KWSYS_NULLPTR;
459
460 // Pick up the branches, linking them together.
461 br = regbranch(&flags);
462 if (br == KWSYS_NULLPTR)
463 return (KWSYS_NULLPTR);
464 if (ret != KWSYS_NULLPTR)
465 regtail(ret, br); // OPEN -> first.
466 else
467 ret = br;
468 if (!(flags & HASWIDTH))
469 *flagp &= ~HASWIDTH;
470 *flagp |= flags & SPSTART;
471 while (*regparse == '|') {
472 regparse++;
473 br = regbranch(&flags);
474 if (br == KWSYS_NULLPTR)
475 return (KWSYS_NULLPTR);
476 regtail(ret, br); // BRANCH -> BRANCH.
477 if (!(flags & HASWIDTH))
478 *flagp &= ~HASWIDTH;
479 *flagp |= flags & SPSTART;
480 }
481
482 // Make a closing node, and hook it on the end.
483 ender = regnode(static_cast<char>((paren) ? CLOSE + parno : END));
484 regtail(ret, ender);
485
486 // Hook the tails of the branches to the closing node.
487 for (br = ret; br != KWSYS_NULLPTR; br = regnext(br))
488 regoptail(br, ender);
489
490 // Check for proper termination.
491 if (paren && *regparse++ != ')') {
492 // RAISE Error, SYM(RegularExpression), SYM(Unmatched_Parens),
493 printf("RegularExpression::compile(): Unmatched parentheses.\n");
494 return KWSYS_NULLPTR;
495 } else if (!paren && *regparse != '\0') {
496 if (*regparse == ')') {
497 // RAISE Error, SYM(RegularExpression), SYM(Unmatched_Parens),
498 printf("RegularExpression::compile(): Unmatched parentheses.\n");
499 return KWSYS_NULLPTR;
500 } else {
501 // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
502 printf("RegularExpression::compile(): Internal error.\n");
503 return KWSYS_NULLPTR;
504 }
505 // NOTREACHED
506 }
507 return (ret);
508 }
509
510 /*
511 - regbranch - one alternative of an | operator
512 *
513 * Implements the concatenation operator.
514 */
regbranch(int * flagp)515 char* RegExpCompile::regbranch(int* flagp)
516 {
517 char* ret;
518 char* chain;
519 char* latest;
520 int flags;
521
522 *flagp = WORST; // Tentatively.
523
524 ret = regnode(BRANCH);
525 chain = KWSYS_NULLPTR;
526 while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
527 latest = regpiece(&flags);
528 if (latest == KWSYS_NULLPTR)
529 return (KWSYS_NULLPTR);
530 *flagp |= flags & HASWIDTH;
531 if (chain == KWSYS_NULLPTR) // First piece.
532 *flagp |= flags & SPSTART;
533 else
534 regtail(chain, latest);
535 chain = latest;
536 }
537 if (chain == KWSYS_NULLPTR) // Loop ran zero times.
538 regnode(NOTHING);
539
540 return (ret);
541 }
542
543 /*
544 - regpiece - something followed by possible [*+?]
545 *
546 * Note that the branching code sequences used for ? and the general cases
547 * of * and + are somewhat optimized: they use the same NOTHING node as
548 * both the endmarker for their branch list and the body of the last branch.
549 * It might seem that this node could be dispensed with entirely, but the
550 * endmarker role is not redundant.
551 */
regpiece(int * flagp)552 char* RegExpCompile::regpiece(int* flagp)
553 {
554 char* ret;
555 char op;
556 char* next;
557 int flags;
558
559 ret = regatom(&flags);
560 if (ret == KWSYS_NULLPTR)
561 return (KWSYS_NULLPTR);
562
563 op = *regparse;
564 if (!ISMULT(op)) {
565 *flagp = flags;
566 return (ret);
567 }
568
569 if (!(flags & HASWIDTH) && op != '?') {
570 // RAISE Error, SYM(RegularExpression), SYM(Empty_Operand),
571 printf("RegularExpression::compile() : *+ operand could be empty.\n");
572 return KWSYS_NULLPTR;
573 }
574 *flagp = (op != '+') ? (WORST | SPSTART) : (WORST | HASWIDTH);
575
576 if (op == '*' && (flags & SIMPLE))
577 reginsert(STAR, ret);
578 else if (op == '*') {
579 // Emit x* as (x&|), where & means "self".
580 reginsert(BRANCH, ret); // Either x
581 regoptail(ret, regnode(BACK)); // and loop
582 regoptail(ret, ret); // back
583 regtail(ret, regnode(BRANCH)); // or
584 regtail(ret, regnode(NOTHING)); // null.
585 } else if (op == '+' && (flags & SIMPLE))
586 reginsert(PLUS, ret);
587 else if (op == '+') {
588 // Emit x+ as x(&|), where & means "self".
589 next = regnode(BRANCH); // Either
590 regtail(ret, next);
591 regtail(regnode(BACK), ret); // loop back
592 regtail(next, regnode(BRANCH)); // or
593 regtail(ret, regnode(NOTHING)); // null.
594 } else if (op == '?') {
595 // Emit x? as (x|)
596 reginsert(BRANCH, ret); // Either x
597 regtail(ret, regnode(BRANCH)); // or
598 next = regnode(NOTHING); // null.
599 regtail(ret, next);
600 regoptail(ret, next);
601 }
602 regparse++;
603 if (ISMULT(*regparse)) {
604 // RAISE Error, SYM(RegularExpression), SYM(Nested_Operand),
605 printf("RegularExpression::compile(): Nested *?+.\n");
606 return KWSYS_NULLPTR;
607 }
608 return (ret);
609 }
610
611 /*
612 - regatom - the lowest level
613 *
614 * Optimization: gobbles an entire sequence of ordinary characters so that
615 * it can turn them into a single node, which is smaller to store and
616 * faster to run. Backslashed characters are exceptions, each becoming a
617 * separate node; the code is simpler that way and it's not worth fixing.
618 */
regatom(int * flagp)619 char* RegExpCompile::regatom(int* flagp)
620 {
621 char* ret;
622 int flags;
623
624 *flagp = WORST; // Tentatively.
625
626 switch (*regparse++) {
627 case '^':
628 ret = regnode(BOL);
629 break;
630 case '$':
631 ret = regnode(EOL);
632 break;
633 case '.':
634 ret = regnode(ANY);
635 *flagp |= HASWIDTH | SIMPLE;
636 break;
637 case '[': {
638 int rxpclass;
639 int rxpclassend;
640
641 if (*regparse == '^') { // Complement of range.
642 ret = regnode(ANYBUT);
643 regparse++;
644 } else
645 ret = regnode(ANYOF);
646 if (*regparse == ']' || *regparse == '-')
647 regc(*regparse++);
648 while (*regparse != '\0' && *regparse != ']') {
649 if (*regparse == '-') {
650 regparse++;
651 if (*regparse == ']' || *regparse == '\0')
652 regc('-');
653 else {
654 rxpclass = UCHARAT(regparse - 2) + 1;
655 rxpclassend = UCHARAT(regparse);
656 if (rxpclass > rxpclassend + 1) {
657 // RAISE Error, SYM(RegularExpression), SYM(Invalid_Range),
658 printf("RegularExpression::compile(): Invalid range in [].\n");
659 return KWSYS_NULLPTR;
660 }
661 for (; rxpclass <= rxpclassend; rxpclass++)
662 regc(static_cast<char>(rxpclass));
663 regparse++;
664 }
665 } else
666 regc(*regparse++);
667 }
668 regc('\0');
669 if (*regparse != ']') {
670 // RAISE Error, SYM(RegularExpression), SYM(Unmatched_Bracket),
671 printf("RegularExpression::compile(): Unmatched [].\n");
672 return KWSYS_NULLPTR;
673 }
674 regparse++;
675 *flagp |= HASWIDTH | SIMPLE;
676 } break;
677 case '(':
678 ret = reg(1, &flags);
679 if (ret == KWSYS_NULLPTR)
680 return (KWSYS_NULLPTR);
681 *flagp |= flags & (HASWIDTH | SPSTART);
682 break;
683 case '\0':
684 case '|':
685 case ')':
686 // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
687 printf("RegularExpression::compile(): Internal error.\n"); // Never here
688 return KWSYS_NULLPTR;
689 case '?':
690 case '+':
691 case '*':
692 // RAISE Error, SYM(RegularExpression), SYM(No_Operand),
693 printf("RegularExpression::compile(): ?+* follows nothing.\n");
694 return KWSYS_NULLPTR;
695 case '\\':
696 if (*regparse == '\0') {
697 // RAISE Error, SYM(RegularExpression), SYM(Trailing_Backslash),
698 printf("RegularExpression::compile(): Trailing backslash.\n");
699 return KWSYS_NULLPTR;
700 }
701 ret = regnode(EXACTLY);
702 regc(*regparse++);
703 regc('\0');
704 *flagp |= HASWIDTH | SIMPLE;
705 break;
706 default: {
707 int len;
708 char ender;
709
710 regparse--;
711 len = int(strcspn(regparse, META));
712 if (len <= 0) {
713 // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
714 printf("RegularExpression::compile(): Internal error.\n");
715 return KWSYS_NULLPTR;
716 }
717 ender = *(regparse + len);
718 if (len > 1 && ISMULT(ender))
719 len--; // Back off clear of ?+* operand.
720 *flagp |= HASWIDTH;
721 if (len == 1)
722 *flagp |= SIMPLE;
723 ret = regnode(EXACTLY);
724 while (len > 0) {
725 regc(*regparse++);
726 len--;
727 }
728 regc('\0');
729 } break;
730 }
731 return (ret);
732 }
733
734 /*
735 - regnode - emit a node
736 Location.
737 */
regnode(char op)738 char* RegExpCompile::regnode(char op)
739 {
740 char* ret;
741 char* ptr;
742
743 ret = regcode;
744 if (ret == regdummyptr) {
745 regsize += 3;
746 return (ret);
747 }
748
749 ptr = ret;
750 *ptr++ = op;
751 *ptr++ = '\0'; // Null "next" pointer.
752 *ptr++ = '\0';
753 regcode = ptr;
754
755 return (ret);
756 }
757
758 /*
759 - regc - emit (if appropriate) a byte of code
760 */
regc(char b)761 void RegExpCompile::regc(char b)
762 {
763 if (regcode != regdummyptr)
764 *regcode++ = b;
765 else
766 regsize++;
767 }
768
769 /*
770 - reginsert - insert an operator in front of already-emitted operand
771 *
772 * Means relocating the operand.
773 */
reginsert(char op,char * opnd)774 void RegExpCompile::reginsert(char op, char* opnd)
775 {
776 char* src;
777 char* dst;
778 char* place;
779
780 if (regcode == regdummyptr) {
781 regsize += 3;
782 return;
783 }
784
785 src = regcode;
786 regcode += 3;
787 dst = regcode;
788 while (src > opnd)
789 *--dst = *--src;
790
791 place = opnd; // Op node, where operand used to be.
792 *place++ = op;
793 *place++ = '\0';
794 *place = '\0';
795 }
796
797 /*
798 - regtail - set the next-pointer at the end of a node chain
799 */
regtail(char * p,const char * val)800 void RegExpCompile::regtail(char* p, const char* val)
801 {
802 char* scan;
803 char* temp;
804 int offset;
805
806 if (p == regdummyptr)
807 return;
808
809 // Find last node.
810 scan = p;
811 for (;;) {
812 temp = regnext(scan);
813 if (temp == KWSYS_NULLPTR)
814 break;
815 scan = temp;
816 }
817
818 if (OP(scan) == BACK)
819 offset = int(scan - val);
820 else
821 offset = int(val - scan);
822 *(scan + 1) = static_cast<char>((offset >> 8) & 0377);
823 *(scan + 2) = static_cast<char>(offset & 0377);
824 }
825
826 /*
827 - regoptail - regtail on operand of first argument; nop if operandless
828 */
regoptail(char * p,const char * val)829 void RegExpCompile::regoptail(char* p, const char* val)
830 {
831 // "Operandless" and "op != BRANCH" are synonymous in practice.
832 if (p == KWSYS_NULLPTR || p == regdummyptr || OP(p) != BRANCH)
833 return;
834 regtail(OPERAND(p), val);
835 }
836
837 ////////////////////////////////////////////////////////////////////////
838 //
839 // find and friends
840 //
841 ////////////////////////////////////////////////////////////////////////
842
843 /*
844 * Utility class for RegularExpression::find().
845 */
846 class RegExpFind
847 {
848 public:
849 const char* reginput; // String-input pointer.
850 const char* regbol; // Beginning of input, for ^ check.
851 const char** regstartp; // Pointer to startp array.
852 const char** regendp; // Ditto for endp.
853
854 int regtry(const char*, const char**, const char**, const char*);
855 int regmatch(const char*);
856 int regrepeat(const char*);
857 };
858
859 // find -- Matches the regular expression to the given string.
860 // Returns true if found, and sets start and end indexes accordingly.
find(char const * string,RegularExpressionMatch & rmatch) const861 bool RegularExpression::find(char const* string,
862 RegularExpressionMatch& rmatch) const
863 {
864 const char* s;
865
866 rmatch.clear();
867 rmatch.searchstring = string;
868
869 if (!this->program) {
870 return false;
871 }
872
873 // Check validity of program.
874 if (UCHARAT(this->program) != MAGIC) {
875 // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
876 printf(
877 "RegularExpression::find(): Compiled regular expression corrupted.\n");
878 return false;
879 }
880
881 // If there is a "must appear" string, look for it.
882 if (this->regmust != KWSYS_NULLPTR) {
883 s = string;
884 while ((s = strchr(s, this->regmust[0])) != KWSYS_NULLPTR) {
885 if (strncmp(s, this->regmust, this->regmlen) == 0)
886 break; // Found it.
887 s++;
888 }
889 if (s == KWSYS_NULLPTR) // Not present.
890 return false;
891 }
892
893 RegExpFind regFind;
894
895 // Mark beginning of line for ^ .
896 regFind.regbol = string;
897
898 // Simplest case: anchored match need be tried only once.
899 if (this->reganch)
900 return (
901 regFind.regtry(string, rmatch.startp, rmatch.endp, this->program) != 0);
902
903 // Messy cases: unanchored match.
904 s = string;
905 if (this->regstart != '\0')
906 // We know what char it must start with.
907 while ((s = strchr(s, this->regstart)) != KWSYS_NULLPTR) {
908 if (regFind.regtry(s, rmatch.startp, rmatch.endp, this->program))
909 return true;
910 s++;
911 }
912 else
913 // We don't -- general case.
914 do {
915 if (regFind.regtry(s, rmatch.startp, rmatch.endp, this->program))
916 return true;
917 } while (*s++ != '\0');
918
919 // Failure.
920 return false;
921 }
922
923 /*
924 - regtry - try match at specific point
925 0 failure, 1 success
926 */
regtry(const char * string,const char ** start,const char ** end,const char * prog)927 int RegExpFind::regtry(const char* string, const char** start,
928 const char** end, const char* prog)
929 {
930 int i;
931 const char** sp1;
932 const char** ep;
933
934 reginput = string;
935 regstartp = start;
936 regendp = end;
937
938 sp1 = start;
939 ep = end;
940 for (i = RegularExpressionMatch::NSUBEXP; i > 0; i--) {
941 *sp1++ = KWSYS_NULLPTR;
942 *ep++ = KWSYS_NULLPTR;
943 }
944 if (regmatch(prog + 1)) {
945 start[0] = string;
946 end[0] = reginput;
947 return (1);
948 } else
949 return (0);
950 }
951
952 /*
953 - regmatch - main matching routine
954 *
955 * Conceptually the strategy is simple: check to see whether the current
956 * node matches, call self recursively to see whether the rest matches,
957 * and then act accordingly. In practice we make some effort to avoid
958 * recursion, in particular by going through "ordinary" nodes (that don't
959 * need to know whether the rest of the match failed) by a loop instead of
960 * by recursion.
961 * 0 failure, 1 success
962 */
regmatch(const char * prog)963 int RegExpFind::regmatch(const char* prog)
964 {
965 const char* scan; // Current node.
966 const char* next; // Next node.
967
968 scan = prog;
969
970 while (scan != KWSYS_NULLPTR) {
971
972 next = regnext(scan);
973
974 switch (OP(scan)) {
975 case BOL:
976 if (reginput != regbol)
977 return (0);
978 break;
979 case EOL:
980 if (*reginput != '\0')
981 return (0);
982 break;
983 case ANY:
984 if (*reginput == '\0')
985 return (0);
986 reginput++;
987 break;
988 case EXACTLY: {
989 size_t len;
990 const char* opnd;
991
992 opnd = OPERAND(scan);
993 // Inline the first character, for speed.
994 if (*opnd != *reginput)
995 return (0);
996 len = strlen(opnd);
997 if (len > 1 && strncmp(opnd, reginput, len) != 0)
998 return (0);
999 reginput += len;
1000 } break;
1001 case ANYOF:
1002 if (*reginput == '\0' ||
1003 strchr(OPERAND(scan), *reginput) == KWSYS_NULLPTR)
1004 return (0);
1005 reginput++;
1006 break;
1007 case ANYBUT:
1008 if (*reginput == '\0' ||
1009 strchr(OPERAND(scan), *reginput) != KWSYS_NULLPTR)
1010 return (0);
1011 reginput++;
1012 break;
1013 case NOTHING:
1014 break;
1015 case BACK:
1016 break;
1017 case OPEN + 1:
1018 case OPEN + 2:
1019 case OPEN + 3:
1020 case OPEN + 4:
1021 case OPEN + 5:
1022 case OPEN + 6:
1023 case OPEN + 7:
1024 case OPEN + 8:
1025 case OPEN + 9: {
1026 int no;
1027 const char* save;
1028
1029 no = OP(scan) - OPEN;
1030 save = reginput;
1031
1032 if (regmatch(next)) {
1033
1034 //
1035 // Don't set startp if some later invocation of the
1036 // same parentheses already has.
1037 //
1038 if (regstartp[no] == KWSYS_NULLPTR)
1039 regstartp[no] = save;
1040 return (1);
1041 } else
1042 return (0);
1043 }
1044 // break;
1045 case CLOSE + 1:
1046 case CLOSE + 2:
1047 case CLOSE + 3:
1048 case CLOSE + 4:
1049 case CLOSE + 5:
1050 case CLOSE + 6:
1051 case CLOSE + 7:
1052 case CLOSE + 8:
1053 case CLOSE + 9: {
1054 int no;
1055 const char* save;
1056
1057 no = OP(scan) - CLOSE;
1058 save = reginput;
1059
1060 if (regmatch(next)) {
1061
1062 //
1063 // Don't set endp if some later invocation of the
1064 // same parentheses already has.
1065 //
1066 if (regendp[no] == KWSYS_NULLPTR)
1067 regendp[no] = save;
1068 return (1);
1069 } else
1070 return (0);
1071 }
1072 // break;
1073 case BRANCH: {
1074
1075 const char* save;
1076
1077 if (OP(next) != BRANCH) // No choice.
1078 next = OPERAND(scan); // Avoid recursion.
1079 else {
1080 do {
1081 save = reginput;
1082 if (regmatch(OPERAND(scan)))
1083 return (1);
1084 reginput = save;
1085 scan = regnext(scan);
1086 } while (scan != KWSYS_NULLPTR && OP(scan) == BRANCH);
1087 return (0);
1088 // NOTREACHED
1089 }
1090 } break;
1091 case STAR:
1092 case PLUS: {
1093 char nextch;
1094 int no;
1095 const char* save;
1096 int min_no;
1097
1098 //
1099 // Lookahead to avoid useless match attempts when we know
1100 // what character comes next.
1101 //
1102 nextch = '\0';
1103 if (OP(next) == EXACTLY)
1104 nextch = *OPERAND(next);
1105 min_no = (OP(scan) == STAR) ? 0 : 1;
1106 save = reginput;
1107 no = regrepeat(OPERAND(scan));
1108 while (no >= min_no) {
1109 // If it could work, try it.
1110 if (nextch == '\0' || *reginput == nextch)
1111 if (regmatch(next))
1112 return (1);
1113 // Couldn't or didn't -- back up.
1114 no--;
1115 reginput = save + no;
1116 }
1117 return (0);
1118 }
1119 // break;
1120 case END:
1121 return (1); // Success!
1122
1123 default:
1124 // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
1125 printf(
1126 "RegularExpression::find(): Internal error -- memory corrupted.\n");
1127 return 0;
1128 }
1129 scan = next;
1130 }
1131
1132 //
1133 // We get here only if there's trouble -- normally "case END" is the
1134 // terminating point.
1135 //
1136 // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
1137 printf("RegularExpression::find(): Internal error -- corrupted pointers.\n");
1138 return (0);
1139 }
1140
1141 /*
1142 - regrepeat - repeatedly match something simple, report how many
1143 */
regrepeat(const char * p)1144 int RegExpFind::regrepeat(const char* p)
1145 {
1146 int count = 0;
1147 const char* scan;
1148 const char* opnd;
1149
1150 scan = reginput;
1151 opnd = OPERAND(p);
1152 switch (OP(p)) {
1153 case ANY:
1154 count = int(strlen(scan));
1155 scan += count;
1156 break;
1157 case EXACTLY:
1158 while (*opnd == *scan) {
1159 count++;
1160 scan++;
1161 }
1162 break;
1163 case ANYOF:
1164 while (*scan != '\0' && strchr(opnd, *scan) != KWSYS_NULLPTR) {
1165 count++;
1166 scan++;
1167 }
1168 break;
1169 case ANYBUT:
1170 while (*scan != '\0' && strchr(opnd, *scan) == KWSYS_NULLPTR) {
1171 count++;
1172 scan++;
1173 }
1174 break;
1175 default: // Oh dear. Called inappropriately.
1176 // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
1177 printf("cm RegularExpression::find(): Internal error.\n");
1178 return 0;
1179 }
1180 reginput = scan;
1181 return (count);
1182 }
1183
1184 /*
1185 - regnext - dig the "next" pointer out of a node
1186 */
regnext(const char * p)1187 static const char* regnext(const char* p)
1188 {
1189 int offset;
1190
1191 if (p == regdummyptr)
1192 return (KWSYS_NULLPTR);
1193
1194 offset = NEXT(p);
1195 if (offset == 0)
1196 return (KWSYS_NULLPTR);
1197
1198 if (OP(p) == BACK)
1199 return (p - offset);
1200 else
1201 return (p + offset);
1202 }
1203
regnext(char * p)1204 static char* regnext(char* p)
1205 {
1206 int offset;
1207
1208 if (p == regdummyptr)
1209 return (KWSYS_NULLPTR);
1210
1211 offset = NEXT(p);
1212 if (offset == 0)
1213 return (KWSYS_NULLPTR);
1214
1215 if (OP(p) == BACK)
1216 return (p - offset);
1217 else
1218 return (p + offset);
1219 }
1220
1221 } // namespace KWSYS_NAMESPACE
1222