1 /* Distributed under the OSI-approved BSD 3-Clause License. See accompanying
2 file Copyright.txt or https://cmake.org/licensing#kwsys for details. */
3 //
4 // Copyright (C) 1991 Texas Instruments Incorporated.
5 //
6 // Permission is granted to any individual or institution to use, copy, modify
7 // and distribute this software, provided that this complete copyright and
8 // permission notice is maintained, intact, in all copies and supporting
9 // documentation.
10 //
11 // Texas Instruments Incorporated provides this software "as is" without
12 // express or implied warranty.
13 //
14 //
15 // Created: MNF 06/13/89 Initial Design and Implementation
16 // Updated: LGO 08/09/89 Inherit from Generic
17 // Updated: MBN 09/07/89 Added conditional exception handling
18 // Updated: MBN 12/15/89 Sprinkled "const" qualifiers all over the place!
19 // Updated: DLS 03/22/91 New lite version
20 //
21
22 #include "kwsysPrivate.h"
23 #include KWSYS_HEADER(RegularExpression.hxx)
24
25 // Work-around CMake dependency scanning limitation. This must
26 // duplicate the above list of headers.
27 #if 0
28 # include "RegularExpression.hxx.in"
29 #endif
30
31 #include <cstdio>
32 #include <cstring>
33
34 namespace KWSYS_NAMESPACE {
35
36 // RegularExpression -- Copies the given regular expression.
RegularExpression(const RegularExpression & rxp)37 RegularExpression::RegularExpression(const RegularExpression& rxp)
38 {
39 if (!rxp.program) {
40 this->program = nullptr;
41 return;
42 }
43 int ind;
44 this->progsize = rxp.progsize; // Copy regular expression size
45 this->program = new char[this->progsize]; // Allocate storage
46 for (ind = this->progsize; ind-- != 0;) // Copy regular expression
47 this->program[ind] = rxp.program[ind];
48 // Copy pointers into last successful "find" operation
49 this->regmatch = rxp.regmatch;
50 this->regmust = rxp.regmust; // Copy field
51 if (rxp.regmust != nullptr) {
52 char* dum = rxp.program;
53 ind = 0;
54 while (dum != rxp.regmust) {
55 ++dum;
56 ++ind;
57 }
58 this->regmust = this->program + ind;
59 }
60 this->regstart = rxp.regstart; // Copy starting index
61 this->reganch = rxp.reganch; // Copy remaining private data
62 this->regmlen = rxp.regmlen; // Copy remaining private data
63 }
64
65 // operator= -- Copies the given regular expression.
operator =(const RegularExpression & rxp)66 RegularExpression& RegularExpression::operator=(const RegularExpression& rxp)
67 {
68 if (this == &rxp) {
69 return *this;
70 }
71 if (!rxp.program) {
72 this->program = nullptr;
73 return *this;
74 }
75 int ind;
76 this->progsize = rxp.progsize; // Copy regular expression size
77 delete[] this->program;
78 this->program = new char[this->progsize]; // Allocate storage
79 for (ind = this->progsize; ind-- != 0;) // Copy regular expression
80 this->program[ind] = rxp.program[ind];
81 // Copy pointers into last successful "find" operation
82 this->regmatch = rxp.regmatch;
83 this->regmust = rxp.regmust; // Copy field
84 if (rxp.regmust != nullptr) {
85 char* dum = rxp.program;
86 ind = 0;
87 while (dum != rxp.regmust) {
88 ++dum;
89 ++ind;
90 }
91 this->regmust = this->program + ind;
92 }
93 this->regstart = rxp.regstart; // Copy starting index
94 this->reganch = rxp.reganch; // Copy remaining private data
95 this->regmlen = rxp.regmlen; // Copy remaining private data
96
97 return *this;
98 }
99
100 // operator== -- Returns true if two regular expressions have the same
101 // compiled program for pattern matching.
operator ==(const RegularExpression & rxp) const102 bool RegularExpression::operator==(const RegularExpression& rxp) const
103 {
104 if (this != &rxp) { // Same address?
105 int ind = this->progsize; // Get regular expression size
106 if (ind != rxp.progsize) // If different size regexp
107 return false; // Return failure
108 while (ind-- != 0) // Else while still characters
109 if (this->program[ind] != rxp.program[ind]) // If regexp are different
110 return false; // Return failure
111 }
112 return true; // Else same, return success
113 }
114
115 // deep_equal -- Returns true if have the same compiled regular expressions
116 // and the same start and end pointers.
117
deep_equal(const RegularExpression & rxp) const118 bool RegularExpression::deep_equal(const RegularExpression& rxp) const
119 {
120 int ind = this->progsize; // Get regular expression size
121 if (ind != rxp.progsize) // If different size regexp
122 return false; // Return failure
123 while (ind-- != 0) // Else while still characters
124 if (this->program[ind] != rxp.program[ind]) // If regexp are different
125 return false; // Return failure
126 // Else if same start/end ptrs, return true
127 return (this->regmatch.start() == rxp.regmatch.start() &&
128 this->regmatch.end() == rxp.regmatch.end());
129 }
130
131 // The remaining code in this file is derived from the regular expression code
132 // whose copyright statement appears below. It has been changed to work
133 // with the class concepts of C++ and COOL.
134
135 /*
136 * compile and find
137 *
138 * Copyright (c) 1986 by University of Toronto.
139 * Written by Henry Spencer. Not derived from licensed software.
140 *
141 * Permission is granted to anyone to use this software for any
142 * purpose on any computer system, and to redistribute it freely,
143 * subject to the following restrictions:
144 *
145 * 1. The author is not responsible for the consequences of use of
146 * this software, no matter how awful, even if they arise
147 * from defects in it.
148 *
149 * 2. The origin of this software must not be misrepresented, either
150 * by explicit claim or by omission.
151 *
152 * 3. Altered versions must be plainly marked as such, and must not
153 * be misrepresented as being the original software.
154 *
155 * Beware that some of this code is subtly aware of the way operator
156 * precedence is structured in regular expressions. Serious changes in
157 * regular-expression syntax might require a total rethink.
158 */
159
160 /*
161 * The "internal use only" fields in regexp.h are present to pass info from
162 * compile to execute that permits the execute phase to run lots faster on
163 * simple cases. They are:
164 *
165 * regstart char that must begin a match; '\0' if none obvious
166 * reganch is the match anchored (at beginning-of-line only)?
167 * regmust string (pointer into program) that match must include, or
168 * nullptr regmlen length of regmust string
169 *
170 * Regstart and reganch permit very fast decisions on suitable starting points
171 * for a match, cutting down the work a lot. Regmust permits fast rejection
172 * of lines that cannot possibly match. The regmust tests are costly enough
173 * that compile() supplies a regmust only if the r.e. contains something
174 * potentially expensive (at present, the only such thing detected is * or +
175 * at the start of the r.e., which can involve a lot of backup). Regmlen is
176 * supplied because the test in find() needs it and compile() is computing
177 * it anyway.
178 */
179
180 /*
181 * Structure for regexp "program". This is essentially a linear encoding
182 * of a nondeterministic finite-state machine (aka syntax charts or
183 * "railroad normal form" in parsing technology). Each node is an opcode
184 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
185 * all nodes except BRANCH implement concatenation; a "next" pointer with
186 * a BRANCH on both ends of it is connecting two alternatives. (Here we
187 * have one of the subtle syntax dependencies: an individual BRANCH (as
188 * opposed to a collection of them) is never concatenated with anything
189 * because of operator precedence.) The operand of some types of node is
190 * a literal string; for others, it is a node leading into a sub-FSM. In
191 * particular, the operand of a BRANCH node is the first node of the branch.
192 * (NB this is *not* a tree structure: the tail of the branch connects
193 * to the thing following the set of BRANCHes.) The opcodes are:
194 */
195
196 // definition number opnd? meaning
197 #define END 0 // no End of program.
198 #define BOL 1 // no Match "" at beginning of line.
199 #define EOL 2 // no Match "" at end of line.
200 #define ANY 3 // no Match any one character.
201 #define ANYOF 4 // str Match any character in this string.
202 #define ANYBUT \
203 5 // str Match any character not in this
204 // string.
205 #define BRANCH \
206 6 // node Match this alternative, or the
207 // next...
208 #define BACK 7 // no Match "", "next" ptr points backward.
209 #define EXACTLY 8 // str Match this string.
210 #define NOTHING 9 // no Match empty string.
211 #define STAR \
212 10 // node Match this (simple) thing 0 or more
213 // times.
214 #define PLUS \
215 11 // node Match this (simple) thing 1 or more
216 // times.
217 #define OPEN \
218 20 // no Mark this point in input as start of
219 // #n.
220 // OPEN+1 is number 1, etc.
221 #define CLOSE 30 // no Analogous to OPEN.
222
223 /*
224 * Opcode notes:
225 *
226 * BRANCH The set of branches constituting a single choice are hooked
227 * together with their "next" pointers, since precedence prevents
228 * anything being concatenated to any individual branch. The
229 * "next" pointer of the last BRANCH in a choice points to the
230 * thing following the whole choice. This is also where the
231 * final "next" pointer of each individual branch points; each
232 * branch starts with the operand node of a BRANCH node.
233 *
234 * BACK Normal "next" pointers all implicitly point forward; BACK
235 * exists to make loop structures possible.
236 *
237 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
238 * BRANCH structures using BACK. Simple cases (one character
239 * per match) are implemented with STAR and PLUS for speed
240 * and to minimize recursive plunges.
241 *
242 * OPEN,CLOSE ...are numbered at compile time.
243 */
244
245 /*
246 * A node is one char of opcode followed by two chars of "next" pointer.
247 * "Next" pointers are stored as two 8-bit pieces, high order first. The
248 * value is a positive offset from the opcode of the node containing it.
249 * An operand, if any, simply follows the node. (Note that much of the
250 * code generation knows about this implicit relationship.)
251 *
252 * Using two bytes for the "next" pointer is vast overkill for most things,
253 * but allows patterns to get big without disasters.
254 */
255
256 #define OP(p) (*(p))
257 #define NEXT(p) (((*((p) + 1) & 0377) << 8) + (*((p) + 2) & 0377))
258 #define OPERAND(p) ((p) + 3)
259
260 const unsigned char MAGIC = 0234;
261 /*
262 * Utility definitions.
263 */
264
265 #define UCHARAT(p) (reinterpret_cast<const unsigned char*>(p))[0]
266
267 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
268 #define META "^$.[()|?+*\\"
269
270 /*
271 * Flags to be passed up and down.
272 */
273 #define HASWIDTH 01 // Known never to match null string.
274 #define SIMPLE 02 // Simple enough to be STAR/PLUS operand.
275 #define SPSTART 04 // Starts with * or +.
276 #define WORST 0 // Worst case.
277
278 /////////////////////////////////////////////////////////////////////////
279 //
280 // COMPILE AND ASSOCIATED FUNCTIONS
281 //
282 /////////////////////////////////////////////////////////////////////////
283
284 /*
285 * Read only utility variables.
286 */
287 static char regdummy;
288 static char* const regdummyptr = ®dummy;
289
290 /*
291 * Utility class for RegularExpression::compile().
292 */
293 class RegExpCompile
294 {
295 public:
296 const char* regparse; // Input-scan pointer.
297 int regnpar; // () count.
298 char* regcode; // Code-emit pointer; regdummyptr = don't.
299 long regsize; // Code size.
300
301 char* reg(int, int*);
302 char* regbranch(int*);
303 char* regpiece(int*);
304 char* regatom(int*);
305 char* regnode(char);
306 void regc(char);
307 void reginsert(char, char*);
308 static void regtail(char*, const char*);
309 static void regoptail(char*, const char*);
310 };
311
312 static const char* regnext(const char*);
313 static char* regnext(char*);
314
315 #ifdef STRCSPN
316 static int strcspn();
317 #endif
318
319 /*
320 * We can't allocate space until we know how big the compiled form will be,
321 * but we can't compile it (and thus know how big it is) until we've got a
322 * place to put the code. So we cheat: we compile it twice, once with code
323 * generation turned off and size counting turned on, and once "for real".
324 * This also means that we don't allocate space until we are sure that the
325 * thing really will compile successfully, and we never have to move the
326 * code and thus invalidate pointers into it. (Note that it has to be in
327 * one piece because free() must be able to free it all.)
328 *
329 * Beware that the optimization-preparation code in here knows about some
330 * of the structure of the compiled regexp.
331 */
332
333 // compile -- compile a regular expression into internal code
334 // for later pattern matching.
335
compile(const char * exp)336 bool RegularExpression::compile(const char* exp)
337 {
338 const char* scan;
339 const char* longest;
340 int flags;
341
342 if (exp == nullptr) {
343 // RAISE Error, SYM(RegularExpression), SYM(No_Expr),
344 printf("RegularExpression::compile(): No expression supplied.\n");
345 return false;
346 }
347
348 // First pass: determine size, legality.
349 RegExpCompile comp;
350 comp.regparse = exp;
351 comp.regnpar = 1;
352 comp.regsize = 0L;
353 comp.regcode = regdummyptr;
354 comp.regc(static_cast<char>(MAGIC));
355 if (!comp.reg(0, &flags)) {
356 printf("RegularExpression::compile(): Error in compile.\n");
357 return false;
358 }
359 this->regmatch.clear();
360
361 // Small enough for pointer-storage convention?
362 if (comp.regsize >= 65535L) {
363 // RAISE Error, SYM(RegularExpression), SYM(Expr_Too_Big),
364 printf("RegularExpression::compile(): Expression too big.\n");
365 return false;
366 }
367
368 // Allocate space.
369 //#ifndef _WIN32
370 delete[] this->program;
371 //#endif
372 this->program = new char[comp.regsize];
373 this->progsize = static_cast<int>(comp.regsize);
374
375 if (this->program == nullptr) {
376 // RAISE Error, SYM(RegularExpression), SYM(Out_Of_Memory),
377 printf("RegularExpression::compile(): Out of memory.\n");
378 return false;
379 }
380
381 // Second pass: emit code.
382 comp.regparse = exp;
383 comp.regnpar = 1;
384 comp.regcode = this->program;
385 comp.regc(static_cast<char>(MAGIC));
386 comp.reg(0, &flags);
387
388 // Dig out information for optimizations.
389 this->regstart = '\0'; // Worst-case defaults.
390 this->reganch = 0;
391 this->regmust = nullptr;
392 this->regmlen = 0;
393 scan = this->program + 1; // First BRANCH.
394 if (OP(regnext(scan)) == END) { // Only one top-level choice.
395 scan = OPERAND(scan);
396
397 // Starting-point info.
398 if (OP(scan) == EXACTLY)
399 this->regstart = *OPERAND(scan);
400 else if (OP(scan) == BOL)
401 this->reganch++;
402
403 //
404 // If there's something expensive in the r.e., find the longest
405 // literal string that must appear and make it the regmust. Resolve
406 // ties in favor of later strings, since the regstart check works
407 // with the beginning of the r.e. and avoiding duplication
408 // strengthens checking. Not a strong reason, but sufficient in the
409 // absence of others.
410 //
411 if (flags & SPSTART) {
412 longest = nullptr;
413 size_t len = 0;
414 for (; scan != nullptr; scan = regnext(scan))
415 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
416 longest = OPERAND(scan);
417 len = strlen(OPERAND(scan));
418 }
419 this->regmust = longest;
420 this->regmlen = len;
421 }
422 }
423 return true;
424 }
425
426 /*
427 - reg - regular expression, i.e. main body or parenthesized thing
428 *
429 * Caller must absorb opening parenthesis.
430 *
431 * Combining parenthesis handling with the base level of regular expression
432 * is a trifle forced, but the need to tie the tails of the branches to what
433 * follows makes it hard to avoid.
434 */
reg(int paren,int * flagp)435 char* RegExpCompile::reg(int paren, int* flagp)
436 {
437 char* ret;
438 char* br;
439 char* ender;
440 int parno = 0;
441 int flags;
442
443 *flagp = HASWIDTH; // Tentatively.
444
445 // Make an OPEN node, if parenthesized.
446 if (paren) {
447 if (regnpar >= RegularExpressionMatch::NSUBEXP) {
448 // RAISE Error, SYM(RegularExpression), SYM(Too_Many_Parens),
449 printf("RegularExpression::compile(): Too many parentheses.\n");
450 return nullptr;
451 }
452 parno = regnpar;
453 regnpar++;
454 ret = regnode(static_cast<char>(OPEN + parno));
455 } else
456 ret = nullptr;
457
458 // Pick up the branches, linking them together.
459 br = regbranch(&flags);
460 if (br == nullptr)
461 return (nullptr);
462 if (ret != nullptr)
463 regtail(ret, br); // OPEN -> first.
464 else
465 ret = br;
466 if (!(flags & HASWIDTH))
467 *flagp &= ~HASWIDTH;
468 *flagp |= flags & SPSTART;
469 while (*regparse == '|') {
470 regparse++;
471 br = regbranch(&flags);
472 if (br == nullptr)
473 return (nullptr);
474 regtail(ret, br); // BRANCH -> BRANCH.
475 if (!(flags & HASWIDTH))
476 *flagp &= ~HASWIDTH;
477 *flagp |= flags & SPSTART;
478 }
479
480 // Make a closing node, and hook it on the end.
481 ender = regnode(static_cast<char>((paren) ? CLOSE + parno : END));
482 regtail(ret, ender);
483
484 // Hook the tails of the branches to the closing node.
485 for (br = ret; br != nullptr; br = regnext(br))
486 regoptail(br, ender);
487
488 // Check for proper termination.
489 if (paren && *regparse++ != ')') {
490 // RAISE Error, SYM(RegularExpression), SYM(Unmatched_Parens),
491 printf("RegularExpression::compile(): Unmatched parentheses.\n");
492 return nullptr;
493 } else if (!paren && *regparse != '\0') {
494 if (*regparse == ')') {
495 // RAISE Error, SYM(RegularExpression), SYM(Unmatched_Parens),
496 printf("RegularExpression::compile(): Unmatched parentheses.\n");
497 return nullptr;
498 } else {
499 // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
500 printf("RegularExpression::compile(): Internal error.\n");
501 return nullptr;
502 }
503 // NOTREACHED
504 }
505 return (ret);
506 }
507
508 /*
509 - regbranch - one alternative of an | operator
510 *
511 * Implements the concatenation operator.
512 */
regbranch(int * flagp)513 char* RegExpCompile::regbranch(int* flagp)
514 {
515 char* ret;
516 char* chain;
517 char* latest;
518 int flags;
519
520 *flagp = WORST; // Tentatively.
521
522 ret = regnode(BRANCH);
523 chain = nullptr;
524 while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
525 latest = regpiece(&flags);
526 if (latest == nullptr)
527 return (nullptr);
528 *flagp |= flags & HASWIDTH;
529 if (chain == nullptr) // First piece.
530 *flagp |= flags & SPSTART;
531 else
532 regtail(chain, latest);
533 chain = latest;
534 }
535 if (chain == nullptr) // Loop ran zero times.
536 regnode(NOTHING);
537
538 return (ret);
539 }
540
541 /*
542 - regpiece - something followed by possible [*+?]
543 *
544 * Note that the branching code sequences used for ? and the general cases
545 * of * and + are somewhat optimized: they use the same NOTHING node as
546 * both the endmarker for their branch list and the body of the last branch.
547 * It might seem that this node could be dispensed with entirely, but the
548 * endmarker role is not redundant.
549 */
regpiece(int * flagp)550 char* RegExpCompile::regpiece(int* flagp)
551 {
552 char* ret;
553 char op;
554 char* next;
555 int flags;
556
557 ret = regatom(&flags);
558 if (ret == nullptr)
559 return (nullptr);
560
561 op = *regparse;
562 if (!ISMULT(op)) {
563 *flagp = flags;
564 return (ret);
565 }
566
567 if (!(flags & HASWIDTH) && op != '?') {
568 // RAISE Error, SYM(RegularExpression), SYM(Empty_Operand),
569 printf("RegularExpression::compile() : *+ operand could be empty.\n");
570 return nullptr;
571 }
572 *flagp = (op != '+') ? (WORST | SPSTART) : (WORST | HASWIDTH);
573
574 if (op == '*' && (flags & SIMPLE))
575 reginsert(STAR, ret);
576 else if (op == '*') {
577 // Emit x* as (x&|), where & means "self".
578 reginsert(BRANCH, ret); // Either x
579 regoptail(ret, regnode(BACK)); // and loop
580 regoptail(ret, ret); // back
581 regtail(ret, regnode(BRANCH)); // or
582 regtail(ret, regnode(NOTHING)); // null.
583 } else if (op == '+' && (flags & SIMPLE))
584 reginsert(PLUS, ret);
585 else if (op == '+') {
586 // Emit x+ as x(&|), where & means "self".
587 next = regnode(BRANCH); // Either
588 regtail(ret, next);
589 regtail(regnode(BACK), ret); // loop back
590 regtail(next, regnode(BRANCH)); // or
591 regtail(ret, regnode(NOTHING)); // null.
592 } else if (op == '?') {
593 // Emit x? as (x|)
594 reginsert(BRANCH, ret); // Either x
595 regtail(ret, regnode(BRANCH)); // or
596 next = regnode(NOTHING); // null.
597 regtail(ret, next);
598 regoptail(ret, next);
599 }
600 regparse++;
601 if (ISMULT(*regparse)) {
602 // RAISE Error, SYM(RegularExpression), SYM(Nested_Operand),
603 printf("RegularExpression::compile(): Nested *?+.\n");
604 return nullptr;
605 }
606 return (ret);
607 }
608
609 /*
610 - regatom - the lowest level
611 *
612 * Optimization: gobbles an entire sequence of ordinary characters so that
613 * it can turn them into a single node, which is smaller to store and
614 * faster to run. Backslashed characters are exceptions, each becoming a
615 * separate node; the code is simpler that way and it's not worth fixing.
616 */
regatom(int * flagp)617 char* RegExpCompile::regatom(int* flagp)
618 {
619 char* ret;
620 int flags;
621
622 *flagp = WORST; // Tentatively.
623
624 switch (*regparse++) {
625 case '^':
626 ret = regnode(BOL);
627 break;
628 case '$':
629 ret = regnode(EOL);
630 break;
631 case '.':
632 ret = regnode(ANY);
633 *flagp |= HASWIDTH | SIMPLE;
634 break;
635 case '[': {
636 int rxpclass;
637 int rxpclassend;
638
639 if (*regparse == '^') { // Complement of range.
640 ret = regnode(ANYBUT);
641 regparse++;
642 } else
643 ret = regnode(ANYOF);
644 if (*regparse == ']' || *regparse == '-')
645 regc(*regparse++);
646 while (*regparse != '\0' && *regparse != ']') {
647 if (*regparse == '-') {
648 regparse++;
649 if (*regparse == ']' || *regparse == '\0')
650 regc('-');
651 else {
652 rxpclass = UCHARAT(regparse - 2) + 1;
653 rxpclassend = UCHARAT(regparse);
654 if (rxpclass > rxpclassend + 1) {
655 // RAISE Error, SYM(RegularExpression), SYM(Invalid_Range),
656 printf("RegularExpression::compile(): Invalid range in [].\n");
657 return nullptr;
658 }
659 for (; rxpclass <= rxpclassend; rxpclass++)
660 regc(static_cast<char>(rxpclass));
661 regparse++;
662 }
663 } else
664 regc(*regparse++);
665 }
666 regc('\0');
667 if (*regparse != ']') {
668 // RAISE Error, SYM(RegularExpression), SYM(Unmatched_Bracket),
669 printf("RegularExpression::compile(): Unmatched [].\n");
670 return nullptr;
671 }
672 regparse++;
673 *flagp |= HASWIDTH | SIMPLE;
674 } break;
675 case '(':
676 ret = reg(1, &flags);
677 if (ret == nullptr)
678 return (nullptr);
679 *flagp |= flags & (HASWIDTH | SPSTART);
680 break;
681 case '\0':
682 case '|':
683 case ')':
684 // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
685 printf("RegularExpression::compile(): Internal error.\n"); // Never here
686 return nullptr;
687 case '?':
688 case '+':
689 case '*':
690 // RAISE Error, SYM(RegularExpression), SYM(No_Operand),
691 printf("RegularExpression::compile(): ?+* follows nothing.\n");
692 return nullptr;
693 case '\\':
694 if (*regparse == '\0') {
695 // RAISE Error, SYM(RegularExpression), SYM(Trailing_Backslash),
696 printf("RegularExpression::compile(): Trailing backslash.\n");
697 return nullptr;
698 }
699 ret = regnode(EXACTLY);
700 regc(*regparse++);
701 regc('\0');
702 *flagp |= HASWIDTH | SIMPLE;
703 break;
704 default: {
705 int len;
706 char ender;
707
708 regparse--;
709 len = int(strcspn(regparse, META));
710 if (len <= 0) {
711 // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
712 printf("RegularExpression::compile(): Internal error.\n");
713 return nullptr;
714 }
715 ender = *(regparse + len);
716 if (len > 1 && ISMULT(ender))
717 len--; // Back off clear of ?+* operand.
718 *flagp |= HASWIDTH;
719 if (len == 1)
720 *flagp |= SIMPLE;
721 ret = regnode(EXACTLY);
722 while (len > 0) {
723 regc(*regparse++);
724 len--;
725 }
726 regc('\0');
727 } break;
728 }
729 return (ret);
730 }
731
732 /*
733 - regnode - emit a node
734 Location.
735 */
regnode(char op)736 char* RegExpCompile::regnode(char op)
737 {
738 char* ret;
739 char* ptr;
740
741 ret = regcode;
742 if (ret == regdummyptr) {
743 regsize += 3;
744 return (ret);
745 }
746
747 ptr = ret;
748 *ptr++ = op;
749 *ptr++ = '\0'; // Null "next" pointer.
750 *ptr++ = '\0';
751 regcode = ptr;
752
753 return (ret);
754 }
755
756 /*
757 - regc - emit (if appropriate) a byte of code
758 */
regc(char b)759 void RegExpCompile::regc(char b)
760 {
761 if (regcode != regdummyptr)
762 *regcode++ = b;
763 else
764 regsize++;
765 }
766
767 /*
768 - reginsert - insert an operator in front of already-emitted operand
769 *
770 * Means relocating the operand.
771 */
reginsert(char op,char * opnd)772 void RegExpCompile::reginsert(char op, char* opnd)
773 {
774 char* src;
775 char* dst;
776 char* place;
777
778 if (regcode == regdummyptr) {
779 regsize += 3;
780 return;
781 }
782
783 src = regcode;
784 regcode += 3;
785 dst = regcode;
786 while (src > opnd)
787 *--dst = *--src;
788
789 place = opnd; // Op node, where operand used to be.
790 *place++ = op;
791 *place++ = '\0';
792 *place = '\0';
793 }
794
795 /*
796 - regtail - set the next-pointer at the end of a node chain
797 */
regtail(char * p,const char * val)798 void RegExpCompile::regtail(char* p, const char* val)
799 {
800 char* scan;
801 char* temp;
802 int offset;
803
804 if (p == regdummyptr)
805 return;
806
807 // Find last node.
808 scan = p;
809 for (;;) {
810 temp = regnext(scan);
811 if (temp == nullptr)
812 break;
813 scan = temp;
814 }
815
816 if (OP(scan) == BACK)
817 offset = int(scan - val);
818 else
819 offset = int(val - scan);
820 *(scan + 1) = static_cast<char>((offset >> 8) & 0377);
821 *(scan + 2) = static_cast<char>(offset & 0377);
822 }
823
824 /*
825 - regoptail - regtail on operand of first argument; nop if operandless
826 */
regoptail(char * p,const char * val)827 void RegExpCompile::regoptail(char* p, const char* val)
828 {
829 // "Operandless" and "op != BRANCH" are synonymous in practice.
830 if (p == nullptr || p == regdummyptr || OP(p) != BRANCH)
831 return;
832 regtail(OPERAND(p), val);
833 }
834
835 ////////////////////////////////////////////////////////////////////////
836 //
837 // find and friends
838 //
839 ////////////////////////////////////////////////////////////////////////
840
841 /*
842 * Utility class for RegularExpression::find().
843 */
844 class RegExpFind
845 {
846 public:
847 const char* reginput; // String-input pointer.
848 const char* regbol; // Beginning of input, for ^ check.
849 const char** regstartp; // Pointer to startp array.
850 const char** regendp; // Ditto for endp.
851
852 int regtry(const char*, const char**, const char**, const char*);
853 int regmatch(const char*);
854 int regrepeat(const char*);
855 };
856
857 // find -- Matches the regular expression to the given string.
858 // Returns true if found, and sets start and end indexes accordingly.
find(char const * string,RegularExpressionMatch & rmatch) const859 bool RegularExpression::find(char const* string,
860 RegularExpressionMatch& rmatch) const
861 {
862 const char* s;
863
864 rmatch.clear();
865 rmatch.searchstring = string;
866
867 if (!this->program) {
868 return false;
869 }
870
871 // Check validity of program.
872 if (UCHARAT(this->program) != MAGIC) {
873 // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
874 printf(
875 "RegularExpression::find(): Compiled regular expression corrupted.\n");
876 return false;
877 }
878
879 // If there is a "must appear" string, look for it.
880 if (this->regmust != nullptr) {
881 s = string;
882 while ((s = strchr(s, this->regmust[0])) != nullptr) {
883 if (strncmp(s, this->regmust, this->regmlen) == 0)
884 break; // Found it.
885 s++;
886 }
887 if (s == nullptr) // Not present.
888 return false;
889 }
890
891 RegExpFind regFind;
892
893 // Mark beginning of line for ^ .
894 regFind.regbol = string;
895
896 // Simplest case: anchored match need be tried only once.
897 if (this->reganch)
898 return (
899 regFind.regtry(string, rmatch.startp, rmatch.endp, this->program) != 0);
900
901 // Messy cases: unanchored match.
902 s = string;
903 if (this->regstart != '\0')
904 // We know what char it must start with.
905 while ((s = strchr(s, this->regstart)) != nullptr) {
906 if (regFind.regtry(s, rmatch.startp, rmatch.endp, this->program))
907 return true;
908 s++;
909 }
910 else
911 // We don't -- general case.
912 do {
913 if (regFind.regtry(s, rmatch.startp, rmatch.endp, this->program))
914 return true;
915 } while (*s++ != '\0');
916
917 // Failure.
918 return false;
919 }
920
921 /*
922 - regtry - try match at specific point
923 0 failure, 1 success
924 */
regtry(const char * string,const char ** start,const char ** end,const char * prog)925 int RegExpFind::regtry(const char* string, const char** start,
926 const char** end, const char* prog)
927 {
928 int i;
929 const char** sp1;
930 const char** ep;
931
932 reginput = string;
933 regstartp = start;
934 regendp = end;
935
936 sp1 = start;
937 ep = end;
938 for (i = RegularExpressionMatch::NSUBEXP; i > 0; i--) {
939 *sp1++ = nullptr;
940 *ep++ = nullptr;
941 }
942 if (regmatch(prog + 1)) {
943 start[0] = string;
944 end[0] = reginput;
945 return (1);
946 } else
947 return (0);
948 }
949
950 /*
951 - regmatch - main matching routine
952 *
953 * Conceptually the strategy is simple: check to see whether the current
954 * node matches, call self recursively to see whether the rest matches,
955 * and then act accordingly. In practice we make some effort to avoid
956 * recursion, in particular by going through "ordinary" nodes (that don't
957 * need to know whether the rest of the match failed) by a loop instead of
958 * by recursion.
959 * 0 failure, 1 success
960 */
regmatch(const char * prog)961 int RegExpFind::regmatch(const char* prog)
962 {
963 const char* scan; // Current node.
964 const char* next; // Next node.
965
966 scan = prog;
967
968 while (scan != nullptr) {
969
970 next = regnext(scan);
971
972 switch (OP(scan)) {
973 case BOL:
974 if (reginput != regbol)
975 return (0);
976 break;
977 case EOL:
978 if (*reginput != '\0')
979 return (0);
980 break;
981 case ANY:
982 if (*reginput == '\0')
983 return (0);
984 reginput++;
985 break;
986 case EXACTLY: {
987 size_t len;
988 const char* opnd;
989
990 opnd = OPERAND(scan);
991 // Inline the first character, for speed.
992 if (*opnd != *reginput)
993 return (0);
994 len = strlen(opnd);
995 if (len > 1 && strncmp(opnd, reginput, len) != 0)
996 return (0);
997 reginput += len;
998 } break;
999 case ANYOF:
1000 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == nullptr)
1001 return (0);
1002 reginput++;
1003 break;
1004 case ANYBUT:
1005 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != nullptr)
1006 return (0);
1007 reginput++;
1008 break;
1009 case NOTHING:
1010 break;
1011 case BACK:
1012 break;
1013 case OPEN + 1:
1014 case OPEN + 2:
1015 case OPEN + 3:
1016 case OPEN + 4:
1017 case OPEN + 5:
1018 case OPEN + 6:
1019 case OPEN + 7:
1020 case OPEN + 8:
1021 case OPEN + 9: {
1022 int no;
1023 const char* save;
1024
1025 no = OP(scan) - OPEN;
1026 save = reginput;
1027
1028 if (regmatch(next)) {
1029
1030 //
1031 // Don't set startp if some later invocation of the
1032 // same parentheses already has.
1033 //
1034 if (regstartp[no] == nullptr)
1035 regstartp[no] = save;
1036 return (1);
1037 } else
1038 return (0);
1039 }
1040 // break;
1041 case CLOSE + 1:
1042 case CLOSE + 2:
1043 case CLOSE + 3:
1044 case CLOSE + 4:
1045 case CLOSE + 5:
1046 case CLOSE + 6:
1047 case CLOSE + 7:
1048 case CLOSE + 8:
1049 case CLOSE + 9: {
1050 int no;
1051 const char* save;
1052
1053 no = OP(scan) - CLOSE;
1054 save = reginput;
1055
1056 if (regmatch(next)) {
1057
1058 //
1059 // Don't set endp if some later invocation of the
1060 // same parentheses already has.
1061 //
1062 if (regendp[no] == nullptr)
1063 regendp[no] = save;
1064 return (1);
1065 } else
1066 return (0);
1067 }
1068 // break;
1069 case BRANCH: {
1070
1071 const char* save;
1072
1073 if (OP(next) != BRANCH) // No choice.
1074 next = OPERAND(scan); // Avoid recursion.
1075 else {
1076 do {
1077 save = reginput;
1078 if (regmatch(OPERAND(scan)))
1079 return (1);
1080 reginput = save;
1081 scan = regnext(scan);
1082 } while (scan != nullptr && OP(scan) == BRANCH);
1083 return (0);
1084 // NOTREACHED
1085 }
1086 } break;
1087 case STAR:
1088 case PLUS: {
1089 char nextch;
1090 int no;
1091 const char* save;
1092 int min_no;
1093
1094 //
1095 // Lookahead to avoid useless match attempts when we know
1096 // what character comes next.
1097 //
1098 nextch = '\0';
1099 if (OP(next) == EXACTLY)
1100 nextch = *OPERAND(next);
1101 min_no = (OP(scan) == STAR) ? 0 : 1;
1102 save = reginput;
1103 no = regrepeat(OPERAND(scan));
1104 while (no >= min_no) {
1105 // If it could work, try it.
1106 if (nextch == '\0' || *reginput == nextch)
1107 if (regmatch(next))
1108 return (1);
1109 // Couldn't or didn't -- back up.
1110 no--;
1111 reginput = save + no;
1112 }
1113 return (0);
1114 }
1115 // break;
1116 case END:
1117 return (1); // Success!
1118
1119 default:
1120 // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
1121 printf(
1122 "RegularExpression::find(): Internal error -- memory corrupted.\n");
1123 return 0;
1124 }
1125 scan = next;
1126 }
1127
1128 //
1129 // We get here only if there's trouble -- normally "case END" is the
1130 // terminating point.
1131 //
1132 // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
1133 printf("RegularExpression::find(): Internal error -- corrupted pointers.\n");
1134 return (0);
1135 }
1136
1137 /*
1138 - regrepeat - repeatedly match something simple, report how many
1139 */
regrepeat(const char * p)1140 int RegExpFind::regrepeat(const char* p)
1141 {
1142 int count = 0;
1143 const char* scan;
1144 const char* opnd;
1145
1146 scan = reginput;
1147 opnd = OPERAND(p);
1148 switch (OP(p)) {
1149 case ANY:
1150 count = int(strlen(scan));
1151 scan += count;
1152 break;
1153 case EXACTLY:
1154 while (*opnd == *scan) {
1155 count++;
1156 scan++;
1157 }
1158 break;
1159 case ANYOF:
1160 while (*scan != '\0' && strchr(opnd, *scan) != nullptr) {
1161 count++;
1162 scan++;
1163 }
1164 break;
1165 case ANYBUT:
1166 while (*scan != '\0' && strchr(opnd, *scan) == nullptr) {
1167 count++;
1168 scan++;
1169 }
1170 break;
1171 default: // Oh dear. Called inappropriately.
1172 // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
1173 printf("cm RegularExpression::find(): Internal error.\n");
1174 return 0;
1175 }
1176 reginput = scan;
1177 return (count);
1178 }
1179
1180 /*
1181 - regnext - dig the "next" pointer out of a node
1182 */
regnext(const char * p)1183 static const char* regnext(const char* p)
1184 {
1185 int offset;
1186
1187 if (p == regdummyptr)
1188 return (nullptr);
1189
1190 offset = NEXT(p);
1191 if (offset == 0)
1192 return (nullptr);
1193
1194 if (OP(p) == BACK)
1195 return (p - offset);
1196 else
1197 return (p + offset);
1198 }
1199
regnext(char * p)1200 static char* regnext(char* p)
1201 {
1202 int offset;
1203
1204 if (p == regdummyptr)
1205 return (nullptr);
1206
1207 offset = NEXT(p);
1208 if (offset == 0)
1209 return (nullptr);
1210
1211 if (OP(p) == BACK)
1212 return (p - offset);
1213 else
1214 return (p + offset);
1215 }
1216
1217 } // namespace KWSYS_NAMESPACE
1218