1 /* Distributed under the OSI-approved BSD 3-Clause License.  See accompanying
2    file Copyright.txt or https://cmake.org/licensing#kwsys for details.  */
3 //
4 // Copyright (C) 1991 Texas Instruments Incorporated.
5 //
6 // Permission is granted to any individual or institution to use, copy, modify
7 // and distribute this software, provided that this complete copyright and
8 // permission notice is maintained, intact, in all copies and supporting
9 // documentation.
10 //
11 // Texas Instruments Incorporated provides this software "as is" without
12 // express or implied warranty.
13 //
14 //
15 // Created: MNF 06/13/89  Initial Design and Implementation
16 // Updated: LGO 08/09/89  Inherit from Generic
17 // Updated: MBN 09/07/89  Added conditional exception handling
18 // Updated: MBN 12/15/89  Sprinkled "const" qualifiers all over the place!
19 // Updated: DLS 03/22/91  New lite version
20 //
21 
22 #include "kwsysPrivate.h"
23 #include KWSYS_HEADER(RegularExpression.hxx)
24 
25 // Work-around CMake dependency scanning limitation.  This must
26 // duplicate the above list of headers.
27 #if 0
28 #  include "RegularExpression.hxx.in"
29 #endif
30 
31 #include <stdio.h>
32 #include <string.h>
33 
34 namespace KWSYS_NAMESPACE {
35 
36 // RegularExpression -- Copies the given regular expression.
RegularExpression(const RegularExpression & rxp)37 RegularExpression::RegularExpression(const RegularExpression& rxp)
38 {
39   if (!rxp.program) {
40     this->program = KWSYS_NULLPTR;
41     return;
42   }
43   int ind;
44   this->progsize = rxp.progsize;            // Copy regular expression size
45   this->program = new char[this->progsize]; // Allocate storage
46   for (ind = this->progsize; ind-- != 0;)   // Copy regular expression
47     this->program[ind] = rxp.program[ind];
48   // Copy pointers into last successful "find" operation
49   this->regmatch = rxp.regmatch;
50   this->regmust = rxp.regmust; // Copy field
51   if (rxp.regmust != KWSYS_NULLPTR) {
52     char* dum = rxp.program;
53     ind = 0;
54     while (dum != rxp.regmust) {
55       ++dum;
56       ++ind;
57     }
58     this->regmust = this->program + ind;
59   }
60   this->regstart = rxp.regstart; // Copy starting index
61   this->reganch = rxp.reganch;   // Copy remaining private data
62   this->regmlen = rxp.regmlen;   // Copy remaining private data
63 }
64 
65 // operator= -- Copies the given regular expression.
operator =(const RegularExpression & rxp)66 RegularExpression& RegularExpression::operator=(const RegularExpression& rxp)
67 {
68   if (this == &rxp) {
69     return *this;
70   }
71   if (!rxp.program) {
72     this->program = KWSYS_NULLPTR;
73     return *this;
74   }
75   int ind;
76   this->progsize = rxp.progsize; // Copy regular expression size
77   delete[] this->program;
78   this->program = new char[this->progsize]; // Allocate storage
79   for (ind = this->progsize; ind-- != 0;)   // Copy regular expression
80     this->program[ind] = rxp.program[ind];
81   // Copy pointers into last successful "find" operation
82   this->regmatch = rxp.regmatch;
83   this->regmust = rxp.regmust; // Copy field
84   if (rxp.regmust != KWSYS_NULLPTR) {
85     char* dum = rxp.program;
86     ind = 0;
87     while (dum != rxp.regmust) {
88       ++dum;
89       ++ind;
90     }
91     this->regmust = this->program + ind;
92   }
93   this->regstart = rxp.regstart; // Copy starting index
94   this->reganch = rxp.reganch;   // Copy remaining private data
95   this->regmlen = rxp.regmlen;   // Copy remaining private data
96 
97   return *this;
98 }
99 
100 // operator== -- Returns true if two regular expressions have the same
101 // compiled program for pattern matching.
operator ==(const RegularExpression & rxp) const102 bool RegularExpression::operator==(const RegularExpression& rxp) const
103 {
104   if (this != &rxp) {         // Same address?
105     int ind = this->progsize; // Get regular expression size
106     if (ind != rxp.progsize)  // If different size regexp
107       return false;           // Return failure
108     while (ind-- != 0)        // Else while still characters
109       if (this->program[ind] != rxp.program[ind]) // If regexp are different
110         return false;                             // Return failure
111   }
112   return true; // Else same, return success
113 }
114 
115 // deep_equal -- Returns true if have the same compiled regular expressions
116 // and the same start and end pointers.
117 
deep_equal(const RegularExpression & rxp) const118 bool RegularExpression::deep_equal(const RegularExpression& rxp) const
119 {
120   int ind = this->progsize;                     // Get regular expression size
121   if (ind != rxp.progsize)                      // If different size regexp
122     return false;                               // Return failure
123   while (ind-- != 0)                            // Else while still characters
124     if (this->program[ind] != rxp.program[ind]) // If regexp are different
125       return false;                             // Return failure
126   // Else if same start/end ptrs, return true
127   return (this->regmatch.start() == rxp.regmatch.start() &&
128           this->regmatch.end() == rxp.regmatch.end());
129 }
130 
131 // The remaining code in this file is derived from the regular expression code
132 // whose copyright statement appears below.  It has been changed to work
133 // with the class concepts of C++ and COOL.
134 
135 /*
136  * compile and find
137  *
138  *      Copyright (c) 1986 by University of Toronto.
139  *      Written by Henry Spencer.  Not derived from licensed software.
140  *
141  *      Permission is granted to anyone to use this software for any
142  *      purpose on any computer system, and to redistribute it freely,
143  *      subject to the following restrictions:
144  *
145  *      1. The author is not responsible for the consequences of use of
146  *              this software, no matter how awful, even if they arise
147  *              from defects in it.
148  *
149  *      2. The origin of this software must not be misrepresented, either
150  *              by explicit claim or by omission.
151  *
152  *      3. Altered versions must be plainly marked as such, and must not
153  *              be misrepresented as being the original software.
154  *
155  * Beware that some of this code is subtly aware of the way operator
156  * precedence is structured in regular expressions.  Serious changes in
157  * regular-expression syntax might require a total rethink.
158  */
159 
160 /*
161  * The "internal use only" fields in regexp.h are present to pass info from
162  * compile to execute that permits the execute phase to run lots faster on
163  * simple cases.  They are:
164  *
165  * regstart     char that must begin a match; '\0' if none obvious
166  * reganch      is the match anchored (at beginning-of-line only)?
167  * regmust      string (pointer into program) that match must include, or NULL
168  * regmlen      length of regmust string
169  *
170  * Regstart and reganch permit very fast decisions on suitable starting points
171  * for a match, cutting down the work a lot.  Regmust permits fast rejection
172  * of lines that cannot possibly match.  The regmust tests are costly enough
173  * that compile() supplies a regmust only if the r.e. contains something
174  * potentially expensive (at present, the only such thing detected is * or +
175  * at the start of the r.e., which can involve a lot of backup).  Regmlen is
176  * supplied because the test in find() needs it and compile() is computing
177  * it anyway.
178  */
179 
180 /*
181  * Structure for regexp "program".  This is essentially a linear encoding
182  * of a nondeterministic finite-state machine (aka syntax charts or
183  * "railroad normal form" in parsing technology).  Each node is an opcode
184  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
185  * all nodes except BRANCH implement concatenation; a "next" pointer with
186  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
187  * have one of the subtle syntax dependencies:  an individual BRANCH (as
188  * opposed to a collection of them) is never concatenated with anything
189  * because of operator precedence.)  The operand of some types of node is
190  * a literal string; for others, it is a node leading into a sub-FSM.  In
191  * particular, the operand of a BRANCH node is the first node of the branch.
192  * (NB this is *not* a tree structure:  the tail of the branch connects
193  * to the thing following the set of BRANCHes.)  The opcodes are:
194  */
195 
196 // definition   number  opnd?   meaning
197 #define END 0   // no   End of program.
198 #define BOL 1   // no   Match "" at beginning of line.
199 #define EOL 2   // no   Match "" at end of line.
200 #define ANY 3   // no   Match any one character.
201 #define ANYOF 4 // str  Match any character in this string.
202 #define ANYBUT                                                                \
203   5 // str  Match any character not in this
204     // string.
205 #define BRANCH                                                                \
206   6               // node Match this alternative, or the
207                   // next...
208 #define BACK 7    // no   Match "", "next" ptr points backward.
209 #define EXACTLY 8 // str  Match this string.
210 #define NOTHING 9 // no   Match empty string.
211 #define STAR                                                                  \
212   10 // node Match this (simple) thing 0 or more
213      // times.
214 #define PLUS                                                                  \
215   11 // node Match this (simple) thing 1 or more
216      // times.
217 #define OPEN                                                                  \
218   20 // no   Mark this point in input as start of
219      // #n.
220 // OPEN+1 is number 1, etc.
221 #define CLOSE 30 // no   Analogous to OPEN.
222 
223 /*
224  * Opcode notes:
225  *
226  * BRANCH       The set of branches constituting a single choice are hooked
227  *              together with their "next" pointers, since precedence prevents
228  *              anything being concatenated to any individual branch.  The
229  *              "next" pointer of the last BRANCH in a choice points to the
230  *              thing following the whole choice.  This is also where the
231  *              final "next" pointer of each individual branch points; each
232  *              branch starts with the operand node of a BRANCH node.
233  *
234  * BACK         Normal "next" pointers all implicitly point forward; BACK
235  *              exists to make loop structures possible.
236  *
237  * STAR,PLUS    '?', and complex '*' and '+', are implemented as circular
238  *              BRANCH structures using BACK.  Simple cases (one character
239  *              per match) are implemented with STAR and PLUS for speed
240  *              and to minimize recursive plunges.
241  *
242  * OPEN,CLOSE   ...are numbered at compile time.
243  */
244 
245 /*
246  * A node is one char of opcode followed by two chars of "next" pointer.
247  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
248  * value is a positive offset from the opcode of the node containing it.
249  * An operand, if any, simply follows the node.  (Note that much of the
250  * code generation knows about this implicit relationship.)
251  *
252  * Using two bytes for the "next" pointer is vast overkill for most things,
253  * but allows patterns to get big without disasters.
254  */
255 
256 #define OP(p) (*(p))
257 #define NEXT(p) (((*((p) + 1) & 0377) << 8) + (*((p) + 2) & 0377))
258 #define OPERAND(p) ((p) + 3)
259 
260 const unsigned char MAGIC = 0234;
261 /*
262  * Utility definitions.
263  */
264 
265 #define UCHARAT(p) (reinterpret_cast<const unsigned char*>(p))[0]
266 
267 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
268 #define META "^$.[()|?+*\\"
269 
270 /*
271  * Flags to be passed up and down.
272  */
273 #define HASWIDTH 01 // Known never to match null string.
274 #define SIMPLE 02   // Simple enough to be STAR/PLUS operand.
275 #define SPSTART 04  // Starts with * or +.
276 #define WORST 0     // Worst case.
277 
278 /////////////////////////////////////////////////////////////////////////
279 //
280 //  COMPILE AND ASSOCIATED FUNCTIONS
281 //
282 /////////////////////////////////////////////////////////////////////////
283 
284 /*
285  * Read only utility variables.
286  */
287 static char regdummy;
288 static char* const regdummyptr = &regdummy;
289 
290 /*
291  * Utility class for RegularExpression::compile().
292  */
293 class RegExpCompile
294 {
295 public:
296   const char* regparse; // Input-scan pointer.
297   int regnpar;          // () count.
298   char* regcode;        // Code-emit pointer; regdummyptr = don't.
299   long regsize;         // Code size.
300 
301   char* reg(int, int*);
302   char* regbranch(int*);
303   char* regpiece(int*);
304   char* regatom(int*);
305   char* regnode(char);
306   void regc(char);
307   void reginsert(char, char*);
308   static void regtail(char*, const char*);
309   static void regoptail(char*, const char*);
310 };
311 
312 static const char* regnext(const char*);
313 static char* regnext(char*);
314 
315 #ifdef STRCSPN
316 static int strcspn();
317 #endif
318 
319 /*
320  * We can't allocate space until we know how big the compiled form will be,
321  * but we can't compile it (and thus know how big it is) until we've got a
322  * place to put the code.  So we cheat:  we compile it twice, once with code
323  * generation turned off and size counting turned on, and once "for real".
324  * This also means that we don't allocate space until we are sure that the
325  * thing really will compile successfully, and we never have to move the
326  * code and thus invalidate pointers into it.  (Note that it has to be in
327  * one piece because free() must be able to free it all.)
328  *
329  * Beware that the optimization-preparation code in here knows about some
330  * of the structure of the compiled regexp.
331  */
332 
333 // compile -- compile a regular expression into internal code
334 // for later pattern matching.
335 
compile(const char * exp)336 bool RegularExpression::compile(const char* exp)
337 {
338   const char* scan;
339   const char* longest;
340   size_t len;
341   int flags;
342 
343   if (exp == KWSYS_NULLPTR) {
344     // RAISE Error, SYM(RegularExpression), SYM(No_Expr),
345     printf("RegularExpression::compile(): No expression supplied.\n");
346     return false;
347   }
348 
349   // First pass: determine size, legality.
350   RegExpCompile comp;
351   comp.regparse = exp;
352   comp.regnpar = 1;
353   comp.regsize = 0L;
354   comp.regcode = regdummyptr;
355   comp.regc(static_cast<char>(MAGIC));
356   if (!comp.reg(0, &flags)) {
357     printf("RegularExpression::compile(): Error in compile.\n");
358     return false;
359   }
360   this->regmatch.clear();
361 
362   // Small enough for pointer-storage convention?
363   if (comp.regsize >= 32767L) { // Probably could be 65535L.
364     // RAISE Error, SYM(RegularExpression), SYM(Expr_Too_Big),
365     printf("RegularExpression::compile(): Expression too big.\n");
366     return false;
367   }
368 
369   // Allocate space.
370   //#ifndef _WIN32
371   if (this->program != KWSYS_NULLPTR)
372     delete[] this->program;
373   //#endif
374   this->program = new char[comp.regsize];
375   this->progsize = static_cast<int>(comp.regsize);
376 
377   if (this->program == KWSYS_NULLPTR) {
378     // RAISE Error, SYM(RegularExpression), SYM(Out_Of_Memory),
379     printf("RegularExpression::compile(): Out of memory.\n");
380     return false;
381   }
382 
383   // Second pass: emit code.
384   comp.regparse = exp;
385   comp.regnpar = 1;
386   comp.regcode = this->program;
387   comp.regc(static_cast<char>(MAGIC));
388   comp.reg(0, &flags);
389 
390   // Dig out information for optimizations.
391   this->regstart = '\0'; // Worst-case defaults.
392   this->reganch = 0;
393   this->regmust = KWSYS_NULLPTR;
394   this->regmlen = 0;
395   scan = this->program + 1;       // First BRANCH.
396   if (OP(regnext(scan)) == END) { // Only one top-level choice.
397     scan = OPERAND(scan);
398 
399     // Starting-point info.
400     if (OP(scan) == EXACTLY)
401       this->regstart = *OPERAND(scan);
402     else if (OP(scan) == BOL)
403       this->reganch++;
404 
405     //
406     // If there's something expensive in the r.e., find the longest
407     // literal string that must appear and make it the regmust.  Resolve
408     // ties in favor of later strings, since the regstart check works
409     // with the beginning of the r.e. and avoiding duplication
410     // strengthens checking.  Not a strong reason, but sufficient in the
411     // absence of others.
412     //
413     if (flags & SPSTART) {
414       longest = KWSYS_NULLPTR;
415       len = 0;
416       for (; scan != KWSYS_NULLPTR; scan = regnext(scan))
417         if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
418           longest = OPERAND(scan);
419           len = strlen(OPERAND(scan));
420         }
421       this->regmust = longest;
422       this->regmlen = len;
423     }
424   }
425   return true;
426 }
427 
428 /*
429  - reg - regular expression, i.e. main body or parenthesized thing
430  *
431  * Caller must absorb opening parenthesis.
432  *
433  * Combining parenthesis handling with the base level of regular expression
434  * is a trifle forced, but the need to tie the tails of the branches to what
435  * follows makes it hard to avoid.
436  */
reg(int paren,int * flagp)437 char* RegExpCompile::reg(int paren, int* flagp)
438 {
439   char* ret;
440   char* br;
441   char* ender;
442   int parno = 0;
443   int flags;
444 
445   *flagp = HASWIDTH; // Tentatively.
446 
447   // Make an OPEN node, if parenthesized.
448   if (paren) {
449     if (regnpar >= RegularExpressionMatch::NSUBEXP) {
450       // RAISE Error, SYM(RegularExpression), SYM(Too_Many_Parens),
451       printf("RegularExpression::compile(): Too many parentheses.\n");
452       return KWSYS_NULLPTR;
453     }
454     parno = regnpar;
455     regnpar++;
456     ret = regnode(static_cast<char>(OPEN + parno));
457   } else
458     ret = KWSYS_NULLPTR;
459 
460   // Pick up the branches, linking them together.
461   br = regbranch(&flags);
462   if (br == KWSYS_NULLPTR)
463     return (KWSYS_NULLPTR);
464   if (ret != KWSYS_NULLPTR)
465     regtail(ret, br); // OPEN -> first.
466   else
467     ret = br;
468   if (!(flags & HASWIDTH))
469     *flagp &= ~HASWIDTH;
470   *flagp |= flags & SPSTART;
471   while (*regparse == '|') {
472     regparse++;
473     br = regbranch(&flags);
474     if (br == KWSYS_NULLPTR)
475       return (KWSYS_NULLPTR);
476     regtail(ret, br); // BRANCH -> BRANCH.
477     if (!(flags & HASWIDTH))
478       *flagp &= ~HASWIDTH;
479     *flagp |= flags & SPSTART;
480   }
481 
482   // Make a closing node, and hook it on the end.
483   ender = regnode(static_cast<char>((paren) ? CLOSE + parno : END));
484   regtail(ret, ender);
485 
486   // Hook the tails of the branches to the closing node.
487   for (br = ret; br != KWSYS_NULLPTR; br = regnext(br))
488     regoptail(br, ender);
489 
490   // Check for proper termination.
491   if (paren && *regparse++ != ')') {
492     // RAISE Error, SYM(RegularExpression), SYM(Unmatched_Parens),
493     printf("RegularExpression::compile(): Unmatched parentheses.\n");
494     return KWSYS_NULLPTR;
495   } else if (!paren && *regparse != '\0') {
496     if (*regparse == ')') {
497       // RAISE Error, SYM(RegularExpression), SYM(Unmatched_Parens),
498       printf("RegularExpression::compile(): Unmatched parentheses.\n");
499       return KWSYS_NULLPTR;
500     } else {
501       // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
502       printf("RegularExpression::compile(): Internal error.\n");
503       return KWSYS_NULLPTR;
504     }
505     // NOTREACHED
506   }
507   return (ret);
508 }
509 
510 /*
511  - regbranch - one alternative of an | operator
512  *
513  * Implements the concatenation operator.
514  */
regbranch(int * flagp)515 char* RegExpCompile::regbranch(int* flagp)
516 {
517   char* ret;
518   char* chain;
519   char* latest;
520   int flags;
521 
522   *flagp = WORST; // Tentatively.
523 
524   ret = regnode(BRANCH);
525   chain = KWSYS_NULLPTR;
526   while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
527     latest = regpiece(&flags);
528     if (latest == KWSYS_NULLPTR)
529       return (KWSYS_NULLPTR);
530     *flagp |= flags & HASWIDTH;
531     if (chain == KWSYS_NULLPTR) // First piece.
532       *flagp |= flags & SPSTART;
533     else
534       regtail(chain, latest);
535     chain = latest;
536   }
537   if (chain == KWSYS_NULLPTR) // Loop ran zero times.
538     regnode(NOTHING);
539 
540   return (ret);
541 }
542 
543 /*
544  - regpiece - something followed by possible [*+?]
545  *
546  * Note that the branching code sequences used for ? and the general cases
547  * of * and + are somewhat optimized:  they use the same NOTHING node as
548  * both the endmarker for their branch list and the body of the last branch.
549  * It might seem that this node could be dispensed with entirely, but the
550  * endmarker role is not redundant.
551  */
regpiece(int * flagp)552 char* RegExpCompile::regpiece(int* flagp)
553 {
554   char* ret;
555   char op;
556   char* next;
557   int flags;
558 
559   ret = regatom(&flags);
560   if (ret == KWSYS_NULLPTR)
561     return (KWSYS_NULLPTR);
562 
563   op = *regparse;
564   if (!ISMULT(op)) {
565     *flagp = flags;
566     return (ret);
567   }
568 
569   if (!(flags & HASWIDTH) && op != '?') {
570     // RAISE Error, SYM(RegularExpression), SYM(Empty_Operand),
571     printf("RegularExpression::compile() : *+ operand could be empty.\n");
572     return KWSYS_NULLPTR;
573   }
574   *flagp = (op != '+') ? (WORST | SPSTART) : (WORST | HASWIDTH);
575 
576   if (op == '*' && (flags & SIMPLE))
577     reginsert(STAR, ret);
578   else if (op == '*') {
579     // Emit x* as (x&|), where & means "self".
580     reginsert(BRANCH, ret);         // Either x
581     regoptail(ret, regnode(BACK));  // and loop
582     regoptail(ret, ret);            // back
583     regtail(ret, regnode(BRANCH));  // or
584     regtail(ret, regnode(NOTHING)); // null.
585   } else if (op == '+' && (flags & SIMPLE))
586     reginsert(PLUS, ret);
587   else if (op == '+') {
588     // Emit x+ as x(&|), where & means "self".
589     next = regnode(BRANCH); // Either
590     regtail(ret, next);
591     regtail(regnode(BACK), ret);    // loop back
592     regtail(next, regnode(BRANCH)); // or
593     regtail(ret, regnode(NOTHING)); // null.
594   } else if (op == '?') {
595     // Emit x? as (x|)
596     reginsert(BRANCH, ret);        // Either x
597     regtail(ret, regnode(BRANCH)); // or
598     next = regnode(NOTHING);       // null.
599     regtail(ret, next);
600     regoptail(ret, next);
601   }
602   regparse++;
603   if (ISMULT(*regparse)) {
604     // RAISE Error, SYM(RegularExpression), SYM(Nested_Operand),
605     printf("RegularExpression::compile(): Nested *?+.\n");
606     return KWSYS_NULLPTR;
607   }
608   return (ret);
609 }
610 
611 /*
612  - regatom - the lowest level
613  *
614  * Optimization:  gobbles an entire sequence of ordinary characters so that
615  * it can turn them into a single node, which is smaller to store and
616  * faster to run.  Backslashed characters are exceptions, each becoming a
617  * separate node; the code is simpler that way and it's not worth fixing.
618  */
regatom(int * flagp)619 char* RegExpCompile::regatom(int* flagp)
620 {
621   char* ret;
622   int flags;
623 
624   *flagp = WORST; // Tentatively.
625 
626   switch (*regparse++) {
627     case '^':
628       ret = regnode(BOL);
629       break;
630     case '$':
631       ret = regnode(EOL);
632       break;
633     case '.':
634       ret = regnode(ANY);
635       *flagp |= HASWIDTH | SIMPLE;
636       break;
637     case '[': {
638       int rxpclass;
639       int rxpclassend;
640 
641       if (*regparse == '^') { // Complement of range.
642         ret = regnode(ANYBUT);
643         regparse++;
644       } else
645         ret = regnode(ANYOF);
646       if (*regparse == ']' || *regparse == '-')
647         regc(*regparse++);
648       while (*regparse != '\0' && *regparse != ']') {
649         if (*regparse == '-') {
650           regparse++;
651           if (*regparse == ']' || *regparse == '\0')
652             regc('-');
653           else {
654             rxpclass = UCHARAT(regparse - 2) + 1;
655             rxpclassend = UCHARAT(regparse);
656             if (rxpclass > rxpclassend + 1) {
657               // RAISE Error, SYM(RegularExpression), SYM(Invalid_Range),
658               printf("RegularExpression::compile(): Invalid range in [].\n");
659               return KWSYS_NULLPTR;
660             }
661             for (; rxpclass <= rxpclassend; rxpclass++)
662               regc(static_cast<char>(rxpclass));
663             regparse++;
664           }
665         } else
666           regc(*regparse++);
667       }
668       regc('\0');
669       if (*regparse != ']') {
670         // RAISE Error, SYM(RegularExpression), SYM(Unmatched_Bracket),
671         printf("RegularExpression::compile(): Unmatched [].\n");
672         return KWSYS_NULLPTR;
673       }
674       regparse++;
675       *flagp |= HASWIDTH | SIMPLE;
676     } break;
677     case '(':
678       ret = reg(1, &flags);
679       if (ret == KWSYS_NULLPTR)
680         return (KWSYS_NULLPTR);
681       *flagp |= flags & (HASWIDTH | SPSTART);
682       break;
683     case '\0':
684     case '|':
685     case ')':
686       // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
687       printf("RegularExpression::compile(): Internal error.\n"); // Never here
688       return KWSYS_NULLPTR;
689     case '?':
690     case '+':
691     case '*':
692       // RAISE Error, SYM(RegularExpression), SYM(No_Operand),
693       printf("RegularExpression::compile(): ?+* follows nothing.\n");
694       return KWSYS_NULLPTR;
695     case '\\':
696       if (*regparse == '\0') {
697         // RAISE Error, SYM(RegularExpression), SYM(Trailing_Backslash),
698         printf("RegularExpression::compile(): Trailing backslash.\n");
699         return KWSYS_NULLPTR;
700       }
701       ret = regnode(EXACTLY);
702       regc(*regparse++);
703       regc('\0');
704       *flagp |= HASWIDTH | SIMPLE;
705       break;
706     default: {
707       int len;
708       char ender;
709 
710       regparse--;
711       len = int(strcspn(regparse, META));
712       if (len <= 0) {
713         // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
714         printf("RegularExpression::compile(): Internal error.\n");
715         return KWSYS_NULLPTR;
716       }
717       ender = *(regparse + len);
718       if (len > 1 && ISMULT(ender))
719         len--; // Back off clear of ?+* operand.
720       *flagp |= HASWIDTH;
721       if (len == 1)
722         *flagp |= SIMPLE;
723       ret = regnode(EXACTLY);
724       while (len > 0) {
725         regc(*regparse++);
726         len--;
727       }
728       regc('\0');
729     } break;
730   }
731   return (ret);
732 }
733 
734 /*
735  - regnode - emit a node
736    Location.
737  */
regnode(char op)738 char* RegExpCompile::regnode(char op)
739 {
740   char* ret;
741   char* ptr;
742 
743   ret = regcode;
744   if (ret == regdummyptr) {
745     regsize += 3;
746     return (ret);
747   }
748 
749   ptr = ret;
750   *ptr++ = op;
751   *ptr++ = '\0'; // Null "next" pointer.
752   *ptr++ = '\0';
753   regcode = ptr;
754 
755   return (ret);
756 }
757 
758 /*
759  - regc - emit (if appropriate) a byte of code
760  */
regc(char b)761 void RegExpCompile::regc(char b)
762 {
763   if (regcode != regdummyptr)
764     *regcode++ = b;
765   else
766     regsize++;
767 }
768 
769 /*
770  - reginsert - insert an operator in front of already-emitted operand
771  *
772  * Means relocating the operand.
773  */
reginsert(char op,char * opnd)774 void RegExpCompile::reginsert(char op, char* opnd)
775 {
776   char* src;
777   char* dst;
778   char* place;
779 
780   if (regcode == regdummyptr) {
781     regsize += 3;
782     return;
783   }
784 
785   src = regcode;
786   regcode += 3;
787   dst = regcode;
788   while (src > opnd)
789     *--dst = *--src;
790 
791   place = opnd; // Op node, where operand used to be.
792   *place++ = op;
793   *place++ = '\0';
794   *place = '\0';
795 }
796 
797 /*
798  - regtail - set the next-pointer at the end of a node chain
799  */
regtail(char * p,const char * val)800 void RegExpCompile::regtail(char* p, const char* val)
801 {
802   char* scan;
803   char* temp;
804   int offset;
805 
806   if (p == regdummyptr)
807     return;
808 
809   // Find last node.
810   scan = p;
811   for (;;) {
812     temp = regnext(scan);
813     if (temp == KWSYS_NULLPTR)
814       break;
815     scan = temp;
816   }
817 
818   if (OP(scan) == BACK)
819     offset = int(scan - val);
820   else
821     offset = int(val - scan);
822   *(scan + 1) = static_cast<char>((offset >> 8) & 0377);
823   *(scan + 2) = static_cast<char>(offset & 0377);
824 }
825 
826 /*
827  - regoptail - regtail on operand of first argument; nop if operandless
828  */
regoptail(char * p,const char * val)829 void RegExpCompile::regoptail(char* p, const char* val)
830 {
831   // "Operandless" and "op != BRANCH" are synonymous in practice.
832   if (p == KWSYS_NULLPTR || p == regdummyptr || OP(p) != BRANCH)
833     return;
834   regtail(OPERAND(p), val);
835 }
836 
837 ////////////////////////////////////////////////////////////////////////
838 //
839 //  find and friends
840 //
841 ////////////////////////////////////////////////////////////////////////
842 
843 /*
844  * Utility class for RegularExpression::find().
845  */
846 class RegExpFind
847 {
848 public:
849   const char* reginput;   // String-input pointer.
850   const char* regbol;     // Beginning of input, for ^ check.
851   const char** regstartp; // Pointer to startp array.
852   const char** regendp;   // Ditto for endp.
853 
854   int regtry(const char*, const char**, const char**, const char*);
855   int regmatch(const char*);
856   int regrepeat(const char*);
857 };
858 
859 // find -- Matches the regular expression to the given string.
860 // Returns true if found, and sets start and end indexes accordingly.
find(char const * string,RegularExpressionMatch & rmatch) const861 bool RegularExpression::find(char const* string,
862                              RegularExpressionMatch& rmatch) const
863 {
864   const char* s;
865 
866   rmatch.clear();
867   rmatch.searchstring = string;
868 
869   if (!this->program) {
870     return false;
871   }
872 
873   // Check validity of program.
874   if (UCHARAT(this->program) != MAGIC) {
875     // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
876     printf(
877       "RegularExpression::find(): Compiled regular expression corrupted.\n");
878     return false;
879   }
880 
881   // If there is a "must appear" string, look for it.
882   if (this->regmust != KWSYS_NULLPTR) {
883     s = string;
884     while ((s = strchr(s, this->regmust[0])) != KWSYS_NULLPTR) {
885       if (strncmp(s, this->regmust, this->regmlen) == 0)
886         break; // Found it.
887       s++;
888     }
889     if (s == KWSYS_NULLPTR) // Not present.
890       return false;
891   }
892 
893   RegExpFind regFind;
894 
895   // Mark beginning of line for ^ .
896   regFind.regbol = string;
897 
898   // Simplest case:  anchored match need be tried only once.
899   if (this->reganch)
900     return (
901       regFind.regtry(string, rmatch.startp, rmatch.endp, this->program) != 0);
902 
903   // Messy cases:  unanchored match.
904   s = string;
905   if (this->regstart != '\0')
906     // We know what char it must start with.
907     while ((s = strchr(s, this->regstart)) != KWSYS_NULLPTR) {
908       if (regFind.regtry(s, rmatch.startp, rmatch.endp, this->program))
909         return true;
910       s++;
911     }
912   else
913     // We don't -- general case.
914     do {
915       if (regFind.regtry(s, rmatch.startp, rmatch.endp, this->program))
916         return true;
917     } while (*s++ != '\0');
918 
919   // Failure.
920   return false;
921 }
922 
923 /*
924  - regtry - try match at specific point
925    0 failure, 1 success
926  */
regtry(const char * string,const char ** start,const char ** end,const char * prog)927 int RegExpFind::regtry(const char* string, const char** start,
928                        const char** end, const char* prog)
929 {
930   int i;
931   const char** sp1;
932   const char** ep;
933 
934   reginput = string;
935   regstartp = start;
936   regendp = end;
937 
938   sp1 = start;
939   ep = end;
940   for (i = RegularExpressionMatch::NSUBEXP; i > 0; i--) {
941     *sp1++ = KWSYS_NULLPTR;
942     *ep++ = KWSYS_NULLPTR;
943   }
944   if (regmatch(prog + 1)) {
945     start[0] = string;
946     end[0] = reginput;
947     return (1);
948   } else
949     return (0);
950 }
951 
952 /*
953  - regmatch - main matching routine
954  *
955  * Conceptually the strategy is simple:  check to see whether the current
956  * node matches, call self recursively to see whether the rest matches,
957  * and then act accordingly.  In practice we make some effort to avoid
958  * recursion, in particular by going through "ordinary" nodes (that don't
959  * need to know whether the rest of the match failed) by a loop instead of
960  * by recursion.
961  * 0 failure, 1 success
962  */
regmatch(const char * prog)963 int RegExpFind::regmatch(const char* prog)
964 {
965   const char* scan; // Current node.
966   const char* next; // Next node.
967 
968   scan = prog;
969 
970   while (scan != KWSYS_NULLPTR) {
971 
972     next = regnext(scan);
973 
974     switch (OP(scan)) {
975       case BOL:
976         if (reginput != regbol)
977           return (0);
978         break;
979       case EOL:
980         if (*reginput != '\0')
981           return (0);
982         break;
983       case ANY:
984         if (*reginput == '\0')
985           return (0);
986         reginput++;
987         break;
988       case EXACTLY: {
989         size_t len;
990         const char* opnd;
991 
992         opnd = OPERAND(scan);
993         // Inline the first character, for speed.
994         if (*opnd != *reginput)
995           return (0);
996         len = strlen(opnd);
997         if (len > 1 && strncmp(opnd, reginput, len) != 0)
998           return (0);
999         reginput += len;
1000       } break;
1001       case ANYOF:
1002         if (*reginput == '\0' ||
1003             strchr(OPERAND(scan), *reginput) == KWSYS_NULLPTR)
1004           return (0);
1005         reginput++;
1006         break;
1007       case ANYBUT:
1008         if (*reginput == '\0' ||
1009             strchr(OPERAND(scan), *reginput) != KWSYS_NULLPTR)
1010           return (0);
1011         reginput++;
1012         break;
1013       case NOTHING:
1014         break;
1015       case BACK:
1016         break;
1017       case OPEN + 1:
1018       case OPEN + 2:
1019       case OPEN + 3:
1020       case OPEN + 4:
1021       case OPEN + 5:
1022       case OPEN + 6:
1023       case OPEN + 7:
1024       case OPEN + 8:
1025       case OPEN + 9: {
1026         int no;
1027         const char* save;
1028 
1029         no = OP(scan) - OPEN;
1030         save = reginput;
1031 
1032         if (regmatch(next)) {
1033 
1034           //
1035           // Don't set startp if some later invocation of the
1036           // same parentheses already has.
1037           //
1038           if (regstartp[no] == KWSYS_NULLPTR)
1039             regstartp[no] = save;
1040           return (1);
1041         } else
1042           return (0);
1043       }
1044       //              break;
1045       case CLOSE + 1:
1046       case CLOSE + 2:
1047       case CLOSE + 3:
1048       case CLOSE + 4:
1049       case CLOSE + 5:
1050       case CLOSE + 6:
1051       case CLOSE + 7:
1052       case CLOSE + 8:
1053       case CLOSE + 9: {
1054         int no;
1055         const char* save;
1056 
1057         no = OP(scan) - CLOSE;
1058         save = reginput;
1059 
1060         if (regmatch(next)) {
1061 
1062           //
1063           // Don't set endp if some later invocation of the
1064           // same parentheses already has.
1065           //
1066           if (regendp[no] == KWSYS_NULLPTR)
1067             regendp[no] = save;
1068           return (1);
1069         } else
1070           return (0);
1071       }
1072       //              break;
1073       case BRANCH: {
1074 
1075         const char* save;
1076 
1077         if (OP(next) != BRANCH) // No choice.
1078           next = OPERAND(scan); // Avoid recursion.
1079         else {
1080           do {
1081             save = reginput;
1082             if (regmatch(OPERAND(scan)))
1083               return (1);
1084             reginput = save;
1085             scan = regnext(scan);
1086           } while (scan != KWSYS_NULLPTR && OP(scan) == BRANCH);
1087           return (0);
1088           // NOTREACHED
1089         }
1090       } break;
1091       case STAR:
1092       case PLUS: {
1093         char nextch;
1094         int no;
1095         const char* save;
1096         int min_no;
1097 
1098         //
1099         // Lookahead to avoid useless match attempts when we know
1100         // what character comes next.
1101         //
1102         nextch = '\0';
1103         if (OP(next) == EXACTLY)
1104           nextch = *OPERAND(next);
1105         min_no = (OP(scan) == STAR) ? 0 : 1;
1106         save = reginput;
1107         no = regrepeat(OPERAND(scan));
1108         while (no >= min_no) {
1109           // If it could work, try it.
1110           if (nextch == '\0' || *reginput == nextch)
1111             if (regmatch(next))
1112               return (1);
1113           // Couldn't or didn't -- back up.
1114           no--;
1115           reginput = save + no;
1116         }
1117         return (0);
1118       }
1119       //              break;
1120       case END:
1121         return (1); // Success!
1122 
1123       default:
1124         // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
1125         printf(
1126           "RegularExpression::find(): Internal error -- memory corrupted.\n");
1127         return 0;
1128     }
1129     scan = next;
1130   }
1131 
1132   //
1133   //  We get here only if there's trouble -- normally "case END" is the
1134   //  terminating point.
1135   //
1136   // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
1137   printf("RegularExpression::find(): Internal error -- corrupted pointers.\n");
1138   return (0);
1139 }
1140 
1141 /*
1142  - regrepeat - repeatedly match something simple, report how many
1143  */
regrepeat(const char * p)1144 int RegExpFind::regrepeat(const char* p)
1145 {
1146   int count = 0;
1147   const char* scan;
1148   const char* opnd;
1149 
1150   scan = reginput;
1151   opnd = OPERAND(p);
1152   switch (OP(p)) {
1153     case ANY:
1154       count = int(strlen(scan));
1155       scan += count;
1156       break;
1157     case EXACTLY:
1158       while (*opnd == *scan) {
1159         count++;
1160         scan++;
1161       }
1162       break;
1163     case ANYOF:
1164       while (*scan != '\0' && strchr(opnd, *scan) != KWSYS_NULLPTR) {
1165         count++;
1166         scan++;
1167       }
1168       break;
1169     case ANYBUT:
1170       while (*scan != '\0' && strchr(opnd, *scan) == KWSYS_NULLPTR) {
1171         count++;
1172         scan++;
1173       }
1174       break;
1175     default: // Oh dear.  Called inappropriately.
1176       // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
1177       printf("cm RegularExpression::find(): Internal error.\n");
1178       return 0;
1179   }
1180   reginput = scan;
1181   return (count);
1182 }
1183 
1184 /*
1185  - regnext - dig the "next" pointer out of a node
1186  */
regnext(const char * p)1187 static const char* regnext(const char* p)
1188 {
1189   int offset;
1190 
1191   if (p == regdummyptr)
1192     return (KWSYS_NULLPTR);
1193 
1194   offset = NEXT(p);
1195   if (offset == 0)
1196     return (KWSYS_NULLPTR);
1197 
1198   if (OP(p) == BACK)
1199     return (p - offset);
1200   else
1201     return (p + offset);
1202 }
1203 
regnext(char * p)1204 static char* regnext(char* p)
1205 {
1206   int offset;
1207 
1208   if (p == regdummyptr)
1209     return (KWSYS_NULLPTR);
1210 
1211   offset = NEXT(p);
1212   if (offset == 0)
1213     return (KWSYS_NULLPTR);
1214 
1215   if (OP(p) == BACK)
1216     return (p - offset);
1217   else
1218     return (p + offset);
1219 }
1220 
1221 } // namespace KWSYS_NAMESPACE
1222