1 /* Distributed under the OSI-approved BSD 3-Clause License.  See accompanying
2    file Copyright.txt or https://cmake.org/licensing#kwsys for details.  */
3 //
4 // Copyright (C) 1991 Texas Instruments Incorporated.
5 //
6 // Permission is granted to any individual or institution to use, copy, modify
7 // and distribute this software, provided that this complete copyright and
8 // permission notice is maintained, intact, in all copies and supporting
9 // documentation.
10 //
11 // Texas Instruments Incorporated provides this software "as is" without
12 // express or implied warranty.
13 //
14 //
15 // Created: MNF 06/13/89  Initial Design and Implementation
16 // Updated: LGO 08/09/89  Inherit from Generic
17 // Updated: MBN 09/07/89  Added conditional exception handling
18 // Updated: MBN 12/15/89  Sprinkled "const" qualifiers all over the place!
19 // Updated: DLS 03/22/91  New lite version
20 //
21 
22 #include "kwsysPrivate.h"
23 #include KWSYS_HEADER(RegularExpression.hxx)
24 
25 // Work-around CMake dependency scanning limitation.  This must
26 // duplicate the above list of headers.
27 #if 0
28 #  include "RegularExpression.hxx.in"
29 #endif
30 
31 #include <cstdio>
32 #include <cstring>
33 
34 namespace KWSYS_NAMESPACE {
35 
36 // RegularExpression -- Copies the given regular expression.
RegularExpression(const RegularExpression & rxp)37 RegularExpression::RegularExpression(const RegularExpression& rxp)
38 {
39   if (!rxp.program) {
40     this->program = nullptr;
41     return;
42   }
43   int ind;
44   this->progsize = rxp.progsize;            // Copy regular expression size
45   this->program = new char[this->progsize]; // Allocate storage
46   for (ind = this->progsize; ind-- != 0;)   // Copy regular expression
47     this->program[ind] = rxp.program[ind];
48   // Copy pointers into last successful "find" operation
49   this->regmatch = rxp.regmatch;
50   this->regmust = rxp.regmust; // Copy field
51   if (rxp.regmust != nullptr) {
52     char* dum = rxp.program;
53     ind = 0;
54     while (dum != rxp.regmust) {
55       ++dum;
56       ++ind;
57     }
58     this->regmust = this->program + ind;
59   }
60   this->regstart = rxp.regstart; // Copy starting index
61   this->reganch = rxp.reganch;   // Copy remaining private data
62   this->regmlen = rxp.regmlen;   // Copy remaining private data
63 }
64 
65 // operator= -- Copies the given regular expression.
operator =(const RegularExpression & rxp)66 RegularExpression& RegularExpression::operator=(const RegularExpression& rxp)
67 {
68   if (this == &rxp) {
69     return *this;
70   }
71   if (!rxp.program) {
72     this->program = nullptr;
73     return *this;
74   }
75   int ind;
76   this->progsize = rxp.progsize; // Copy regular expression size
77   delete[] this->program;
78   this->program = new char[this->progsize]; // Allocate storage
79   for (ind = this->progsize; ind-- != 0;)   // Copy regular expression
80     this->program[ind] = rxp.program[ind];
81   // Copy pointers into last successful "find" operation
82   this->regmatch = rxp.regmatch;
83   this->regmust = rxp.regmust; // Copy field
84   if (rxp.regmust != nullptr) {
85     char* dum = rxp.program;
86     ind = 0;
87     while (dum != rxp.regmust) {
88       ++dum;
89       ++ind;
90     }
91     this->regmust = this->program + ind;
92   }
93   this->regstart = rxp.regstart; // Copy starting index
94   this->reganch = rxp.reganch;   // Copy remaining private data
95   this->regmlen = rxp.regmlen;   // Copy remaining private data
96 
97   return *this;
98 }
99 
100 // operator== -- Returns true if two regular expressions have the same
101 // compiled program for pattern matching.
operator ==(const RegularExpression & rxp) const102 bool RegularExpression::operator==(const RegularExpression& rxp) const
103 {
104   if (this != &rxp) {         // Same address?
105     int ind = this->progsize; // Get regular expression size
106     if (ind != rxp.progsize)  // If different size regexp
107       return false;           // Return failure
108     while (ind-- != 0)        // Else while still characters
109       if (this->program[ind] != rxp.program[ind]) // If regexp are different
110         return false;                             // Return failure
111   }
112   return true; // Else same, return success
113 }
114 
115 // deep_equal -- Returns true if have the same compiled regular expressions
116 // and the same start and end pointers.
117 
deep_equal(const RegularExpression & rxp) const118 bool RegularExpression::deep_equal(const RegularExpression& rxp) const
119 {
120   int ind = this->progsize;                     // Get regular expression size
121   if (ind != rxp.progsize)                      // If different size regexp
122     return false;                               // Return failure
123   while (ind-- != 0)                            // Else while still characters
124     if (this->program[ind] != rxp.program[ind]) // If regexp are different
125       return false;                             // Return failure
126   // Else if same start/end ptrs, return true
127   return (this->regmatch.start() == rxp.regmatch.start() &&
128           this->regmatch.end() == rxp.regmatch.end());
129 }
130 
131 // The remaining code in this file is derived from the regular expression code
132 // whose copyright statement appears below.  It has been changed to work
133 // with the class concepts of C++ and COOL.
134 
135 /*
136  * compile and find
137  *
138  *      Copyright (c) 1986 by University of Toronto.
139  *      Written by Henry Spencer.  Not derived from licensed software.
140  *
141  *      Permission is granted to anyone to use this software for any
142  *      purpose on any computer system, and to redistribute it freely,
143  *      subject to the following restrictions:
144  *
145  *      1. The author is not responsible for the consequences of use of
146  *              this software, no matter how awful, even if they arise
147  *              from defects in it.
148  *
149  *      2. The origin of this software must not be misrepresented, either
150  *              by explicit claim or by omission.
151  *
152  *      3. Altered versions must be plainly marked as such, and must not
153  *              be misrepresented as being the original software.
154  *
155  * Beware that some of this code is subtly aware of the way operator
156  * precedence is structured in regular expressions.  Serious changes in
157  * regular-expression syntax might require a total rethink.
158  */
159 
160 /*
161  * The "internal use only" fields in regexp.h are present to pass info from
162  * compile to execute that permits the execute phase to run lots faster on
163  * simple cases.  They are:
164  *
165  * regstart     char that must begin a match; '\0' if none obvious
166  * reganch      is the match anchored (at beginning-of-line only)?
167  * regmust      string (pointer into program) that match must include, or
168  * nullptr regmlen      length of regmust string
169  *
170  * Regstart and reganch permit very fast decisions on suitable starting points
171  * for a match, cutting down the work a lot.  Regmust permits fast rejection
172  * of lines that cannot possibly match.  The regmust tests are costly enough
173  * that compile() supplies a regmust only if the r.e. contains something
174  * potentially expensive (at present, the only such thing detected is * or +
175  * at the start of the r.e., which can involve a lot of backup).  Regmlen is
176  * supplied because the test in find() needs it and compile() is computing
177  * it anyway.
178  */
179 
180 /*
181  * Structure for regexp "program".  This is essentially a linear encoding
182  * of a nondeterministic finite-state machine (aka syntax charts or
183  * "railroad normal form" in parsing technology).  Each node is an opcode
184  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
185  * all nodes except BRANCH implement concatenation; a "next" pointer with
186  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
187  * have one of the subtle syntax dependencies:  an individual BRANCH (as
188  * opposed to a collection of them) is never concatenated with anything
189  * because of operator precedence.)  The operand of some types of node is
190  * a literal string; for others, it is a node leading into a sub-FSM.  In
191  * particular, the operand of a BRANCH node is the first node of the branch.
192  * (NB this is *not* a tree structure:  the tail of the branch connects
193  * to the thing following the set of BRANCHes.)  The opcodes are:
194  */
195 
196 // definition   number  opnd?   meaning
197 #define END 0   // no   End of program.
198 #define BOL 1   // no   Match "" at beginning of line.
199 #define EOL 2   // no   Match "" at end of line.
200 #define ANY 3   // no   Match any one character.
201 #define ANYOF 4 // str  Match any character in this string.
202 #define ANYBUT                                                                \
203   5 // str  Match any character not in this
204     // string.
205 #define BRANCH                                                                \
206   6               // node Match this alternative, or the
207                   // next...
208 #define BACK 7    // no   Match "", "next" ptr points backward.
209 #define EXACTLY 8 // str  Match this string.
210 #define NOTHING 9 // no   Match empty string.
211 #define STAR                                                                  \
212   10 // node Match this (simple) thing 0 or more
213      // times.
214 #define PLUS                                                                  \
215   11 // node Match this (simple) thing 1 or more
216      // times.
217 #define OPEN                                                                  \
218   20 // no   Mark this point in input as start of
219      // #n.
220 // OPEN+1 is number 1, etc.
221 #define CLOSE 30 // no   Analogous to OPEN.
222 
223 /*
224  * Opcode notes:
225  *
226  * BRANCH       The set of branches constituting a single choice are hooked
227  *              together with their "next" pointers, since precedence prevents
228  *              anything being concatenated to any individual branch.  The
229  *              "next" pointer of the last BRANCH in a choice points to the
230  *              thing following the whole choice.  This is also where the
231  *              final "next" pointer of each individual branch points; each
232  *              branch starts with the operand node of a BRANCH node.
233  *
234  * BACK         Normal "next" pointers all implicitly point forward; BACK
235  *              exists to make loop structures possible.
236  *
237  * STAR,PLUS    '?', and complex '*' and '+', are implemented as circular
238  *              BRANCH structures using BACK.  Simple cases (one character
239  *              per match) are implemented with STAR and PLUS for speed
240  *              and to minimize recursive plunges.
241  *
242  * OPEN,CLOSE   ...are numbered at compile time.
243  */
244 
245 /*
246  * A node is one char of opcode followed by two chars of "next" pointer.
247  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
248  * value is a positive offset from the opcode of the node containing it.
249  * An operand, if any, simply follows the node.  (Note that much of the
250  * code generation knows about this implicit relationship.)
251  *
252  * Using two bytes for the "next" pointer is vast overkill for most things,
253  * but allows patterns to get big without disasters.
254  */
255 
256 #define OP(p) (*(p))
257 #define NEXT(p) (((*((p) + 1) & 0377) << 8) + (*((p) + 2) & 0377))
258 #define OPERAND(p) ((p) + 3)
259 
260 const unsigned char MAGIC = 0234;
261 /*
262  * Utility definitions.
263  */
264 
265 #define UCHARAT(p) (reinterpret_cast<const unsigned char*>(p))[0]
266 
267 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
268 #define META "^$.[()|?+*\\"
269 
270 /*
271  * Flags to be passed up and down.
272  */
273 #define HASWIDTH 01 // Known never to match null string.
274 #define SIMPLE 02   // Simple enough to be STAR/PLUS operand.
275 #define SPSTART 04  // Starts with * or +.
276 #define WORST 0     // Worst case.
277 
278 /////////////////////////////////////////////////////////////////////////
279 //
280 //  COMPILE AND ASSOCIATED FUNCTIONS
281 //
282 /////////////////////////////////////////////////////////////////////////
283 
284 /*
285  * Read only utility variables.
286  */
287 static char regdummy;
288 static char* const regdummyptr = &regdummy;
289 
290 /*
291  * Utility class for RegularExpression::compile().
292  */
293 class RegExpCompile
294 {
295 public:
296   const char* regparse; // Input-scan pointer.
297   int regnpar;          // () count.
298   char* regcode;        // Code-emit pointer; regdummyptr = don't.
299   long regsize;         // Code size.
300 
301   char* reg(int, int*);
302   char* regbranch(int*);
303   char* regpiece(int*);
304   char* regatom(int*);
305   char* regnode(char);
306   void regc(char);
307   void reginsert(char, char*);
308   static void regtail(char*, const char*);
309   static void regoptail(char*, const char*);
310 };
311 
312 static const char* regnext(const char*);
313 static char* regnext(char*);
314 
315 #ifdef STRCSPN
316 static int strcspn();
317 #endif
318 
319 /*
320  * We can't allocate space until we know how big the compiled form will be,
321  * but we can't compile it (and thus know how big it is) until we've got a
322  * place to put the code.  So we cheat:  we compile it twice, once with code
323  * generation turned off and size counting turned on, and once "for real".
324  * This also means that we don't allocate space until we are sure that the
325  * thing really will compile successfully, and we never have to move the
326  * code and thus invalidate pointers into it.  (Note that it has to be in
327  * one piece because free() must be able to free it all.)
328  *
329  * Beware that the optimization-preparation code in here knows about some
330  * of the structure of the compiled regexp.
331  */
332 
333 // compile -- compile a regular expression into internal code
334 // for later pattern matching.
335 
compile(const char * exp)336 bool RegularExpression::compile(const char* exp)
337 {
338   const char* scan;
339   const char* longest;
340   int flags;
341 
342   if (exp == nullptr) {
343     // RAISE Error, SYM(RegularExpression), SYM(No_Expr),
344     printf("RegularExpression::compile(): No expression supplied.\n");
345     return false;
346   }
347 
348   // First pass: determine size, legality.
349   RegExpCompile comp;
350   comp.regparse = exp;
351   comp.regnpar = 1;
352   comp.regsize = 0L;
353   comp.regcode = regdummyptr;
354   comp.regc(static_cast<char>(MAGIC));
355   if (!comp.reg(0, &flags)) {
356     printf("RegularExpression::compile(): Error in compile.\n");
357     return false;
358   }
359   this->regmatch.clear();
360 
361   // Small enough for pointer-storage convention?
362   if (comp.regsize >= 65535L) {
363     // RAISE Error, SYM(RegularExpression), SYM(Expr_Too_Big),
364     printf("RegularExpression::compile(): Expression too big.\n");
365     return false;
366   }
367 
368   // Allocate space.
369   //#ifndef _WIN32
370   delete[] this->program;
371   //#endif
372   this->program = new char[comp.regsize];
373   this->progsize = static_cast<int>(comp.regsize);
374 
375   if (this->program == nullptr) {
376     // RAISE Error, SYM(RegularExpression), SYM(Out_Of_Memory),
377     printf("RegularExpression::compile(): Out of memory.\n");
378     return false;
379   }
380 
381   // Second pass: emit code.
382   comp.regparse = exp;
383   comp.regnpar = 1;
384   comp.regcode = this->program;
385   comp.regc(static_cast<char>(MAGIC));
386   comp.reg(0, &flags);
387 
388   // Dig out information for optimizations.
389   this->regstart = '\0'; // Worst-case defaults.
390   this->reganch = 0;
391   this->regmust = nullptr;
392   this->regmlen = 0;
393   scan = this->program + 1;       // First BRANCH.
394   if (OP(regnext(scan)) == END) { // Only one top-level choice.
395     scan = OPERAND(scan);
396 
397     // Starting-point info.
398     if (OP(scan) == EXACTLY)
399       this->regstart = *OPERAND(scan);
400     else if (OP(scan) == BOL)
401       this->reganch++;
402 
403     //
404     // If there's something expensive in the r.e., find the longest
405     // literal string that must appear and make it the regmust.  Resolve
406     // ties in favor of later strings, since the regstart check works
407     // with the beginning of the r.e. and avoiding duplication
408     // strengthens checking.  Not a strong reason, but sufficient in the
409     // absence of others.
410     //
411     if (flags & SPSTART) {
412       longest = nullptr;
413       size_t len = 0;
414       for (; scan != nullptr; scan = regnext(scan))
415         if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
416           longest = OPERAND(scan);
417           len = strlen(OPERAND(scan));
418         }
419       this->regmust = longest;
420       this->regmlen = len;
421     }
422   }
423   return true;
424 }
425 
426 /*
427  - reg - regular expression, i.e. main body or parenthesized thing
428  *
429  * Caller must absorb opening parenthesis.
430  *
431  * Combining parenthesis handling with the base level of regular expression
432  * is a trifle forced, but the need to tie the tails of the branches to what
433  * follows makes it hard to avoid.
434  */
reg(int paren,int * flagp)435 char* RegExpCompile::reg(int paren, int* flagp)
436 {
437   char* ret;
438   char* br;
439   char* ender;
440   int parno = 0;
441   int flags;
442 
443   *flagp = HASWIDTH; // Tentatively.
444 
445   // Make an OPEN node, if parenthesized.
446   if (paren) {
447     if (regnpar >= RegularExpressionMatch::NSUBEXP) {
448       // RAISE Error, SYM(RegularExpression), SYM(Too_Many_Parens),
449       printf("RegularExpression::compile(): Too many parentheses.\n");
450       return nullptr;
451     }
452     parno = regnpar;
453     regnpar++;
454     ret = regnode(static_cast<char>(OPEN + parno));
455   } else
456     ret = nullptr;
457 
458   // Pick up the branches, linking them together.
459   br = regbranch(&flags);
460   if (br == nullptr)
461     return (nullptr);
462   if (ret != nullptr)
463     regtail(ret, br); // OPEN -> first.
464   else
465     ret = br;
466   if (!(flags & HASWIDTH))
467     *flagp &= ~HASWIDTH;
468   *flagp |= flags & SPSTART;
469   while (*regparse == '|') {
470     regparse++;
471     br = regbranch(&flags);
472     if (br == nullptr)
473       return (nullptr);
474     regtail(ret, br); // BRANCH -> BRANCH.
475     if (!(flags & HASWIDTH))
476       *flagp &= ~HASWIDTH;
477     *flagp |= flags & SPSTART;
478   }
479 
480   // Make a closing node, and hook it on the end.
481   ender = regnode(static_cast<char>((paren) ? CLOSE + parno : END));
482   regtail(ret, ender);
483 
484   // Hook the tails of the branches to the closing node.
485   for (br = ret; br != nullptr; br = regnext(br))
486     regoptail(br, ender);
487 
488   // Check for proper termination.
489   if (paren && *regparse++ != ')') {
490     // RAISE Error, SYM(RegularExpression), SYM(Unmatched_Parens),
491     printf("RegularExpression::compile(): Unmatched parentheses.\n");
492     return nullptr;
493   } else if (!paren && *regparse != '\0') {
494     if (*regparse == ')') {
495       // RAISE Error, SYM(RegularExpression), SYM(Unmatched_Parens),
496       printf("RegularExpression::compile(): Unmatched parentheses.\n");
497       return nullptr;
498     } else {
499       // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
500       printf("RegularExpression::compile(): Internal error.\n");
501       return nullptr;
502     }
503     // NOTREACHED
504   }
505   return (ret);
506 }
507 
508 /*
509  - regbranch - one alternative of an | operator
510  *
511  * Implements the concatenation operator.
512  */
regbranch(int * flagp)513 char* RegExpCompile::regbranch(int* flagp)
514 {
515   char* ret;
516   char* chain;
517   char* latest;
518   int flags;
519 
520   *flagp = WORST; // Tentatively.
521 
522   ret = regnode(BRANCH);
523   chain = nullptr;
524   while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
525     latest = regpiece(&flags);
526     if (latest == nullptr)
527       return (nullptr);
528     *flagp |= flags & HASWIDTH;
529     if (chain == nullptr) // First piece.
530       *flagp |= flags & SPSTART;
531     else
532       regtail(chain, latest);
533     chain = latest;
534   }
535   if (chain == nullptr) // Loop ran zero times.
536     regnode(NOTHING);
537 
538   return (ret);
539 }
540 
541 /*
542  - regpiece - something followed by possible [*+?]
543  *
544  * Note that the branching code sequences used for ? and the general cases
545  * of * and + are somewhat optimized:  they use the same NOTHING node as
546  * both the endmarker for their branch list and the body of the last branch.
547  * It might seem that this node could be dispensed with entirely, but the
548  * endmarker role is not redundant.
549  */
regpiece(int * flagp)550 char* RegExpCompile::regpiece(int* flagp)
551 {
552   char* ret;
553   char op;
554   char* next;
555   int flags;
556 
557   ret = regatom(&flags);
558   if (ret == nullptr)
559     return (nullptr);
560 
561   op = *regparse;
562   if (!ISMULT(op)) {
563     *flagp = flags;
564     return (ret);
565   }
566 
567   if (!(flags & HASWIDTH) && op != '?') {
568     // RAISE Error, SYM(RegularExpression), SYM(Empty_Operand),
569     printf("RegularExpression::compile() : *+ operand could be empty.\n");
570     return nullptr;
571   }
572   *flagp = (op != '+') ? (WORST | SPSTART) : (WORST | HASWIDTH);
573 
574   if (op == '*' && (flags & SIMPLE))
575     reginsert(STAR, ret);
576   else if (op == '*') {
577     // Emit x* as (x&|), where & means "self".
578     reginsert(BRANCH, ret);         // Either x
579     regoptail(ret, regnode(BACK));  // and loop
580     regoptail(ret, ret);            // back
581     regtail(ret, regnode(BRANCH));  // or
582     regtail(ret, regnode(NOTHING)); // null.
583   } else if (op == '+' && (flags & SIMPLE))
584     reginsert(PLUS, ret);
585   else if (op == '+') {
586     // Emit x+ as x(&|), where & means "self".
587     next = regnode(BRANCH); // Either
588     regtail(ret, next);
589     regtail(regnode(BACK), ret);    // loop back
590     regtail(next, regnode(BRANCH)); // or
591     regtail(ret, regnode(NOTHING)); // null.
592   } else if (op == '?') {
593     // Emit x? as (x|)
594     reginsert(BRANCH, ret);        // Either x
595     regtail(ret, regnode(BRANCH)); // or
596     next = regnode(NOTHING);       // null.
597     regtail(ret, next);
598     regoptail(ret, next);
599   }
600   regparse++;
601   if (ISMULT(*regparse)) {
602     // RAISE Error, SYM(RegularExpression), SYM(Nested_Operand),
603     printf("RegularExpression::compile(): Nested *?+.\n");
604     return nullptr;
605   }
606   return (ret);
607 }
608 
609 /*
610  - regatom - the lowest level
611  *
612  * Optimization:  gobbles an entire sequence of ordinary characters so that
613  * it can turn them into a single node, which is smaller to store and
614  * faster to run.  Backslashed characters are exceptions, each becoming a
615  * separate node; the code is simpler that way and it's not worth fixing.
616  */
regatom(int * flagp)617 char* RegExpCompile::regatom(int* flagp)
618 {
619   char* ret;
620   int flags;
621 
622   *flagp = WORST; // Tentatively.
623 
624   switch (*regparse++) {
625     case '^':
626       ret = regnode(BOL);
627       break;
628     case '$':
629       ret = regnode(EOL);
630       break;
631     case '.':
632       ret = regnode(ANY);
633       *flagp |= HASWIDTH | SIMPLE;
634       break;
635     case '[': {
636       int rxpclass;
637       int rxpclassend;
638 
639       if (*regparse == '^') { // Complement of range.
640         ret = regnode(ANYBUT);
641         regparse++;
642       } else
643         ret = regnode(ANYOF);
644       if (*regparse == ']' || *regparse == '-')
645         regc(*regparse++);
646       while (*regparse != '\0' && *regparse != ']') {
647         if (*regparse == '-') {
648           regparse++;
649           if (*regparse == ']' || *regparse == '\0')
650             regc('-');
651           else {
652             rxpclass = UCHARAT(regparse - 2) + 1;
653             rxpclassend = UCHARAT(regparse);
654             if (rxpclass > rxpclassend + 1) {
655               // RAISE Error, SYM(RegularExpression), SYM(Invalid_Range),
656               printf("RegularExpression::compile(): Invalid range in [].\n");
657               return nullptr;
658             }
659             for (; rxpclass <= rxpclassend; rxpclass++)
660               regc(static_cast<char>(rxpclass));
661             regparse++;
662           }
663         } else
664           regc(*regparse++);
665       }
666       regc('\0');
667       if (*regparse != ']') {
668         // RAISE Error, SYM(RegularExpression), SYM(Unmatched_Bracket),
669         printf("RegularExpression::compile(): Unmatched [].\n");
670         return nullptr;
671       }
672       regparse++;
673       *flagp |= HASWIDTH | SIMPLE;
674     } break;
675     case '(':
676       ret = reg(1, &flags);
677       if (ret == nullptr)
678         return (nullptr);
679       *flagp |= flags & (HASWIDTH | SPSTART);
680       break;
681     case '\0':
682     case '|':
683     case ')':
684       // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
685       printf("RegularExpression::compile(): Internal error.\n"); // Never here
686       return nullptr;
687     case '?':
688     case '+':
689     case '*':
690       // RAISE Error, SYM(RegularExpression), SYM(No_Operand),
691       printf("RegularExpression::compile(): ?+* follows nothing.\n");
692       return nullptr;
693     case '\\':
694       if (*regparse == '\0') {
695         // RAISE Error, SYM(RegularExpression), SYM(Trailing_Backslash),
696         printf("RegularExpression::compile(): Trailing backslash.\n");
697         return nullptr;
698       }
699       ret = regnode(EXACTLY);
700       regc(*regparse++);
701       regc('\0');
702       *flagp |= HASWIDTH | SIMPLE;
703       break;
704     default: {
705       int len;
706       char ender;
707 
708       regparse--;
709       len = int(strcspn(regparse, META));
710       if (len <= 0) {
711         // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
712         printf("RegularExpression::compile(): Internal error.\n");
713         return nullptr;
714       }
715       ender = *(regparse + len);
716       if (len > 1 && ISMULT(ender))
717         len--; // Back off clear of ?+* operand.
718       *flagp |= HASWIDTH;
719       if (len == 1)
720         *flagp |= SIMPLE;
721       ret = regnode(EXACTLY);
722       while (len > 0) {
723         regc(*regparse++);
724         len--;
725       }
726       regc('\0');
727     } break;
728   }
729   return (ret);
730 }
731 
732 /*
733  - regnode - emit a node
734    Location.
735  */
regnode(char op)736 char* RegExpCompile::regnode(char op)
737 {
738   char* ret;
739   char* ptr;
740 
741   ret = regcode;
742   if (ret == regdummyptr) {
743     regsize += 3;
744     return (ret);
745   }
746 
747   ptr = ret;
748   *ptr++ = op;
749   *ptr++ = '\0'; // Null "next" pointer.
750   *ptr++ = '\0';
751   regcode = ptr;
752 
753   return (ret);
754 }
755 
756 /*
757  - regc - emit (if appropriate) a byte of code
758  */
regc(char b)759 void RegExpCompile::regc(char b)
760 {
761   if (regcode != regdummyptr)
762     *regcode++ = b;
763   else
764     regsize++;
765 }
766 
767 /*
768  - reginsert - insert an operator in front of already-emitted operand
769  *
770  * Means relocating the operand.
771  */
reginsert(char op,char * opnd)772 void RegExpCompile::reginsert(char op, char* opnd)
773 {
774   char* src;
775   char* dst;
776   char* place;
777 
778   if (regcode == regdummyptr) {
779     regsize += 3;
780     return;
781   }
782 
783   src = regcode;
784   regcode += 3;
785   dst = regcode;
786   while (src > opnd)
787     *--dst = *--src;
788 
789   place = opnd; // Op node, where operand used to be.
790   *place++ = op;
791   *place++ = '\0';
792   *place = '\0';
793 }
794 
795 /*
796  - regtail - set the next-pointer at the end of a node chain
797  */
regtail(char * p,const char * val)798 void RegExpCompile::regtail(char* p, const char* val)
799 {
800   char* scan;
801   char* temp;
802   int offset;
803 
804   if (p == regdummyptr)
805     return;
806 
807   // Find last node.
808   scan = p;
809   for (;;) {
810     temp = regnext(scan);
811     if (temp == nullptr)
812       break;
813     scan = temp;
814   }
815 
816   if (OP(scan) == BACK)
817     offset = int(scan - val);
818   else
819     offset = int(val - scan);
820   *(scan + 1) = static_cast<char>((offset >> 8) & 0377);
821   *(scan + 2) = static_cast<char>(offset & 0377);
822 }
823 
824 /*
825  - regoptail - regtail on operand of first argument; nop if operandless
826  */
regoptail(char * p,const char * val)827 void RegExpCompile::regoptail(char* p, const char* val)
828 {
829   // "Operandless" and "op != BRANCH" are synonymous in practice.
830   if (p == nullptr || p == regdummyptr || OP(p) != BRANCH)
831     return;
832   regtail(OPERAND(p), val);
833 }
834 
835 ////////////////////////////////////////////////////////////////////////
836 //
837 //  find and friends
838 //
839 ////////////////////////////////////////////////////////////////////////
840 
841 /*
842  * Utility class for RegularExpression::find().
843  */
844 class RegExpFind
845 {
846 public:
847   const char* reginput;   // String-input pointer.
848   const char* regbol;     // Beginning of input, for ^ check.
849   const char** regstartp; // Pointer to startp array.
850   const char** regendp;   // Ditto for endp.
851 
852   int regtry(const char*, const char**, const char**, const char*);
853   int regmatch(const char*);
854   int regrepeat(const char*);
855 };
856 
857 // find -- Matches the regular expression to the given string.
858 // Returns true if found, and sets start and end indexes accordingly.
find(char const * string,RegularExpressionMatch & rmatch) const859 bool RegularExpression::find(char const* string,
860                              RegularExpressionMatch& rmatch) const
861 {
862   const char* s;
863 
864   rmatch.clear();
865   rmatch.searchstring = string;
866 
867   if (!this->program) {
868     return false;
869   }
870 
871   // Check validity of program.
872   if (UCHARAT(this->program) != MAGIC) {
873     // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
874     printf(
875       "RegularExpression::find(): Compiled regular expression corrupted.\n");
876     return false;
877   }
878 
879   // If there is a "must appear" string, look for it.
880   if (this->regmust != nullptr) {
881     s = string;
882     while ((s = strchr(s, this->regmust[0])) != nullptr) {
883       if (strncmp(s, this->regmust, this->regmlen) == 0)
884         break; // Found it.
885       s++;
886     }
887     if (s == nullptr) // Not present.
888       return false;
889   }
890 
891   RegExpFind regFind;
892 
893   // Mark beginning of line for ^ .
894   regFind.regbol = string;
895 
896   // Simplest case:  anchored match need be tried only once.
897   if (this->reganch)
898     return (
899       regFind.regtry(string, rmatch.startp, rmatch.endp, this->program) != 0);
900 
901   // Messy cases:  unanchored match.
902   s = string;
903   if (this->regstart != '\0')
904     // We know what char it must start with.
905     while ((s = strchr(s, this->regstart)) != nullptr) {
906       if (regFind.regtry(s, rmatch.startp, rmatch.endp, this->program))
907         return true;
908       s++;
909     }
910   else
911     // We don't -- general case.
912     do {
913       if (regFind.regtry(s, rmatch.startp, rmatch.endp, this->program))
914         return true;
915     } while (*s++ != '\0');
916 
917   // Failure.
918   return false;
919 }
920 
921 /*
922  - regtry - try match at specific point
923    0 failure, 1 success
924  */
regtry(const char * string,const char ** start,const char ** end,const char * prog)925 int RegExpFind::regtry(const char* string, const char** start,
926                        const char** end, const char* prog)
927 {
928   int i;
929   const char** sp1;
930   const char** ep;
931 
932   reginput = string;
933   regstartp = start;
934   regendp = end;
935 
936   sp1 = start;
937   ep = end;
938   for (i = RegularExpressionMatch::NSUBEXP; i > 0; i--) {
939     *sp1++ = nullptr;
940     *ep++ = nullptr;
941   }
942   if (regmatch(prog + 1)) {
943     start[0] = string;
944     end[0] = reginput;
945     return (1);
946   } else
947     return (0);
948 }
949 
950 /*
951  - regmatch - main matching routine
952  *
953  * Conceptually the strategy is simple:  check to see whether the current
954  * node matches, call self recursively to see whether the rest matches,
955  * and then act accordingly.  In practice we make some effort to avoid
956  * recursion, in particular by going through "ordinary" nodes (that don't
957  * need to know whether the rest of the match failed) by a loop instead of
958  * by recursion.
959  * 0 failure, 1 success
960  */
regmatch(const char * prog)961 int RegExpFind::regmatch(const char* prog)
962 {
963   const char* scan; // Current node.
964   const char* next; // Next node.
965 
966   scan = prog;
967 
968   while (scan != nullptr) {
969 
970     next = regnext(scan);
971 
972     switch (OP(scan)) {
973       case BOL:
974         if (reginput != regbol)
975           return (0);
976         break;
977       case EOL:
978         if (*reginput != '\0')
979           return (0);
980         break;
981       case ANY:
982         if (*reginput == '\0')
983           return (0);
984         reginput++;
985         break;
986       case EXACTLY: {
987         size_t len;
988         const char* opnd;
989 
990         opnd = OPERAND(scan);
991         // Inline the first character, for speed.
992         if (*opnd != *reginput)
993           return (0);
994         len = strlen(opnd);
995         if (len > 1 && strncmp(opnd, reginput, len) != 0)
996           return (0);
997         reginput += len;
998       } break;
999       case ANYOF:
1000         if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == nullptr)
1001           return (0);
1002         reginput++;
1003         break;
1004       case ANYBUT:
1005         if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != nullptr)
1006           return (0);
1007         reginput++;
1008         break;
1009       case NOTHING:
1010         break;
1011       case BACK:
1012         break;
1013       case OPEN + 1:
1014       case OPEN + 2:
1015       case OPEN + 3:
1016       case OPEN + 4:
1017       case OPEN + 5:
1018       case OPEN + 6:
1019       case OPEN + 7:
1020       case OPEN + 8:
1021       case OPEN + 9: {
1022         int no;
1023         const char* save;
1024 
1025         no = OP(scan) - OPEN;
1026         save = reginput;
1027 
1028         if (regmatch(next)) {
1029 
1030           //
1031           // Don't set startp if some later invocation of the
1032           // same parentheses already has.
1033           //
1034           if (regstartp[no] == nullptr)
1035             regstartp[no] = save;
1036           return (1);
1037         } else
1038           return (0);
1039       }
1040       //              break;
1041       case CLOSE + 1:
1042       case CLOSE + 2:
1043       case CLOSE + 3:
1044       case CLOSE + 4:
1045       case CLOSE + 5:
1046       case CLOSE + 6:
1047       case CLOSE + 7:
1048       case CLOSE + 8:
1049       case CLOSE + 9: {
1050         int no;
1051         const char* save;
1052 
1053         no = OP(scan) - CLOSE;
1054         save = reginput;
1055 
1056         if (regmatch(next)) {
1057 
1058           //
1059           // Don't set endp if some later invocation of the
1060           // same parentheses already has.
1061           //
1062           if (regendp[no] == nullptr)
1063             regendp[no] = save;
1064           return (1);
1065         } else
1066           return (0);
1067       }
1068       //              break;
1069       case BRANCH: {
1070 
1071         const char* save;
1072 
1073         if (OP(next) != BRANCH) // No choice.
1074           next = OPERAND(scan); // Avoid recursion.
1075         else {
1076           do {
1077             save = reginput;
1078             if (regmatch(OPERAND(scan)))
1079               return (1);
1080             reginput = save;
1081             scan = regnext(scan);
1082           } while (scan != nullptr && OP(scan) == BRANCH);
1083           return (0);
1084           // NOTREACHED
1085         }
1086       } break;
1087       case STAR:
1088       case PLUS: {
1089         char nextch;
1090         int no;
1091         const char* save;
1092         int min_no;
1093 
1094         //
1095         // Lookahead to avoid useless match attempts when we know
1096         // what character comes next.
1097         //
1098         nextch = '\0';
1099         if (OP(next) == EXACTLY)
1100           nextch = *OPERAND(next);
1101         min_no = (OP(scan) == STAR) ? 0 : 1;
1102         save = reginput;
1103         no = regrepeat(OPERAND(scan));
1104         while (no >= min_no) {
1105           // If it could work, try it.
1106           if (nextch == '\0' || *reginput == nextch)
1107             if (regmatch(next))
1108               return (1);
1109           // Couldn't or didn't -- back up.
1110           no--;
1111           reginput = save + no;
1112         }
1113         return (0);
1114       }
1115       //              break;
1116       case END:
1117         return (1); // Success!
1118 
1119       default:
1120         // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
1121         printf(
1122           "RegularExpression::find(): Internal error -- memory corrupted.\n");
1123         return 0;
1124     }
1125     scan = next;
1126   }
1127 
1128   //
1129   //  We get here only if there's trouble -- normally "case END" is the
1130   //  terminating point.
1131   //
1132   // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
1133   printf("RegularExpression::find(): Internal error -- corrupted pointers.\n");
1134   return (0);
1135 }
1136 
1137 /*
1138  - regrepeat - repeatedly match something simple, report how many
1139  */
regrepeat(const char * p)1140 int RegExpFind::regrepeat(const char* p)
1141 {
1142   int count = 0;
1143   const char* scan;
1144   const char* opnd;
1145 
1146   scan = reginput;
1147   opnd = OPERAND(p);
1148   switch (OP(p)) {
1149     case ANY:
1150       count = int(strlen(scan));
1151       scan += count;
1152       break;
1153     case EXACTLY:
1154       while (*opnd == *scan) {
1155         count++;
1156         scan++;
1157       }
1158       break;
1159     case ANYOF:
1160       while (*scan != '\0' && strchr(opnd, *scan) != nullptr) {
1161         count++;
1162         scan++;
1163       }
1164       break;
1165     case ANYBUT:
1166       while (*scan != '\0' && strchr(opnd, *scan) == nullptr) {
1167         count++;
1168         scan++;
1169       }
1170       break;
1171     default: // Oh dear.  Called inappropriately.
1172       // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
1173       printf("cm RegularExpression::find(): Internal error.\n");
1174       return 0;
1175   }
1176   reginput = scan;
1177   return (count);
1178 }
1179 
1180 /*
1181  - regnext - dig the "next" pointer out of a node
1182  */
regnext(const char * p)1183 static const char* regnext(const char* p)
1184 {
1185   int offset;
1186 
1187   if (p == regdummyptr)
1188     return (nullptr);
1189 
1190   offset = NEXT(p);
1191   if (offset == 0)
1192     return (nullptr);
1193 
1194   if (OP(p) == BACK)
1195     return (p - offset);
1196   else
1197     return (p + offset);
1198 }
1199 
regnext(char * p)1200 static char* regnext(char* p)
1201 {
1202   int offset;
1203 
1204   if (p == regdummyptr)
1205     return (nullptr);
1206 
1207   offset = NEXT(p);
1208   if (offset == 0)
1209     return (nullptr);
1210 
1211   if (OP(p) == BACK)
1212     return (p - offset);
1213   else
1214     return (p + offset);
1215 }
1216 
1217 } // namespace KWSYS_NAMESPACE
1218