Source/kwsys/RegularExpression.cxx

/* Distributed under the OSI-approved BSD 3-Clause License.  See accompanying
   file Copyright.txt or https://cmake.org/licensing#kwsys for details.  */
//
// Copyright (C) 1991 Texas Instruments Incorporated.
//
// Permission is granted to any individual or institution to use, copy, modify
// and distribute this software, provided that this complete copyright and
// permission notice is maintained, intact, in all copies and supporting
// documentation.
//
// Texas Instruments Incorporated provides this software "as is" without
// express or implied warranty.
//
//
// Created: MNF 06/13/89  Initial Design and Implementation
// Updated: LGO 08/09/89  Inherit from Generic
// Updated: MBN 09/07/89  Added conditional exception handling
// Updated: MBN 12/15/89  Sprinkled "const" qualifiers all over the place!
// Updated: DLS 03/22/91  New lite version
//

#include "kwsysPrivate.h"
#include KWSYS_HEADER(RegularExpression.hxx)

// Work-around CMake dependency scanning limitation.  This must
// duplicate the above list of headers.
#if 0
#  include "RegularExpression.hxx.in"
#endif

#include <cstdio>
#include <cstring>

namespace KWSYS_NAMESPACE {

// RegularExpression -- Copies the given regular expression.
RegularExpression::RegularExpression(const RegularExpression& rxp)
{
  if (!rxp.program) {
    this->program = nullptr;
    return;
  }
  int ind;
  this->progsize = rxp.progsize;            // Copy regular expression size
  this->program = new char[this->progsize]; // Allocate storage
  for (ind = this->progsize; ind-- != 0;)   // Copy regular expression
    this->program[ind] = rxp.program[ind];
  // Copy pointers into last successful "find" operation
  this->regmatch = rxp.regmatch;
  this->regmust = rxp.regmust; // Copy field
  if (rxp.regmust != nullptr) {
    char* dum = rxp.program;
    ind = 0;
    while (dum != rxp.regmust) {
      ++dum;
      ++ind;
    }
    this->regmust = this->program + ind;
  }
  this->regstart = rxp.regstart; // Copy starting index
  this->reganch = rxp.reganch;   // Copy remaining private data
  this->regmlen = rxp.regmlen;   // Copy remaining private data
}

// operator= -- Copies the given regular expression.
RegularExpression& RegularExpression::operator=(const RegularExpression& rxp)
{
  if (this == &rxp) {
    return *this;
  }
  if (!rxp.program) {
    this->program = nullptr;
    return *this;
  }
  int ind;
  this->progsize = rxp.progsize; // Copy regular expression size
  delete[] this->program;
  this->program = new char[this->progsize]; // Allocate storage
  for (ind = this->progsize; ind-- != 0;)   // Copy regular expression
    this->program[ind] = rxp.program[ind];
  // Copy pointers into last successful "find" operation
  this->regmatch = rxp.regmatch;
  this->regmust = rxp.regmust; // Copy field
  if (rxp.regmust != nullptr) {
    char* dum = rxp.program;
    ind = 0;
    while (dum != rxp.regmust) {
      ++dum;
      ++ind;
    }
    this->regmust = this->program + ind;
  }
  this->regstart = rxp.regstart; // Copy starting index
  this->reganch = rxp.reganch;   // Copy remaining private data
  this->regmlen = rxp.regmlen;   // Copy remaining private data

  return *this;
}

// operator== -- Returns true if two regular expressions have the same
// compiled program for pattern matching.
bool RegularExpression::operator==(const RegularExpression& rxp) const
{
  if (this != &rxp) {         // Same address?
    int ind = this->progsize; // Get regular expression size
    if (ind != rxp.progsize)  // If different size regexp
      return false;           // Return failure
    while (ind-- != 0)        // Else while still characters
      if (this->program[ind] != rxp.program[ind]) // If regexp are different
        return false;                             // Return failure
  }
  return true; // Else same, return success
}

// deep_equal -- Returns true if have the same compiled regular expressions
// and the same start and end pointers.

bool RegularExpression::deep_equal(const RegularExpression& rxp) const
{
  int ind = this->progsize;                     // Get regular expression size
  if (ind != rxp.progsize)                      // If different size regexp
    return false;                               // Return failure
  while (ind-- != 0)                            // Else while still characters
    if (this->program[ind] != rxp.program[ind]) // If regexp are different
      return false;                             // Return failure
  // Else if same start/end ptrs, return true
  return (this->regmatch.start() == rxp.regmatch.start() &&
          this->regmatch.end() == rxp.regmatch.end());
}

// The remaining code in this file is derived from the regular expression code
// whose copyright statement appears below.  It has been changed to work
// with the class concepts of C++ and COOL.

/*
 * compile and find
 *
 *      Copyright (c) 1986 by University of Toronto.
 *      Written by Henry Spencer.  Not derived from licensed software.
 *
 *      Permission is granted to anyone to use this software for any
 *      purpose on any computer system, and to redistribute it freely,
 *      subject to the following restrictions:
 *
 *      1. The author is not responsible for the consequences of use of
 *              this software, no matter how awful, even if they arise
 *              from defects in it.
 *
 *      2. The origin of this software must not be misrepresented, either
 *              by explicit claim or by omission.
 *
 *      3. Altered versions must be plainly marked as such, and must not
 *              be misrepresented as being the original software.
 *
 * Beware that some of this code is subtly aware of the way operator
 * precedence is structured in regular expressions.  Serious changes in
 * regular-expression syntax might require a total rethink.
 */

/*
 * The "internal use only" fields in regexp.h are present to pass info from
 * compile to execute that permits the execute phase to run lots faster on
 * simple cases.  They are:
 *
 * regstart     char that must begin a match; '\0' if none obvious
 * reganch      is the match anchored (at beginning-of-line only)?
 * regmust      string (pointer into program) that match must include, or
 * nullptr regmlen      length of regmust string
 *
 * Regstart and reganch permit very fast decisions on suitable starting points
 * for a match, cutting down the work a lot.  Regmust permits fast rejection
 * of lines that cannot possibly match.  The regmust tests are costly enough
 * that compile() supplies a regmust only if the r.e. contains something
 * potentially expensive (at present, the only such thing detected is * or +
 * at the start of the r.e., which can involve a lot of backup).  Regmlen is
 * supplied because the test in find() needs it and compile() is computing
 * it anyway.
 */

/*
 * Structure for regexp "program".  This is essentially a linear encoding
 * of a nondeterministic finite-state machine (aka syntax charts or
 * "railroad normal form" in parsing technology).  Each node is an opcode
 * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
 * all nodes except BRANCH implement concatenation; a "next" pointer with
 * a BRANCH on both ends of it is connecting two alternatives.  (Here we
 * have one of the subtle syntax dependencies:  an individual BRANCH (as
 * opposed to a collection of them) is never concatenated with anything
 * because of operator precedence.)  The operand of some types of node is
 * a literal string; for others, it is a node leading into a sub-FSM.  In
 * particular, the operand of a BRANCH node is the first node of the branch.
 * (NB this is *not* a tree structure:  the tail of the branch connects
 * to the thing following the set of BRANCHes.)  The opcodes are:
 */

// definition   number  opnd?   meaning
#define END 0   // no   End of program.
#define BOL 1   // no   Match "" at beginning of line.
#define EOL 2   // no   Match "" at end of line.
#define ANY 3   // no   Match any one character.
#define ANYOF 4 // str  Match any character in this string.
#define ANYBUT                                                                \
  5 // str  Match any character not in this
    // string.
#define BRANCH                                                                \
  6               // node Match this alternative, or the
                  // next...
#define BACK 7    // no   Match "", "next" ptr points backward.
#define EXACTLY 8 // str  Match this string.
#define NOTHING 9 // no   Match empty string.
#define STAR                                                                  \
  10 // node Match this (simple) thing 0 or more
     // times.
#define PLUS                                                                  \
  11 // node Match this (simple) thing 1 or more
     // times.
#define OPEN                                                                  \
  20 // no   Mark this point in input as start of
     // #n.
// OPEN+1 is number 1, etc.
#define CLOSE 30 // no   Analogous to OPEN.

/*
 * Opcode notes:
 *
 * BRANCH       The set of branches constituting a single choice are hooked
 *              together with their "next" pointers, since precedence prevents
 *              anything being concatenated to any individual branch.  The
 *              "next" pointer of the last BRANCH in a choice points to the
 *              thing following the whole choice.  This is also where the
 *              final "next" pointer of each individual branch points; each
 *              branch starts with the operand node of a BRANCH node.
 *
 * BACK         Normal "next" pointers all implicitly point forward; BACK
 *              exists to make loop structures possible.
 *
 * STAR,PLUS    '?', and complex '*' and '+', are implemented as circular
 *              BRANCH structures using BACK.  Simple cases (one character
 *              per match) are implemented with STAR and PLUS for speed
 *              and to minimize recursive plunges.
 *
 * OPEN,CLOSE   ...are numbered at compile time.
 */

/*
 * A node is one char of opcode followed by two chars of "next" pointer.
 * "Next" pointers are stored as two 8-bit pieces, high order first.  The
 * value is a positive offset from the opcode of the node containing it.
 * An operand, if any, simply follows the node.  (Note that much of the
 * code generation knows about this implicit relationship.)
 *
 * Using two bytes for the "next" pointer is vast overkill for most things,
 * but allows patterns to get big without disasters.
 */

#define OP(p) (*(p))
#define NEXT(p) (((*((p) + 1) & 0377) << 8) + (*((p) + 2) & 0377))
#define OPERAND(p) ((p) + 3)

const unsigned char MAGIC = 0234;
/*
 * Utility definitions.
 */

#define UCHARAT(p) (reinterpret_cast<const unsigned char*>(p))[0]

#define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
#define META "^$.[()|?+*\\"

/*
 * Flags to be passed up and down.
 */
#define HASWIDTH 01 // Known never to match null string.
#define SIMPLE 02   // Simple enough to be STAR/PLUS operand.
#define SPSTART 04  // Starts with * or +.
#define WORST 0     // Worst case.

/////////////////////////////////////////////////////////////////////////
//
//  COMPILE AND ASSOCIATED FUNCTIONS
//
/////////////////////////////////////////////////////////////////////////

/*
 * Read only utility variables.
 */
static char regdummy;
static char* const regdummyptr = &regdummy;

/*
 * Utility class for RegularExpression::compile().
 */
class RegExpCompile
{
public:
  const char* regparse; // Input-scan pointer.
  int regnpar;          // () count.
  char* regcode;        // Code-emit pointer; regdummyptr = don't.
  long regsize;         // Code size.

  char* reg(int, int*);
  char* regbranch(int*);
  char* regpiece(int*);
  char* regatom(int*);
  char* regnode(char);
  void regc(char);
  void reginsert(char, char*);
  static void regtail(char*, const char*);
  static void regoptail(char*, const char*);
};

static const char* regnext(const char*);
static char* regnext(char*);

#ifdef STRCSPN
static int strcspn();
#endif

/*
 * We can't allocate space until we know how big the compiled form will be,
 * but we can't compile it (and thus know how big it is) until we've got a
 * place to put the code.  So we cheat:  we compile it twice, once with code
 * generation turned off and size counting turned on, and once "for real".
 * This also means that we don't allocate space until we are sure that the
 * thing really will compile successfully, and we never have to move the
 * code and thus invalidate pointers into it.  (Note that it has to be in
 * one piece because free() must be able to free it all.)
 *
 * Beware that the optimization-preparation code in here knows about some
 * of the structure of the compiled regexp.
 */

// compile -- compile a regular expression into internal code
// for later pattern matching.

bool RegularExpression::compile(const char* exp)
{
  const char* scan;
  const char* longest;
  int flags;

  if (exp == nullptr) {
    // RAISE Error, SYM(RegularExpression), SYM(No_Expr),
    printf("RegularExpression::compile(): No expression supplied.\n");
    return false;
  }

  // First pass: determine size, legality.
  RegExpCompile comp;
  comp.regparse = exp;
  comp.regnpar = 1;
  comp.regsize = 0L;
  comp.regcode = regdummyptr;
  comp.regc(static_cast<char>(MAGIC));
  if (!comp.reg(0, &flags)) {
    printf("RegularExpression::compile(): Error in compile.\n");
    return false;
  }
  this->regmatch.clear();

  // Small enough for pointer-storage convention?
  if (comp.regsize >= 65535L) {
    // RAISE Error, SYM(RegularExpression), SYM(Expr_Too_Big),
    printf("RegularExpression::compile(): Expression too big.\n");
    return false;
  }

  // Allocate space.
  //#ifndef _WIN32
  delete[] this->program;
  //#endif
  this->program = new char[comp.regsize];
  this->progsize = static_cast<int>(comp.regsize);

  if (this->program == nullptr) {
    // RAISE Error, SYM(RegularExpression), SYM(Out_Of_Memory),
    printf("RegularExpression::compile(): Out of memory.\n");
    return false;
  }

  // Second pass: emit code.
  comp.regparse = exp;
  comp.regnpar = 1;
  comp.regcode = this->program;
  comp.regc(static_cast<char>(MAGIC));
  comp.reg(0, &flags);

  // Dig out information for optimizations.
  this->regstart = '\0'; // Worst-case defaults.
  this->reganch = 0;
  this->regmust = nullptr;
  this->regmlen = 0;
  scan = this->program + 1;       // First BRANCH.
  if (OP(regnext(scan)) == END) { // Only one top-level choice.
    scan = OPERAND(scan);

    // Starting-point info.
    if (OP(scan) == EXACTLY)
      this->regstart = *OPERAND(scan);
    else if (OP(scan) == BOL)
      this->reganch++;

    //
    // If there's something expensive in the r.e., find the longest
    // literal string that must appear and make it the regmust.  Resolve
    // ties in favor of later strings, since the regstart check works
    // with the beginning of the r.e. and avoiding duplication
    // strengthens checking.  Not a strong reason, but sufficient in the
    // absence of others.
    //
    if (flags & SPSTART) {
      longest = nullptr;
      size_t len = 0;
      for (; scan != nullptr; scan = regnext(scan))
        if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
          longest = OPERAND(scan);
          len = strlen(OPERAND(scan));
        }
      this->regmust = longest;
      this->regmlen = len;
    }
  }
  return true;
}

/*
 - reg - regular expression, i.e. main body or parenthesized thing
 *
 * Caller must absorb opening parenthesis.
 *
 * Combining parenthesis handling with the base level of regular expression
 * is a trifle forced, but the need to tie the tails of the branches to what
 * follows makes it hard to avoid.
 */
char* RegExpCompile::reg(int paren, int* flagp)
{
  char* ret;
  char* br;
  char* ender;
  int parno = 0;
  int flags;

  *flagp = HASWIDTH; // Tentatively.

  // Make an OPEN node, if parenthesized.
  if (paren) {
    if (regnpar >= RegularExpressionMatch::NSUBEXP) {
      // RAISE Error, SYM(RegularExpression), SYM(Too_Many_Parens),
      printf("RegularExpression::compile(): Too many parentheses.\n");
      return nullptr;
    }
    parno = regnpar;
    regnpar++;
    ret = regnode(static_cast<char>(OPEN + parno));
  } else
    ret = nullptr;

  // Pick up the branches, linking them together.
  br = regbranch(&flags);
  if (br == nullptr)
    return (nullptr);
  if (ret != nullptr)
    regtail(ret, br); // OPEN -> first.
  else
    ret = br;
  if (!(flags & HASWIDTH))
    *flagp &= ~HASWIDTH;
  *flagp |= flags & SPSTART;
  while (*regparse == '|') {
    regparse++;
    br = regbranch(&flags);
    if (br == nullptr)
      return (nullptr);
    regtail(ret, br); // BRANCH -> BRANCH.
    if (!(flags & HASWIDTH))
      *flagp &= ~HASWIDTH;
    *flagp |= flags & SPSTART;
  }

  // Make a closing node, and hook it on the end.
  ender = regnode(static_cast<char>((paren) ? CLOSE + parno : END));
  regtail(ret, ender);

  // Hook the tails of the branches to the closing node.
  for (br = ret; br != nullptr; br = regnext(br))
    regoptail(br, ender);

  // Check for proper termination.
  if (paren && *regparse++ != ')') {
    // RAISE Error, SYM(RegularExpression), SYM(Unmatched_Parens),
    printf("RegularExpression::compile(): Unmatched parentheses.\n");
    return nullptr;
  } else if (!paren && *regparse != '\0') {
    if (*regparse == ')') {
      // RAISE Error, SYM(RegularExpression), SYM(Unmatched_Parens),
      printf("RegularExpression::compile(): Unmatched parentheses.\n");
      return nullptr;
    } else {
      // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
      printf("RegularExpression::compile(): Internal error.\n");
      return nullptr;
    }
    // NOTREACHED
  }
  return (ret);
}

/*
 - regbranch - one alternative of an | operator
 *
 * Implements the concatenation operator.
 */
char* RegExpCompile::regbranch(int* flagp)
{
  char* ret;
  char* chain;
  char* latest;
  int flags;

  *flagp = WORST; // Tentatively.

  ret = regnode(BRANCH);
  chain = nullptr;
  while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
    latest = regpiece(&flags);
    if (latest == nullptr)
      return (nullptr);
    *flagp |= flags & HASWIDTH;
    if (chain == nullptr) // First piece.
      *flagp |= flags & SPSTART;
    else
      regtail(chain, latest);
    chain = latest;
  }
  if (chain == nullptr) // Loop ran zero times.
    regnode(NOTHING);

  return (ret);
}

/*
 - regpiece - something followed by possible [*+?]
 *
 * Note that the branching code sequences used for ? and the general cases
 * of * and + are somewhat optimized:  they use the same NOTHING node as
 * both the endmarker for their branch list and the body of the last branch.
 * It might seem that this node could be dispensed with entirely, but the
 * endmarker role is not redundant.
 */
char* RegExpCompile::regpiece(int* flagp)
{
  char* ret;
  char op;
  char* next;
  int flags;

  ret = regatom(&flags);
  if (ret == nullptr)
    return (nullptr);

  op = *regparse;
  if (!ISMULT(op)) {
    *flagp = flags;
    return (ret);
  }

  if (!(flags & HASWIDTH) && op != '?') {
    // RAISE Error, SYM(RegularExpression), SYM(Empty_Operand),
    printf("RegularExpression::compile() : *+ operand could be empty.\n");
    return nullptr;
  }
  *flagp = (op != '+') ? (WORST | SPSTART) : (WORST | HASWIDTH);

  if (op == '*' && (flags & SIMPLE))
    reginsert(STAR, ret);
  else if (op == '*') {
    // Emit x* as (x&|), where & means "self".
    reginsert(BRANCH, ret);         // Either x
    regoptail(ret, regnode(BACK));  // and loop
    regoptail(ret, ret);            // back
    regtail(ret, regnode(BRANCH));  // or
    regtail(ret, regnode(NOTHING)); // null.
  } else if (op == '+' && (flags & SIMPLE))
    reginsert(PLUS, ret);
  else if (op == '+') {
    // Emit x+ as x(&|), where & means "self".
    next = regnode(BRANCH); // Either
    regtail(ret, next);
    regtail(regnode(BACK), ret);    // loop back
    regtail(next, regnode(BRANCH)); // or
    regtail(ret, regnode(NOTHING)); // null.
  } else if (op == '?') {
    // Emit x? as (x|)
    reginsert(BRANCH, ret);        // Either x
    regtail(ret, regnode(BRANCH)); // or
    next = regnode(NOTHING);       // null.
    regtail(ret, next);
    regoptail(ret, next);
  }
  regparse++;
  if (ISMULT(*regparse)) {
    // RAISE Error, SYM(RegularExpression), SYM(Nested_Operand),
    printf("RegularExpression::compile(): Nested *?+.\n");
    return nullptr;
  }
  return (ret);
}

/*
 - regatom - the lowest level
 *
 * Optimization:  gobbles an entire sequence of ordinary characters so that
 * it can turn them into a single node, which is smaller to store and
 * faster to run.  Backslashed characters are exceptions, each becoming a
 * separate node; the code is simpler that way and it's not worth fixing.
 */
char* RegExpCompile::regatom(int* flagp)
{
  char* ret;
  int flags;

  *flagp = WORST; // Tentatively.

  switch (*regparse++) {
    case '^':
      ret = regnode(BOL);
      break;
    case '$':
      ret = regnode(EOL);
      break;
    case '.':
      ret = regnode(ANY);
      *flagp |= HASWIDTH | SIMPLE;
      break;
    case '[': {
      int rxpclass;
      int rxpclassend;

      if (*regparse == '^') { // Complement of range.
        ret = regnode(ANYBUT);
        regparse++;
      } else
        ret = regnode(ANYOF);
      if (*regparse == ']' || *regparse == '-')
        regc(*regparse++);
      while (*regparse != '\0' && *regparse != ']') {
        if (*regparse == '-') {
          regparse++;
          if (*regparse == ']' || *regparse == '\0')
            regc('-');
          else {
            rxpclass = UCHARAT(regparse - 2) + 1;
            rxpclassend = UCHARAT(regparse);
            if (rxpclass > rxpclassend + 1) {
              // RAISE Error, SYM(RegularExpression), SYM(Invalid_Range),
              printf("RegularExpression::compile(): Invalid range in [].\n");
              return nullptr;
            }
            for (; rxpclass <= rxpclassend; rxpclass++)
              regc(static_cast<char>(rxpclass));
            regparse++;
          }
        } else
          regc(*regparse++);
      }
      regc('\0');
      if (*regparse != ']') {
        // RAISE Error, SYM(RegularExpression), SYM(Unmatched_Bracket),
        printf("RegularExpression::compile(): Unmatched [].\n");
        return nullptr;
      }
      regparse++;
      *flagp |= HASWIDTH | SIMPLE;
    } break;
    case '(':
      ret = reg(1, &flags);
      if (ret == nullptr)
        return (nullptr);
      *flagp |= flags & (HASWIDTH | SPSTART);
      break;
    case '\0':
    case '|':
    case ')':
      // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
      printf("RegularExpression::compile(): Internal error.\n"); // Never here
      return nullptr;
    case '?':
    case '+':
    case '*':
      // RAISE Error, SYM(RegularExpression), SYM(No_Operand),
      printf("RegularExpression::compile(): ?+* follows nothing.\n");
      return nullptr;
    case '\\':
      if (*regparse == '\0') {
        // RAISE Error, SYM(RegularExpression), SYM(Trailing_Backslash),
        printf("RegularExpression::compile(): Trailing backslash.\n");
        return nullptr;
      }
      ret = regnode(EXACTLY);
      regc(*regparse++);
      regc('\0');
      *flagp |= HASWIDTH | SIMPLE;
      break;
    default: {
      int len;
      char ender;

      regparse--;
      len = int(strcspn(regparse, META));
      if (len <= 0) {
        // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
        printf("RegularExpression::compile(): Internal error.\n");
        return nullptr;
      }
      ender = *(regparse + len);
      if (len > 1 && ISMULT(ender))
        len--; // Back off clear of ?+* operand.
      *flagp |= HASWIDTH;
      if (len == 1)
        *flagp |= SIMPLE;
      ret = regnode(EXACTLY);
      while (len > 0) {
        regc(*regparse++);
        len--;
      }
      regc('\0');
    } break;
  }
  return (ret);
}

/*
 - regnode - emit a node
   Location.
 */
char* RegExpCompile::regnode(char op)
{
  char* ret;
  char* ptr;

  ret = regcode;
  if (ret == regdummyptr) {
    regsize += 3;
    return (ret);
  }

  ptr = ret;
  *ptr++ = op;
  *ptr++ = '\0'; // Null "next" pointer.
  *ptr++ = '\0';
  regcode = ptr;

  return (ret);
}

/*
 - regc - emit (if appropriate) a byte of code
 */
void RegExpCompile::regc(char b)
{
  if (regcode != regdummyptr)
    *regcode++ = b;
  else
    regsize++;
}

/*
 - reginsert - insert an operator in front of already-emitted operand
 *
 * Means relocating the operand.
 */
void RegExpCompile::reginsert(char op, char* opnd)
{
  char* src;
  char* dst;
  char* place;

  if (regcode == regdummyptr) {
    regsize += 3;
    return;
  }

  src = regcode;
  regcode += 3;
  dst = regcode;
  while (src > opnd)
    *--dst = *--src;

  place = opnd; // Op node, where operand used to be.
  *place++ = op;
  *place++ = '\0';
  *place = '\0';
}

/*
 - regtail - set the next-pointer at the end of a node chain
 */
void RegExpCompile::regtail(char* p, const char* val)
{
  char* scan;
  char* temp;
  int offset;

  if (p == regdummyptr)
    return;

  // Find last node.
  scan = p;
  for (;;) {
    temp = regnext(scan);
    if (temp == nullptr)
      break;
    scan = temp;
  }

  if (OP(scan) == BACK)
    offset = int(scan - val);
  else
    offset = int(val - scan);
  *(scan + 1) = static_cast<char>((offset >> 8) & 0377);
  *(scan + 2) = static_cast<char>(offset & 0377);
}

/*
 - regoptail - regtail on operand of first argument; nop if operandless
 */
void RegExpCompile::regoptail(char* p, const char* val)
{
  // "Operandless" and "op != BRANCH" are synonymous in practice.
  if (p == nullptr || p == regdummyptr || OP(p) != BRANCH)
    return;
  regtail(OPERAND(p), val);
}

////////////////////////////////////////////////////////////////////////
//
//  find and friends
//
////////////////////////////////////////////////////////////////////////

/*
 * Utility class for RegularExpression::find().
 */
class RegExpFind
{
public:
  const char* reginput;   // String-input pointer.
  const char* regbol;     // Beginning of input, for ^ check.
  const char** regstartp; // Pointer to startp array.
  const char** regendp;   // Ditto for endp.

  int regtry(const char*, const char**, const char**, const char*);
  int regmatch(const char*);
  int regrepeat(const char*);
};

// find -- Matches the regular expression to the given string.
// Returns true if found, and sets start and end indexes accordingly.
bool RegularExpression::find(char const* string,
                             RegularExpressionMatch& rmatch) const
{
  const char* s;

  rmatch.clear();
  rmatch.searchstring = string;

  if (!this->program) {
    return false;
  }

  // Check validity of program.
  if (UCHARAT(this->program) != MAGIC) {
    // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
    printf(
      "RegularExpression::find(): Compiled regular expression corrupted.\n");
    return false;
  }

  // If there is a "must appear" string, look for it.
  if (this->regmust != nullptr) {
    s = string;
    while ((s = strchr(s, this->regmust[0])) != nullptr) {
      if (strncmp(s, this->regmust, this->regmlen) == 0)
        break; // Found it.
      s++;
    }
    if (s == nullptr) // Not present.
      return false;
  }

  RegExpFind regFind;

  // Mark beginning of line for ^ .
  regFind.regbol = string;

  // Simplest case:  anchored match need be tried only once.
  if (this->reganch)
    return (
      regFind.regtry(string, rmatch.startp, rmatch.endp, this->program) != 0);

  // Messy cases:  unanchored match.
  s = string;
  if (this->regstart != '\0')
    // We know what char it must start with.
    while ((s = strchr(s, this->regstart)) != nullptr) {
      if (regFind.regtry(s, rmatch.startp, rmatch.endp, this->program))
        return true;
      s++;
    }
  else
    // We don't -- general case.
    do {
      if (regFind.regtry(s, rmatch.startp, rmatch.endp, this->program))
        return true;
    } while (*s++ != '\0');

  // Failure.
  return false;
}

/*
 - regtry - try match at specific point
   0 failure, 1 success
 */
int RegExpFind::regtry(const char* string, const char** start,
                       const char** end, const char* prog)
{
  int i;
  const char** sp1;
  const char** ep;

  reginput = string;
  regstartp = start;
  regendp = end;

  sp1 = start;
  ep = end;
  for (i = RegularExpressionMatch::NSUBEXP; i > 0; i--) {
    *sp1++ = nullptr;
    *ep++ = nullptr;
  }
  if (regmatch(prog + 1)) {
    start[0] = string;
    end[0] = reginput;
    return (1);
  } else
    return (0);
}

/*
 - regmatch - main matching routine
 *
 * Conceptually the strategy is simple:  check to see whether the current
 * node matches, call self recursively to see whether the rest matches,
 * and then act accordingly.  In practice we make some effort to avoid
 * recursion, in particular by going through "ordinary" nodes (that don't
 * need to know whether the rest of the match failed) by a loop instead of
 * by recursion.
 * 0 failure, 1 success
 */
int RegExpFind::regmatch(const char* prog)
{
  const char* scan; // Current node.
  const char* next; // Next node.

  scan = prog;

  while (scan != nullptr) {

    next = regnext(scan);

    switch (OP(scan)) {
      case BOL:
        if (reginput != regbol)
          return (0);
        break;
      case EOL:
        if (*reginput != '\0')
          return (0);
        break;
      case ANY:
        if (*reginput == '\0')
          return (0);
        reginput++;
        break;
      case EXACTLY: {
        size_t len;
        const char* opnd;

        opnd = OPERAND(scan);
        // Inline the first character, for speed.
        if (*opnd != *reginput)
          return (0);
        len = strlen(opnd);
        if (len > 1 && strncmp(opnd, reginput, len) != 0)
          return (0);
        reginput += len;
      } break;
      case ANYOF:
        if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == nullptr)
          return (0);
        reginput++;
        break;
      case ANYBUT:
        if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != nullptr)
          return (0);
        reginput++;
        break;
      case NOTHING:
        break;
      case BACK:
        break;
      case OPEN + 1:
      case OPEN + 2:
      case OPEN + 3:
      case OPEN + 4:
      case OPEN + 5:
      case OPEN + 6:
      case OPEN + 7:
      case OPEN + 8:
      case OPEN + 9: {
        int no;
        const char* save;

        no = OP(scan) - OPEN;
        save = reginput;

        if (regmatch(next)) {

          //
          // Don't set startp if some later invocation of the
          // same parentheses already has.
          //
          if (regstartp[no] == nullptr)
            regstartp[no] = save;
          return (1);
        } else
          return (0);
      }
      //              break;
      case CLOSE + 1:
      case CLOSE + 2:
      case CLOSE + 3:
      case CLOSE + 4:
      case CLOSE + 5:
      case CLOSE + 6:
      case CLOSE + 7:
      case CLOSE + 8:
      case CLOSE + 9: {
        int no;
        const char* save;

        no = OP(scan) - CLOSE;
        save = reginput;

        if (regmatch(next)) {

          //
          // Don't set endp if some later invocation of the
          // same parentheses already has.
          //
          if (regendp[no] == nullptr)
            regendp[no] = save;
          return (1);
        } else
          return (0);
      }
      //              break;
      case BRANCH: {

        const char* save;

        if (OP(next) != BRANCH) // No choice.
          next = OPERAND(scan); // Avoid recursion.
        else {
          do {
            save = reginput;
            if (regmatch(OPERAND(scan)))
              return (1);
            reginput = save;
            scan = regnext(scan);
          } while (scan != nullptr && OP(scan) == BRANCH);
          return (0);
          // NOTREACHED
        }
      } break;
      case STAR:
      case PLUS: {
        char nextch;
        int no;
        const char* save;
        int min_no;

        //
        // Lookahead to avoid useless match attempts when we know
        // what character comes next.
        //
        nextch = '\0';
        if (OP(next) == EXACTLY)
          nextch = *OPERAND(next);
        min_no = (OP(scan) == STAR) ? 0 : 1;
        save = reginput;
        no = regrepeat(OPERAND(scan));
        while (no >= min_no) {
          // If it could work, try it.
          if (nextch == '\0' || *reginput == nextch)
            if (regmatch(next))
              return (1);
          // Couldn't or didn't -- back up.
          no--;
          reginput = save + no;
        }
        return (0);
      }
      //              break;
      case END:
        return (1); // Success!

      default:
        // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
        printf(
          "RegularExpression::find(): Internal error -- memory corrupted.\n");
        return 0;
    }
    scan = next;
  }

  //
  //  We get here only if there's trouble -- normally "case END" is the
  //  terminating point.
  //
  // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
  printf("RegularExpression::find(): Internal error -- corrupted pointers.\n");
  return (0);
}

/*
 - regrepeat - repeatedly match something simple, report how many
 */
int RegExpFind::regrepeat(const char* p)
{
  int count = 0;
  const char* scan;
  const char* opnd;

  scan = reginput;
  opnd = OPERAND(p);
  switch (OP(p)) {
    case ANY:
      count = int(strlen(scan));
      scan += count;
      break;
    case EXACTLY:
      while (*opnd == *scan) {
        count++;
        scan++;
      }
      break;
    case ANYOF:
      while (*scan != '\0' && strchr(opnd, *scan) != nullptr) {
        count++;
        scan++;
      }
      break;
    case ANYBUT:
      while (*scan != '\0' && strchr(opnd, *scan) == nullptr) {
        count++;
        scan++;
      }
      break;
    default: // Oh dear.  Called inappropriately.
      // RAISE Error, SYM(RegularExpression), SYM(Internal_Error),
      printf("cm RegularExpression::find(): Internal error.\n");
      return 0;
  }
  reginput = scan;
  return (count);
}

/*
 - regnext - dig the "next" pointer out of a node
 */
static const char* regnext(const char* p)
{
  int offset;

  if (p == regdummyptr)
    return (nullptr);

  offset = NEXT(p);
  if (offset == 0)
    return (nullptr);

  if (OP(p) == BACK)
    return (p - offset);
  else
    return (p + offset);
}

static char* regnext(char* p)
{
  int offset;

  if (p == regdummyptr)
    return (nullptr);

  offset = NEXT(p);
  if (offset == 0)
    return (nullptr);

  if (OP(p) == BACK)
    return (p - offset);
  else
    return (p + offset);
}

} // namespace KWSYS_NAMESPACE