17641c5eaSYuri Pankov /*
2*14247eb6SBill Sommerfeld * Copyright 2023 Bill Sommerfeld <sommerfeld@hamachi.org>
37641c5eaSYuri Pankov * Copyright (c) 1992, 1993, 1994 Henry Spencer.
47641c5eaSYuri Pankov * Copyright (c) 1992, 1993, 1994
57641c5eaSYuri Pankov * The Regents of the University of California. All rights reserved.
67641c5eaSYuri Pankov *
77641c5eaSYuri Pankov * This code is derived from software contributed to Berkeley by
87641c5eaSYuri Pankov * Henry Spencer.
97641c5eaSYuri Pankov *
107641c5eaSYuri Pankov * Redistribution and use in source and binary forms, with or without
117641c5eaSYuri Pankov * modification, are permitted provided that the following conditions
127641c5eaSYuri Pankov * are met:
137641c5eaSYuri Pankov * 1. Redistributions of source code must retain the above copyright
147641c5eaSYuri Pankov * notice, this list of conditions and the following disclaimer.
157641c5eaSYuri Pankov * 2. Redistributions in binary form must reproduce the above copyright
167641c5eaSYuri Pankov * notice, this list of conditions and the following disclaimer in the
177641c5eaSYuri Pankov * documentation and/or other materials provided with the distribution.
187641c5eaSYuri Pankov * 3. Neither the name of the University nor the names of its contributors
197641c5eaSYuri Pankov * may be used to endorse or promote products derived from this software
207641c5eaSYuri Pankov * without specific prior written permission.
217641c5eaSYuri Pankov *
227641c5eaSYuri Pankov * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
237641c5eaSYuri Pankov * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
247641c5eaSYuri Pankov * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
257641c5eaSYuri Pankov * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
267641c5eaSYuri Pankov * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
277641c5eaSYuri Pankov * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
287641c5eaSYuri Pankov * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
297641c5eaSYuri Pankov * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
307641c5eaSYuri Pankov * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
317641c5eaSYuri Pankov * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
327641c5eaSYuri Pankov * SUCH DAMAGE.
337641c5eaSYuri Pankov */
347641c5eaSYuri Pankov
357641c5eaSYuri Pankov /*
367641c5eaSYuri Pankov * First, the stuff that ends up in the outside-world include file
377641c5eaSYuri Pankov * typedef off_t regoff_t;
387641c5eaSYuri Pankov * typedef struct {
397641c5eaSYuri Pankov * int re_magic;
407641c5eaSYuri Pankov * size_t re_nsub; // number of parenthesized subexpressions
417641c5eaSYuri Pankov * const char *re_endp; // end pointer for REG_PEND
427641c5eaSYuri Pankov * struct re_guts *re_g; // none of your business :-)
437641c5eaSYuri Pankov * } regex_t;
447641c5eaSYuri Pankov * typedef struct {
457641c5eaSYuri Pankov * regoff_t rm_so; // start of match
467641c5eaSYuri Pankov * regoff_t rm_eo; // end of match
477641c5eaSYuri Pankov * } regmatch_t;
487641c5eaSYuri Pankov */
497641c5eaSYuri Pankov /*
507641c5eaSYuri Pankov * internals of regex_t
517641c5eaSYuri Pankov */
527641c5eaSYuri Pankov #define MAGIC1 ((('r'^0200)<<8) | 'e')
537641c5eaSYuri Pankov
547641c5eaSYuri Pankov /*
557641c5eaSYuri Pankov * The internal representation is a *strip*, a sequence of
567641c5eaSYuri Pankov * operators ending with an endmarker. (Some terminology etc. is a
577641c5eaSYuri Pankov * historical relic of earlier versions which used multiple strips.)
587641c5eaSYuri Pankov * Certain oddities in the representation are there to permit running
597641c5eaSYuri Pankov * the machinery backwards; in particular, any deviation from sequential
607641c5eaSYuri Pankov * flow must be marked at both its source and its destination. Some
617641c5eaSYuri Pankov * fine points:
627641c5eaSYuri Pankov *
637641c5eaSYuri Pankov * - OPLUS_ and O_PLUS are *inside* the loop they create.
647641c5eaSYuri Pankov * - OQUEST_ and O_QUEST are *outside* the bypass they create.
657641c5eaSYuri Pankov * - OCH_ and O_CH are *outside* the multi-way branch they create, while
667641c5eaSYuri Pankov * OOR1 and OOR2 are respectively the end and the beginning of one of
677641c5eaSYuri Pankov * the branches. Note that there is an implicit OOR2 following OCH_
687641c5eaSYuri Pankov * and an implicit OOR1 preceding O_CH.
697641c5eaSYuri Pankov *
707641c5eaSYuri Pankov * In state representations, an operator's bit is on to signify a state
717641c5eaSYuri Pankov * immediately *preceding* "execution" of that operator.
727641c5eaSYuri Pankov */
737641c5eaSYuri Pankov typedef unsigned int sop; /* strip operator */
747641c5eaSYuri Pankov typedef unsigned int sopno;
757641c5eaSYuri Pankov #define OPRMASK 0xf8000000U
767641c5eaSYuri Pankov #define OPDMASK 0x07ffffffU
777641c5eaSYuri Pankov #define OPSHIFT ((unsigned)27)
787641c5eaSYuri Pankov #define OP(n) ((n)&OPRMASK)
797641c5eaSYuri Pankov #define OPND(n) ((n)&OPDMASK)
807641c5eaSYuri Pankov #define SOP(op, opnd) ((op)|(opnd))
817641c5eaSYuri Pankov /* operators meaning operand */
827641c5eaSYuri Pankov /* (back, fwd are offsets) */
837641c5eaSYuri Pankov #define OEND (1U<<OPSHIFT) /* endmarker - */
847641c5eaSYuri Pankov #define OCHAR (2U<<OPSHIFT) /* character wide character */
857641c5eaSYuri Pankov #define OBOL (3U<<OPSHIFT) /* left anchor - */
867641c5eaSYuri Pankov #define OEOL (4U<<OPSHIFT) /* right anchor - */
877641c5eaSYuri Pankov #define OANY (5U<<OPSHIFT) /* . - */
887641c5eaSYuri Pankov #define OANYOF (6U<<OPSHIFT) /* [...] set number */
897641c5eaSYuri Pankov #define OBACK_ (7U<<OPSHIFT) /* begin \d paren number */
907641c5eaSYuri Pankov #define O_BACK (8U<<OPSHIFT) /* end \d paren number */
917641c5eaSYuri Pankov #define OPLUS_ (9U<<OPSHIFT) /* + prefix fwd to suffix */
927641c5eaSYuri Pankov #define O_PLUS (10U<<OPSHIFT) /* + suffix back to prefix */
937641c5eaSYuri Pankov #define OQUEST_ (11U<<OPSHIFT) /* ? prefix fwd to suffix */
947641c5eaSYuri Pankov #define O_QUEST (12U<<OPSHIFT) /* ? suffix back to prefix */
957641c5eaSYuri Pankov #define OLPAREN (13U<<OPSHIFT) /* ( fwd to ) */
967641c5eaSYuri Pankov #define ORPAREN (14U<<OPSHIFT) /* ) back to ( */
977641c5eaSYuri Pankov #define OCH_ (15U<<OPSHIFT) /* begin choice fwd to OOR2 */
987641c5eaSYuri Pankov #define OOR1 (16U<<OPSHIFT) /* | pt. 1 back to OOR1 or OCH_ */
997641c5eaSYuri Pankov #define OOR2 (17U<<OPSHIFT) /* | pt. 2 fwd to OOR2 or O_CH */
1007641c5eaSYuri Pankov #define O_CH (18U<<OPSHIFT) /* end choice back to OOR1 */
1017641c5eaSYuri Pankov #define OBOW (19U<<OPSHIFT) /* begin word - */
1027641c5eaSYuri Pankov #define OEOW (20U<<OPSHIFT) /* end word - */
1037641c5eaSYuri Pankov
1047641c5eaSYuri Pankov /*
1057641c5eaSYuri Pankov * Structures for [] character-set representation.
1067641c5eaSYuri Pankov */
1077641c5eaSYuri Pankov typedef struct {
1087641c5eaSYuri Pankov wint_t min;
1097641c5eaSYuri Pankov wint_t max;
1107641c5eaSYuri Pankov } crange;
1117641c5eaSYuri Pankov typedef struct {
1121603eda2SYuri Pankov unsigned char bmp[NC_MAX / 8];
1137641c5eaSYuri Pankov wctype_t *types;
1147641c5eaSYuri Pankov unsigned int ntypes;
1157641c5eaSYuri Pankov wint_t *wides;
1167641c5eaSYuri Pankov unsigned int nwides;
1177641c5eaSYuri Pankov crange *ranges;
1187641c5eaSYuri Pankov unsigned int nranges;
1197641c5eaSYuri Pankov int invert;
1207641c5eaSYuri Pankov int icase;
1217641c5eaSYuri Pankov } cset;
1227641c5eaSYuri Pankov
123*14247eb6SBill Sommerfeld static inline int
CHIN1(int nc,cset * cs,wint_t ch)124*14247eb6SBill Sommerfeld CHIN1(int nc, cset *cs, wint_t ch)
1257641c5eaSYuri Pankov {
1267641c5eaSYuri Pankov unsigned int i;
1277641c5eaSYuri Pankov
1287641c5eaSYuri Pankov assert(ch >= 0);
129*14247eb6SBill Sommerfeld if (ch < nc)
1307641c5eaSYuri Pankov return (((cs->bmp[ch >> 3] & (1 << (ch & 7))) != 0) ^
1317641c5eaSYuri Pankov cs->invert);
1321603eda2SYuri Pankov for (i = 0; i < cs->nwides; i++) {
1331603eda2SYuri Pankov if (cs->icase) {
1341603eda2SYuri Pankov if (ch == towlower(cs->wides[i]) ||
1351603eda2SYuri Pankov ch == towupper(cs->wides[i]))
1367641c5eaSYuri Pankov return (!cs->invert);
1371603eda2SYuri Pankov } else if (ch == cs->wides[i])
1381603eda2SYuri Pankov return (!cs->invert);
1391603eda2SYuri Pankov }
1407641c5eaSYuri Pankov for (i = 0; i < cs->nranges; i++)
1417641c5eaSYuri Pankov if (cs->ranges[i].min <= ch && ch <= cs->ranges[i].max)
1427641c5eaSYuri Pankov return (!cs->invert);
1437641c5eaSYuri Pankov for (i = 0; i < cs->ntypes; i++)
1447641c5eaSYuri Pankov if (iswctype(ch, cs->types[i]))
1457641c5eaSYuri Pankov return (!cs->invert);
1467641c5eaSYuri Pankov return (cs->invert);
1477641c5eaSYuri Pankov }
1487641c5eaSYuri Pankov
149*14247eb6SBill Sommerfeld static inline int
CHIN(int nc,cset * cs,wint_t ch)150*14247eb6SBill Sommerfeld CHIN(int nc, cset *cs, wint_t ch)
1517641c5eaSYuri Pankov {
1527641c5eaSYuri Pankov
1537641c5eaSYuri Pankov assert(ch >= 0);
154*14247eb6SBill Sommerfeld if (ch < nc)
1557641c5eaSYuri Pankov return (((cs->bmp[ch >> 3] & (1 << (ch & 7))) != 0) ^
1567641c5eaSYuri Pankov cs->invert);
1577641c5eaSYuri Pankov else if (cs->icase)
158*14247eb6SBill Sommerfeld return (CHIN1(nc, cs, ch) || CHIN1(nc, cs, towlower(ch)) ||
159*14247eb6SBill Sommerfeld CHIN1(nc, cs, towupper(ch)));
1607641c5eaSYuri Pankov else
161*14247eb6SBill Sommerfeld return (CHIN1(nc, cs, ch));
1627641c5eaSYuri Pankov }
1637641c5eaSYuri Pankov
1647641c5eaSYuri Pankov /*
1657641c5eaSYuri Pankov * main compiled-expression structure
1667641c5eaSYuri Pankov */
1677641c5eaSYuri Pankov struct re_guts {
1687641c5eaSYuri Pankov int magic;
1697641c5eaSYuri Pankov #define MAGIC2 ((('R'^0200)<<8)|'E')
1707641c5eaSYuri Pankov sop *strip; /* malloced area for strip */
1717641c5eaSYuri Pankov unsigned int ncsets; /* number of csets in use */
1727641c5eaSYuri Pankov cset *sets; /* -> cset [ncsets] */
1737641c5eaSYuri Pankov int cflags; /* copy of regcomp() cflags argument */
1747641c5eaSYuri Pankov sopno nstates; /* = number of sops */
1757641c5eaSYuri Pankov sopno firststate; /* the initial OEND (normally 0) */
1767641c5eaSYuri Pankov sopno laststate; /* the final OEND */
1777641c5eaSYuri Pankov int iflags; /* internal flags */
1787641c5eaSYuri Pankov #define USEBOL 01 /* used ^ */
1797641c5eaSYuri Pankov #define USEEOL 02 /* used $ */
1807641c5eaSYuri Pankov #define BAD 04 /* something wrong */
1817641c5eaSYuri Pankov int nbol; /* number of ^ used */
1827641c5eaSYuri Pankov int neol; /* number of $ used */
1837641c5eaSYuri Pankov char *must; /* match must contain this string */
1847641c5eaSYuri Pankov int moffset; /* latest point at which must may be located */
1857641c5eaSYuri Pankov int *charjump; /* Boyer-Moore char jump table */
1867641c5eaSYuri Pankov int *matchjump; /* Boyer-Moore match jump table */
1877641c5eaSYuri Pankov int mlen; /* length of must */
1887641c5eaSYuri Pankov size_t nsub; /* copy of re_nsub */
1897641c5eaSYuri Pankov int backrefs; /* does it use back references? */
1907641c5eaSYuri Pankov sopno nplus; /* how deep does it nest +s? */
191*14247eb6SBill Sommerfeld unsigned int mb_cur_max;
1927641c5eaSYuri Pankov };
1937641c5eaSYuri Pankov
1947641c5eaSYuri Pankov /* misc utilities */
1957641c5eaSYuri Pankov #define OUT (CHAR_MIN - 1) /* a non-character value */
19662e74980SKyle Evans #define IGN (CHAR_MIN - 2)
1977641c5eaSYuri Pankov #define ISWORD(c) (iswalnum((uch)(c)) || (c) == '_')
198