1 /* ed style regular expressions */
2 /*
3 Copyright (C) 2004-2017,2018 John E. Davis
4 
5 This file is part of the S-Lang Library.
6 
7 The S-Lang Library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License as
9 published by the Free Software Foundation; either version 2 of the
10 License, or (at your option) any later version.
11 
12 The S-Lang Library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 General Public License for more details.
16 
17 You should have received a copy of the GNU General Public License
18 along with this library; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20 USA.
21 */
22 
23 #include "slinclud.h"
24 
25 #include "slang.h"
26 #include "_slang.h"
27 
/* Compiled regular expression together with the results of the most
 * recent match attempt.  Instances are created by SLregexp_compile and
 * released by SLregexp_free.
 */
struct _pSLRegexp_Type
{
   /* These must be set by calling routine. */
   unsigned char *pat;		       /* regular expression pattern (source text) */
   unsigned char *buf;		       /* buffer receiving the compiled opcodes */
   size_t buf_len;		       /* size of buf in bytes */
   int case_sensitive;		       /* 1 if match is case sensitive  */

   /* The rest are set by regexp_compile */

   int must_match;		       /* 1 if must_match_str holds a required substring */
   int must_match_bol;		       /* true if the pattern began with ^ */
   unsigned char must_match_str[16];   /* up to 15 literal chars, null terminated */
   int osearch;			       /* 1 if the pattern is purely literal, so an
					* ordinary substring search suffices */
   size_t min_length;		       /* lower bound on subject length; shorter
					* subjects are rejected without matching */
   ssize_t beg_matches[10];	       /* offset of start of \( ; -1 if the group
					* did not take part in the match */
   size_t end_matches[10];	       /* length of nth submatch
					* Note that the entire match corresponds
					* to \0
					*/
   int offset;			       /* offset to be added to beg_matches
					* NOTE(review): not referenced in this
					* file -- confirm whether still used */
};
50 
/* Bit-set helpers for the 32 byte (256 bit) tables that follow a RANGE
 * opcode.  b is the table, n the character value.  Arguments and the
 * whole expansion are parenthesized so that the macros behave correctly
 * when given expressions such as `p + 1' and when used inside larger
 * expressions.
 */
#define SET_BIT(b, n) ((b)[(unsigned int) (n) >> 3] |= 1 << ((unsigned int) (n) % 8))
#define TEST_BIT(b, n) ((b)[(unsigned int)(n) >> 3] & (1 << ((unsigned int) (n) % 8)))

/* Opcodes of the compiled pattern.  The low nibble selects the element
 * type; the high nibble carries the repetition modifier, if any.
 */
#define LITERAL 1
#define RANGE 2			       /* [...] */
#define ANY 3			       /* . */
#define BOL 4			       /* ^ */
#define EOL 5			       /* $ */
#define NTH_MATCH 6		       /* \1 \2 ... \9 */
#define OPAREN 7		       /* \( */
#define CPAREN 0x8		       /* \) */
#define ANY_DIGIT 0x9		       /* \d */
#define BOW	0xA		       /* \< */
#define EOW	0xB		       /* \> */
#if 0
#define NOT_LITERAL		0xC	       /* \~ */
#endif
#define STAR 0x80		       /* * */
#define LEAST_ONCE 0x40		       /* + */
#define MAYBE_ONCE 0x20		       /* ? */
#define MANY 0x10		       /* {n,m} */
/* The rest are additions.  They reuse modifier/type combinations that
 * cannot otherwise occur (BOL and EOL never take a STAR).
 */
#define YES_CASE (STAR | BOL)
#define NO_CASE  (STAR | EOL)

/* Case folding used by both compiler and matcher.  Both macros rely on
 * a variable named `cs' (non-zero == case sensitive) being in scope at
 * the point of use.
 */
#define UPPERCASE(x)  (cs ? (x) : UPPER_CASE(x))
#define LOWERCASE(x)  (cs ? (x) : LOWER_CASE(x))

/* FIXME: UTF8 */
/* Lookup table of word characters; filled in on the first call to
 * regexp_compile.
 */
static unsigned char Word_Chars[256];
#define IS_WORD_CHAR(x) (Word_Chars[(unsigned int) (x)])
81 
82 #if 0
83 static int ctx->open_paren_number;
84 static char Closed_Paren_Matches[10];
85 
86 static SLRegexp_Type *This_Reg;
87 static unsigned char *This_Str;
88 #endif
89 
/* Per-match working state, so that the matcher needs no file-scope
 * variables.  One instance is set up by init_re_context for each call
 * to regexp_match.
 */
typedef struct
{
   SLRegexp_Type *reg;		       /* regexp being matched; receives the
					* beg_matches/end_matches results */
   SLCONST unsigned char *str;	       /* start of the subject string */
   SLstrlen_Type len;		       /* length of the subject string */
   char closed_paren_matches[10];      /* [n] != 0 once \( \) group n has been
					* closed during the current attempt */
   int open_paren_number;	       /* number of \( seen so far in the
					* current attempt */
}
Re_Context_Type;
99 
do_nth_match(Re_Context_Type * ctx,int idx,SLCONST unsigned char * str,SLCONST unsigned char * estr)100 static SLCONST unsigned char *do_nth_match (Re_Context_Type *ctx, int idx, SLCONST unsigned char *str, SLCONST unsigned char *estr)
101 {
102    SLCONST unsigned char *bpos;
103    size_t m;
104 
105    if (ctx->closed_paren_matches[idx] == 0)
106      return NULL;
107 
108    bpos = ctx->reg->beg_matches[idx] + ctx->str;
109    m = ctx->reg->end_matches[idx];
110    if (m == 0) return(str);
111    if (str + m > estr) return (NULL);
112 
113    /* This needs fixed for case in-sensitive match */
114    if (0 != strncmp((char *) str, (char *) bpos, m)) return (NULL);
115    str += m;
116    return (str);
117 }
118 
119 /* returns pointer to the end of regexp or NULL */
regexp_looking_at(Re_Context_Type * ctx,SLCONST unsigned char * str,SLCONST unsigned char * estr,unsigned char * regexp,int cs)120 static SLCONST unsigned char *regexp_looking_at (Re_Context_Type *ctx,
121 						 SLCONST unsigned char *str, SLCONST unsigned char *estr,
122 						 unsigned char *regexp,
123 						 int cs)
124 {
125    register unsigned char p, p1;
126    SLCONST unsigned char *save_str, *tmpstr;
127    int n, n0, n1;
128    int save_num_open;
129    char save_closed_matches[10];
130 
131    p = *regexp++;
132 
133    while (p != 0)
134      {
135 	/* p1 = UPPERCASE(*regexp); */
136 	/* if (str < estr) c = UPPERCASE(*str); */
137 
138 	switch((unsigned char) p)
139 	  {
140 	   case BOW:
141 	     if ((str != ctx->str)
142 		 && ((str >= estr)
143 		     || IS_WORD_CHAR(*(str - 1))
144 		     || (0 == IS_WORD_CHAR(*str)))) return NULL;
145 	     break;
146 
147 	   case EOW:
148 	     if ((str < estr)
149 		 && IS_WORD_CHAR (*str)) return NULL;
150 	     break;
151 
152 	   case YES_CASE: cs = 1; break;
153 	   case NO_CASE: cs = 0; break;
154 
155 	   case OPAREN:
156 	     ctx->open_paren_number++;
157 	     ctx->reg->beg_matches[ctx->open_paren_number] = (str - ctx->str);
158 	     break;
159 	   case CPAREN:
160 	     n = ctx->open_paren_number;
161 	     while (n > 0)
162 	       {
163 		  if (ctx->closed_paren_matches[n] != 0)
164 		    {
165 		       n--;
166 		       continue;
167 		    }
168 		  ctx->closed_paren_matches[n] = 1;
169 		  ctx->reg->end_matches[n] = (str - (ctx->str + ctx->reg->beg_matches[n]));
170 		  break;
171 	       }
172 	     break;
173 #ifdef NOT_LITERAL
174 	   case NOT_LITERAL:
175 	     if ((str >= estr) || (*regexp == UPPERCASE(*str))) return (NULL);
176 	     str++; regexp++;
177 	     break;
178 
179 	   case MAYBE_ONCE | NOT_LITERAL:
180 	     save_str = str;
181 	     if ((str < estr) && (*regexp != UPPERCASE(*str))) str++;
182 	     regexp++;
183 	     goto match_rest;
184 
185 	   case NOT_LITERAL | LEAST_ONCE:   /* match at least once */
186 	     if ((str >= estr) || (UPPERCASE(*str) == UPPERCASE(*regexp))) return (NULL);
187 	     str++;
188 	     /* drop */
189 	   case STAR | NOT_LITERAL:
190 	     save_str = str;  p1 = *regexp;
191 	     while ((str < estr) && (UPPERCASE(*str) != p1)) str++;
192 	     regexp++;
193 	     goto match_rest;
194 
195 	     /* this type consists of the expression + two bytes that
196 	        determine number of matches to perform */
197 	   case MANY | NOT_LITERAL:
198 	     p1 = *regexp; regexp++;
199 	     n = n0 = (int) (unsigned char) *regexp++;
200 	     /* minimum number to match--- could be 0 */
201 	     n1 = (int) (unsigned char) *regexp++;
202 	     /* maximum number to match */
203 
204 	     while (n && (str < estr) && (p1 != UPPERCASE(*str)))
205 	       {
206 		  n--;
207 		  str++;
208 	       }
209 	     if (n) return (NULL);
210 
211 	     save_str = str;
212 	     n = n1 - n0;
213 	     while (n && (str < estr) && (p1 != UPPERCASE(*str)))
214 	       {
215 		  n--;
216 		  str++;
217 	       }
218 	     goto match_rest;
219 #endif				       /* NOT_LITERAL */
220 	   case LITERAL:
221 	     if ((str >= estr) || (*regexp != UPPERCASE(*str))) return (NULL);
222 	     str++; regexp++;
223 	     break;
224 
225 	   case MAYBE_ONCE | LITERAL:
226 	     save_str = str;
227 	     if ((str < estr) && (*regexp == UPPERCASE(*str))) str++;
228 	     regexp++;
229 	     goto match_rest;
230 
231 	   case LITERAL | LEAST_ONCE:   /* match at least once */
232 	     if ((str >= estr) || (UPPERCASE(*str) != UPPERCASE(*regexp))) return (NULL);
233 	     str++;
234 	     /* drop */
235 	   case STAR | LITERAL:
236 	     save_str = str;  p1 = *regexp;
237 	     while ((str < estr) && (UPPERCASE(*str) == p1)) str++;
238 	     regexp++;
239 	     goto match_rest;
240 
241 	     /* this type consists of the expression + two bytes that
242 	        determine number of matches to perform */
243 	   case MANY | LITERAL:
244 	     p1 = *regexp; regexp++;
245 	     n = n0 = (int) (unsigned char) *regexp++;
246 	     /* minimum number to match--- could be 0 */
247 	     n1 = (int) (unsigned char) *regexp++;
248 	     /* maximum number to match */
249 
250 	     while (n && (str < estr) && (p1 == UPPERCASE(*str)))
251 	       {
252 		  n--;
253 		  str++;
254 	       }
255 	     if (n) return (NULL);
256 
257 	     save_str = str;
258 	     n = n1 - n0;
259 	     while (n && (str < estr) && (p1 == UPPERCASE(*str)))
260 	       {
261 		  n--;
262 		  str++;
263 	       }
264 	     goto match_rest;
265 
266 	   case NTH_MATCH:
267 	     if ((str = do_nth_match(ctx, (int) (unsigned char) *regexp, str, estr)) == NULL) return(NULL);
268 	     regexp++;
269 	     break;
270 
271 	   case MAYBE_ONCE | NTH_MATCH:
272 	     save_str = str;
273 	     tmpstr = do_nth_match (ctx, (int) (unsigned char) *regexp, str, estr);
274 	     if (tmpstr != NULL)
275 	       str = tmpstr;
276 	     regexp++;
277 	     goto match_rest;
278 
279 	   case LEAST_ONCE | NTH_MATCH:
280 	     if ((str = do_nth_match(ctx, (int) (unsigned char) *regexp, str, estr)) == NULL) return(NULL);
281 	     /* drop */
282 	   case STAR | NTH_MATCH:
283 	     save_str = str;
284 	     while (NULL != (tmpstr = do_nth_match(ctx, (int) (unsigned char) *regexp, str, estr)))
285 	       {
286 		  str = tmpstr;
287 	       }
288 	     regexp++;
289 	     goto match_rest;
290 
291 	   case MANY | NTH_MATCH:
292 	     /* minimum number to match--- could be 0 */
293 	     n = n0 = (int) (unsigned char) regexp[1];
294 	     /* maximum number to match */
295 	     n1 = (int) (unsigned char) regexp[2];
296 
297 	     while (n && (str < estr)
298 		    && (NULL != (tmpstr = do_nth_match(ctx, (int) (unsigned char) *regexp, str, estr))))
299 	       {
300 		  n--;
301 		  str = tmpstr;
302 	       }
303 	     if (n) return (NULL);
304 
305 	     save_str = str;
306 	     n = n1 - n0;
307 	     while (n && (str < estr)
308 		    && (NULL != (tmpstr = do_nth_match(ctx, (int) (unsigned char) *regexp, str, estr))))
309 	       {
310 		  n--;
311 		  str = tmpstr;
312 	       }
313 	     regexp += 3;
314 	     goto match_rest;
315 
316 	   case RANGE:
317 	     if (str >= estr) return (NULL);
318 	     if (TEST_BIT(regexp, UPPERCASE(*str)) == 0) return (NULL);
319 	     regexp += 32; str++;
320 	     break;
321 
322 	   case MAYBE_ONCE | RANGE:
323 	     save_str = str;
324 	     if ((str < estr) && TEST_BIT(regexp, UPPERCASE(*str))) str++;
325 	     regexp += 32;
326 	     goto match_rest;
327 
328 	   case LEAST_ONCE | RANGE:
329 	     if ((str >= estr) || (0 == TEST_BIT(regexp, UPPERCASE(*str)))) return NULL;
330 	     str++;
331 	     /* drop */
332 	   case STAR | RANGE:
333 	     save_str = str;
334 	     while ((str < estr) && TEST_BIT(regexp, UPPERCASE(*str))) str++;
335 	     regexp += 32;
336 	     goto match_rest;
337 
338 	     /* The first 32 bytes correspond to the range and the two
339 	      * following bytes indicate the min and max number of matches.
340 	      */
341 	   case MANY | RANGE:
342 	     /* minimum number to match--- could be 0 */
343 	     n = n0 = (int) (unsigned char) *(regexp + 32);
344 	     /* maximum number to match */
345 	     n1 = (int) (unsigned char) *(regexp + 33);
346 
347 	     while (n && (str < estr) && (TEST_BIT(regexp, UPPERCASE(*str))))
348 	       {
349 		  n--;
350 		  str++;
351 	       }
352 	     if (n) return (NULL);
353 	     save_str = str;
354 	     n = n1 - n0;
355 	     while (n && (str < estr) && (TEST_BIT(regexp, UPPERCASE(*str))))
356 	       {
357 		  n--;
358 		  str++;
359 	       }
360 	     regexp += 34;		       /* 32 + 2 */
361 	     goto match_rest;
362 
363 	   case ANY_DIGIT:
364 	     if ((str >= estr) || (*str > '9') || (*str < '0')) return (NULL);
365 	     str++;
366 	     break;
367 
368 	   case MAYBE_ONCE | ANY_DIGIT:
369 	     save_str = str;
370 	     if ((str < estr) && ((*str <= '9') && (*str >= '0'))) str++;
371 	     goto match_rest;
372 
373 	   case LEAST_ONCE | ANY_DIGIT:
374 	     if ((str >= estr) || ((*str > '9') || (*str < '0'))) return NULL;
375 	     str++;
376 	     /* drop */
377 	   case STAR | ANY_DIGIT:
378 	     save_str = str;
379 	     while ((str < estr) && ((*str <= '9') && (*str >= '0'))) str++;
380 	     goto match_rest;
381 
382 	   case MANY | ANY_DIGIT:
383 	     /* minimum number to match--- could be 0 */
384 	     n = n0 = (int) (unsigned char) *regexp++;
385 	     /* maximum number to match */
386 	     n1 = (int) (unsigned char) *regexp++;
387 
388 	     while (n && (str < estr) && (*str <= '9') && (*str >= '0'))
389 	       {
390 		  n--;
391 		  str++;
392 	       }
393 	     if (n) return (NULL);
394 	     save_str = str;
395 	     n = n1 - n0;
396 	     while (n && (str < estr) && (*str <= '9') && (*str >= '0'))
397 	       {
398 		  n--;
399 		  str++;
400 	       }
401 	     goto match_rest;
402 
403 	   case ANY:		       /* . */
404 	     /* FIXME: UTF8 */
405 	     if ((str >= estr) || (*str == '\n')) return (NULL);
406 	     str++;
407 	     break;
408 
409 	   case MAYBE_ONCE | ANY:      /* .? */
410 	     /* FIXME: UTF8 */
411 	     save_str = str;
412 	     if ((str < estr) && (*str != '\n')) str++;
413 	     goto match_rest;
414 
415 	   case LEAST_ONCE | ANY:      /* .+ */
416 	     /* FIXME: UTF8 */
417 	     if ((str >= estr) || (*str == '\n')) return (NULL);
418 	     str++;
419 	     /* drop */
420 	   case STAR | ANY:	       /* .* */
421 	     /* FIXME: UTF8 */
422 	     save_str = str;
423 	     while ((str < estr) && (*str != '\n')) str++;
424 	     goto match_rest;
425 
426 	   case MANY | ANY:
427 	     /* minimum number to match--- could be 0 */
428 	     n = n0 = (int) (unsigned char) *regexp++;
429 	     /* maximum number to match */
430 	     n1 = (int) (unsigned char) *regexp++;
431 
432 	     while (n && (str < estr) && (*str != '\n'))
433 	       {
434 		  n--;
435 		  str++;
436 	       }
437 	     if (n) return (NULL);
438 	     save_str = str;
439 	     n = n1 - n0;
440 	     while (n && (str < estr) && (*str != '\n'))
441 	       {
442 		  n--;
443 		  str++;
444 	       }
445 	     goto match_rest;
446 
447 	   case EOL:
448 	     if (str >= estr)
449 	       return str;
450 	     if ((*str == '\n') && (str+1 == estr))
451 	       return str;
452 	     return(NULL);
453 
454 	   default: return (NULL);
455 	  }
456 	p = *regexp++;
457 	continue;
458 
459 	match_rest:
460 	if (save_str == str)
461 	  {
462 	     p = *regexp++;
463 	     continue;
464 	  }
465 
466 	/* if (p == EOL)
467 	 * {
468 	 * if (str < estr) return (NULL); else return (str);
469 	 * }
470 	 */
471 
472 	SLMEMCPY(save_closed_matches, ctx->closed_paren_matches, sizeof(save_closed_matches));
473 	save_num_open = ctx->open_paren_number;
474 	while (str >= save_str)
475 	  {
476 	     tmpstr = regexp_looking_at (ctx, str, estr, regexp, cs);
477 	     if (tmpstr != NULL) return(tmpstr);
478 	     SLMEMCPY(ctx->closed_paren_matches, save_closed_matches, sizeof(ctx->closed_paren_matches));
479 	     ctx->open_paren_number = save_num_open;
480 	     str--;
481 	  }
482 	return NULL;
483      }
484    if ((p != 0) && (p != EOL)) return (NULL); else return (str);
485 }
486 
487 static void
fixup_beg_end_matches(Re_Context_Type * ctx,SLRegexp_Type * r,SLCONST unsigned char * str,SLCONST unsigned char * epos)488 fixup_beg_end_matches (Re_Context_Type *ctx, SLRegexp_Type *r,
489 		       SLCONST unsigned char *str, SLCONST unsigned char *epos)
490 {
491    int i;
492 
493    if (str == NULL)
494      {
495 	r->beg_matches[0] = -1;
496 	r->end_matches[0] = 0;
497 	SLMEMSET(ctx->closed_paren_matches, 0, sizeof(ctx->closed_paren_matches));
498      }
499    else
500      {
501 	r->beg_matches[0] = (str - ctx->str);
502 	r->end_matches[0] = (epos - str);
503      }
504 
505    for (i = 1; i < 10; i++)
506      {
507 	if (ctx->closed_paren_matches [i] == 0)
508 	  {
509 	     r->beg_matches[i] = -1;
510 	     r->end_matches[i] = 0;
511 	  }
512      }
513 }
514 
/* Reset *ctx to all zeros and attach it to the regexp and the subject
 * string (str, len) that is about to be searched.
 */
static void init_re_context (Re_Context_Type *ctx, SLRegexp_Type *reg,
                             SLCONST unsigned char *str, SLstrlen_Type len)
{
   memset ((char *) ctx, 0, sizeof (*ctx));

   ctx->len = len;
   ctx->str = str;
   ctx->reg = reg;
}
523 
regexp_match(SLCONST unsigned char * str,SLstrlen_Type len,SLRegexp_Type * reg)524 static SLCONST unsigned char *regexp_match(SLCONST unsigned char *str,
525 					   SLstrlen_Type len, SLRegexp_Type *reg)
526 {
527    unsigned char c = 0;
528    SLCONST unsigned char *estr = str + len;
529    int cs = reg->case_sensitive, lit = 0;
530    unsigned char *buf = reg->buf;
531    SLCONST unsigned char *epos = NULL;
532    Re_Context_Type ctx_buf;
533 
534    if (reg->min_length > len) return NULL;
535 
536    init_re_context (&ctx_buf, reg, str, len);
537 
538    if (*buf == BOL)
539      {
540 	if (NULL == (epos = regexp_looking_at (&ctx_buf, str, estr, buf + 1, cs)))
541 	  str = NULL;
542 
543 	fixup_beg_end_matches (&ctx_buf, reg, str, epos);
544 	return str;
545      }
546 
547    if (*buf == NO_CASE)
548      {
549 	buf++;  cs = 0;
550      }
551 
552    if (*buf == YES_CASE)
553      {
554 	buf++;  cs = 1;
555      }
556 
557    if (*buf == LITERAL)
558      {
559 	lit = 1;
560 	c = *(buf + 1);
561      }
562    else if ((*buf == OPAREN) && (*(buf + 1) == LITERAL))
563      {
564 	lit = 1;
565 	c = *(buf + 2);
566      }
567 
568    while (1)
569      {
570 	ctx_buf.open_paren_number = 0;
571 	memset (ctx_buf.closed_paren_matches, 0, sizeof(ctx_buf.closed_paren_matches));
572 	/* take care of leading chars */
573 	if (lit)
574 	  {
575 	     while ((str < estr) && (c != UPPERCASE(*str))) str++;
576 	     if (str >= estr)
577 	       break;		       /* failed */
578 	  }
579 
580 	if (NULL != (epos = regexp_looking_at(&ctx_buf, str, estr, buf, cs)))
581 	  {
582 	     fixup_beg_end_matches (&ctx_buf, reg, str, epos);
583 	     return str;
584 	  }
585 	if (str >= estr)
586 	  break;
587 	str++;
588      }
589    fixup_beg_end_matches (&ctx_buf, reg, NULL, epos);
590    return NULL;
591 }
592 
/* Public entry point: search the len bytes at str for a match of reg.
 * Returns a pointer to the start of the match or NULL if none is found.
 */
char *SLregexp_match (SLRegexp_Type *reg, SLFUTURE_CONST char *str, SLstrlen_Type len)
{
   SLCONST unsigned char *where;

   where = regexp_match ((SLCONST unsigned char *)str, len, reg);
   return (char *) where;
}
597 
/* Parse the run of decimal digits at pat.
 *
 * pat  text to parse
 * nn   receives the value on success (0 if there are no digits)
 *
 * Returns a pointer to the first non-digit character, or NULL if the
 * value exceeds 255.  Repeat counts are stored in a single byte of the
 * compiled pattern, so larger values cannot be represented; rejecting
 * them here also keeps the accumulator from overflowing on long digit
 * strings.  *nn is left untouched when NULL is returned.
 */
static unsigned char *convert_digit(unsigned char *pat, int *nn)
{
   int n = 0;
   unsigned char c;

   while (c = (unsigned char) *pat, (c <= '9') && (c >= '0'))
     {
        n = 10 * n + (c - '0');
        if (n > 255)
          return NULL;
        pat++;
     }
   *nn = n;
   return pat;
}
610 
611 #define ERROR  return (int) (pat - reg->pat)
612 
613 /* Returns 0 if successful or offset in pattern of error */
regexp_compile(SLRegexp_Type * reg)614 static int regexp_compile (SLRegexp_Type *reg)
615 {
616    register unsigned char *buf, *ebuf, *pat;
617    unsigned char *last = NULL, *tmppat;
618    register unsigned char c;
619    int i, reverse = 0, n, cs;
620    int oparen = 0, nparen = 0;
621    /* substring stuff */
622    int count, last_count, this_max_mm = 0, max_mm = 0, ordinary_search,
623      no_osearch = 0, min_length = 0;
624    unsigned char *mm_p = NULL, *this_mm_p = NULL;
625    static int already_initialized;
626 
627    reg->beg_matches[0] = reg->end_matches[0] = 0;
628    buf = reg->buf;
629    ebuf = (reg->buf + reg->buf_len) - 2; /* make some room */
630    pat = reg->pat;
631    cs = reg->case_sensitive;
632 
633    if (already_initialized == 0)
634      {
635 	SLang_init_case_tables ();
636 #ifdef IBMPC_SYSTEM
637 	SLmake_lut (Word_Chars, (unsigned char *) "_0-9a-zA-Z\200-\232\240-\245\341-\353", 0);
638 #else
639 	SLmake_lut (Word_Chars, (unsigned char *) "_0-9a-zA-Z\277-\326\330-\336\340-\366\370-\376", 0);
640 #endif
641 	already_initialized = 1;
642      }
643 
644    i = 1; while (i < 10)
645      {
646 	reg->beg_matches[i] = -1;
647 	reg->end_matches[i] = 0;
648 	i++;
649      }
650 
651    if (*pat == '\\')
652      {
653 	if (pat[1] == 'c')
654 	  {
655 	     cs = 1;
656 	     pat += 2;
657 	     no_osearch = 1;
658 	  }
659 	else if (pat[1] == 'C')
660 	  {
661 	     cs = 0;
662 	     pat += 2;
663 	     no_osearch = 1;
664 	  }
665      }
666 
667    if (*pat == '^')
668      {
669 	pat++;
670 	*buf++ = BOL;
671 	reg->must_match_bol = 1;
672      }
673    else reg->must_match_bol = 0;
674 
675    if (cs != reg->case_sensitive)
676      {
677 	if (cs) *buf++ = YES_CASE;
678 	else *buf++ = NO_CASE;
679      }
680 
681    *buf = 0;
682 
683    last_count = count = 0;
684    while ((c = *pat++) != 0)
685      {
686 	if (buf >= ebuf - 3)
687 	  {
688 	     _pSLang_verror (SL_BUILTIN_LIMIT_EXCEEDED, "Pattern too large to be compiled.");
689 	     ERROR;
690 	  }
691 
692 	count++;
693 	switch (c)
694 	  {
695 	   case '$':
696 	     if (*pat != 0) goto literal_char;
697 	     *buf++ = EOL;
698 	     break;
699 
700 	   case '\\':
701 	     c = *pat++;
702 	     no_osearch = 1;
703 	     switch(c)
704 	       {
705 		case 'e': c = 033; goto literal_char;
706 		case 'n': c = '\n'; goto literal_char;
707 		case 't': c = '\t'; goto literal_char;
708 		case 'C': cs = 0; *buf++ = NO_CASE; break;
709 		case 'c': cs = 1; *buf++ = YES_CASE; break;
710 		case '1': case '2': case '3':  case '4':  case '5':
711 		case '6': case '7': case '8':  case '9':
712 		  c = c - '0';
713 		  if ((int) c > nparen) ERROR;
714 		  last = buf;
715 		  *buf++ = NTH_MATCH; *buf++ = c;
716 		  break;
717 #ifdef NOT_LITERAL
718 		case '~':	       /* slang extension */
719 		  if ((c = *pat) == 0) ERROR;
720 		  pat++;
721 		  last = buf;
722 		  *buf++ = NOT_LITERAL;
723 		  *buf++ = c;
724 		  min_length++;
725 		  break;
726 #endif
727 		case 'd':	       /* slang extension */
728 		  last = buf;
729 		  *buf++ = ANY_DIGIT;
730 		  min_length++;
731 		  break;
732 
733 		case '<':
734 		  last = NULL;
735 		  *buf++ = BOW;
736 		  break;
737 
738 		case '>':
739 		  last = NULL;
740 		  *buf++ = EOW;
741 		  break;
742 
743 		case '{':
744 		  if (last == NULL) goto literal_char;
745 		  *last |= MANY;
746 		  tmppat = convert_digit(pat, &n);
747 		  if (tmppat == NULL) ERROR;
748 		  pat = tmppat;
749 		  *buf++ = n;
750 
751 		  min_length += (n - 1);
752 
753 		  if (*pat == '\\')
754 		    {
755 		       *buf++ = n;
756 		    }
757 		  else if (*pat == ',')
758 		    {
759 		       pat++;
760 		       if (*pat == '\\')
761 			 {
762 			    n = 255;
763 			 }
764 		       else
765 			 {
766 			    tmppat = convert_digit(pat, &n);
767 			    if (tmppat == NULL) ERROR;
768 			    pat = tmppat;
769 			    if (*pat != '\\') ERROR;
770 			 }
771 		       *buf++ = n;
772 		    }
773 		  else ERROR;
774 		  last = NULL;
775 		  pat++;
776 		  if (*pat != '}') ERROR;
777 		  pat++;
778 		  break;   /* case '{' */
779 
780 		case '(':
781 		  oparen++;
782 		  if (oparen > 9) ERROR;
783 		  *buf++ = OPAREN;
784 		  break;
785 		case ')':
786 		  if (oparen == 0) ERROR;
787 		  oparen--;
788 		  nparen++;
789 		  *buf++ = CPAREN;
790 		  break;
791 
792 		case 0: ERROR;
793 		default:
794 		  goto literal_char;
795 	       }
796 	     break;
797 
798 	   case '[':
799 
800 	     *buf = RANGE;
801 	     last = buf++;
802 
803 	     if (buf + 32 >= ebuf) ERROR;
804 
805 	     for (i = 0; i < 32; i++) buf[i] = 0;
806 	     c = *pat++;
807 	     if (c == '^')
808 	       {
809 		  reverse = 1;
810 		  SET_BIT(buf, '\n');
811 		  c = *pat++;
812 	       }
813 
814 	     if (c == ']')
815 	       {
816 		  SET_BIT(buf, c);
817 		  c = *pat++;
818 	       }
819 	     while (c && (c != ']'))
820 	       {
821 		  if (c == '\\')
822 		    {
823 		       c = *pat++;
824 		       switch(c)
825 			 {
826 			    case 'n': c = '\n'; break;
827 			    case 't': c = '\t'; break;
828 			    case 0: ERROR;
829 			 }
830 		    }
831 
832 		  if (*pat == '-')
833 		    {
834 		       pat++;
835 		       while (c < *pat)
836 			 {
837 			    if (cs == 0)
838 			      {
839 				 SET_BIT(buf, UPPERCASE(c));
840 				 SET_BIT(buf, LOWERCASE(c));
841 			      }
842 			    else SET_BIT(buf, c);
843 			    c++;
844 			 }
845 		    }
846 		  if (cs == 0)
847 		    {
848 		       SET_BIT(buf, UPPERCASE(c));
849 		       SET_BIT(buf, LOWERCASE(c));
850 		    }
851 		  else SET_BIT(buf, c);
852 		  c = *pat++;
853 	       }
854 	     if (c != ']') ERROR;
855 	     if (reverse) for (i = 0; i < 32; i++) buf[i] = buf[i] ^ 0xFF;
856 	     reverse = 0;
857 	     buf += 32;
858 	     min_length++;
859 	     break;
860 
861 	   case '.':
862 	     last = buf;
863 	     *buf++ = ANY;
864 	     min_length++;
865 	     break;
866 
867 	   case '*':
868 	     if (last == NULL) goto literal_char;
869 	     *last |= STAR;
870 	     min_length--;
871 	     last = NULL;
872 	     break;
873 
874 	   case '+':
875 	     if (last == NULL) goto literal_char;
876 	     *last |= LEAST_ONCE;
877 	     last = NULL;
878 	     break;
879 
880 	   case '?':
881 	     if (last == NULL) goto literal_char;
882 	     *last |= MAYBE_ONCE;
883 	     last = NULL;
884 	     min_length--;
885 	     break;
886 
887 	   literal_char:
888 	   default:
889 	     /* This is to keep track of longest substring */
890 	     min_length++;
891 	     this_max_mm++;
892 	     if (last_count + 1 == count)
893 	       {
894 		  if (this_max_mm == 1)
895 		    {
896 		       this_mm_p = buf;
897 		    }
898 		  else if (max_mm < this_max_mm)
899 		    {
900 		       mm_p = this_mm_p;
901 		       max_mm = this_max_mm;
902 		    }
903 	       }
904 	     else
905 	       {
906 		  this_mm_p = buf;
907 		  this_max_mm = 1;
908 	       }
909 
910 	     last_count = count;
911 
912 	     last = buf;
913 	     *buf++ = LITERAL;
914 	     *buf++ = UPPERCASE(c);
915 	  }
916      }
917    *buf = 0;
918    /* Check for ordinary search */
919    ebuf = buf;
920    buf = reg->buf;
921 
922    if (no_osearch) ordinary_search = 0;
923    else
924      {
925 	ordinary_search = 1;
926 	while (buf < ebuf)
927 	  {
928 	     if (*buf != LITERAL)
929 	       {
930 		  ordinary_search = 0;
931 		  break;
932 	       }
933 	     buf += 2;
934 	  }
935      }
936 
937    reg->osearch = ordinary_search;
938    reg->must_match_str[15] = 0;
939    reg->min_length = (min_length > 0) ? (unsigned int) min_length : 0;
940    if (ordinary_search)
941      {
942 	strncpy((char *) reg->must_match_str, (char *) reg->pat, 15);
943 	reg->must_match = 1;
944 	return(0);
945      }
946    /* check for longest substring of pattern */
947    reg->must_match = 0;
948    if ((mm_p == NULL) && (this_mm_p != NULL)) mm_p = this_mm_p;
949    if (mm_p == NULL)
950      {
951 	return (0);
952      }
953    n = 15;
954    pat = reg->must_match_str;
955    buf = mm_p;
956    while (n--)
957      {
958 	if (*buf++ != LITERAL) break;
959 	*pat++ = *buf++;
960      }
961    *pat = 0;
962    if (pat != reg->must_match_str) reg->must_match = 1;
963    return(0);
964 }
965 
/* Release a regexp created by SLregexp_compile together with its
 * opcode buffer.  Passing NULL is allowed and does nothing.
 */
void SLregexp_free (SLRegexp_Type *reg)
{
   if (reg != NULL)
     {
        if (reg->buf != NULL)
          SLfree ((char *) reg->buf);

        SLfree ((char *) reg);
     }
}
974 
SLregexp_compile(SLFUTURE_CONST char * pattern,unsigned int flags)975 SLRegexp_Type *SLregexp_compile (SLFUTURE_CONST char *pattern, unsigned int flags)
976 {
977    SLRegexp_Type *reg;
978    int ret;
979 
980    reg = (SLRegexp_Type *)SLcalloc (1, sizeof (SLRegexp_Type));
981    if (reg == NULL)
982      return NULL;
983 
984    if (NULL == (reg->buf = (unsigned char *)SLmalloc (1024)))
985      {
986 	SLfree ((char *) reg);
987 	return NULL;
988      }
989    reg->buf_len = 1024;
990    reg->case_sensitive = (0 == (flags & SLREGEXP_CASELESS));
991    reg->pat = (unsigned char *)pattern;
992 
993    if (0 != (ret = regexp_compile (reg)))
994      {
995 	SLang_verror (SL_Parse_Error, "Error compiling RE '%s' at byte offset %d",
996 		      pattern, ret);
997 	SLregexp_free (reg);
998 	return NULL;
999      }
1000 
1001    return reg;
1002 }
1003 
/* Copy re into buf, putting a backslash in front of every character
 * that is special in a regular expression ($ \ [ ] . ^ * + ?), so that
 * the result matches re literally.
 *
 * re      string to quote
 * buf     destination buffer
 * buflen  size of buf in bytes
 *
 * Returns buf (null terminated) on success.  Returns NULL if re is NULL
 * or if buf is too small to hold the quoted string and its terminator.
 */
char *SLregexp_quote_string (SLFUTURE_CONST char *re, char *buf, unsigned int buflen)
{
   unsigned int used = 0;

   if (re == NULL) return NULL;

   while (used < buflen)
     {
        char ch = *re++;
        int is_special;

        if (ch == 0)
          {
             buf[used] = 0;
             return buf;
          }

        is_special = ((ch == '$') || (ch == '\\')
                      || (ch == '[') || (ch == ']')
                      || (ch == '.') || (ch == '^')
                      || (ch == '*') || (ch == '+')
                      || (ch == '?'));

        if (is_special)
          {
             buf[used++] = '\\';
             /* No room left for the character itself */
             if (used == buflen)
               break;
          }

        buf[used++] = ch;
     }

   return NULL;
}
1041 
/* Retrieve the position of the nth sub-match of the most recent match
 * (nth == 0 is the whole match).
 *
 * ofsp, lenp  if non-NULL, receive the offset and the length of the
 *             sub-match
 *
 * Returns 0 on success.  Returns -1 if the sub-match is not set, or if
 * nth is out of range (in which case SL_InvalidParm_Error is raised).
 */
int SLregexp_nth_match (SLRegexp_Type *reg, unsigned int nth,
                        SLstrlen_Type *ofsp, SLstrlen_Type *lenp)
{
   if (nth > 9)
     {
        SLang_set_error (SL_InvalidParm_Error);
        return -1;
     }

   if (reg->beg_matches[nth] >= 0)
     {
        if (ofsp != NULL)
          *ofsp = reg->beg_matches[nth];
        if (lenp != NULL)
          *lenp = reg->end_matches[nth];

        return 0;
     }

   return -1;
}
1060 
/* Report optimization hints for a compiled regexp through *hintsp:
 * SLREGEXP_HINT_OSEARCH if an ordinary substring search suffices and
 * SLREGEXP_HINT_BOL if the pattern is anchored at the beginning of the
 * line.  Returns 0 on success, -1 if reg is NULL.
 */
int SLregexp_get_hints (SLRegexp_Type *reg, unsigned int *hintsp)
{
   unsigned int osearch_bit = 0;
   unsigned int bol_bit = 0;

   if (reg == NULL)
     return -1;

   if (reg->osearch)
     osearch_bit = SLREGEXP_HINT_OSEARCH;

   if (reg->must_match_bol)
     bol_bit = SLREGEXP_HINT_BOL;

   *hintsp = osearch_bit | bol_bit;
   return 0;
}
1074 
1075 #if 0
1076 #define MAX_EXP 4096
1077 int main(int argc, char **argv)
1078 {
1079    FILE *fp;
1080    char *regexp, *file;
1081    char expbuf[MAX_EXP], buf[512];
1082    SLRegexp_Type reg;
1083 
1084    file = argv[2];
1085    regexp = argv[1];
1086 
1087    if (NULL == (fp = fopen(file, "r")))
1088      {
1089 	fprintf(stderr, "File not open\n");
1090 	return(1);
1091      }
1092 
1093    reg.buf = expbuf;
1094    reg.buf_len = MAX_EXP;
1095    reg.pat = regexp;
1096    reg.case_sensitive = 1;
1097 
1098    if (!regexp_compile(&reg)) while (NULL != fgets(buf, 511, fp))
1099      {
1100 	if (reg.osearch)
1101 	  {
1102 	     if (NULL == strstr(buf, reg.pat)) continue;
1103 	  }
1104 	else
1105 	  {
1106 	     if (reg.must_match && (NULL == strstr(buf, reg.must_match_str))) continue;
1107 	     if (0 == regexp_match(buf, buf + strlen(buf), &reg)) continue;
1108 	  }
1109 
1110 	fputs(buf, stdout);
1111      }
1112    return (0);
1113 }
1114 #endif
1115