1 /* ed style regular expressions */
2 /*
3 Copyright (C) 2004-2017,2018 John E. Davis
4
5 This file is part of the S-Lang Library.
6
7 The S-Lang Library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License as
9 published by the Free Software Foundation; either version 2 of the
10 License, or (at your option) any later version.
11
12 The S-Lang Library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this library; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20 USA.
21 */
22
#include "slinclud.h"

#include <limits.h>

#include "slang.h"
#include "_slang.h"
27
/* A compiled regular expression plus the results of the most recent
 * match attempt made with it.
 */
struct _pSLRegexp_Type
{
   /* --- Inputs: must be filled in before the pattern is compiled. --- */

   /* Source text of the pattern. */
   unsigned char *pat;
   /* Storage that receives the compiled program. */
   unsigned char *buf;
   /* Capacity of buf in bytes. */
   size_t buf_len;
   /* 1 if matching is case sensitive. */
   int case_sensitive;

   /* --- Everything below is filled in by the compiler / matcher. --- */

   /* 1 if a line must contain must_match_str in order to match. */
   int must_match;
   /* Non-zero if the pattern is anchored at the beginning of the line. */
   int must_match_bol;
   /* Required literal substring: at most 15 characters plus a NUL. */
   unsigned char must_match_str[16];
   /* 1 if the pattern is purely literal, so an ordinary search suffices. */
   int osearch;
   /* Lower bound on the length of any match. */
   size_t min_length;
   /* Offset of the start of each \( group; slot 0 is the whole match.
    * A negative value marks a group that did not take part in the match.
    */
   ssize_t beg_matches[10];
   /* Length of each submatch; slot 0 is the length of the whole match. */
   size_t end_matches[10];
   /* Offset to be added to beg_matches (not referenced by the code
    * visible in this file).
    */
   int offset;
};
50
/* Bitmap helpers.  b is an array of bytes and n a bit number; bit n lives
 * in byte n/8 at position n%8.  Both macros parenthesize their arguments
 * and their whole expansion so that they can be used with arbitrary
 * expressions (e.g. SET_BIT(base + k, ch)) and inside larger expressions.
 * Note that n is evaluated twice.
 */
#define SET_BIT(b, n) ((b)[(unsigned int) (n) >> 3] |= 1 << ((unsigned int) (n) % 8))
#define TEST_BIT(b, n) ((b)[(unsigned int) (n) >> 3] & (1 << ((unsigned int) (n) % 8)))

/* Opcodes of the compiled program.  The low bits select the atom. */
#define LITERAL 1
#define RANGE 2 /* [...] */
#define ANY 3 /* . */
#define BOL 4 /* ^ */
#define EOL 5 /* $ */
#define NTH_MATCH 6 /* \1 \2 ... \9 */
#define OPAREN 7 /* \( */
#define CPAREN 0x8 /* \) */
#define ANY_DIGIT 0x9 /* \d */
#define BOW 0xA /* \< */
#define EOW 0xB /* \> */
#if 0
#define NOT_LITERAL 0xC /* \~ */
#endif

/* Quantifier flags; the compiler ORs one of these into the atom's opcode. */
#define STAR 0x80 /* * */
#define LEAST_ONCE 0x40 /* + */
#define MAYBE_ONCE 0x20 /* ? */
#define MANY 0x10 /* {n,m} */

/* The rest are additions: opcodes that switch case sensitivity on or off
 * in the middle of a pattern.  They are built from otherwise unused
 * quantifier/atom combinations.
 */
#define YES_CASE (STAR | BOL)
#define NO_CASE (STAR | EOL)

/* Case folding helpers.  Both rely on a variable named `cs' being in scope
 * at the point of use: when cs is non-zero the character is used as is,
 * otherwise it is folded.
 */
#define UPPERCASE(x) (cs ? (x) : UPPER_CASE(x))
#define LOWERCASE(x) (cs ? (x) : LOWER_CASE(x))

/* FIXME: UTF8 */
/* Lookup table indexed by byte value: non-zero for characters that are
 * considered part of a word by \< and \>.
 */
static unsigned char Word_Chars[256];
#define IS_WORD_CHAR(x) (Word_Chars[(unsigned int) (x)])
81
82 #if 0
83 static int ctx->open_paren_number;
84 static char Closed_Paren_Matches[10];
85
86 static SLRegexp_Type *This_Reg;
87 static unsigned char *This_Str;
88 #endif
89
90 typedef struct
91 {
92 SLRegexp_Type *reg;
93 SLCONST unsigned char *str;
94 SLstrlen_Type len;
95 char closed_paren_matches[10];
96 int open_paren_number;
97 }
98 Re_Context_Type;
99
do_nth_match(Re_Context_Type * ctx,int idx,SLCONST unsigned char * str,SLCONST unsigned char * estr)100 static SLCONST unsigned char *do_nth_match (Re_Context_Type *ctx, int idx, SLCONST unsigned char *str, SLCONST unsigned char *estr)
101 {
102 SLCONST unsigned char *bpos;
103 size_t m;
104
105 if (ctx->closed_paren_matches[idx] == 0)
106 return NULL;
107
108 bpos = ctx->reg->beg_matches[idx] + ctx->str;
109 m = ctx->reg->end_matches[idx];
110 if (m == 0) return(str);
111 if (str + m > estr) return (NULL);
112
113 /* This needs fixed for case in-sensitive match */
114 if (0 != strncmp((char *) str, (char *) bpos, m)) return (NULL);
115 str += m;
116 return (str);
117 }
118
119 /* returns pointer to the end of regexp or NULL */
regexp_looking_at(Re_Context_Type * ctx,SLCONST unsigned char * str,SLCONST unsigned char * estr,unsigned char * regexp,int cs)120 static SLCONST unsigned char *regexp_looking_at (Re_Context_Type *ctx,
121 SLCONST unsigned char *str, SLCONST unsigned char *estr,
122 unsigned char *regexp,
123 int cs)
124 {
125 register unsigned char p, p1;
126 SLCONST unsigned char *save_str, *tmpstr;
127 int n, n0, n1;
128 int save_num_open;
129 char save_closed_matches[10];
130
131 p = *regexp++;
132
133 while (p != 0)
134 {
135 /* p1 = UPPERCASE(*regexp); */
136 /* if (str < estr) c = UPPERCASE(*str); */
137
138 switch((unsigned char) p)
139 {
140 case BOW:
141 if ((str != ctx->str)
142 && ((str >= estr)
143 || IS_WORD_CHAR(*(str - 1))
144 || (0 == IS_WORD_CHAR(*str)))) return NULL;
145 break;
146
147 case EOW:
148 if ((str < estr)
149 && IS_WORD_CHAR (*str)) return NULL;
150 break;
151
152 case YES_CASE: cs = 1; break;
153 case NO_CASE: cs = 0; break;
154
155 case OPAREN:
156 ctx->open_paren_number++;
157 ctx->reg->beg_matches[ctx->open_paren_number] = (str - ctx->str);
158 break;
159 case CPAREN:
160 n = ctx->open_paren_number;
161 while (n > 0)
162 {
163 if (ctx->closed_paren_matches[n] != 0)
164 {
165 n--;
166 continue;
167 }
168 ctx->closed_paren_matches[n] = 1;
169 ctx->reg->end_matches[n] = (str - (ctx->str + ctx->reg->beg_matches[n]));
170 break;
171 }
172 break;
173 #ifdef NOT_LITERAL
174 case NOT_LITERAL:
175 if ((str >= estr) || (*regexp == UPPERCASE(*str))) return (NULL);
176 str++; regexp++;
177 break;
178
179 case MAYBE_ONCE | NOT_LITERAL:
180 save_str = str;
181 if ((str < estr) && (*regexp != UPPERCASE(*str))) str++;
182 regexp++;
183 goto match_rest;
184
185 case NOT_LITERAL | LEAST_ONCE: /* match at least once */
186 if ((str >= estr) || (UPPERCASE(*str) == UPPERCASE(*regexp))) return (NULL);
187 str++;
188 /* drop */
189 case STAR | NOT_LITERAL:
190 save_str = str; p1 = *regexp;
191 while ((str < estr) && (UPPERCASE(*str) != p1)) str++;
192 regexp++;
193 goto match_rest;
194
195 /* this type consists of the expression + two bytes that
196 determine number of matches to perform */
197 case MANY | NOT_LITERAL:
198 p1 = *regexp; regexp++;
199 n = n0 = (int) (unsigned char) *regexp++;
200 /* minimum number to match--- could be 0 */
201 n1 = (int) (unsigned char) *regexp++;
202 /* maximum number to match */
203
204 while (n && (str < estr) && (p1 != UPPERCASE(*str)))
205 {
206 n--;
207 str++;
208 }
209 if (n) return (NULL);
210
211 save_str = str;
212 n = n1 - n0;
213 while (n && (str < estr) && (p1 != UPPERCASE(*str)))
214 {
215 n--;
216 str++;
217 }
218 goto match_rest;
219 #endif /* NOT_LITERAL */
220 case LITERAL:
221 if ((str >= estr) || (*regexp != UPPERCASE(*str))) return (NULL);
222 str++; regexp++;
223 break;
224
225 case MAYBE_ONCE | LITERAL:
226 save_str = str;
227 if ((str < estr) && (*regexp == UPPERCASE(*str))) str++;
228 regexp++;
229 goto match_rest;
230
231 case LITERAL | LEAST_ONCE: /* match at least once */
232 if ((str >= estr) || (UPPERCASE(*str) != UPPERCASE(*regexp))) return (NULL);
233 str++;
234 /* drop */
235 case STAR | LITERAL:
236 save_str = str; p1 = *regexp;
237 while ((str < estr) && (UPPERCASE(*str) == p1)) str++;
238 regexp++;
239 goto match_rest;
240
241 /* this type consists of the expression + two bytes that
242 determine number of matches to perform */
243 case MANY | LITERAL:
244 p1 = *regexp; regexp++;
245 n = n0 = (int) (unsigned char) *regexp++;
246 /* minimum number to match--- could be 0 */
247 n1 = (int) (unsigned char) *regexp++;
248 /* maximum number to match */
249
250 while (n && (str < estr) && (p1 == UPPERCASE(*str)))
251 {
252 n--;
253 str++;
254 }
255 if (n) return (NULL);
256
257 save_str = str;
258 n = n1 - n0;
259 while (n && (str < estr) && (p1 == UPPERCASE(*str)))
260 {
261 n--;
262 str++;
263 }
264 goto match_rest;
265
266 case NTH_MATCH:
267 if ((str = do_nth_match(ctx, (int) (unsigned char) *regexp, str, estr)) == NULL) return(NULL);
268 regexp++;
269 break;
270
271 case MAYBE_ONCE | NTH_MATCH:
272 save_str = str;
273 tmpstr = do_nth_match (ctx, (int) (unsigned char) *regexp, str, estr);
274 if (tmpstr != NULL)
275 str = tmpstr;
276 regexp++;
277 goto match_rest;
278
279 case LEAST_ONCE | NTH_MATCH:
280 if ((str = do_nth_match(ctx, (int) (unsigned char) *regexp, str, estr)) == NULL) return(NULL);
281 /* drop */
282 case STAR | NTH_MATCH:
283 save_str = str;
284 while (NULL != (tmpstr = do_nth_match(ctx, (int) (unsigned char) *regexp, str, estr)))
285 {
286 str = tmpstr;
287 }
288 regexp++;
289 goto match_rest;
290
291 case MANY | NTH_MATCH:
292 /* minimum number to match--- could be 0 */
293 n = n0 = (int) (unsigned char) regexp[1];
294 /* maximum number to match */
295 n1 = (int) (unsigned char) regexp[2];
296
297 while (n && (str < estr)
298 && (NULL != (tmpstr = do_nth_match(ctx, (int) (unsigned char) *regexp, str, estr))))
299 {
300 n--;
301 str = tmpstr;
302 }
303 if (n) return (NULL);
304
305 save_str = str;
306 n = n1 - n0;
307 while (n && (str < estr)
308 && (NULL != (tmpstr = do_nth_match(ctx, (int) (unsigned char) *regexp, str, estr))))
309 {
310 n--;
311 str = tmpstr;
312 }
313 regexp += 3;
314 goto match_rest;
315
316 case RANGE:
317 if (str >= estr) return (NULL);
318 if (TEST_BIT(regexp, UPPERCASE(*str)) == 0) return (NULL);
319 regexp += 32; str++;
320 break;
321
322 case MAYBE_ONCE | RANGE:
323 save_str = str;
324 if ((str < estr) && TEST_BIT(regexp, UPPERCASE(*str))) str++;
325 regexp += 32;
326 goto match_rest;
327
328 case LEAST_ONCE | RANGE:
329 if ((str >= estr) || (0 == TEST_BIT(regexp, UPPERCASE(*str)))) return NULL;
330 str++;
331 /* drop */
332 case STAR | RANGE:
333 save_str = str;
334 while ((str < estr) && TEST_BIT(regexp, UPPERCASE(*str))) str++;
335 regexp += 32;
336 goto match_rest;
337
338 /* The first 32 bytes correspond to the range and the two
339 * following bytes indicate the min and max number of matches.
340 */
341 case MANY | RANGE:
342 /* minimum number to match--- could be 0 */
343 n = n0 = (int) (unsigned char) *(regexp + 32);
344 /* maximum number to match */
345 n1 = (int) (unsigned char) *(regexp + 33);
346
347 while (n && (str < estr) && (TEST_BIT(regexp, UPPERCASE(*str))))
348 {
349 n--;
350 str++;
351 }
352 if (n) return (NULL);
353 save_str = str;
354 n = n1 - n0;
355 while (n && (str < estr) && (TEST_BIT(regexp, UPPERCASE(*str))))
356 {
357 n--;
358 str++;
359 }
360 regexp += 34; /* 32 + 2 */
361 goto match_rest;
362
363 case ANY_DIGIT:
364 if ((str >= estr) || (*str > '9') || (*str < '0')) return (NULL);
365 str++;
366 break;
367
368 case MAYBE_ONCE | ANY_DIGIT:
369 save_str = str;
370 if ((str < estr) && ((*str <= '9') && (*str >= '0'))) str++;
371 goto match_rest;
372
373 case LEAST_ONCE | ANY_DIGIT:
374 if ((str >= estr) || ((*str > '9') || (*str < '0'))) return NULL;
375 str++;
376 /* drop */
377 case STAR | ANY_DIGIT:
378 save_str = str;
379 while ((str < estr) && ((*str <= '9') && (*str >= '0'))) str++;
380 goto match_rest;
381
382 case MANY | ANY_DIGIT:
383 /* minimum number to match--- could be 0 */
384 n = n0 = (int) (unsigned char) *regexp++;
385 /* maximum number to match */
386 n1 = (int) (unsigned char) *regexp++;
387
388 while (n && (str < estr) && (*str <= '9') && (*str >= '0'))
389 {
390 n--;
391 str++;
392 }
393 if (n) return (NULL);
394 save_str = str;
395 n = n1 - n0;
396 while (n && (str < estr) && (*str <= '9') && (*str >= '0'))
397 {
398 n--;
399 str++;
400 }
401 goto match_rest;
402
403 case ANY: /* . */
404 /* FIXME: UTF8 */
405 if ((str >= estr) || (*str == '\n')) return (NULL);
406 str++;
407 break;
408
409 case MAYBE_ONCE | ANY: /* .? */
410 /* FIXME: UTF8 */
411 save_str = str;
412 if ((str < estr) && (*str != '\n')) str++;
413 goto match_rest;
414
415 case LEAST_ONCE | ANY: /* .+ */
416 /* FIXME: UTF8 */
417 if ((str >= estr) || (*str == '\n')) return (NULL);
418 str++;
419 /* drop */
420 case STAR | ANY: /* .* */
421 /* FIXME: UTF8 */
422 save_str = str;
423 while ((str < estr) && (*str != '\n')) str++;
424 goto match_rest;
425
426 case MANY | ANY:
427 /* minimum number to match--- could be 0 */
428 n = n0 = (int) (unsigned char) *regexp++;
429 /* maximum number to match */
430 n1 = (int) (unsigned char) *regexp++;
431
432 while (n && (str < estr) && (*str != '\n'))
433 {
434 n--;
435 str++;
436 }
437 if (n) return (NULL);
438 save_str = str;
439 n = n1 - n0;
440 while (n && (str < estr) && (*str != '\n'))
441 {
442 n--;
443 str++;
444 }
445 goto match_rest;
446
447 case EOL:
448 if (str >= estr)
449 return str;
450 if ((*str == '\n') && (str+1 == estr))
451 return str;
452 return(NULL);
453
454 default: return (NULL);
455 }
456 p = *regexp++;
457 continue;
458
459 match_rest:
460 if (save_str == str)
461 {
462 p = *regexp++;
463 continue;
464 }
465
466 /* if (p == EOL)
467 * {
468 * if (str < estr) return (NULL); else return (str);
469 * }
470 */
471
472 SLMEMCPY(save_closed_matches, ctx->closed_paren_matches, sizeof(save_closed_matches));
473 save_num_open = ctx->open_paren_number;
474 while (str >= save_str)
475 {
476 tmpstr = regexp_looking_at (ctx, str, estr, regexp, cs);
477 if (tmpstr != NULL) return(tmpstr);
478 SLMEMCPY(ctx->closed_paren_matches, save_closed_matches, sizeof(ctx->closed_paren_matches));
479 ctx->open_paren_number = save_num_open;
480 str--;
481 }
482 return NULL;
483 }
484 if ((p != 0) && (p != EOL)) return (NULL); else return (str);
485 }
486
487 static void
fixup_beg_end_matches(Re_Context_Type * ctx,SLRegexp_Type * r,SLCONST unsigned char * str,SLCONST unsigned char * epos)488 fixup_beg_end_matches (Re_Context_Type *ctx, SLRegexp_Type *r,
489 SLCONST unsigned char *str, SLCONST unsigned char *epos)
490 {
491 int i;
492
493 if (str == NULL)
494 {
495 r->beg_matches[0] = -1;
496 r->end_matches[0] = 0;
497 SLMEMSET(ctx->closed_paren_matches, 0, sizeof(ctx->closed_paren_matches));
498 }
499 else
500 {
501 r->beg_matches[0] = (str - ctx->str);
502 r->end_matches[0] = (epos - str);
503 }
504
505 for (i = 1; i < 10; i++)
506 {
507 if (ctx->closed_paren_matches [i] == 0)
508 {
509 r->beg_matches[i] = -1;
510 r->end_matches[i] = 0;
511 }
512 }
513 }
514
/* Prepare ctx for a new search of the len bytes at str using reg.
 * All other fields (group counter and closed-group flags) start at zero.
 */
static void init_re_context (Re_Context_Type *ctx, SLRegexp_Type *reg,
                             SLCONST unsigned char *str, SLstrlen_Type len)
{
   memset ((char *) ctx, 0, sizeof (*ctx));
   ctx->len = len;
   ctx->str = str;
   ctx->reg = reg;
}
523
regexp_match(SLCONST unsigned char * str,SLstrlen_Type len,SLRegexp_Type * reg)524 static SLCONST unsigned char *regexp_match(SLCONST unsigned char *str,
525 SLstrlen_Type len, SLRegexp_Type *reg)
526 {
527 unsigned char c = 0;
528 SLCONST unsigned char *estr = str + len;
529 int cs = reg->case_sensitive, lit = 0;
530 unsigned char *buf = reg->buf;
531 SLCONST unsigned char *epos = NULL;
532 Re_Context_Type ctx_buf;
533
534 if (reg->min_length > len) return NULL;
535
536 init_re_context (&ctx_buf, reg, str, len);
537
538 if (*buf == BOL)
539 {
540 if (NULL == (epos = regexp_looking_at (&ctx_buf, str, estr, buf + 1, cs)))
541 str = NULL;
542
543 fixup_beg_end_matches (&ctx_buf, reg, str, epos);
544 return str;
545 }
546
547 if (*buf == NO_CASE)
548 {
549 buf++; cs = 0;
550 }
551
552 if (*buf == YES_CASE)
553 {
554 buf++; cs = 1;
555 }
556
557 if (*buf == LITERAL)
558 {
559 lit = 1;
560 c = *(buf + 1);
561 }
562 else if ((*buf == OPAREN) && (*(buf + 1) == LITERAL))
563 {
564 lit = 1;
565 c = *(buf + 2);
566 }
567
568 while (1)
569 {
570 ctx_buf.open_paren_number = 0;
571 memset (ctx_buf.closed_paren_matches, 0, sizeof(ctx_buf.closed_paren_matches));
572 /* take care of leading chars */
573 if (lit)
574 {
575 while ((str < estr) && (c != UPPERCASE(*str))) str++;
576 if (str >= estr)
577 break; /* failed */
578 }
579
580 if (NULL != (epos = regexp_looking_at(&ctx_buf, str, estr, buf, cs)))
581 {
582 fixup_beg_end_matches (&ctx_buf, reg, str, epos);
583 return str;
584 }
585 if (str >= estr)
586 break;
587 str++;
588 }
589 fixup_beg_end_matches (&ctx_buf, reg, NULL, epos);
590 return NULL;
591 }
592
/* Public entry point: search the len bytes at str for a match of reg.
 *
 * Returns a pointer to the start of the match within str, or NULL if there
 * is no match.  A NULL reg or str is treated as "no match", in line with
 * the NULL handling of SLregexp_free and SLregexp_get_hints.
 */
char *SLregexp_match (SLRegexp_Type *reg, SLFUTURE_CONST char *str, SLstrlen_Type len)
{
   if ((reg == NULL) || (str == NULL))
     return NULL;

   return (char *) regexp_match ((SLCONST unsigned char *)str, len, reg);
}
597
/* Parse a run of decimal digits starting at pat.
 *
 * The value is stored in *nn; it is 0 when pat does not start with a digit.
 * Values too large for an int saturate at INT_MAX instead of overflowing.
 *
 * Returns a pointer to the first character after the digits (never NULL).
 */
static unsigned char *convert_digit(unsigned char *pat, int *nn)
{
   int n = 0;

   while ((*pat >= '0') && (*pat <= '9'))
     {
        int digit = *pat - '0';

        /* 10 * n + digit would exceed INT_MAX: clamp.  Once clamped, the
         * value stays at INT_MAX for the remaining digits.
         */
        if (n > (INT_MAX - digit) / 10)
          n = INT_MAX;
        else
          n = 10 * n + digit;

        pat++;
     }
   *nn = n;
   return pat;
}
610
611 #define ERROR return (int) (pat - reg->pat)
612
613 /* Returns 0 if successful or offset in pattern of error */
regexp_compile(SLRegexp_Type * reg)614 static int regexp_compile (SLRegexp_Type *reg)
615 {
616 register unsigned char *buf, *ebuf, *pat;
617 unsigned char *last = NULL, *tmppat;
618 register unsigned char c;
619 int i, reverse = 0, n, cs;
620 int oparen = 0, nparen = 0;
621 /* substring stuff */
622 int count, last_count, this_max_mm = 0, max_mm = 0, ordinary_search,
623 no_osearch = 0, min_length = 0;
624 unsigned char *mm_p = NULL, *this_mm_p = NULL;
625 static int already_initialized;
626
627 reg->beg_matches[0] = reg->end_matches[0] = 0;
628 buf = reg->buf;
629 ebuf = (reg->buf + reg->buf_len) - 2; /* make some room */
630 pat = reg->pat;
631 cs = reg->case_sensitive;
632
633 if (already_initialized == 0)
634 {
635 SLang_init_case_tables ();
636 #ifdef IBMPC_SYSTEM
637 SLmake_lut (Word_Chars, (unsigned char *) "_0-9a-zA-Z\200-\232\240-\245\341-\353", 0);
638 #else
639 SLmake_lut (Word_Chars, (unsigned char *) "_0-9a-zA-Z\277-\326\330-\336\340-\366\370-\376", 0);
640 #endif
641 already_initialized = 1;
642 }
643
644 i = 1; while (i < 10)
645 {
646 reg->beg_matches[i] = -1;
647 reg->end_matches[i] = 0;
648 i++;
649 }
650
651 if (*pat == '\\')
652 {
653 if (pat[1] == 'c')
654 {
655 cs = 1;
656 pat += 2;
657 no_osearch = 1;
658 }
659 else if (pat[1] == 'C')
660 {
661 cs = 0;
662 pat += 2;
663 no_osearch = 1;
664 }
665 }
666
667 if (*pat == '^')
668 {
669 pat++;
670 *buf++ = BOL;
671 reg->must_match_bol = 1;
672 }
673 else reg->must_match_bol = 0;
674
675 if (cs != reg->case_sensitive)
676 {
677 if (cs) *buf++ = YES_CASE;
678 else *buf++ = NO_CASE;
679 }
680
681 *buf = 0;
682
683 last_count = count = 0;
684 while ((c = *pat++) != 0)
685 {
686 if (buf >= ebuf - 3)
687 {
688 _pSLang_verror (SL_BUILTIN_LIMIT_EXCEEDED, "Pattern too large to be compiled.");
689 ERROR;
690 }
691
692 count++;
693 switch (c)
694 {
695 case '$':
696 if (*pat != 0) goto literal_char;
697 *buf++ = EOL;
698 break;
699
700 case '\\':
701 c = *pat++;
702 no_osearch = 1;
703 switch(c)
704 {
705 case 'e': c = 033; goto literal_char;
706 case 'n': c = '\n'; goto literal_char;
707 case 't': c = '\t'; goto literal_char;
708 case 'C': cs = 0; *buf++ = NO_CASE; break;
709 case 'c': cs = 1; *buf++ = YES_CASE; break;
710 case '1': case '2': case '3': case '4': case '5':
711 case '6': case '7': case '8': case '9':
712 c = c - '0';
713 if ((int) c > nparen) ERROR;
714 last = buf;
715 *buf++ = NTH_MATCH; *buf++ = c;
716 break;
717 #ifdef NOT_LITERAL
718 case '~': /* slang extension */
719 if ((c = *pat) == 0) ERROR;
720 pat++;
721 last = buf;
722 *buf++ = NOT_LITERAL;
723 *buf++ = c;
724 min_length++;
725 break;
726 #endif
727 case 'd': /* slang extension */
728 last = buf;
729 *buf++ = ANY_DIGIT;
730 min_length++;
731 break;
732
733 case '<':
734 last = NULL;
735 *buf++ = BOW;
736 break;
737
738 case '>':
739 last = NULL;
740 *buf++ = EOW;
741 break;
742
743 case '{':
744 if (last == NULL) goto literal_char;
745 *last |= MANY;
746 tmppat = convert_digit(pat, &n);
747 if (tmppat == NULL) ERROR;
748 pat = tmppat;
749 *buf++ = n;
750
751 min_length += (n - 1);
752
753 if (*pat == '\\')
754 {
755 *buf++ = n;
756 }
757 else if (*pat == ',')
758 {
759 pat++;
760 if (*pat == '\\')
761 {
762 n = 255;
763 }
764 else
765 {
766 tmppat = convert_digit(pat, &n);
767 if (tmppat == NULL) ERROR;
768 pat = tmppat;
769 if (*pat != '\\') ERROR;
770 }
771 *buf++ = n;
772 }
773 else ERROR;
774 last = NULL;
775 pat++;
776 if (*pat != '}') ERROR;
777 pat++;
778 break; /* case '{' */
779
780 case '(':
781 oparen++;
782 if (oparen > 9) ERROR;
783 *buf++ = OPAREN;
784 break;
785 case ')':
786 if (oparen == 0) ERROR;
787 oparen--;
788 nparen++;
789 *buf++ = CPAREN;
790 break;
791
792 case 0: ERROR;
793 default:
794 goto literal_char;
795 }
796 break;
797
798 case '[':
799
800 *buf = RANGE;
801 last = buf++;
802
803 if (buf + 32 >= ebuf) ERROR;
804
805 for (i = 0; i < 32; i++) buf[i] = 0;
806 c = *pat++;
807 if (c == '^')
808 {
809 reverse = 1;
810 SET_BIT(buf, '\n');
811 c = *pat++;
812 }
813
814 if (c == ']')
815 {
816 SET_BIT(buf, c);
817 c = *pat++;
818 }
819 while (c && (c != ']'))
820 {
821 if (c == '\\')
822 {
823 c = *pat++;
824 switch(c)
825 {
826 case 'n': c = '\n'; break;
827 case 't': c = '\t'; break;
828 case 0: ERROR;
829 }
830 }
831
832 if (*pat == '-')
833 {
834 pat++;
835 while (c < *pat)
836 {
837 if (cs == 0)
838 {
839 SET_BIT(buf, UPPERCASE(c));
840 SET_BIT(buf, LOWERCASE(c));
841 }
842 else SET_BIT(buf, c);
843 c++;
844 }
845 }
846 if (cs == 0)
847 {
848 SET_BIT(buf, UPPERCASE(c));
849 SET_BIT(buf, LOWERCASE(c));
850 }
851 else SET_BIT(buf, c);
852 c = *pat++;
853 }
854 if (c != ']') ERROR;
855 if (reverse) for (i = 0; i < 32; i++) buf[i] = buf[i] ^ 0xFF;
856 reverse = 0;
857 buf += 32;
858 min_length++;
859 break;
860
861 case '.':
862 last = buf;
863 *buf++ = ANY;
864 min_length++;
865 break;
866
867 case '*':
868 if (last == NULL) goto literal_char;
869 *last |= STAR;
870 min_length--;
871 last = NULL;
872 break;
873
874 case '+':
875 if (last == NULL) goto literal_char;
876 *last |= LEAST_ONCE;
877 last = NULL;
878 break;
879
880 case '?':
881 if (last == NULL) goto literal_char;
882 *last |= MAYBE_ONCE;
883 last = NULL;
884 min_length--;
885 break;
886
887 literal_char:
888 default:
889 /* This is to keep track of longest substring */
890 min_length++;
891 this_max_mm++;
892 if (last_count + 1 == count)
893 {
894 if (this_max_mm == 1)
895 {
896 this_mm_p = buf;
897 }
898 else if (max_mm < this_max_mm)
899 {
900 mm_p = this_mm_p;
901 max_mm = this_max_mm;
902 }
903 }
904 else
905 {
906 this_mm_p = buf;
907 this_max_mm = 1;
908 }
909
910 last_count = count;
911
912 last = buf;
913 *buf++ = LITERAL;
914 *buf++ = UPPERCASE(c);
915 }
916 }
917 *buf = 0;
918 /* Check for ordinary search */
919 ebuf = buf;
920 buf = reg->buf;
921
922 if (no_osearch) ordinary_search = 0;
923 else
924 {
925 ordinary_search = 1;
926 while (buf < ebuf)
927 {
928 if (*buf != LITERAL)
929 {
930 ordinary_search = 0;
931 break;
932 }
933 buf += 2;
934 }
935 }
936
937 reg->osearch = ordinary_search;
938 reg->must_match_str[15] = 0;
939 reg->min_length = (min_length > 0) ? (unsigned int) min_length : 0;
940 if (ordinary_search)
941 {
942 strncpy((char *) reg->must_match_str, (char *) reg->pat, 15);
943 reg->must_match = 1;
944 return(0);
945 }
946 /* check for longest substring of pattern */
947 reg->must_match = 0;
948 if ((mm_p == NULL) && (this_mm_p != NULL)) mm_p = this_mm_p;
949 if (mm_p == NULL)
950 {
951 return (0);
952 }
953 n = 15;
954 pat = reg->must_match_str;
955 buf = mm_p;
956 while (n--)
957 {
958 if (*buf++ != LITERAL) break;
959 *pat++ = *buf++;
960 }
961 *pat = 0;
962 if (pat != reg->must_match_str) reg->must_match = 1;
963 return(0);
964 }
965
/* Release a regular expression object together with its compiled-program
 * buffer.  Passing NULL is allowed and does nothing.
 */
void SLregexp_free (SLRegexp_Type *reg)
{
   if (reg != NULL)
     {
        unsigned char *compiled = reg->buf;

        if (compiled != NULL)
          SLfree ((char *) compiled);

        SLfree ((char *) reg);
     }
}
974
SLregexp_compile(SLFUTURE_CONST char * pattern,unsigned int flags)975 SLRegexp_Type *SLregexp_compile (SLFUTURE_CONST char *pattern, unsigned int flags)
976 {
977 SLRegexp_Type *reg;
978 int ret;
979
980 reg = (SLRegexp_Type *)SLcalloc (1, sizeof (SLRegexp_Type));
981 if (reg == NULL)
982 return NULL;
983
984 if (NULL == (reg->buf = (unsigned char *)SLmalloc (1024)))
985 {
986 SLfree ((char *) reg);
987 return NULL;
988 }
989 reg->buf_len = 1024;
990 reg->case_sensitive = (0 == (flags & SLREGEXP_CASELESS));
991 reg->pat = (unsigned char *)pattern;
992
993 if (0 != (ret = regexp_compile (reg)))
994 {
995 SLang_verror (SL_Parse_Error, "Error compiling RE '%s' at byte offset %d",
996 pattern, ret);
997 SLregexp_free (reg);
998 return NULL;
999 }
1000
1001 return reg;
1002 }
1003
/* Copy re into buf, putting a backslash in front of each of the characters
 * $ \ [ ] . ^ * + ? so that the result matches re literally.
 *
 * buflen is the size of buf in bytes, including room for the terminating
 * NUL.  Returns buf on success, or NULL if re is NULL or the quoted string
 * does not fit.
 */
char *SLregexp_quote_string (SLFUTURE_CONST char *re, char *buf, unsigned int buflen)
{
   char *out, *out_end;

   if (re == NULL)
     return NULL;

   out = buf;
   out_end = buf + buflen;

   for ( ; out < out_end; re++)
     {
        char ch = *re;
        int special;

        if (ch == 0)
          {
             *out = 0;
             return buf;
          }

        special = ((ch == '$') || (ch == '\\')
                   || (ch == '[') || (ch == ']')
                   || (ch == '.') || (ch == '^')
                   || (ch == '*') || (ch == '+') || (ch == '?'));

        if (special)
          {
             *out++ = '\\';
             /* No room left for the character itself. */
             if (out == out_end)
               break;
          }

        *out++ = ch;
     }

   /* Ran out of space before reaching the end of re. */
   return NULL;
}
1041
/* Look up sub-match number nth (0 is the whole match) recorded in reg.
 *
 * If the sub-match exists, its offset and length are stored through ofsp
 * and lenp (either may be NULL) and 0 is returned.  Returns -1 if the
 * sub-match did not take part in the match, or if nth is 10 or more, in
 * which case SL_InvalidParm_Error is set as well.
 */
int SLregexp_nth_match (SLRegexp_Type *reg, unsigned int nth,
                        SLstrlen_Type *ofsp, SLstrlen_Type *lenp)
{
   if (nth < 10)
     {
        if (reg->beg_matches[nth] < 0)
          return -1;

        if (NULL != ofsp)
          *ofsp = reg->beg_matches[nth];

        if (NULL != lenp)
          *lenp = reg->end_matches[nth];

        return 0;
     }

   SLang_set_error (SL_InvalidParm_Error);
   return -1;
}
1060
/* Report what the compiler learned about the pattern.
 *
 * *hintsp receives a combination of SLREGEXP_HINT_OSEARCH (an ordinary
 * string search suffices) and SLREGEXP_HINT_BOL (the pattern is anchored
 * at the beginning of the line).  Returns 0, or -1 if reg is NULL.
 */
int SLregexp_get_hints (SLRegexp_Type *reg, unsigned int *hintsp)
{
   unsigned int flags;

   if (NULL == reg)
     return -1;

   flags = 0;
   if (reg->must_match_bol)
     flags |= SLREGEXP_HINT_BOL;
   if (reg->osearch)
     flags |= SLREGEXP_HINT_OSEARCH;

   *hintsp = flags;
   return 0;
}
1074
#if 0
/* Disabled stand-alone test driver: prints every line of a file that
 * matches a regular expression.
 *
 *    usage: prog REGEXP FILE
 */
#define MAX_EXP 4096
int main(int argc, char **argv)
{
   FILE *fp;
   char *regexp, *file;
   unsigned char expbuf[MAX_EXP];
   char buf[512];
   SLRegexp_Type reg;

   if (argc != 3)
     {
        fprintf(stderr, "Usage: %s regexp file\n", argv[0]);
        return(1);
     }

   regexp = argv[1];
   file = argv[2];

   if (NULL == (fp = fopen(file, "r")))
     {
        fprintf(stderr, "File not open\n");
        return(1);
     }

   memset ((char *) &reg, 0, sizeof (reg));
   reg.buf = expbuf;
   reg.buf_len = MAX_EXP;
   reg.pat = (unsigned char *) regexp;
   reg.case_sensitive = 1;

   if (0 == regexp_compile(&reg)) while (NULL != fgets(buf, sizeof (buf), fp))
     {
        if (reg.osearch)
          {
             /* plain string: no need to run the matcher */
             if (NULL == strstr(buf, (char *) reg.pat)) continue;
          }
        else
          {
             /* cheap rejection using the required substring, if any */
             if (reg.must_match && (NULL == strstr(buf, (char *) reg.must_match_str))) continue;
             if (NULL == regexp_match((unsigned char *) buf, strlen(buf), &reg)) continue;
          }

        fputs(buf, stdout);
     }

   fclose (fp);
   return (0);
}
#endif
1115