1 /* ed style regular expressions */
2 /* Copyright (c) 1992, 1999, 2001, 2002 John E. Davis
3 *
4 * You may distribute under the terms of either the GNU General Public
5 * License or the Perl Artistic License.
6 */
7
#include "slinclud.h"

#include <limits.h>

#include "slang.h"
#include "_slang.h"
#if SLANG_HAS_KANJI_SUPPORT
#include "slkanji.h"
#endif
15
/* Bitmap helpers for the 32-byte (256-bit) table that follows a RANGE
 * opcode: bit n is set when byte value n belongs to the class.  Arguments
 * and the whole expansion are parenthesized so the macros behave like
 * ordinary expressions wherever they are used. */
#define SET_BIT(b, n) \
   ((b)[(unsigned int) (n) >> 3] |= (unsigned char) (1u << ((unsigned int) (n) % 8)))
#define TEST_BIT(b, n) \
   ((b)[(unsigned int) (n) >> 3] & (1u << ((unsigned int) (n) % 8)))

/* Base opcodes of the compiled pattern (low bits of the opcode byte). */
#define LITERAL      1
#define RANGE        2      /* [...] */
#define ANY          3      /* . */
#define BOL          4      /* ^ */
#define EOL          5      /* $ */
#define NTH_MATCH    6      /* \1 \2 ... \9 */
#define OPAREN       7      /* \( */
#define CPAREN       0x8    /* \) */
#define ANY_DIGIT    0x9    /* \d */
#define BOW          0xA    /* \< */
#define EOW          0xB    /* \> */
#if 0
#define NOT_LITERAL  0xC    /* \~ */
#endif

/* Repetition flags OR-ed into a base opcode. */
#define STAR         0x80   /* * */
#define LEAST_ONCE   0x40   /* + */
#define MAYBE_ONCE   0x20   /* ? */
#define MANY         0x10   /* {n,m} */

/* The rest are additions: case-sensitivity switches.  They reuse the
 * STAR|BOL and STAR|EOL bit patterns as distinct opcode values. */
#define YES_CASE (STAR | BOL)
#define NO_CASE  (STAR | EOL)

/* Case folding.  Both macros read a variable named `cs' from the scope in
 * which they are expanded: when it is non-zero the character is returned
 * unchanged. */
#define UPPERCASE(x) (cs ? (x) : UPPER_CASE(x))
#define LOWERCASE(x) (cs ? (x) : LOWER_CASE(x))

/* Lookup table indexed by byte value; non-zero entries are word
 * characters for the \< and \> anchors. */
static unsigned char Word_Chars[256];
#define IS_WORD_CHAR(x) (Word_Chars[(unsigned int) (x)])
45
46 #if 0
47 static int ctx->open_paren_number;
48 static char Closed_Paren_Matches[10];
49
50 static SLRegexp_Type *This_Reg;
51 static unsigned char *This_Str;
52 #endif
53
54 typedef struct
55 {
56 SLRegexp_Type *reg;
57 unsigned char *str;
58 unsigned int len;
59 char closed_paren_matches[10];
60 int open_paren_number;
61 }
62 Re_Context_Type;
63
/* Match the back-reference \n at position str.
 *
 * ctx   matcher state; supplies the recorded offset/length of group n
 * n     group number taken from the compiled pattern
 * str   current position in the text
 * estr  one past the last byte of the text
 *
 * Returns the position just after the repeated group text, str itself
 * when the group matched the empty string, or NULL when group n has not
 * been closed yet or the text at str differs.
 */
static unsigned char *do_nth_match (Re_Context_Type *ctx, int n, unsigned char *str, unsigned char *estr)
{
   unsigned char *group;
   int group_len;

   /* closed_paren_matches has 10 entries and slot 0 is never set, so any
    * index outside 1..9 cannot name a closed group. */
   if ((n < 1) || (n > 9))
     return NULL;

   if (ctx->closed_paren_matches[n] == 0)
     return NULL;

   group = ctx->str + ctx->reg->beg_matches[n];
   group_len = (int) ctx->reg->end_matches[n];

   if (group_len == 0)
     return str;
   if (group_len > (int) (estr - str))
     return NULL;

   /* The text is length-delimited and may contain NUL bytes, so compare
    * raw memory; strncmp would stop at the first NUL and report a match.
    * This needs fixed for case sensitive match: the comparison is always
    * exact, whatever the current case mode is. */
   if (0 != memcmp (str, group, (unsigned int) group_len))
     return NULL;

   return str + group_len;
}
81
82 /* returns pointer to the end of regexp or NULL */
regexp_looking_at(Re_Context_Type * ctx,register unsigned char * str,unsigned char * estr,unsigned char * buf,register int cs)83 static unsigned char *regexp_looking_at (Re_Context_Type *ctx, register unsigned char *str, unsigned char *estr, unsigned char *buf, register int cs)
84 {
85 register unsigned char p, p1;
86 unsigned char *save_str, *tmpstr;
87 int n, n0, n1;
88 int save_num_open;
89 char save_closed_matches[10];
90
91 p = *buf++;
92
93 while (p != 0)
94 {
95 /* p1 = UPPERCASE(*buf); */
96 /* if (str < estr) c = UPPERCASE(*str); */
97
98 switch((unsigned char) p)
99 {
100 case BOW:
101 if ((str != ctx->str)
102 && ((str >= estr)
103 || IS_WORD_CHAR(*(str - 1))
104 || (0 == IS_WORD_CHAR(*str)))) return NULL;
105 break;
106
107 case EOW:
108 if ((str < estr)
109 && IS_WORD_CHAR (*str)) return NULL;
110 break;
111
112 case YES_CASE: cs = 1; break;
113 case NO_CASE: cs = 0; break;
114
115 case OPAREN:
116 ctx->open_paren_number++;
117 ctx->reg->beg_matches[ctx->open_paren_number] = (int) (str - ctx->str);
118 break;
119 case CPAREN:
120 n = ctx->open_paren_number;
121 while (n > 0)
122 {
123 if (ctx->closed_paren_matches[n] != 0)
124 {
125 n--;
126 continue;
127 }
128 ctx->closed_paren_matches[n] = 1;
129 ctx->reg->end_matches[n] = (unsigned int) (str - (ctx->str + ctx->reg->beg_matches[n]));
130 break;
131 }
132 break;
133 #ifdef NOT_LITERAL
134 case NOT_LITERAL:
135 #if SLANG_HAS_KANJI_SUPPORT
136 if ((str >= estr) || ((!iskanji(*buf) && *buf != UPPERCASE(*str)) ||
137 (iskanji(*buf) && (*buf == *str || *(buf+2) == *(str+1)))))
138 return (NULL);
139 if(iskanji(*str))
140 {
141 str++; buf++; buf++;
142 }
143 #else
144 if ((str >= estr) || (*buf == UPPERCASE(*str))) return (NULL);
145 #endif
146 str++; buf++;
147 break;
148
149 case MAYBE_ONCE | NOT_LITERAL:
150 save_str = str;
151 #if SLANG_HAS_KANJI_SUPPORT
152 if ((str < estr) && ((!iskanji(*str) && *buf == UPPERCASE(*str)) ||
153 (iskanji(*str) && *buf == *str && *(buf+1) == *(str+1))))
154 {
155 if(iskanji(*str)) { str++; buf++; buf++; }
156 str++;
157 }
158 #else
159 if ((str < estr) && (*buf != UPPERCASE(*str))) str++;
160 #endif
161 buf++;
162 goto match_rest;
163
164 case NOT_LITERAL | LEAST_ONCE: /* match at least once */
165 if(str >= estr) return (NULL);
166 #if SLANG_HAS_KANJI_SUPPORT
167 if(iskanji(*str))
168 if((*str == *buf) && (*(str+1) != *(buf+2))) return (NULL);
169 else
170 #endif
171 if ((UPPERCASE(*str) == UPPERCASE(*buf))) return (NULL);
172 #if SLANG_HAS_KANJI_SUPPORT
173 if(iskanji(*str)) str++;
174 #endif
175 str++;
176 /* drop */
177 case STAR | NOT_LITERAL:
178 save_str = str; p1 = *buf;
179 #if SLANG_HAS_KANJI_SUPPORT
180 while ((str < estr) && ((!iskanji(*str) && (UPPERCASE(*str) == p1)) ||
181 (iskanji(*str) && *str == p1 && *(str+1) == *(buf +2))))
182 {
183 if(iskanji(*str)) str++;
184 str++;
185 }
186 if(iskanji(p1)) { buf++; buf++; }
187 #else
188 while ((str < estr) && (UPPERCASE(*str) != p1)) str++;
189 #endif
190 buf++;
191 goto match_rest;
192
193 /* this type consists of the expression + two bytes that
194 determine number of matches to perform */
195 case MANY | NOT_LITERAL:
196 p1 = *buf; buf++;
197 n = n0 = (int) (unsigned char) *buf++;
198 /* minimum number to match--- could be 0 */
199 n1 = (int) (unsigned char) *buf++;
200 /* maximum number to match */
201
202 while (n && (str < estr) && (p1 != *str))
203 {
204 #if SLANG_HAS_KANJI_SUPPORT
205 if(iskanji(*str)) { n--; str++; }
206 #endif
207 n--;
208 str++;
209 }
210 if (n) return (NULL);
211
212 save_str = str;
213 n = n1 - n0;
214 while (n && (str < estr) && (p1 != *str))
215 {
216 #if SLANG_HAS_KANJI_SUPPORT
217 if(iskanji(*str)) { n--; str++; }
218 #endif
219 n--;
220 str++;
221 }
222 goto match_rest;
223 #endif /* NOT_LITERAL */
224 case LITERAL:
225 #if SLANG_HAS_KANJI_SUPPORT
226 if ((str >= estr) || ((!iskanji(*buf) && *buf != UPPERCASE(*str)) ||
227 (iskanji(*buf) && (*buf == *str || *(buf+2) == *(str+1)))))
228 return (NULL);
229 if(iskanji(*str))
230 {
231 str++; buf++; buf++;
232 }
233 #else
234 if ((str >= estr) || (*buf != UPPERCASE(*str))) return (NULL);
235 #endif
236 str++; buf++;
237 break;
238
239 case MAYBE_ONCE | LITERAL:
240 save_str = str;
241 #if SLANG_HAS_KANJI_SUPPORT
242 if ((str < estr) && ((!iskanji(*str) && *buf == UPPERCASE(*str)) ||
243 (iskanji(*str) && *buf == *str && *(buf+1) == *(str+1))))
244 {
245 if(iskanji(*str)) { str++; buf++; buf++; }
246 str++;
247 }
248 #else
249 if ((str < estr) && (*buf == UPPERCASE(*str))) str++;
250 #endif
251 buf++;
252 goto match_rest;
253
254 case LITERAL | LEAST_ONCE: /* match at least once */
255 if ((str >= estr) || (UPPERCASE(*str) != UPPERCASE(*buf))) return (NULL);
256 #if SLANG_HAS_KANJI_SUPPORT
257 if(iskanji(*str) && (*str != *buf || *(str+1) != *(buf+2))) return (NULL);
258 if(iskanji(*str)) str++;
259 #endif
260 str++;
261 /* drop */
262 case STAR | LITERAL:
263 save_str = str; p1 = *buf;
264 #if SLANG_HAS_KANJI_SUPPORT
265 while ((str < estr) && ((!iskanji(*str) && (UPPERCASE(*str) == p1)) ||
266 (iskanji(*str) && *str == p1 && *(str+1) == *(buf +2))))
267 {
268 if(iskanji(*str)) str++;
269 str++;
270 }
271 if(iskanji(p1)) { buf++; buf++; }
272 #else
273 while ((str < estr) && (UPPERCASE(*str) == p1)) str++;
274 #endif
275 buf++;
276 goto match_rest;
277
278 /* this type consists of the expression + two bytes that
279 determine number of matches to perform */
280 case MANY | LITERAL:
281 p1 = *buf; buf++;
282 n = n0 = (int) (unsigned char) *buf++;
283 /* minimum number to match--- could be 0 */
284 n1 = (int) (unsigned char) *buf++;
285 /* maximum number to match */
286
287 while (n && (str < estr) && (p1 == *str))
288 {
289 #if SLANG_HAS_KANJI_SUPPORT
290 if(iskanji(*str)) { n--; str++; }
291 #endif
292 n--;
293 str++;
294 }
295 if (n) return (NULL);
296
297 save_str = str;
298 n = n1 - n0;
299 while (n && (str < estr) && (p1 == *str))
300 {
301 #if SLANG_HAS_KANJI_SUPPORT
302 if(iskanji(*str)) { n--; str++; }
303 #endif
304 n--;
305 str++;
306 }
307 goto match_rest;
308
309 case NTH_MATCH:
310 if ((str = do_nth_match(ctx, (int) (unsigned char) *buf, str, estr)) == NULL) return(NULL);
311 buf++;
312 break;
313
314 case MAYBE_ONCE | NTH_MATCH:
315 save_str = str;
316 tmpstr = do_nth_match (ctx, (int) (unsigned char) *buf, str, estr);
317 buf++;
318 if (tmpstr != NULL)
319 {
320 str = tmpstr;
321 goto match_rest;
322 }
323 continue;
324
325 case LEAST_ONCE | NTH_MATCH:
326 if ((str = do_nth_match(ctx, (int) (unsigned char) *buf, str, estr)) == NULL) return(NULL);
327 /* drop */
328 case STAR | NTH_MATCH:
329 save_str = str;
330 while (NULL != (tmpstr = do_nth_match(ctx, (int) (unsigned char) *buf, str, estr)))
331 {
332 str = tmpstr;
333 }
334 buf++;
335 goto match_rest;
336
337 case MANY | NTH_MATCH: return(NULL);
338 /* needs done */
339
340 case RANGE:
341 if (str >= estr) return (NULL);
342 if (TEST_BIT(buf, UPPERCASE(*str)) == 0) return (NULL);
343 #if SLANG_HAS_KANJI_SUPPORT
344 if(iskanji(*str)) str++;
345 #endif
346 buf += 32; str++;
347 break;
348
349 case MAYBE_ONCE | RANGE:
350 save_str = str;
351 if ((str < estr) && TEST_BIT(buf, UPPERCASE(*str)))
352 {
353 #if SLANG_HAS_KANJI_SUPPORT
354 if(iskanji(*str)) str++;
355 #endif
356 str++;
357 }
358 buf += 32;
359 goto match_rest;
360
361 case LEAST_ONCE | RANGE:
362 if ((str >= estr) || (0 == TEST_BIT(buf, UPPERCASE(*str)))) return NULL;
363 #if SLANG_HAS_KANJI_SUPPORT
364 if(iskanji(*str)) str++;
365 #endif
366 str++;
367 /* drop */
368 case STAR | RANGE:
369 save_str = str;
370 while ((str < estr) && TEST_BIT(buf, UPPERCASE(*str)))
371 {
372 #if SLANG_HAS_KANJI_SUPPORT
373 if(iskanji(*str)) str++;
374 #endif
375 str++;
376 }
377 buf += 32;
378 goto match_rest;
379
380 /* The first 32 bytes correspond to the range and the two
381 * following bytes indicate the min and max number of matches.
382 */
383 case MANY | RANGE:
384 /* minimum number to match--- could be 0 */
385 n = n0 = (int) (unsigned char) *(buf + 32);
386 /* maximum number to match */
387 n1 = (int) (unsigned char) *(buf + 33);
388
389 while (n && (str < estr) && (TEST_BIT(buf, UPPERCASE(*str))))
390 {
391 /* for Kanji */
392 #if SLANG_HAS_KANJI_SUPPORT
393 #endif
394 n--;
395 str++;
396 }
397 if (n) return (NULL);
398 save_str = str;
399 n = n1 - n0;
400 while (n && (str < estr) && (TEST_BIT(buf, UPPERCASE(*str))))
401 {
402 /* for kanji */
403 #if SLANG_HAS_KANJI_SUPPORT
404 #endif
405 n--;
406 str++;
407 }
408 buf += 34; /* 32 + 2 */
409 goto match_rest;
410
411 case ANY_DIGIT:
412 if ((str >= estr) || (*str > '9') || (*str < '0')) return (NULL);
413 str++;
414 break;
415
416 case MAYBE_ONCE | ANY_DIGIT:
417 save_str = str;
418 if ((str < estr) && ((*str > '9') || (*str < '0')))
419 {
420 #if SLANG_HAS_KANJI_SUPPORT
421 if(iskanji(*str)) str++;
422 #endif
423 str++;
424 }
425 goto match_rest;
426
427 case LEAST_ONCE | ANY_DIGIT:
428 if ((str >= estr) || ((*str > '9') || (*str < '0'))) return NULL;
429 str++;
430 /* drop */
431 case STAR | ANY_DIGIT:
432 save_str = str;
433 while ((str < estr) && ((*str <= '9') && (*str >= '0'))) str++;
434 goto match_rest;
435
436 case MANY | ANY_DIGIT:
437 /* needs finished */
438 return (NULL);
439
440 case ANY:
441 if ((str >= estr) || (*str == '\n')) return (NULL);
442 #if SLANG_HAS_KANJI_SUPPORT
443 if(iskanji(*str)) str++;
444 #endif
445 str++;
446 break;
447
448 case MAYBE_ONCE | ANY:
449 save_str = str;
450 if ((str < estr) && (*str != '\n'))
451 {
452 #if SLANG_HAS_KANJI_SUPPORT
453 if(iskanji(*str)) str++;
454 #endif
455 str++;
456 }
457 goto match_rest;
458
459 case LEAST_ONCE | ANY:
460 if ((str >= estr) || (*str == '\n')) return (NULL);
461 #if SLANG_HAS_KANJI_SUPPORT
462 if(iskanji(*str)) str++;
463 #endif
464 str++;
465 /* drop */
466 case STAR | ANY:
467 save_str = str;
468 while ((str < estr) && (*str != '\n'))
469 {
470 #if SLANG_HAS_KANJI_SUPPORT
471 if(iskanji(*str)) str++;
472 #endif
473 str++;
474 }
475 goto match_rest;
476
477 case MANY | ANY:
478 return (NULL);
479 /* needs finished */
480
481 case EOL:
482 if ((str >= estr) || (*str == '\n')) return (str);
483 return(NULL);
484
485 default: return (NULL);
486 }
487 p = *buf++;
488 continue;
489
490 match_rest:
491 if (save_str == str)
492 {
493 p = *buf++;
494 continue;
495 }
496
497 /* if (p == EOL)
498 * {
499 * if (str < estr) return (NULL); else return (str);
500 * }
501 */
502
503 SLMEMCPY(save_closed_matches, ctx->closed_paren_matches, sizeof(save_closed_matches));
504 save_num_open = ctx->open_paren_number;
505 while (str >= save_str)
506 {
507 tmpstr = regexp_looking_at (ctx, str, estr, buf, cs);
508 if (tmpstr != NULL) return(tmpstr);
509 SLMEMCPY(ctx->closed_paren_matches, save_closed_matches, sizeof(ctx->closed_paren_matches));
510 ctx->open_paren_number = save_num_open;
511 str--;
512 #if SLANG_HAS_KANJI_SUPPORT
513 if(iskanji2nd(save_str, str - save_str)) str--;
514 #endif
515 }
516 return NULL;
517 }
518 if ((p != 0) && (p != EOL)) return (NULL); else return (str);
519 }
520
521 static void
fixup_beg_end_matches(Re_Context_Type * ctx,SLRegexp_Type * r,unsigned char * str,unsigned char * epos)522 fixup_beg_end_matches (Re_Context_Type *ctx, SLRegexp_Type *r, unsigned char *str, unsigned char *epos)
523 {
524 int i;
525
526 if (str == NULL)
527 {
528 r->beg_matches[0] = -1;
529 r->end_matches[0] = 0;
530 SLMEMSET(ctx->closed_paren_matches, 0, sizeof(ctx->closed_paren_matches));
531 }
532 else
533 {
534 r->beg_matches[0] = (int) (str - ctx->str);
535 r->end_matches[0] = (unsigned int) (epos - str);
536 }
537
538 for (i = 1; i < 10; i++)
539 {
540 if (ctx->closed_paren_matches [i] == 0)
541 {
542 r->beg_matches[i] = -1;
543 r->end_matches[i] = 0;
544 }
545 }
546 }
547
/* Zero *ctx, then bind it to the compiled expression reg and to the text
 * str of length len. */
static void init_re_context (Re_Context_Type *ctx, SLRegexp_Type *reg,
                             unsigned char *str, unsigned int len)
{
   memset ((char *) ctx, 0, sizeof (*ctx));

   ctx->len = len;
   ctx->str = str;
   ctx->reg = reg;
}
556
/* Search the len bytes at str for the first match of the compiled
 * expression reg.
 *
 * Returns a pointer to the start of the match, or NULL when there is
 * none.  reg->beg_matches / reg->end_matches are updated through
 * fixup_beg_end_matches in either case.
 */
unsigned char *SLang_regexp_match (unsigned char *str,
                                   unsigned int len, SLRegexp_Type *reg)
{
   Re_Context_Type ctx_buf;
   unsigned char *code = reg->buf;
   unsigned char *estr = str + len;
   unsigned char *match_end = NULL;
   unsigned char first = 0;
   int have_first = 0;
   int cs = reg->case_sensitive;   /* name required by UPPERCASE() */

   /* The text is shorter than anything the pattern could match. */
   if (reg->min_length > len)
     return NULL;

   init_re_context (&ctx_buf, reg, str, len);

   /* Anchored pattern: only the very beginning of the text is tried. */
   if (*code == BOL)
     {
        match_end = regexp_looking_at (&ctx_buf, str, estr, code + 1, cs);
        if (match_end == NULL)
          str = NULL;

        fixup_beg_end_matches (&ctx_buf, reg, str, match_end);
        return str;
     }

   /* Consume leading case-mode opcodes. */
   if (*code == NO_CASE)
     {
        code++;
        cs = 0;
     }

   if (*code == YES_CASE)
     {
        code++;
        cs = 1;
     }

   /* When the program starts with a plain literal (possibly just inside
    * an opening group), remember it so positions that cannot start a
    * match are skipped cheaply. */
   if (*code == LITERAL)
     {
        first = *(code + 1);
        have_first = 1;
     }
   else if ((*code == OPAREN) && (*(code + 1) == LITERAL))
     {
        first = *(code + 2);
        have_first = 1;
     }

   for (;;)
     {
        /* Every attempt starts with clean group bookkeeping. */
        ctx_buf.open_paren_number = 0;
        memset (ctx_buf.closed_paren_matches, 0, sizeof(ctx_buf.closed_paren_matches));

        if (have_first)
          {
             /* take care of leading chars */
             while ((str < estr) && (first != UPPERCASE(*str)))
               {
#if SLANG_HAS_KANJI_SUPPORT
                  if(iskanji(*str)) str++;
#endif
                  str++;
               }
             if (str >= estr)
               break;   /* failed */
          }

        match_end = regexp_looking_at (&ctx_buf, str, estr, code, cs);
        if (match_end != NULL)
          {
             fixup_beg_end_matches (&ctx_buf, reg, str, match_end);
             return str;
          }

        if (str >= estr)
          break;

#if SLANG_HAS_KANJI_SUPPORT
        if(iskanji(*str))
          {
             if ((str +1) == estr)
               break;
             str++;
          }
#endif
        str++;
     }

   fixup_beg_end_matches (&ctx_buf, reg, NULL, match_end);
   return NULL;
}
637
convert_digit(unsigned char * pat,int * nn)638 static unsigned char *convert_digit(unsigned char *pat, int *nn)
639 {
640 int n = 0, m = 0;
641 unsigned char c;
642 while (c = (unsigned char) *pat, (c <= '9') && (c >= '0'))
643 {
644 pat++;
645 n = 10 * n + (c - '0');
646 m++;
647 }
648 if (m == 0)
649 {
650 return (NULL);
651 }
652 *nn = n;
653 return pat;
654 }
655
656 #define ERROR return (int) (pat - reg->pat)
657
658 /* Returns 0 if successful or offset in pattern of error */
SLang_regexp_compile(SLRegexp_Type * reg)659 int SLang_regexp_compile (SLRegexp_Type *reg)
660 {
661 register unsigned char *buf, *ebuf, *pat;
662 unsigned char *last = NULL, *tmppat;
663 register unsigned char c;
664 int i, reverse = 0, n, cs;
665 int oparen = 0, nparen = 0;
666 /* substring stuff */
667 int count, last_count, this_max_mm = 0, max_mm = 0, ordinary_search,
668 no_osearch = 0, min_length = 0;
669 unsigned char *mm_p = NULL, *this_mm_p = NULL;
670 static int already_initialized;
671
672 reg->beg_matches[0] = reg->end_matches[0] = 0;
673 buf = reg->buf;
674 ebuf = (reg->buf + reg->buf_len) - 2; /* make some room */
675 pat = reg->pat;
676 cs = reg->case_sensitive;
677
678 if (already_initialized == 0)
679 {
680 SLang_init_case_tables ();
681 #if SLANG_HAS_KANJI_SUPPORT
682 SLmake_lut (Word_Chars, (unsigned char *) "_0-9a-zA-Z", 0);
683 #else
684 # ifdef IBMPC_SYSTEM
685 SLmake_lut (Word_Chars, (unsigned char *) "_0-9a-zA-Z\200-\232\240-\245\341-\353", 0);
686 # else
687 SLmake_lut (Word_Chars, (unsigned char *) "_0-9a-zA-Z\277-\326\330-\336\340-\366\370-\376", 0);
688 # endif
689 #endif
690 already_initialized = 1;
691 }
692
693 i = 1; while (i < 10)
694 {
695 reg->beg_matches[i] = -1;
696 reg->end_matches[i] = 0;
697 i++;
698 }
699
700 if (*pat == '\\')
701 {
702 if (pat[1] == 'c')
703 {
704 cs = 1;
705 pat += 2;
706 no_osearch = 1;
707 }
708 else if (pat[1] == 'C')
709 {
710 cs = 0;
711 pat += 2;
712 no_osearch = 1;
713 }
714 }
715
716 if (*pat == '^')
717 {
718 pat++;
719 *buf++ = BOL;
720 reg->must_match_bol = 1;
721 }
722 else reg->must_match_bol = 0;
723
724 if (cs != reg->case_sensitive)
725 {
726 if (cs) *buf++ = YES_CASE;
727 else *buf++ = NO_CASE;
728 }
729
730 *buf = 0;
731
732 last_count = count = 0;
733 while ((c = *pat++) != 0)
734 {
735 #if SLANG_HAS_KANJI_SUPPORT
736 if ((buf >= ebuf - 3) || (iskanji(c) && buf >= ebuf - 5))
737 #else
738 if (buf >= ebuf - 3)
739 #endif
740 {
741 SLang_doerror ("Pattern too large to be compiled.");
742 ERROR;
743 }
744
745 count++;
746 switch (c)
747 {
748 case '$':
749 if (*pat != 0) goto literal_char;
750 *buf++ = EOL;
751 break;
752
753 case '\\':
754 c = *pat++;
755 no_osearch = 1;
756 switch(c)
757 {
758 case 'e': c = 033; goto literal_char;
759 case 'n': c = '\n'; goto literal_char;
760 case 't': c = '\t'; goto literal_char;
761 case 'C': cs = 0; *buf++ = NO_CASE; break;
762 case 'c': cs = 1; *buf++ = YES_CASE; break;
763 case '1': case '2': case '3': case '4': case '5':
764 case '6': case '7': case '8': case '9':
765 c = c - '0';
766 if ((int) c > nparen) ERROR;
767 last = buf;
768 *buf++ = NTH_MATCH; *buf++ = c;
769 break;
770 #ifdef NOT_LITERAL
771 case '~': /* slang extension */
772 if ((c = *pat) == 0) ERROR;
773 pat++;
774 last = buf;
775 *buf++ = NOT_LITERAL;
776 *buf++ = c;
777 #if SLANG_HAS_KANJI_SUPPORT
778 if(iskanji(c))
779 {
780 *buf++ = NOT_LITERAL;
781 *buf++ = *pat++;
782 min_length++;
783 }
784 #endif
785 min_length++;
786 break;
787 #endif
788 case 'd': /* slang extension */
789 last = buf;
790 *buf++ = ANY_DIGIT;
791 min_length++;
792 break;
793
794 case '<':
795 last = NULL;
796 *buf++ = BOW;
797 break;
798
799 case '>':
800 last = NULL;
801 *buf++ = EOW;
802 break;
803
804 case '{':
805 if (last == NULL) goto literal_char;
806 *last |= MANY;
807 tmppat = convert_digit(pat, &n);
808 if (tmppat == NULL) ERROR;
809 pat = tmppat;
810 *buf++ = n;
811
812 min_length += (n - 1);
813
814 if (*pat == '\\')
815 {
816 *buf++ = n;
817 }
818 else if (*pat == ',')
819 {
820 pat++;
821 if (*pat == '\\')
822 {
823 n = 255;
824 }
825 else
826 {
827 tmppat = convert_digit(pat, &n);
828 if (tmppat == NULL) ERROR;
829 pat = tmppat;
830 if (*pat != '\\') ERROR;
831 }
832 *buf++ = n;
833 }
834 else ERROR;
835 last = NULL;
836 pat++;
837 if (*pat != '}') ERROR;
838 pat++;
839 break; /* case '{' */
840
841 case '(':
842 oparen++;
843 if (oparen > 9) ERROR;
844 *buf++ = OPAREN;
845 break;
846 case ')':
847 if (oparen == 0) ERROR;
848 oparen--;
849 nparen++;
850 *buf++ = CPAREN;
851 break;
852
853 case 0: ERROR;
854 default:
855 goto literal_char;
856 }
857 break;
858
859 case '[':
860
861 *buf = RANGE;
862 last = buf++;
863
864 if (buf + 32 >= ebuf) ERROR;
865
866 for (i = 0; i < 32; i++) buf[i] = 0;
867 c = *pat++;
868 if (c == '^')
869 {
870 reverse = 1;
871 SET_BIT(buf, '\n');
872 c = *pat++;
873 }
874
875 if (c == ']')
876 {
877 SET_BIT(buf, c);
878 c = *pat++;
879 }
880 while (c && (c != ']'))
881 {
882 if (c == '\\')
883 {
884 c = *pat++;
885 switch(c)
886 {
887 case 'n': c = '\n'; break;
888 case 't': c = '\t'; break;
889 case 0: ERROR;
890 }
891 }
892
893 if (*pat == '-')
894 {
895 pat++;
896 while (c < *pat)
897 {
898 if (cs == 0)
899 {
900 SET_BIT(buf, UPPERCASE(c));
901 SET_BIT(buf, LOWERCASE(c));
902 }
903 else SET_BIT(buf, c);
904 c++;
905 }
906 }
907 if (cs == 0)
908 {
909 SET_BIT(buf, UPPERCASE(c));
910 SET_BIT(buf, LOWERCASE(c));
911 }
912 else SET_BIT(buf, c);
913 c = *pat++;
914 }
915 if (c != ']') ERROR;
916 if (reverse) for (i = 0; i < 32; i++) buf[i] = buf[i] ^ 0xFF;
917 reverse = 0;
918 buf += 32;
919 min_length++;
920 break;
921
922 case '.':
923 last = buf;
924 *buf++ = ANY;
925 min_length++;
926 break;
927
928 case '*':
929 if (last == NULL) goto literal_char;
930 *last |= STAR;
931 min_length--;
932 last = NULL;
933 break;
934
935 case '+':
936 if (last == NULL) goto literal_char;
937 *last |= LEAST_ONCE;
938 last = NULL;
939 break;
940
941 case '?':
942 if (last == NULL) goto literal_char;
943 *last |= MAYBE_ONCE;
944 last = NULL;
945 min_length--;
946 break;
947
948 literal_char:
949 default:
950 /* This is to keep track of longest substring */
951 min_length++;
952 this_max_mm++;
953 #if SLANG_HAS_KANJI_SUPPORT
954 if(iskanji(c))
955 {
956 min_length++;
957 this_max_mm++;
958 }
959 #endif
960 if (last_count + 1 == count)
961 {
962 if (this_max_mm == 1)
963 {
964 this_mm_p = buf;
965 }
966 else if (max_mm < this_max_mm)
967 {
968 mm_p = this_mm_p;
969 max_mm = this_max_mm;
970 }
971 }
972 else
973 {
974 this_mm_p = buf;
975 this_max_mm = 1;
976 }
977
978 last_count = count;
979
980 last = buf;
981 *buf++ = LITERAL;
982 #if SLANG_HAS_KANJI_SUPPORT
983 if(iskanji(c))
984 {
985 *buf++ = c;
986 *buf++ = LITERAL;
987 *buf++ = *pat++;
988 }
989 else
990 #endif
991 *buf++ = UPPERCASE(c);
992 }
993 }
994 *buf = 0;
995 /* Check for ordinary search */
996 ebuf = buf;
997 buf = reg->buf;
998
999 if (no_osearch) ordinary_search = 0;
1000 else
1001 {
1002 ordinary_search = 1;
1003 while (buf < ebuf)
1004 {
1005 if (*buf != LITERAL)
1006 {
1007 ordinary_search = 0;
1008 break;
1009 }
1010 buf += 2;
1011 }
1012 }
1013
1014 reg->osearch = ordinary_search;
1015 reg->must_match_str[15] = 0;
1016 reg->min_length = (min_length > 0) ? (unsigned int) min_length : 0;
1017 if (ordinary_search)
1018 {
1019 strncpy((char *) reg->must_match_str, (char *) reg->pat, 15);
1020 reg->must_match = 1;
1021 return(0);
1022 }
1023 /* check for longest substring of pattern */
1024 reg->must_match = 0;
1025 if ((mm_p == NULL) && (this_mm_p != NULL)) mm_p = this_mm_p;
1026 if (mm_p == NULL)
1027 {
1028 return (0);
1029 }
1030 n = 15;
1031 pat = reg->must_match_str;
1032 buf = mm_p;
1033 while (n--)
1034 {
1035 if (*buf++ != LITERAL) break;
1036 *pat++ = *buf++;
1037 }
1038 *pat = 0;
1039 if (pat != reg->must_match_str) reg->must_match = 1;
1040 return(0);
1041 }
1042
/* Copy re into buf, putting a backslash in front of every character that
 * is special to this regular-expression syntax ($ \ [ ] . ^ * + ?), so
 * that the result matches re literally.
 *
 * re      NUL-terminated string to quote
 * buf     destination buffer
 * buflen  size of buf in bytes, including room for the terminating NUL
 *
 * Returns buf on success, or NULL when re or buf is NULL or when the
 * quoted string (with its terminator) does not fit in buflen bytes.
 */
char *SLregexp_quote_string (char *re, char *buf, unsigned int buflen)
{
   char ch;
   char *b, *bmax;

   if ((re == NULL) || (buf == NULL))
     return NULL;

   b = buf;
   bmax = buf + buflen;

   while (b < bmax)
     {
        ch = *re++;
        switch (ch)
          {
           case 0:
             *b = 0;
             return buf;

           case '$':
           case '\\':
           case '[':
           case ']':
           case '.':
           case '^':
           case '*':
           case '+':
           case '?':
             *b++ = '\\';
             if (b == bmax)
               return NULL;   /* no room left for the character itself */
             /* drop */

           default:
#if SLANG_HAS_KANJI_SUPPORT
             /* Copy a two-byte character as a unit.  A lead byte followed
              * directly by the terminator is copied on its own so that re
              * is never advanced past the end of the string. */
             if (iskanji(ch) && (*re != 0))
               {
                  if (b + 1 >= bmax)
                    return NULL;   /* both bytes must fit */
                  *b++ = ch;
                  ch = *re++;
               }
#endif
             *b++ = ch;
          }
     }

   /* Buffer exhausted before the terminator could be stored. */
   return NULL;
}
1087
#if 0
/* Disabled stand-alone test driver: prints every line of FILE that
 * matches REGEXP.  Usage: prog REGEXP FILE */
#define MAX_EXP 4096
int main(int argc, char **argv)
{
   FILE *fp;
   char *regexp, *file;
   unsigned char expbuf[MAX_EXP];
   char buf[512];
   SLRegexp_Type reg;

   if (argc != 3)
     {
        fprintf(stderr, "Usage: %s regexp file\n", argv[0]);
        return(1);
     }

   file = argv[2];
   regexp = argv[1];

   if (NULL == (fp = fopen(file, "r")))
     {
        fprintf(stderr, "File not open\n");
        return(1);
     }

   reg.buf = expbuf;
   reg.buf_len = MAX_EXP;
   reg.pat = (unsigned char *) regexp;
   reg.case_sensitive = 1;

   /* SLang_regexp_compile returns 0 on success. */
   if (0 == SLang_regexp_compile(&reg)) while (NULL != fgets(buf, sizeof(buf), fp))
     {
        if (reg.osearch)
          {
             /* Plain-string pattern: a substring search is enough. */
             if (NULL == strstr(buf, (char *) reg.pat)) continue;
          }
        else
          {
             /* Cheap pre-filter on the literal substring, then the real match. */
             if (reg.must_match && (NULL == strstr(buf, (char *) reg.must_match_str))) continue;
             if (NULL == SLang_regexp_match((unsigned char *) buf, (unsigned int) strlen(buf), &reg)) continue;
          }

        fputs(buf, stdout);
     }

   fclose(fp);
   return (0);
}
#endif
1128