1 /* e_regex.cpp
2 *
3 * Copyright (c) 1994-1996, Marko Macek
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 */
9
10 #include "e_regex.h"
11
12 #include "sysdep.h"
13
14 #include <ctype.h>
15 #include <stdio.h>
16 #include <stdlib.h>
17
18 //#define DEBUG
19
20 static int RegCount = 0;
21
22 // *INDENT-OFF*
23
24 #ifdef DEBUG
25 static void RxDump(int N, RxNode *n);
26 #endif
27
NewNode(int aWhat)28 static RxNode *NewNode(int aWhat) {
29 RxNode *N = (RxNode *) malloc(sizeof(RxNode));
30
31 if (N) {
32 memset(N, 0, sizeof(RxNode));
33 N->fWhat = (short)aWhat;
34 }
35 return N;
36 }
37
NewChar(char Ch)38 static RxNode *NewChar(char Ch) {
39 RxNode *A = NewNode(RE_CHAR);
40
41 if (A) {
42 A->fChar = (char *)malloc(1);
43 A->fLen = 1;
44 A->fChar[0] = Ch;
45 }
46 return A;
47 }
48
NewEscape(const char ** const Regexp)49 static RxNode *NewEscape(const char **const Regexp) {
50 char Ch = **Regexp;
51 ++*Regexp;
52 switch (Ch) {
53 case 0: return 0;
54 case 'a': Ch = '\a'; break;
55 case 'b': Ch = '\b'; break;
56 case 'f': Ch = '\f'; break;
57 case 'n': Ch = '\n'; break;
58 case 'r': Ch = '\r'; break;
59 case 't': Ch = '\t'; break;
60 case 'v': Ch = '\v'; break;
61 case 'e': Ch = 27; break;
62 case 's': return NewNode(RE_WSPACE);
63 case 'S': return NewNode(RE_NWSPACE);
64 case 'U': return NewNode(RE_UPPER);
65 case 'L': return NewNode(RE_LOWER);
66 case 'w': return NewNode(RE_WORD);
67 case 'W': return NewNode(RE_NWORD);
68 case 'd': return NewNode(RE_DIGIT);
69 case 'D': return NewNode(RE_NDIGIT);
70 case 'C': return NewNode(RE_CASE);
71 case 'c': return NewNode(RE_NCASE);
72 case 'N':
73 {
74 unsigned int N = 0;
75 unsigned int A = 0;
76 if (**Regexp == 0) return 0;
77 N = toupper(**Regexp) - 48; if (N > 9) return 0;
78 (*Regexp)++;
79 A = N * 100;
80 if (**Regexp == 0) return 0;
81 N = toupper(**Regexp) - 48; if (N > 9) return 0;
82 (*Regexp)++;
83 A = A + N * 10;
84 if (**Regexp == 0) return 0;
85 N = toupper(**Regexp) - 48; if (N > 9) return 0;
86 (*Regexp)++;
87 A = A + N;
88 Ch = (char)A;
89 }
90 break;
91 case 'o':
92 {
93 unsigned int N = 0;
94 unsigned int A = 0;
95 if (**Regexp == 0) return 0;
96 N = toupper(**Regexp) - 48; if (N > 7) return 0;
97 (*Regexp)++;
98 A = N * 64;
99 if (**Regexp == 0) return 0;
100 N = toupper(**Regexp) - 48; if (N > 7) return 0;
101 (*Regexp)++;
102 A = A + N * 8;
103 if (**Regexp == 0) return 0;
104 N = toupper(**Regexp) - 48; if (N > 7) return 0;
105 (*Regexp)++;
106 A = A + N;
107 Ch = (char)A;
108 }
109 break;
110 case 'x':
111 {
112 unsigned int N = 0;
113 unsigned int A = 0;
114 if (**Regexp == 0) return 0;
115 N = toupper(**Regexp) - 48; if (N > 9) N = N + 48 - 65 + 10; if (N > 15) return 0;
116 (*Regexp)++;
117 A = N << 4;
118 if (**Regexp == 0) return 0;
119 N = toupper(**Regexp) - 48; if (N > 9) N = N + 48 - 65 + 10; if (N > 15) return 0;
120 (*Regexp)++;
121 A = A + N;
122 Ch = (char)A;
123 }
124 break;
125 }
126 return NewChar(Ch);
127 }
128
129
130 #define NNN 32 // 8 * 32 = 256 (match set)
131
132 #define SETOP(set,n) \
133 do { \
134 set[(unsigned char)(n) >> 3] |= (unsigned char)(1 << ((unsigned char)(n) & 7)); \
135 } while (0)
136
NewSet(const char ** const Regexp)137 static RxNode *NewSet(const char ** const Regexp) {
138 unsigned char set[NNN];
139 int s = 0;
140 int c = 0;
141 unsigned int i, xx;
142 unsigned char Ch, C1 = 0, C2 = 0;
143 int doset = 0;
144
145 memset(set, 0, sizeof(set));
146 s = 1;
147 if (**Regexp == '^') {
148 s = 0;
149 ++*Regexp;
150 }
151 c = 0;
152
153 while (**Regexp) {
154 switch (Ch = *((*Regexp)++)) {
155 case ']':
156 if (doset == 1) return 0;
157 {
158 RxNode *N = NewNode(s?RE_INSET:RE_NOTINSET);
159 N->fChar = (char *) malloc(sizeof(set));
160 N->fLen = sizeof(set);
161 if (N->fChar == 0) return 0;
162 memcpy(N->fChar, (char *) set, sizeof(set));
163 return N;
164 }
165 case '\\':
166 switch (Ch = *((*Regexp)++)) {
167 case 0: return 0;
168 case 'a': Ch = '\a'; break;
169 case 'b': Ch = '\b'; break;
170 case 'f': Ch = '\f'; break;
171 case 'n': Ch = '\n'; break;
172 case 'r': Ch = '\r'; break;
173 case 't': Ch = '\t'; break;
174 case 'v': Ch = '\v'; break;
175 case 'e': Ch = 27; break;
176 case 'N':
177 {
178 unsigned int N = 0;
179 unsigned int A = 0;
180 if (**Regexp == 0) return 0;
181 N = toupper(**Regexp) - 48; if (N > 9) return 0;
182 (*Regexp)++;
183 A = N * 100;
184 if (**Regexp == 0) return 0;
185 N = toupper(**Regexp) - 48; if (N > 9) return 0;
186 (*Regexp)++;
187 A = A + N * 10;
188 if (**Regexp == 0) return 0;
189 N = toupper(**Regexp) - 48; if (N > 9) return 0;
190 (*Regexp)++;
191 A = A + N;
192 Ch = (unsigned char)A;
193 }
194 break;
195 case 'o':
196 {
197 unsigned int N = 0;
198 unsigned int A = 0;
199 if (**Regexp == 0) return 0;
200 N = toupper(**Regexp) - 48; if (N > 7) return 0;
201 (*Regexp)++;
202 A = N * 64;
203 if (**Regexp == 0) return 0;
204 N = toupper(**Regexp) - 48; if (N > 7) return 0;
205 (*Regexp)++;
206 A = A + N * 8;
207 if (**Regexp == 0) return 0;
208 N = toupper(**Regexp) - 48; if (N > 7) return 0;
209 (*Regexp)++;
210 A = A + N;
211 Ch = (unsigned char)A;
212 }
213 break;
214 case 'x':
215 {
216 unsigned int N = 0;
217 unsigned int A = 0;
218 if (**Regexp == 0) return 0;
219 N = toupper(**Regexp) - 48; if (N > 9) N = N + 48 - 65 + 10; if (N > 15) return 0;
220 (*Regexp)++;
221 A = N << 4;
222 if (**Regexp == 0) return 0;
223 N = toupper(**Regexp) - 48; if (N > 9) N = N + 48 - 65 + 10; if (N > 15) return 0;
224 (*Regexp)++;
225 A = A + N;
226 Ch = (unsigned char)A;
227 }
228 break;
229 case 's':
230 c += 4;
231 SETOP(set, '\n');
232 SETOP(set, '\t');
233 SETOP(set, ' ');
234 SETOP(set, '\r');
235 continue;
236 case 'S':
237 for (xx = 0; xx <= 255; xx++) {
238 if (xx != ' ' && xx != '\t' && xx != '\n' && xx != '\r') {
239 c++;
240 SETOP(set, xx);
241 }
242 }
243 continue;
244 case 'w':
245 for (xx = 0; xx <= 255; xx++) {
246 if (isalnum(xx)) {
247 c++;
248 SETOP(set, xx);
249 }
250 }
251 break;
252 case 'W':
253 for (xx = 0; xx <= 255; xx++) {
254 if (!isalnum(xx)) {
255 c++;
256 SETOP(set, xx);
257 }
258 }
259 break;
260 case 'd':
261 for (xx = 0; xx <= 255; xx++) {
262 if (isdigit(xx)) {
263 c++;
264 SETOP(set, xx);
265 }
266 }
267 break;
268 case 'D':
269 for (xx = 0; xx <= 255; xx++) {
270 if (!isdigit(xx)) {
271 c++;
272 SETOP(set, xx);
273 }
274 }
275 break;
276 case 'U':
277 for (xx = 'A'; xx <= 'Z'; xx++) {
278 c++;
279 SETOP(set, xx);
280 }
281 continue;
282 case 'L':
283 for (xx = 'a'; xx <= 'z'; xx++) {
284 c++;
285 SETOP(set, xx);
286 }
287 continue;
288 }
289 break;
290 }
291 if (doset == 0 && ((**Regexp) == '-')) {
292 doset = 1;
293 C1 = Ch;
294 ++*Regexp;
295 continue;
296 } else if (doset == 1) {
297 C2 = Ch;
298 if (C2 < C1) return 0;
299 for(i = C1; i <= C2; i++) SETOP(set, i);
300 doset = 0;
301 continue;
302 }
303 c++;
304 SETOP(set, Ch);
305 }
306 return 0;
307 }
308
AddNode(RxNode ** F,RxNode ** N,RxNode * A)309 static int AddNode(RxNode **F, RxNode **N, RxNode *A) {
310 if (A) {
311 if (*F) {
312 (*N)->fNext = A;
313 A->fPrev = (*N);
314 *N = A;
315 } else {
316 (*N) = (*F) = A;
317 A->fPrev = A->fNext = 0;
318 }
319 return 1;
320 }
321 return 0;
322 }
323
CountWidth(RxNode * N)324 static int CountWidth(RxNode *N) {
325 int w = 0;
326
327 while (N) {
328 if (N->fWhat < 32) w += 0;
329 else if (N->fWhat >= 32 && N->fWhat < 64)
330 w += 1;
331 N = N->fNext;
332 }
333 return w;
334 }
335
MakeSub(RxNode ** F,RxNode ** N,char What)336 static int MakeSub(RxNode **F, RxNode **N, char What) {
337 //printf("MakeSub: %c\n", What);
338 if (*N) {
339 RxNode *No;
340 RxNode *New;
341 RxNode *Jump, *Skip;
342 RxNode *Last = (*N);
343
344 if (Last->fWhat & RE_GROUP) {
345 RxNode *P = Last->fPrev;
346 int C = 1;
347
348 while ((C > 0) && P) {
349 //puts("backtracking...-----");
350 //RxDump(0, P);
351 if (P->fWhat & RE_GROUP) {
352 if (P->fWhat & RE_CLOSE) C++;
353 else C--;
354 }
355 Last = P;
356 if (C == 0) break;
357 P = P->fPrev;
358 }
359 //printf("P = %s, c = %d", P ? "ok":"null", C);
360 if (C != 0) return 0;
361 }
362 assert(Last);
363 if (What != '?' && What != '|')
364 if (CountWidth(Last) == 0) {
365 // puts("FAILED count");
366 return 0;
367 }
368 switch (What) {
369 case '?': /* BRANCH x NOTHING */
370 New = NewNode(RE_BRANCH | RE_GREEDY | What);
371 No = NewNode(RE_NOTHING);
372 if (!New || !No) return 0;
373 No->fPrev = *N;
374 if (*N)
375 (*N)->fNext = No;
376 New->fNext = Last;
377 New->fPrev = Last->fPrev;
378 Last->fPrev = New;
379 if (New->fPrev) {
380 New->fPrev->fNext = New;
381 } else {
382 *F = New;
383 }
384 New->fPtr = No;
385 No->fPtr = New;
386 *N = No;
387 //puts("BRANCH ?");
388 break;
389
390 case '*':
391 case '@':
392 New = NewNode(RE_BRANCH | What | ((What == '*') ? RE_GREEDY : 0));
393 Jump = NewNode(RE_JUMP);
394 No = NewNode(RE_NOTHING);
395
396 if (!New || !No || !Jump) return 0;
397 No->fPrev = Jump;
398 Jump->fNext = No;
399 Jump->fPrev = *N;
400 if (*N)
401 (*N)->fNext = Jump;
402 New->fNext = Last;
403 New->fPrev = Last->fPrev;
404 Last->fPrev = New;
405 if (New->fPrev) {
406 New->fPrev->fNext = New;
407 } else {
408 *F = New;
409 }
410 New->fPtr = No;
411 No->fPtr = New;
412 Jump->fPtr = New;
413 *N = No;
414 //puts("BRANCH *");
415 break;
416
417 case '#':
418 case '+':
419 New = NewNode(RE_BRANCH | What | ((What == '+') ? RE_GREEDY : 0));
420 Skip = NewNode(RE_JUMP);
421 Jump = NewNode(RE_JUMP);
422 No = NewNode(RE_NOTHING);
423
424 if (!New || !No || !Jump) return 0;
425 No->fPrev = Jump;
426 Jump->fPrev = *N;
427 Jump->fNext = No;
428
429 Skip->fNext = New;
430 New->fPrev = Skip;
431 if (*N)
432 (*N)->fNext = Jump;
433 New->fNext = Last;
434 Skip->fPrev = Last->fPrev;
435 Last->fPrev = New;
436 if (Skip->fPrev) {
437 Skip->fPrev->fNext = Skip;
438 } else {
439 *F = Skip;
440 }
441 New->fPtr = No;
442 No->fPtr = New;
443 Jump->fPtr = New;
444 Skip->fPtr = Last;
445 *N = No;
446 //puts("BRANCH +");
447 break;
448 case '|':
449 New = NewNode(RE_BRANCH | RE_GREEDY | What);
450 Jump = NewNode(RE_BREAK);
451 No = NewNode(RE_NOTHING);
452
453 if (!New || !No || !Jump) return 0;
454 No->fPrev = Jump;
455 Jump->fNext = No;
456 Jump->fPrev = *N;
457 if (*N)
458 (*N)->fNext = Jump;
459 New->fNext = Last;
460 New->fPrev = Last->fPrev;
461 Last->fPrev = New;
462 if (New->fPrev) {
463 New->fPrev->fNext = New;
464 } else {
465 *F = New;
466 }
467 New->fPtr = No;
468 No->fPtr = New;
469 Jump->fPtr = New;
470 *N = No;
471 //puts("BRANCH |");
472 break;
473 }
474 return 1;
475 }
476 return 0;
477 }
478
479 #define CHECK(n) do { if ((n) == 0) { return 0;} } while (0)
480
RxComp(const char ** const Regexp)481 static RxNode *RxComp(const char **const Regexp) {
482 RxNode *F = 0;
483 RxNode *N = 0;
484 int C;
485 char Ch;
486
487 while (**Regexp) {
488 // puts(*Regexp);
489 switch (Ch = (*(*Regexp)++)) {
490 case '?':
491 case '*':
492 case '+':
493 case '@':
494 case '#':
495 case '|':
496 CHECK(MakeSub(&F, &N, Ch));
497 break;
498 case '}':
499 case ')':
500 return F;
501 case '{':
502 CHECK(AddNode(&F, &N, NewNode(RE_GROUP | RE_OPEN)));
503 CHECK(AddNode(&F, &N, RxComp(Regexp)));
504 while (N->fNext) N = N->fNext;
505 CHECK(AddNode(&F, &N, NewNode(RE_GROUP | RE_CLOSE)));
506 break;
507 case '(':
508 C = ++RegCount;
509 CHECK(AddNode(&F, &N, NewNode(RE_GROUP | RE_OPEN | RE_MEM | C)));
510 CHECK(AddNode(&F, &N, RxComp(Regexp)));
511 while (N->fNext) N = N->fNext;
512 CHECK(AddNode(&F, &N, NewNode(RE_GROUP | RE_CLOSE | RE_MEM | C)));
513 break;
514 case '\\':CHECK(AddNode(&F, &N, NewEscape(Regexp))); break;
515 case '[': CHECK(AddNode(&F, &N, NewSet(Regexp))); break;
516 case '^': CHECK(AddNode(&F, &N, NewNode(RE_ATBOL))); break;
517 case '$': CHECK(AddNode(&F, &N, NewNode(RE_ATEOL))); break;
518 case '.': CHECK(AddNode(&F, &N, NewNode(RE_ANY))); break;
519 case '<': CHECK(AddNode(&F, &N, NewNode(RE_ATBOW))); break;
520 case '>': CHECK(AddNode(&F, &N, NewNode(RE_ATEOW))); break;
521 default:
522 --*Regexp;
523 CHECK(AddNode(&F, &N, NewChar(**Regexp)));
524 ++*Regexp;
525 break;
526 }
527 }
528 return F;
529 }
530
RxOptimize(RxNode * rx)531 static RxNode *RxOptimize(RxNode *rx) {
532 return rx;
533 }
534
RxCompile(const char * Regexp)535 RxNode *RxCompile(const char *Regexp) {
536 RxNode *n = 0, *x;
537 if (Regexp == 0) return 0;
538 RegCount = 0;
539 n = RxComp(&Regexp);
540 if (n == 0) return 0;
541 n = RxOptimize(n);
542 x = n;
543 while (x->fNext) x = x->fNext;
544 x->fNext = NewNode(RE_END);
545 return n;
546 }
547
RxFree(RxNode * n)548 void RxFree(RxNode *n) {
549 RxNode *p;
550
551 while (n) {
552 p = n;
553 n = n->fNext;
554 switch (p->fWhat) {
555 case RE_INSET:
556 case RE_NOTINSET:
557 case RE_CHAR:
558 free(p->fChar);
559 break;
560 default:
561 break;
562 }
563 free(p);
564 }
565 }
566
567 #define ChClass(x) (((((x) >= 'A') && ((x) <= 'Z')) || (((x) >= 'a') && ((x) <= 'z')) || (((x) >= '0') && ((x) <= '9')))?1:0)
568
569 static RxMatchRes *match;
570 static const char *bop;
571 static const char *eop;
572 static int flags = RX_CASE;
573 static const char *rex;
574
RxMatch(RxNode * rx)575 int RxMatch(RxNode *rx) {
576 RxNode *n = rx;
577
578 //printf(">>");
579 while (n) {
580 //printf("%-50.50s\n", rex);
581 //RxDump(1, n);
582 switch (n->fWhat) {
583 case RE_NOTHING:
584 break;
585 case RE_CASE:
586 flags |= RX_CASE;
587 break;
588 case RE_NCASE:
589 flags &= ~RX_CASE;
590 break;
591 case RE_ATBOL:
592 if (rex != bop) return 0;
593 break;
594 case RE_ATEOL:
595 if (rex != eop) return 0;
596 break;
597 case RE_ANY:
598 if (rex == eop) return 0;
599 rex++;
600 break;
601 case RE_WSPACE:
602 if (rex == eop) return 0;
603 if (*rex != ' ' && *rex != '\n' && *rex != '\r' && *rex != '\t') return 0;
604 rex++;
605 break;
606 case RE_NWSPACE:
607 if (rex == eop) return 0;
608 if (*rex == ' ' || *rex == '\n' || *rex == '\r' || *rex == '\t') return 0;
609 rex++;
610 break;
611 case RE_WORD:
612 if (rex == eop) return 0;
613 if (!isalnum(*rex)) return 0;
614 rex++;
615 break;
616 case RE_NWORD:
617 if (rex == eop) return 0;
618 if (isalnum(*rex)) return 0;
619 rex++;
620 break;
621 case RE_DIGIT:
622 if (rex == eop) return 0;
623 if (!isdigit(*rex)) return 0;
624 rex++;
625 break;
626 case RE_NDIGIT:
627 if (rex == eop) return 0;
628 if (isdigit(*rex)) return 0;
629 rex++;
630 break;
631 case RE_UPPER:
632 if (rex == eop) return 0;
633 if (!isupper(*rex)) return 0;
634 rex++;
635 break;
636 case RE_LOWER:
637 if (rex == eop) return 0;
638 if (!islower(*rex)) return 0;
639 rex++;
640 break;
641 case RE_ATBOW:
642 if (rex >= eop) return 0;
643 if (rex > bop) {
644 if ((ChClass(*rex) != 1) || (ChClass(*(rex-1)) != 0)) return 0;
645 }
646 break;
647 case RE_ATEOW:
648 if (rex <= bop) return 0;
649 if (rex < eop) {
650 if ((ChClass(*rex) != 0) || (ChClass(*(rex-1)) != 1)) return 0;
651 }
652 break;
653 case RE_CHAR:
654 if (rex == eop) return 0;
655 if (flags & RX_CASE) {
656 if (*n->fChar != *rex) return 0;
657 if (memcmp(rex, n->fChar, n->fLen) != 0) return 0;
658 } else {
659 for (int i = 0; i < n->fLen; i++)
660 if (toupper(rex[i]) != toupper(n->fChar[i]))
661 return 0;
662 }
663 rex += n->fLen;
664 break;
665 case RE_INSET:
666 if (rex == eop) return 0;
667 if ((n->fChar[(unsigned char)(*rex) >> 3] & (1 << ((unsigned char)(*rex) & 7))) == 0) return 0;
668 rex++;
669 break;
670 case RE_NOTINSET:
671 if (rex == eop) return 0;
672 if (n->fChar[(unsigned char)(*rex) >> 3] & (1 << ((unsigned char)(*rex) & 7))) return 0;
673 rex++;
674 break;
675 case RE_JUMP:
676 n = n->fPtr;
677 continue;
678 case RE_END:
679 return 1;
680 case RE_BREAK:
681 n = n->fNext;
682 if (n->fNext == 0) break;
683 n = n->fNext;
684 if (n->fWhat & RE_BRANCH) {
685 while ((n->fWhat & RE_BRANCH) && n->fPtr && ((n->fWhat & 0xFF) == '|'))
686 n = n->fPtr->fNext;
687 }
688 if (n->fWhat & RE_GROUP) {
689 int C = 1;
690 n = n->fNext;
691 while ((C > 0) && n) {
692 if (n->fWhat & RE_GROUP) {
693 if (n->fWhat & RE_OPEN) C++;
694 else C--;
695 }
696 if (C == 0) break;
697 n = n->fNext;
698 }
699 }
700 break;
701 default:
702 if (n->fWhat & RE_GROUP) {
703 if (n->fWhat & RE_MEM) {
704 const char *save = rex;
705 int b = n->fWhat & 0xFF;
706 int fl = flags;
707
708 if (RxMatch(n->fNext) == 0) {
709 flags = fl;
710 if (n->fWhat & RE_OPEN)
711 match->Open[b] = -1;
712 else
713 match->Close[b] = -1;
714 return 0;
715 }
716
717 if (n->fWhat & RE_OPEN) {
718 // if (match->Open[b] == -1)
719 match->Open[b] = (int) (save - bop);
720 } else {
721 // if (match->Close[b] == -1)
722 match->Close[b] = (int) (save - bop);
723 }
724 return 1;
725 }
726 } else if (n->fWhat & RE_BRANCH) {
727 const char *save = rex;
728 int fl = flags;
729
730 if ((n->fWhat & RE_GREEDY) == 0) {
731 if (RxMatch(n->fPtr) == 1) return 1;
732 flags = fl;
733 rex = save;
734 } else {
735 if (RxMatch(n->fNext) == 1) return 1;
736 flags = fl;
737 rex = save;
738 n = n->fPtr;
739 continue;
740 }
741 }
742 break;
743 }
744 n = n->fNext;
745 }
746 /* NOTREACHED */
747 assert(1 == 0 /* internal regexp error */);
748 return 0;
749 }
750
RxTry(RxNode * rx,const char * s)751 int RxTry(RxNode *rx, const char *s) {
752 int fl = flags;
753 rex = s;
754 for (int i = 0; i < NSEXPS; i++)
755 match->Open[i] = match->Close[i] = -1;
756 if (RxMatch(rx)) {
757 match->Open[0] = (int) (s - bop);
758 match->Close[0] = (int) (rex - bop);
759 return 1;
760 }
761 flags = fl;
762 return 0;
763 }
764
RxExecMatch(RxNode * Regexp,const char * Data,size_t Len,const char * Start,RxMatchRes * Match,unsigned int RxOpt)765 int RxExecMatch(RxNode *Regexp, const char *Data, size_t Len, const char *Start, RxMatchRes *Match, unsigned int RxOpt) {
766 if (Regexp == 0) return 0;
767
768 match = Match;
769 bop = Data;
770 eop = Data + Len;
771
772 flags = RxOpt;
773
774 return RxTry(Regexp, Start);
775 }
776
RxExec(RxNode * Regexp,const char * Data,size_t Len,const char * Start,RxMatchRes * Match,unsigned int RxOpt)777 int RxExec(RxNode *Regexp, const char *Data, size_t Len, const char *Start, RxMatchRes *Match, unsigned int RxOpt) {
778 int Ch;
779 if (Regexp == 0) return 0;
780
781 match = Match;
782 bop = Data;
783 eop = Data + Len;
784
785 flags = RxOpt;
786
787 for (int i = 0; i < NSEXPS; i++) Match->Open[i] = Match->Close[i] = -1;
788
789 switch (Regexp->fWhat) { // this should be more clever
790 case RE_ATBOL: // match is anchored
791 return RxTry(Regexp, Start);
792 case RE_CHAR: // search for a character to match
793 Ch = Regexp->fChar[0];
794 if (Start == eop)
795 break;
796 if (flags & RX_CASE) {
797 while (1) {
798 while (Start < eop && *Start != Ch)
799 Start++;
800 if (Start == eop)
801 break;
802 if (RxTry(Regexp, Start))
803 return 1;
804 if (++Start == eop)
805 break;
806 }
807 } else {
808 Ch = toupper(Ch);
809 while (1) {
810 while (Start < eop && toupper(*Start) != Ch)
811 Start++;
812 if (Start == eop)
813 break;
814 if (RxTry(Regexp, Start))
815 return 1;
816 if (++Start == eop)
817 break;
818 }
819 }
820 break;
821 default: // (slow)
822 do {
823 if (RxTry(Regexp, Start)) return 1;
824 } while (Start++ < eop);
825 break;
826 }
827 return 0;
828 }
829
830 #define FLAG_UP_CASE 1
831 #define FLAG_DOWN_CASE 2
832 #define FLAG_UP_NEXT 4
833 #define FLAG_DOWN_NEXT 8
834
add(size_t * len,char ** s,const char * a,size_t alen,int & flag)835 static int add(size_t *len, char **s, const char *a, size_t alen, int &flag) {
836 size_t NewLen = *len + alen;
837 size_t i;
838
839 NewLen = NewLen * 2;
840
841 if (alen == 0)
842 return 0;
843
844 if (*s) {
845 *s = (char *) realloc(*s, NewLen);
846 assert(*s);
847 memcpy(*s + *len, a, alen);
848 } else {
849 *s = (char *) malloc(NewLen);
850 assert(*s);
851 memcpy(*s, a, alen);
852 *len = 0;
853 }
854 if (flag & FLAG_UP_CASE) {
855 char *p = *s + *len;
856
857 for (i = 0; i < alen; i++) {
858 *p = (char)toupper(*p);
859 p++;
860 }
861 } else if (flag & FLAG_DOWN_CASE) {
862 char *p = *s + *len;
863
864 for (i = 0; i < alen; i++) {
865 *p = (char)tolower(*p);
866 p++;
867 }
868 }
869 if (flag & FLAG_UP_NEXT) {
870 char *p = *s + *len;
871
872 *p = (char)toupper(*p);
873 flag &= ~FLAG_UP_NEXT;
874 } else if (flag & FLAG_DOWN_NEXT) {
875 char *p = *s + *len;
876
877 *p = (char)tolower(*p);
878 flag &= ~FLAG_DOWN_NEXT;
879 }
880 *len += alen;
881 return 0;
882 }
883
RxReplace(const char * rep,const char * Src,size_t,RxMatchRes match,char ** Dest,size_t * Dlen)884 int RxReplace(const char *rep, const char *Src, size_t /*len*/, RxMatchRes match, char **Dest, size_t *Dlen) {
885 size_t dlen = 0;
886 char *dest = 0;
887 char Ch;
888 int n;
889 int flag = 0;
890
891 *Dest = 0;
892 *Dlen = 0;
893 // add(&dlen, &dest, Src, match.Open[0]);
894 while (*rep) {
895 switch (Ch = *rep++) {
896 // case '&':
897 // add(&dlen, &dest, Src + match.Open[0], match.Close[0] - match.Open[0], flag);
898 // break;
899 case '\\':
900 switch (Ch = *rep++) {
901 case '0':
902 case '1': case '2': case '3':
903 case '4': case '5': case '6':
904 case '7': case '8': case '9':
905 n = Ch - 48;
906
907 if (match.Open[n] != -1 && match.Close[n] != -1) {
908 add(&dlen, &dest, Src + match.Open[n], match.Close[n] - match.Open[n], flag);
909 } else return -1;
910 break;
911 case 0:
912 if (dest) free(dest);
913 return -1; // error
914 case 'r': Ch = '\r'; add(&dlen, &dest, &Ch, 1, flag); break;
915 case 'n': Ch = '\n'; add(&dlen, &dest, &Ch, 1, flag); break;
916 case 'b': Ch = '\b'; add(&dlen, &dest, &Ch, 1, flag); break;
917 case 'a': Ch = '\a'; add(&dlen, &dest, &Ch, 1, flag); break;
918 case 't': Ch = '\t'; add(&dlen, &dest, &Ch, 1, flag); break;
919 case 'U': flag |= FLAG_UP_CASE; break;
920 case 'u': flag |= FLAG_UP_NEXT; break;
921 case 'L': flag |= FLAG_DOWN_CASE; break;
922 case 'l': flag |= FLAG_DOWN_NEXT; break;
923 case 'E':
924 case 'e': flag &= ~(FLAG_UP_CASE | FLAG_DOWN_CASE); break;
925 case 'x':
926 {
927 int N = 0;
928 int A = 0;
929
930 if (*rep == 0) { free(dest); return 0; }
931 N = toupper(*rep) - 48; if (N > 9) N = N + 48 - 65 + 10; if (N > 15) return 0;
932 rep++;
933 A = N << 4;
934 if (*rep == 0) { free(dest); return 0; }
935 N = toupper(*rep) - 48; if (N > 9) N = N + 48 - 65 + 10; if (N > 15) return 0;
936 rep++;
937 A = A + N;
938 Ch = (char)A;
939 }
940 add(&dlen, &dest, &Ch, 1, flag);
941 break;
942 case 'd':
943 {
944 int N = 0;
945 int A = 0;
946
947 if (*rep == 0) { free(dest); return 0; }
948 N = toupper(*rep) - 48; if (N > 9) { free(dest); return 0; }
949 rep++;
950 A = N * 100;
951 if (*rep == 0) { free(dest); return 0; }
952 N = toupper(*rep) - 48; if (N > 9) { free(dest); return 0; }
953 rep++;
954 A = N * 10;
955 if (*rep == 0) { free(dest); return 0; }
956 N = toupper(*rep) - 48; if (N > 9) { free(dest); return 0; }
957 rep++;
958 A = A + N;
959 Ch = (char)A;
960 }
961 add(&dlen, &dest, &Ch, 1, flag);
962 break;
963 case 'o':
964 {
965 int N = 0;
966 int A = 0;
967
968 if (*rep == 0) { free(dest); return 0; }
969 N = toupper(*rep) - 48; if (N > 7) { free(dest); return 0; }
970 rep++;
971 A = N * 64;
972 if (*rep == 0) { free(dest); return 0; }
973 N = toupper(*rep) - 48; if (N > 7) { free(dest); return 0; }
974 rep++;
975 A = N * 8;
976 if (*rep == 0) { free(dest); return 0; }
977 N = toupper(*rep) - 48; if (N > 7) { free(dest); return 0; }
978 rep++;
979 A = A + N;
980 Ch = (char)A;
981 }
982 add(&dlen, &dest, &Ch, 1, flag);
983 break;
984 default:
985 add(&dlen, &dest, &Ch, 1, flag);
986 break;
987 }
988 break;
989 default:
990 add(&dlen, &dest, &Ch, 1, flag);
991 break;
992 }
993 }
994 // add(&dlen, &dest, Src + match.Close[0], len - match.Close[0]);
995 *Dlen = dlen;
996 *Dest = dest;
997 return 0;
998 }
999
1000 #ifdef DEBUG
1001
RxDump(int N,RxNode * n)1002 static void RxDump(int N, RxNode *n) {
1003 while (n) {
1004 for (int i = 0; i < N; i++) printf(" ");
1005 switch (n->fWhat) {
1006 case RE_NOTHING: printf("NOTHING\n"); break;
1007 case RE_CHAR: printf("CHAR '%.1s'\n", n->fChar); break;
1008 case RE_ATBOL: printf("^\n"); break;
1009 case RE_ATEOL: printf("$\n"); break;
1010 case RE_ANY: printf(".\n"); break;
1011 case RE_INSET: printf("[\n"/*, n->fChar*/); break;
1012 case RE_NOTINSET: printf("[^\n"/*, n->fChar*/); break;
1013 case RE_ATBOW: printf("<\n"); break;
1014 case RE_ATEOW: printf(">\n"); break;
1015 case RE_WSPACE: printf("WSPACE\n"); break;
1016 case RE_NWSPACE: printf("NWSPACE\n"); break;
1017 case RE_UPPER: printf("UPPER\n"); break;
1018 case RE_LOWER: printf("LOWER\n"); break;
1019 case RE_JUMP: printf("JUMP\n"); break;
1020 case RE_BREAK: printf("BREAK\n"); break;
1021 case RE_END: printf("END\n"); break;
1022 default:
1023 if (n->fWhat & RE_GROUP) {
1024 if (n->fWhat & RE_MEM) {
1025 if (n->fWhat & RE_OPEN) printf("( %d\n", n->fWhat & 0xFF);
1026 if (n->fWhat & RE_CLOSE) printf(") %d\n", n->fWhat & 0xFF);
1027 } else {
1028 if (n->fWhat & RE_OPEN) printf("{\n");
1029 if (n->fWhat & RE_CLOSE) printf("}\n");
1030 }
1031 } else if (n->fWhat & RE_BRANCH) {
1032 if (n->fWhat & RE_GREEDY) {
1033 printf("%c\n", n->fWhat & 0xFF);
1034 } else {
1035 printf("%c\n", n->fWhat & 0xFF);
1036 }
1037 } else {
1038 printf("???????????????\n");
1039 }
1040 break;
1041 }
1042 n = n->fNext;
1043 }
1044 }
1045
1046 #define TEST(rc,rx,st) \
1047 strcpy(line,st); \
1048 assert((a = RxCompile(rx)) != 0); \
1049 puts("\n--- " rx " -- " st " -- "); \
1050 RxDump(0,a);\
1051 assert(rc == RxExec(a, line, strlen(line), line, &b)); \
1052 RxFree(a);
1053
main()1054 int main() {
1055 RxNode *a;
1056 RxMatchRes b;
1057 char line[1024];
1058
1059 TEST(1, "a", "a");
1060 TEST(0, "b", "a");
1061 TEST(1, "aaaa", "aaaa");
1062 TEST(0, "bbbb", "aaaa");
1063 TEST(1, ".", "a");
1064 TEST(0, ".", "");
1065 TEST(1, "a..", "axx");
1066 TEST(0, "a..", "b..");
1067 TEST(1, "a?b", "ab");
1068 TEST(1, "a?b", "xb");
1069 TEST(0, "a?C", "xb");
1070 TEST(1, "{aa}?b", "aab");
1071 TEST(1, "{aa}?b", "xab");
1072 TEST(0, "{aa}?C", "xxb");
1073 TEST(1, "^aa", "aa");
1074 TEST(0, "^aa", "baa");
1075 TEST(1, "^aa$" ,"aa");
1076 TEST(0, "^aa$", "baab");
1077 TEST(1, "a*b", "aaab");
1078 TEST(0, "a*b", "aaaa");
1079 TEST(1, "{aa}*b", "aaab");
1080 TEST(0, "{aa}*b", "aaaa");
1081 TEST(1, "b+", "bb");
1082 TEST(1, "b+", "b");
1083 TEST(0, "b+", "a");
1084 TEST(1, "^b+$", "b");
1085 TEST(0, "^b+$", "aba");
1086 TEST(1, "a|b", " a");
1087 TEST(1, "a|b", " b");
1088 TEST(0, "a|b", " c");
1089 TEST(1, "a|b|c|d|e", " a ");
1090 TEST(1, "a|b|c|d|e", " c ");
1091 TEST(1, "a|b|c|d|e", " e ");
1092 TEST(0, "a|b|c|d|e", " x ");
1093 TEST(1, "{a}|{b}|{c}|{d}|{e}", " a ");
1094 TEST(1, "{a}|{b}|{c}|{d}|{e}", " c ");
1095 TEST(1, "{a}|{b}|{c}|{d}|{e}", " e ");
1096 TEST(0, "{a}|{b}|{c}|{d}|{e}", " x ");
1097 TEST(1, "^xx{alpha}|{beta}xx$", "xxalphaxx");
1098 TEST(1, "^xx{alpha}|{beta}xx$", "xxbetaxx");
1099 TEST(1, "[a-z]", "aaa");
1100 TEST(1, "^{Error}|{Warning}", "Warning search.cpp 35: Conversion may lose significant digits in function AskReplace()");
1101 TEST(1, "^{Error}|{Warning} (.+)", "Warning search.cpp 35: Conversion may lose significant digits in function AskReplace()");
1102 TEST(1, "^{Error}|{Warning} ([a-z.]#) ([0-9]#)", "Warning search.cpp 35: Conversion may lose significant digits in function AskReplace()");
1103 TEST(1, "^{Error}|{Warning} (.+) ([0-9]+): (.*)$", "Warning search.cpp 35: Conversion may lose significant digits in function AskReplace()");
1104 TEST(1, "^{Error}|{Warning} (.+) ([0-9]+): (.*)$", "Error search.cpp 35: Conversion may lose significant digits in function AskReplace()");
1105 TEST(1, "^([a-z]+ +)*\\(", "blabla bla bla bla (");
1106 TEST(1, "^([a-z]+\\s+)+\\(", "blabla bla bla bla (");
1107 TEST(1, "^([a-z]+\\s*)+\\(", "blabla bla bla bla(");
1108 TEST(1, "^([a-z]+\\s+)+\\(", "blabla bla bla bla (");
1109 TEST(1, "^([a-z]+\\s*)+\\(", "blabla bla bla bla(");
1110 TEST(1, "^([a-z]# #)*\\(", "blabla bla bla bla (");
1111 TEST(1, "^([a-z]+ @)@\\(", "blabla bla bla bla (");
1112 TEST(1, "^[\\x20-\\xFF]+$", "blabla");
1113 TEST(1, "{a{a{a{a|a}|{a|a}a}a}a|a}", "aaaaaaaaaaaaaaaaa");
1114
1115 while (1) {
1116 printf("Regexp: "); fflush(stdout); gets(line);
1117 if (!*line) break;
1118 a = RxCompile(line); RxDump(0, a);
1119 printf("String: "); fflush(stdout); gets(line);
1120 printf("rc = %d\n", RxExec(a, line, strlen(line), line, &b));
1121 for (int i = 0; i < NSEXPS; i++) {
1122 if (b.Open[i] != -1) {
1123 printf("%d: %d %d\n", i, b.Open[i], b.Close[i]);
1124 }
1125 }
1126 RxFree(a);
1127 }
1128 return 0;
1129 }
1130
1131 #endif
1132 // *INDENT-ON*
1133