1 // NREX: Node RegEx
2 // Version 0.2
3 //
4 // Copyright (c) 2015-2016, Zher Huei Lee
5 // All rights reserved.
6 //
7 // This software is provided 'as-is', without any express or implied
8 // warranty. In no event will the authors be held liable for any damages
9 // arising from the use of this software.
10 //
11 // Permission is granted to anyone to use this software for any purpose,
12 // including commercial applications, and to alter it and redistribute it
13 // freely, subject to the following restrictions:
14 //
15 // 1. The origin of this software must not be misrepresented; you must not
16 // claim that you wrote the original software. If you use this software
17 // in a product, an acknowledgment in the product documentation would
18 // be appreciated but is not required.
19 //
20 // 2. Altered source versions must be plainly marked as such, and must not
21 // be misrepresented as being the original software.
22 //
23 // 3. This notice may not be removed or altered from any source
24 // distribution.
25 //
26
27 #include "nrex.hpp"
28
29 #ifdef NREX_UNICODE
30 #include <wchar.h>
31 #include <wctype.h>
32 #define NREX_ISALPHANUM iswalnum
33 #define NREX_ISSPACE iswspace
34 #define NREX_STRLEN wcslen
35 #else
36 #include <ctype.h>
37 #include <string.h>
38 #define NREX_ISALPHANUM isalnum
39 #define NREX_ISSPACE isspace
40 #define NREX_STRLEN strlen
41 #endif
42
43 #ifdef NREX_THROW_ERROR
44 #define NREX_COMPILE_ERROR(M) throw nrex_compile_error(M)
45 #else
46 #define NREX_COMPILE_ERROR(M) \
47 reset(); \
48 return false
49 #endif
50
51 #ifndef NREX_NEW
52 #define NREX_NEW(X) new X
53 #define NREX_NEW_ARRAY(X, N) new X[N]
54 #define NREX_DELETE(X) delete X
55 #define NREX_DELETE_ARRAY(X) delete[] X
56 #endif
57
58 template <typename T>
59 class nrex_array {
60 private:
61 T *_data;
62 unsigned int _reserved;
63 unsigned int _size;
64
65 public:
nrex_array()66 nrex_array() :
67 _data(NREX_NEW_ARRAY(T, 2)),
68 _reserved(2),
69 _size(0) {
70 }
71
nrex_array(unsigned int reserved)72 nrex_array(unsigned int reserved) :
73 _data(NREX_NEW_ARRAY(T, reserved ? reserved : 1)),
74 _reserved(reserved ? reserved : 1),
75 _size(0) {
76 }
77
~nrex_array()78 ~nrex_array() {
79 NREX_DELETE_ARRAY(_data);
80 }
81
size() const82 unsigned int size() const {
83 return _size;
84 }
85
reserve(unsigned int size)86 void reserve(unsigned int size) {
87 if (size < _size) {
88 size = _size;
89 }
90 if (size == 0) {
91 size = 1;
92 }
93 T *old = _data;
94 _data = NREX_NEW_ARRAY(T, size);
95 _reserved = size;
96 for (unsigned int i = 0; i < _size; ++i) {
97 _data[i] = old[i];
98 }
99 NREX_DELETE_ARRAY(old);
100 }
101
push(T item)102 void push(T item) {
103 if (_size == _reserved) {
104 reserve(_reserved * 2);
105 }
106 _data[_size] = item;
107 _size++;
108 }
109
top() const110 const T &top() const {
111 return _data[_size - 1];
112 }
113
operator [](unsigned int i) const114 const T &operator[](unsigned int i) const {
115 return _data[i];
116 }
117
pop()118 void pop() {
119 if (_size > 0) {
120 --_size;
121 }
122 }
123 };
124
nrex_parse_hex(nrex_char c)125 static int nrex_parse_hex(nrex_char c) {
126 if ('0' <= c && c <= '9') {
127 return int(c - '0');
128 } else if ('a' <= c && c <= 'f') {
129 return int(c - 'a') + 10;
130 } else if ('A' <= c && c <= 'F') {
131 return int(c - 'A') + 10;
132 }
133 return -1;
134 }
135
nrex_unescape(const nrex_char * & c)136 static nrex_char nrex_unescape(const nrex_char *&c) {
137 switch (c[1]) {
138 case '0': ++c; return '\0';
139 case 'a': ++c; return '\a';
140 case 'e': ++c; return '\e';
141 case 'f': ++c; return '\f';
142 case 'n': ++c; return '\n';
143 case 'r': ++c; return '\r';
144 case 't': ++c; return '\t';
145 case 'v': ++c; return '\v';
146 case 'b': ++c; return '\b';
147 case 'x': {
148 int point = 0;
149 for (int i = 2; i <= 3; ++i) {
150 int res = nrex_parse_hex(c[i]);
151 if (res == -1) {
152 return '\0';
153 }
154 point = (point << 4) + res;
155 }
156 c = &c[3];
157 return nrex_char(point);
158 }
159 case 'u': {
160 int point = 0;
161 for (int i = 2; i <= 5; ++i) {
162 int res = nrex_parse_hex(c[i]);
163 if (res == -1) {
164 return '\0';
165 }
166 point = (point << 4) + res;
167 }
168 c = &c[5];
169 return nrex_char(point);
170 }
171 }
172 return (++c)[0];
173 }
174
175 struct nrex_search {
176 const nrex_char *str;
177 nrex_result *captures;
178 int end;
179 bool complete;
180 nrex_array<int> lookahead_pos;
181
atnrex_search182 nrex_char at(int pos) {
183 return str[pos];
184 }
185
nrex_searchnrex_search186 nrex_search(const nrex_char *str, nrex_result *captures, int lookahead) :
187 str(str),
188 captures(captures),
189 end(0),
190 lookahead_pos(lookahead) {
191 }
192 };
193
194 struct nrex_node {
195 nrex_node *next;
196 nrex_node *previous;
197 nrex_node *parent;
198 bool quantifiable;
199 int length;
200
nrex_nodenrex_node201 nrex_node(bool quantify = false) :
202 next(NULL),
203 previous(NULL),
204 parent(NULL),
205 quantifiable(quantify),
206 length(-1) {
207 }
208
~nrex_nodenrex_node209 virtual ~nrex_node() {
210 if (next) {
211 NREX_DELETE(next);
212 }
213 }
214
testnrex_node215 virtual int test(nrex_search *s, int pos) const {
216 return next ? next->test(s, pos) : -1;
217 }
218
test_parentnrex_node219 virtual int test_parent(nrex_search *s, int pos) const {
220 if (next) {
221 pos = next->test(s, pos);
222 }
223 if (pos >= 0) {
224 s->complete = true;
225 }
226 if (parent && pos >= 0) {
227 pos = parent->test_parent(s, pos);
228 }
229 if (pos < 0) {
230 s->complete = false;
231 }
232 return pos;
233 }
234
increment_lengthnrex_node235 void increment_length(int amount, bool subtract = false) {
236 if (amount >= 0 && length >= 0) {
237 if (!subtract) {
238 length += amount;
239 } else {
240 length -= amount;
241 }
242 } else {
243 length = -1;
244 }
245 if (parent) {
246 parent->increment_length(amount, subtract);
247 }
248 }
249 };
250
251 enum nrex_group_type {
252 nrex_group_capture,
253 nrex_group_non_capture,
254 nrex_group_bracket,
255 nrex_group_look_ahead,
256 nrex_group_look_behind,
257 };
258
259 struct nrex_node_group : public nrex_node {
260 nrex_group_type type;
261 int id;
262 bool negate;
263 nrex_array<nrex_node *> childset;
264 nrex_node *back;
265
nrex_node_groupnrex_node_group266 nrex_node_group(nrex_group_type type, int id = 0) :
267 nrex_node(true),
268 type(type),
269 id(id),
270 negate(false),
271 back(NULL) {
272 if (type != nrex_group_bracket) {
273 length = 0;
274 } else {
275 length = 1;
276 }
277 if (type == nrex_group_look_ahead || type == nrex_group_look_behind) {
278 quantifiable = false;
279 }
280 }
281
~nrex_node_groupnrex_node_group282 virtual ~nrex_node_group() {
283 for (unsigned int i = 0; i < childset.size(); ++i) {
284 NREX_DELETE(childset[i]);
285 }
286 }
287
testnrex_node_group288 int test(nrex_search *s, int pos) const {
289 int old_start;
290 if (type == nrex_group_capture) {
291 old_start = s->captures[id].start;
292 s->captures[id].start = pos;
293 }
294 for (unsigned int i = 0; i < childset.size(); ++i) {
295 s->complete = false;
296 int offset = 0;
297 if (type == nrex_group_look_behind) {
298 if (pos < length) {
299 return -1;
300 }
301 offset = length;
302 }
303 if (type == nrex_group_look_ahead) {
304 s->lookahead_pos.push(pos);
305 }
306 int res = childset[i]->test(s, pos - offset);
307 if (type == nrex_group_look_ahead) {
308 s->lookahead_pos.pop();
309 }
310 if (s->complete) {
311 return res;
312 }
313 if (negate) {
314 if (res < 0) {
315 res = pos + 1;
316 } else {
317 return -1;
318 }
319 if (i + 1 < childset.size()) {
320 continue;
321 }
322 }
323 if (res >= 0) {
324 if (type == nrex_group_capture) {
325 s->captures[id].length = res - pos;
326 } else if (type == nrex_group_look_ahead || type == nrex_group_look_behind) {
327 res = pos;
328 }
329 return next ? next->test(s, res) : res;
330 }
331 }
332 if (type == nrex_group_capture) {
333 s->captures[id].start = old_start;
334 }
335 return -1;
336 }
337
test_parentnrex_node_group338 virtual int test_parent(nrex_search *s, int pos) const {
339 if (type == nrex_group_capture) {
340 s->captures[id].length = pos - s->captures[id].start;
341 }
342 if (type == nrex_group_look_ahead) {
343 pos = s->lookahead_pos[id];
344 }
345 return nrex_node::test_parent(s, pos);
346 }
347
add_childsetnrex_node_group348 void add_childset() {
349 if (childset.size() > 0 && type != nrex_group_bracket) {
350 length = -1;
351 }
352 back = NULL;
353 }
354
add_childnrex_node_group355 void add_child(nrex_node *node) {
356 node->parent = this;
357 node->previous = back;
358 if (back && type != nrex_group_bracket) {
359 back->next = node;
360 } else {
361 childset.push(node);
362 }
363 if (type != nrex_group_bracket) {
364 increment_length(node->length);
365 }
366 back = node;
367 }
368
swap_backnrex_node_group369 nrex_node *swap_back(nrex_node *node) {
370 if (!back) {
371 add_child(node);
372 return NULL;
373 }
374 nrex_node *old = back;
375 if (!old->previous) {
376 childset.pop();
377 }
378 if (type != nrex_group_bracket) {
379 increment_length(old->length, true);
380 }
381 back = old->previous;
382 add_child(node);
383 return old;
384 }
385
pop_backnrex_node_group386 void pop_back() {
387 if (back) {
388 nrex_node *old = back;
389 if (!old->previous) {
390 childset.pop();
391 }
392 if (type != nrex_group_bracket) {
393 increment_length(old->length, true);
394 }
395 back = old->previous;
396 NREX_DELETE(old);
397 }
398 }
399 };
400
401 struct nrex_node_char : public nrex_node {
402 nrex_char ch;
403
nrex_node_charnrex_node_char404 nrex_node_char(nrex_char c) :
405 nrex_node(true),
406 ch(c) {
407 length = 1;
408 }
409
testnrex_node_char410 int test(nrex_search *s, int pos) const {
411 if (s->end <= pos || 0 > pos || s->at(pos) != ch) {
412 return -1;
413 }
414 return next ? next->test(s, pos + 1) : pos + 1;
415 }
416 };
417
418 struct nrex_node_range : public nrex_node {
419 nrex_char start;
420 nrex_char end;
421
nrex_node_rangenrex_node_range422 nrex_node_range(nrex_char s, nrex_char e) :
423 nrex_node(true),
424 start(s),
425 end(e) {
426 length = 1;
427 }
428
testnrex_node_range429 int test(nrex_search *s, int pos) const {
430 if (s->end <= pos || 0 > pos) {
431 return -1;
432 }
433 nrex_char c = s->at(pos);
434 if (c < start || end < c) {
435 return -1;
436 }
437 return next ? next->test(s, pos + 1) : pos + 1;
438 }
439 };
440
441 enum nrex_class_type {
442 nrex_class_none,
443 nrex_class_alnum,
444 nrex_class_alpha,
445 nrex_class_blank,
446 nrex_class_cntrl,
447 nrex_class_digit,
448 nrex_class_graph,
449 nrex_class_lower,
450 nrex_class_print,
451 nrex_class_punct,
452 nrex_class_space,
453 nrex_class_upper,
454 nrex_class_xdigit,
455 nrex_class_word
456 };
457
nrex_compare_class(const nrex_char ** pos,const char * text)458 static bool nrex_compare_class(const nrex_char **pos, const char *text) {
459 unsigned int i = 0;
460 for (i = 0; text[i] != '\0'; ++i) {
461 if ((*pos)[i] != text[i]) {
462 return false;
463 }
464 }
465 if ((*pos)[i++] != ':' || (*pos)[i] != ']') {
466 return false;
467 }
468 *pos = &(*pos)[i];
469 return true;
470 }
471
472 #define NREX_COMPARE_CLASS(POS, NAME) \
473 if (nrex_compare_class(POS, #NAME)) return nrex_class_##NAME
474
nrex_parse_class(const nrex_char ** pos)475 static nrex_class_type nrex_parse_class(const nrex_char **pos) {
476 NREX_COMPARE_CLASS(pos, alnum);
477 NREX_COMPARE_CLASS(pos, alpha);
478 NREX_COMPARE_CLASS(pos, blank);
479 NREX_COMPARE_CLASS(pos, cntrl);
480 NREX_COMPARE_CLASS(pos, digit);
481 NREX_COMPARE_CLASS(pos, graph);
482 NREX_COMPARE_CLASS(pos, lower);
483 NREX_COMPARE_CLASS(pos, print);
484 NREX_COMPARE_CLASS(pos, punct);
485 NREX_COMPARE_CLASS(pos, space);
486 NREX_COMPARE_CLASS(pos, upper);
487 NREX_COMPARE_CLASS(pos, xdigit);
488 NREX_COMPARE_CLASS(pos, word);
489 return nrex_class_none;
490 }
491
492 struct nrex_node_class : public nrex_node {
493 nrex_class_type type;
494
nrex_node_classnrex_node_class495 nrex_node_class(nrex_class_type t) :
496 nrex_node(true),
497 type(t) {
498 length = 1;
499 }
500
testnrex_node_class501 int test(nrex_search *s, int pos) const {
502 if (s->end <= pos || 0 > pos) {
503 return -1;
504 }
505 if (!test_class(s->at(pos))) {
506 return -1;
507 }
508 return next ? next->test(s, pos + 1) : pos + 1;
509 }
510
test_classnrex_node_class511 bool test_class(nrex_char c) const {
512 if ((0 <= c && c <= 0x1F) || c == 0x7F) {
513 if (type == nrex_class_cntrl) {
514 return true;
515 }
516 } else if (c < 0x7F) {
517 if (type == nrex_class_print) {
518 return true;
519 } else if (type == nrex_class_graph && c != ' ') {
520 return true;
521 } else if ('0' <= c && c <= '9') {
522 switch (type) {
523 case nrex_class_alnum:
524 case nrex_class_digit:
525 case nrex_class_xdigit:
526 case nrex_class_word:
527 return true;
528 default:
529 break;
530 }
531 } else if ('A' <= c && c <= 'Z') {
532 switch (type) {
533 case nrex_class_alnum:
534 case nrex_class_alpha:
535 case nrex_class_upper:
536 case nrex_class_word:
537 return true;
538 case nrex_class_xdigit:
539 if (c <= 'F') {
540 return true;
541 }
542 default:
543 break;
544 }
545 } else if ('a' <= c && c <= 'z') {
546 switch (type) {
547 case nrex_class_alnum:
548 case nrex_class_alpha:
549 case nrex_class_lower:
550 case nrex_class_word:
551 return true;
552 case nrex_class_xdigit:
553 if (c <= 'f') {
554 return true;
555 }
556 default:
557 break;
558 }
559 }
560 }
561 switch (c) {
562 case ' ':
563 case '\t':
564 if (type == nrex_class_blank) {
565 return true;
566 }
567 case '\r':
568 case '\n':
569 case '\f':
570 if (type == nrex_class_space) {
571 return true;
572 }
573 break;
574 case '_':
575 if (type == nrex_class_word) {
576 return true;
577 }
578 case ']':
579 case '[':
580 case '!':
581 case '"':
582 case '#':
583 case '$':
584 case '%':
585 case '&':
586 case '\'':
587 case '(':
588 case ')':
589 case '*':
590 case '+':
591 case ',':
592 case '.':
593 case '/':
594 case ':':
595 case ';':
596 case '<':
597 case '=':
598 case '>':
599 case '?':
600 case '@':
601 case '\\':
602 case '^':
603 case '`':
604 case '{':
605 case '|':
606 case '}':
607 case '~':
608 case '-':
609 if (type == nrex_class_punct) {
610 return true;
611 }
612 break;
613 default:
614 break;
615 }
616 return false;
617 }
618 };
619
nrex_is_shorthand(nrex_char repr)620 static bool nrex_is_shorthand(nrex_char repr) {
621 switch (repr) {
622 case 'W':
623 case 'w':
624 case 'D':
625 case 'd':
626 case 'S':
627 case 's':
628 return true;
629 }
630 return false;
631 }
632
633 struct nrex_node_shorthand : public nrex_node {
634 nrex_char repr;
635
nrex_node_shorthandnrex_node_shorthand636 nrex_node_shorthand(nrex_char c) :
637 nrex_node(true),
638 repr(c) {
639 length = 1;
640 }
641
testnrex_node_shorthand642 int test(nrex_search *s, int pos) const {
643 if (s->end <= pos || 0 > pos) {
644 return -1;
645 }
646 bool found = false;
647 bool invert = false;
648 nrex_char c = s->at(pos);
649 switch (repr) {
650 case '.':
651 found = true;
652 break;
653 case 'W':
654 invert = true;
655 case 'w':
656 if (c == '_' || NREX_ISALPHANUM(c)) {
657 found = true;
658 }
659 break;
660 case 'D':
661 invert = true;
662 case 'd':
663 if ('0' <= c && c <= '9') {
664 found = true;
665 }
666 break;
667 case 'S':
668 invert = true;
669 case 's':
670 if (NREX_ISSPACE(c)) {
671 found = true;
672 }
673 break;
674 }
675 if (found == invert) {
676 return -1;
677 }
678 return next ? next->test(s, pos + 1) : pos + 1;
679 }
680 };
681
nrex_is_quantifier(nrex_char repr)682 static bool nrex_is_quantifier(nrex_char repr) {
683 switch (repr) {
684 case '?':
685 case '*':
686 case '+':
687 case '{':
688 return true;
689 }
690 return false;
691 }
692
693 struct nrex_node_quantifier : public nrex_node {
694 int min;
695 int max;
696 bool greedy;
697 nrex_node *child;
698
nrex_node_quantifiernrex_node_quantifier699 nrex_node_quantifier(int min, int max) :
700 nrex_node(),
701 min(min),
702 max(max),
703 greedy(true),
704 child(NULL) {
705 }
706
~nrex_node_quantifiernrex_node_quantifier707 virtual ~nrex_node_quantifier() {
708 if (child) {
709 NREX_DELETE(child);
710 }
711 }
712
testnrex_node_quantifier713 int test(nrex_search *s, int pos) const {
714 return test_step(s, pos, 0, pos);
715 }
716
test_stepnrex_node_quantifier717 int test_step(nrex_search *s, int pos, int level, int start) const {
718 if (pos > s->end) {
719 return -1;
720 }
721 if (!greedy && level > min) {
722 int res = pos;
723 if (next) {
724 res = next->test(s, res);
725 }
726 if (s->complete) {
727 return res;
728 }
729 if (res >= 0 && parent->test_parent(s, res) >= 0) {
730 return res;
731 }
732 }
733 if (max >= 0 && level > max) {
734 return -1;
735 }
736 if (level > 1 && level > min + 1 && pos == start) {
737 return -1;
738 }
739 int res = pos;
740 if (level >= 1) {
741 res = child->test(s, pos);
742 if (s->complete) {
743 return res;
744 }
745 }
746 if (res >= 0) {
747 int res_step = test_step(s, res, level + 1, start);
748 if (res_step >= 0) {
749 return res_step;
750 } else if (greedy && level >= min) {
751 if (next) {
752 res = next->test(s, res);
753 }
754 if (s->complete) {
755 return res;
756 }
757 if (res >= 0 && parent->test_parent(s, res) >= 0) {
758 return res;
759 }
760 }
761 }
762 return -1;
763 }
764
test_parentnrex_node_quantifier765 virtual int test_parent(nrex_search *s, int pos) const {
766 s->complete = false;
767 return pos;
768 }
769 };
770
771 struct nrex_node_anchor : public nrex_node {
772 bool end;
773
nrex_node_anchornrex_node_anchor774 nrex_node_anchor(bool end) :
775 nrex_node(),
776 end(end) {
777 length = 0;
778 }
779
testnrex_node_anchor780 int test(nrex_search *s, int pos) const {
781 if (!end && pos != 0) {
782 return -1;
783 } else if (end && pos != s->end) {
784 return -1;
785 }
786 return next ? next->test(s, pos) : pos;
787 }
788 };
789
790 struct nrex_node_word_boundary : public nrex_node {
791 bool inverse;
792
nrex_node_word_boundarynrex_node_word_boundary793 nrex_node_word_boundary(bool inverse) :
794 nrex_node(),
795 inverse(inverse) {
796 length = 0;
797 }
798
testnrex_node_word_boundary799 int test(nrex_search *s, int pos) const {
800 bool left = false;
801 bool right = false;
802 if (pos != 0) {
803 nrex_char c = s->at(pos - 1);
804 if (c == '_' || NREX_ISALPHANUM(c)) {
805 left = true;
806 }
807 }
808 if (pos != s->end) {
809 nrex_char c = s->at(pos);
810 if (c == '_' || NREX_ISALPHANUM(c)) {
811 right = true;
812 }
813 }
814 if ((left != right) == inverse) {
815 return -1;
816 }
817 return next ? next->test(s, pos) : pos;
818 }
819 };
820
821 struct nrex_node_backreference : public nrex_node {
822 int ref;
823
nrex_node_backreferencenrex_node_backreference824 nrex_node_backreference(int ref) :
825 nrex_node(true),
826 ref(ref) {
827 length = -1;
828 }
829
testnrex_node_backreference830 int test(nrex_search *s, int pos) const {
831 nrex_result &r = s->captures[ref];
832 for (int i = 0; i < r.length; ++i) {
833 if (pos + i >= s->end) {
834 return -1;
835 }
836 if (s->at(r.start + i) != s->at(pos + i)) {
837 return -1;
838 }
839 }
840 return next ? next->test(s, pos + r.length) : pos + r.length;
841 }
842 };
843
nrex_has_lookbehind(nrex_array<nrex_node_group * > & stack)844 bool nrex_has_lookbehind(nrex_array<nrex_node_group *> &stack) {
845 for (unsigned int i = 0; i < stack.size(); i++) {
846 if (stack[i]->type == nrex_group_look_behind) {
847 return true;
848 }
849 }
850 return false;
851 }
852
nrex()853 nrex::nrex() :
854 _capturing(0),
855 _lookahead_depth(0),
856 _root(NULL) {
857 }
858
nrex(const nrex_char * pattern,int captures)859 nrex::nrex(const nrex_char *pattern, int captures) :
860 _capturing(0),
861 _lookahead_depth(0),
862 _root(NULL) {
863 compile(pattern, captures);
864 }
865
~nrex()866 nrex::~nrex() {
867 if (_root) {
868 NREX_DELETE(_root);
869 }
870 }
871
valid() const872 bool nrex::valid() const {
873 return (_root != NULL);
874 }
875
reset()876 void nrex::reset() {
877 _capturing = 0;
878 _lookahead_depth = 0;
879 if (_root) {
880 NREX_DELETE(_root);
881 }
882 _root = NULL;
883 }
884
capture_size() const885 int nrex::capture_size() const {
886 if (_root) {
887 return _capturing + 1;
888 }
889 return 0;
890 }
891
compile(const nrex_char * pattern,int captures)892 bool nrex::compile(const nrex_char *pattern, int captures) {
893 reset();
894 nrex_node_group *root = NREX_NEW(nrex_node_group(nrex_group_capture, _capturing));
895 nrex_array<nrex_node_group *> stack;
896 stack.push(root);
897 unsigned int lookahead_level = 0;
898 _root = root;
899
900 for (const nrex_char *c = pattern; c[0] != '\0'; ++c) {
901 if (c[0] == '(') {
902 if (c[1] == '?') {
903 if (c[2] == ':') {
904 c = &c[2];
905 nrex_node_group *group = NREX_NEW(nrex_node_group(nrex_group_non_capture));
906 stack.top()->add_child(group);
907 stack.push(group);
908 } else if (c[2] == '!' || c[2] == '=') {
909 c = &c[2];
910 nrex_node_group *group = NREX_NEW(nrex_node_group(nrex_group_look_ahead, lookahead_level++));
911 group->negate = (c[0] == '!');
912 stack.top()->add_child(group);
913 stack.push(group);
914 if (lookahead_level > _lookahead_depth) {
915 _lookahead_depth = lookahead_level;
916 }
917 } else if (c[2] == '<' && (c[3] == '!' || c[3] == '=')) {
918 c = &c[3];
919 nrex_node_group *group = NREX_NEW(nrex_node_group(nrex_group_look_behind));
920 group->negate = (c[0] == '!');
921 stack.top()->add_child(group);
922 stack.push(group);
923 } else {
924 NREX_COMPILE_ERROR("unrecognised qualifier for group");
925 }
926 } else if (captures >= 0 && _capturing < captures) {
927 nrex_node_group *group = NREX_NEW(nrex_node_group(nrex_group_capture, ++_capturing));
928 stack.top()->add_child(group);
929 stack.push(group);
930 } else {
931 nrex_node_group *group = NREX_NEW(nrex_node_group(nrex_group_non_capture));
932 stack.top()->add_child(group);
933 stack.push(group);
934 }
935 } else if (c[0] == ')') {
936 if (stack.size() > 1) {
937 if (stack.top()->type == nrex_group_look_ahead) {
938 --lookahead_level;
939 }
940 stack.pop();
941 } else {
942 NREX_COMPILE_ERROR("unexpected ')'");
943 }
944 } else if (c[0] == '[') {
945 nrex_node_group *group = NREX_NEW(nrex_node_group(nrex_group_bracket));
946 stack.top()->add_child(group);
947 if (c[1] == '^') {
948 group->negate = true;
949 ++c;
950 }
951 bool first_child = true;
952 nrex_char previous_child;
953 bool previous_child_single = false;
954 while (true) {
955 group->add_childset();
956 ++c;
957 if (c[0] == '\0') {
958 NREX_COMPILE_ERROR("unclosed bracket expression '['");
959 }
960 if (c[0] == '[' && c[1] == ':') {
961 const nrex_char *d = &c[2];
962 nrex_class_type cls = nrex_parse_class(&d);
963 if (cls != nrex_class_none) {
964 c = d;
965 group->add_child(NREX_NEW(nrex_node_class(cls)));
966 previous_child_single = false;
967 } else {
968 group->add_child(NREX_NEW(nrex_node_char('[')));
969 previous_child = '[';
970 previous_child_single = true;
971 }
972 } else if (c[0] == ']' && !first_child) {
973 break;
974 } else if (c[0] == '\\') {
975 if (nrex_is_shorthand(c[1])) {
976 group->add_child(NREX_NEW(nrex_node_shorthand(c[1])));
977 ++c;
978 previous_child_single = false;
979 } else {
980 const nrex_char *d = c;
981 nrex_char unescaped = nrex_unescape(d);
982 if (c == d) {
983 NREX_COMPILE_ERROR("invalid escape token");
984 }
985 group->add_child(NREX_NEW(nrex_node_char(unescaped)));
986 c = d;
987 previous_child = unescaped;
988 previous_child_single = true;
989 }
990 } else if (previous_child_single && c[0] == '-') {
991 bool is_range = false;
992 nrex_char next;
993 if (c[1] != '\0' && c[1] != ']') {
994 if (c[1] == '\\') {
995 const nrex_char *d = ++c;
996 next = nrex_unescape(d);
997 if (c == d) {
998 NREX_COMPILE_ERROR("invalid escape token in range");
999 }
1000 } else {
1001 next = c[1];
1002 ++c;
1003 }
1004 is_range = true;
1005 }
1006 if (is_range) {
1007 if (next < previous_child) {
1008 NREX_COMPILE_ERROR("text range out of order");
1009 }
1010 group->pop_back();
1011 group->add_child(NREX_NEW(nrex_node_range(previous_child, next)));
1012 previous_child_single = false;
1013 } else {
1014 group->add_child(NREX_NEW(nrex_node_char(c[0])));
1015 previous_child = c[0];
1016 previous_child_single = true;
1017 }
1018 } else {
1019 group->add_child(NREX_NEW(nrex_node_char(c[0])));
1020 previous_child = c[0];
1021 previous_child_single = true;
1022 }
1023 first_child = false;
1024 }
1025 } else if (nrex_is_quantifier(c[0])) {
1026 int min = 0;
1027 int max = -1;
1028 bool valid_quantifier = true;
1029 if (c[0] == '?') {
1030 min = 0;
1031 max = 1;
1032 } else if (c[0] == '+') {
1033 min = 1;
1034 max = -1;
1035 } else if (c[0] == '*') {
1036 min = 0;
1037 max = -1;
1038 } else if (c[0] == '{') {
1039 bool max_set = false;
1040 const nrex_char *d = c;
1041 while (true) {
1042 ++d;
1043 if (d[0] == '\0') {
1044 valid_quantifier = false;
1045 break;
1046 } else if (d[0] == '}') {
1047 break;
1048 } else if (d[0] == ',') {
1049 max_set = true;
1050 continue;
1051 } else if (d[0] < '0' || '9' < d[0]) {
1052 valid_quantifier = false;
1053 break;
1054 }
1055 if (max_set) {
1056 if (max < 0) {
1057 max = int(d[0] - '0');
1058 } else {
1059 max = max * 10 + int(d[0] - '0');
1060 }
1061 } else {
1062 min = min * 10 + int(d[0] - '0');
1063 }
1064 }
1065 if (!max_set) {
1066 max = min;
1067 }
1068 if (valid_quantifier) {
1069 c = d;
1070 }
1071 }
1072 if (valid_quantifier) {
1073 if (stack.top()->back == NULL || !stack.top()->back->quantifiable) {
1074 NREX_COMPILE_ERROR("element not quantifiable");
1075 }
1076 nrex_node_quantifier *quant = NREX_NEW(nrex_node_quantifier(min, max));
1077 if (min == max) {
1078 if (stack.top()->back->length >= 0) {
1079 quant->length = max * stack.top()->back->length;
1080 }
1081 } else {
1082 if (nrex_has_lookbehind(stack)) {
1083 NREX_COMPILE_ERROR("variable length quantifiers inside lookbehind not supported");
1084 }
1085 }
1086 quant->child = stack.top()->swap_back(quant);
1087 quant->child->previous = NULL;
1088 quant->child->next = NULL;
1089 quant->child->parent = quant;
1090 if (c[1] == '?') {
1091 quant->greedy = false;
1092 ++c;
1093 }
1094 } else {
1095 stack.top()->add_child(NREX_NEW(nrex_node_char(c[0])));
1096 }
1097 } else if (c[0] == '|') {
1098 if (nrex_has_lookbehind(stack)) {
1099 NREX_COMPILE_ERROR("alternations inside lookbehind not supported");
1100 }
1101 stack.top()->add_childset();
1102 } else if (c[0] == '^' || c[0] == '$') {
1103 stack.top()->add_child(NREX_NEW(nrex_node_anchor((c[0] == '$'))));
1104 } else if (c[0] == '.') {
1105 stack.top()->add_child(NREX_NEW(nrex_node_shorthand('.')));
1106 } else if (c[0] == '\\') {
1107 if (nrex_is_shorthand(c[1])) {
1108 stack.top()->add_child(NREX_NEW(nrex_node_shorthand(c[1])));
1109 ++c;
1110 } else if (('1' <= c[1] && c[1] <= '9') || (c[1] == 'g' && c[2] == '{')) {
1111 int ref = 0;
1112 bool unclosed = false;
1113 if (c[1] == 'g') {
1114 unclosed = true;
1115 c = &c[2];
1116 }
1117 while ('0' <= c[1] && c[1] <= '9') {
1118 ref = ref * 10 + int(c[1] - '0');
1119 ++c;
1120 }
1121 if (c[1] == '}') {
1122 unclosed = false;
1123 ++c;
1124 }
1125 if (ref > _capturing || ref <= 0 || unclosed) {
1126 NREX_COMPILE_ERROR("backreference to non-existent capture");
1127 }
1128 if (nrex_has_lookbehind(stack)) {
1129 NREX_COMPILE_ERROR("backreferences inside lookbehind not supported");
1130 }
1131 stack.top()->add_child(NREX_NEW(nrex_node_backreference(ref)));
1132 } else if (c[1] == 'b' || c[1] == 'B') {
1133 stack.top()->add_child(NREX_NEW(nrex_node_word_boundary(c[1] == 'B')));
1134 ++c;
1135 } else {
1136 const nrex_char *d = c;
1137 nrex_char unescaped = nrex_unescape(d);
1138 if (c == d) {
1139 NREX_COMPILE_ERROR("invalid escape token");
1140 }
1141 stack.top()->add_child(NREX_NEW(nrex_node_char(unescaped)));
1142 c = d;
1143 }
1144 } else {
1145 stack.top()->add_child(NREX_NEW(nrex_node_char(c[0])));
1146 }
1147 }
1148 if (stack.size() > 1) {
1149 NREX_COMPILE_ERROR("unclosed group '('");
1150 }
1151 return true;
1152 }
1153
match(const nrex_char * str,nrex_result * captures,int offset,int end) const1154 bool nrex::match(const nrex_char *str, nrex_result *captures, int offset, int end) const {
1155 if (!_root) {
1156 return false;
1157 }
1158 nrex_search s(str, captures, _lookahead_depth);
1159 if (end >= offset) {
1160 s.end = end;
1161 } else {
1162 s.end = NREX_STRLEN(str);
1163 }
1164 for (int i = offset; i <= s.end; ++i) {
1165 for (int c = 0; c <= _capturing; ++c) {
1166 captures[c].start = 0;
1167 captures[c].length = 0;
1168 }
1169 if (_root->test(&s, i) >= 0) {
1170 return true;
1171 }
1172 }
1173 return false;
1174 }
1175