1 /*
2 * regular expression module
3 *
 * Important note: named groups (indexing a group by name) are not supported
5 *
6 * $Id: init.c 144 2009-04-20 16:23:22Z ybc2084 $
7 */
8
9 #include <stdio.h>
10 #include <assert.h>
11 #include <tinypy.h>
12 #include "regexpr.cpp"
13
14 /* tinypy API to be use in this unit */
15 extern tp_obj tp_data(TP,int magic,void *v);
16 extern tp_obj tp_object_new(TP);
17 extern tp_obj tp_object(TP);
18 extern tp_obj tp_method(TP,tp_obj self,tp_obj v(TP));
19 extern tp_obj tp_string_copy(TP, const char *s, int n);
20 extern tp_obj tp_list(TP);
21 extern tp_obj tp_copy(TP);
22
23 /* last error message */
24 static const char * LastError = NULL;
25
26 /* lower level regex object */
27 typedef struct {
28 struct re_pattern_buffer re_patbuf; /* The compiled expression */
29 struct re_registers re_regs; /* The registers from the last match */
30 char re_fastmap[256]; /* Storage for fastmap */
31 unsigned char *re_translate; /* String object for translate table */
32 unsigned char *re_lastok; /* String object last matched/searched */
33
34 /* supplementary */
35 int re_errno; /* error num */
36 int re_syntax; /* syntax */
37 } regexobject;
38
39 /* local declarations */
40 static regexobject* getre(TP, tp_obj rmobj);
41 static tp_obj match_obj_group(TP);
42 static tp_obj match_obj_groups(TP);
43 static tp_obj match_obj_start(TP);
44 static tp_obj match_obj_end(TP);
45 static tp_obj match_obj_span(TP);
46
47 /*
48 * helper function: return lower level regex object
49 * rmobj - regex or match object
50 */
getre(TP,tp_obj rmobj)51 static regexobject * getre(TP, tp_obj rmobj)
52 {
53 tp_obj reobj_data = tp_get(tp, rmobj, tp_string("__data__"));
54 regexobject *re = NULL;
55
56 /* validate magic */
57 if (reobj_data.data.magic != sizeof(regexobject)) {
58 LastError = "broken regex object";
59 return (NULL);
60 }
61 re = (regexobject*)reobj_data.data.val;
62 assert(re);
63
64 return (re);
65 }
66
67 /*
68 * derive match object from regex object
69 */
match_object(TP,tp_obj reobj)70 static tp_obj match_object(TP, tp_obj reobj)
71 {
72 tp_obj mo = tp_object(tp); /* match object */
73 tp_obj redata; /* regex object data */
74 tp_obj madata; /* match object data */
75 regexobject *re = NULL; /* lower level regex object */
76
77 redata = tp_get(tp, reobj, tp_string("__data__"));
78 re = (regexobject *)redata.data.val;
79 assert(re);
80 madata = tp_data(tp, (int)sizeof(regexobject), re);
81
82 tp_set(tp, mo, tp_string("group"), tp_method(tp, mo, match_obj_group));
83 tp_set(tp, mo, tp_string("groups"), tp_method(tp, mo, match_obj_groups));
84 tp_set(tp, mo, tp_string("start"), tp_method(tp, mo, match_obj_start));
85 tp_set(tp, mo, tp_string("end"), tp_method(tp, mo, match_obj_end));
86 tp_set(tp, mo, tp_string("span"), tp_method(tp, mo, match_obj_span));
87 tp_set(tp, mo, tp_string("__data__"), madata);
88
89 return (mo);
90 }
91
92 /*
93 * FUNC: regexobj.search(str[,pos=0])
94 * self - regex object
95 * str - string to be searched
96 * pos - optional starting offset
97 *
98 * RETURN:
99 * match object - when matched
100 * None - not matched
101 */
regex_obj_search(TP)102 static tp_obj regex_obj_search(TP)
103 {
104 tp_obj self = TP_OBJ(); /* regex object */
105 tp_obj str = TP_STR();
106 tp_obj pos = TP_DEFAULT(tp_number(0));
107 tp_obj maobj; /* match object */
108 regexobject *re = NULL;
109 int r = -2; /* -2 indicate exception */
110 int range;
111
112 if (pos.number.val < 0 || pos.number.val > str.string.len) {
113 LastError = "search offset out of range";
114 goto exception;
115 }
116 range = str.string.len - pos.number.val;
117
118 re = getre(tp, self);
119 re->re_lastok = NULL;
120 r = re_search(&re->re_patbuf, (unsigned char *)str.string.val,
121 str.string.len, pos.number.val, range, &re->re_regs);
122
123 /* cannot match pattern */
124 if (r == -1)
125 goto notfind;
126
127 /* error occurred */
128 if (r == -2)
129 goto exception;
130
131 /* matched */
132 re->re_lastok = (unsigned char *)str.string.val;
133
134 /* match obj */
135 maobj = match_object(tp, self);
136
137 return (maobj);
138
139 notfind:
140 re->re_lastok = NULL;
141 return (tp_None);
142 exception:
143 re->re_lastok = NULL;
144 tp_raise(tp_None, tp_string("regex search error"));
145 }
146
147 /*
148 * FUNC: regexobj.match(str[,pos=0])
149 * self - regex object
150 * str - string to be matched
151 * pos - optional starting position
152 *
153 * RETURN:
154 * match object - when matched
155 * None - not matched
156 */
regex_obj_match(TP)157 static tp_obj regex_obj_match(TP)
158 {
159 tp_obj self = TP_OBJ(); /* regex object */
160 tp_obj str = TP_STR();
161 tp_obj pos = TP_DEFAULT(tp_number(0));
162 tp_obj maobj; /* match object */
163 regexobject *re = NULL;
164 int r = -2; /* -2 indicate exception */
165
166 re = getre(tp, self);
167 re->re_lastok = NULL;
168 r = re_match(&re->re_patbuf, (unsigned char *)str.string.val,
169 str.string.len, pos.number.val, &re->re_regs);
170
171 /* cannot match pattern */
172 if (r == -1)
173 goto nomatch;
174
175 /* error occurred */
176 if (r == -2)
177 goto exception;
178
179 /* matched */
180 re->re_lastok = (unsigned char *)str.string.val;
181
182 /* match obj */
183 maobj = match_object(tp, self);
184
185 return (maobj);
186
187 nomatch:
188 re->re_lastok = NULL;
189 return (tp_None);
190 exception:
191 re->re_lastok = NULL;
192 tp_raise(tp_None, tp_string("regex match error"));
193 }
194
195 /*
196 * regex object split()
197 * self - regex object
198 * restr - regex string
199 * maxsplit - max split field, default 0, mean no limit
200 */
regex_obj_split(TP)201 static tp_obj regex_obj_split(TP)
202 {
203 tp_obj self = TP_OBJ(); /* regex object */
204 tp_obj restr = TP_OBJ(); /* string */
205 tp_obj maxsplit = TP_DEFAULT(tp_number(0));
206 tp_obj maobj; /* match object */
207 regexobject *re = NULL; /* lower level regex object */
208 tp_obj result = tp_list(tp);
209 tp_obj grpstr; /* group string */
210 int slen; /* string length */
211 int srchloc; /* search location */
212
213 /* maxsplit == 0 means no limit */
214 if ((int)maxsplit.number.val == 0)
215 maxsplit.number.val = RE_NREGS;
216 assert(maxsplit.number.val > 0);
217
218 srchloc = 0;
219 slen = strlen((char *)restr.string.val);
220
221 do {
222 /* generate a temp match object */
223 tp_params_v(tp, 3, self, restr, tp_number(srchloc));
224 maobj = regex_obj_search(tp);
225 if (!tp_bool(tp, maobj))
226 break;
227
228 re = getre(tp, maobj);
229 if (re->re_lastok == NULL) {
230 tp_raise(tp_None, tp_string("no match for split()"));
231 }
232
233 /* extract fields */
234 if ((int)maxsplit.number.val > 0) {
235 int start = re->re_regs.start[0];
236 int end = re->re_regs.end[0];
237 /*printf("%s:start(%d),end(%d)\n", __func__, start, end);*/
238 if (start < 0 || end < 0)
239 break;
240
241 grpstr = tp_string_copy(tp,
242 (const char *)re->re_lastok + srchloc, start - srchloc);
243
244 if (tp_bool(tp, grpstr)) {
245 tp_set(tp, result, tp_None, grpstr);
246 maxsplit.number.val--;
247 }
248
249 srchloc = end;
250 }
251 } while (srchloc < slen && (int)maxsplit.number.val > 0);
252
253 /* collect remaining string, if necessary */
254 if (srchloc < slen) {
255 grpstr = tp_string_copy(tp,
256 (const char *)restr.string.val + srchloc, slen - srchloc);
257 if (tp_bool(tp, grpstr))
258 tp_set(tp, result, tp_None, grpstr);
259 }
260
261 return (result);
262 }
263
264 /*
265 * regex object findall()
266 * self - regex object
267 * restr - regex string
268 * pos - starting position, default 0
269 */
regex_obj_findall(TP)270 static tp_obj regex_obj_findall(TP)
271 {
272 tp_obj self = TP_OBJ(); /* regex object */
273 tp_obj restr = TP_OBJ(); /* string */
274 tp_obj pos = TP_DEFAULT(tp_number(0));
275 tp_obj maobj; /* match object */
276 regexobject *re = NULL; /* lower level regex object */
277 tp_obj result = tp_list(tp);
278 tp_obj grpstr; /* group string */
279 int slen; /* string length */
280 int srchloc; /* search location */
281
282 srchloc = (int)pos.number.val;
283 slen = strlen((char *)restr.string.val);
284 if (srchloc < 0 || srchloc >= slen)
285 tp_raise(tp_None, tp_string("starting position out of range"));
286
287 do {
288 /* generate a temp match object */
289 tp_params_v(tp, 3, self, restr, tp_number(srchloc));
290 maobj = regex_obj_search(tp);
291 if (!tp_bool(tp, maobj))
292 break;
293
294 re = getre(tp, maobj);
295 if (re->re_lastok == NULL) {
296 tp_raise(tp_None, tp_string("no match for findall()"));
297 }
298
299 /* extract fields */
300 if (srchloc < slen) {
301 int start = re->re_regs.start[0];
302 int end = re->re_regs.end[0];
303 /*printf("%s:start(%d),end(%d)\n", __func__, start, end);*/
304 if (start < 0 || end < 0)
305 break;
306
307 grpstr = tp_string_copy(tp,
308 (const char *)re->re_lastok + start, end - start);
309
310 if (tp_bool(tp, grpstr)) {
311 tp_set(tp, result, tp_None, grpstr);
312 }
313
314 srchloc = end;
315 }
316 } while (srchloc < slen);
317
318 return (result);
319 }
320
321 /*
322 * FUNC: matchobj.group([group1, ...])
323 * self - match object
324 * args - optional group indices, default 0
325 *
326 * return specified group.
327 */
match_obj_group(TP)328 static tp_obj match_obj_group(TP)
329 {
330 tp_obj self = TP_OBJ(); /* match object */
331 tp_obj grpidx; /* a group index */
332 regexobject *re = NULL;
333 int indices[RE_NREGS];
334 int start;
335 int end;
336 int i;
337 int single = 0; /* single group index? */
338 tp_obj result;
339
340 /* get lower level regex object representation */
341 re = getre(tp, self);
342 if (re->re_lastok == NULL)
343 tp_raise(tp_None,
344 tp_string("group() only valid after successful match/search"));
345
346 for (i = 0; i < RE_NREGS; i++)
347 indices[i] = -1;
348
349 /*
350 * if no group index provided, supply default group index 0; else
351 * fill in indices[] with provided group index list.
352 */
353 if (tp->params.list.val->len == 0) {
354 indices[0] = 0;
355 single = 1;
356 } else if (tp->params.list.val->len == 1) {
357 indices[0] = (int)TP_NUM();
358 single = 1;
359 } else {
360 i = 0;
361 TP_LOOP(grpidx)
362 if (grpidx.number.val < 0 || grpidx.number.val > RE_NREGS)
363 tp_raise(tp_None, tp_string("group() grpidx out of range"));
364 indices[i++] = (int)grpidx.number.val;
365 TP_END
366 }
367
368 /* generate result string list */
369 result = tp_list(tp);
370 for (i = 0; i < RE_NREGS && indices[i] >= 0; i++) {
371 tp_obj grpstr;
372 start = re->re_regs.start[indices[i]];
373 end = re->re_regs.end[indices[i]];
374 if (start < 0 || end < 0) {
375 grpstr = tp_None;
376 } else {
377 grpstr = tp_string_copy(tp, (const char *)re->re_lastok + start,
378 end - start);
379 }
380 tp_set(tp, result, tp_None, grpstr);
381 }
382 return (single ? tp_get(tp, result, tp_number(0)) : result);
383 }
384
385 /*
386 * FUNC: matchobj.groups()
387 * self - match object.
388 * return all groups.
389 * Note: CPython allow a 'default' argument, but we disallow it.
390 */
match_obj_groups(TP)391 static tp_obj match_obj_groups(TP)
392 {
393 tp_obj self = TP_OBJ(); /* match object */
394 regexobject *re = NULL;
395 int start;
396 int end;
397 int i;
398 tp_obj result = tp_list(tp);
399
400 re = getre(tp, self);
401 if (re->re_lastok == NULL) {
402 tp_raise(tp_None,
403 tp_string("groups() only valid after successful match/search"));
404 }
405
406 for (i = 1; i < RE_NREGS; i++) {
407 start = re->re_regs.start[i];
408 end = re->re_regs.end[i];
409 if (start < 0 || end < 0)
410 break;
411
412 tp_obj grpstr = tp_string_copy(tp,
413 (const char *)re->re_lastok + start, end - start);
414
415 if (tp_bool(tp, grpstr))
416 tp_set(tp, result, tp_None, grpstr);
417 }
418
419 return (result);
420 }
421
422 /*
423 * FUNC: matchobj.start([group])
424 * self - match object
425 * group - group index
426 * return starting position of matched 'group' substring.
427 */
match_obj_start(TP)428 static tp_obj match_obj_start(TP)
429 {
430 tp_obj self = TP_OBJ(); /* match object */
431 tp_obj group = TP_DEFAULT(tp_number(0)); /* group */
432 regexobject *re = NULL;
433 int start;
434
435 re = getre(tp, self);
436 if (re->re_lastok == NULL) {
437 tp_raise(tp_None,
438 tp_string("start() only valid after successful match/search"));
439 }
440
441 if (group.number.val < 0 || group.number.val > RE_NREGS)
442 tp_raise(tp_None, tp_string("IndexError: group index out of range"));
443
444 start = re->re_regs.start[(int)group.number.val];
445
446 return (tp_number(start));
447 }
448
449 /*
450 * FUNC: matchobj.end([group])
451 * self - match object
452 * group - group index
453 * return ending position of matched 'group' substring.
454 */
match_obj_end(TP)455 static tp_obj match_obj_end(TP)
456 {
457 tp_obj self = TP_OBJ(); /* match object */
458 tp_obj group = TP_DEFAULT(tp_number(0)); /* group */
459 regexobject *re = NULL;
460 int end;
461
462 re = getre(tp, self);
463 if (re->re_lastok == NULL) {
464 tp_raise(tp_None,
465 tp_string("end() only valid after successful match/search"));
466 }
467
468 if (group.number.val < 0 || group.number.val > RE_NREGS)
469 tp_raise(tp_None, tp_string("IndexError: group index out of range"));
470
471 end = re->re_regs.end[(int)group.number.val];
472
473 return (tp_number(end));
474 }
475
476 /*
477 * FUNC: matchobj.span([group])
478 * self - match object
479 * group - group index
480 * return [start,end] position pair of matched 'group' substring.
481 */
match_obj_span(TP)482 static tp_obj match_obj_span(TP)
483 {
484 tp_obj self = TP_OBJ(); /* match object */
485 tp_obj group = TP_DEFAULT(tp_number(0)); /* group */
486 regexobject *re = NULL;
487 int start;
488 int end;
489 tp_obj result;
490
491 re = getre(tp, self);
492 if (re->re_lastok == NULL) {
493 tp_raise(tp_None,
494 tp_string("span() only valid after successful match/search"));
495 }
496
497 if (group.number.val < 0 || group.number.val > RE_NREGS)
498 tp_raise(tp_None, tp_string("IndexError: group index out of range"));
499
500 start = re->re_regs.start[(int)group.number.val];
501 end = re->re_regs.end[(int)group.number.val];
502
503 result = tp_list(tp);
504 tp_set(tp, result, tp_None, tp_number(start));
505 tp_set(tp, result, tp_None, tp_number(end));
506
507 return (result);
508 }
509
510 /*
511 * compile out a re object
512 * repat - regex pattern
513 * resyn - regex syntax
514 */
regex_compile(TP)515 static tp_obj regex_compile(TP)
516 {
517 const char *error = NULL;
518 char const *pat = NULL;
519 int size = 0;
520 tp_obj reobj_data;
521 tp_obj repat = TP_TYPE(TP_STRING); /* pattern */
522 tp_obj resyn = TP_DEFAULT(tp_number(RE_SYNTAX_EMACS)); /* syntax */
523 tp_obj reobj; /* regex object */
524 regexobject *re;
525
526 /*
527 * create regex object, its parent is builtin 'object'
528 */
529 reobj = tp_object(tp);
530
531 re = (regexobject *)malloc(sizeof(regexobject));
532 if (!re) {
533 error = "malloc lower level regex object failed";
534 goto finally;
535 }
536
537 re->re_patbuf.buffer = NULL;
538 re->re_patbuf.allocated = 0;
539 re->re_patbuf.fastmap = (unsigned char *)re->re_fastmap;
540 re->re_patbuf.translate = NULL;
541 re->re_translate = NULL;
542 re->re_lastok = NULL;
543
544 re->re_errno = 0;
545 re->re_syntax = (int)resyn.number.val;
546
547 pat = repat.string.val;
548 size = repat.string.len;
549 error = re_compile_pattern((unsigned char *)pat, size, &re->re_patbuf);
550 if (error != NULL) {
551 LastError = error;
552 goto finally;
553 }
554
555 /* regexobject's size as magic */
556 reobj_data = tp_data(tp, (int)sizeof(regexobject), re);
557
558 /*
559 * bind to regex object
560 */
561 tp_set(tp, reobj, tp_string("search"),
562 tp_method(tp, reobj, regex_obj_search));
563 tp_set(tp, reobj, tp_string("match"),
564 tp_method(tp, reobj, regex_obj_match));
565 tp_set(tp, reobj, tp_string("split"),
566 tp_method(tp, reobj, regex_obj_split));
567 tp_set(tp, reobj, tp_string("findall"),
568 tp_method(tp, reobj, regex_obj_findall));
569 tp_set(tp, reobj, tp_string("__data__"), reobj_data);
570
571 tp_set(tp, reobj, tp_string("__name__"),
572 tp_string("regular expression object"));
573 tp_set(tp, reobj, tp_string("__doc__"), tp_string(
574 "regular expression object, support methods:\n"
575 "search(str[,pos=0])-search 'str' from 'pos'\n"
576 "match(str[,pos=0]) -match 'str' from 'pos'\n"
577 ));
578
579 return (reobj);
580
581 finally:
582 tp_raise(tp_None, tp_string(error));
583 }
584
585 /*
586 * module level search()
587 */
regex_search(TP)588 static tp_obj regex_search(TP)
589 {
590 tp_obj repat = TP_OBJ(); /* pattern */
591 tp_obj restr = TP_OBJ(); /* string */
592 tp_obj resyn = TP_DEFAULT(tp_number(RE_SYNTAX_EMACS));
593 tp_obj reobj; /* regex object */
594 tp_obj maobj; /* match object */
595
596 /* compile out regex object */
597 tp_params_v(tp, 2, repat, resyn);
598 reobj = regex_compile(tp);
599
600 /* call r.search() */
601 tp_params_v(tp, 3, reobj, restr, tp_number(0));
602 maobj = regex_obj_search(tp);
603
604 return (maobj);
605 }
606
607 /*
608 * module level match()
609 */
regex_match(TP)610 static tp_obj regex_match(TP)
611 {
612 tp_obj repat = TP_OBJ(); /* pattern */
613 tp_obj restr = TP_OBJ(); /* string */
614 tp_obj resyn = TP_DEFAULT(tp_number(RE_SYNTAX_EMACS));
615 tp_obj reobj; /* regex object */
616 tp_obj maobj; /* match object */
617
618 /* compile out regex object */
619 tp_params_v(tp, 2, repat, resyn);
620 reobj = regex_compile(tp);
621
622 /* call r.search() */
623 tp_params_v(tp, 3, reobj, restr, tp_number(0));
624 maobj = regex_obj_match(tp);
625
626 return (maobj);
627 }
628
629 /*
630 * module level split()
631 * repat - regex pattern
632 * restr - regex string
633 * maxsplit - max split field, default 0, mean no limit
634 */
regex_split(TP)635 static tp_obj regex_split(TP)
636 {
637 tp_obj repat = TP_OBJ(); /* pattern */
638 tp_obj restr = TP_OBJ(); /* string */
639 tp_obj maxsplit = TP_DEFAULT(tp_number(0));
640 tp_obj reobj; /* regex object */
641
642 /* generate a temp regex object */
643 tp_params_v(tp, 2, repat, tp_number(RE_SYNTAX_EMACS));
644 reobj = regex_compile(tp);
645
646 tp_params_v(tp, 3, reobj, restr, maxsplit);
647 return regex_obj_split(tp);
648 }
649
650 /*
651 * module level findall()
652 * repat - regex pattern
653 * restr - regex string
654 * resyn - regex syntax, optional, default RE_SYNTAX_EMAC
655 */
regex_findall(TP)656 static tp_obj regex_findall(TP)
657 {
658 tp_obj repat = TP_OBJ(); /* pattern */
659 tp_obj restr = TP_OBJ(); /* string */
660 tp_obj resyn = TP_DEFAULT(tp_number(RE_SYNTAX_EMACS));
661 tp_obj reobj; /* regex object */
662
663 /* generate a temp regex object */
664 tp_params_v(tp, 2, repat, resyn);
665 reobj = regex_compile(tp);
666
667 tp_params_v(tp, 2, reobj, restr);
668 return regex_obj_findall(tp);
669 }
670
671
672 /*
673 * re mod can only support 'set_syntax', 'get_syntax', and 'compile' functions,
674 * 'compile' function will return a 'reobj', and this 'reobj' will support
675 * methods 'search', 'match', 'group', 'groupall', el al.
676 */
re_init(TP)677 void re_init(TP)
678 {
679 /*
680 * module dict for re
681 */
682 tp_obj re_mod = tp_dict(tp);
683
684 /*
685 * bind to re module
686 */
687 tp_set(tp, re_mod, tp_string("compile"), tp_fnc(tp, regex_compile));
688 tp_set(tp, re_mod, tp_string("search"), tp_fnc(tp, regex_search));
689 tp_set(tp, re_mod, tp_string("match"), tp_fnc(tp, regex_match));
690 tp_set(tp, re_mod, tp_string("split"), tp_fnc(tp, regex_split));
691 tp_set(tp, re_mod, tp_string("findall"), tp_fnc(tp, regex_findall));
692 tp_set(tp, re_mod, tp_string("AWK_SYNTAX"), tp_number(RE_SYNTAX_AWK));
693 tp_set(tp, re_mod, tp_string("EGREP_SYNTAX"), tp_number(RE_SYNTAX_EGREP));
694 tp_set(tp, re_mod, tp_string("GREP_SYNTAX"), tp_number(RE_SYNTAX_GREP));
695 tp_set(tp, re_mod, tp_string("EMACS_SYNTAX"), tp_number(RE_SYNTAX_EMACS));
696
697 /*
698 * bind special attibutes to re module
699 */
700 tp_set(tp, re_mod, tp_string("__name__"),
701 tp_string("regular expression module"));
702 tp_set(tp, re_mod, tp_string("__file__"), tp_string(__FILE__));
703 tp_set(tp, re_mod, tp_string("__doc__"),
704 tp_string("simple regular express implementation"));
705
706 /*
707 * bind regex module to tinypy modules[]
708 */
709 tp_set(tp, tp->modules, tp_string("re"), re_mod);
710 }
711
712