1 /*
2 mxTextTools -- Fast text manipulation routines
3
4 Copyright (c) 2000, Marc-Andre Lemburg; mailto:mal@lemburg.com
5 Copyright (c) 2000-2002, eGenix.com Software GmbH; mailto:info@egenix.com
6 */
7
8 /* We want all our symbols to be exported */
9 #ifndef MX_BUILDING_MXTEXTTOOLS
10 #define MX_BUILDING_MXTEXTTOOLS
11 #endif
12
13 /* Logging file used by debugging facility */
14 #ifndef MAL_DEBUG_OUTPUTFILE
15 # define MAL_DEBUG_OUTPUTFILE "mxTextTools.log"
16 #endif
17
18 #include "mx.h"
19 #include "mxTextTools.h"
20 #include "structmember.h"
21 #include <ctype.h>
22
23 #define VERSION "2.1.0"
24
25 /* Initial list size used by e.g. setsplit(), setsplitx(),... */
26 #define INITIAL_LIST_SIZE 64
27
/* Maximum TagTable cache size. If this limit is reached, the cache
   is cleared to make room for newly compiled TagTables. */
30 #define MAX_TAGTABLES_CACHE_SIZE 100
31
32 /* Define this to enable the copy-protocol (__copy__, __deepcopy__) */
33 #define COPY_PROTOCOL
34
35 /* Convenience macro for reducing clutter */
36 #define ADD_INT_CONSTANT(name, value) \
37 if (PyModule_AddIntConstant(module, name, value) < 0) \
38 return NULL;
39
40 /* --- module doc-string -------------------------------------------------- */
41
42 PyDoc_STRVAR(Module_docstring,
43
44 MXTEXTTOOLS_MODULE" -- Tools for fast text processing. Version "VERSION"\n\n"
45
46 "Copyright (c) 1997-2000, Marc-Andre Lemburg; mailto:mal@lemburg.com\n"
47 "Copyright (c) 2000-2002, eGenix.com Software GmbH; mailto:info@egenix.com\n\n"
48 "Copyright (c) 2003-2006, Mike Fletcher; mailto:mcfletch@vrplumber.com\n\n"
49
50 " All Rights Reserved\n\n"
51 "See the documentation for further information on copyrights,\n"
52 "or contact the author.")
53 ;
54
55 /* --- internal macros ---------------------------------------------------- */
56
57 /* --- module globals ----------------------------------------------------- */
58
59 /* Translation strings for the 8-bit versions of lower() and upper() */
60 static PyObject *mx_ToUpper;
61 static PyObject *mx_ToLower;
62
63 static PyObject *mxTextTools_Error; /* mxTextTools specific error */
64
65 static PyObject *mxTextTools_TagTables; /* TagTable cache dictionary */
66
67 /* Flag telling us whether the module was initialized or not. */
68 static int mxTextTools_Initialized = 0;
69
70 /* --- forward declarations ----------------------------------------------- */
71
72 /* --- module helper ------------------------------------------------------ */
73
74 static
mxTextTools_ToUpper(void)75 PyObject *mxTextTools_ToUpper(void)
76 {
77 char tr[256];
78 Py_ssize_t i;
79
80 for (i = 0; i < 256; i++)
81 tr[i] = toupper((char)i);
82 return PyString_FromStringAndSize(tr,sizeof(tr));
83 }
84
85 static
mxTextTools_ToLower(void)86 PyObject *mxTextTools_ToLower(void)
87 {
88 char tr[256];
89 Py_ssize_t i;
90
91 for (i = 0; i < 256; i++)
92 tr[i] = tolower((char)i);
93 return PyString_FromStringAndSize(tr,sizeof(tr));
94 }
95
96 /* Create an exception object, insert it into the module dictionary
97 under the given name and return the object pointer; this is NULL in
98 case an error occurred. base can be given to indicate the base
99 object to be used by the exception object. It should be NULL
100 otherwise */
101
102 /* --- module interface --------------------------------------------------- */
103
104 /* --- Text Search Object ----------------------------------------------*/
105
106 /* allocation */
107
108 static
mxTextSearch_New(PyObject * match,PyObject * translate,int algorithm)109 PyObject *mxTextSearch_New(PyObject *match,
110 PyObject *translate,
111 int algorithm)
112 {
113 mxTextSearchObject *so;
114
115 so = PyObject_NEW(mxTextSearchObject, &mxTextSearch_Type);
116 if (so == NULL)
117 return NULL;
118 so->data = NULL;
119 so->translate = NULL;
120 so->match = NULL;
121
122 Py_INCREF(match);
123 so->match = match;
124
125 if (translate == Py_None)
126 translate = NULL;
127 else if (translate) {
128 Py_Assert(PyString_Check(translate),
129 PyExc_TypeError,
130 "translate table must be a string");
131 Py_Assert(PyString_GET_SIZE(translate) == 256,
132 PyExc_TypeError,
133 "translate string must have exactly 256 chars");
134 Py_INCREF(translate);
135 }
136 so->translate = translate;
137
138 /* Init algorithm */
139 so->algorithm = algorithm;
140 switch (algorithm) {
141
142 case MXTEXTSEARCH_BOYERMOORE:
143 Py_Assert(PyString_Check(match),
144 PyExc_TypeError,
145 "match must be a string for Boyer-Moore");
146 so->data = bm_init(PyString_AS_STRING(match),
147 PyString_GET_SIZE(match));
148 Py_Assert(so->data != NULL,
149 PyExc_TypeError,
150 "error initializing the search object");
151 break;
152
153 case MXTEXTSEARCH_TRIVIAL:
154 Py_Assert(PyString_Check(match) || PyUnicode_Check(match),
155 PyExc_TypeError,
156 "match must be a string or unicode");
157 Py_Assert(so->translate == NULL,
158 PyExc_TypeError,
159 "trivial search algorithm does not support translate");
160 break;
161
162 default:
163 Py_Error(PyExc_ValueError,
164 "unknown or unsupported algorithm");
165
166 }
167 return (PyObject *)so;
168
169 onError:
170 Py_DECREF(so);
171 return NULL;
172 }
173
174 Py_C_Function_WithKeywords(
175 mxTextSearch_TextSearch,
176 "TextSearch(match[,translate=None,algorithm=default_algorithm])\n\n"
177 "Create a substring search object for the string match;\n"
178 "translate is an optional translate-string like the one used\n"
179 "in the module re."
180 )
181 {
182 PyObject *match = 0;
183 PyObject *translate = 0;
184 int algorithm = -424242;
185
186 Py_KeywordsGet3Args("O|Oi:TextSearch",match,translate,algorithm);
187
188 if (algorithm == -424242) {
189 if (PyUnicode_Check(match))
190 algorithm = MXTEXTSEARCH_TRIVIAL;
191 else
192 algorithm = MXTEXTSEARCH_BOYERMOORE;
193 }
194 return mxTextSearch_New(match, translate, algorithm);
195
196 onError:
197 return NULL;
198 }
199
200 static
mxTextSearch_Free(mxTextSearchObject * so)201 void mxTextSearch_Free(mxTextSearchObject *so)
202 {
203 if (so->data) {
204 switch (so->algorithm) {
205
206 case MXTEXTSEARCH_BOYERMOORE:
207 bm_free(so->data);
208 break;
209
210 case MXTEXTSEARCH_TRIVIAL:
211 break;
212
213 }
214 }
215 Py_XDECREF(so->match);
216 Py_XDECREF(so->translate);
217 PyObject_Del(so);
218 }
219
220 /* C APIs */
221
222 #define so ((mxTextSearchObject *)self)
223
224 /* Get the match length from an TextSearch object or -1 in case of an
225 error. */
226
mxTextSearch_MatchLength(PyObject * self)227 Py_ssize_t mxTextSearch_MatchLength(PyObject *self)
228 {
229 Py_Assert(mxTextSearch_Check(self),
230 PyExc_TypeError,
231 "expected a TextSearch object");
232
233 switch (so->algorithm) {
234
235 case MXTEXTSEARCH_BOYERMOORE:
236 return BM_MATCH_LEN(so->data);
237 break;
238
239 case MXTEXTSEARCH_TRIVIAL:
240 if (PyString_Check(so->match))
241 return PyString_GET_SIZE(so->match);
242 #ifdef HAVE_UNICODE
243 else if (PyUnicode_Check(so->match))
244 return PyUnicode_GET_SIZE(so->match);
245 #endif
246 break;
247
248 }
249
250 Py_Error(mxTextTools_Error,
251 "internal error");
252
253 onError:
254 return -1;
255 }
256
/* Brute-force substring search in text[start:stop].

   Each candidate position is compared against match from right to
   left.

   Returns the index just past the first occurrence of match (i.e.
   position + match_len), or start if match does not occur in the slice
   or match_len is not positive.

   Indexing is used instead of walking pointers so that no pointer
   before the beginning of text or match is ever formed. */
static
Py_ssize_t trivial_search(const char *text,
                          Py_ssize_t start,
                          Py_ssize_t stop,
                          const char *match,
                          Py_ssize_t match_len)
{
    Py_ssize_t pos;

    if (match_len <= 0)
        return start;

    /* A candidate at pos needs text[pos .. pos+match_len-1] to lie
       inside the slice */
    for (pos = start; pos + match_len <= stop; pos++) {
        Py_ssize_t j = match_len - 1;

        /* scan from right to left */
        while (j >= 0 && text[pos + j] == match[j])
            j--;

        if (j < 0)
            /* found */
            return pos + match_len;
    }

    /* reached end of slice: no match */
    return start;
}
295
296 #ifdef HAVE_UNICODE
/* Brute-force substring search in the Py_UNICODE buffer
   text[start:stop].

   Each candidate position is compared against match from right to
   left.

   Returns the index just past the first occurrence of match (i.e.
   position + match_len), or start if match does not occur in the slice
   or match_len is not positive.

   Indexing is used instead of walking pointers so that no pointer
   before the beginning of text or match is ever formed. */
static
Py_ssize_t trivial_unicode_search(const Py_UNICODE *text,
                                  Py_ssize_t start,
                                  Py_ssize_t stop,
                                  const Py_UNICODE *match,
                                  Py_ssize_t match_len)
{
    Py_ssize_t pos;

    if (match_len <= 0)
        return start;

    /* A candidate at pos needs text[pos .. pos+match_len-1] to lie
       inside the slice */
    for (pos = start; pos + match_len <= stop; pos++) {
        Py_ssize_t j = match_len - 1;

        /* scan from right to left */
        while (j >= 0 && text[pos + j] == match[j])
            j--;

        if (j < 0)
            /* found */
            return pos + match_len;
    }

    /* reached end of slice: no match */
    return start;
}
335 #endif
336
337 /* Search for the match in text[start:stop].
338
339 Returns 1 in case a match was found and sets sliceleft, sliceright
340 to the matching slice.
341
342 Returns 0 in case no match was found and -1 in case of an error.
343
344 */
345
mxTextSearch_SearchBuffer(PyObject * self,char * text,Py_ssize_t start,Py_ssize_t stop,Py_ssize_t * sliceleft,Py_ssize_t * sliceright)346 Py_ssize_t mxTextSearch_SearchBuffer(PyObject *self,
347 char *text,
348 Py_ssize_t start,
349 Py_ssize_t stop,
350 Py_ssize_t *sliceleft,
351 Py_ssize_t *sliceright)
352 {
353 Py_ssize_t nextpos;
354 Py_ssize_t match_len;
355
356 Py_Assert(mxTextSearch_Check(self),
357 PyExc_TypeError,
358 "expected a TextSearch object");
359
360 switch (so->algorithm) {
361
362 case MXTEXTSEARCH_BOYERMOORE:
363 if (so->translate) {
364 /* search with translate table */
365 nextpos = bm_tr_search((mxbmse_data *)so->data,
366 text,
367 start,
368 stop,
369 PyString_AS_STRING(so->translate));
370 }
371 else {
372 /* exact search */
373 nextpos = bm_search((mxbmse_data *)so->data,
374 text,
375 start,
376 stop);
377 }
378 match_len = BM_MATCH_LEN(so->data);
379 break;
380
381 case MXTEXTSEARCH_TRIVIAL:
382 {
383 const char *match;
384
385 if (PyString_Check(so->match)) {
386 match = PyString_AS_STRING(so->match);
387 match_len = PyString_GET_SIZE(so->match);
388 }
389 else if (PyObject_AsCharBuffer(so->match, &match, &match_len))
390 goto onError;
391 nextpos = trivial_search(text,
392 start,
393 stop,
394 match,
395 match_len);
396 }
397 break;
398
399 default:
400 Py_Error(mxTextTools_Error,
401 "unknown algorithm type in mxTextSearch_SearchBuffer");
402
403 }
404 /* Found ? */
405 if (nextpos != start) {
406 if (sliceleft)
407 *sliceleft = nextpos - match_len;
408 if (sliceright)
409 *sliceright = nextpos;
410 return 1;
411 }
412 /* Not found */
413 return 0;
414
415 onError:
416 return -1;
417 }
418
419 #ifdef HAVE_UNICODE
/* Unicode counterpart of mxTextSearch_SearchBuffer(): search for the
   match in the Py_UNICODE buffer text[start:stop].

   Returns 1 if a match was found (the slice is written to *sliceleft
   and *sliceright where those pointers are not NULL), 0 if no match
   was found and -1 in case of an error.  Only the trivial algorithm is
   supported here. */
Py_ssize_t mxTextSearch_SearchUnicode(PyObject *self,
                                      Py_UNICODE *text,
                                      Py_ssize_t start,
                                      Py_ssize_t stop,
                                      Py_ssize_t *sliceleft,
                                      Py_ssize_t *sliceright)
{
    Py_ssize_t nextpos;
    Py_ssize_t match_len;

    Py_Assert(mxTextSearch_Check(self),
              PyExc_TypeError,
              "expected a TextSearch object");

    if (so->algorithm == MXTEXTSEARCH_BOYERMOORE) {
        Py_Error(PyExc_TypeError,
                 "Boyer-Moore search algorithm does not support Unicode");
    }
    else if (so->algorithm == MXTEXTSEARCH_TRIVIAL) {
        /* Temporary Unicode version of a non-Unicode match object;
           stays NULL when the match can be used as it is */
        PyObject *converted = NULL;
        Py_UNICODE *match;

        if (!PyUnicode_Check(so->match)) {
            converted = PyUnicode_FromEncodedObject(so->match, NULL, NULL);
            if (converted == NULL)
                goto onError;
            match = PyUnicode_AS_UNICODE(converted);
            match_len = PyUnicode_GET_SIZE(converted);
        }
        else {
            match = PyUnicode_AS_UNICODE(so->match);
            match_len = PyUnicode_GET_SIZE(so->match);
        }
        nextpos = trivial_unicode_search(text, start, stop,
                                         match, match_len);
        Py_XDECREF(converted);
    }
    else {
        Py_Error(mxTextTools_Error,
                 "unknown algorithm type in mxTextSearch_SearchUnicode");
    }

    /* The search routine hands back start when nothing was found */
    if (nextpos == start)
        return 0;

    /* Found: nextpos is the index just past the match */
    if (sliceleft)
        *sliceleft = nextpos - match_len;
    if (sliceright)
        *sliceright = nextpos;
    return 1;

 onError:
    return -1;
}
486 #endif
487
488 /* methods */
489
490 Py_C_Function( mxTextSearch_search,
491 "TextSearch.search(text,start=0,stop=len(text))\n\n"
492 "Search for the substring in text, looking only at the\n"
493 "slice [start:stop] and return the slice (l,r)\n"
494 "where the substring was found, (start,start) otherwise.")
495 {
496 PyObject *text;
497 Py_ssize_t start = 0;
498 Py_ssize_t stop = INT_MAX;
499 Py_ssize_t sliceleft, sliceright;
500 int rc;
501
502 Py_Get3Args("O|ii:TextSearch.search",
503 text,start,stop);
504
505 if (PyString_Check(text)) {
506 Py_CheckStringSlice(text, start, stop);
507 rc = mxTextSearch_SearchBuffer(self,
508 PyString_AS_STRING(text),
509 start,
510 stop,
511 &sliceleft,
512 &sliceright);
513 }
514 #ifdef HAVE_UNICODE
515 else if (PyUnicode_Check(text)) {
516 Py_CheckUnicodeSlice(text, start, stop);
517 rc = mxTextSearch_SearchUnicode(self,
518 PyUnicode_AS_UNICODE(text),
519 start,
520 stop,
521 &sliceleft,
522 &sliceright);
523 }
524 #endif
525 else
526 Py_Error(PyExc_TypeError,
527 "expected string or unicode");
528 if (rc < 0)
529 goto onError;
530 if (rc == 0) {
531 sliceleft = start;
532 sliceright = start;
533 }
534
535 /* Return the slice */
536 Py_Return2("ii", sliceleft, sliceright);
537
538 onError:
539 return NULL;
540 }
541
542 Py_C_Function( mxTextSearch_find,
543 "TextSearch.find(text,start=0,stop=len(text))\n\n"
544 "Search for the substring in text, looking only at the\n"
545 "slice [start:stop] and return the index\n"
546 "where the substring was found, -1 otherwise.")
547 {
548 PyObject *text;
549 Py_ssize_t start = 0;
550 Py_ssize_t stop = INT_MAX;
551 Py_ssize_t sliceleft, sliceright;
552 int rc;
553
554 Py_Get3Args("O|ii:TextSearch.find",
555 text,start,stop);
556
557 if (PyString_Check(text)) {
558 Py_CheckStringSlice(text, start, stop);
559 rc = mxTextSearch_SearchBuffer(self,
560 PyString_AS_STRING(text),
561 start,
562 stop,
563 &sliceleft,
564 &sliceright);
565 }
566 #ifdef HAVE_UNICODE
567 else if (PyUnicode_Check(text)) {
568 Py_CheckUnicodeSlice(text, start, stop);
569 rc = mxTextSearch_SearchUnicode(self,
570 PyUnicode_AS_UNICODE(text),
571 start,
572 stop,
573 &sliceleft,
574 &sliceright);
575 }
576 #endif
577 else
578 Py_Error(PyExc_TypeError,
579 "expected string or unicode");
580 if (rc < 0)
581 goto onError;
582 if (rc == 0)
583 sliceleft = -1;
584 return PyInt_FromLong(sliceleft);
585
586 onError:
587 return NULL;
588 }
589
590 Py_C_Function( mxTextSearch_findall,
591 "TextSearch.findall(text,start=0,stop=len(text))\n\n"
592 "Search for the substring in text, looking only at the\n"
593 "slice [start:stop] and return a list of all\n"
594 "non overlapping slices (l,r) in text where the match\n"
595 "string can be found.")
596 {
597 PyObject *text;
598 PyObject *list = 0;
599 Py_ssize_t start = 0;
600 Py_ssize_t stop = INT_MAX;
601 Py_ssize_t stop_index;
602 Py_ssize_t match_len;
603 Py_ssize_t listsize = INITIAL_LIST_SIZE;
604 Py_ssize_t listitem = 0;
605
606 Py_Get3Args("O|ii:TextSearch.findall",
607 text,start,stop);
608
609 if (PyString_Check(text)) {
610 Py_CheckStringSlice(text, start, stop);
611 }
612 #ifdef HAVE_UNICODE
613 else if (PyUnicode_Check(text)) {
614 Py_CheckUnicodeSlice(text, start, stop);
615 }
616 #endif
617 else
618 Py_Error(PyExc_TypeError,
619 "expected string or unicode");
620
621 list = PyList_New(listsize);
622 if (!list)
623 goto onError;
624
625 match_len = mxTextSearch_MatchLength(self);
626 if (match_len < 0)
627 goto onError;
628 stop_index = stop - match_len;
629
630 while (start <= stop_index) {
631 register PyObject *t,*v;
632 int rc;
633 Py_ssize_t sliceleft, sliceright;
634
635 /* exact search */
636 if (PyString_Check(text))
637 rc = mxTextSearch_SearchBuffer(self,
638 PyString_AS_STRING(text),
639 start,
640 stop,
641 &sliceleft,
642 &sliceright);
643 #ifdef HAVE_UNICODE
644 else if (PyUnicode_Check(text))
645 rc = mxTextSearch_SearchUnicode(self,
646 PyUnicode_AS_UNICODE(text),
647 start,
648 stop,
649 &sliceleft,
650 &sliceright);
651 #endif
652 else
653 break;
654 if (rc < 0)
655 goto onError;
656 if (rc == 0)
657 break;
658
659 /* Build slice and append to list */
660 t = PyTuple_New(2);
661 if (!t)
662 goto onError;
663 v = PyInt_FromLong(sliceleft);
664 if (!v)
665 goto onError;
666 PyTuple_SET_ITEM(t,0,v);
667 v = PyInt_FromLong(sliceright);
668 if (!v)
669 goto onError;
670 PyTuple_SET_ITEM(t,1,v);
671
672 if (listitem < listsize)
673 PyList_SET_ITEM(list, listitem, t);
674 else {
675 PyList_Append(list, t);
676 Py_DECREF(t);
677 }
678 listitem++;
679
680 start = sliceright;
681 }
682
683 /* Resize list if necessary */
684 if (listitem < listsize)
685 PyList_SetSlice(list, listitem, listsize, (PyObject*)NULL);
686
687 return list;
688
689 onError:
690 Py_XDECREF(list);
691 return NULL;
692 }
693
694 #ifdef COPY_PROTOCOL
695 Py_C_Function( mxTextSearch_copy,
696 "copy([memo])\n\n"
697 "Return a new reference for the instance. This function\n"
698 "is used for the copy-protocol. Real copying doesn't take\n"
699 "place, since the instances are immutable.")
700 {
701 PyObject *memo;
702
703 Py_GetArg("|O",memo);
704 Py_INCREF(so);
705 return (PyObject *)so;
706 onError:
707 return NULL;
708 }
709 #endif
710
711 #undef so
712
713 /* --- slots --- */
714
715 static
mxTextSearch_Repr(mxTextSearchObject * self)716 PyObject *mxTextSearch_Repr(mxTextSearchObject *self)
717 {
718 char *algoname;
719 PyObject *v;
720 char t[500], *reprstr;
721
722 v = PyObject_Repr(self->match);
723 if (v == NULL)
724 return NULL;
725 reprstr = PyString_AsString(v);
726 if (reprstr == NULL)
727 return NULL;
728
729 switch (self->algorithm) {
730 case MXTEXTSEARCH_BOYERMOORE:
731 algoname = "Boyer-Moore";
732 break;
733 case MXTEXTSEARCH_TRIVIAL:
734 algoname = "Trivial";
735 break;
736 default:
737 algoname = "";
738 }
739
740 sprintf(t, "<%.50s TextSearch object for %.400s at 0x%lx>",
741 algoname, reprstr, (long)self);
742 Py_DECREF(v);
743 return PyString_FromString(t);
744 }
745
746 /* Python Method Table */
747
748 static
749 PyMethodDef mxTextSearch_Methods[] =
750 {
751 Py_MethodListEntry("search",mxTextSearch_search),
752 Py_MethodListEntry("find",mxTextSearch_find),
753 Py_MethodListEntry("findall",mxTextSearch_findall),
754 #ifdef COPY_PROTOCOL
755 Py_MethodListEntry("__deepcopy__",mxTextSearch_copy),
756 Py_MethodListEntry("__copy__",mxTextSearch_copy),
757 #endif
758 {NULL,NULL} /* end of list */
759 };
760
761 static PyMemberDef mxTextSearch_members[] = {
762 {"match",T_OBJECT_EX,offsetof(mxTextSearchObject,match),READONLY,"Text that this search matches"},
763 {"translate",T_OBJECT,offsetof(mxTextSearchObject,translate),READONLY,"Translated search term"},
764 {"algorithm",T_INT,offsetof(mxTextSearchObject,algorithm),READONLY,"Algorithm in use by the text search"},
765 {NULL}
766 };
767
768 /* Python Type Table */
769
770 PyTypeObject mxTextSearch_Type = {
771 PyVarObject_HEAD_INIT(NULL, 0) /* init at startup ! */
772 "TextSearch", /*tp_name*/
773 sizeof(mxTextSearchObject), /*tp_basicsize*/
774 0, /*tp_itemsize*/
775 /* methods */
776 (destructor)mxTextSearch_Free, /*tp_dealloc*/
777 (printfunc)0, /*tp_print*/
778 (getattrfunc)0, /*tp_getattr*/
779 (setattrfunc)0, /*tp_setattr*/
780 0, /*tp_compare*/
781 (reprfunc)mxTextSearch_Repr, /*tp_repr*/
782 0, /*tp_as_number*/
783 0, /*tp_as_number*/
784 0, /*tp_as_mapping*/
785 (hashfunc)0, /*tp_hash*/
786 (ternaryfunc)0, /*tp_call*/
787 (reprfunc)0, /*tp_str*/
788 (getattrofunc)0, /*tp_getattro*/
789 (setattrofunc)0, /*tp_setattro*/
790 0, /*tp_asbuffer*/
791 Py_TPFLAGS_DEFAULT, /*tp_flags*/
792 "mxTextTools text-search object", /*tp_doc*/
793 0, /*tp_traverse*/
794 0, /*tp_clear*/
795 0, /*tp_richcompare*/
796 0, /*tp_weaklistoffset*/
797 0, /*tp_iter*/
798 0, /*tp_iternext*/
799 mxTextSearch_Methods, /*tp_methods*/
800 mxTextSearch_members, /*tp_members*/
801 };
802
803 /* --- Character Set Object --------------------------------------------*/
804
805 /* internal */
806
807 /* 8-bit character sets are implemented using a simple 32-byte
808 long bitmap with one bit per character.
809
810 Addressing is done as follows:
811
812 def char_is_set(ordinal):
813 return bitmap[ordinal >> 3] & (1 << (ordinal & 7))
814
815 */
816
817 #define STRING_CHARSET_SIZE 256
818 #define STRING_CHARSET_BITMAP_SIZE (STRING_CHARSET_SIZE / 8)
819
820 typedef struct {
821 unsigned char bitmap[STRING_CHARSET_BITMAP_SIZE];
822 /* character bitmap */
823 } string_charset;
824
825 static
init_string_charset(mxCharSetObject * cs,PyObject * definition)826 int init_string_charset(mxCharSetObject *cs,
827 PyObject *definition)
828 {
829 register Py_ssize_t i, j;
830 char *def = PyString_AS_STRING(definition);
831 const Py_ssize_t len = PyString_GET_SIZE(definition);
832 string_charset *lookup = 0;
833 register unsigned char *bitmap;
834 int logic = 1;
835
836 /* Handle logic change (first char is '^' for negative matching) */
837 if (len > 0 && def[0] == '^') {
838 logic = 0;
839 i = 1;
840 }
841 else
842 i = 0;
843
844 /* Build 32-byte lookup bitmap (one bit per character) */
845 lookup = (string_charset *)PyMem_Malloc(sizeof(string_charset));
846 if (lookup == NULL) {
847 PyErr_NoMemory();
848 goto onError;
849 }
850 memset(lookup, 0, sizeof(string_charset));
851 cs->mode = MXCHARSET_8BITMODE;
852 cs->lookup = (void *)lookup;
853 bitmap = lookup->bitmap;
854
855 for (; i < len; i++) {
856
857 /* Handle escapes: "b\-d", "\\" */
858 if (def[i] == '\\') {
859 if (i < len - 1 && def[i+1] == '\\') {
860 j = (unsigned char)'\\';
861 bitmap[j >> 3] |= 1 << (j & 7);
862 i++;
863 }
864 continue;
865 }
866
867 /* Handle ranges: "b-d", "\\-z", "\--z" */
868 if (i < len - 2 && def[i+1] == '-') {
869 unsigned char range_left = def[i];
870 unsigned char range_right = def[i+2];
871 for (j = range_left; j <= range_right; j++)
872 bitmap[j >> 3] |= 1 << (j & 7);
873 i++;
874 continue;
875 }
876
877 /* Normal processing */
878 j = (unsigned char)def[i];
879 bitmap[j >> 3] |= 1 << (j & 7);
880 }
881
882 /* Invert bitmap if negative matching is requested */
883 if (!logic) {
884 DPRINTF("init_string_charset: inverting bitmap\n");
885 for (i = 0; i < STRING_CHARSET_BITMAP_SIZE; i++)
886 bitmap[i] ^= 0xFF;
887 }
888
889 return 0;
890
891 onError:
892 if (lookup)
893 PyMem_Free((void *)lookup);
894 cs->lookup = 0;
895 return -1;
896 }
897
898 #ifdef HAVE_UNICODE
899
900 /* Unicode character sets are implemented using two step indexing
901 which is a good compromise between lookup speed and memory usage.
902
903 Lookup is done using a variable length array of 32-byte bitmap
904 blocks. There can be 256 such blocks. Identical blocks are
905 collapsed into a single copy.
906
907 Addressing is done as follows:
908
909 def char_is_set(ordinal):
910 index = bitmapindex[ordinal >> 8]
911 bitmap = bitmaps[index]
912 return bitmap[(ordinal >> 3) & 31] & (1 << (ordinal & 7))
913
   The technique used here is very similar to what is done in Python's
   SRE (see the BIGCHARSET patch by Martin von Loewis). Compression
   should be reasonably good since character sets in practice usually
   only contain a few single characters or longer ranges of Unicode
   characters.
919
920 */
921
922 #define UNICODE_CHARSET_SIZE 65536
923 #define UNICODE_CHARSET_BITMAP_SIZE 32
924 #define UNICODE_CHARSET_BITMAPS (UNICODE_CHARSET_SIZE / (UNICODE_CHARSET_BITMAP_SIZE * 8))
925 #define UNICODE_CHARSET_BIGMAP_SIZE (UNICODE_CHARSET_SIZE / 8)
926
927 typedef struct {
928 unsigned char bitmapindex[UNICODE_CHARSET_BITMAPS];
929 /* Index to char bitmaps */
930 unsigned char bitmaps[UNICODE_CHARSET_BITMAPS][UNICODE_CHARSET_BITMAP_SIZE];
931 /* Variable length bitmap array */
932 } unicode_charset;
933
934 static
init_unicode_charset(mxCharSetObject * cs,PyObject * definition)935 int init_unicode_charset(mxCharSetObject *cs,
936 PyObject *definition)
937 {
938 register Py_ssize_t i, j;
939 Py_UNICODE *def = PyUnicode_AS_UNICODE(definition);
940 const Py_ssize_t len = PyUnicode_GET_SIZE(definition);
941 unicode_charset *lookup = 0;
942 unsigned char bigmap[UNICODE_CHARSET_BIGMAP_SIZE];
943 Py_ssize_t blocks;
944 int logic = 1;
945
946 /* Handle logic change (first char is '^' for negative matching) */
947 if (len > 0 && def[0] == '^') {
948 logic = 0;
949 i = 1;
950 }
951 else
952 i = 0;
953
954 /* Build bigmap */
955 memset(bigmap, 0, sizeof(bigmap));
956 for (; i < len; i++) {
957
958 /* Handle escapes: "b\-d", "\\" */
959 if (def[i] == '\\') {
960 if (i < len - 1 && def[i+1] == '\\') {
961 j = (int)'\\';
962 bigmap[j >> 3] |= 1 << (j & 7);
963 i++;
964 }
965 continue;
966 }
967
968 /* Handle ranges: "b-d", "\\-z", "\--z" */
969 if (i < len - 2 && def[i+1] == '-') {
970 Py_UNICODE range_left = def[i];
971 Py_UNICODE range_right = def[i+2];
972 if (range_right >= UNICODE_CHARSET_SIZE) {
973 Py_Error(PyExc_ValueError,
974 "unicode ordinal out of supported range");
975 }
976 for (j = range_left; j <= range_right; j++)
977 bigmap[j >> 3] |= 1 << (j & 7);
978 i++;
979 continue;
980 }
981
982 /* Normal processing */
983 j = def[i];
984 if (j >= UNICODE_CHARSET_SIZE) {
985 Py_Error(PyExc_ValueError,
986 "unicode ordinal out of supported range");
987 }
988 bigmap[j >> 3] |= 1 << (j & 7);
989 }
990
991 /* Build lookup table
992
993 XXX Could add dynamic resizing here... probably not worth it
994 though, since sizeof(unicode_charset) isn't all that large.
995
996 */
997 lookup = (unicode_charset *)PyMem_Malloc(sizeof(unicode_charset));
998 if (lookup == NULL) {
999 PyErr_NoMemory();
1000 goto onError;
1001 }
1002 blocks = 0;
1003 for (i = UNICODE_CHARSET_BITMAPS - 1; i >= 0; i--) {
1004 unsigned char *block = &bigmap[i << 5];
1005 for (j = blocks - 1; j >= 0; j--)
1006 if (memcmp(lookup->bitmaps[j], block,
1007 UNICODE_CHARSET_BITMAP_SIZE) == 0)
1008 break;
1009 if (j < 0) {
1010 j = blocks;
1011 DPRINTF("init_unicode_charset: Creating new block %i for %i\n",
1012 j, i);
1013 memcpy(lookup->bitmaps[j], block, UNICODE_CHARSET_BITMAP_SIZE);
1014 blocks++;
1015 }
1016 else
1017 DPRINTF("init_unicode_charset: Reusing block %i for %i\n", j, i);
1018 lookup->bitmapindex[i] = j;
1019 }
1020 DPRINTF("init_unicode_charset: Map size: %i block(s) = %i bytes\n",
1021 blocks, UNICODE_CHARSET_BITMAPS +
1022 blocks * UNICODE_CHARSET_BITMAP_SIZE);
1023 lookup = (unicode_charset *)PyMem_Realloc(lookup,
1024 UNICODE_CHARSET_BITMAPS
1025 + blocks * UNICODE_CHARSET_BITMAP_SIZE);
1026 if (lookup == NULL) {
1027 PyErr_NoMemory();
1028 goto onError;
1029 }
1030
1031 /* Invert bitmaps if negative matching is requested */
1032 if (!logic) {
1033 register unsigned char *bitmap = &lookup->bitmaps[0][0];
1034 DPRINTF("init_unicode_charset: inverting bitmaps\n");
1035 for (i = 0; i < blocks * UNICODE_CHARSET_BITMAP_SIZE; i++)
1036 bitmap[i] ^= 0xFF;
1037 }
1038
1039 cs->mode = MXCHARSET_UCS2MODE;
1040 cs->lookup = (void *)lookup;
1041 return 0;
1042
1043 onError:
1044 if (lookup)
1045 PyMem_Free((void *)lookup);
1046 cs->lookup = 0;
1047 return -1;
1048 }
1049
1050 #endif
1051
1052 /* allocation */
1053
1054 static
mxCharSet_New(PyObject * definition)1055 PyObject *mxCharSet_New(PyObject *definition)
1056 {
1057 mxCharSetObject *cs;
1058
1059 cs = PyObject_NEW(mxCharSetObject, &mxCharSet_Type);
1060 if (cs == NULL)
1061 return NULL;
1062 Py_INCREF(definition);
1063 cs->definition = definition;
1064 cs->lookup = NULL;
1065 cs->mode = -1;
1066
1067 if (PyString_Check(definition)) {
1068 if (init_string_charset(cs, definition))
1069 goto onError;
1070 }
1071 #ifdef HAVE_UNICODE
1072 else if (PyUnicode_Check(definition)) {
1073 if (init_unicode_charset(cs, definition))
1074 goto onError;
1075 }
1076 #endif
1077 else
1078 Py_Error(PyExc_TypeError,
1079 "character set definition must be string or unicode");
1080
1081 return (PyObject *)cs;
1082
1083 onError:
1084 Py_DECREF(cs);
1085 return NULL;
1086 }
1087
1088 Py_C_Function( mxCharSet_CharSet,
1089 "CharSet(definition)\n\n"
1090 "Create a character set matching object from the string"
1091 )
1092 {
1093 PyObject *definition;
1094
1095 Py_GetArg("O:CharSet", definition);
1096 return mxCharSet_New(definition);
1097
1098 onError:
1099 return NULL;
1100 }
1101
1102 static
mxCharSet_Free(mxCharSetObject * cs)1103 void mxCharSet_Free(mxCharSetObject *cs)
1104 {
1105 Py_XDECREF(cs->definition);
1106 if (cs->lookup)
1107 PyMem_Free(cs->lookup);
1108 PyObject_Del(cs);
1109 }
1110
1111 /* C APIs */
1112
1113 #define cs ((mxCharSetObject *)self)
1114
/* Test whether the 8-bit character ch is a member of the character
   set self.

   Returns 1 if it is, 0 if it is not, and -1 in case of an error (self
   is not a CharSet object or has an unsupported mode). */
int mxCharSet_ContainsChar(PyObject *self,
                           register unsigned char ch)
{
    const unsigned char *bitmap;

    if (!mxCharSet_Check(self)) {
        PyErr_BadInternalCall();
        return -1;
    }

    /* Pick the bitmap that covers the ordinals 0-255 */
    if (cs->mode == MXCHARSET_8BITMODE) {
        bitmap = ((string_charset *)cs->lookup)->bitmap;
    }
#ifdef HAVE_UNICODE
    else if (cs->mode == MXCHARSET_UCS2MODE) {
        unicode_charset *lookup = (unicode_charset *)cs->lookup;
        bitmap = lookup->bitmaps[lookup->bitmapindex[0]];
    }
#endif
    else {
        Py_Error(mxTextTools_Error,
                 "unsupported character set mode");
    }

    return ((bitmap[ch >> 3] & (1 << (ch & 7))) != 0);

 onError:
    return -1;
}
1142
1143 #ifdef HAVE_UNICODE
1144
/* Test whether the Unicode character ch is a member of the character
   set self.

   Returns 1 if it is, 0 if it is not, and -1 in case of an error (self
   is not a CharSet object or has an unsupported mode).

   Ordinals outside the range covered by the lookup table (>= 256 in
   8-bit mode, >= UNICODE_CHARSET_SIZE in UCS2 mode) are reported as
   not being members. */
int mxCharSet_ContainsUnicodeChar(PyObject *self,
                                  register Py_UNICODE ch)
{
    if (!mxCharSet_Check(self)) {
        PyErr_BadInternalCall();
        goto onError;
    }

    if (cs->mode == MXCHARSET_8BITMODE) {
        unsigned char *bitmap = ((string_charset *)cs->lookup)->bitmap;
        if (ch >= 256)
            return 0;
        return ((bitmap[ch >> 3] & (1 << (ch & 7))) != 0);
    }
    else if (cs->mode == MXCHARSET_UCS2MODE) {
        unicode_charset *lookup = (unicode_charset *)cs->lookup;
        unsigned char *bitmap;

        /* bitmapindex only has entries for ordinals below
           UNICODE_CHARSET_SIZE.  Where Py_UNICODE is wider than 16
           bits, ch >> 8 could otherwise index past the end of the
           table.  NOTE(review): such characters are reported as "not
           in set" even for negated ('^') definitions -- confirm this
           is the wanted policy. */
        if ((unsigned long)ch >= UNICODE_CHARSET_SIZE)
            return 0;
        bitmap = lookup->bitmaps[lookup->bitmapindex[ch >> 8]];
        return ((bitmap[(ch >> 3) & 31] & (1 << (ch & 7))) != 0);
    }
    else {
        Py_Error(mxTextTools_Error,
                 "unsupported character set mode");
    }

 onError:
    return -1;
}
1172
1173 #endif
1174
1175 static
mxCharSet_Contains(PyObject * self,PyObject * other)1176 int mxCharSet_Contains(PyObject *self,
1177 PyObject *other)
1178 {
1179 if (PyString_Check(other)) {
1180 Py_Assert(PyString_GET_SIZE(other) == 1,
1181 PyExc_TypeError,
1182 "expected a single character");
1183 return mxCharSet_ContainsChar(self, PyString_AS_STRING(other)[0]);
1184 }
1185 #ifdef HAVE_UNICODE
1186 else if (PyUnicode_Check(other)) {
1187 Py_Assert(PyUnicode_GET_SIZE(other) == 1,
1188 PyExc_TypeError,
1189 "expected a single unicode character");
1190 return mxCharSet_ContainsUnicodeChar(self,
1191 PyUnicode_AS_UNICODE(other)[0]);
1192 }
1193 #endif
1194 else
1195 Py_Error(PyExc_TypeError,
1196 "expected string or unicode character");
1197
1198 onError:
1199 return -1;
1200 }
1201
1202 /* In mode 1, find the position of the first character in text
1203 belonging to set. This may also be stop or start-1 in case no such
1204 character is found during the search (depending on the direction).
1205
1206 In mode 0, find the first character not in set. This may also be
1207 stop or start-1 in case no such character is found during the
1208 search (depending on the direction).
1209
1210 The search is done in the slice start:stop.
1211
1212 -2 is returned in case of an error.
1213
1214 */
1215
1216 static
mxCharSet_FindChar(PyObject * self,unsigned char * text,Py_ssize_t start,Py_ssize_t stop,const int mode,const int direction)1217 int mxCharSet_FindChar(PyObject *self,
1218 unsigned char *text,
1219 Py_ssize_t start,
1220 Py_ssize_t stop,
1221 const int mode,
1222 const int direction)
1223 {
1224 register Py_ssize_t i;
1225 register unsigned int c;
1226 register unsigned int block;
1227 unsigned char *bitmap;
1228
1229 if (!mxCharSet_Check(self)) {
1230 PyErr_BadInternalCall();
1231 goto onError;
1232 }
1233
1234 if (cs->mode == MXCHARSET_8BITMODE)
1235 bitmap = ((string_charset *)cs->lookup)->bitmap;
1236 #ifdef HAVE_UNICODE
1237 else if (cs->mode == MXCHARSET_UCS2MODE) {
1238 unicode_charset *lookup = (unicode_charset *)cs->lookup;
1239 bitmap = lookup->bitmaps[lookup->bitmapindex[0]];
1240 }
1241 #endif
1242 else {
1243 Py_Error(mxTextTools_Error,
1244 "unsupported character set mode");
1245 }
1246
1247 if (direction > 0) {
1248 if (mode)
1249 /* Find first char in set */
1250 for (i = start; i < stop; i++) {
1251 c = text[i];
1252 block = bitmap[c >> 3];
1253 if (block && ((block & (1 << (c & 7))) != 0))
1254 break;
1255 }
1256 else
1257 /* Find first char not in set */
1258 for (i = start; i < stop; i++) {
1259 c = text[i];
1260 block = bitmap[c >> 3];
1261 if (!block || ((block & (1 << (c & 7))) == 0))
1262 break;
1263 }
1264 }
1265 else {
1266 if (mode)
1267 /* Find first char in set, searching from the end */
1268 for (i = stop - 1; i >= start; i--) {
1269 c = text[i];
1270 block = bitmap[c >> 3];
1271 if (block && ((block & (1 << (c & 7))) != 0))
1272 break;
1273 }
1274 else
1275 /* Find first char not in set, searching from the end */
1276 for (i = stop - 1; i >= start; i--) {
1277 c = text[i];
1278 block = bitmap[c >> 3];
1279 if (!block || ((block & (1 << (c & 7))) == 0))
1280 break;
1281 }
1282 }
1283 return i;
1284
1285 onError:
1286 return -2;
1287 }
1288
1289 #ifdef HAVE_UNICODE
1290
1291 static
mxCharSet_FindUnicodeChar(PyObject * self,Py_UNICODE * text,Py_ssize_t start,Py_ssize_t stop,const int mode,const int direction)1292 int mxCharSet_FindUnicodeChar(PyObject *self,
1293 Py_UNICODE *text,
1294 Py_ssize_t start,
1295 Py_ssize_t stop,
1296 const int mode,
1297 const int direction)
1298 {
1299 register int i;
1300 register unsigned int c;
1301 register unsigned int block;
1302 unsigned char *bitmap;
1303
1304 if (!mxCharSet_Check(self)) {
1305 PyErr_BadInternalCall();
1306 goto onError;
1307 }
1308
1309 if (cs->mode == MXCHARSET_8BITMODE) {
1310 bitmap = ((string_charset *)cs->lookup)->bitmap;
1311 if (direction > 0) {
1312 if (mode)
1313 /* Find first char in set */
1314 for (i = start; i < stop; i++) {
1315 c = text[i];
1316 if (c > 256)
1317 continue;
1318 block = bitmap[c >> 3];
1319 if (block && ((block & (1 << (c & 7))) != 0))
1320 break;
1321 }
1322 else
1323 /* Find first char not in set */
1324 for (i = start; i < stop; i++) {
1325 c = text[i];
1326 if (c > 256)
1327 break;
1328 block = bitmap[c >> 3];
1329 if (!block || ((block & (1 << (c & 7))) == 0))
1330 break;
1331 }
1332 }
1333 else {
1334 if (mode)
1335 /* Find first char in set, searching from the end */
1336 for (i = stop - 1; i >= start; i--) {
1337 c = text[i];
1338 if (c > 256)
1339 continue;
1340 block = bitmap[c >> 3];
1341 if (block && ((block & (1 << (c & 7))) != 0))
1342 break;
1343 }
1344 else
1345 /* Find first char not in set, searching from the end */
1346 for (i = stop - 1; i >= start; i--) {
1347 c = text[i];
1348 if (c > 256)
1349 break;
1350 block = bitmap[c >> 3];
1351 if (!block || ((block & (1 << (c & 7))) == 0))
1352 break;
1353 }
1354 }
1355 return i;
1356 }
1357
1358 #ifdef HAVE_UNICODE
1359 else if (cs->mode == MXCHARSET_UCS2MODE) {
1360 unicode_charset *lookup = (unicode_charset *)cs->lookup;
1361 if (direction > 0) {
1362 if (mode)
1363 /* Find first char in set */
1364 for (i = start; i < stop; i++) {
1365 c = text[i];
1366 bitmap = lookup->bitmaps[lookup->bitmapindex[c >> 8]];
1367 block = bitmap[(c >> 3) & 31];
1368 if (block && ((block & (1 << (c & 7))) != 0))
1369 break;
1370 }
1371 else
1372 /* Find first char not in set */
1373 for (i = start; i < stop; i++) {
1374 c = text[i];
1375 bitmap = lookup->bitmaps[lookup->bitmapindex[c >> 8]];
1376 block = bitmap[(c >> 3) & 31];
1377 if (!block || ((block & (1 << (c & 7))) == 0))
1378 break;
1379 }
1380 }
1381 else {
1382 if (mode)
1383 /* Find first char in set, searching from the end */
1384 for (i = stop - 1; i >= start; i--) {
1385 c = text[i];
1386 bitmap = lookup->bitmaps[lookup->bitmapindex[c >> 8]];
1387 block = bitmap[(c >> 3) & 31];
1388 if (block && ((block & (1 << (c & 7))) != 0))
1389 break;
1390 }
1391 else
1392 /* Find first char not in set, searching from the end */
1393 for (i = stop - 1; i >= start; i--) {
1394 c = text[i];
1395 bitmap = lookup->bitmaps[lookup->bitmapindex[c >> 8]];
1396 block = bitmap[(c >> 3) & 31];
1397 if (!block || ((block & (1 << (c & 7))) == 0))
1398 break;
1399 }
1400 }
1401 return i;
1402 }
1403 #endif
1404 else {
1405 Py_Error(mxTextTools_Error,
1406 "unsupported character set mode");
1407 }
1408
1409 onError:
1410 return -2;
1411 }
1412
1413 #endif
1414
1415 /* Return the position of the first character in text[start:stop]
1416 occurring in set or -1 in case no such character exists.
1417
1418 */
1419
1420 static
mxCharSet_Search(PyObject * self,PyObject * text,Py_ssize_t start,Py_ssize_t stop,int direction)1421 int mxCharSet_Search(PyObject *self,
1422 PyObject *text,
1423 Py_ssize_t start,
1424 Py_ssize_t stop,
1425 int direction)
1426 {
1427 Py_ssize_t position;
1428
1429 if (PyString_Check(text)) {
1430 Py_CheckStringSlice(text, start, stop);
1431 position = mxCharSet_FindChar(self,
1432 (unsigned char *)PyString_AS_STRING(text),
1433 start,
1434 stop,
1435 1,
1436 direction);
1437 }
1438 #ifdef HAVE_UNICODE
1439 else if (PyUnicode_Check(text)) {
1440 Py_CheckUnicodeSlice(text, start, stop);
1441 position = mxCharSet_FindUnicodeChar(self,
1442 PyUnicode_AS_UNICODE(text),
1443 start,
1444 stop,
1445 1,
1446 direction);
1447 }
1448 #endif
1449 else
1450 Py_Error(PyExc_TypeError,
1451 "expected string or unicode");
1452
1453 if ((direction > 0 && position >= stop) ||
1454 (direction <= 0 && position < start))
1455 position = -1;
1456 return position;
1457
1458 onError:
1459 return -2;
1460 }
1461
1462 /* Return the longest match of characters from set in
1463 text[start:stop].
1464
1465 If direction is positive, the search is done from the left (longest
1466 prefix), otherwise it is started from the right (longest suffix).
1467
1468 -1 is returned in case of an error.
1469
1470 */
1471
mxCharSet_Match(PyObject * self,PyObject * text,Py_ssize_t start,Py_ssize_t stop,int direction)1472 Py_ssize_t mxCharSet_Match(PyObject *self,
1473 PyObject *text,
1474 Py_ssize_t start,
1475 Py_ssize_t stop,
1476 int direction)
1477 {
1478 Py_ssize_t position;
1479
1480 if (PyString_Check(text)) {
1481 Py_CheckStringSlice(text, start, stop);
1482 position = mxCharSet_FindChar(self,
1483 (unsigned char *)PyString_AS_STRING(text),
1484 start,
1485 stop,
1486 0,
1487 direction);
1488 }
1489 #ifdef HAVE_UNICODE
1490 else if (PyUnicode_Check(text)) {
1491 Py_CheckUnicodeSlice(text, start, stop);
1492 position = mxCharSet_FindUnicodeChar(self,
1493 PyUnicode_AS_UNICODE(text),
1494 start,
1495 stop,
1496 0,
1497 direction);
1498 }
1499 #endif
1500 else
1501 Py_Error(PyExc_TypeError,
1502 "expected string or unicode");
1503
1504 if (position < -1)
1505 goto onError;
1506 if (direction > 0)
1507 return position - start;
1508 else
1509 return stop-1 - position;
1510
1511 onError:
1512 return -1;
1513 }
1514
1515 /* Stips off characters appearing in the character set from text[start:stop]
1516 and returns the result as Python string object.
1517
1518 where indicates the mode:
1519 where < 0: strip left only
1520 where = 0: strip left and right
1521 where > 0: strip right only
1522
1523 */
1524 static
mxCharSet_Strip(PyObject * self,PyObject * text,Py_ssize_t start,Py_ssize_t stop,Py_ssize_t where)1525 PyObject *mxCharSet_Strip(PyObject *self,
1526 PyObject *text,
1527 Py_ssize_t start,
1528 Py_ssize_t stop,
1529 Py_ssize_t where)
1530 {
1531 Py_ssize_t left,right;
1532
1533 if (!mxCharSet_Check(self)) {
1534 PyErr_BadInternalCall();
1535 goto onError;
1536 }
1537
1538 if (PyString_Check(text)) {
1539 Py_CheckStringSlice(text, start, stop);
1540
1541 /* Strip left */
1542 if (where <= 0) {
1543 left = mxCharSet_FindChar(self,
1544 (unsigned char *)PyString_AS_STRING(text),
1545 start,
1546 stop,
1547 0,
1548 1);
1549 if (left < 0)
1550 goto onError;
1551 }
1552 else
1553 left = start;
1554
1555 /* Strip right */
1556 if (where >= 0) {
1557 right = mxCharSet_FindChar(self,
1558 (unsigned char *)PyString_AS_STRING(text),
1559 left,
1560 stop,
1561 0,
1562 -1) + 1;
1563 if (right < 0)
1564 goto onError;
1565 }
1566 else
1567 right = stop;
1568
1569 return PyString_FromStringAndSize(PyString_AS_STRING(text) + left,
1570 max(right - left, 0));
1571 }
1572 #ifdef HAVE_UNICODE
1573 else if (PyUnicode_Check(text)) {
1574 Py_CheckUnicodeSlice(text, start, stop);
1575
1576 /* Strip left */
1577 if (where <= 0) {
1578 left = mxCharSet_FindUnicodeChar(self,
1579 PyUnicode_AS_UNICODE(text),
1580 start,
1581 stop,
1582 0,
1583 1);
1584 if (left < 0)
1585 goto onError;
1586 }
1587 else
1588 left = start;
1589
1590 /* Strip right */
1591 if (where >= 0) {
1592 right = mxCharSet_FindUnicodeChar(self,
1593 PyUnicode_AS_UNICODE(text),
1594 start,
1595 stop,
1596 0,
1597 -1) + 1;
1598 if (right < 0)
1599 goto onError;
1600 }
1601 else
1602 right = stop;
1603
1604 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(text) + left,
1605 max(right - left, 0));
1606 }
1607 #endif
1608 else
1609 Py_Error(PyExc_TypeError,
1610 "expected string or unicode");
1611
1612 onError:
1613 return NULL;
1614 }
1615
1616 static
mxCharSet_Split(PyObject * self,PyObject * text,Py_ssize_t start,Py_ssize_t text_len,int include_splits)1617 PyObject *mxCharSet_Split(PyObject *self,
1618 PyObject *text,
1619 Py_ssize_t start,
1620 Py_ssize_t text_len,
1621 int include_splits)
1622 {
1623 PyObject *list = NULL;
1624 PyObject *s;
1625 register Py_ssize_t x;
1626 Py_ssize_t listitem = 0;
1627 Py_ssize_t listsize = INITIAL_LIST_SIZE;
1628
1629 if (!mxCharSet_Check(self)) {
1630 PyErr_BadInternalCall();
1631 goto onError;
1632 }
1633
1634 list = PyList_New(listsize);
1635 if (!list)
1636 goto onError;
1637
1638 if (PyString_Check(text)) {
1639 unsigned char *tx = (unsigned char *)PyString_AS_STRING(text);
1640
1641 Py_CheckStringSlice(text, start, text_len);
1642
1643 x = start;
1644 while (x < text_len) {
1645 Py_ssize_t z;
1646
1647 /* Skip all text in set (include_splits == 0), not in set
1648 (include_splits == 1) */
1649 z = x;
1650 x = mxCharSet_FindChar(self, tx, x, text_len, include_splits, 1);
1651
1652 /* Append the slice to list */
1653 if (include_splits) {
1654 s = PyString_FromStringAndSize((char *)&tx[z], x - z);
1655 if (!s)
1656 goto onError;
1657 if (listitem < listsize)
1658 PyList_SET_ITEM(list,listitem,s);
1659 else {
1660 PyList_Append(list,s);
1661 Py_DECREF(s);
1662 }
1663 listitem++;
1664
1665 if (x >= text_len)
1666 break;
1667 }
1668
1669 /* Skip all text in set (include_splits == 1), not in set
1670 (include_splits == 0) */
1671 z = x;
1672 x = mxCharSet_FindChar(self, tx, x, text_len, !include_splits, 1);
1673
1674 /* Append the slice to list if it is not empty */
1675 if (x > z) {
1676 s = PyString_FromStringAndSize((char *)&tx[z], x - z);
1677 if (!s)
1678 goto onError;
1679 if (listitem < listsize)
1680 PyList_SET_ITEM(list,listitem,s);
1681 else {
1682 PyList_Append(list,s);
1683 Py_DECREF(s);
1684 }
1685 listitem++;
1686 }
1687 }
1688
1689 }
1690 #ifdef HAVE_UNICODE
1691 else if (PyUnicode_Check(text)) {
1692 Py_UNICODE *tx = PyUnicode_AS_UNICODE(text);
1693
1694 Py_CheckUnicodeSlice(text, start, text_len);
1695
1696 x = start;
1697 while (x < text_len) {
1698 Py_ssize_t z;
1699
1700 /* Skip all text in set (include_splits == 0), not in set
1701 (include_splits == 1) */
1702 z = x;
1703 x = mxCharSet_FindUnicodeChar(self, tx, x, text_len, include_splits, 1);
1704
1705 /* Append the slice to list */
1706 if (include_splits) {
1707 s = PyUnicode_FromUnicode(&tx[z], x - z);
1708 if (!s)
1709 goto onError;
1710 if (listitem < listsize)
1711 PyList_SET_ITEM(list,listitem,s);
1712 else {
1713 PyList_Append(list,s);
1714 Py_DECREF(s);
1715 }
1716 listitem++;
1717
1718 if (x >= text_len)
1719 break;
1720 }
1721
1722 /* Skip all text in set (include_splits == 1), not in set
1723 (include_splits == 0) */
1724 z = x;
1725 x = mxCharSet_FindUnicodeChar(self, tx, x, text_len, !include_splits, 1);
1726
1727 /* Append the slice to list if it is not empty */
1728 if (x > z) {
1729 s = PyUnicode_FromUnicode(&tx[z], x - z);
1730 if (!s)
1731 goto onError;
1732 if (listitem < listsize)
1733 PyList_SET_ITEM(list,listitem,s);
1734 else {
1735 PyList_Append(list,s);
1736 Py_DECREF(s);
1737 }
1738 listitem++;
1739 }
1740 }
1741 }
1742 #endif
1743 else
1744 Py_Error(PyExc_TypeError,
1745 "expected string or unicode");
1746
1747 /* Resize list if necessary */
1748 if (listitem < listsize)
1749 PyList_SetSlice(list, listitem, listsize, (PyObject*)NULL);
1750
1751 return list;
1752
1753 onError:
1754 Py_XDECREF(list);
1755 return NULL;
1756 }
1757
1758 /* methods */
1759
1760 Py_C_Function( mxCharSet_contains,
1761 ".contains(char)\n\n"
1762 )
1763 {
1764 PyObject *chr;
1765 int rc;
1766
1767 Py_GetArg("O:CharSet.contains", chr);
1768
1769 rc = mxCharSet_Contains(self, chr);
1770 if (rc < 0)
1771 goto onError;
1772 return PyInt_FromLong(rc);
1773
1774 onError:
1775 return NULL;
1776 }
1777
1778 Py_C_Function( mxCharSet_search,
1779 ".search(text[, direction=1, start=0, stop=len(text)])\n\n"
1780 )
1781 {
1782 PyObject *text;
1783 int direction = 1;
1784 Py_ssize_t start = 0, stop = INT_MAX;
1785 int rc;
1786
1787 Py_Get4Args("O|iii:CharSet.search", text, direction, start, stop);
1788
1789 rc = mxCharSet_Search(self, text, start, stop, direction);
1790 if (rc == -1)
1791 Py_ReturnNone();
1792 if (rc < -1)
1793 goto onError;
1794 return PyInt_FromLong(rc);
1795
1796 onError:
1797 return NULL;
1798 }
1799
1800 Py_C_Function( mxCharSet_match,
1801 ".match(text[, direction=1, start=0, stop=len(text)])\n\n"
1802 )
1803 {
1804 PyObject *text;
1805 int direction = 1;
1806 Py_ssize_t start = 0, stop = INT_MAX;
1807 int rc;
1808
1809 Py_Get4Args("O|iii:CharSet.match", text, direction, start, stop);
1810
1811 rc = mxCharSet_Match(self, text, start, stop, direction);
1812 if (rc < 0)
1813 goto onError;
1814 return PyInt_FromLong(rc);
1815
1816 onError:
1817 return NULL;
1818 }
1819
1820 Py_C_Function( mxCharSet_split,
1821 ".split(text[, start=0, stop=len(text)])\n\n"
1822 )
1823 {
1824 PyObject *text;
1825 Py_ssize_t start = 0, stop = INT_MAX;
1826
1827 Py_Get3Args("O|ii:CharSet.split", text, start, stop);
1828
1829 return mxCharSet_Split(self, text, start, stop, 0);
1830
1831 onError:
1832 return NULL;
1833 }
1834
1835 Py_C_Function( mxCharSet_splitx,
1836 ".splitx(text[, start=0, stop=len(text)])\n\n"
1837 )
1838 {
1839 PyObject *text;
1840 Py_ssize_t start = 0, stop = INT_MAX;
1841
1842 Py_Get3Args("O|ii:CharSet.splitx", text, start, stop);
1843
1844 return mxCharSet_Split(self, text, start, stop, 1);
1845
1846 onError:
1847 return NULL;
1848 }
1849
1850 Py_C_Function( mxCharSet_strip,
1851 ".strip(text[, where=0, start=0, stop=len(text)])\n\n"
1852 )
1853 {
1854 PyObject *text;
1855 Py_ssize_t where = 0;
1856 Py_ssize_t start = 0, stop = INT_MAX;
1857
1858 Py_Get4Args("O|iii:CharSet.strip", text, where, start, stop);
1859
1860 return mxCharSet_Strip(self, text, start, stop, where);
1861
1862 onError:
1863 return NULL;
1864 }
1865
1866 #ifdef COPY_PROTOCOL
1867 Py_C_Function( mxCharSet_copy,
1868 "copy([memo])\n\n"
1869 "Return a new reference for the instance. This function\n"
1870 "is used for the copy-protocol. Real copying doesn't take\n"
1871 "place, since the instances are immutable.")
1872 {
1873 PyObject *memo;
1874
1875 Py_GetArg("|O",memo);
1876 Py_INCREF(cs);
1877 return (PyObject *)cs;
1878 onError:
1879 return NULL;
1880 }
1881 #endif
1882
1883 #undef cs
1884
1885 /* --- slots --- */
1886
1887 static
mxCharSet_Repr(mxCharSetObject * self)1888 PyObject *mxCharSet_Repr(mxCharSetObject *self)
1889 {
1890 PyObject *v;
1891 char t[500], *reprstr;
1892
1893 v = PyObject_Repr(self->definition);
1894 if (v == NULL)
1895 return NULL;
1896 reprstr = PyString_AsString(v);
1897 if (reprstr == NULL)
1898 return NULL;
1899 sprintf(t, "<Character Set object for %.400s at 0x%lx>",
1900 reprstr, (long)self);
1901 Py_DECREF(v);
1902 return PyString_FromString(t);
1903 }
1904
1905 /* Python Type Tables */
1906
1907 static
1908 PySequenceMethods mxCharSet_TypeAsSequence = {
1909 (lenfunc)0, /*sq_length*/
1910 (binaryfunc)0, /*sq_concat*/
1911 (ssizeargfunc)0, /*sq_repeat*/
1912 (ssizeargfunc)0, /*sq_item*/
1913 (ssizessizeargfunc)0, /*sq_slice*/
1914 (ssizeobjargproc)0, /*sq_ass_item*/
1915 (ssizessizeobjargproc)0, /*sq_ass_slice*/
1916 (objobjproc)mxCharSet_Contains, /*sq_contains*/
1917 };
1918
1919 static
1920 PyMemberDef mxCharSet_Members[] = {
1921 {"definition",T_OBJECT_EX,offsetof(mxCharSetObject,definition),READONLY,"Definition"},
1922 {NULL}
1923 };
1924
1925 static
1926 PyMethodDef mxCharSet_Methods[] =
1927 {
1928 Py_MethodListEntry("contains",mxCharSet_contains),
1929 Py_MethodListEntry("search",mxCharSet_search),
1930 Py_MethodListEntry("match",mxCharSet_match),
1931 Py_MethodListEntry("strip",mxCharSet_strip),
1932 Py_MethodListEntry("split",mxCharSet_split),
1933 Py_MethodListEntry("splitx",mxCharSet_splitx),
1934 #ifdef COPY_PROTOCOL
1935 Py_MethodListEntry("__deepcopy__",mxCharSet_copy),
1936 Py_MethodListEntry("__copy__",mxCharSet_copy),
1937 #endif
1938 {NULL,NULL} /* end of list */
1939 };
1940
1941 PyTypeObject mxCharSet_Type = {
1942 PyVarObject_HEAD_INIT(NULL, 0) /* init at startup ! */
1943 "Character Set", /* tp_name */
1944 sizeof(mxCharSetObject), /* tp_basicsize */
1945 0, /* tp_itemsize */
1946 /* methods */
1947 (destructor)mxCharSet_Free, /* tp_dealloc */
1948 (printfunc)0, /* tp_print */
1949 (getattrfunc)0, /* tp_getattr */
1950 (setattrfunc)0, /* tp_setattr */
1951 0, /* tp_compare */
1952 (reprfunc)mxCharSet_Repr, /* tp_repr */
1953 0, /* tp_as_number */
1954 &mxCharSet_TypeAsSequence, /* tp_as_sequence */
1955 0, /* tp_as_mapping */
1956 (hashfunc)0, /* tp_hash */
1957 (ternaryfunc)0, /* tp_call */
1958 (reprfunc)0, /* tp_str */
1959 (getattrofunc)0, /* tp_getattro */
1960 (setattrofunc)0, /* tp_setattro */
1961 0, /* tp_as_buffer */
1962 Py_TPFLAGS_DEFAULT, /* tp_flags */
1963 (char*) 0, /* tp_doc */
1964 0, /* tp_traverse */
1965 0, /* tp_clear */
1966 0, /* tp_richcompare */
1967 0, /* tp_weaklistoffset */
1968 0, /* tp_iter */
1969 0, /* tp_iternext */
1970 mxCharSet_Methods, /* tp_methods */
1971 mxCharSet_Members, /* tp_members */
1972 };
1973
1974 /* --- Tag Table Object ------------------------------------------------*/
1975
1976 PyObject *mxTagTable_New(PyObject *definition,
1977 int tabletype,
1978 int cacheable);
1979
1980 /* internal APIs */
1981
1982 static
tc_get_item(register PyObject * obj,register Py_ssize_t i)1983 PyObject *tc_get_item(register PyObject *obj,
1984 register Py_ssize_t i)
1985 {
1986 if (PyTuple_Check(obj)) {
1987 if (i > PyTuple_GET_SIZE(obj))
1988 return NULL;
1989 return PyTuple_GET_ITEM(obj, i);
1990 }
1991 else if (PyList_Check(obj)) {
1992 if (i > PyList_GET_SIZE(obj))
1993 return NULL;
1994 return PyList_GET_ITEM(obj, i);
1995 }
1996 else
1997 return NULL;
1998 }
1999
2000 static
tc_length(register PyObject * obj)2001 Py_ssize_t tc_length(register PyObject *obj)
2002 {
2003 if (obj == NULL)
2004 return -1;
2005 else if (PyTuple_Check(obj))
2006 return PyTuple_GET_SIZE(obj);
2007 else if (PyList_Check(obj))
2008 return PyList_GET_SIZE(obj);
2009 else
2010 return -1;
2011 }
2012
2013 /* Add a jump target to the jump dictionary */
2014
2015 static
tc_add_jumptarget(PyObject * jumpdict,PyObject * targetname,Py_ssize_t index)2016 Py_ssize_t tc_add_jumptarget(PyObject *jumpdict,
2017 PyObject *targetname,
2018 Py_ssize_t index)
2019 {
2020 PyObject *v;
2021
2022 v = PyDict_GetItem(jumpdict, targetname);
2023 if (v != NULL)
2024 Py_ErrorWithArg(PyExc_TypeError,
2025 "tag table entry %d: "
2026 "jump target already defined", (unsigned int) index);
2027 v = PyInt_FromLong(index);
2028 if (v == NULL)
2029 goto onError;
2030 if (PyDict_SetItem(jumpdict, targetname, v))
2031 goto onError;
2032 Py_DECREF(v);
2033 return 0;
2034
2035 onError:
2036 return -1;
2037 }
2038
2039 /* Convert a string command argument to either an 8-bit string or
2040 Unicode depending on the tabletype. */
2041
2042 static
tc_convert_string_arg(PyObject * arg,Py_ssize_t tableposition,int tabletype)2043 PyObject *tc_convert_string_arg(PyObject *arg,
2044 Py_ssize_t tableposition,
2045 int tabletype)
2046 {
2047 /* Convert to strings */
2048 if (tabletype == MXTAGTABLE_STRINGTYPE) {
2049 if (PyString_Check(arg))
2050 return arg;
2051 #ifdef HAVE_UNICODE
2052 else if (PyUnicode_Check(arg)) {
2053 Py_DECREF(arg);
2054 arg = PyUnicode_AsEncodedString(arg,
2055 NULL,
2056 NULL);
2057 if (arg == NULL)
2058 Py_ErrorWithArg(PyExc_TypeError,
2059 "tag table entry %d: "
2060 "conversion from Unicode to "
2061 "string failed", (unsigned int)tableposition);
2062 }
2063 #endif
2064 else
2065 Py_ErrorWithArg(PyExc_TypeError,
2066 "tag table entry %d: "
2067 "command argument must be a "
2068 "string or unicode", (unsigned int)tableposition);
2069 }
2070
2071 #ifdef HAVE_UNICODE
2072 /* Convert to Unicode */
2073 else if (tabletype == MXTAGTABLE_UNICODETYPE) {
2074 if (PyUnicode_Check(arg))
2075 return arg;
2076 else if (PyString_Check(arg)) {
2077 Py_DECREF(arg);
2078 arg = PyUnicode_Decode(PyString_AS_STRING(arg),
2079 PyString_GET_SIZE(arg),
2080 NULL,
2081 NULL);
2082 if (arg == NULL)
2083 Py_ErrorWithArg(PyExc_TypeError,
2084 "tag table entry %d: "
2085 "conversion from string to "
2086 "Unicode failed", (unsigned int)tableposition);
2087 }
2088 else
2089 Py_ErrorWithArg(PyExc_TypeError,
2090 "tag table entry %d: "
2091 "command argument must be a "
2092 "string or unicode", (unsigned int)tableposition);
2093 }
2094 #endif
2095
2096 else
2097 Py_Error(mxTextTools_Error,
2098 "unsupported table type");
2099
2100 return arg;
2101
2102 onError:
2103 return NULL;
2104 }
2105
2106 /* Cleanup any references in the tag table. */
2107
2108 static
tc_cleanup(mxTagTableObject * tagtable)2109 int tc_cleanup(mxTagTableObject *tagtable)
2110 {
2111 Py_ssize_t i;
2112 for (i = 0; i < tagtable->numentries; i++) {
2113 mxTagTableEntry *tagtableentry = &tagtable->entry[i];
2114
2115 Py_XDECREF(tagtableentry->tagobj);
2116 tagtableentry->tagobj = NULL;
2117 Py_XDECREF(tagtableentry->args);
2118 tagtableentry->args = NULL;
2119 }
2120 return 0;
2121 }
2122
2123 /* Initialize the tag table (this is the actual Tag Table compiler) */
2124
2125 static
init_tag_table(mxTagTableObject * tagtable,PyObject * table,Py_ssize_t size,int tabletype,int cacheable)2126 int init_tag_table(mxTagTableObject *tagtable,
2127 PyObject *table,
2128 Py_ssize_t size,
2129 int tabletype,
2130 int cacheable)
2131 {
2132 Py_ssize_t i;
2133 PyObject *entry;
2134 Py_ssize_t entry_len;
2135 PyObject *tagobj, *command, *args = 0, *je, *jne;
2136 PyObject *jumpdict, *v;
2137 int secondpass, own_args = 0;
2138
2139 jumpdict = PyDict_New();
2140 if (jumpdict == NULL)
2141 return -1;
2142
2143 /* Reset to all fields to 0 */
2144 memset(&tagtable->entry[0], 0, size * sizeof(mxTagTableEntry));
2145
2146 /* First pass */
2147 secondpass = 0;
2148 tagtable->numentries = size;
2149 for (i = 0; i < size; i++) {
2150 mxTagTableEntry *tagtableentry = &tagtable->entry[i];
2151
2152 /* Get table entry i and parse it */
2153 entry = tc_get_item(table, i);
2154 if (entry == NULL) {
2155 Py_ErrorWithArg(PyExc_TypeError,
2156 "tag table entry %d: "
2157 "not found or not a supported entry type", (unsigned int)i);
2158 }
2159
2160 /* Special handling for jump marks (args is set to the jump
2161 mark string, jump target index is the next table entry) */
2162 if (PyString_Check(entry)) {
2163 if (tc_add_jumptarget(jumpdict, entry, i + 1))
2164 goto onError;
2165 tagtableentry->tagobj = NULL;
2166 tagtableentry->cmd = MATCH_JUMPTARGET;
2167 tagtableentry->flags = 0;
2168 Py_INCREF(entry);
2169 tagtableentry->args = entry;
2170 tagtableentry->jne = 0;
2171 tagtableentry->je = 1;
2172 continue;
2173 }
2174
2175 /* Get entry length */
2176 entry_len = tc_length(entry);
2177 if (entry_len < 3) {
2178 Py_ErrorWithArg(PyExc_TypeError,
2179 "tag table entry %d: "
2180 "expected an entry of the form "
2181 "(tagobj,command,arg[,jne[,je]])", (unsigned int)i);
2182 }
2183
2184 /* Decode entry parts: (tagobj, command, args[, jne[, je]]) */
2185 tagobj = tc_get_item(entry, 0);
2186 command = tc_get_item(entry, 1);
2187 args = tc_get_item(entry, 2);
2188 if (entry_len >= 4)
2189 jne = tc_get_item(entry, 3);
2190 else
2191 jne = NULL;
2192 if (entry_len >= 5)
2193 je = tc_get_item(entry, 4);
2194 else
2195 je = NULL;
2196
2197 if (tagobj == NULL ||
2198 command == NULL ||
2199 args == NULL ||
2200 (entry_len >= 4 && jne == NULL) ||
2201 (entry_len >= 5 && je == NULL)) {
2202 Py_ErrorWithArg(PyExc_TypeError,
2203 "tag table entry %d: "
2204 "expected an entry of the form "
2205 "(tagobj,command,arg[,jne[,je]])",(unsigned int) i);
2206 }
2207
2208 /* Store tagobj, None gets converted to NULL */
2209 if (tagobj != Py_None)
2210 Py_INCREF(tagobj);
2211 else
2212 tagobj = NULL;
2213 tagtableentry->tagobj = tagobj;
2214
2215 /* Decode command and flags */
2216 Py_AssertWithArg(PyInt_Check(command),
2217 PyExc_TypeError,
2218 "tag table entry %d: "
2219 "command must be an integer",(unsigned int)i);
2220 tagtableentry->cmd = PyInt_AS_LONG(command) & 0xFF;
2221 tagtableentry->flags = PyInt_AS_LONG(command) - tagtableentry->cmd;
2222
2223 /* Check command arguments */
2224 Py_INCREF(args);
2225 own_args = 1;
2226
2227 switch (tagtableentry->cmd) {
2228
2229 case MATCH_JUMP: /* == MATCH_FAIL */
2230 case MATCH_EOF:
2231 case MATCH_LOOP:
2232 /* args is ignored */
2233 break;
2234
2235 case MATCH_SKIP:
2236 case MATCH_MOVE:
2237 case MATCH_LOOPCONTROL:
2238 Py_AssertWithArg(PyInt_Check(args),
2239 PyExc_TypeError,
2240 "tag table entry %d: "
2241 "Skip|Move|LoopControl command argument "
2242 "must be an integer", (unsigned int)i);
2243 break;
2244
2245 case MATCH_JUMPTARGET:
2246 Py_AssertWithArg(PyString_Check(args),
2247 PyExc_TypeError,
2248 "tag table entry %d: "
2249 "JumpMark command argument must be a string",(unsigned int)i);
2250 if (tc_add_jumptarget(jumpdict, args, i + 1))
2251 goto onError;
2252 break;
2253
2254 case MATCH_ALLIN:
2255 case MATCH_ALLNOTIN:
2256 case MATCH_IS:
2257 case MATCH_ISIN:
2258 case MATCH_ISNOTIN:
2259 case MATCH_WORD:
2260 case MATCH_WORDSTART:
2261 case MATCH_WORDEND:
2262 args = tc_convert_string_arg(args, i, tabletype);
2263 if (args == NULL)
2264 goto onError;
2265 break;
2266
2267 case MATCH_ALLINSET:
2268 case MATCH_ISINSET:
2269 Py_AssertWithArg(PyString_Check(args) &&
2270 PyString_GET_SIZE(args) == 32,
2271 PyExc_TypeError,
2272 "tag table entry %d: "
2273 "AllInSet|IsInSet command argument must "
2274 "be a set() string",(unsigned int)i);
2275 break;
2276
2277 case MATCH_ALLINCHARSET:
2278 case MATCH_ISINCHARSET:
2279 Py_AssertWithArg(mxCharSet_Check(args),
2280 PyExc_TypeError,
2281 "tag table entry %d: "
2282 "AllInCharSet|IsInCharSet command argument must "
2283 "be a CharSet instance",(unsigned int)i);
2284 break;
2285
2286 case MATCH_SWORDSTART: /* == MATCH_NOWORD */
2287 case MATCH_SWORDEND:
2288 case MATCH_SFINDWORD:
2289 Py_AssertWithArg(mxTextSearch_Check(args),
2290 PyExc_TypeError,
2291 "tag table entry %d: "
2292 "sWordStart|sWordEnd|sFindWord command "
2293 "argument must be a TextSearch search "
2294 "object",(unsigned int)i);
2295 break;
2296
2297 case MATCH_TABLE:
2298 case MATCH_SUBTABLE:
2299 Py_AssertWithArg(mxTagTable_Check(args) ||
2300 PyTuple_Check(args) ||
2301 PyList_Check(args) ||
2302 (PyInt_Check(args) &&
2303 PyInt_AS_LONG(args) == MATCH_THISTABLE),
2304 PyExc_TypeError,
2305 "tag table entry %d: "
2306 "Table|SubTable command argument "
2307 "must be a tag table tuple/object or "
2308 "ThisTable", (unsigned int)i);
2309 /* XXX We shouldn't recursively compile tag table tuples here
2310 because this will slow down the compile process
2311 too much and it's not clear whether this particular
2312 table will ever be used during tagging.
2313 */
2314 if (!mxTagTable_Check(args) && !PyInt_Check(args)) {
2315 Py_DECREF(args);
2316 args = mxTagTable_New(args, tabletype, cacheable);
2317 if (args == NULL)
2318 goto onError;
2319 }
2320 break;
2321
2322 case MATCH_TABLEINLIST:
2323 case MATCH_SUBTABLEINLIST:
2324 Py_AssertWithArg(PyTuple_Check(args) &&
2325 PyTuple_GET_SIZE(args) == 2 &&
2326 PyList_Check(PyTuple_GET_ITEM(args, 0)) &&
2327 PyInt_Check(PyTuple_GET_ITEM(args, 1)),
2328 PyExc_TypeError,
2329 "tag table entry %d: "
2330 "TableInList|SubTableInList command argument "
2331 "must be a 2-tuple (list, integer)",
2332 (unsigned int)i);
2333 break;
2334
2335 case MATCH_CALL:
2336 Py_AssertWithArg(PyCallable_Check(args),
2337 PyExc_TypeError,
2338 "tag table entry %d: "
2339 "Call command argument "
2340 "must be a callable object",
2341 (unsigned int)i);
2342 break;
2343
2344 case MATCH_CALLARG:
2345 Py_AssertWithArg(PyTuple_Check(args) &&
2346 PyTuple_GET_SIZE(args) > 0 &&
2347 PyCallable_Check(PyTuple_GET_ITEM(args, 0)),
2348 PyExc_TypeError,
2349 "tag table entry %d: "
2350 "CallArg command argument "
2351 "must be a tuple (fct,[arg0,arg1,...])",
2352 (unsigned int)i);
2353 break;
2354
2355 default:
2356 Py_ErrorWith2Args(PyExc_TypeError,
2357 "tag table entry %d: "
2358 "unknown command integer: %i",
2359 (unsigned int)i, tagtableentry->cmd);
2360
2361 }
2362
2363 /* Store command args */
2364 tagtableentry->args = args;
2365 own_args = 0;
2366
2367 /* Decode jump offsets */
2368 if (jne) {
2369 if (PyInt_Check(jne))
2370 tagtableentry->jne = PyInt_AS_LONG(jne);
2371 else if (PyString_Check(jne)) {
2372 /* Mark for back-patching */
2373 tagtableentry->jne = -424242;
2374 secondpass = 1;
2375 }
2376 else
2377 Py_ErrorWithArg(PyExc_TypeError,
2378 "tag table entry %d: "
2379 "jne must be an integer or string", (unsigned int)i);
2380 }
2381 else
2382 tagtableentry->jne = 0;
2383
2384 if (je) {
2385 if (PyInt_Check(je))
2386 tagtableentry->je = PyInt_AS_LONG(je);
2387 else if (PyString_Check(je)) {
2388 /* Mark for back-patching */
2389 tagtableentry->je = -424242;
2390 secondpass = 1;
2391 }
2392 else
2393 Py_ErrorWithArg(PyExc_TypeError,
2394 "tag table entry %d: "
2395 "je must be an integer or string", (unsigned int)i);
2396 }
2397 else
2398 tagtableentry->je = 1;
2399 }
2400
2401 /* Second pass (needed to patch string jump targets) */
2402 if (secondpass)
2403 for (i = 0; i < size; i++) {
2404 mxTagTableEntry *tagtableentry = &tagtable->entry[i];
2405
2406 if (tagtableentry->je != -424242 &&
2407 tagtableentry->jne != -424242)
2408 continue;
2409
2410 /* Entry (most probably) needs back-patching */
2411 entry = tc_get_item(table, i);
2412 if (entry == NULL) {
2413 Py_ErrorWithArg(PyExc_TypeError,
2414 "tag table entry %d: "
2415 "unexpected error (not found)", (unsigned int)i);
2416 }
2417
2418 /* Get entry length */
2419 entry_len = tc_length(entry);
2420 if (entry_len < 0) {
2421 Py_ErrorWithArg(PyExc_TypeError,
2422 "tag table entry %d: "
2423 "unexpected error (no length)", (unsigned int)i);
2424 }
2425
2426 /* Decode jump offsets */
2427 if (entry_len >= 4)
2428 jne = tc_get_item(entry, 3);
2429 else
2430 jne = NULL;
2431 if (entry_len >= 5)
2432 je = tc_get_item(entry, 4);
2433 else
2434 je = NULL;
2435
2436 /* Patch jump offsets */
2437 if (jne && PyString_Check(jne)) {
2438 v = PyDict_GetItem(jumpdict, jne);
2439 if (v == NULL || !PyInt_Check(v))
2440 Py_ErrorWith2Args(PyExc_TypeError,
2441 "tag table entry %d: "
2442 "jne jump target '%s' not found",
2443 (unsigned int)i, PyString_AS_STRING(jne));
2444 tagtableentry->jne = PyInt_AS_LONG(v) - i;
2445 }
2446 if (je && PyString_Check(je)) {
2447 v = PyDict_GetItem(jumpdict, je);
2448 if (v == NULL || !PyInt_Check(v))
2449 Py_ErrorWith2Args(PyExc_TypeError,
2450 "tag table entry %d: "
2451 "je jump target '%s' not found",
2452 (unsigned int)i, PyString_AS_STRING(je));
2453 tagtableentry->je = PyInt_AS_LONG(v) - i;
2454 }
2455 }
2456
2457 Py_DECREF(jumpdict);
2458 return 0;
2459
2460 onError:
2461 if (own_args) {
2462 Py_DECREF(args);
2463 }
2464 return -1;
2465 }
2466
2467 /* Check the cache for an already compiled TagTable for this
2468 definition. Return NULL in case of an error, Py_None without
2469 INCREF in case no such table was found or the TagTable object. */
2470
2471 static
consult_tagtable_cache(PyObject * definition,int tabletype,int cacheable)2472 PyObject *consult_tagtable_cache(PyObject *definition,
2473 int tabletype,
2474 int cacheable)
2475 {
2476 PyObject *v, *key, *tt;
2477
2478 if (!PyTuple_Check(definition) || !cacheable)
2479 return Py_None;
2480
2481 key = PyTuple_New(2);
2482 if (key == NULL)
2483 goto onError;
2484 v = PyInt_FromLong((long) definition);
2485 if (v == NULL)
2486 goto onError;
2487 PyTuple_SET_ITEM(key, 0, v);
2488 v = PyInt_FromLong(tabletype);
2489 if (v == NULL)
2490 goto onError;
2491 PyTuple_SET_ITEM(key, 1, v);
2492 tt = PyDict_GetItem(mxTextTools_TagTables, key);
2493 Py_DECREF(key);
2494 if (tt != NULL) {
2495 Py_INCREF(tt);
2496 return tt;
2497 }
2498 return Py_None;
2499
2500 onError:
2501 return NULL;
2502 }
2503
2504 /* Adds the compiled tagtable to the cache. Returns -1 in case of an
2505 error, 0 on success. */
2506
2507 static
add_to_tagtable_cache(PyObject * definition,int tabletype,int cacheable,PyObject * tagtable)2508 int add_to_tagtable_cache(PyObject *definition,
2509 int tabletype,
2510 int cacheable,
2511 PyObject *tagtable)
2512 {
2513 PyObject *v, *key;
2514 int rc;
2515
2516 if (!PyTuple_Check(definition) || !cacheable)
2517 return 0;
2518
2519 key = PyTuple_New(2);
2520 if (key == NULL)
2521 goto onError;
2522 v = PyInt_FromLong((long) definition);
2523 if (v == NULL)
2524 goto onError;
2525 PyTuple_SET_ITEM(key, 0, v);
2526 v = PyInt_FromLong(tabletype);
2527 if (v == NULL)
2528 goto onError;
2529 PyTuple_SET_ITEM(key, 1, v);
2530
2531 /* Hard-limit the cache size */
2532 if (PyDict_Size(mxTextTools_TagTables) >= MAX_TAGTABLES_CACHE_SIZE)
2533 PyDict_Clear(mxTextTools_TagTables);
2534
2535 rc = PyDict_SetItem(mxTextTools_TagTables, key, tagtable);
2536 Py_DECREF(key);
2537 if (rc)
2538 goto onError;
2539 return 0;
2540
2541 onError:
2542 return -1;
2543 }
2544
2545
2546 /* allocation */
2547
/* Create (or fetch from the cache) the compiled TagTable object for
   definition.

   The cache is consulted first; on a miss a new variable-size
   mxTagTableObject with one entry per definition item is allocated,
   compiled via init_tag_table() and offered to the cache.

   Returns a new reference or NULL with an exception set. */

PyObject *mxTagTable_New(PyObject *definition,
                         int tabletype,
                         int cacheable)
{
    mxTagTableObject *table = NULL;
    PyObject *cached;
    Py_ssize_t numentries;

    /* A cache hit short-cuts the whole compilation; Py_None signals
       "not cached" and is a borrowed reference. */
    cached = consult_tagtable_cache(definition, tabletype, cacheable);
    if (cached == NULL)
        goto onError;
    if (cached != Py_None)
        return cached;

    numentries = tc_length(definition);
    if (numentries < 0) {
        Py_Error(PyExc_TypeError,
                 "tag table definition must be a tuple or a list");
    }

    table = PyObject_NEW_VAR(mxTagTableObject, &mxTagTable_Type, numentries);
    if (table == NULL)
        goto onError;

    /* Only cacheable tables keep a reference to their definition */
    table->definition = cacheable ? definition : NULL;
    Py_XINCREF(table->definition);
    table->tabletype = tabletype;

    /* Compile the definition into the table entries */
    if (init_tag_table(table, definition, numentries, tabletype, cacheable))
        goto onError;

    /* Remember the compiled table (no-op for non-cacheable or
       non-tuple definitions) */
    if (add_to_tagtable_cache(definition, tabletype, cacheable,
                              (PyObject *)table))
        goto onError;

    return (PyObject *)table;

 onError:
    Py_XDECREF(table);
    return NULL;
}
2595
2596 Py_C_Function( mxTagTable_TagTable,
2597 "TagTable(definition[,cachable=1])\n\n"
2598 )
2599 {
2600 PyObject *definition;
2601 int cacheable = 1;
2602
2603 Py_Get2Args("O|i:TagTable", definition, cacheable);
2604 return mxTagTable_New(definition, 0, cacheable);
2605
2606 onError:
2607 return NULL;
2608 }
2609
2610 #ifdef HAVE_UNICODE
2611 Py_C_Function( mxTagTable_UnicodeTagTable,
2612 "TagTable(definition[,cachable=1])\n\n"
2613 )
2614 {
2615 PyObject *definition;
2616 int cacheable = 1;
2617
2618 Py_Get2Args("O|i:UnicodeTagTable", definition, cacheable);
2619 return mxTagTable_New(definition, 1, cacheable);
2620
2621 onError:
2622 return NULL;
2623 }
2624 #endif
2625
2626 static
mxTagTable_Free(mxTagTableObject * tagtable)2627 void mxTagTable_Free(mxTagTableObject *tagtable)
2628 {
2629 tc_cleanup(tagtable);
2630 Py_XDECREF(tagtable->definition);
2631 PyObject_Del(tagtable);
2632 }
2633
2634 /* C APIs */
2635
2636 #define tagtable ((mxTagTableObject *)self)
2637
2638 static
mxTagTable_CompiledDefinition(PyObject * self)2639 PyObject *mxTagTable_CompiledDefinition(PyObject *self)
2640 {
2641 PyObject *tuple = 0, *v, *w;
2642 Py_ssize_t i;
2643 Py_ssize_t size;
2644
2645 if (!mxTagTable_Check(self)) {
2646 PyErr_BadInternalCall();
2647 goto onError;
2648 }
2649
2650 size = tagtable->numentries;
2651 tuple = PyTuple_New(size);
2652 if (tuple == NULL)
2653 goto onError;
2654
2655 for (i = 0; i < size; i++) {
2656 mxTagTableEntry *tagtableentry = &tagtable->entry[i];
2657
2658 /* Build tuple (tagobj, command, args, jne, je) */
2659 v = PyTuple_New(5);
2660 if (v == NULL)
2661 goto onError;
2662 w = tagtableentry->tagobj;
2663 if (w == NULL)
2664 w = Py_None;
2665 Py_INCREF(w);
2666 PyTuple_SET_ITEM(v, 0, w);
2667 PyTuple_SET_ITEM(v, 1, PyInt_FromLong(tagtableentry->cmd |
2668 tagtableentry->flags));
2669 w = tagtableentry->args;
2670 if (w == NULL)
2671 w = Py_None;
2672 Py_INCREF(w);
2673 PyTuple_SET_ITEM(v, 2, w);
2674 PyTuple_SET_ITEM(v, 3, PyInt_FromLong(tagtableentry->jne));
2675 PyTuple_SET_ITEM(v, 4, PyInt_FromLong(tagtableentry->je));
2676 if (PyErr_Occurred()) {
2677 Py_DECREF(v);
2678 goto onError;
2679 }
2680 PyTuple_SET_ITEM(tuple, i, v);
2681 }
2682
2683 return tuple;
2684
2685 onError:
2686 Py_XDECREF(tuple);
2687 return NULL;
2688 }
2689
2690
2691 /* methods */
2692
2693 Py_C_Function( mxTagTable_compiled,
2694 ".compiled()\n\n"
2695 )
2696 {
2697 Py_NoArgsCheck();
2698 return mxTagTable_CompiledDefinition(self);
2699
2700 onError:
2701 return NULL;
2702 }
2703
2704 #ifdef COPY_PROTOCOL
2705 Py_C_Function( mxTagTable_copy,
2706 "copy([memo])\n\n"
2707 "Return a new reference for the instance. This function\n"
2708 "is used for the copy-protocol. Real copying doesn't take\n"
2709 "place, since the instances are immutable.")
2710 {
2711 PyObject *memo;
2712
2713 Py_GetArg("|O",memo);
2714 Py_INCREF(tagtable);
2715 return (PyObject *)tagtable;
2716
2717 onError:
2718 return NULL;
2719 }
2720 #endif
2721
2722 #undef tagtable
2723
2724 /* --- slots --- */
2725
2726 static
mxTagTable_Repr(mxTagTableObject * self)2727 PyObject *mxTagTable_Repr(mxTagTableObject *self)
2728 {
2729 char t[100];
2730
2731 if (self->tabletype == MXTAGTABLE_STRINGTYPE)
2732 sprintf(t,"<String Tag Table object at 0x%lx>", (long)self);
2733 else if (self->tabletype == MXTAGTABLE_UNICODETYPE)
2734 sprintf(t,"<Unicode Tag Table object at 0x%lx>", (long)self);
2735 else
2736 sprintf(t,"<Tag Table object at 0x%lx>", (long)self);
2737 return PyString_FromString(t);
2738 }
2739
/* Method table of the TagTable type (referenced from the tp_methods
   slot of mxTagTable_Type):

   - "compiled" maps to mxTagTable_compiled and takes no arguments,
   - "__deepcopy__" and "__copy__" both map to mxTagTable_copy and are
     only present if COPY_PROTOCOL is defined.

   The {NULL,NULL} entry terminates the list. */
2740 static
2741 PyMethodDef mxTagTable_Methods[] =
2742 {
2743 Py_MethodListEntryNoArgs("compiled",mxTagTable_compiled),
2744 #ifdef COPY_PROTOCOL
2745 Py_MethodListEntry("__deepcopy__",mxTagTable_copy),
2746 Py_MethodListEntry("__copy__",mxTagTable_copy),
2747 #endif
2748 {NULL,NULL} /* end of list */
2749 };
2750
/* Member table of the TagTable type (referenced from the tp_members
   slot of mxTagTable_Type).

   Exposes the definition field of mxTagTableObject as the read-only
   attribute "definition". mxTagTable_New() stores NULL in that field
   for non-cacheable tables; NOTE(review): with T_OBJECT_EX a NULL
   field is expected to raise AttributeError on access -- confirm
   against the structmember documentation. */
2751 static
2752 PyMemberDef mxTagTable_Members[] = {
2753 {"definition",T_OBJECT_EX,offsetof(mxTagTableObject,definition),READONLY,"Definition"},
2754 {NULL}
2755 };
2756
2757 /* Python Type Tables */
2758
/* Type object for compiled TagTables.

   This is a variable-size type: the basic size is that of
   mxTagTableObject and every item adds sizeof(mxTagTableEntry);
   instances are allocated with PyObject_NEW_VAR() in mxTagTable_New().
   Only deallocation (mxTagTable_Free), repr (mxTagTable_Repr), the
   method table and the member table are filled in; all other slots
   are left empty. The type pointer in the object head is NULL here
   and, per the original note, gets initialized at startup. */
2759 PyTypeObject mxTagTable_Type = {
2760 PyVarObject_HEAD_INIT(NULL, 0) /* init at startup ! */
2761 "Tag Table", /* tp_name */
2762 sizeof(mxTagTableObject), /* tp_basicsize */
2763 sizeof(mxTagTableEntry), /* tp_itemsize */
2764 /* methods */
2765 (destructor)mxTagTable_Free, /* tp_dealloc */
2766 (printfunc)0, /* tp_print */
2767 (getattrfunc)0, /* tp_getattr */
2768 (setattrfunc)0, /* tp_setattr */
2769 0, /* tp_compare */
2770 (reprfunc)mxTagTable_Repr, /* tp_repr */
2771 0, /* tp_as_number */
2772 0, /* tp_as_sequence */
2773 0, /* tp_as_mapping */
2774 (hashfunc)0, /* tp_hash */
2775 (ternaryfunc)0, /* tp_call */
2776 (reprfunc)0, /* tp_str */
2777 (getattrofunc)0, /* tp_getattro */
2778 (setattrofunc)0, /* tp_setattro */
2779 0, /* tp_as_buffer */
2780 Py_TPFLAGS_DEFAULT, /* tp_flags */
2781 (char*) 0, /* tp_doc */
2782 0, /* tp_traverse */
2783 0, /* tp_clear */
2784 0, /* tp_richcompare */
2785 0, /* tp_weaklistoffset */
2786 0, /* tp_iter */
2787 0, /* tp_iternext */
2788 mxTagTable_Methods, /* tp_methods */
2789 mxTagTable_Members, /* tp_members */
2790 };
2791
2792 /* --- Internal functions ----------------------------------------------*/
2793
2794 #ifdef HAVE_UNICODE
2795
2796 /* Same as mxTextTools_Join() for Unicode objects. */
2797
2798 static
mxTextTools_UnicodeJoin(PyObject * seq,Py_ssize_t start,Py_ssize_t stop,PyObject * separator)2799 PyObject *mxTextTools_UnicodeJoin(PyObject *seq,
2800 Py_ssize_t start,
2801 Py_ssize_t stop,
2802 PyObject *separator)
2803 {
2804 PyObject *newstring = 0, *tempstr = 0;
2805 Py_ssize_t newstring_len,current_len = 0;
2806 Py_UNICODE *p;
2807 Py_ssize_t i;
2808 Py_UNICODE *sep;
2809 Py_ssize_t sep_len;
2810
2811 if (separator) {
2812 separator = PyUnicode_FromObject(separator);
2813 if (separator == NULL)
2814 goto onError;
2815 sep = PyUnicode_AS_UNICODE(separator);
2816 sep_len = PyUnicode_GET_SIZE(separator);
2817 }
2818 else {
2819 sep = NULL;
2820 sep_len = 0;
2821 }
2822
2823 /* Create an empty new string */
2824 newstring_len = (10 + sep_len) * (stop - start);
2825 newstring = PyUnicode_FromUnicode(NULL, newstring_len);
2826 if (newstring == NULL)
2827 goto onError;
2828 p = PyUnicode_AS_UNICODE(newstring);
2829
2830 /* Join with separator */
2831 for (i = start; i < stop; i++) {
2832 register PyObject *o;
2833 Py_UNICODE *st;
2834 Py_ssize_t len_st;
2835
2836 o = PySequence_GetItem(seq, i);
2837
2838 if (PyTuple_Check(o)) {
2839 /* Tuple entry: (string,l,r,[...]) */
2840 register Py_ssize_t l,r;
2841
2842 /* parse tuple */
2843 Py_Assert((PyTuple_GET_SIZE(o) >= 3) &&
2844 PyInt_Check(PyTuple_GET_ITEM(o,1)) &&
2845 PyInt_Check(PyTuple_GET_ITEM(o,2)),
2846 PyExc_TypeError,
2847 "tuples must be of the format (string,l,r[,...])");
2848 tempstr = PyUnicode_FromObject(PyTuple_GET_ITEM(o,0));
2849 if (tempstr == NULL)
2850 goto onError;
2851 st = PyUnicode_AS_UNICODE(tempstr);
2852 len_st = PyUnicode_GET_SIZE(tempstr);
2853 l = PyInt_AS_LONG(PyTuple_GET_ITEM(o,1));
2854 r = PyInt_AS_LONG(PyTuple_GET_ITEM(o,2));
2855
2856 /* compute slice */
2857 if (r > len_st) r = len_st;
2858 else if (r < 0) {
2859 r += len_st + 1;
2860 if (r < 0)
2861 r = 0;
2862 }
2863 if (l > len_st) l = len_st;
2864 else if (l < 0) {
2865 l += len_st + 1;
2866 if (l < 0)
2867 l = 0;
2868 }
2869
2870 /* empty ? */
2871 if (l > r)
2872 continue;
2873 len_st = r - l;
2874 if (len_st == 0)
2875 continue;
2876
2877 /* get pointer right */
2878 st += l;
2879 }
2880 else {
2881 /* Must be a string entry: take the whole string */
2882 tempstr = PyUnicode_FromObject(o);
2883 if (tempstr == NULL)
2884 goto onError;
2885 st = PyUnicode_AS_UNICODE(tempstr);
2886 len_st = PyUnicode_GET_SIZE(tempstr);
2887 }
2888
2889 Py_DECREF(o);
2890
2891 /* Resize the new string if needed */
2892 while (current_len + len_st + sep_len >= newstring_len) {
2893 newstring_len += newstring_len >> 1;
2894 if (PyUnicode_Resize(&newstring, newstring_len))
2895 goto onError;
2896 p = PyUnicode_AS_UNICODE(newstring) + current_len;
2897 }
2898
2899 /* Insert separator */
2900 if (i > 0 && sep_len > 0) {
2901 Py_UNICODE_COPY(p, sep, sep_len);
2902 p += sep_len;
2903 current_len += sep_len;
2904 }
2905
2906 /* Copy snippet into new string */
2907 Py_UNICODE_COPY(p, st, len_st);
2908 p += len_st;
2909 current_len += len_st;
2910
2911 Py_DECREF(tempstr);
2912 tempstr = NULL;
2913 }
2914
2915 /* Resize new string to the actual length */
2916 if (PyUnicode_Resize(&newstring, current_len))
2917 goto onError;
2918
2919 Py_XDECREF(separator);
2920 return newstring;
2921
2922 onError:
2923 Py_XDECREF(newstring);
2924 Py_XDECREF(separator);
2925 Py_XDECREF(tempstr);
2926 return NULL;
2927 }
2928
2929 #endif
2930
2931 /* Enhanced string join: also excepts tuple (text, left, right,...)
2932 entries which then cause text[left:right] to be used as string
2933 snippet.
2934
2935 separator may be NULL; in that case, "" is used as separator.
2936
2937 */
2938
2939 static
mxTextTools_Join(PyObject * seq,Py_ssize_t start,Py_ssize_t stop,PyObject * separator)2940 PyObject *mxTextTools_Join(PyObject *seq,
2941 Py_ssize_t start,
2942 Py_ssize_t stop,
2943 PyObject *separator)
2944 {
2945 PyObject *newstring = 0;
2946 Py_ssize_t newstring_len, current_len = 0;
2947 char *p;
2948 Py_ssize_t i;
2949 char *sep;
2950 Py_ssize_t sep_len;
2951
2952 if (separator) {
2953 #ifdef HAVE_UNICODE
2954 if (PyUnicode_Check(separator))
2955 return mxTextTools_UnicodeJoin(seq, start, stop, separator);
2956 #endif
2957 Py_Assert(PyString_Check(separator),
2958 PyExc_TypeError,
2959 "separator must be a string");
2960 sep = PyString_AS_STRING(separator);
2961 sep_len = PyString_GET_SIZE(separator);
2962 }
2963 else {
2964 sep = NULL;
2965 sep_len = 0;
2966 }
2967
2968 /* Create an empty new string */
2969 newstring_len = (10 + sep_len) * (stop - start);
2970 newstring = PyString_FromStringAndSize((char*)NULL, newstring_len);
2971 if (newstring == NULL)
2972 goto onError;
2973 p = PyString_AS_STRING(newstring);
2974
2975 /* Join with separator */
2976 for (i = start; i < stop; i++) {
2977 register PyObject *o;
2978 char *st;
2979 Py_ssize_t len_st;
2980
2981 o = PySequence_GetItem(seq, i);
2982
2983 if (PyTuple_Check(o)) {
2984 /* Tuple entry: (string,l,r,[...]) */
2985 register Py_ssize_t l,r;
2986
2987 /* parse tuple */
2988 Py_Assert((PyTuple_GET_SIZE(o) >= 3) &&
2989 PyInt_Check(PyTuple_GET_ITEM(o,1)) &&
2990 PyInt_Check(PyTuple_GET_ITEM(o,2)),
2991 PyExc_TypeError,
2992 "tuples must be of the format (string,int,int[,...])");
2993 #ifdef HAVE_UNICODE
2994 if (PyUnicode_Check(PyTuple_GET_ITEM(o,0))) {
2995 /* Redirect to Unicode implementation; all previous work
2996 is lost. */
2997 Py_DECREF(o);
2998 Py_DECREF(newstring);
2999 return mxTextTools_UnicodeJoin(seq, start, stop, separator);
3000 }
3001 #endif
3002 Py_Assert(PyString_Check(PyTuple_GET_ITEM(o,0)),
3003 PyExc_TypeError,
3004 "tuples must be of the format (string,int,int[,...])");
3005 st = PyString_AS_STRING(PyTuple_GET_ITEM(o,0));
3006 len_st = PyString_GET_SIZE(PyTuple_GET_ITEM(o,0));
3007 l = PyInt_AS_LONG(PyTuple_GET_ITEM(o,1));
3008 r = PyInt_AS_LONG(PyTuple_GET_ITEM(o,2));
3009
3010 /* compute slice */
3011 if (r > len_st) r = len_st;
3012 else if (r < 0) {
3013 r += len_st + 1;
3014 if (r < 0)
3015 r = 0;
3016 }
3017 if (l > len_st) l = len_st;
3018 else if (l < 0) {
3019 l += len_st + 1;
3020 if (l < 0)
3021 l = 0;
3022 }
3023
3024 /* empty ? */
3025 if (l > r)
3026 continue;
3027 len_st = r - l;
3028 if (len_st == 0)
3029 continue;
3030
3031 /* get pointer right */
3032 st += l;
3033 }
3034 else if (PyString_Check(o)) {
3035 /* String entry: take the whole string */
3036 st = PyString_AS_STRING(o);
3037 len_st = PyString_GET_SIZE(o);
3038 }
3039 #ifdef HAVE_UNICODE
3040 else if (PyUnicode_Check(o)) {
3041 /* Redirect to Unicode implementation; all previous work
3042 is lost. */
3043 Py_DECREF(o);
3044 Py_DECREF(newstring);
3045 return mxTextTools_UnicodeJoin(seq, start, stop, separator);
3046 }
3047 #endif
3048 else {
3049 Py_DECREF(o);
3050 Py_Error(PyExc_TypeError,
3051 "list must contain tuples or strings as entries");
3052 }
3053
3054 Py_DECREF(o);
3055
3056 /* Resize the new string if needed */
3057 while (current_len + len_st + sep_len >= newstring_len) {
3058 newstring_len += newstring_len >> 1;
3059 if (_PyString_Resize(&newstring, newstring_len))
3060 goto onError;
3061 p = PyString_AS_STRING(newstring) + current_len;
3062 }
3063
3064 /* Insert separator */
3065 if (i > 0 && sep_len > 0) {
3066 memcpy(p, sep, sep_len);
3067 p += sep_len;
3068 current_len += sep_len;
3069 }
3070
3071 /* Copy snippet into new string */
3072 memcpy(p,st,len_st);
3073 p += len_st;
3074 current_len += len_st;
3075 }
3076
3077 /* Resize new string to the actual length */
3078 if (_PyString_Resize(&newstring, current_len))
3079 goto onError;
3080
3081 return newstring;
3082
3083 onError:
3084 Py_XDECREF(newstring);
3085 return NULL;
3086 }
3087
3088 static
mxTextTools_HexStringFromString(char * str,Py_ssize_t len)3089 PyObject *mxTextTools_HexStringFromString(char *str,
3090 Py_ssize_t len)
3091 {
3092 PyObject *w = 0;
3093 Py_ssize_t i;
3094 char *hex;
3095 static const char hexdigits[] = "0123456789abcdef";
3096
3097 /* Convert to HEX */
3098 w = PyString_FromStringAndSize(NULL,2*len);
3099 if (!w)
3100 goto onError;
3101 hex = PyString_AS_STRING(w);
3102 for (i = 0; i < len; i ++) {
3103 unsigned char c = (unsigned char)*str;
3104
3105 *hex++ = hexdigits[c >> 4];
3106 *hex++ = hexdigits[c & 0x0F];
3107 str++;
3108 }
3109 return w;
3110
3111 onError:
3112 Py_XDECREF(w);
3113 return NULL;
3114 }
3115
3116 static
mxTextTools_StringFromHexString(char * hex,Py_ssize_t len)3117 PyObject *mxTextTools_StringFromHexString(char *hex,
3118 Py_ssize_t len)
3119 {
3120 PyObject *w = 0;
3121 Py_ssize_t i;
3122 char *str;
3123 static const char hexdigits[] = "0123456789abcdef";
3124
3125 /* Convert to string */
3126 Py_Assert(len % 2 == 0,
3127 PyExc_TypeError,
3128 "need 2-digit hex string argument");
3129 len >>= 1;
3130 w = PyString_FromStringAndSize(NULL,len);
3131 if (!w)
3132 goto onError;
3133 str = PyString_AS_STRING(w);
3134 for (i = 0; i < len; i++,str++) {
3135 register char c;
3136 register Py_ssize_t j;
3137
3138 c = tolower(*hex++);
3139 for (j = 0; j < (Py_ssize_t)sizeof(hexdigits); j++)
3140 if (c == hexdigits[j]) {
3141 *str = j << 4;
3142 break;
3143 }
3144 if (j == sizeof(hexdigits)) {
3145 DPRINTF("Failed: '%c' (%u) at %i\n",c,(unsigned int)c,i);
3146 Py_Error(PyExc_ValueError,
3147 "argument contains non-hex characters");
3148 }
3149
3150 c = tolower(*hex++);
3151 for (j = 0; j < (Py_ssize_t)sizeof(hexdigits); j++)
3152 if (c == hexdigits[j]) {
3153 *str += j;
3154 break;
3155 }
3156 if (j == sizeof(hexdigits)) {
3157 DPRINTF("Failed2: '%c' (%u) at %i\n",c,(unsigned int)c,i);
3158 Py_Error(PyExc_ValueError,
3159 "argument contains non-hex characters");
3160 }
3161 }
3162 return w;
3163
3164 onError:
3165 Py_XDECREF(w);
3166 return NULL;
3167 }
3168
3169 static
mxTextTools_IsASCII(PyObject * text,Py_ssize_t left,Py_ssize_t right)3170 int mxTextTools_IsASCII(PyObject *text,
3171 Py_ssize_t left,
3172 Py_ssize_t right)
3173 {
3174 if (PyString_Check(text)) {
3175 Py_ssize_t len;
3176 register Py_ssize_t i;
3177 register unsigned char *str = (unsigned char *)PyString_AS_STRING(text);
3178
3179 len = PyString_GET_SIZE(text);
3180 Py_CheckSequenceSlice(len, left, right);
3181 for (i = left; i < right; i++)
3182 if (str[i] >= 128)
3183 return 0;
3184 return 1;
3185 }
3186
3187 #ifdef HAVE_UNICODE
3188 else if (PyUnicode_Check(text)) {
3189 Py_ssize_t len;
3190 register Py_ssize_t i;
3191 register Py_UNICODE *str = PyUnicode_AS_UNICODE(text);
3192
3193 len = PyUnicode_GET_SIZE(text);
3194 Py_CheckSequenceSlice(len, left, right);
3195 for (i = left; i < right; i++)
3196 if (str[i] >= 128)
3197 return 0;
3198 return 1;
3199 }
3200 #endif
3201
3202 else
3203 Py_Error(PyExc_TypeError,
3204 "need string object");
3205
3206 onError:
3207 return -1;
3208 }
3209
3210 /* Takes a list of tuples (replacement,l,r,...) and produces a taglist
3211 suitable for mxTextTools_Join() which creates a copy of
3212 text where every slice [l:r] is replaced by the given replacement.
3213
3214 */
3215
3216 static
mxTextTools_Joinlist(PyObject * text,PyObject * list,Py_ssize_t pos,Py_ssize_t text_len)3217 PyObject *mxTextTools_Joinlist(PyObject *text,
3218 PyObject *list,
3219 Py_ssize_t pos,
3220 Py_ssize_t text_len)
3221 {
3222 PyObject *joinlist = 0;
3223 Py_ssize_t list_len;
3224 Py_ssize_t i;
3225 Py_ssize_t listitem = 0;
3226 Py_ssize_t listsize = INITIAL_LIST_SIZE;
3227
3228 if (PyString_Check(text)) {
3229 Py_CheckStringSlice(text, pos, text_len);
3230 }
3231 #ifdef HAVE_UNICODE
3232 else if (PyUnicode_Check(text)) {
3233 Py_CheckUnicodeSlice(text, pos, text_len);
3234 }
3235 #endif
3236 else
3237 Py_Error(PyExc_TypeError,
3238 "expected string or unicode");
3239
3240 Py_Assert(PyList_Check(list),
3241 PyExc_TypeError,
3242 "expected a list of tuples as second argument");
3243 list_len = PyList_GET_SIZE(list);
3244
3245 joinlist = PyList_New(listsize);
3246 if (joinlist == NULL)
3247 goto onError;
3248
3249 for (i = 0; i < list_len; i++) {
3250 register PyObject *t;
3251 register Py_ssize_t left, right;
3252
3253 t = PyList_GET_ITEM(list, i);
3254 Py_Assert(PyTuple_Check(t) &&
3255 (PyTuple_GET_SIZE(t) >= 3) &&
3256 (PyString_Check(PyTuple_GET_ITEM(t,0)) ||
3257 PyUnicode_Check(PyTuple_GET_ITEM(t,0))) &&
3258 PyInt_Check(PyTuple_GET_ITEM(t,1)) &&
3259 PyInt_Check(PyTuple_GET_ITEM(t,2)),
3260 PyExc_TypeError,
3261 "tuples must be of the form (string,int,int,...)");
3262 left = PyInt_AS_LONG(PyTuple_GET_ITEM(t,1));
3263 right = PyInt_AS_LONG(PyTuple_GET_ITEM(t,2));
3264
3265 Py_Assert(left >= pos,
3266 PyExc_ValueError,
3267 "list is not sorted ascending");
3268
3269 if (left > pos) { /* joinlist.append((text,pos,left)) */
3270 register PyObject *v;
3271 register PyObject *w;
3272
3273 v = PyTuple_New(3);
3274 if (v == NULL)
3275 goto onError;
3276
3277 Py_INCREF(text);
3278 PyTuple_SET_ITEM(v,0,text);
3279
3280 w = PyInt_FromLong(pos);
3281 if (w == NULL)
3282 goto onError;
3283 PyTuple_SET_ITEM(v,1,w);
3284
3285 w = PyTuple_GET_ITEM(t,1);
3286 Py_INCREF(w);
3287 PyTuple_SET_ITEM(v,2,w);
3288
3289 if (listitem < listsize)
3290 PyList_SET_ITEM(joinlist,listitem,v);
3291 else {
3292 PyList_Append(joinlist,v);
3293 Py_DECREF(v);
3294 }
3295 listitem++;
3296 }
3297
3298 /* joinlist.append(string) */
3299 if (listitem < listsize) {
3300 register PyObject *v = PyTuple_GET_ITEM(t,0);
3301 Py_INCREF(v);
3302 PyList_SET_ITEM(joinlist,listitem,v);
3303 }
3304 else
3305 PyList_Append(joinlist,PyTuple_GET_ITEM(t,0));
3306 listitem++;
3307
3308 pos = right;
3309 }
3310
3311 if (pos < text_len) { /* joinlist.append((text,pos,text_len)) */
3312 register PyObject *v;
3313 register PyObject *w;
3314
3315 v = PyTuple_New(3);
3316 if (v == NULL)
3317 goto onError;
3318
3319 Py_INCREF(text);
3320 PyTuple_SET_ITEM(v,0,text);
3321
3322 w = PyInt_FromLong(pos);
3323 if (w == NULL)
3324 goto onError;
3325 PyTuple_SET_ITEM(v,1,w);
3326
3327 w = PyInt_FromLong(text_len);
3328 if (w == NULL)
3329 goto onError;
3330 PyTuple_SET_ITEM(v,2,w);
3331
3332 if (listitem < listsize)
3333 PyList_SET_ITEM(joinlist,listitem,v);
3334 else {
3335 PyList_Append(joinlist,v);
3336 Py_DECREF(v);
3337 }
3338 listitem++;
3339 }
3340
3341 /* Resize list if necessary */
3342 if (listitem < listsize)
3343 PyList_SetSlice(joinlist,listitem,listsize,(PyObject*)NULL);
3344
3345 return joinlist;
3346
3347 onError:
3348
3349 Py_XDECREF(joinlist);
3350 return NULL;
3351 }
3352
3353 #ifdef HAVE_UNICODE
3354 static
mxTextTools_UnicodeCharSplit(PyObject * text,PyObject * separator,Py_ssize_t start,Py_ssize_t text_len)3355 PyObject *mxTextTools_UnicodeCharSplit(PyObject *text,
3356 PyObject *separator,
3357 Py_ssize_t start,
3358 Py_ssize_t text_len)
3359 {
3360 PyObject *list = NULL;
3361 register Py_ssize_t x;
3362 Py_ssize_t listitem = 0;
3363 Py_ssize_t listsize = INITIAL_LIST_SIZE;
3364 Py_UNICODE *tx;
3365 Py_UNICODE sep;
3366
3367 text = PyUnicode_FromObject(text);
3368 if (text == NULL) {
3369 separator = NULL;
3370 goto onError;
3371 }
3372 separator = PyUnicode_FromObject(separator);
3373 if (separator == NULL)
3374 goto onError;
3375
3376 Py_CheckUnicodeSlice(text, start, text_len);
3377
3378 Py_Assert(PyUnicode_GET_SIZE(separator) == 1,
3379 PyExc_TypeError,
3380 "separator must be a single character");
3381
3382 tx = PyUnicode_AS_UNICODE(text);
3383 sep = *PyUnicode_AS_UNICODE(separator);
3384
3385 list = PyList_New(listsize);
3386 if (!list)
3387 goto onError;
3388
3389 x = start;
3390 while (1) {
3391 PyObject *s;
3392 register Py_ssize_t z;
3393
3394 /* Skip to next separator */
3395 z = x;
3396 for (;x < text_len; x++)
3397 if (tx[x] == sep)
3398 break;
3399
3400 /* Append the slice to list */
3401 s = PyUnicode_FromUnicode(&tx[z], x - z);
3402 if (!s)
3403 goto onError;
3404 if (listitem < listsize)
3405 PyList_SET_ITEM(list,listitem,s);
3406 else {
3407 PyList_Append(list,s);
3408 Py_DECREF(s);
3409 }
3410 listitem++;
3411
3412 if (x == text_len)
3413 break;
3414
3415 /* Skip separator */
3416 x++;
3417 }
3418
3419 /* Resize list if necessary */
3420 if (listitem < listsize)
3421 PyList_SetSlice(list,listitem,listsize,(PyObject*)NULL);
3422
3423 Py_DECREF(text);
3424 Py_DECREF(separator);
3425 return list;
3426
3427 onError:
3428 Py_XDECREF(list);
3429 Py_XDECREF(text);
3430 Py_XDECREF(separator);
3431 return NULL;
3432 }
3433 #endif
3434
3435 static
mxTextTools_CharSplit(PyObject * text,PyObject * separator,Py_ssize_t start,Py_ssize_t text_len)3436 PyObject *mxTextTools_CharSplit(PyObject *text,
3437 PyObject *separator,
3438 Py_ssize_t start,
3439 Py_ssize_t text_len)
3440 {
3441 PyObject *list = 0;
3442 register Py_ssize_t x;
3443 Py_ssize_t listitem = 0;
3444 Py_ssize_t listsize = INITIAL_LIST_SIZE;
3445 char *tx;
3446 char sep;
3447
3448 #ifdef HAVE_UNICODE
3449 if (PyUnicode_Check(text) || PyUnicode_Check(separator))
3450 return mxTextTools_UnicodeCharSplit(text, separator,
3451 start, text_len);
3452 #endif
3453
3454 if (PyString_Check(text) && PyString_Check(separator)) {
3455 Py_CheckStringSlice(text, start, text_len);
3456 }
3457 else
3458 Py_Error(PyExc_TypeError,
3459 "text and separator must be strings or unicode");
3460
3461 Py_Assert(PyString_GET_SIZE(separator) == 1,
3462 PyExc_TypeError,
3463 "separator must be a single character");
3464
3465 tx = PyString_AS_STRING(text);
3466 sep = *PyString_AS_STRING(separator);
3467
3468 list = PyList_New(listsize);
3469 if (!list)
3470 goto onError;
3471
3472 x = start;
3473 while (1) {
3474 PyObject *s;
3475 register Py_ssize_t z;
3476
3477 /* Skip to next separator */
3478 z = x;
3479 for (;x < text_len; x++)
3480 if (tx[x] == sep)
3481 break;
3482
3483 /* Append the slice to list */
3484 s = PyString_FromStringAndSize(&tx[z], x - z);
3485 if (!s)
3486 goto onError;
3487 if (listitem < listsize)
3488 PyList_SET_ITEM(list,listitem,s);
3489 else {
3490 PyList_Append(list,s);
3491 Py_DECREF(s);
3492 }
3493 listitem++;
3494
3495 if (x == text_len)
3496 break;
3497
3498 /* Skip separator */
3499 x++;
3500 }
3501
3502 /* Resize list if necessary */
3503 if (listitem < listsize)
3504 PyList_SetSlice(list,listitem,listsize,(PyObject*)NULL);
3505
3506 return list;
3507
3508 onError:
3509 Py_XDECREF(list);
3510 return NULL;
3511 }
3512
3513 #ifdef HAVE_UNICODE
3514 static
mxTextTools_UnicodeSplitAt(PyObject * text,PyObject * separator,Py_ssize_t nth,Py_ssize_t start,Py_ssize_t text_len)3515 PyObject *mxTextTools_UnicodeSplitAt(PyObject *text,
3516 PyObject *separator,
3517 Py_ssize_t nth,
3518 Py_ssize_t start,
3519 Py_ssize_t text_len)
3520 {
3521 PyObject *tuple = 0;
3522 register Py_ssize_t x;
3523 PyObject *s;
3524 Py_UNICODE *tx;
3525 Py_UNICODE sep;
3526
3527 text = PyUnicode_FromObject(text);
3528 if (text == NULL) {
3529 separator = NULL;
3530 goto onError;
3531 }
3532 separator = PyUnicode_FromObject(separator);
3533 if (separator == NULL)
3534 goto onError;
3535
3536 Py_CheckUnicodeSlice(text, start, text_len);
3537
3538 Py_Assert(PyUnicode_GET_SIZE(separator) == 1,
3539 PyExc_TypeError,
3540 "separator must be a single character");
3541
3542 tx = PyUnicode_AS_UNICODE(text);
3543 sep = *PyUnicode_AS_UNICODE(separator);
3544
3545 tuple = PyTuple_New(2);
3546 if (!tuple)
3547 goto onError;
3548
3549 if (nth > 0) {
3550 /* Skip to nth separator from the left */
3551 x = start;
3552 while (1) {
3553 for (; x < text_len; x++)
3554 if (tx[x] == sep)
3555 break;
3556 if (--nth == 0 || x == text_len)
3557 break;
3558 x++;
3559 }
3560 }
3561 else if (nth < 0) {
3562 /* Skip to nth separator from the right */
3563 x = text_len - 1;
3564 while (1) {
3565 for (; x >= start; x--)
3566 if (tx[x] == sep)
3567 break;
3568 if (++nth == 0 || x < start)
3569 break;
3570 x--;
3571 }
3572 }
3573 else
3574 Py_Error(PyExc_ValueError,
3575 "nth must be non-zero");
3576
3577 /* Add to tuple */
3578 if (x < start)
3579 s = PyUnicode_FromUnicode((Py_UNICODE *)"", 0);
3580 else
3581 s = PyUnicode_FromUnicode(&tx[start], x - start);
3582 if (!s)
3583 goto onError;
3584 PyTuple_SET_ITEM(tuple,0,s);
3585
3586 /* Skip separator */
3587 x++;
3588
3589 if (x >= text_len)
3590 s = PyUnicode_FromUnicode((Py_UNICODE *)"", 0);
3591 else
3592 s = PyUnicode_FromUnicode(&tx[x], text_len - x);
3593 if (!s)
3594 goto onError;
3595 PyTuple_SET_ITEM(tuple,1,s);
3596
3597 Py_DECREF(text);
3598 Py_DECREF(separator);
3599 return tuple;
3600
3601 onError:
3602 Py_XDECREF(tuple);
3603 Py_XDECREF(text);
3604 Py_XDECREF(separator);
3605 return NULL;
3606 }
3607 #endif
3608
3609 static
mxTextTools_SplitAt(PyObject * text,PyObject * separator,Py_ssize_t nth,Py_ssize_t start,Py_ssize_t text_len)3610 PyObject *mxTextTools_SplitAt(PyObject *text,
3611 PyObject *separator,
3612 Py_ssize_t nth,
3613 Py_ssize_t start,
3614 Py_ssize_t text_len)
3615 {
3616 PyObject *tuple = 0;
3617 register Py_ssize_t x;
3618 PyObject *s;
3619 char *tx;
3620 char sep;
3621
3622 #ifdef HAVE_UNICODE
3623 if (PyUnicode_Check(text) || PyUnicode_Check(separator))
3624 return mxTextTools_UnicodeSplitAt(text, separator,
3625 nth, start, text_len);
3626 #endif
3627
3628 if (PyString_Check(text) && PyString_Check(separator)) {
3629 Py_CheckStringSlice(text, start, text_len);
3630 }
3631 else
3632 Py_Error(PyExc_TypeError,
3633 "text and separator must be strings or unicode");
3634
3635 Py_Assert(PyString_GET_SIZE(separator) == 1,
3636 PyExc_TypeError,
3637 "separator must be a single character");
3638
3639 tx = PyString_AS_STRING(text);
3640 sep = *PyString_AS_STRING(separator);
3641
3642 tuple = PyTuple_New(2);
3643 if (!tuple)
3644 goto onError;
3645
3646 if (nth > 0) {
3647 /* Skip to nth separator from the left */
3648 x = start;
3649 while (1) {
3650 for (; x < text_len; x++)
3651 if (tx[x] == sep)
3652 break;
3653 if (--nth == 0 || x == text_len)
3654 break;
3655 x++;
3656 }
3657 }
3658 else if (nth < 0) {
3659 /* Skip to nth separator from the right */
3660 x = text_len - 1;
3661 while (1) {
3662 for (; x >= start; x--)
3663 if (tx[x] == sep)
3664 break;
3665 if (++nth == 0 || x < start)
3666 break;
3667 x--;
3668 }
3669 }
3670 else
3671 Py_Error(PyExc_ValueError,
3672 "nth must be non-zero");
3673
3674 /* Add to tuple */
3675 if (x < start)
3676 s = PyString_FromStringAndSize("",0);
3677 else
3678 s = PyString_FromStringAndSize(&tx[start], x - start);
3679 if (!s)
3680 goto onError;
3681 PyTuple_SET_ITEM(tuple,0,s);
3682
3683 /* Skip separator */
3684 x++;
3685
3686 if (x >= text_len)
3687 s = PyString_FromStringAndSize("",0);
3688 else
3689 s = PyString_FromStringAndSize(&tx[x], text_len - x);
3690 if (!s)
3691 goto onError;
3692 PyTuple_SET_ITEM(tuple,1,s);
3693
3694 return tuple;
3695
3696 onError:
3697 Py_XDECREF(tuple);
3698 return NULL;
3699 }
3700
#ifdef HAVE_UNICODE
/* Unicode variant of suffix().

   Coerces text to Unicode, adjusts start/text_len with
   Py_CheckUnicodeSlice() and returns a new reference to the first
   entry of the tuple suffixes (coerced to Unicode) that text[start:text_len]
   ends with. Returns None if no entry matches and NULL with an
   exception set on error. translate must be NULL; translation is not
   supported for Unicode. */
static
PyObject *mxTextTools_UnicodeSuffix(PyObject *text,
                                    PyObject *suffixes,
                                    Py_ssize_t start,
                                    Py_ssize_t text_len,
                                    PyObject *translate)
{
    Py_UNICODE *data;
    Py_ssize_t idx;

    /* From here on we own a reference to text */
    text = PyUnicode_FromObject(text);
    if (text == NULL)
        goto onError;

    if (!PyUnicode_Check(text))
        Py_Error(PyExc_TypeError,
                 "expected unicode");
    Py_CheckUnicodeSlice(text, start, text_len);

    Py_Assert(PyTuple_Check(suffixes),
              PyExc_TypeError,
              "suffixes needs to be a tuple of unicode strings");

    /* XXX Add support for translate... */
    Py_Assert(translate == NULL,
              PyExc_TypeError,
              "translate is not supported for Unicode suffix()es");

    data = PyUnicode_AS_UNICODE(text);

    for (idx = 0; idx < PyTuple_GET_SIZE(suffixes); idx++) {
        PyObject *candidate;
        Py_ssize_t offset;
        int matches;

        candidate = PyUnicode_FromObject(PyTuple_GET_ITEM(suffixes, idx));
        if (candidate == NULL)
            goto onError;

        /* Position in text at which the candidate would have to begin */
        offset = text_len - PyUnicode_GET_SIZE(candidate);
        matches = (offset >= start &&
                   PyUnicode_AS_UNICODE(candidate)[0] == data[offset] &&
                   memcmp(PyUnicode_AS_UNICODE(candidate),
                          &data[offset],
                          PyUnicode_GET_DATA_SIZE(candidate)) == 0);
        if (matches) {
            /* candidate already is a new reference; hand it out */
            Py_DECREF(text);
            return candidate;
        }

        Py_DECREF(candidate);
    }

    Py_DECREF(text);
    Py_ReturnNone();

 onError:
    Py_XDECREF(text);
    return NULL;
}
#endif
3762
3763 static
mxTextTools_Suffix(PyObject * text,PyObject * suffixes,Py_ssize_t start,Py_ssize_t text_len,PyObject * translate)3764 PyObject *mxTextTools_Suffix(PyObject *text,
3765 PyObject *suffixes,
3766 Py_ssize_t start,
3767 Py_ssize_t text_len,
3768 PyObject *translate)
3769 {
3770 Py_ssize_t i;
3771 char *tx;
3772
3773 #ifdef HAVE_UNICODE
3774 if (PyUnicode_Check(text))
3775 return mxTextTools_UnicodeSuffix(text, suffixes,
3776 start, text_len,
3777 translate);
3778 #endif
3779
3780 if (PyString_Check(text)) {
3781 Py_CheckStringSlice(text, start, text_len);
3782 }
3783 else
3784 Py_Error(PyExc_TypeError,
3785 "expected string or unicode");
3786 Py_Assert(PyTuple_Check(suffixes),
3787 PyExc_TypeError,
3788 "suffixes needs to be a tuple of strings");
3789 tx = PyString_AS_STRING(text);
3790
3791 if (translate) {
3792 char *tr;
3793
3794 Py_Assert(PyString_Check(translate) &&
3795 PyString_GET_SIZE(translate) == 256,
3796 PyExc_TypeError,
3797 "translate must be a string having 256 characters");
3798 tr = PyString_AS_STRING(translate);
3799
3800 for (i = 0; i < PyTuple_GET_SIZE(suffixes); i++) {
3801 PyObject *suffix = PyTuple_GET_ITEM(suffixes, i);
3802 Py_ssize_t start_cmp;
3803 register char *s;
3804 register char *t;
3805 register Py_ssize_t j;
3806
3807 Py_AssertWithArg(PyString_Check(suffix),
3808 PyExc_TypeError,
3809 "tuple entry %d is not a string",(unsigned int)i);
3810 start_cmp = text_len - PyString_GET_SIZE(suffix);
3811 if (start_cmp < start)
3812 continue;
3813
3814 /* Do the compare using a translate table */
3815 s = PyString_AS_STRING(suffix);
3816 t = tx + start_cmp;
3817 for (j = start_cmp; j < text_len; j++, s++, t++)
3818 if (*s != tr[(unsigned char)*t])
3819 break;
3820 if (j == text_len) {
3821 Py_INCREF(suffix);
3822 return suffix;
3823 }
3824 }
3825 }
3826
3827 else
3828 for (i = 0; i < PyTuple_GET_SIZE(suffixes); i++) {
3829 PyObject *suffix = PyTuple_GET_ITEM(suffixes,i);
3830 Py_ssize_t start_cmp;
3831
3832 Py_AssertWithArg(PyString_Check(suffix),
3833 PyExc_TypeError,
3834 "tuple entry %d is not a string",(unsigned int)i);
3835 start_cmp = text_len - PyString_GET_SIZE(suffix);
3836 if (start_cmp < start)
3837 continue;
3838
3839 /* Compare without translate table */
3840 if (PyString_AS_STRING(suffix)[0] == tx[start_cmp]
3841 &&
3842 strncmp(PyString_AS_STRING(suffix),
3843 &tx[start_cmp],
3844 PyString_GET_SIZE(suffix)) == 0) {
3845 Py_INCREF(suffix);
3846 return suffix;
3847 }
3848 }
3849
3850 Py_ReturnNone();
3851
3852 onError:
3853 return NULL;
3854 }
3855
#ifdef HAVE_UNICODE
/* Unicode variant of prefix().

   Coerces text to Unicode, adjusts start/text_len with
   Py_CheckUnicodeSlice() and returns a new reference to the first
   entry of the tuple prefixes (coerced to Unicode) that
   text[start:text_len] starts with. Returns None if no entry matches
   and NULL with an exception set on error. translate must be NULL;
   translation is not supported for Unicode. */
static
PyObject *mxTextTools_UnicodePrefix(PyObject *text,
                                    PyObject *prefixes,
                                    Py_ssize_t start,
                                    Py_ssize_t text_len,
                                    PyObject *translate)
{
    Py_ssize_t i;
    Py_UNICODE *tx;

    /* From here on we own a reference to text */
    text = PyUnicode_FromObject(text);
    if (text == NULL)
        goto onError;

    if (PyUnicode_Check(text)) {
        Py_CheckUnicodeSlice(text, start, text_len);
    }
    else
        Py_Error(PyExc_TypeError,
                 "expected unicode");
    Py_Assert(PyTuple_Check(prefixes),
              PyExc_TypeError,
              "prefixes needs to be a tuple of unicode strings");

    /* XXX Add support for translate... */
    Py_Assert(translate == NULL,
              PyExc_TypeError,
              "translate is not supported for Unicode prefix()es");

    tx = PyUnicode_AS_UNICODE(text);

    for (i = 0; i < PyTuple_GET_SIZE(prefixes); i++) {
        PyObject *prefix = PyTuple_GET_ITEM(prefixes,i);

        prefix = PyUnicode_FromObject(prefix);
        if (prefix == NULL)
            goto onError;

        /* Compare without translate table. prefix is a Unicode object,
           so its length has to be read with the Unicode size macro. */
        if (start + PyUnicode_GET_SIZE(prefix) <= text_len &&
            PyUnicode_AS_UNICODE(prefix)[0] == tx[start] &&
            memcmp(PyUnicode_AS_UNICODE(prefix),
                   &tx[start],
                   PyUnicode_GET_DATA_SIZE(prefix)) == 0) {
            /* prefix already is a new reference from
               PyUnicode_FromObject(): return it as is and release our
               reference to text. */
            Py_DECREF(text);
            return prefix;
        }

        Py_DECREF(prefix);
    }

    Py_DECREF(text);
    Py_ReturnNone();

 onError:
    Py_XDECREF(text);
    return NULL;
}
#endif
3916
3917 static
mxTextTools_Prefix(PyObject * text,PyObject * prefixes,Py_ssize_t start,Py_ssize_t text_len,PyObject * translate)3918 PyObject *mxTextTools_Prefix(PyObject *text,
3919 PyObject *prefixes,
3920 Py_ssize_t start,
3921 Py_ssize_t text_len,
3922 PyObject *translate)
3923 {
3924 Py_ssize_t i;
3925 char *tx;
3926
3927 #ifdef HAVE_UNICODE
3928 if (PyUnicode_Check(text))
3929 return mxTextTools_UnicodePrefix(text, prefixes,
3930 start, text_len,
3931 translate);
3932 #endif
3933
3934 if (PyString_Check(text)) {
3935 Py_CheckStringSlice(text, start, text_len);
3936 }
3937 else
3938 Py_Error(PyExc_TypeError,
3939 "expected string or unicode");
3940 Py_Assert(PyTuple_Check(prefixes),
3941 PyExc_TypeError,
3942 "prefixes needs to be a tuple of strings");
3943 tx = PyString_AS_STRING(text);
3944
3945 if (translate) {
3946 char *tr;
3947
3948 Py_Assert(PyString_Check(translate) &&
3949 PyString_GET_SIZE(translate) == 256,
3950 PyExc_TypeError,
3951 "translate must be a string having 256 characters");
3952 tr = PyString_AS_STRING(translate);
3953
3954 for (i = 0; i < PyTuple_GET_SIZE(prefixes); i++) {
3955 PyObject *prefix = PyTuple_GET_ITEM(prefixes,i);
3956 Py_ssize_t cmp_len;
3957 register char *s;
3958 register char *t;
3959 register Py_ssize_t j;
3960
3961 Py_AssertWithArg(PyString_Check(prefix),
3962 PyExc_TypeError,
3963 "tuple entry %d is not a string",(unsigned int)i);
3964 cmp_len = PyString_GET_SIZE(prefix);
3965 if (start + cmp_len > text_len)
3966 continue;
3967
3968 /* Do the compare using a translate table */
3969 s = PyString_AS_STRING(prefix);
3970 t = tx + start;
3971 for (j = 0; j < cmp_len; j++, s++, t++)
3972 if (*s != tr[(unsigned char)*t])
3973 break;
3974 if (j == cmp_len) {
3975 Py_INCREF(prefix);
3976 return prefix;
3977 }
3978 }
3979 }
3980
3981 else
3982 for (i = 0; i < PyTuple_GET_SIZE(prefixes); i++) {
3983 PyObject *prefix = PyTuple_GET_ITEM(prefixes,i);
3984
3985 Py_AssertWithArg(PyString_Check(prefix),
3986 PyExc_TypeError,
3987 "tuple entry %d is not a string",(unsigned int)i);
3988 if (start + PyString_GET_SIZE(prefix) > text_len)
3989 continue;
3990
3991 /* Compare without translate table */
3992 if (PyString_AS_STRING(prefix)[0] == tx[start] &&
3993 strncmp(PyString_AS_STRING(prefix),
3994 &tx[start],
3995 PyString_GET_SIZE(prefix)) == 0) {
3996 Py_INCREF(prefix);
3997 return prefix;
3998 }
3999 }
4000
4001 Py_ReturnNone();
4002
4003 onError:
4004 return NULL;
4005 }
4006
4007 /* Stips off characters appearing in the character set from text[start:stop]
4008 and returns the result as Python string object.
4009
4010 where indicates the mode:
4011 where < 0: strip left only
4012 where = 0: strip left and right
4013 where > 0: strip right only
4014
4015 */
4016 static
mxTextTools_SetStrip(char * tx,Py_ssize_t tx_len,char * setstr,Py_ssize_t setstr_len,Py_ssize_t start,Py_ssize_t stop,Py_ssize_t where)4017 PyObject *mxTextTools_SetStrip(char *tx,
4018 Py_ssize_t tx_len,
4019 char *setstr,
4020 Py_ssize_t setstr_len,
4021 Py_ssize_t start,
4022 Py_ssize_t stop,
4023 Py_ssize_t where)
4024 {
4025 Py_ssize_t left, right;
4026
4027 Py_Assert(setstr_len == 32,
4028 PyExc_TypeError,
4029 "separator needs to be a set as obtained from set()");
4030 Py_CheckBufferSlice(tx_len, start, stop);
4031
4032 /* Strip left */
4033 if (where <= 0) {
4034 register Py_ssize_t x;
4035 for (x = start; x < stop; x++)
4036 if (!Py_CharInSet(tx[x], setstr))
4037 break;
4038 left = x;
4039 }
4040 else
4041 left = start;
4042
4043 /* Strip right */
4044 if (where >= 0) {
4045 register Py_ssize_t x;
4046 for (x = stop - 1; x >= start; x--)
4047 if (!Py_CharInSet(tx[x], setstr))
4048 break;
4049 right = x + 1;
4050 }
4051 else
4052 right = stop;
4053
4054 return PyString_FromStringAndSize(tx + left, max(right - left, 0));
4055
4056 onError:
4057 return NULL;
4058 }
4059
4060 static
mxTextTools_SetSplit(char * tx,Py_ssize_t tx_len,char * setstr,Py_ssize_t setstr_len,Py_ssize_t start,Py_ssize_t text_len)4061 PyObject *mxTextTools_SetSplit(char *tx,
4062 Py_ssize_t tx_len,
4063 char *setstr,
4064 Py_ssize_t setstr_len,
4065 Py_ssize_t start,
4066 Py_ssize_t text_len)
4067 {
4068 PyObject *list = NULL;
4069 register Py_ssize_t x;
4070 Py_ssize_t listitem = 0;
4071 Py_ssize_t listsize = INITIAL_LIST_SIZE;
4072
4073 Py_Assert(setstr_len == 32,
4074 PyExc_TypeError,
4075 "separator needs to be a set as obtained from set()");
4076 Py_CheckBufferSlice(tx_len,start,text_len);
4077
4078 list = PyList_New(listsize);
4079 if (!list)
4080 goto onError;
4081
4082 x = start;
4083 while (x < text_len) {
4084 Py_ssize_t z;
4085
4086 /* Skip all text in set */
4087 for (;x < text_len; x++) {
4088 register Py_ssize_t c = (unsigned char)tx[x];
4089 register Py_ssize_t block = (unsigned char)setstr[c >> 3];
4090 if (!block || ((block & (1 << (c & 7))) == 0))
4091 break;
4092 }
4093
4094 /* Skip all text not in set */
4095 z = x;
4096 for (;x < text_len; x++) {
4097 register Py_ssize_t c = (unsigned char)tx[x];
4098 register Py_ssize_t block = (unsigned char)setstr[c >> 3];
4099 if (block && ((block & (1 << (c & 7))) != 0))
4100 break;
4101 }
4102
4103 /* Append the slice to list if it is not empty */
4104 if (x > z) {
4105 PyObject *s;
4106 s = PyString_FromStringAndSize((char *)&tx[z], x - z);
4107 if (!s)
4108 goto onError;
4109 if (listitem < listsize)
4110 PyList_SET_ITEM(list,listitem,s);
4111 else {
4112 PyList_Append(list,s);
4113 Py_DECREF(s);
4114 }
4115 listitem++;
4116 }
4117 }
4118
4119 /* Resize list if necessary */
4120 if (listitem < listsize)
4121 PyList_SetSlice(list,listitem,listsize,(PyObject*)NULL);
4122
4123 return list;
4124
4125 onError:
4126 Py_XDECREF(list);
4127 return NULL;
4128 }
4129
4130 static
mxTextTools_SetSplitX(char * tx,Py_ssize_t tx_len,char * setstr,Py_ssize_t setstr_len,Py_ssize_t start,Py_ssize_t text_len)4131 PyObject *mxTextTools_SetSplitX(char *tx,
4132 Py_ssize_t tx_len,
4133 char *setstr,
4134 Py_ssize_t setstr_len,
4135 Py_ssize_t start,
4136 Py_ssize_t text_len)
4137 {
4138 PyObject *list = NULL;
4139 register Py_ssize_t x;
4140 Py_ssize_t listitem = 0;
4141 Py_ssize_t listsize = INITIAL_LIST_SIZE;
4142
4143 Py_Assert(setstr_len == 32,
4144 PyExc_TypeError,
4145 "separator needs to be a set as obtained from set()");
4146 Py_CheckBufferSlice(tx_len,start,text_len);
4147
4148 list = PyList_New(listsize);
4149 if (!list)
4150 goto onError;
4151
4152 x = start;
4153 while (x < text_len) {
4154 PyObject *s;
4155 register Py_ssize_t z;
4156
4157 /* Skip all text not in set */
4158 z = x;
4159 for (;x < text_len; x++) {
4160 register unsigned int c = (unsigned char)tx[x];
4161 register unsigned int block = (unsigned char)setstr[c >> 3];
4162 if (block && ((block & (1 << (c & 7))) != 0))
4163 break;
4164 }
4165
4166 /* Append the slice to list */
4167 s = PyString_FromStringAndSize((char *)&tx[z], x - z);
4168 if (!s)
4169 goto onError;
4170 if (listitem < listsize)
4171 PyList_SET_ITEM(list,listitem,s);
4172 else {
4173 PyList_Append(list,s);
4174 Py_DECREF(s);
4175 }
4176 listitem++;
4177
4178 if (x >= text_len)
4179 break;
4180
4181 /* Skip all text in set */
4182 z = x;
4183 for (;x < text_len; x++) {
4184 register unsigned int c = (unsigned char)tx[x];
4185 register unsigned int block = (unsigned char)setstr[c >> 3];
4186 if (!block || ((block & (1 << (c & 7))) == 0))
4187 break;
4188 }
4189
4190 /* Append the slice to list if it is not empty */
4191 s = PyString_FromStringAndSize((char *)&tx[z], x - z);
4192 if (!s)
4193 goto onError;
4194 if (listitem < listsize)
4195 PyList_SET_ITEM(list,listitem,s);
4196 else {
4197 PyList_Append(list,s);
4198 Py_DECREF(s);
4199 }
4200 listitem++;
4201 }
4202
4203 /* Resize list if necessary */
4204 if (listitem < listsize)
4205 PyList_SetSlice(list,listitem,listsize,(PyObject*)NULL);
4206
4207 return list;
4208
4209 onError:
4210 Py_XDECREF(list);
4211 return NULL;
4212 }
4213
4214 static
mxTextTools_Upper(PyObject * text)4215 PyObject *mxTextTools_Upper(PyObject *text)
4216 {
4217 PyObject *ntext;
4218 register unsigned char *s;
4219 register unsigned char *orig;
4220 register Py_ssize_t i;
4221 unsigned char *tr;
4222 Py_ssize_t len;
4223
4224 Py_Assert(PyString_Check(text),
4225 PyExc_TypeError,
4226 "expected a Python string");
4227
4228 len = PyString_GET_SIZE(text);
4229 ntext = PyString_FromStringAndSize(NULL,len);
4230 if (!ntext)
4231 goto onError;
4232
4233 /* Translate */
4234 tr = (unsigned char *)PyString_AS_STRING(mx_ToUpper);
4235 orig = (unsigned char *)PyString_AS_STRING(text);
4236 s = (unsigned char *)PyString_AS_STRING(ntext);
4237 for (i = 0; i < len; i++, s++, orig++)
4238 *s = tr[*orig];
4239
4240 return ntext;
4241
4242 onError:
4243 return NULL;
4244 }
4245
#ifdef HAVE_UNICODE
/* Coerces text to Unicode and returns a new Unicode object of the same
   length in which every code unit has been converted with
   Py_UNICODE_TOUPPER(). Returns NULL with an exception set on
   error. */
static
PyObject *mxTextTools_UnicodeUpper(PyObject *text)
{
    PyObject *result;
    Py_UNICODE *src;
    Py_UNICODE *dst;
    Py_ssize_t size;
    Py_ssize_t pos;

    /* From here on we own a reference to text */
    text = PyUnicode_FromObject(text);
    if (text == NULL)
        goto onError;

    size = PyUnicode_GET_SIZE(text);
    result = PyUnicode_FromUnicode(NULL, size);
    if (result == NULL)
        goto onError;

    /* Translate code unit by code unit into the fresh buffer */
    src = (Py_UNICODE *)PyUnicode_AS_UNICODE(text);
    dst = (Py_UNICODE *)PyUnicode_AS_UNICODE(result);
    for (pos = 0; pos < size; pos++)
        dst[pos] = Py_UNICODE_TOUPPER(src[pos]);

    Py_DECREF(text);
    return result;

 onError:
    Py_XDECREF(text);
    return NULL;
}
#endif
4279
4280 static
mxTextTools_Lower(PyObject * text)4281 PyObject *mxTextTools_Lower(PyObject *text)
4282 {
4283 PyObject *ntext;
4284 register unsigned char *s;
4285 register unsigned char *orig;
4286 register Py_ssize_t i;
4287 unsigned char *tr;
4288 Py_ssize_t len;
4289
4290 Py_Assert(PyString_Check(text),
4291 PyExc_TypeError,
4292 "expected a Python string");
4293
4294 len = PyString_GET_SIZE(text);
4295 ntext = PyString_FromStringAndSize(NULL,len);
4296 if (!ntext)
4297 goto onError;
4298
4299 /* Translate */
4300 tr = (unsigned char *)PyString_AS_STRING(mx_ToLower);
4301 orig = (unsigned char *)PyString_AS_STRING(text);
4302 s = (unsigned char *)PyString_AS_STRING(ntext);
4303 for (i = 0; i < len; i++, s++, orig++)
4304 *s = tr[*orig];
4305
4306 return ntext;
4307
4308 onError:
4309 return NULL;
4310 }
4311
#ifdef HAVE_UNICODE
/* Coerces text to Unicode and returns a new Unicode object of the same
   length in which every code unit has been converted with
   Py_UNICODE_TOLOWER(). Returns NULL with an exception set on
   error. */
static
PyObject *mxTextTools_UnicodeLower(PyObject *text)
{
    PyObject *result;
    Py_UNICODE *src;
    Py_UNICODE *dst;
    Py_ssize_t size;
    Py_ssize_t pos;

    /* From here on we own a reference to text */
    text = PyUnicode_FromObject(text);
    if (text == NULL)
        goto onError;

    size = PyUnicode_GET_SIZE(text);
    result = PyUnicode_FromUnicode(NULL, size);
    if (result == NULL)
        goto onError;

    /* Translate code unit by code unit into the fresh buffer */
    src = (Py_UNICODE *)PyUnicode_AS_UNICODE(text);
    dst = (Py_UNICODE *)PyUnicode_AS_UNICODE(result);
    for (pos = 0; pos < size; pos++)
        dst[pos] = Py_UNICODE_TOLOWER(src[pos]);

    Py_DECREF(text);
    return result;

 onError:
    Py_XDECREF(text);
    return NULL;
}
#endif
4345
4346 /* --- Module functions ------------------------------------------------*/
4347
4348 /* Interface to the tagging engine in mxte.c */
4349
4350 Py_C_Function_WithKeywords(
4351 mxTextTools_tag,
4352 "tag(text,tagtable,sliceleft=0,sliceright=len(text),taglist=[],context=None) \n"""
4353 "Produce a tag list for a string, given a tag-table\n"
4354 "- returns a tuple (success, taglist, nextindex)\n"
4355 "- if taglist == None, then no taglist is created"
4356 )
4357 {
4358 PyObject *text;
4359 PyObject *tagtable;
4360 Py_ssize_t sliceright = INT_MAX;
4361 Py_ssize_t sliceleft = 0;
4362 PyObject *taglist = 0;
4363 Py_ssize_t taglist_len;
4364 PyObject *context = 0;
4365 Py_ssize_t next, result;
4366 PyObject *res;
4367
4368 Py_KeywordsGet6Args("OO|iiOO:tag",
4369 text,tagtable,sliceleft,sliceright,taglist,context);
4370
4371 if (taglist == NULL) {
4372 /* not given, so use default: an empty list */
4373 taglist = PyList_New(0);
4374 if (taglist == NULL)
4375 goto onError;
4376 taglist_len = 0;
4377 }
4378 else {
4379 Py_INCREF(taglist);
4380 Py_Assert(PyList_Check(taglist) || taglist == Py_None,
4381 PyExc_TypeError,
4382 "taglist must be a list or None");
4383 if (taglist != Py_None) {
4384 taglist_len = PyList_Size(taglist);
4385 if (taglist_len < 0)
4386 goto onError;
4387 }
4388 else
4389 taglist_len = 0;
4390 }
4391
4392 Py_Assert(mxTagTable_Check(tagtable) ||
4393 PyTuple_Check(tagtable) ||
4394 PyList_Check(tagtable),
4395 PyExc_TypeError,
4396 "tagtable must be a TagTable instance, list or tuple");
4397
4398 /* Prepare the argument for the Tagging Engine and let it process
4399 the request */
4400 if (PyString_Check(text)) {
4401
4402 Py_CheckStringSlice(text, sliceleft, sliceright);
4403
4404 if (!mxTagTable_Check(tagtable)) {
4405 tagtable = mxTagTable_New(tagtable, MXTAGTABLE_STRINGTYPE, 1);
4406 if (tagtable == NULL)
4407 goto onError;
4408 }
4409 else if (mxTagTable_Type(tagtable) != MXTAGTABLE_STRINGTYPE) {
4410 Py_Error(PyExc_TypeError,
4411 "TagTable instance is not intended for parsing strings");
4412 }
4413 else
4414 Py_INCREF(tagtable);
4415
4416 /* Call the Tagging Engine */
4417 result = mxTextTools_TaggingEngine(text,
4418 sliceleft,
4419 sliceright,
4420 (mxTagTableObject *)tagtable,
4421 taglist,
4422 context,
4423 &next);
4424 Py_DECREF(tagtable);
4425
4426 }
4427 #ifdef HAVE_UNICODE
4428 else if (PyUnicode_Check(text)) {
4429
4430 Py_CheckUnicodeSlice(text, sliceleft, sliceright);
4431
4432 if (!mxTagTable_Check(tagtable)) {
4433 tagtable = mxTagTable_New(tagtable, 1, 1);
4434 if (tagtable == NULL)
4435 goto onError;
4436 }
4437 else if (mxTagTable_Type(tagtable) != MXTAGTABLE_UNICODETYPE) {
4438 Py_Error(PyExc_TypeError,
4439 "TagTable instance is not intended for parsing Unicode");
4440 }
4441 else
4442 Py_INCREF(tagtable);
4443
4444 /* Call the Tagging Engine */
4445 result = mxTextTools_UnicodeTaggingEngine(text,
4446 sliceleft,
4447 sliceright,
4448 (mxTagTableObject *)tagtable,
4449 taglist,
4450 context,
4451 &next);
4452 Py_DECREF(tagtable);
4453
4454 }
4455 #endif
4456 else
4457 Py_Error(PyExc_TypeError,
4458 "text must be a string or unicode");
4459
4460 /* Check for exceptions during matching */
4461 if (result == 0)
4462 goto onError;
4463
4464 /* Undo changes to taglist in case of a match failure (result == 1) */
4465 if (result == 1 && taglist != Py_None) {
4466 DPRINTF(" undoing changes: del taglist[%i:%i]\n",
4467 taglist_len, PyList_Size(taglist));
4468 if (PyList_SetSlice(taglist,
4469 taglist_len,
4470 PyList_Size(taglist),
4471 NULL))
4472 goto onError;
4473 }
4474
4475 /* Convert result to the documented external values:
4476 0 - no match, 1 - match. */
4477 result--;
4478
4479 /* Build result tuple */
4480 res = PyTuple_New(3);
4481 if (!res)
4482 goto onError;
4483 PyTuple_SET_ITEM(res,0,PyInt_FromLong(result));
4484 PyTuple_SET_ITEM(res,1,taglist);
4485 PyTuple_SET_ITEM(res,2,PyInt_FromLong(next));
4486 return res;
4487
4488 onError:
4489 if (!PyErr_Occurred())
4490 Py_Error(PyExc_SystemError,
4491 "NULL result without error in builtin tag()");
4492 Py_XDECREF(taglist);
4493 return NULL;
4494 }
4495
4496 /* An extended version of string.join() for taglists: */
4497
4498 Py_C_Function( mxTextTools_join,
4499 "join(joinlist,sep='',start=0,stop=len(joinlist))\n\n"
4500 "Copy snippets from different strings together producing a\n"
4501 "new string\n"
4502 "The first argument must be a list of tuples or strings;\n"
4503 "tuples must be of the form (string,l,r[,...]) and turn out\n"
4504 "as string[l:r]\n"
4505 "NOTE: the syntax used for negative slices is different\n"
4506 "than the Python standard: -1 corresponds to the first\n"
4507 "character *after* the string, e.g. ('Example',0,-1) gives\n"
4508 "'Example' and not 'Exampl', like in Python\n"
4509 "sep is an optional separator string, start and stop\n"
4510 "define the slice of joinlist that is taken into accont."
4511 )
4512 {
4513 PyObject *joinlist = NULL;
4514 Py_ssize_t joinlist_len;
4515 PyObject *separator = NULL;
4516 Py_ssize_t start=0, stop=INT_MAX;
4517
4518 Py_Get4Args("O|Oii:join",
4519 joinlist,separator,start,stop);
4520
4521 Py_Assert(PySequence_Check(joinlist),
4522 PyExc_TypeError,
4523 "first argument needs to be a sequence");
4524
4525 joinlist_len = PySequence_Length(joinlist);
4526 Py_Assert(joinlist_len >= 0,
4527 PyExc_TypeError,
4528 "first argument needs to have a __len__ method");
4529
4530 Py_CheckSequenceSlice(joinlist_len, start, stop);
4531
4532 /* Short-cut */
4533 if ((stop - start) <= 0)
4534 return PyString_FromString("");
4535
4536 return mxTextTools_Join(joinlist,
4537 start, stop,
4538 separator);
4539
4540 onError:
4541 return NULL;
4542 }
4543
4544 /*
4545 Special compare function for taglist-tuples, comparing
4546 the text-slices given:
4547 - slices starting at a smaller index come first
4548 - for slices starting at the same index, the longer one
4549 wins
4550 */
4551
4552 Py_C_Function( mxTextTools_cmp,
4553 "cmp(a,b)\n\n"
4554 "Compare two valid taglist tuples w/r to their slice\n"
4555 "position; this is useful for sorting joinlists.")
4556 {
4557 PyObject *v,*w;
4558 short index;
4559 int cmp;
4560
4561 Py_Get2Args("OO:cmp",v,w);
4562
4563 Py_Assert(PyTuple_Check(v) && PyTuple_Check(w) &&
4564 PyTuple_GET_SIZE(v) >= 3 && PyTuple_GET_SIZE(w) >= 3,
4565 PyExc_TypeError,
4566 "invalid taglist-tuple");
4567
4568 for (index = 1; index < 3; index++) {
4569 cmp = PyObject_RichCompareBool(PyTuple_GET_ITEM(v,1),PyTuple_GET_ITEM(w,1),Py_LT);
4570 if (cmp)
4571 return PyInt_FromLong(cmp);
4572 cmp = PyObject_RichCompareBool(PyTuple_GET_ITEM(v,2),PyTuple_GET_ITEM(w,2), Py_GT);
4573 if (cmp)
4574 return PyInt_FromLong(cmp);
4575 }
4576 return PyInt_FromLong(0);
4577
4578 onError:
4579 return NULL;
4580 }
4581
4582 Py_C_Function( mxTextTools_joinlist,
4583 "joinlist(text,list,start=0,stop=len(text))\n\n"
4584 "Takes a list of tuples (replacement,l,r,...) and produces\n"
4585 "a taglist suitable for join() which creates a copy\n"
4586 "of text where every slice [l:r] is replaced by the\n"
4587 "given replacement\n"
4588 "- the list must be sorted using cmp() as compare function\n"
4589 "- it may not contain overlapping slices\n"
4590 "- the slices may not contain negative indices\n"
4591 "- if the taglist cannot contain overlapping slices, you can\n"
4592 " give this function the taglist produced by tag() directly\n"
4593 " (sorting is not needed, as the list will already be sorted)\n"
4594 "- start and stop set the slice to work in, i.e. text[start:stop]"
4595 )
4596 {
4597 PyObject *list;
4598 PyObject *text;
4599 Py_ssize_t text_len = INT_MAX;
4600 Py_ssize_t pos = 0;
4601
4602 Py_Get4Args("OO|ii:joinlist",text,list,pos,text_len);
4603
4604 return mxTextTools_Joinlist(text, list, pos, text_len);
4605
4606 onError:
4607 return NULL;
4608 }
4609
4610 Py_C_Function( mxTextTools_charsplit,
4611 "charsplit(text,char,start=0,stop=len(text))\n\n"
4612 "Split text[start:stop] into substrings at char and\n"
4613 "return the result as list of strings."
4614 )
4615 {
4616 PyObject *text, *separator;
4617 Py_ssize_t text_len = INT_MAX;
4618 Py_ssize_t start = 0;
4619
4620 Py_Get4Args("OO|ii:charsplit",
4621 text,separator,start,text_len);
4622
4623 return mxTextTools_CharSplit(text, separator,
4624 start, text_len);
4625
4626 onError:
4627 return NULL;
4628 }
4629
4630 Py_C_Function( mxTextTools_splitat,
4631 "splitat(text,char,nth=1,start=0,stop=len(text))\n\n"
4632 "Split text[start:stop] into two substrings at the nth\n"
4633 "occurance of char and return the result as 2-tuple. If the\n"
4634 "character is not found, the second string is empty. nth may\n"
4635 "be negative: the search is then done from the right and the\n"
4636 "first string is empty in case the character is not found."
4637 )
4638 {
4639 PyObject *text, *separator;
4640 Py_ssize_t text_len = INT_MAX;
4641 Py_ssize_t start = 0;
4642 Py_ssize_t nth = 1;
4643
4644 Py_Get5Args("OO|iii:splitat",
4645 text,separator,nth,start,text_len);
4646
4647 return mxTextTools_SplitAt(text, separator,
4648 nth, start, text_len);
4649 onError:
4650 return NULL;
4651 }
4652
4653 Py_C_Function( mxTextTools_suffix,
4654 "suffix(text,suffixes,start=0,stop=len(text)[,translate])\n\n"
4655 "Looks at text[start:stop] and returns the first matching\n"
4656 "suffix out of the tuple of strings given in suffixes.\n"
4657 "If no suffix is found to be matching, None is returned.\n"
4658 "The optional 256 char translate string is used to translate\n"
4659 "the text prior to comparing it with the given suffixes."
4660 )
4661 {
4662 PyObject *text, *suffixes, *translate = NULL;
4663 Py_ssize_t text_len = INT_MAX;
4664 Py_ssize_t start = 0;
4665
4666 Py_Get5Args("OO|iiO:suffix",
4667 text,suffixes,start,text_len,translate);
4668
4669 return mxTextTools_Suffix(text,
4670 suffixes,
4671 start, text_len,
4672 translate);
4673 onError:
4674 return NULL;
4675 }
4676
4677 Py_C_Function( mxTextTools_prefix,
4678 "prefix(text,prefixes,start=0,stop=len(text)[,translate])\n\n"
4679 "Looks at text[start:stop] and returns the first matching\n"
4680 "prefix out of the tuple of strings given in prefixes.\n"
4681 "If no prefix is found to be matching, None is returned.\n"
4682 "The optional 256 char translate string is used to translate\n"
4683 "the text prior to comparing it with the given suffixes."
4684 )
4685 {
4686 PyObject *text, *prefixes, *translate = NULL;
4687 Py_ssize_t text_len = INT_MAX;
4688 Py_ssize_t start = 0;
4689
4690 Py_Get5Args("OO|iiO:prefix",
4691 text,prefixes,start,text_len,translate);
4692
4693 return mxTextTools_Prefix(text,
4694 prefixes,
4695 start, text_len,
4696 translate);
4697 onError:
4698 return NULL;
4699 }
4700
4701 Py_C_Function( mxTextTools_set,
4702 "set(string,logic=1)\n\n"
4703 "Returns a character set for string: a bit encoded version\n"
4704 "of the characters occurring in string.\n"
4705 "- logic can be set to 0 if all characters *not* in string\n"
4706 " should go into the set")
4707 {
4708 PyObject *sto;
4709 char *s,*st;
4710 Py_ssize_t len_s;
4711 int logic = 1;
4712 Py_ssize_t i;
4713
4714 Py_Get3Args("s#|i:set",
4715 s,len_s,logic);
4716
4717 sto = PyString_FromStringAndSize(NULL,32);
4718 if (sto == NULL)
4719 goto onError;
4720
4721 st = PyString_AS_STRING(sto);
4722
4723 if (logic) {
4724 memset(st,0x00,32);
4725 for (i = 0; i < len_s; i++,s++) {
4726 int j = (unsigned char)*s;
4727
4728 st[j >> 3] |= 1 << (j & 7);
4729 }
4730 }
4731 else {
4732 memset(st,0xFF,32);
4733 for (i = 0; i < len_s; i++,s++) {
4734 int j = (unsigned char)*s;
4735
4736 st[j >> 3] &= ~(1 << (j & 7));
4737 }
4738 }
4739 return sto;
4740
4741 onError:
4742 return NULL;
4743 }
4744
4745 Py_C_Function( mxTextTools_setfind,
4746 "setfind(text,set,start=0,stop=len(text))\n\n"
4747 "Find the first occurence of any character from set in\n"
4748 "text[start:stop]\n set must be a string obtained with set()\n"
4749 "DEPRECATED: use CharSet().search() instead."
4750 )
4751 {
4752 PyObject *text;
4753 PyObject *set;
4754 Py_ssize_t text_len = INT_MAX;
4755 Py_ssize_t start = 0;
4756 register Py_ssize_t x;
4757 register char *tx;
4758 register unsigned char *setstr;
4759
4760 Py_Get4Args("OO|ii:setfind",text,set,start,text_len);
4761
4762 Py_Assert(PyString_Check(text),
4763 PyExc_TypeError,
4764 "first argument needs to be a string");
4765 Py_Assert(PyString_Check(set) && PyString_GET_SIZE(set) == 32,
4766 PyExc_TypeError,
4767 "second argument needs to be a set");
4768 Py_CheckStringSlice(text,start,text_len);
4769
4770 x = start;
4771 tx = PyString_AS_STRING(text) + x;
4772 setstr = (unsigned char *)PyString_AS_STRING(set);
4773
4774 for (;x < text_len; tx++, x++)
4775 if (Py_CharInSet(*tx,setstr))
4776 break;
4777
4778 if (x == text_len)
4779 /* Not found */
4780 return PyInt_FromLong(-1L);
4781 else
4782 return PyInt_FromLong(x);
4783
4784 onError:
4785 return NULL;
4786 }
4787
4788 Py_C_Function( mxTextTools_setstrip,
4789 "setstrip(text,set,start=0,stop=len(text),mode=0)\n\n"
4790 "Strip all characters in text[start:stop] appearing in set.\n"
4791 "mode indicates where to strip (<0: left; =0: left and right;\n"
4792 ">0: right). set must be a string obtained with set()\n"
4793 "DEPRECATED: use CharSet().strip() instead."
4794 )
4795 {
4796 char *tx;
4797 Py_ssize_t tx_len;
4798 char *setstr;
4799 Py_ssize_t setstr_len;
4800 Py_ssize_t start = 0;
4801 Py_ssize_t stop = INT_MAX;
4802 int mode = 0;
4803
4804 Py_Get7Args("s#s#|iii:setstip",
4805 tx,tx_len,setstr,setstr_len,start,stop,mode);
4806
4807 return mxTextTools_SetStrip(tx, tx_len,
4808 setstr, setstr_len,
4809 start, stop,
4810 mode);
4811
4812 onError:
4813 return NULL;
4814 }
4815
4816 Py_C_Function( mxTextTools_setsplit,
4817 "setsplit(text,set,start=0,stop=len(text))\n\n"
4818 "Split text[start:stop] into substrings using set,\n"
4819 "omitting the splitting parts and empty substrings.\n"
4820 "set must be a string obtained from set()\n"
4821 "DEPRECATED: use CharSet().split() instead."
4822 )
4823 {
4824 char *tx;
4825 Py_ssize_t tx_len;
4826 char *setstr;
4827 Py_ssize_t setstr_len;
4828 Py_ssize_t start = 0;
4829 Py_ssize_t stop = INT_MAX;
4830
4831 Py_Get6Args("s#s#|ii:setsplit",
4832 tx,tx_len,setstr,setstr_len,start,stop);
4833
4834 return mxTextTools_SetSplit(tx, tx_len,
4835 setstr, setstr_len,
4836 start, stop);
4837 onError:
4838 return NULL;
4839 }
4840
4841 Py_C_Function( mxTextTools_setsplitx,
4842 "setsplitx(text,set,start=0,stop=len(text))\n\n"
4843 "Split text[start:stop] into substrings using set, so\n"
4844 "that every second entry consists only of characters in set.\n"
4845 "set must be a string obtained with set()\n"
4846 "DEPRECATED: use CharSet().splitx() instead."
4847 )
4848 {
4849 Py_ssize_t text_len = INT_MAX;
4850 Py_ssize_t start = 0;
4851 char *tx;
4852 Py_ssize_t tx_len;
4853 char *setstr;
4854 Py_ssize_t setstr_len;
4855
4856 Py_Get6Args("s#s#|ii:setsplitx",
4857 tx,tx_len,setstr,setstr_len,start,text_len);
4858
4859 return mxTextTools_SetSplitX(tx, tx_len,
4860 setstr, setstr_len,
4861 start, text_len);
4862 onError:
4863 return NULL;
4864 }
4865
4866 Py_C_Function( mxTextTools_upper,
4867 "upper(text)\n\n"
4868 "Return text converted to upper case.")
4869 {
4870 PyObject *text;
4871
4872 Py_GetArgObject(text);
4873 if (PyString_Check(text))
4874 return mxTextTools_Upper(text);
4875 #ifdef HAVE_UNICODE
4876 else if (PyUnicode_Check(text))
4877 return mxTextTools_UnicodeUpper(text);
4878 #endif
4879 else
4880 Py_Error(PyExc_TypeError,
4881 "expected string or unicode");
4882
4883 onError:
4884 return NULL;
4885 }
4886
4887 Py_C_Function( mxTextTools_lower,
4888 "lower(text)\n\n"
4889 "Return text converted to lower case.")
4890 {
4891 PyObject *text;
4892
4893 Py_GetArgObject(text);
4894 if (PyString_Check(text))
4895 return mxTextTools_Lower(text);
4896 #ifdef HAVE_UNICODE
4897 else if (PyUnicode_Check(text))
4898 return mxTextTools_UnicodeLower(text);
4899 #endif
4900 else
4901 Py_Error(PyExc_TypeError,
4902 "expected string or unicode");
4903
4904 onError:
4905 return NULL;
4906 }
4907
4908 Py_C_Function( mxTextTools_str2hex,
4909 "str2hex(text)\n\n"
4910 "Return text converted to a string consisting of two byte\n"
4911 "HEX values.")
4912 {
4913 char *str;
4914 Py_ssize_t len;
4915
4916 Py_Get2Args("s#",str,len);
4917
4918 return mxTextTools_HexStringFromString(str,len);
4919
4920 onError:
4921 return NULL;
4922 }
4923
4924 Py_C_Function( mxTextTools_hex2str,
4925 "hex2str(text)\n\n"
4926 "Return text interpreted as two byte HEX values converted\n"
4927 "to a string.")
4928 {
4929 char *str;
4930 Py_ssize_t len;
4931
4932 Py_Get2Args("s#",str,len);
4933
4934 return mxTextTools_StringFromHexString(str,len);
4935
4936 onError:
4937 return NULL;
4938 }
4939
4940 Py_C_Function( mxTextTools_isascii,
4941 "isascii(text,start=0,stop=len(text))\n\n"
4942 "Return 1/0 depending on whether text only contains ASCII\n"
4943 "characters."
4944 )
4945 {
4946 PyObject *text;
4947 Py_ssize_t start=0, stop = INT_MAX;
4948 int rc;
4949
4950 Py_GetArgObject(text);
4951 rc = mxTextTools_IsASCII(text, start, stop);
4952 if (rc < 0)
4953 goto onError;
4954 return PyInt_FromLong(rc);
4955
4956 onError:
4957 return NULL;
4958 }
4959
4960 /* --- module init --------------------------------------------------------- */
4961
4962 /* Python Method Table */
4963
/* Table of the module-level functions exposed to Python.  It is handed
   to the module creation call (PyModule_Create via the module definition
   on Python 3, Py_InitModule4 on Python 2) and is terminated by a
   {NULL,NULL} sentinel entry. */
static PyMethodDef Module_methods[] =
{
    Py_MethodWithKeywordsListEntry("tag",mxTextTools_tag),
    Py_MethodListEntry("join",mxTextTools_join),
    Py_MethodListEntry("cmp",mxTextTools_cmp),
    Py_MethodListEntry("joinlist",mxTextTools_joinlist),
    Py_MethodListEntry("set",mxTextTools_set),
    Py_MethodListEntry("setfind",mxTextTools_setfind),
    Py_MethodListEntry("setsplit",mxTextTools_setsplit),
    Py_MethodListEntry("setsplitx",mxTextTools_setsplitx),
    Py_MethodListEntry("setstrip",mxTextTools_setstrip),
    Py_MethodWithKeywordsListEntry("TextSearch",mxTextSearch_TextSearch),
    Py_MethodListEntry("CharSet",mxCharSet_CharSet),
    Py_MethodListEntry("TagTable",mxTagTable_TagTable),
#ifdef HAVE_UNICODE
    /* Only available in builds with Unicode support */
    Py_MethodListEntry("UnicodeTagTable",mxTagTable_UnicodeTagTable),
#endif
    /* Disabled because we don't actually use these functions
       and they are using a hack that tries to avoid the overhead
       of the single-value tuple creation/unpacking:
         Py_MethodListEntrySingleArg("upper",mxTextTools_upper),
         Py_MethodListEntrySingleArg("lower",mxTextTools_lower), */
    Py_MethodListEntry("charsplit",mxTextTools_charsplit),
    Py_MethodListEntry("splitat",mxTextTools_splitat),
    Py_MethodListEntry("suffix",mxTextTools_suffix),
    Py_MethodListEntry("prefix",mxTextTools_prefix),
    Py_MethodListEntry("hex2str",mxTextTools_hex2str),
    Py_MethodListEntry("str2hex",mxTextTools_str2hex),
    /* Also left unregistered (single-argument entry, like upper/lower):
         Py_MethodListEntrySingleArg("isascii",mxTextTools_isascii), */
    {NULL,NULL} /* end of list */
};
4995
4996 /* Cleanup function */
4997 static
mxTextToolsModule_Cleanup(void)4998 void mxTextToolsModule_Cleanup(void)
4999 {
5000 mxTextTools_TagTables = NULL;
5001
5002 /* Reset mxTextTools_Initialized flag */
5003 mxTextTools_Initialized = 0;
5004 }
5005
#if PY_MAJOR_VERSION >= 3
/* Python 3 module definition: name, doc-string and method table of the
   extension module.  A size of -1 is used for the per-module state. */
static struct PyModuleDef mxTextTools_ModuleDef = {
    .m_base    = PyModuleDef_HEAD_INIT,
    .m_name    = MXTEXTTOOLS_MODULE,
    .m_doc     = Module_docstring,
    .m_size    = -1,
    .m_methods = Module_methods
};
#endif
5015
mxTextToolsModule_Initialize(void)5016 static PyObject* mxTextToolsModule_Initialize(void)
5017 {
5018 PyObject *module;
5019
5020 if (mxTextTools_Initialized) {
5021 PyErr_SetString(PyExc_SystemError,
5022 "can't initialize "MXTEXTTOOLS_MODULE" more than once");
5023 return NULL;
5024 }
5025
5026 /* Init type objects */
5027 if (PyType_Ready(&mxTextSearch_Type) < 0)
5028 return NULL;
5029 if (PyType_Ready(&mxCharSet_Type) < 0)
5030 return NULL;
5031 if (PyType_Ready(&mxTagTable_Type) < 0)
5032 return NULL;
5033
5034 /* create module */
5035 #if PY_MAJOR_VERSION >= 3
5036 module = PyModule_Create(&mxTextTools_ModuleDef);
5037 #else
5038 module = Py_InitModule4(MXTEXTTOOLS_MODULE, /* Module name */
5039 Module_methods, /* Method list */
5040 Module_docstring, /* Module doc-string */
5041 (PyObject *)NULL, /* always pass this as *self */
5042 PYTHON_API_VERSION); /* API Version */
5043 #endif
5044 if (!module)
5045 return NULL;
5046
5047 /* Init TagTable cache */
5048 mxTextTools_TagTables = PyDict_New();
5049 if (!mxTextTools_TagTables)
5050 return NULL;
5051
5052 /* Register cleanup function */
5053 if (Py_AtExit(mxTextToolsModule_Cleanup) < 0)
5054 return NULL;
5055
5056 /* Add some symbolic constants to the module */
5057 if (PyModule_AddStringConstant(module, "__version__", VERSION) < 0)
5058 return NULL;
5059 mx_ToUpper = mxTextTools_ToUpper();
5060 if (!mx_ToUpper)
5061 return NULL;
5062 if (PyModule_AddObject(module, "to_upper", mx_ToUpper) < 0)
5063 return NULL;
5064 mx_ToLower = mxTextTools_ToLower();
5065 if (!mx_ToLower)
5066 return NULL;
5067 if (PyModule_AddObject(module, "to_lower", mx_ToLower) < 0)
5068 return NULL;
5069
5070 /* Let the tag table cache live in the module dictionary; we just
5071 keep a weak reference in mxTextTools_TagTables around. */
5072 if (PyModule_AddObject(module, "tagtable_cache", mxTextTools_TagTables) < 0)
5073 return NULL;
5074 Py_DECREF(mxTextTools_TagTables);
5075
5076 ADD_INT_CONSTANT("BOYERMOORE", MXTEXTSEARCH_BOYERMOORE);
5077 ADD_INT_CONSTANT("FASTSEARCH", MXTEXTSEARCH_FASTSEARCH);
5078 ADD_INT_CONSTANT("TRIVIAL", MXTEXTSEARCH_TRIVIAL);
5079
5080 /* Init exceptions */
5081 mxTextTools_Error = PyErr_NewException("mxTextTools.Error", PyExc_Exception, NULL);
5082 if (!mxTextTools_Error)
5083 return NULL;
5084 if (PyModule_AddObject(module, "Error", mxTextTools_Error) < 0)
5085 return NULL;
5086
5087 /* Type objects */
5088 Py_INCREF(&mxTextSearch_Type);
5089 if (PyModule_AddObject(module, "TextSearchType", (PyObject*) &mxTextSearch_Type) < 0)
5090 return NULL;
5091 Py_INCREF(&mxCharSet_Type);
5092 if (PyModule_AddObject(module, "CharSetType", (PyObject*) &mxCharSet_Type) < 0)
5093 return NULL;
5094 Py_INCREF(&mxTagTable_Type);
5095 if (PyModule_AddObject(module, "TagTableType", (PyObject*) &mxTagTable_Type) < 0)
5096 return NULL;
5097
5098 /* Tag Table command symbols (these will be exposed via
5099 simpleparse.stt.TextTools.Constants.TagTables) */
5100 ADD_INT_CONSTANT("_const_AllIn", MATCH_ALLIN);
5101 ADD_INT_CONSTANT("_const_AllNotIn", MATCH_ALLNOTIN);
5102 ADD_INT_CONSTANT("_const_Is", MATCH_IS);
5103 ADD_INT_CONSTANT("_const_IsIn", MATCH_ISIN);
5104 ADD_INT_CONSTANT("_const_IsNot", MATCH_ISNOTIN);
5105 ADD_INT_CONSTANT("_const_IsNotIn", MATCH_ISNOTIN);
5106
5107 ADD_INT_CONSTANT("_const_Word", MATCH_WORD);
5108 ADD_INT_CONSTANT("_const_WordStart", MATCH_WORDSTART);
5109 ADD_INT_CONSTANT("_const_WordEnd", MATCH_WORDEND);
5110
5111 ADD_INT_CONSTANT("_const_AllInSet", MATCH_ALLINSET);
5112 ADD_INT_CONSTANT("_const_IsInSet", MATCH_ISINSET);
5113 ADD_INT_CONSTANT("_const_AllInCharSet", MATCH_ALLINCHARSET);
5114 ADD_INT_CONSTANT("_const_IsInCharSet", MATCH_ISINCHARSET);
5115
5116 ADD_INT_CONSTANT("_const_Fail", MATCH_FAIL);
5117 ADD_INT_CONSTANT("_const_Jump", MATCH_JUMP);
5118 ADD_INT_CONSTANT("_const_EOF", MATCH_EOF);
5119 ADD_INT_CONSTANT("_const_Skip", MATCH_SKIP);
5120 ADD_INT_CONSTANT("_const_Move", MATCH_MOVE);
5121
5122 ADD_INT_CONSTANT("_const_JumpTarget", MATCH_JUMPTARGET);
5123
5124 ADD_INT_CONSTANT("_const_sWordStart", MATCH_SWORDSTART);
5125 ADD_INT_CONSTANT("_const_sWordEnd", MATCH_SWORDEND);
5126 ADD_INT_CONSTANT("_const_sFindWord", MATCH_SFINDWORD);
5127 ADD_INT_CONSTANT("_const_NoWord", MATCH_NOWORD);
5128
5129 ADD_INT_CONSTANT("_const_Call", MATCH_CALL);
5130 ADD_INT_CONSTANT("_const_CallArg", MATCH_CALLARG);
5131
5132 ADD_INT_CONSTANT("_const_Table", MATCH_TABLE);
5133 ADD_INT_CONSTANT("_const_SubTable", MATCH_SUBTABLE);
5134 ADD_INT_CONSTANT("_const_TableInList", MATCH_TABLEINLIST);
5135 ADD_INT_CONSTANT("_const_SubTableInList", MATCH_SUBTABLEINLIST);
5136
5137 ADD_INT_CONSTANT("_const_Loop", MATCH_LOOP);
5138 ADD_INT_CONSTANT("_const_LoopControl", MATCH_LOOPCONTROL);
5139
5140 /* Tag Table command flags */
5141 ADD_INT_CONSTANT("_const_CallTag", MATCH_CALLTAG);
5142 ADD_INT_CONSTANT("_const_AppendToTagobj", MATCH_APPENDTAG);
5143 ADD_INT_CONSTANT("_const_AppendTagobj", MATCH_APPENDTAGOBJ);
5144 ADD_INT_CONSTANT("_const_AppendMatch", MATCH_APPENDMATCH);
5145 ADD_INT_CONSTANT("_const_LookAhead", MATCH_LOOKAHEAD);
5146
5147 /* Tag Table argument integers */
5148 ADD_INT_CONSTANT("_const_To", MATCH_JUMP_TO);
5149 ADD_INT_CONSTANT("_const_MatchOk", MATCH_JUMP_MATCHOK);
5150 ADD_INT_CONSTANT("_const_MatchFail", MATCH_JUMP_MATCHFAIL);
5151 ADD_INT_CONSTANT("_const_ToEOF", MATCH_MOVE_EOF);
5152 ADD_INT_CONSTANT("_const_ToBOF", MATCH_MOVE_BOF);
5153 ADD_INT_CONSTANT("_const_Here", MATCH_FAIL_HERE);
5154
5155 ADD_INT_CONSTANT("_const_ThisTable", MATCH_THISTABLE);
5156
5157 ADD_INT_CONSTANT("_const_Break", MATCH_LOOPCONTROL_BREAK);
5158 ADD_INT_CONSTANT("_const_Reset", MATCH_LOOPCONTROL_RESET);
5159
5160 DPRINTF("sizeof(string_charset)=%i bytes\n", sizeof(string_charset));
5161 #ifdef HAVE_UNICODE
5162 DPRINTF("sizeof(unicode_charset)=%i bytes\n", sizeof(unicode_charset));
5163 #endif
5164
5165 /* We are now initialized */
5166 mxTextTools_Initialized = 1;
5167
5168 return module;
5169 }
5170
5171 #if PY_MAJOR_VERSION >= 3
PyInit_mxTextTools(void)5172 PyMODINIT_FUNC PyInit_mxTextTools(void)
5173 {
5174 return mxTextToolsModule_Initialize();
5175 }
5176 #else
initmxTextTools(void)5177 MX_EXPORT(void) initmxTextTools(void)
5178 {
5179 mxTextToolsModule_Initialize();
5180 }
5181 #endif
5182
5183