1 /*
2   mxTextTools -- Fast text manipulation routines
3 
4   Copyright (c) 2000, Marc-Andre Lemburg; mailto:mal@lemburg.com
5   Copyright (c) 2000-2002, eGenix.com Software GmbH; mailto:info@egenix.com
6 */
7 
8 /* We want all our symbols to be exported */
9 #ifndef MX_BUILDING_MXTEXTTOOLS
10 #define MX_BUILDING_MXTEXTTOOLS
11 #endif
12 
13 /* Logging file used by debugging facility */
14 #ifndef MAL_DEBUG_OUTPUTFILE
15 # define MAL_DEBUG_OUTPUTFILE "mxTextTools.log"
16 #endif
17 
18 #include "mx.h"
19 #include "mxTextTools.h"
20 #include "structmember.h"
21 #include <ctype.h>
22 
23 #define VERSION "2.1.0"
24 
25 /* Initial list size used by e.g. setsplit(), setsplitx(),... */
26 #define INITIAL_LIST_SIZE 64
27 
/* Maximum TagTable cache size. If this limit is reached, the cache
   is cleared to make room for newly compiled TagTables. */
30 #define MAX_TAGTABLES_CACHE_SIZE 100
31 
32 /* Define this to enable the copy-protocol (__copy__, __deepcopy__) */
33 #define COPY_PROTOCOL
34 
/* Convenience macro for reducing clutter.

   Adds the integer constant value under name to the object held in the
   local variable `module` and makes the enclosing function return NULL
   when PyModule_AddIntConstant() reports failure.

   Caveats: the expansion is a bare if statement that already ends in a
   semicolon, so it must not be used as the unbraced body of an if/else,
   and it can only be used in functions that return a pointer. */
#define ADD_INT_CONSTANT(name, value) \
    if (PyModule_AddIntConstant(module, name, value) < 0) \
        return NULL;
39 
40 /* --- module doc-string -------------------------------------------------- */
41 
/* Docstring of the extension module. MXTEXTTOOLS_MODULE and VERSION are
   pasted in through adjacent string literal concatenation, so both have
   to expand to string literals. */
PyDoc_STRVAR(Module_docstring,

 MXTEXTTOOLS_MODULE" -- Tools for fast text processing. Version "VERSION"\n\n"

 "Copyright (c) 1997-2000, Marc-Andre Lemburg; mailto:mal@lemburg.com\n"
 "Copyright (c) 2000-2002, eGenix.com Software GmbH; mailto:info@egenix.com\n\n"
 "Copyright (c) 2003-2006, Mike Fletcher; mailto:mcfletch@vrplumber.com\n\n"

 "                 All Rights Reserved\n\n"
 "See the documentation for further information on copyrights,\n"
 "or contact the author.")
;
54 
55 /* --- internal macros ---------------------------------------------------- */
56 
57 /* --- module globals ----------------------------------------------------- */
58 
/* Translation strings for the 8-bit versions of lower() and upper().
   NOTE(review): presumably filled from mxTextTools_ToUpper() and
   mxTextTools_ToLower() during module initialization -- the assignment
   is not visible in this part of the file; confirm. */
static PyObject *mx_ToUpper;
static PyObject *mx_ToLower;

static PyObject *mxTextTools_Error;	/* mxTextTools specific error;
					   raised e.g. for internal errors
					   in the TextSearch C API */

static PyObject *mxTextTools_TagTables;	/* TagTable cache dictionary */

/* Flag telling us whether the module was initialized or not
   (0 = not initialized). */
static int mxTextTools_Initialized = 0;
69 
70 /* --- forward declarations ----------------------------------------------- */
71 
72 /* --- module helper ------------------------------------------------------ */
73 
74 static
mxTextTools_ToUpper(void)75 PyObject *mxTextTools_ToUpper(void)
76 {
77     char tr[256];
78     Py_ssize_t i;
79 
80     for (i = 0; i < 256; i++)
81 	tr[i] = toupper((char)i);
82     return PyString_FromStringAndSize(tr,sizeof(tr));
83 }
84 
85 static
mxTextTools_ToLower(void)86 PyObject *mxTextTools_ToLower(void)
87 {
88     char tr[256];
89     Py_ssize_t i;
90 
91     for (i = 0; i < 256; i++)
92 	tr[i] = tolower((char)i);
93     return PyString_FromStringAndSize(tr,sizeof(tr));
94 }
95 
96 /* Create an exception object, insert it into the module dictionary
97    under the given name and return the object pointer; this is NULL in
98    case an error occurred. base can be given to indicate the base
99    object to be used by the exception object. It should be NULL
100    otherwise */
101 
102 /* --- module interface --------------------------------------------------- */
103 
104 /* --- Text Search Object ----------------------------------------------*/
105 
106 /* allocation */
107 
108 static
mxTextSearch_New(PyObject * match,PyObject * translate,int algorithm)109 PyObject *mxTextSearch_New(PyObject *match,
110 			   PyObject *translate,
111 			   int algorithm)
112 {
113     mxTextSearchObject *so;
114 
115     so = PyObject_NEW(mxTextSearchObject, &mxTextSearch_Type);
116     if (so == NULL)
117 	return NULL;
118     so->data = NULL;
119     so->translate = NULL;
120     so->match = NULL;
121 
122     Py_INCREF(match);
123     so->match = match;
124 
125     if (translate == Py_None)
126 	translate = NULL;
127     else if (translate) {
128 	Py_Assert(PyString_Check(translate),
129 		  PyExc_TypeError,
130 		  "translate table must be a string");
131 	Py_Assert(PyString_GET_SIZE(translate) == 256,
132 		  PyExc_TypeError,
133 		  "translate string must have exactly 256 chars");
134 	Py_INCREF(translate);
135     }
136     so->translate = translate;
137 
138     /* Init algorithm */
139     so->algorithm = algorithm;
140     switch (algorithm) {
141 
142     case MXTEXTSEARCH_BOYERMOORE:
143 	Py_Assert(PyString_Check(match),
144 		  PyExc_TypeError,
145 		  "match must be a string for Boyer-Moore");
146 	so->data = bm_init(PyString_AS_STRING(match),
147 			   PyString_GET_SIZE(match));
148 	Py_Assert(so->data != NULL,
149 		  PyExc_TypeError,
150 		  "error initializing the search object");
151 	break;
152 
153     case MXTEXTSEARCH_TRIVIAL:
154 	Py_Assert(PyString_Check(match) || PyUnicode_Check(match),
155 		  PyExc_TypeError,
156 		  "match must be a string or unicode");
157 	Py_Assert(so->translate == NULL,
158 		  PyExc_TypeError,
159 		  "trivial search algorithm does not support translate");
160 	break;
161 
162     default:
163 	Py_Error(PyExc_ValueError,
164 		 "unknown or unsupported algorithm");
165 
166     }
167     return (PyObject *)so;
168 
169  onError:
170     Py_DECREF(so);
171     return NULL;
172 }
173 
174 Py_C_Function_WithKeywords(
175                 mxTextSearch_TextSearch,
176 	       "TextSearch(match[,translate=None,algorithm=default_algorithm])\n\n"
177 	       "Create a substring search object for the string match;\n"
178 	       "translate is an optional translate-string like the one used\n"
179 	       "in the module re."
180 		)
181 {
182     PyObject *match = 0;
183     PyObject *translate = 0;
184     int algorithm = -424242;
185 
186     Py_KeywordsGet3Args("O|Oi:TextSearch",match,translate,algorithm);
187 
188     if (algorithm == -424242) {
189 	if (PyUnicode_Check(match))
190 	    algorithm = MXTEXTSEARCH_TRIVIAL;
191 	else
192 	    algorithm = MXTEXTSEARCH_BOYERMOORE;
193     }
194     return mxTextSearch_New(match, translate, algorithm);
195 
196  onError:
197     return NULL;
198 }
199 
200 static
mxTextSearch_Free(mxTextSearchObject * so)201 void mxTextSearch_Free(mxTextSearchObject *so)
202 {
203     if (so->data) {
204 	switch  (so->algorithm) {
205 
206 	case MXTEXTSEARCH_BOYERMOORE:
207 	    bm_free(so->data);
208 	    break;
209 
210 	case MXTEXTSEARCH_TRIVIAL:
211 	    break;
212 
213 	}
214     }
215     Py_XDECREF(so->match);
216     Py_XDECREF(so->translate);
217     PyObject_Del(so);
218 }
219 
220 /* C APIs */
221 
222 #define so ((mxTextSearchObject *)self)
223 
224 /* Get the match length from an TextSearch object or -1 in case of an
225    error. */
226 
mxTextSearch_MatchLength(PyObject * self)227 Py_ssize_t mxTextSearch_MatchLength(PyObject *self)
228 {
229     Py_Assert(mxTextSearch_Check(self),
230 	      PyExc_TypeError,
231 	      "expected a TextSearch object");
232 
233     switch  (so->algorithm) {
234 
235     case MXTEXTSEARCH_BOYERMOORE:
236 	return BM_MATCH_LEN(so->data);
237 	break;
238 
239     case MXTEXTSEARCH_TRIVIAL:
240 	if (PyString_Check(so->match))
241 	    return PyString_GET_SIZE(so->match);
242 #ifdef HAVE_UNICODE
243 	else if (PyUnicode_Check(so->match))
244 	    return PyUnicode_GET_SIZE(so->match);
245 #endif
246 	break;
247 
248     }
249 
250     Py_Error(mxTextTools_Error,
251 	     "internal error");
252 
253  onError:
254     return -1;
255 }
256 
/* Brute-force search for match[0:match_len] in text[start:stop].

   Returns the index just behind the first occurrence, or start if
   there is no occurrence. An empty match is reported as "not found",
   i.e. start is returned. */
static
Py_ssize_t trivial_search(const char *text,
                          Py_ssize_t start,
                          Py_ssize_t stop,
                          const char *match,
                          Py_ssize_t match_len)
{
    const Py_ssize_t last = match_len - 1;
    Py_ssize_t pos;

    if (last < 0)
        return start;

    /* Try every alignment whose last character still lies before stop */
    for (pos = start; pos + last < stop; pos++) {
        Py_ssize_t k = last;

        /* Compare the candidate from right to left */
        while (k >= 0 && text[pos + k] == match[k])
            k--;

        if (k < 0)
            /* Every character matched */
            return pos + match_len;
    }

    /* Ran into stop: no match */
    return start;
}
295 
296 #ifdef HAVE_UNICODE
/* Brute-force search for match[0:match_len] in text[start:stop],
   Unicode version.

   Returns the index just behind the first occurrence, or start if
   there is no occurrence. An empty match is reported as "not found",
   i.e. start is returned. */
static
Py_ssize_t trivial_unicode_search(const Py_UNICODE *text,
                                  Py_ssize_t start,
                                  Py_ssize_t stop,
                                  const Py_UNICODE *match,
                                  Py_ssize_t match_len)
{
    const Py_ssize_t last = match_len - 1;
    Py_ssize_t pos;

    if (last < 0)
        return start;

    /* Try every alignment whose last character still lies before stop */
    for (pos = start; pos + last < stop; pos++) {
        Py_ssize_t k = last;

        /* Compare the candidate from right to left */
        while (k >= 0 && text[pos + k] == match[k])
            k--;

        if (k < 0)
            /* Every character matched */
            return pos + match_len;
    }

    /* Ran into stop: no match */
    return start;
}
335 #endif
336 
337 /* Search for the match in text[start:stop].
338 
339    Returns 1 in case a match was found and sets sliceleft, sliceright
340    to the matching slice.
341 
342    Returns 0 in case no match was found and -1 in case of an error.
343 
344 */
345 
mxTextSearch_SearchBuffer(PyObject * self,char * text,Py_ssize_t start,Py_ssize_t stop,Py_ssize_t * sliceleft,Py_ssize_t * sliceright)346 Py_ssize_t mxTextSearch_SearchBuffer(PyObject *self,
347 			      char *text,
348 			      Py_ssize_t start,
349 			      Py_ssize_t stop,
350 			      Py_ssize_t *sliceleft,
351 			      Py_ssize_t *sliceright)
352 {
353     Py_ssize_t nextpos;
354     Py_ssize_t match_len;
355 
356     Py_Assert(mxTextSearch_Check(self),
357 	      PyExc_TypeError,
358 	      "expected a TextSearch object");
359 
360     switch  (so->algorithm) {
361 
362     case MXTEXTSEARCH_BOYERMOORE:
363 	if (so->translate) {
364 	    /* search with translate table */
365 	    nextpos = bm_tr_search((mxbmse_data *)so->data,
366 				   text,
367 				   start,
368 				   stop,
369 				   PyString_AS_STRING(so->translate));
370 	}
371 	else {
372 	    /* exact search */
373 	    nextpos = bm_search((mxbmse_data *)so->data,
374 				text,
375 				start,
376 				stop);
377 	}
378 	match_len = BM_MATCH_LEN(so->data);
379 	break;
380 
381     case MXTEXTSEARCH_TRIVIAL:
382 	{
383 	    const char *match;
384 
385 	    if (PyString_Check(so->match)) {
386 		match = PyString_AS_STRING(so->match);
387 		match_len = PyString_GET_SIZE(so->match);
388 	    }
389 	    else if (PyObject_AsCharBuffer(so->match, &match, &match_len))
390 		goto onError;
391 	    nextpos = trivial_search(text,
392 				     start,
393 				     stop,
394 				     match,
395 				     match_len);
396 	}
397 	break;
398 
399     default:
400 	Py_Error(mxTextTools_Error,
401 		 "unknown algorithm type in mxTextSearch_SearchBuffer");
402 
403     }
404     /* Found ? */
405     if (nextpos != start) {
406 	if (sliceleft)
407 	    *sliceleft = nextpos - match_len;
408 	if (sliceright)
409 	    *sliceright = nextpos;
410 	return 1;
411     }
412     /* Not found */
413     return 0;
414 
415  onError:
416     return -1;
417 }
418 
419 #ifdef HAVE_UNICODE
/* Unicode counterpart of mxTextSearch_SearchBuffer(): search for the
   match of the TextSearch object self in text[start:stop].

   Returns 1 and sets *sliceleft / *sliceright (each only if non-NULL)
   when a match was found, 0 when no match was found and -1 in case of
   an error. Only the trivial algorithm is supported; a non-Unicode
   match object is converted with PyUnicode_FromEncodedObject() for the
   duration of the search. */
Py_ssize_t mxTextSearch_SearchUnicode(PyObject *self,
                                      Py_UNICODE *text,
                                      Py_ssize_t start,
                                      Py_ssize_t stop,
                                      Py_ssize_t *sliceleft,
                                      Py_ssize_t *sliceright)
{
    Py_ssize_t found_end;	/* index just behind the match, or start */
    Py_ssize_t pattern_len;

    Py_Assert(mxTextSearch_Check(self),
              PyExc_TypeError,
              "expected a TextSearch object");

    if (so->algorithm == MXTEXTSEARCH_BOYERMOORE) {
        Py_Error(PyExc_TypeError,
                 "Boyer-Moore search algorithm does not support Unicode");
    }
    else if (so->algorithm == MXTEXTSEARCH_TRIVIAL) {
        PyObject *converted = NULL;	/* temporary Unicode copy, if any */
        PyObject *source = so->match;
        Py_UNICODE *pattern;

        if (!PyUnicode_Check(source)) {
            converted = PyUnicode_FromEncodedObject(source, NULL, NULL);
            if (converted == NULL)
                goto onError;
            source = converted;
        }
        pattern = PyUnicode_AS_UNICODE(source);
        pattern_len = PyUnicode_GET_SIZE(source);

        found_end = trivial_unicode_search(text,
                                           start,
                                           stop,
                                           pattern,
                                           pattern_len);
        Py_XDECREF(converted);
    }
    else {
        Py_Error(mxTextTools_Error,
                 "unknown algorithm type in mxTextSearch_SearchUnicode");
    }

    /* The search result equals start when nothing was found */
    if (found_end == start)
        return 0;

    if (sliceleft)
        *sliceleft = found_end - pattern_len;
    if (sliceright)
        *sliceright = found_end;
    return 1;

 onError:
    return -1;
}
486 #endif
487 
488 /* methods */
489 
490 Py_C_Function( mxTextSearch_search,
491 	       "TextSearch.search(text,start=0,stop=len(text))\n\n"
492 	       "Search for the substring in text, looking only at the\n"
493 	       "slice [start:stop] and return the slice (l,r)\n"
494 	       "where the substring was found, (start,start) otherwise.")
495 {
496     PyObject *text;
497     Py_ssize_t start = 0;
498     Py_ssize_t stop = INT_MAX;
499     Py_ssize_t sliceleft, sliceright;
500     int rc;
501 
502     Py_Get3Args("O|ii:TextSearch.search",
503 		text,start,stop);
504 
505     if (PyString_Check(text)) {
506 	Py_CheckStringSlice(text, start, stop);
507 	rc = mxTextSearch_SearchBuffer(self,
508 				       PyString_AS_STRING(text),
509 				       start,
510 				       stop,
511 				       &sliceleft,
512 				       &sliceright);
513     }
514 #ifdef HAVE_UNICODE
515     else if (PyUnicode_Check(text)) {
516 	Py_CheckUnicodeSlice(text, start, stop);
517 	rc = mxTextSearch_SearchUnicode(self,
518 					PyUnicode_AS_UNICODE(text),
519 					start,
520 					stop,
521 					&sliceleft,
522 					&sliceright);
523     }
524 #endif
525     else
526 	Py_Error(PyExc_TypeError,
527 		 "expected string or unicode");
528     if (rc < 0)
529 	goto onError;
530     if (rc == 0) {
531 	sliceleft = start;
532 	sliceright = start;
533     }
534 
535     /* Return the slice */
536     Py_Return2("ii", sliceleft, sliceright);
537 
538  onError:
539     return NULL;
540 }
541 
542 Py_C_Function( mxTextSearch_find,
543 	       "TextSearch.find(text,start=0,stop=len(text))\n\n"
544 	       "Search for the substring in text, looking only at the\n"
545 	       "slice [start:stop] and return the index\n"
546 	       "where the substring was found, -1 otherwise.")
547 {
548     PyObject *text;
549     Py_ssize_t start = 0;
550     Py_ssize_t stop = INT_MAX;
551     Py_ssize_t sliceleft, sliceright;
552     int rc;
553 
554     Py_Get3Args("O|ii:TextSearch.find",
555 		text,start,stop);
556 
557     if (PyString_Check(text)) {
558 	Py_CheckStringSlice(text, start, stop);
559 	rc = mxTextSearch_SearchBuffer(self,
560 				       PyString_AS_STRING(text),
561 				       start,
562 				       stop,
563 				       &sliceleft,
564 				       &sliceright);
565     }
566 #ifdef HAVE_UNICODE
567     else if (PyUnicode_Check(text)) {
568 	Py_CheckUnicodeSlice(text, start, stop);
569 	rc = mxTextSearch_SearchUnicode(self,
570 					PyUnicode_AS_UNICODE(text),
571 					start,
572 					stop,
573 					&sliceleft,
574 					&sliceright);
575     }
576 #endif
577     else
578 	Py_Error(PyExc_TypeError,
579 		 "expected string or unicode");
580     if (rc < 0)
581 	goto onError;
582     if (rc == 0)
583 	sliceleft = -1;
584     return PyInt_FromLong(sliceleft);
585 
586  onError:
587     return NULL;
588 }
589 
590 Py_C_Function( mxTextSearch_findall,
591 	       "TextSearch.findall(text,start=0,stop=len(text))\n\n"
592 	       "Search for the substring in text, looking only at the\n"
593 	       "slice [start:stop] and return a list of all\n"
594 	       "non overlapping slices (l,r) in text where the match\n"
595 	       "string can be found.")
596 {
597     PyObject *text;
598     PyObject *list = 0;
599     Py_ssize_t start = 0;
600     Py_ssize_t stop = INT_MAX;
601     Py_ssize_t stop_index;
602     Py_ssize_t match_len;
603     Py_ssize_t listsize = INITIAL_LIST_SIZE;
604     Py_ssize_t listitem = 0;
605 
606     Py_Get3Args("O|ii:TextSearch.findall",
607 		text,start,stop);
608 
609     if (PyString_Check(text)) {
610 	Py_CheckStringSlice(text, start, stop);
611     }
612 #ifdef HAVE_UNICODE
613     else if (PyUnicode_Check(text)) {
614 	Py_CheckUnicodeSlice(text, start, stop);
615     }
616 #endif
617     else
618 	Py_Error(PyExc_TypeError,
619 		 "expected string or unicode");
620 
621     list = PyList_New(listsize);
622     if (!list)
623 	goto onError;
624 
625     match_len = mxTextSearch_MatchLength(self);
626     if (match_len < 0)
627 	goto onError;
628     stop_index = stop - match_len;
629 
630     while (start <= stop_index) {
631 	register PyObject *t,*v;
632 	int rc;
633 	Py_ssize_t sliceleft, sliceright;
634 
635 	/* exact search */
636 	if (PyString_Check(text))
637 	    rc = mxTextSearch_SearchBuffer(self,
638 					   PyString_AS_STRING(text),
639 					   start,
640 					   stop,
641 					   &sliceleft,
642 					   &sliceright);
643 #ifdef HAVE_UNICODE
644 	else if (PyUnicode_Check(text))
645 	    rc = mxTextSearch_SearchUnicode(self,
646 					    PyUnicode_AS_UNICODE(text),
647 					    start,
648 					    stop,
649 					    &sliceleft,
650 					    &sliceright);
651 #endif
652 	else
653 	    break;
654 	if (rc < 0)
655 	    goto onError;
656 	if (rc == 0)
657 	    break;
658 
659 	/* Build slice and append to list */
660 	t = PyTuple_New(2);
661 	if (!t)
662 	    goto onError;
663 	v = PyInt_FromLong(sliceleft);
664 	if (!v)
665 	    goto onError;
666 	PyTuple_SET_ITEM(t,0,v);
667 	v = PyInt_FromLong(sliceright);
668 	if (!v)
669 	    goto onError;
670 	PyTuple_SET_ITEM(t,1,v);
671 
672 	if (listitem < listsize)
673 	    PyList_SET_ITEM(list, listitem, t);
674 	else {
675 	    PyList_Append(list, t);
676 	    Py_DECREF(t);
677 	}
678 	listitem++;
679 
680 	start = sliceright;
681     }
682 
683     /* Resize list if necessary */
684     if (listitem < listsize)
685 	PyList_SetSlice(list, listitem, listsize, (PyObject*)NULL);
686 
687     return list;
688 
689  onError:
690     Py_XDECREF(list);
691     return NULL;
692 }
693 
694 #ifdef COPY_PROTOCOL
695 Py_C_Function( mxTextSearch_copy,
696 	       "copy([memo])\n\n"
697 	       "Return a new reference for the instance. This function\n"
698 	       "is used for the copy-protocol. Real copying doesn't take\n"
699 	       "place, since the instances are immutable.")
700 {
701     PyObject *memo;
702 
703     Py_GetArg("|O",memo);
704     Py_INCREF(so);
705     return (PyObject *)so;
706  onError:
707     return NULL;
708 }
709 #endif
710 
711 #undef so
712 
713 /* --- slots --- */
714 
715 static
mxTextSearch_Repr(mxTextSearchObject * self)716 PyObject *mxTextSearch_Repr(mxTextSearchObject *self)
717 {
718     char *algoname;
719     PyObject *v;
720     char t[500], *reprstr;
721 
722     v = PyObject_Repr(self->match);
723     if (v == NULL)
724 	return NULL;
725     reprstr = PyString_AsString(v);
726     if (reprstr == NULL)
727 	return NULL;
728 
729     switch (self->algorithm) {
730     case MXTEXTSEARCH_BOYERMOORE:
731 	algoname = "Boyer-Moore";
732 	break;
733     case MXTEXTSEARCH_TRIVIAL:
734 	algoname = "Trivial";
735 	break;
736     default:
737 	algoname = "";
738     }
739 
740     sprintf(t, "<%.50s TextSearch object for %.400s at 0x%lx>",
741 	    algoname, reprstr, (long)self);
742     Py_DECREF(v);
743     return PyString_FromString(t);
744 }
745 
/* Python Method Table

   Methods available on TextSearch objects. With COPY_PROTOCOL defined,
   __copy__ and __deepcopy__ are both served by mxTextSearch_copy. */

static
PyMethodDef mxTextSearch_Methods[] =
{
    Py_MethodListEntry("search",mxTextSearch_search),
    Py_MethodListEntry("find",mxTextSearch_find),
    Py_MethodListEntry("findall",mxTextSearch_findall),
#ifdef COPY_PROTOCOL
    Py_MethodListEntry("__deepcopy__",mxTextSearch_copy),
    Py_MethodListEntry("__copy__",mxTextSearch_copy),
#endif
    {NULL,NULL} /* end of list */
};
760 
/* Read-only attributes of TextSearch objects, mapped directly onto the
   fields of mxTextSearchObject.

   translate is declared T_OBJECT rather than T_OBJECT_EX: the field is
   NULL when no translate table is in use, and per the Python
   structmember documentation T_OBJECT reports a NULL field as None
   whereas T_OBJECT_EX raises AttributeError. */
static PyMemberDef mxTextSearch_members[] = {
    {"match",T_OBJECT_EX,offsetof(mxTextSearchObject,match),READONLY,"Text that this search matches"},
    {"translate",T_OBJECT,offsetof(mxTextSearchObject,translate),READONLY,"Translated search term"},
    {"algorithm",T_INT,offsetof(mxTextSearchObject,algorithm),READONLY,"Algorithm in use by the text search"},
    {NULL}	/* end of list */
};
767 
/* Python Type Table

   Type object of TextSearch objects. The initializer is positional, so
   the slot names in the comments follow the field order of the
   Python 2 PyTypeObject. Slots behind tp_members are left out and are
   therefore zero-initialized. */

PyTypeObject mxTextSearch_Type = {
    PyVarObject_HEAD_INIT(NULL, 0)      /* init at startup ! */
    "TextSearch",                       /*tp_name*/
    sizeof(mxTextSearchObject),         /*tp_basicsize*/
    0,                                  /*tp_itemsize*/
    /* methods */
    (destructor)mxTextSearch_Free,      /*tp_dealloc*/
    (printfunc)0,                       /*tp_print*/
    (getattrfunc)0,                     /*tp_getattr*/
    (setattrfunc)0,                     /*tp_setattr*/
    0,                                  /*tp_compare*/
    (reprfunc)mxTextSearch_Repr,        /*tp_repr*/
    0,                                  /*tp_as_number*/
    0,                                  /*tp_as_sequence*/
    0,                                  /*tp_as_mapping*/
    (hashfunc)0,                        /*tp_hash*/
    (ternaryfunc)0,                     /*tp_call*/
    (reprfunc)0,                        /*tp_str*/
    (getattrofunc)0,                    /*tp_getattro*/
    (setattrofunc)0,                    /*tp_setattro*/
    0,                                  /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,                 /*tp_flags*/
    "mxTextTools text-search object",   /*tp_doc*/
    0,                                  /*tp_traverse*/
    0,                                  /*tp_clear*/
    0,                                  /*tp_richcompare*/
    0,                                  /*tp_weaklistoffset*/
    0,                                  /*tp_iter*/
    0,                                  /*tp_iternext*/
    mxTextSearch_Methods,               /*tp_methods*/
    mxTextSearch_members,               /*tp_members*/
};
802 
803 /* --- Character Set Object --------------------------------------------*/
804 
805 /* internal */
806 
807 /* 8-bit character sets are implemented using a simple 32-byte
808    long bitmap with one bit per character.
809 
810    Addressing is done as follows:
811 
812       def char_is_set(ordinal):
813           return bitmap[ordinal >> 3]  & (1 << (ordinal & 7))
814 
815 */
816 
/* Number of characters covered by an 8-bit character set and the size
   in bytes of the bitmap holding one bit for each of them (256 / 8 =
   32 bytes). */
#define STRING_CHARSET_SIZE 		256
#define STRING_CHARSET_BITMAP_SIZE 	(STRING_CHARSET_SIZE / 8)

/* Lookup data of an 8-bit character set */
typedef struct {
    unsigned char bitmap[STRING_CHARSET_BITMAP_SIZE];
    						/* character bitmap; bit
						   (ordinal & 7) of byte
						   (ordinal >> 3) is set iff
						   the character is in the
						   set */
} string_charset;
824 
825 static
init_string_charset(mxCharSetObject * cs,PyObject * definition)826 int init_string_charset(mxCharSetObject *cs,
827 			PyObject *definition)
828 {
829     register Py_ssize_t i, j;
830     char *def = PyString_AS_STRING(definition);
831     const Py_ssize_t len = PyString_GET_SIZE(definition);
832     string_charset *lookup = 0;
833     register unsigned char *bitmap;
834     int logic = 1;
835 
836     /* Handle logic change (first char is '^' for negative matching) */
837     if (len > 0 && def[0] == '^') {
838 	logic = 0;
839 	i = 1;
840     }
841     else
842 	i = 0;
843 
844     /* Build 32-byte lookup bitmap (one bit per character) */
845     lookup = (string_charset *)PyMem_Malloc(sizeof(string_charset));
846     if (lookup == NULL) {
847 	PyErr_NoMemory();
848 	goto onError;
849     }
850     memset(lookup, 0, sizeof(string_charset));
851     cs->mode = MXCHARSET_8BITMODE;
852     cs->lookup = (void *)lookup;
853     bitmap = lookup->bitmap;
854 
855     for (; i < len; i++) {
856 
857 	/* Handle escapes: "b\-d", "\\" */
858 	if (def[i] == '\\') {
859 	    if (i < len - 1 && def[i+1] == '\\') {
860 		j = (unsigned char)'\\';
861 		bitmap[j >> 3] |= 1 << (j & 7);
862 		i++;
863 	    }
864 	    continue;
865 	}
866 
867 	/* Handle ranges: "b-d", "\\-z", "\--z" */
868 	if (i < len - 2 && def[i+1] == '-') {
869 	    unsigned char range_left = def[i];
870 	    unsigned char range_right = def[i+2];
871 	    for (j = range_left; j <= range_right; j++)
872 		bitmap[j >> 3] |= 1 << (j & 7);
873 	    i++;
874 	    continue;
875 	}
876 
877 	/* Normal processing */
878 	j = (unsigned char)def[i];
879 	bitmap[j >> 3] |= 1 << (j & 7);
880     }
881 
882     /* Invert bitmap if negative matching is requested */
883     if (!logic) {
884 	DPRINTF("init_string_charset: inverting bitmap\n");
885 	for (i = 0; i < STRING_CHARSET_BITMAP_SIZE; i++)
886 	    bitmap[i] ^= 0xFF;
887     }
888 
889     return 0;
890 
891  onError:
892     if (lookup)
893 	PyMem_Free((void *)lookup);
894     cs->lookup = 0;
895     return -1;
896 }
897 
898 #ifdef HAVE_UNICODE
899 
900 /* Unicode character sets are implemented using two step indexing
901    which is a good compromise between lookup speed and memory usage.
902 
903    Lookup is done using a variable length array of 32-byte bitmap
904    blocks. There can be 256 such blocks. Identical blocks are
905    collapsed into a single copy.
906 
907    Addressing is done as follows:
908 
909       def char_is_set(ordinal):
910           index = bitmapindex[ordinal >> 8]
911 	  bitmap = bitmaps[index]
912           return bitmap[(ordinal >> 3) & 31]  & (1 << (ordinal & 7))
913 
914    The technique used here is very similar to what is done in Python's
915    SRE (see the BIGCHARSET patch by Martin von Loewis). Compression
916    should be reasonably good since character sets in practice usually
917    only contains a few single characters or longer ranges of Unicode
918    characters.
919 
920 */
921 
/* UNICODE_CHARSET_SIZE        -- number of ordinals covered (0..65535)
   UNICODE_CHARSET_BITMAP_SIZE -- bytes per bitmap block; one block
                                  covers 32 * 8 = 256 ordinals
   UNICODE_CHARSET_BITMAPS     -- number of blocks needed to cover all
                                  ordinals (256)
   UNICODE_CHARSET_BIGMAP_SIZE -- bytes of a flat bitmap with one bit
                                  per ordinal (8192) */
#define UNICODE_CHARSET_SIZE 		65536
#define UNICODE_CHARSET_BITMAP_SIZE 	32
#define UNICODE_CHARSET_BITMAPS 	(UNICODE_CHARSET_SIZE / (UNICODE_CHARSET_BITMAP_SIZE * 8))
#define UNICODE_CHARSET_BIGMAP_SIZE	(UNICODE_CHARSET_SIZE / 8)

/* Lookup data of a Unicode character set. init_unicode_charset()
   shrinks the allocation so that only the blocks actually in use
   remain at the end of the struct. */
typedef struct {
    unsigned char bitmapindex[UNICODE_CHARSET_BITMAPS];
    					/* Index to char bitmaps; indexed
					   by ordinal >> 8 */
    unsigned char bitmaps[UNICODE_CHARSET_BITMAPS][UNICODE_CHARSET_BITMAP_SIZE];
    					/* Variable length bitmap array */
} unicode_charset;
933 
934 static
init_unicode_charset(mxCharSetObject * cs,PyObject * definition)935 int init_unicode_charset(mxCharSetObject *cs,
936 			 PyObject *definition)
937 {
938     register Py_ssize_t i, j;
939     Py_UNICODE *def = PyUnicode_AS_UNICODE(definition);
940     const Py_ssize_t len = PyUnicode_GET_SIZE(definition);
941     unicode_charset *lookup = 0;
942     unsigned char bigmap[UNICODE_CHARSET_BIGMAP_SIZE];
943     Py_ssize_t blocks;
944     int logic = 1;
945 
946     /* Handle logic change (first char is '^' for negative matching) */
947     if (len > 0 && def[0] == '^') {
948 	logic = 0;
949 	i = 1;
950     }
951     else
952 	i = 0;
953 
954     /* Build bigmap */
955     memset(bigmap, 0, sizeof(bigmap));
956     for (; i < len; i++) {
957 
958 	/* Handle escapes: "b\-d", "\\" */
959 	if (def[i] == '\\') {
960 	    if (i < len - 1 && def[i+1] == '\\') {
961 		j = (int)'\\';
962 		bigmap[j >> 3] |= 1 << (j & 7);
963 		i++;
964 	    }
965 	    continue;
966 	}
967 
968 	/* Handle ranges: "b-d", "\\-z", "\--z" */
969 	if (i < len - 2 && def[i+1] == '-') {
970 	    Py_UNICODE range_left = def[i];
971 	    Py_UNICODE range_right = def[i+2];
972 	    if (range_right >= UNICODE_CHARSET_SIZE) {
973 		Py_Error(PyExc_ValueError,
974 			 "unicode ordinal out of supported range");
975 	    }
976 	    for (j = range_left; j <= range_right; j++)
977 		bigmap[j >> 3] |= 1 << (j & 7);
978 	    i++;
979 	    continue;
980 	}
981 
982 	/* Normal processing */
983 	j = def[i];
984 	if (j >= UNICODE_CHARSET_SIZE) {
985 	    Py_Error(PyExc_ValueError,
986 		     "unicode ordinal out of supported range");
987 	}
988 	bigmap[j >> 3] |= 1 << (j & 7);
989     }
990 
991     /* Build lookup table
992 
993        XXX Could add dynamic resizing here... probably not worth it
994            though, since sizeof(unicode_charset) isn't all that large.
995 
996     */
997     lookup = (unicode_charset *)PyMem_Malloc(sizeof(unicode_charset));
998     if (lookup == NULL) {
999 	PyErr_NoMemory();
1000 	goto onError;
1001     }
1002     blocks = 0;
1003     for (i = UNICODE_CHARSET_BITMAPS - 1; i >= 0; i--) {
1004 	unsigned char *block = &bigmap[i << 5];
1005 	for (j = blocks - 1; j >= 0; j--)
1006 	    if (memcmp(lookup->bitmaps[j], block,
1007 		       UNICODE_CHARSET_BITMAP_SIZE) == 0)
1008 		break;
1009 	if (j < 0) {
1010 	    j = blocks;
1011 	    DPRINTF("init_unicode_charset: Creating new block %i for %i\n",
1012 		    j, i);
1013 	    memcpy(lookup->bitmaps[j], block, UNICODE_CHARSET_BITMAP_SIZE);
1014 	    blocks++;
1015 	}
1016 	else
1017 	    DPRINTF("init_unicode_charset: Reusing block %i for %i\n", j, i);
1018 	lookup->bitmapindex[i] = j;
1019     }
1020     DPRINTF("init_unicode_charset: Map size: %i block(s) = %i bytes\n",
1021 	    blocks, UNICODE_CHARSET_BITMAPS +
1022 	    blocks * UNICODE_CHARSET_BITMAP_SIZE);
1023     lookup = (unicode_charset *)PyMem_Realloc(lookup,
1024 				 UNICODE_CHARSET_BITMAPS
1025 				 + blocks * UNICODE_CHARSET_BITMAP_SIZE);
1026     if (lookup == NULL) {
1027 	PyErr_NoMemory();
1028 	goto onError;
1029     }
1030 
1031     /* Invert bitmaps if negative matching is requested */
1032     if (!logic) {
1033 	register unsigned char *bitmap = &lookup->bitmaps[0][0];
1034 	DPRINTF("init_unicode_charset: inverting bitmaps\n");
1035 	for (i = 0; i < blocks * UNICODE_CHARSET_BITMAP_SIZE; i++)
1036 	    bitmap[i] ^= 0xFF;
1037     }
1038 
1039     cs->mode = MXCHARSET_UCS2MODE;
1040     cs->lookup = (void *)lookup;
1041     return 0;
1042 
1043  onError:
1044     if (lookup)
1045 	PyMem_Free((void *)lookup);
1046     cs->lookup = 0;
1047     return -1;
1048 }
1049 
1050 #endif
1051 
1052 /* allocation */
1053 
1054 static
mxCharSet_New(PyObject * definition)1055 PyObject *mxCharSet_New(PyObject *definition)
1056 {
1057     mxCharSetObject *cs;
1058 
1059     cs = PyObject_NEW(mxCharSetObject, &mxCharSet_Type);
1060     if (cs == NULL)
1061 	return NULL;
1062     Py_INCREF(definition);
1063     cs->definition = definition;
1064     cs->lookup = NULL;
1065     cs->mode = -1;
1066 
1067     if (PyString_Check(definition)) {
1068 	if (init_string_charset(cs, definition))
1069 	    goto onError;
1070     }
1071 #ifdef HAVE_UNICODE
1072     else if (PyUnicode_Check(definition)) {
1073 	if (init_unicode_charset(cs, definition))
1074 	    goto onError;
1075     }
1076 #endif
1077     else
1078 	Py_Error(PyExc_TypeError,
1079 		 "character set definition must be string or unicode");
1080 
1081     return (PyObject *)cs;
1082 
1083  onError:
1084     Py_DECREF(cs);
1085     return NULL;
1086 }
1087 
1088 Py_C_Function( mxCharSet_CharSet,
1089 	       "CharSet(definition)\n\n"
1090 	       "Create a character set matching object from the string"
1091 	       )
1092 {
1093     PyObject *definition;
1094 
1095     Py_GetArg("O:CharSet", definition);
1096     return mxCharSet_New(definition);
1097 
1098  onError:
1099     return NULL;
1100 }
1101 
1102 static
mxCharSet_Free(mxCharSetObject * cs)1103 void mxCharSet_Free(mxCharSetObject *cs)
1104 {
1105     Py_XDECREF(cs->definition);
1106     if (cs->lookup)
1107 	PyMem_Free(cs->lookup);
1108     PyObject_Del(cs);
1109 }
1110 
1111 /* C APIs */
1112 
1113 #define cs ((mxCharSetObject *)self)
1114 
/* Check whether the 8-bit character ch is a member of the character
   set self.

   Returns 1 if ch is in the set, 0 if it is not and -1 (with an
   exception set) in case of an error. */

int mxCharSet_ContainsChar(PyObject *self,
                           register unsigned char ch)
{
    unsigned char *bits;

    if (!mxCharSet_Check(self)) {
        PyErr_BadInternalCall();
        goto onError;
    }

    /* Select the 256-bit bitmap covering the ordinals 0-255 */
    if (cs->mode == MXCHARSET_8BITMODE) {
        bits = ((string_charset *)cs->lookup)->bitmap;
    }
#ifdef HAVE_UNICODE
    else if (cs->mode == MXCHARSET_UCS2MODE) {
        unicode_charset *ucs = (unicode_charset *)cs->lookup;

        /* 8-bit characters always map to the first character block */
        bits = ucs->bitmaps[ucs->bitmapindex[0]];
    }
#endif
    else {
        Py_Error(mxTextTools_Error,
                 "unsupported character set mode");
    }

    /* One bit per character: byte ch / 8, bit ch % 8 */
    return (bits[ch >> 3] >> (ch & 7)) & 1;

 onError:
    return -1;
}
1142 
1143 #ifdef HAVE_UNICODE
1144 
/* Check whether the Unicode character ch is a member of the
   character set self.

   Returns 1 if ch is in the set, 0 if it is not and -1 (with an
   exception set) in case of an error.

   NOTE(review): in UCS2 mode the block index is ch >> 8 without any
   range check; this presumes ch <= 0xFFFF -- confirm for builds with
   a wider Py_UNICODE. */

int mxCharSet_ContainsUnicodeChar(PyObject *self,
                                  register Py_UNICODE ch)
{
    const int bitmask = 1 << (ch & 7);
    unsigned char *bits;

    if (!mxCharSet_Check(self)) {
        PyErr_BadInternalCall();
        goto onError;
    }

    if (cs->mode == MXCHARSET_8BITMODE) {
        /* An 8-bit set cannot contain ordinals beyond 255 */
        if (ch >= 256)
            return 0;
        bits = ((string_charset *)cs->lookup)->bitmap;
        return (bits[ch >> 3] & bitmask) != 0;
    }
    else if (cs->mode == MXCHARSET_UCS2MODE) {
        unicode_charset *ucs = (unicode_charset *)cs->lookup;

        /* High byte selects the block bitmap, low byte the bit in it */
        bits = ucs->bitmaps[ucs->bitmapindex[ch >> 8]];
        return (bits[(ch >> 3) & 31] & bitmask) != 0;
    }
    else {
        Py_Error(mxTextTools_Error,
                 "unsupported character set mode");
    }

 onError:
    return -1;
}
1172 
1173 #endif
1174 
1175 static
mxCharSet_Contains(PyObject * self,PyObject * other)1176 int mxCharSet_Contains(PyObject *self,
1177 		       PyObject *other)
1178 {
1179     if (PyString_Check(other)) {
1180 	Py_Assert(PyString_GET_SIZE(other) == 1,
1181 		  PyExc_TypeError,
1182 		  "expected a single character");
1183 	return mxCharSet_ContainsChar(self, PyString_AS_STRING(other)[0]);
1184     }
1185 #ifdef HAVE_UNICODE
1186     else if (PyUnicode_Check(other)) {
1187 	Py_Assert(PyUnicode_GET_SIZE(other) == 1,
1188 		  PyExc_TypeError,
1189 		  "expected a single unicode character");
1190 	return mxCharSet_ContainsUnicodeChar(self,
1191 					     PyUnicode_AS_UNICODE(other)[0]);
1192     }
1193 #endif
1194     else
1195 	Py_Error(PyExc_TypeError,
1196 		 "expected string or unicode character");
1197 
1198  onError:
1199     return -1;
1200 }
1201 
1202 /* In mode 1, find the position of the first character in text
1203    belonging to set. This may also be stop or start-1 in case no such
1204    character is found during the search (depending on the direction).
1205 
1206    In mode 0, find the first character not in set. This may also be
1207    stop or start-1 in case no such character is found during the
1208    search (depending on the direction).
1209 
1210    The search is done in the slice start:stop.
1211 
1212    -2 is returned in case of an error.
1213 
1214 */
1215 
1216 static
mxCharSet_FindChar(PyObject * self,unsigned char * text,Py_ssize_t start,Py_ssize_t stop,const int mode,const int direction)1217 int mxCharSet_FindChar(PyObject *self,
1218 		       unsigned char *text,
1219 		       Py_ssize_t start,
1220 		       Py_ssize_t stop,
1221 		       const int mode,
1222 		       const int direction)
1223 {
1224     register Py_ssize_t i;
1225     register unsigned int c;
1226     register unsigned int block;
1227     unsigned char *bitmap;
1228 
1229     if (!mxCharSet_Check(self)) {
1230 	PyErr_BadInternalCall();
1231 	goto onError;
1232     }
1233 
1234     if (cs->mode == MXCHARSET_8BITMODE)
1235 	bitmap = ((string_charset *)cs->lookup)->bitmap;
1236 #ifdef HAVE_UNICODE
1237     else if (cs->mode == MXCHARSET_UCS2MODE) {
1238 	unicode_charset *lookup = (unicode_charset *)cs->lookup;
1239 	bitmap = lookup->bitmaps[lookup->bitmapindex[0]];
1240     }
1241 #endif
1242     else {
1243 	Py_Error(mxTextTools_Error,
1244 		 "unsupported character set mode");
1245     }
1246 
1247     if (direction > 0) {
1248 	if (mode)
1249 	    /* Find first char in set */
1250 	    for (i = start; i < stop; i++) {
1251 		c = text[i];
1252 		block = bitmap[c >> 3];
1253 		if (block && ((block & (1 << (c & 7))) != 0))
1254 		    break;
1255 	    }
1256 	else
1257 	    /* Find first char not in set */
1258 	    for (i = start; i < stop; i++) {
1259 		c = text[i];
1260 		block = bitmap[c >> 3];
1261 		if (!block || ((block & (1 << (c & 7))) == 0))
1262 		    break;
1263 	    }
1264     }
1265     else {
1266 	if (mode)
1267 	    /* Find first char in set, searching from the end */
1268 	    for (i = stop - 1; i >= start; i--) {
1269 		c = text[i];
1270 		block = bitmap[c >> 3];
1271 		if (block && ((block & (1 << (c & 7))) != 0))
1272 		    break;
1273 	    }
1274 	else
1275 	    /* Find first char not in set, searching from the end */
1276 	    for (i = stop - 1; i >= start; i--) {
1277 		c = text[i];
1278 		block = bitmap[c >> 3];
1279 		if (!block || ((block & (1 << (c & 7))) == 0))
1280 		    break;
1281 	    }
1282     }
1283     return i;
1284 
1285  onError:
1286     return -2;
1287 }
1288 
1289 #ifdef HAVE_UNICODE
1290 
1291 static
mxCharSet_FindUnicodeChar(PyObject * self,Py_UNICODE * text,Py_ssize_t start,Py_ssize_t stop,const int mode,const int direction)1292 int mxCharSet_FindUnicodeChar(PyObject *self,
1293 			      Py_UNICODE *text,
1294 			      Py_ssize_t start,
1295 			      Py_ssize_t stop,
1296 			      const int mode,
1297 			      const int direction)
1298 {
1299     register int i;
1300     register unsigned int c;
1301     register unsigned int block;
1302     unsigned char *bitmap;
1303 
1304     if (!mxCharSet_Check(self)) {
1305 	PyErr_BadInternalCall();
1306 	goto onError;
1307     }
1308 
1309     if (cs->mode == MXCHARSET_8BITMODE) {
1310 	bitmap = ((string_charset *)cs->lookup)->bitmap;
1311 	if (direction > 0) {
1312 	    if (mode)
1313 		/* Find first char in set */
1314 		for (i = start; i < stop; i++) {
1315 		    c = text[i];
1316 		    if (c > 256)
1317 			continue;
1318 		    block = bitmap[c >> 3];
1319 		    if (block && ((block & (1 << (c & 7))) != 0))
1320 			break;
1321 		}
1322 	    else
1323 		/* Find first char not in set */
1324 		for (i = start; i < stop; i++) {
1325 		    c = text[i];
1326 		    if (c > 256)
1327 			break;
1328 		    block = bitmap[c >> 3];
1329 		    if (!block || ((block & (1 << (c & 7))) == 0))
1330 			break;
1331 		}
1332         }
1333 	else {
1334 	    if (mode)
1335 		/* Find first char in set, searching from the end */
1336 		for (i = stop - 1; i >= start; i--) {
1337 		    c = text[i];
1338 		    if (c > 256)
1339 			continue;
1340 		    block = bitmap[c >> 3];
1341 		    if (block && ((block & (1 << (c & 7))) != 0))
1342 			break;
1343 		}
1344 	    else
1345 		/* Find first char not in set, searching from the end */
1346 		for (i = stop - 1; i >= start; i--) {
1347 		    c = text[i];
1348 		    if (c > 256)
1349 			break;
1350 		    block = bitmap[c >> 3];
1351 		    if (!block || ((block & (1 << (c & 7))) == 0))
1352 			break;
1353 		}
1354 	}
1355 	return i;
1356     }
1357 
1358 #ifdef HAVE_UNICODE
1359     else if (cs->mode == MXCHARSET_UCS2MODE) {
1360 	unicode_charset *lookup = (unicode_charset *)cs->lookup;
1361 	if (direction > 0) {
1362 	    if (mode)
1363 		/* Find first char in set */
1364 		for (i = start; i < stop; i++) {
1365 		    c = text[i];
1366 		    bitmap = lookup->bitmaps[lookup->bitmapindex[c >> 8]];
1367 		    block = bitmap[(c >> 3) & 31];
1368 		    if (block && ((block & (1 << (c & 7))) != 0))
1369 			break;
1370 		}
1371 	    else
1372 		/* Find first char not in set */
1373 		for (i = start; i < stop; i++) {
1374 		    c = text[i];
1375 		    bitmap = lookup->bitmaps[lookup->bitmapindex[c >> 8]];
1376 		    block = bitmap[(c >> 3) & 31];
1377 		    if (!block || ((block & (1 << (c & 7))) == 0))
1378 			break;
1379 		}
1380 	}
1381 	else {
1382 	    if (mode)
1383 		/* Find first char in set, searching from the end */
1384 		for (i = stop - 1; i >= start; i--) {
1385 		    c = text[i];
1386 		    bitmap = lookup->bitmaps[lookup->bitmapindex[c >> 8]];
1387 		    block = bitmap[(c >> 3) & 31];
1388 		    if (block && ((block & (1 << (c & 7))) != 0))
1389 			break;
1390 		}
1391 	    else
1392 		/* Find first char not in set, searching from the end */
1393 		for (i = stop - 1; i >= start; i--) {
1394 		    c = text[i];
1395 		    bitmap = lookup->bitmaps[lookup->bitmapindex[c >> 8]];
1396 		    block = bitmap[(c >> 3) & 31];
1397 		    if (!block || ((block & (1 << (c & 7))) == 0))
1398 			break;
1399 		}
1400 	}
1401 	return i;
1402     }
1403 #endif
1404     else {
1405 	Py_Error(mxTextTools_Error,
1406 		 "unsupported character set mode");
1407     }
1408 
1409  onError:
1410     return -2;
1411 }
1412 
1413 #endif
1414 
1415 /* Return the position of the first character in text[start:stop]
1416    occurring in set or -1 in case no such character exists.
1417 
1418 */
1419 
1420 static
mxCharSet_Search(PyObject * self,PyObject * text,Py_ssize_t start,Py_ssize_t stop,int direction)1421 int mxCharSet_Search(PyObject *self,
1422 		     PyObject *text,
1423 		     Py_ssize_t start,
1424 		     Py_ssize_t stop,
1425 		     int direction)
1426 {
1427     Py_ssize_t position;
1428 
1429     if (PyString_Check(text)) {
1430 	Py_CheckStringSlice(text, start, stop);
1431 	position = mxCharSet_FindChar(self,
1432 				      (unsigned char *)PyString_AS_STRING(text),
1433 				      start,
1434 				      stop,
1435 				      1,
1436 				      direction);
1437     }
1438 #ifdef HAVE_UNICODE
1439     else if (PyUnicode_Check(text)) {
1440 	Py_CheckUnicodeSlice(text, start, stop);
1441 	position = mxCharSet_FindUnicodeChar(self,
1442 					     PyUnicode_AS_UNICODE(text),
1443 					     start,
1444 					     stop,
1445 					     1,
1446 					     direction);
1447     }
1448 #endif
1449     else
1450 	Py_Error(PyExc_TypeError,
1451 		 "expected string or unicode");
1452 
1453     if ((direction > 0 && position >= stop) ||
1454 	(direction <= 0 && position < start))
1455 	position = -1;
1456     return position;
1457 
1458  onError:
1459     return -2;
1460 }
1461 
1462 /* Return the longest match of characters from set in
1463    text[start:stop].
1464 
1465    If direction is positive, the search is done from the left (longest
1466    prefix), otherwise it is started from the right (longest suffix).
1467 
1468    -1 is returned in case of an error.
1469 
1470 */
1471 
mxCharSet_Match(PyObject * self,PyObject * text,Py_ssize_t start,Py_ssize_t stop,int direction)1472 Py_ssize_t mxCharSet_Match(PyObject *self,
1473 		    PyObject *text,
1474 		    Py_ssize_t start,
1475 		    Py_ssize_t stop,
1476 		    int direction)
1477 {
1478     Py_ssize_t position;
1479 
1480     if (PyString_Check(text)) {
1481 	Py_CheckStringSlice(text, start, stop);
1482 	position = mxCharSet_FindChar(self,
1483 				      (unsigned char *)PyString_AS_STRING(text),
1484 				      start,
1485 				      stop,
1486 				      0,
1487 				      direction);
1488     }
1489 #ifdef HAVE_UNICODE
1490     else if (PyUnicode_Check(text)) {
1491 	Py_CheckUnicodeSlice(text, start, stop);
1492 	position = mxCharSet_FindUnicodeChar(self,
1493 					     PyUnicode_AS_UNICODE(text),
1494 					     start,
1495 					     stop,
1496 					     0,
1497 					     direction);
1498     }
1499 #endif
1500     else
1501 	Py_Error(PyExc_TypeError,
1502 		 "expected string or unicode");
1503 
1504     if (position < -1)
1505 	goto onError;
1506     if (direction > 0)
1507 	return position - start;
1508     else
1509 	return stop-1 - position;
1510 
1511  onError:
1512     return -1;
1513 }
1514 
1515 /* Stips off characters appearing in the character set from text[start:stop]
1516    and returns the result as Python string object.
1517 
1518    where indicates the mode:
1519    where < 0: strip left only
1520    where = 0: strip left and right
1521    where > 0: strip right only
1522 
1523 */
1524 static
mxCharSet_Strip(PyObject * self,PyObject * text,Py_ssize_t start,Py_ssize_t stop,Py_ssize_t where)1525 PyObject *mxCharSet_Strip(PyObject *self,
1526 			  PyObject *text,
1527 			  Py_ssize_t start,
1528 			  Py_ssize_t stop,
1529 			  Py_ssize_t where)
1530 {
1531     Py_ssize_t left,right;
1532 
1533     if (!mxCharSet_Check(self)) {
1534 	PyErr_BadInternalCall();
1535 	goto onError;
1536     }
1537 
1538     if (PyString_Check(text)) {
1539 	Py_CheckStringSlice(text, start, stop);
1540 
1541 	/* Strip left */
1542 	if (where <= 0) {
1543 	    left = mxCharSet_FindChar(self,
1544 				      (unsigned char *)PyString_AS_STRING(text),
1545 				      start,
1546 				      stop,
1547 				      0,
1548 				      1);
1549 	    if (left < 0)
1550 		goto onError;
1551 	}
1552 	else
1553 	    left = start;
1554 
1555 	/* Strip right */
1556 	if (where >= 0) {
1557 	    right = mxCharSet_FindChar(self,
1558 				       (unsigned char *)PyString_AS_STRING(text),
1559 				       left,
1560 				       stop,
1561 				       0,
1562 				       -1) + 1;
1563 	    if (right < 0)
1564 		goto onError;
1565 	}
1566 	else
1567 	    right = stop;
1568 
1569 	return PyString_FromStringAndSize(PyString_AS_STRING(text) + left,
1570 					  max(right - left, 0));
1571     }
1572 #ifdef HAVE_UNICODE
1573     else if (PyUnicode_Check(text)) {
1574         Py_CheckUnicodeSlice(text, start, stop);
1575 
1576 	/* Strip left */
1577 	if (where <= 0) {
1578 	    left = mxCharSet_FindUnicodeChar(self,
1579 					     PyUnicode_AS_UNICODE(text),
1580 					     start,
1581 					     stop,
1582 					     0,
1583 					     1);
1584 	    if (left < 0)
1585 		goto onError;
1586 	}
1587 	else
1588 	    left = start;
1589 
1590 	/* Strip right */
1591 	if (where >= 0) {
1592 	    right = mxCharSet_FindUnicodeChar(self,
1593 					     PyUnicode_AS_UNICODE(text),
1594 					     start,
1595 					     stop,
1596 					     0,
1597 					     -1) + 1;
1598 	    if (right < 0)
1599 		goto onError;
1600 	}
1601 	else
1602 	    right = stop;
1603 
1604 	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(text) + left,
1605 				     max(right - left, 0));
1606     }
1607 #endif
1608     else
1609 	Py_Error(PyExc_TypeError,
1610 		 "expected string or unicode");
1611 
1612  onError:
1613     return NULL;
1614 }
1615 
1616 static
mxCharSet_Split(PyObject * self,PyObject * text,Py_ssize_t start,Py_ssize_t text_len,int include_splits)1617 PyObject *mxCharSet_Split(PyObject *self,
1618 			  PyObject *text,
1619 			  Py_ssize_t start,
1620 			  Py_ssize_t text_len,
1621 			  int include_splits)
1622 {
1623     PyObject *list = NULL;
1624     PyObject *s;
1625     register Py_ssize_t x;
1626     Py_ssize_t listitem = 0;
1627     Py_ssize_t listsize = INITIAL_LIST_SIZE;
1628 
1629     if (!mxCharSet_Check(self)) {
1630 	PyErr_BadInternalCall();
1631 	goto onError;
1632     }
1633 
1634     list = PyList_New(listsize);
1635     if (!list)
1636 	goto onError;
1637 
1638     if (PyString_Check(text)) {
1639 	unsigned char *tx = (unsigned char *)PyString_AS_STRING(text);
1640 
1641 	Py_CheckStringSlice(text, start, text_len);
1642 
1643 	x = start;
1644 	while (x < text_len) {
1645 	    Py_ssize_t z;
1646 
1647 	    /* Skip all text in set (include_splits == 0), not in set
1648     	       (include_splits == 1) */
1649 	    z = x;
1650 	    x = mxCharSet_FindChar(self, tx, x, text_len, include_splits, 1);
1651 
1652 	    /* Append the slice to list */
1653 	    if (include_splits) {
1654 		s = PyString_FromStringAndSize((char *)&tx[z], x - z);
1655 		if (!s)
1656 		    goto onError;
1657 		if (listitem < listsize)
1658 		    PyList_SET_ITEM(list,listitem,s);
1659 		else {
1660 		    PyList_Append(list,s);
1661 		    Py_DECREF(s);
1662 		}
1663 		listitem++;
1664 
1665 		if (x >= text_len)
1666 		    break;
1667 	    }
1668 
1669 	    /* Skip all text in set (include_splits == 1), not in set
1670     	       (include_splits == 0) */
1671 	    z = x;
1672 	    x = mxCharSet_FindChar(self, tx, x, text_len, !include_splits, 1);
1673 
1674 	    /* Append the slice to list if it is not empty */
1675 	    if (x > z) {
1676 		s = PyString_FromStringAndSize((char *)&tx[z], x - z);
1677 		if (!s)
1678 		    goto onError;
1679 		if (listitem < listsize)
1680 		    PyList_SET_ITEM(list,listitem,s);
1681 		else {
1682 		    PyList_Append(list,s);
1683 		    Py_DECREF(s);
1684 		}
1685 		listitem++;
1686 	    }
1687 	}
1688 
1689     }
1690 #ifdef HAVE_UNICODE
1691     else if (PyUnicode_Check(text)) {
1692 	Py_UNICODE *tx = PyUnicode_AS_UNICODE(text);
1693 
1694 	Py_CheckUnicodeSlice(text, start, text_len);
1695 
1696 	x = start;
1697 	while (x < text_len) {
1698 	    Py_ssize_t z;
1699 
1700 	    /* Skip all text in set (include_splits == 0), not in set
1701     	       (include_splits == 1) */
1702 	    z = x;
1703 	    x = mxCharSet_FindUnicodeChar(self, tx, x, text_len, include_splits, 1);
1704 
1705 	    /* Append the slice to list */
1706 	    if (include_splits) {
1707 		s = PyUnicode_FromUnicode(&tx[z], x - z);
1708 		if (!s)
1709 		    goto onError;
1710 		if (listitem < listsize)
1711 		    PyList_SET_ITEM(list,listitem,s);
1712 		else {
1713 		    PyList_Append(list,s);
1714 		    Py_DECREF(s);
1715 		}
1716 		listitem++;
1717 
1718 		if (x >= text_len)
1719 		    break;
1720 	    }
1721 
1722 	    /* Skip all text in set (include_splits == 1), not in set
1723     	       (include_splits == 0) */
1724 	    z = x;
1725 	    x = mxCharSet_FindUnicodeChar(self, tx, x, text_len, !include_splits, 1);
1726 
1727 	    /* Append the slice to list if it is not empty */
1728 	    if (x > z) {
1729 		s = PyUnicode_FromUnicode(&tx[z], x - z);
1730 		if (!s)
1731 		    goto onError;
1732 		if (listitem < listsize)
1733 		    PyList_SET_ITEM(list,listitem,s);
1734 		else {
1735 		    PyList_Append(list,s);
1736 		    Py_DECREF(s);
1737 		}
1738 		listitem++;
1739 	    }
1740 	}
1741     }
1742 #endif
1743     else
1744 	Py_Error(PyExc_TypeError,
1745 		 "expected string or unicode");
1746 
1747     /* Resize list if necessary */
1748     if (listitem < listsize)
1749 	PyList_SetSlice(list, listitem, listsize, (PyObject*)NULL);
1750 
1751     return list;
1752 
1753  onError:
1754     Py_XDECREF(list);
1755     return NULL;
1756 }
1757 
1758 /* methods */
1759 
1760 Py_C_Function( mxCharSet_contains,
1761 	       ".contains(char)\n\n"
1762 	       )
1763 {
1764     PyObject *chr;
1765     int rc;
1766 
1767     Py_GetArg("O:CharSet.contains", chr);
1768 
1769     rc = mxCharSet_Contains(self, chr);
1770     if (rc < 0)
1771 	goto onError;
1772     return PyInt_FromLong(rc);
1773 
1774  onError:
1775     return NULL;
1776 }
1777 
1778 Py_C_Function( mxCharSet_search,
1779 	       ".search(text[, direction=1, start=0, stop=len(text)])\n\n"
1780 	       )
1781 {
1782     PyObject *text;
1783     int direction = 1;
1784     Py_ssize_t start = 0, stop = INT_MAX;
1785     int rc;
1786 
1787     Py_Get4Args("O|iii:CharSet.search", text, direction, start, stop);
1788 
1789     rc = mxCharSet_Search(self, text, start, stop, direction);
1790     if (rc == -1)
1791 	Py_ReturnNone();
1792     if (rc < -1)
1793 	goto onError;
1794     return PyInt_FromLong(rc);
1795 
1796  onError:
1797     return NULL;
1798 }
1799 
1800 Py_C_Function( mxCharSet_match,
1801 	       ".match(text[, direction=1, start=0, stop=len(text)])\n\n"
1802 	       )
1803 {
1804     PyObject *text;
1805     int direction = 1;
1806     Py_ssize_t start = 0, stop = INT_MAX;
1807     int rc;
1808 
1809     Py_Get4Args("O|iii:CharSet.match", text, direction, start, stop);
1810 
1811     rc = mxCharSet_Match(self, text, start, stop, direction);
1812     if (rc < 0)
1813 	goto onError;
1814     return PyInt_FromLong(rc);
1815 
1816  onError:
1817     return NULL;
1818 }
1819 
1820 Py_C_Function( mxCharSet_split,
1821 	       ".split(text[, start=0, stop=len(text)])\n\n"
1822 	       )
1823 {
1824     PyObject *text;
1825     Py_ssize_t start = 0, stop = INT_MAX;
1826 
1827     Py_Get3Args("O|ii:CharSet.split", text, start, stop);
1828 
1829     return mxCharSet_Split(self, text, start, stop, 0);
1830 
1831  onError:
1832     return NULL;
1833 }
1834 
1835 Py_C_Function( mxCharSet_splitx,
1836 	       ".splitx(text[, start=0, stop=len(text)])\n\n"
1837 	       )
1838 {
1839     PyObject *text;
1840     Py_ssize_t start = 0, stop = INT_MAX;
1841 
1842     Py_Get3Args("O|ii:CharSet.splitx", text, start, stop);
1843 
1844     return mxCharSet_Split(self, text, start, stop, 1);
1845 
1846  onError:
1847     return NULL;
1848 }
1849 
1850 Py_C_Function( mxCharSet_strip,
1851 	       ".strip(text[, where=0, start=0, stop=len(text)])\n\n"
1852 	       )
1853 {
1854     PyObject *text;
1855     Py_ssize_t where = 0;
1856     Py_ssize_t start = 0, stop = INT_MAX;
1857 
1858     Py_Get4Args("O|iii:CharSet.strip", text, where, start, stop);
1859 
1860     return mxCharSet_Strip(self, text, start, stop, where);
1861 
1862  onError:
1863     return NULL;
1864 }
1865 
1866 #ifdef COPY_PROTOCOL
1867 Py_C_Function( mxCharSet_copy,
1868 	       "copy([memo])\n\n"
1869 	       "Return a new reference for the instance. This function\n"
1870 	       "is used for the copy-protocol. Real copying doesn't take\n"
1871 	       "place, since the instances are immutable.")
1872 {
1873     PyObject *memo;
1874 
1875     Py_GetArg("|O",memo);
1876     Py_INCREF(cs);
1877     return (PyObject *)cs;
1878  onError:
1879     return NULL;
1880 }
1881 #endif
1882 
1883 #undef cs
1884 
1885 /* --- slots --- */
1886 
1887 static
mxCharSet_Repr(mxCharSetObject * self)1888 PyObject *mxCharSet_Repr(mxCharSetObject *self)
1889 {
1890     PyObject *v;
1891     char t[500], *reprstr;
1892 
1893     v = PyObject_Repr(self->definition);
1894     if (v == NULL)
1895 	return NULL;
1896     reprstr = PyString_AsString(v);
1897     if (reprstr == NULL)
1898 	return NULL;
1899     sprintf(t, "<Character Set object for %.400s at 0x%lx>",
1900 	    reprstr, (long)self);
1901     Py_DECREF(v);
1902     return PyString_FromString(t);
1903 }
1904 
1905 /* Python Type Tables */
1906 
1907 static
1908 PySequenceMethods mxCharSet_TypeAsSequence = {
1909     (lenfunc)0,                         /*sq_length*/
1910     (binaryfunc)0,                      /*sq_concat*/
1911     (ssizeargfunc)0,                    /*sq_repeat*/
1912     (ssizeargfunc)0,                    /*sq_item*/
1913     (ssizessizeargfunc)0,               /*sq_slice*/
1914     (ssizeobjargproc)0,                 /*sq_ass_item*/
1915     (ssizessizeobjargproc)0,            /*sq_ass_slice*/
1916     (objobjproc)mxCharSet_Contains,     /*sq_contains*/
1917 };
1918 
1919 static
1920 PyMemberDef mxCharSet_Members[] = {
1921     {"definition",T_OBJECT_EX,offsetof(mxCharSetObject,definition),READONLY,"Definition"},
1922     {NULL}
1923 };
1924 
1925 static
1926 PyMethodDef mxCharSet_Methods[] =
1927 {
1928     Py_MethodListEntry("contains",mxCharSet_contains),
1929     Py_MethodListEntry("search",mxCharSet_search),
1930     Py_MethodListEntry("match",mxCharSet_match),
1931     Py_MethodListEntry("strip",mxCharSet_strip),
1932     Py_MethodListEntry("split",mxCharSet_split),
1933     Py_MethodListEntry("splitx",mxCharSet_splitx),
1934 #ifdef COPY_PROTOCOL
1935     Py_MethodListEntry("__deepcopy__",mxCharSet_copy),
1936     Py_MethodListEntry("__copy__",mxCharSet_copy),
1937 #endif
1938     {NULL,NULL} /* end of list */
1939 };
1940 
1941 PyTypeObject mxCharSet_Type = {
1942     PyVarObject_HEAD_INIT(NULL, 0)      /* init at startup ! */
1943     "Character Set",                    /* tp_name */
1944     sizeof(mxCharSetObject),            /* tp_basicsize */
1945     0,                                  /* tp_itemsize */
1946     /* methods */
1947     (destructor)mxCharSet_Free,         /* tp_dealloc */
1948     (printfunc)0,                       /* tp_print */
1949     (getattrfunc)0,                     /* tp_getattr */
1950     (setattrfunc)0,                     /* tp_setattr */
1951     0,                                  /* tp_compare */
1952     (reprfunc)mxCharSet_Repr,           /* tp_repr */
1953     0,                                  /* tp_as_number */
1954     &mxCharSet_TypeAsSequence,          /* tp_as_sequence */
1955     0,                                  /* tp_as_mapping */
1956     (hashfunc)0,                        /* tp_hash */
1957     (ternaryfunc)0,                     /* tp_call */
1958     (reprfunc)0,                        /* tp_str */
1959     (getattrofunc)0,                    /* tp_getattro */
1960     (setattrofunc)0,                    /* tp_setattro */
1961     0,                                  /* tp_as_buffer */
1962     Py_TPFLAGS_DEFAULT,                 /* tp_flags */
1963     (char*) 0,                          /* tp_doc */
1964     0,                                  /* tp_traverse */
1965     0,                                  /* tp_clear */
1966     0,                                  /* tp_richcompare */
1967     0,                                  /* tp_weaklistoffset */
1968     0,                                  /* tp_iter */
1969     0,                                  /* tp_iternext */
1970     mxCharSet_Methods,                  /* tp_methods */
1971     mxCharSet_Members,                  /* tp_members */
1972 };
1973 
1974 /* --- Tag Table Object ------------------------------------------------*/
1975 
1976 PyObject *mxTagTable_New(PyObject *definition,
1977 			 int tabletype,
1978 			 int cacheable);
1979 
1980 /* internal APIs */
1981 
1982 static
tc_get_item(register PyObject * obj,register Py_ssize_t i)1983 PyObject *tc_get_item(register PyObject *obj,
1984 		      register Py_ssize_t i)
1985 {
1986     if (PyTuple_Check(obj)) {
1987 	if (i > PyTuple_GET_SIZE(obj))
1988 	    return NULL;
1989 	return PyTuple_GET_ITEM(obj, i);
1990     }
1991     else if (PyList_Check(obj)) {
1992 	if (i > PyList_GET_SIZE(obj))
1993 	    return NULL;
1994 	return PyList_GET_ITEM(obj, i);
1995     }
1996     else
1997 	return NULL;
1998 }
1999 
2000 static
tc_length(register PyObject * obj)2001 Py_ssize_t tc_length(register PyObject *obj)
2002 {
2003     if (obj == NULL)
2004 	return -1;
2005     else if (PyTuple_Check(obj))
2006 	return PyTuple_GET_SIZE(obj);
2007     else if (PyList_Check(obj))
2008 	return PyList_GET_SIZE(obj);
2009     else
2010 	return -1;
2011 }
2012 
2013 /* Add a jump target to the jump dictionary */
2014 
2015 static
tc_add_jumptarget(PyObject * jumpdict,PyObject * targetname,Py_ssize_t index)2016 Py_ssize_t tc_add_jumptarget(PyObject *jumpdict,
2017 		      PyObject *targetname,
2018 		      Py_ssize_t index)
2019 {
2020     PyObject *v;
2021 
2022     v = PyDict_GetItem(jumpdict, targetname);
2023     if (v != NULL)
2024 	Py_ErrorWithArg(PyExc_TypeError,
2025 			"tag table entry %d: "
2026 			"jump target already defined", (unsigned int) index);
2027     v = PyInt_FromLong(index);
2028     if (v == NULL)
2029 	goto onError;
2030     if (PyDict_SetItem(jumpdict, targetname, v))
2031 	goto onError;
2032     Py_DECREF(v);
2033     return 0;
2034 
2035  onError:
2036     return -1;
2037 }
2038 
2039 /* Convert a string command argument to either an 8-bit string or
2040    Unicode depending on the tabletype. */
2041 
2042 static
tc_convert_string_arg(PyObject * arg,Py_ssize_t tableposition,int tabletype)2043 PyObject *tc_convert_string_arg(PyObject *arg,
2044 				Py_ssize_t tableposition,
2045 				int tabletype)
2046 {
2047     /* Convert to strings */
2048     if (tabletype == MXTAGTABLE_STRINGTYPE) {
2049 	if (PyString_Check(arg))
2050 	    return arg;
2051 #ifdef HAVE_UNICODE
2052 	else if (PyUnicode_Check(arg)) {
2053 	    Py_DECREF(arg);
2054 	    arg = PyUnicode_AsEncodedString(arg,
2055 					    NULL,
2056 					    NULL);
2057 	    if (arg == NULL)
2058 		Py_ErrorWithArg(PyExc_TypeError,
2059 				"tag table entry %d: "
2060 				"conversion from Unicode to "
2061 				"string failed", (unsigned int)tableposition);
2062 	}
2063 #endif
2064 	else
2065 	    Py_ErrorWithArg(PyExc_TypeError,
2066 			    "tag table entry %d: "
2067 			    "command argument must be a "
2068 			    "string or unicode", (unsigned int)tableposition);
2069     }
2070 
2071 #ifdef HAVE_UNICODE
2072     /* Convert to Unicode */
2073     else if (tabletype == MXTAGTABLE_UNICODETYPE) {
2074 	if (PyUnicode_Check(arg))
2075 	    return arg;
2076 	else if (PyString_Check(arg)) {
2077 	    Py_DECREF(arg);
2078 	    arg = PyUnicode_Decode(PyString_AS_STRING(arg),
2079 				    PyString_GET_SIZE(arg),
2080 				    NULL,
2081 				    NULL);
2082 	    if (arg == NULL)
2083 		Py_ErrorWithArg(PyExc_TypeError,
2084 				"tag table entry %d: "
2085 				"conversion from string to "
2086 				"Unicode failed", (unsigned int)tableposition);
2087 	}
2088 	else
2089 	    Py_ErrorWithArg(PyExc_TypeError,
2090 			    "tag table entry %d: "
2091 			    "command argument must be a "
2092 			    "string or unicode", (unsigned int)tableposition);
2093     }
2094 #endif
2095 
2096     else
2097 	Py_Error(mxTextTools_Error,
2098 		 "unsupported table type");
2099 
2100     return arg;
2101 
2102  onError:
2103     return NULL;
2104 }
2105 
2106 /* Cleanup any references in the tag table. */
2107 
2108 static
tc_cleanup(mxTagTableObject * tagtable)2109 int tc_cleanup(mxTagTableObject *tagtable)
2110 {
2111     Py_ssize_t i;
2112     for (i = 0; i < tagtable->numentries; i++) {
2113 	mxTagTableEntry *tagtableentry = &tagtable->entry[i];
2114 
2115 	Py_XDECREF(tagtableentry->tagobj);
2116 	tagtableentry->tagobj = NULL;
2117 	Py_XDECREF(tagtableentry->args);
2118 	tagtableentry->args = NULL;
2119     }
2120     return 0;
2121 }
2122 
2123 /* Initialize the tag table (this is the actual Tag Table compiler) */
2124 
2125 static
init_tag_table(mxTagTableObject * tagtable,PyObject * table,Py_ssize_t size,int tabletype,int cacheable)2126 int init_tag_table(mxTagTableObject *tagtable,
2127 		   PyObject *table,
2128 		   Py_ssize_t size,
2129 		   int tabletype,
2130 		   int cacheable)
2131 {
2132     Py_ssize_t i;
2133     PyObject *entry;
2134     Py_ssize_t entry_len;
2135     PyObject *tagobj, *command, *args = 0, *je, *jne;
2136     PyObject *jumpdict, *v;
2137     int secondpass, own_args = 0;
2138 
2139     jumpdict = PyDict_New();
2140     if (jumpdict == NULL)
2141 	return -1;
2142 
2143     /* Reset to all fields to 0 */
2144     memset(&tagtable->entry[0], 0, size * sizeof(mxTagTableEntry));
2145 
2146     /* First pass */
2147     secondpass = 0;
2148     tagtable->numentries = size;
2149     for (i = 0; i < size; i++) {
2150 	mxTagTableEntry *tagtableentry = &tagtable->entry[i];
2151 
2152 	/* Get table entry i and parse it */
2153 	entry = tc_get_item(table, i);
2154 	if (entry == NULL) {
2155 	    Py_ErrorWithArg(PyExc_TypeError,
2156 			    "tag table entry %d: "
2157 			    "not found or not a supported entry type", (unsigned int)i);
2158 	}
2159 
2160 	/* Special handling for jump marks (args is set to the jump
2161 	   mark string, jump target index is the next table entry) */
2162 	if (PyString_Check(entry)) {
2163 	    if (tc_add_jumptarget(jumpdict, entry, i + 1))
2164 		goto onError;
2165 	    tagtableentry->tagobj = NULL;
2166 	    tagtableentry->cmd = MATCH_JUMPTARGET;
2167 	    tagtableentry->flags = 0;
2168 	    Py_INCREF(entry);
2169 	    tagtableentry->args = entry;
2170 	    tagtableentry->jne = 0;
2171 	    tagtableentry->je = 1;
2172 	    continue;
2173 	}
2174 
2175 	/* Get entry length */
2176 	entry_len = tc_length(entry);
2177 	if (entry_len < 3) {
2178 	    Py_ErrorWithArg(PyExc_TypeError,
2179 			    "tag table entry %d: "
2180 			    "expected an entry of the form "
2181 			    "(tagobj,command,arg[,jne[,je]])", (unsigned int)i);
2182 	}
2183 
2184 	/* Decode entry parts: (tagobj, command, args[, jne[, je]]) */
2185 	tagobj = tc_get_item(entry, 0);
2186 	command = tc_get_item(entry, 1);
2187 	args = tc_get_item(entry, 2);
2188 	if (entry_len >= 4)
2189 	    jne = tc_get_item(entry, 3);
2190 	else
2191 	    jne = NULL;
2192 	if (entry_len >= 5)
2193 	    je = tc_get_item(entry, 4);
2194 	else
2195 	    je = NULL;
2196 
2197 	if (tagobj == NULL ||
2198 	    command == NULL ||
2199 	    args == NULL ||
2200 	    (entry_len >= 4 && jne == NULL) ||
2201 	    (entry_len >= 5 && je == NULL)) {
2202 	    Py_ErrorWithArg(PyExc_TypeError,
2203 			    "tag table entry %d: "
2204 			    "expected an entry of the form "
2205 			    "(tagobj,command,arg[,jne[,je]])",(unsigned int) i);
2206 	}
2207 
2208 	/* Store tagobj, None gets converted to NULL */
2209 	if (tagobj != Py_None)
2210 	    Py_INCREF(tagobj);
2211 	else
2212 	    tagobj = NULL;
2213 	tagtableentry->tagobj = tagobj;
2214 
2215 	/* Decode command and flags */
2216 	Py_AssertWithArg(PyInt_Check(command),
2217 			 PyExc_TypeError,
2218 			 "tag table entry %d: "
2219 			 "command must be an integer",(unsigned int)i);
2220 	tagtableentry->cmd = PyInt_AS_LONG(command) & 0xFF;
2221 	tagtableentry->flags = PyInt_AS_LONG(command) - tagtableentry->cmd;
2222 
2223 	/* Check command arguments */
2224 	Py_INCREF(args);
2225 	own_args = 1;
2226 
2227 	switch (tagtableentry->cmd) {
2228 
2229 	case MATCH_JUMP: /* == MATCH_FAIL */
2230 	case MATCH_EOF:
2231 	case MATCH_LOOP:
2232 	    /* args is ignored */
2233 	    break;
2234 
2235 	case MATCH_SKIP:
2236 	case MATCH_MOVE:
2237 	case MATCH_LOOPCONTROL:
2238 	    Py_AssertWithArg(PyInt_Check(args),
2239 			     PyExc_TypeError,
2240 			     "tag table entry %d: "
2241 			     "Skip|Move|LoopControl command argument "
2242 			     "must be an integer", (unsigned int)i);
2243 	    break;
2244 
2245 	case MATCH_JUMPTARGET:
2246 	    Py_AssertWithArg(PyString_Check(args),
2247 			     PyExc_TypeError,
2248 			     "tag table entry %d: "
2249 			     "JumpMark command argument must be a string",(unsigned int)i);
2250 	    if (tc_add_jumptarget(jumpdict, args, i + 1))
2251 		goto onError;
2252 	    break;
2253 
2254 	case MATCH_ALLIN:
2255 	case MATCH_ALLNOTIN:
2256 	case MATCH_IS:
2257 	case MATCH_ISIN:
2258 	case MATCH_ISNOTIN:
2259 	case MATCH_WORD:
2260 	case MATCH_WORDSTART:
2261 	case MATCH_WORDEND:
2262 	    args = tc_convert_string_arg(args, i, tabletype);
2263 	    if (args == NULL)
2264 		goto onError;
2265 	    break;
2266 
2267 	case MATCH_ALLINSET:
2268 	case MATCH_ISINSET:
2269 	    Py_AssertWithArg(PyString_Check(args) &&
2270 			     PyString_GET_SIZE(args) == 32,
2271 			     PyExc_TypeError,
2272 			     "tag table entry %d: "
2273 			     "AllInSet|IsInSet command argument must "
2274 			     "be a set() string",(unsigned int)i);
2275 	    break;
2276 
2277 	case MATCH_ALLINCHARSET:
2278 	case MATCH_ISINCHARSET:
2279 	    Py_AssertWithArg(mxCharSet_Check(args),
2280 			     PyExc_TypeError,
2281 			     "tag table entry %d: "
2282 			     "AllInCharSet|IsInCharSet command argument must "
2283 			     "be a CharSet instance",(unsigned int)i);
2284 	    break;
2285 
2286 	case MATCH_SWORDSTART: /* == MATCH_NOWORD */
2287 	case MATCH_SWORDEND:
2288 	case MATCH_SFINDWORD:
2289 	    Py_AssertWithArg(mxTextSearch_Check(args),
2290 			     PyExc_TypeError,
2291 			     "tag table entry %d: "
2292 			     "sWordStart|sWordEnd|sFindWord command "
2293 			     "argument must be a TextSearch search "
2294 			     "object",(unsigned int)i);
2295 	    break;
2296 
2297 	case MATCH_TABLE:
2298 	case MATCH_SUBTABLE:
2299 	    Py_AssertWithArg(mxTagTable_Check(args) ||
2300 			     PyTuple_Check(args) ||
2301 			     PyList_Check(args) ||
2302 			     (PyInt_Check(args) &&
2303 			      PyInt_AS_LONG(args) == MATCH_THISTABLE),
2304 			     PyExc_TypeError,
2305 			     "tag table entry %d: "
2306 			     "Table|SubTable command argument "
2307 			     "must be a tag table tuple/object or "
2308 			     "ThisTable", (unsigned int)i);
2309 	    /* XXX We shouldn't recursively compile tag table tuples here
2310 		   because this will slow down the compile process
2311 		   too much and it's not clear whether this particular
2312 		   table will ever be used during tagging.
2313 	    */
2314 	    if (!mxTagTable_Check(args) && !PyInt_Check(args)) {
2315 		Py_DECREF(args);
2316 		args = mxTagTable_New(args, tabletype, cacheable);
2317 		if (args == NULL)
2318 		    goto onError;
2319 	    }
2320 	    break;
2321 
2322 	case MATCH_TABLEINLIST:
2323 	case MATCH_SUBTABLEINLIST:
2324 	    Py_AssertWithArg(PyTuple_Check(args) &&
2325 			     PyTuple_GET_SIZE(args) == 2 &&
2326 			     PyList_Check(PyTuple_GET_ITEM(args, 0)) &&
2327 			     PyInt_Check(PyTuple_GET_ITEM(args, 1)),
2328 			     PyExc_TypeError,
2329 			     "tag table entry %d: "
2330 			     "TableInList|SubTableInList command argument "
2331 			     "must be a 2-tuple (list, integer)",
2332 			     (unsigned int)i);
2333 	    break;
2334 
2335 	case MATCH_CALL:
2336 	    Py_AssertWithArg(PyCallable_Check(args),
2337 			     PyExc_TypeError,
2338 			     "tag table entry %d: "
2339 			     "Call command argument "
2340 			     "must be a callable object",
2341 			     (unsigned int)i);
2342 	    break;
2343 
2344 	case MATCH_CALLARG:
2345 	    Py_AssertWithArg(PyTuple_Check(args) &&
2346 			     PyTuple_GET_SIZE(args) > 0 &&
2347 			     PyCallable_Check(PyTuple_GET_ITEM(args, 0)),
2348 			     PyExc_TypeError,
2349 			     "tag table entry %d: "
2350 			     "CallArg command argument "
2351 			     "must be a tuple (fct,[arg0,arg1,...])",
2352 			     (unsigned int)i);
2353 	    break;
2354 
2355 	default:
2356 	    Py_ErrorWith2Args(PyExc_TypeError,
2357 			      "tag table entry %d: "
2358 			      "unknown command integer: %i",
2359 			      (unsigned int)i, tagtableentry->cmd);
2360 
2361 	}
2362 
2363 	/* Store command args */
2364 	tagtableentry->args = args;
2365 	own_args = 0;
2366 
2367 	/* Decode jump offsets */
2368 	if (jne) {
2369 	    if (PyInt_Check(jne))
2370 		tagtableentry->jne = PyInt_AS_LONG(jne);
2371 	    else if (PyString_Check(jne)) {
2372 		/* Mark for back-patching */
2373 		tagtableentry->jne = -424242;
2374 		secondpass = 1;
2375 	    }
2376 	    else
2377 		Py_ErrorWithArg(PyExc_TypeError,
2378 				"tag table entry %d: "
2379 				"jne must be an integer or string", (unsigned int)i);
2380 	}
2381 	else
2382 	    tagtableentry->jne = 0;
2383 
2384 	if (je) {
2385 	    if (PyInt_Check(je))
2386 		tagtableentry->je = PyInt_AS_LONG(je);
2387 	    else if (PyString_Check(je)) {
2388 		/* Mark for back-patching */
2389 		tagtableentry->je = -424242;
2390 		secondpass = 1;
2391 	    }
2392 	    else
2393 		Py_ErrorWithArg(PyExc_TypeError,
2394 				"tag table entry %d: "
2395 				"je must be an integer or string", (unsigned int)i);
2396 	}
2397 	else
2398 	    tagtableentry->je = 1;
2399     }
2400 
2401     /* Second pass (needed to patch string jump targets) */
2402     if (secondpass)
2403 	for (i = 0; i < size; i++) {
2404 	    mxTagTableEntry *tagtableentry = &tagtable->entry[i];
2405 
2406 	    if (tagtableentry->je != -424242 &&
2407 		tagtableentry->jne != -424242)
2408 		continue;
2409 
2410 	    /* Entry (most probably) needs back-patching */
2411 	    entry = tc_get_item(table, i);
2412 	    if (entry == NULL) {
2413 		Py_ErrorWithArg(PyExc_TypeError,
2414 				"tag table entry %d: "
2415 				"unexpected error (not found)", (unsigned int)i);
2416 	    }
2417 
2418 	    /* Get entry length */
2419 	    entry_len = tc_length(entry);
2420 	    if (entry_len < 0) {
2421 		Py_ErrorWithArg(PyExc_TypeError,
2422 				"tag table entry %d: "
2423 				"unexpected error (no length)", (unsigned int)i);
2424 	    }
2425 
2426 	    /* Decode jump offsets */
2427 	    if (entry_len >= 4)
2428 		jne = tc_get_item(entry, 3);
2429 	    else
2430 		jne = NULL;
2431 	    if (entry_len >= 5)
2432 		je = tc_get_item(entry, 4);
2433 	    else
2434 		je = NULL;
2435 
2436 	    /* Patch jump offsets */
2437 	    if (jne && PyString_Check(jne)) {
2438 		v = PyDict_GetItem(jumpdict, jne);
2439 		if (v == NULL || !PyInt_Check(v))
2440 		    Py_ErrorWith2Args(PyExc_TypeError,
2441 				      "tag table entry %d: "
2442 				      "jne jump target '%s' not found",
2443 				      (unsigned int)i, PyString_AS_STRING(jne));
2444 		tagtableentry->jne = PyInt_AS_LONG(v) - i;
2445 	    }
2446 	    if (je && PyString_Check(je)) {
2447 		v = PyDict_GetItem(jumpdict, je);
2448 		if (v == NULL || !PyInt_Check(v))
2449 		    Py_ErrorWith2Args(PyExc_TypeError,
2450 				      "tag table entry %d: "
2451 				      "je jump target '%s' not found",
2452 				      (unsigned int)i, PyString_AS_STRING(je));
2453 		tagtableentry->je = PyInt_AS_LONG(v) - i;
2454 	    }
2455 	}
2456 
2457     Py_DECREF(jumpdict);
2458     return 0;
2459 
2460  onError:
2461     if (own_args) {
2462 	Py_DECREF(args);
2463     }
2464     return -1;
2465 }
2466 
2467 /* Check the cache for an already compiled TagTable for this
2468    definition.  Return NULL in case of an error, Py_None without
2469    INCREF in case no such table was found or the TagTable object. */
2470 
2471 static
consult_tagtable_cache(PyObject * definition,int tabletype,int cacheable)2472 PyObject *consult_tagtable_cache(PyObject *definition,
2473 				 int tabletype,
2474 				 int cacheable)
2475 {
2476     PyObject *v, *key, *tt;
2477 
2478     if (!PyTuple_Check(definition) || !cacheable)
2479 	return Py_None;
2480 
2481     key = PyTuple_New(2);
2482     if (key == NULL)
2483 	goto onError;
2484     v = PyInt_FromLong((long) definition);
2485     if (v == NULL)
2486 	goto onError;
2487     PyTuple_SET_ITEM(key, 0, v);
2488     v = PyInt_FromLong(tabletype);
2489     if (v == NULL)
2490 	goto onError;
2491     PyTuple_SET_ITEM(key, 1, v);
2492     tt = PyDict_GetItem(mxTextTools_TagTables, key);
2493     Py_DECREF(key);
2494     if (tt != NULL) {
2495 	Py_INCREF(tt);
2496 	return tt;
2497     }
2498     return Py_None;
2499 
2500  onError:
2501     return NULL;
2502 }
2503 
2504 /* Adds the compiled tagtable to the cache. Returns -1 in case of an
2505    error, 0 on success. */
2506 
2507 static
add_to_tagtable_cache(PyObject * definition,int tabletype,int cacheable,PyObject * tagtable)2508 int add_to_tagtable_cache(PyObject *definition,
2509 			  int tabletype,
2510 			  int cacheable,
2511 			  PyObject *tagtable)
2512 {
2513     PyObject *v, *key;
2514     int rc;
2515 
2516     if (!PyTuple_Check(definition) || !cacheable)
2517 	return 0;
2518 
2519     key = PyTuple_New(2);
2520     if (key == NULL)
2521 	goto onError;
2522     v = PyInt_FromLong((long) definition);
2523     if (v == NULL)
2524 	goto onError;
2525     PyTuple_SET_ITEM(key, 0, v);
2526     v = PyInt_FromLong(tabletype);
2527     if (v == NULL)
2528 	goto onError;
2529     PyTuple_SET_ITEM(key, 1, v);
2530 
2531     /* Hard-limit the cache size */
2532     if (PyDict_Size(mxTextTools_TagTables) >= MAX_TAGTABLES_CACHE_SIZE)
2533 	PyDict_Clear(mxTextTools_TagTables);
2534 
2535     rc = PyDict_SetItem(mxTextTools_TagTables, key, tagtable);
2536     Py_DECREF(key);
2537     if (rc)
2538 	goto onError;
2539     return 0;
2540 
2541  onError:
2542     return -1;
2543 }
2544 
2545 
2546 /* allocation */
2547 
/* Create a compiled TagTable object from a tag table definition.

   The TagTable cache is consulted first; on a hit the cached object
   is returned. Otherwise a new object with room for one entry per
   definition item is allocated, compiled via init_tag_table() and
   offered to the cache.

   If cacheable is true, the new object keeps a reference to the
   definition; otherwise its definition slot is NULL.

   Returns a new reference, or NULL with an exception set. */

PyObject *mxTagTable_New(PyObject *definition,
                         int tabletype,
                         int cacheable)
{
    mxTagTableObject *tagtable = 0;
    PyObject *v;
    Py_ssize_t size;

    /* First, consult the TagTable cache (Py_None means: not found) */
    v = consult_tagtable_cache(definition, tabletype, cacheable);
    if (v == NULL)
        goto onError;
    else if (v != Py_None)
        return v;

    size = tc_length(definition);
    if (size < 0)
        Py_Error(PyExc_TypeError,
                 "tag table definition must be a tuple or a list");

    tagtable = PyObject_NEW_VAR(mxTagTableObject, &mxTagTable_Type, size);
    if (tagtable == NULL)
        goto onError;
    /* The allocator does not initialize our own fields; make sure the
       deallocator never walks uninitialized entries if compilation
       bails out before it has set up the entry array. */
    tagtable->numentries = 0;
    if (cacheable) {
        Py_INCREF(definition);
        tagtable->definition = definition;
    }
    else
        tagtable->definition = NULL;
    tagtable->tabletype = tabletype;

    /* Compile table ... */
    if (init_tag_table(tagtable, definition, size, tabletype, cacheable))
        goto onError;

    /* Cache the compiled table if it is cacheable and derived from a
       tuple */
    if (add_to_tagtable_cache(definition, tabletype, cacheable,
                              (PyObject *)tagtable))
        goto onError;

    return (PyObject *)tagtable;

 onError:
    Py_XDECREF(tagtable);
    return NULL;
}
2595 
2596 Py_C_Function( mxTagTable_TagTable,
2597 	       "TagTable(definition[,cachable=1])\n\n"
2598 	       )
2599 {
2600     PyObject *definition;
2601     int cacheable = 1;
2602 
2603     Py_Get2Args("O|i:TagTable", definition, cacheable);
2604     return mxTagTable_New(definition, 0, cacheable);
2605 
2606  onError:
2607     return NULL;
2608 }
2609 
2610 #ifdef HAVE_UNICODE
2611 Py_C_Function( mxTagTable_UnicodeTagTable,
2612 	       "TagTable(definition[,cachable=1])\n\n"
2613 	       )
2614 {
2615     PyObject *definition;
2616     int cacheable = 1;
2617 
2618     Py_Get2Args("O|i:UnicodeTagTable", definition, cacheable);
2619     return mxTagTable_New(definition, 1, cacheable);
2620 
2621  onError:
2622     return NULL;
2623 }
2624 #endif
2625 
2626 static
mxTagTable_Free(mxTagTableObject * tagtable)2627 void mxTagTable_Free(mxTagTableObject *tagtable)
2628 {
2629     tc_cleanup(tagtable);
2630     Py_XDECREF(tagtable->definition);
2631     PyObject_Del(tagtable);
2632 }
2633 
2634 /* C APIs */
2635 
2636 #define tagtable ((mxTagTableObject *)self)
2637 
2638 static
mxTagTable_CompiledDefinition(PyObject * self)2639 PyObject *mxTagTable_CompiledDefinition(PyObject *self)
2640 {
2641     PyObject *tuple = 0, *v, *w;
2642     Py_ssize_t i;
2643     Py_ssize_t size;
2644 
2645     if (!mxTagTable_Check(self)) {
2646 	PyErr_BadInternalCall();
2647 	goto onError;
2648     }
2649 
2650     size = tagtable->numentries;
2651     tuple = PyTuple_New(size);
2652     if (tuple == NULL)
2653 	goto onError;
2654 
2655     for (i = 0; i < size; i++) {
2656 	mxTagTableEntry *tagtableentry = &tagtable->entry[i];
2657 
2658 	/* Build tuple (tagobj, command, args, jne, je) */
2659 	v = PyTuple_New(5);
2660 	if (v == NULL)
2661 	    goto onError;
2662 	w = tagtableentry->tagobj;
2663 	if (w == NULL)
2664 	    w = Py_None;
2665 	Py_INCREF(w);
2666 	PyTuple_SET_ITEM(v, 0, w);
2667 	PyTuple_SET_ITEM(v, 1, PyInt_FromLong(tagtableentry->cmd |
2668 					      tagtableentry->flags));
2669 	w = tagtableentry->args;
2670 	if (w == NULL)
2671 	    w = Py_None;
2672 	Py_INCREF(w);
2673 	PyTuple_SET_ITEM(v, 2, w);
2674 	PyTuple_SET_ITEM(v, 3, PyInt_FromLong(tagtableentry->jne));
2675 	PyTuple_SET_ITEM(v, 4, PyInt_FromLong(tagtableentry->je));
2676 	if (PyErr_Occurred()) {
2677 	    Py_DECREF(v);
2678 	    goto onError;
2679 	}
2680 	PyTuple_SET_ITEM(tuple, i, v);
2681     }
2682 
2683     return tuple;
2684 
2685  onError:
2686     Py_XDECREF(tuple);
2687     return NULL;
2688 }
2689 
2690 
2691 /* methods */
2692 
2693 Py_C_Function( mxTagTable_compiled,
2694 	       ".compiled()\n\n"
2695 	       )
2696 {
2697     Py_NoArgsCheck();
2698     return mxTagTable_CompiledDefinition(self);
2699 
2700  onError:
2701     return NULL;
2702 }
2703 
2704 #ifdef COPY_PROTOCOL
2705 Py_C_Function( mxTagTable_copy,
2706 	       "copy([memo])\n\n"
2707 	       "Return a new reference for the instance. This function\n"
2708 	       "is used for the copy-protocol. Real copying doesn't take\n"
2709 	       "place, since the instances are immutable.")
2710 {
2711     PyObject *memo;
2712 
2713     Py_GetArg("|O",memo);
2714     Py_INCREF(tagtable);
2715     return (PyObject *)tagtable;
2716 
2717  onError:
2718     return NULL;
2719 }
2720 #endif
2721 
2722 #undef tagtable
2723 
2724 /* --- slots --- */
2725 
2726 static
mxTagTable_Repr(mxTagTableObject * self)2727 PyObject *mxTagTable_Repr(mxTagTableObject *self)
2728 {
2729     char t[100];
2730 
2731     if (self->tabletype == MXTAGTABLE_STRINGTYPE)
2732 	sprintf(t,"<String Tag Table object at 0x%lx>", (long)self);
2733     else if (self->tabletype == MXTAGTABLE_UNICODETYPE)
2734 	sprintf(t,"<Unicode Tag Table object at 0x%lx>", (long)self);
2735     else
2736 	sprintf(t,"<Tag Table object at 0x%lx>", (long)self);
2737     return PyString_FromString(t);
2738 }
2739 
2740 static
2741 PyMethodDef mxTagTable_Methods[] =
2742 {
2743     Py_MethodListEntryNoArgs("compiled",mxTagTable_compiled),
2744 #ifdef COPY_PROTOCOL
2745     Py_MethodListEntry("__deepcopy__",mxTagTable_copy),
2746     Py_MethodListEntry("__copy__",mxTagTable_copy),
2747 #endif
2748     {NULL,NULL} /* end of list */
2749 };
2750 
2751 static
2752 PyMemberDef mxTagTable_Members[] = {
2753     {"definition",T_OBJECT_EX,offsetof(mxTagTableObject,definition),READONLY,"Definition"},
2754     {NULL}
2755 };
2756 
2757 /* Python Type Tables */
2758 
2759 PyTypeObject mxTagTable_Type = {
2760     PyVarObject_HEAD_INIT(NULL, 0)          /* init at startup ! */
2761     "Tag Table",                            /* tp_name */
2762     sizeof(mxTagTableObject),               /* tp_basicsize */
2763     sizeof(mxTagTableEntry),                /* tp_itemsize */
2764     /* methods */
2765     (destructor)mxTagTable_Free,            /* tp_dealloc */
2766     (printfunc)0,                           /* tp_print */
2767     (getattrfunc)0,                         /* tp_getattr */
2768     (setattrfunc)0,                         /* tp_setattr */
2769     0,                                      /* tp_compare */
2770     (reprfunc)mxTagTable_Repr,              /* tp_repr */
2771     0,                                      /* tp_as_number */
2772     0,                                      /* tp_as_sequence */
2773     0,                                      /* tp_as_mapping */
2774     (hashfunc)0,                            /* tp_hash */
2775     (ternaryfunc)0,                         /* tp_call */
2776     (reprfunc)0,                            /* tp_str */
2777     (getattrofunc)0,                        /* tp_getattro */
2778     (setattrofunc)0,                        /* tp_setattro */
2779     0,                                      /* tp_as_buffer */
2780     Py_TPFLAGS_DEFAULT,                     /* tp_flags */
2781     (char*) 0,                              /* tp_doc */
2782     0,                                      /* tp_traverse */
2783     0,                                      /* tp_clear */
2784     0,                                      /* tp_richcompare */
2785     0,                                      /* tp_weaklistoffset */
2786     0,                                      /* tp_iter */
2787     0,                                      /* tp_iternext */
2788     mxTagTable_Methods,                     /* tp_methods */
2789     mxTagTable_Members,                     /* tp_members */
2790 };
2791 
2792 /* --- Internal functions ----------------------------------------------*/
2793 
2794 #ifdef HAVE_UNICODE
2795 
2796 /* Same as mxTextTools_Join() for Unicode objects. */
2797 
2798 static
mxTextTools_UnicodeJoin(PyObject * seq,Py_ssize_t start,Py_ssize_t stop,PyObject * separator)2799 PyObject *mxTextTools_UnicodeJoin(PyObject *seq,
2800 				  Py_ssize_t start,
2801 				  Py_ssize_t stop,
2802 				  PyObject *separator)
2803 {
2804     PyObject *newstring = 0, *tempstr = 0;
2805     Py_ssize_t newstring_len,current_len = 0;
2806     Py_UNICODE *p;
2807     Py_ssize_t i;
2808     Py_UNICODE *sep;
2809     Py_ssize_t sep_len;
2810 
2811     if (separator) {
2812 	separator = PyUnicode_FromObject(separator);
2813 	if (separator == NULL)
2814 	    goto onError;
2815 	sep = PyUnicode_AS_UNICODE(separator);
2816 	sep_len = PyUnicode_GET_SIZE(separator);
2817     }
2818     else {
2819 	sep = NULL;
2820 	sep_len = 0;
2821     }
2822 
2823     /* Create an empty new string */
2824     newstring_len = (10 + sep_len) * (stop - start);
2825     newstring = PyUnicode_FromUnicode(NULL, newstring_len);
2826     if (newstring == NULL)
2827 	goto onError;
2828     p = PyUnicode_AS_UNICODE(newstring);
2829 
2830     /* Join with separator */
2831     for (i = start; i < stop; i++) {
2832 	register PyObject *o;
2833 	Py_UNICODE *st;
2834 	Py_ssize_t len_st;
2835 
2836 	o = PySequence_GetItem(seq, i);
2837 
2838 	if (PyTuple_Check(o)) {
2839 	    /* Tuple entry: (string,l,r,[...]) */
2840 	    register Py_ssize_t l,r;
2841 
2842 	    /* parse tuple */
2843 	    Py_Assert((PyTuple_GET_SIZE(o) >= 3) &&
2844 		      PyInt_Check(PyTuple_GET_ITEM(o,1)) &&
2845 		      PyInt_Check(PyTuple_GET_ITEM(o,2)),
2846 		      PyExc_TypeError,
2847 		      "tuples must be of the format (string,l,r[,...])");
2848 	    tempstr = PyUnicode_FromObject(PyTuple_GET_ITEM(o,0));
2849 	    if (tempstr == NULL)
2850 		goto onError;
2851 	    st = PyUnicode_AS_UNICODE(tempstr);
2852 	    len_st = PyUnicode_GET_SIZE(tempstr);
2853 	    l = PyInt_AS_LONG(PyTuple_GET_ITEM(o,1));
2854 	    r = PyInt_AS_LONG(PyTuple_GET_ITEM(o,2));
2855 
2856 	    /* compute slice */
2857 	    if (r > len_st) r = len_st;
2858 	    else if (r < 0) {
2859 		r += len_st + 1;
2860 		if (r < 0)
2861 		    r = 0;
2862 	    }
2863 	    if (l > len_st) l = len_st;
2864 	    else if (l < 0) {
2865 		l += len_st + 1;
2866 		if (l < 0)
2867 		    l = 0;
2868 	    }
2869 
2870 	    /* empty ? */
2871 	    if (l > r)
2872 		continue;
2873 	    len_st = r - l;
2874 	    if (len_st == 0)
2875 		continue;
2876 
2877 	    /* get pointer right */
2878 	    st += l;
2879 	}
2880 	else {
2881 	    /* Must be a string entry: take the whole string */
2882 	    tempstr = PyUnicode_FromObject(o);
2883 	    if (tempstr == NULL)
2884 		goto onError;
2885 	    st = PyUnicode_AS_UNICODE(tempstr);
2886 	    len_st = PyUnicode_GET_SIZE(tempstr);
2887 	}
2888 
2889         Py_DECREF(o);
2890 
2891 	/* Resize the new string if needed */
2892 	while (current_len + len_st + sep_len >= newstring_len) {
2893 	    newstring_len += newstring_len >> 1;
2894 	    if (PyUnicode_Resize(&newstring, newstring_len))
2895 		goto onError;
2896 	    p = PyUnicode_AS_UNICODE(newstring) + current_len;
2897 	}
2898 
2899 	/* Insert separator */
2900 	if (i > 0 && sep_len > 0) {
2901 	    Py_UNICODE_COPY(p, sep, sep_len);
2902 	    p += sep_len;
2903 	    current_len += sep_len;
2904 	}
2905 
2906 	/* Copy snippet into new string */
2907 	Py_UNICODE_COPY(p, st, len_st);
2908 	p += len_st;
2909 	current_len += len_st;
2910 
2911 	Py_DECREF(tempstr);
2912 	tempstr = NULL;
2913     }
2914 
2915     /* Resize new string to the actual length */
2916     if (PyUnicode_Resize(&newstring, current_len))
2917 	goto onError;
2918 
2919     Py_XDECREF(separator);
2920     return newstring;
2921 
2922  onError:
2923     Py_XDECREF(newstring);
2924     Py_XDECREF(separator);
2925     Py_XDECREF(tempstr);
2926     return NULL;
2927 }
2928 
2929 #endif
2930 
2931 /* Enhanced string join: also excepts tuple (text, left, right,...)
2932    entries which then cause text[left:right] to be used as string
2933    snippet.
2934 
2935    separator may be NULL; in that case, "" is used as separator.
2936 
2937 */
2938 
2939 static
mxTextTools_Join(PyObject * seq,Py_ssize_t start,Py_ssize_t stop,PyObject * separator)2940 PyObject *mxTextTools_Join(PyObject *seq,
2941 			   Py_ssize_t start,
2942 			   Py_ssize_t stop,
2943 			   PyObject *separator)
2944 {
2945     PyObject *newstring = 0;
2946     Py_ssize_t newstring_len, current_len = 0;
2947     char *p;
2948     Py_ssize_t i;
2949     char *sep;
2950     Py_ssize_t sep_len;
2951 
2952     if (separator) {
2953 #ifdef HAVE_UNICODE
2954 	if (PyUnicode_Check(separator))
2955 	    return mxTextTools_UnicodeJoin(seq, start, stop, separator);
2956 #endif
2957 	Py_Assert(PyString_Check(separator),
2958 		  PyExc_TypeError,
2959 		  "separator must be a string");
2960 	sep = PyString_AS_STRING(separator);
2961 	sep_len = PyString_GET_SIZE(separator);
2962     }
2963     else {
2964 	sep = NULL;
2965 	sep_len = 0;
2966     }
2967 
2968     /* Create an empty new string */
2969     newstring_len = (10 + sep_len) * (stop - start);
2970     newstring = PyString_FromStringAndSize((char*)NULL, newstring_len);
2971     if (newstring == NULL)
2972 	goto onError;
2973     p = PyString_AS_STRING(newstring);
2974 
2975     /* Join with separator */
2976     for (i = start; i < stop; i++) {
2977 	register PyObject *o;
2978 	char *st;
2979 	Py_ssize_t len_st;
2980 
2981 	o = PySequence_GetItem(seq, i);
2982 
2983 	if (PyTuple_Check(o)) {
2984 	    /* Tuple entry: (string,l,r,[...]) */
2985 	    register Py_ssize_t l,r;
2986 
2987 	    /* parse tuple */
2988 	    Py_Assert((PyTuple_GET_SIZE(o) >= 3) &&
2989 		      PyInt_Check(PyTuple_GET_ITEM(o,1)) &&
2990 		      PyInt_Check(PyTuple_GET_ITEM(o,2)),
2991 		      PyExc_TypeError,
2992 		      "tuples must be of the format (string,int,int[,...])");
2993 #ifdef HAVE_UNICODE
2994 	    if (PyUnicode_Check(PyTuple_GET_ITEM(o,0))) {
2995 		/* Redirect to Unicode implementation; all previous work
2996 		   is lost. */
2997 		Py_DECREF(o);
2998 		Py_DECREF(newstring);
2999 		return mxTextTools_UnicodeJoin(seq, start, stop, separator);
3000 	    }
3001 #endif
3002 	    Py_Assert(PyString_Check(PyTuple_GET_ITEM(o,0)),
3003 		      PyExc_TypeError,
3004 		      "tuples must be of the format (string,int,int[,...])");
3005 	    st = PyString_AS_STRING(PyTuple_GET_ITEM(o,0));
3006 	    len_st = PyString_GET_SIZE(PyTuple_GET_ITEM(o,0));
3007 	    l = PyInt_AS_LONG(PyTuple_GET_ITEM(o,1));
3008 	    r = PyInt_AS_LONG(PyTuple_GET_ITEM(o,2));
3009 
3010 	    /* compute slice */
3011 	    if (r > len_st) r = len_st;
3012 	    else if (r < 0) {
3013 		r += len_st + 1;
3014 		if (r < 0)
3015 		    r = 0;
3016 	    }
3017 	    if (l > len_st) l = len_st;
3018 	    else if (l < 0) {
3019 		l += len_st + 1;
3020 		if (l < 0)
3021 		    l = 0;
3022 	    }
3023 
3024 	    /* empty ? */
3025 	    if (l > r)
3026 		continue;
3027 	    len_st = r - l;
3028 	    if (len_st == 0)
3029 		continue;
3030 
3031 	    /* get pointer right */
3032 	    st += l;
3033 	}
3034 	else if (PyString_Check(o)) {
3035 	    /* String entry: take the whole string */
3036 	    st = PyString_AS_STRING(o);
3037 	    len_st = PyString_GET_SIZE(o);
3038 	}
3039 #ifdef HAVE_UNICODE
3040 	else if (PyUnicode_Check(o)) {
3041 	    /* Redirect to Unicode implementation; all previous work
3042 	       is lost. */
3043 	    Py_DECREF(o);
3044 	    Py_DECREF(newstring);
3045 	    return mxTextTools_UnicodeJoin(seq, start, stop, separator);
3046 	}
3047 #endif
3048 	else {
3049 	    Py_DECREF(o);
3050 	    Py_Error(PyExc_TypeError,
3051 		     "list must contain tuples or strings as entries");
3052 	}
3053 
3054         Py_DECREF(o);
3055 
3056 	/* Resize the new string if needed */
3057 	while (current_len + len_st + sep_len >= newstring_len) {
3058 	    newstring_len += newstring_len >> 1;
3059 	    if (_PyString_Resize(&newstring, newstring_len))
3060 		goto onError;
3061 	    p = PyString_AS_STRING(newstring) + current_len;
3062 	}
3063 
3064 	/* Insert separator */
3065 	if (i > 0 && sep_len > 0) {
3066 	    memcpy(p, sep, sep_len);
3067 	    p += sep_len;
3068 	    current_len += sep_len;
3069 	}
3070 
3071 	/* Copy snippet into new string */
3072 	memcpy(p,st,len_st);
3073 	p += len_st;
3074 	current_len += len_st;
3075     }
3076 
3077     /* Resize new string to the actual length */
3078     if (_PyString_Resize(&newstring, current_len))
3079 	goto onError;
3080 
3081     return newstring;
3082 
3083  onError:
3084     Py_XDECREF(newstring);
3085     return NULL;
3086 }
3087 
3088 static
mxTextTools_HexStringFromString(char * str,Py_ssize_t len)3089 PyObject *mxTextTools_HexStringFromString(char *str,
3090 					  Py_ssize_t len)
3091 {
3092     PyObject *w = 0;
3093     Py_ssize_t i;
3094     char *hex;
3095     static const char hexdigits[] = "0123456789abcdef";
3096 
3097     /* Convert to HEX */
3098     w = PyString_FromStringAndSize(NULL,2*len);
3099     if (!w)
3100 	goto onError;
3101     hex = PyString_AS_STRING(w);
3102     for (i = 0; i < len; i ++) {
3103 	unsigned char c = (unsigned char)*str;
3104 
3105 	*hex++ = hexdigits[c >> 4];
3106 	*hex++ = hexdigits[c & 0x0F];
3107 	str++;
3108     }
3109     return w;
3110 
3111  onError:
3112     Py_XDECREF(w);
3113     return NULL;
3114 }
3115 
3116 static
mxTextTools_StringFromHexString(char * hex,Py_ssize_t len)3117 PyObject *mxTextTools_StringFromHexString(char *hex,
3118 					  Py_ssize_t len)
3119 {
3120     PyObject *w = 0;
3121     Py_ssize_t i;
3122     char *str;
3123     static const char hexdigits[] = "0123456789abcdef";
3124 
3125     /* Convert to string */
3126     Py_Assert(len % 2 == 0,
3127 	      PyExc_TypeError,
3128 	      "need 2-digit hex string argument");
3129     len >>= 1;
3130     w = PyString_FromStringAndSize(NULL,len);
3131     if (!w)
3132 	goto onError;
3133     str = PyString_AS_STRING(w);
3134     for (i = 0; i < len; i++,str++) {
3135 	register char c;
3136 	register Py_ssize_t j;
3137 
3138 	c = tolower(*hex++);
3139 	for (j = 0; j < (Py_ssize_t)sizeof(hexdigits); j++)
3140 	  if (c == hexdigits[j]) {
3141 	    *str = j << 4;
3142 	    break;
3143 	  }
3144 	if (j == sizeof(hexdigits)) {
3145 	  DPRINTF("Failed: '%c' (%u) at %i\n",c,(unsigned int)c,i);
3146 	  Py_Error(PyExc_ValueError,
3147 		   "argument contains non-hex characters");
3148 	}
3149 
3150 	c = tolower(*hex++);
3151 	for (j = 0; j < (Py_ssize_t)sizeof(hexdigits); j++)
3152 	  if (c == hexdigits[j]) {
3153 	    *str += j;
3154 	    break;
3155 	  }
3156 	if (j == sizeof(hexdigits)) {
3157 	  DPRINTF("Failed2: '%c' (%u) at %i\n",c,(unsigned int)c,i);
3158 	  Py_Error(PyExc_ValueError,
3159 		   "argument contains non-hex characters");
3160 	}
3161     }
3162     return w;
3163 
3164  onError:
3165     Py_XDECREF(w);
3166     return NULL;
3167 }
3168 
3169 static
mxTextTools_IsASCII(PyObject * text,Py_ssize_t left,Py_ssize_t right)3170 int mxTextTools_IsASCII(PyObject *text,
3171 			Py_ssize_t left,
3172 			Py_ssize_t right)
3173 {
3174     if (PyString_Check(text)) {
3175 	Py_ssize_t len;
3176 	register Py_ssize_t i;
3177 	register unsigned char *str = (unsigned char *)PyString_AS_STRING(text);
3178 
3179 	len = PyString_GET_SIZE(text);
3180 	Py_CheckSequenceSlice(len, left, right);
3181 	for (i = left; i < right; i++)
3182 	    if (str[i] >= 128)
3183 		return 0;
3184 	return 1;
3185     }
3186 
3187 #ifdef HAVE_UNICODE
3188     else if (PyUnicode_Check(text)) {
3189 	Py_ssize_t len;
3190 	register Py_ssize_t i;
3191 	register Py_UNICODE *str = PyUnicode_AS_UNICODE(text);
3192 
3193 	len = PyUnicode_GET_SIZE(text);
3194 	Py_CheckSequenceSlice(len, left, right);
3195 	for (i = left; i < right; i++)
3196 	    if (str[i] >= 128)
3197 		return 0;
3198 	return 1;
3199     }
3200 #endif
3201 
3202     else
3203 	Py_Error(PyExc_TypeError,
3204 		 "need string object");
3205 
3206  onError:
3207     return -1;
3208 }
3209 
3210 /* Takes a list of tuples (replacement,l,r,...) and produces a taglist
3211    suitable for mxTextTools_Join() which creates a copy of
3212    text where every slice [l:r] is replaced by the given replacement.
3213 
3214 */
3215 
3216 static
mxTextTools_Joinlist(PyObject * text,PyObject * list,Py_ssize_t pos,Py_ssize_t text_len)3217 PyObject *mxTextTools_Joinlist(PyObject *text,
3218 			       PyObject *list,
3219 			       Py_ssize_t pos,
3220 			       Py_ssize_t text_len)
3221 {
3222     PyObject *joinlist = 0;
3223     Py_ssize_t list_len;
3224     Py_ssize_t i;
3225     Py_ssize_t listitem = 0;
3226     Py_ssize_t listsize = INITIAL_LIST_SIZE;
3227 
3228     if (PyString_Check(text)) {
3229 	Py_CheckStringSlice(text, pos, text_len);
3230     }
3231 #ifdef HAVE_UNICODE
3232     else if (PyUnicode_Check(text)) {
3233 	Py_CheckUnicodeSlice(text, pos, text_len);
3234     }
3235 #endif
3236     else
3237 	Py_Error(PyExc_TypeError,
3238 		 "expected string or unicode");
3239 
3240     Py_Assert(PyList_Check(list),
3241 	      PyExc_TypeError,
3242 	      "expected a list of tuples as second argument");
3243     list_len = PyList_GET_SIZE(list);
3244 
3245     joinlist = PyList_New(listsize);
3246     if (joinlist == NULL)
3247 	goto onError;
3248 
3249     for (i = 0; i < list_len; i++) {
3250 	register PyObject *t;
3251 	register Py_ssize_t left, right;
3252 
3253 	t = PyList_GET_ITEM(list, i);
3254 	Py_Assert(PyTuple_Check(t) &&
3255 		  (PyTuple_GET_SIZE(t) >= 3) &&
3256 		  (PyString_Check(PyTuple_GET_ITEM(t,0)) ||
3257 		   PyUnicode_Check(PyTuple_GET_ITEM(t,0))) &&
3258 		  PyInt_Check(PyTuple_GET_ITEM(t,1)) &&
3259 		  PyInt_Check(PyTuple_GET_ITEM(t,2)),
3260 		  PyExc_TypeError,
3261 		  "tuples must be of the form (string,int,int,...)");
3262 	left = PyInt_AS_LONG(PyTuple_GET_ITEM(t,1));
3263 	right = PyInt_AS_LONG(PyTuple_GET_ITEM(t,2));
3264 
3265 	Py_Assert(left >= pos,
3266 		  PyExc_ValueError,
3267 		  "list is not sorted ascending");
3268 
3269 	if (left > pos) { /* joinlist.append((text,pos,left)) */
3270 	    register PyObject *v;
3271 	    register PyObject *w;
3272 
3273 	    v = PyTuple_New(3);
3274 	    if (v == NULL)
3275 		goto onError;
3276 
3277 	    Py_INCREF(text);
3278 	    PyTuple_SET_ITEM(v,0,text);
3279 
3280 	    w = PyInt_FromLong(pos);
3281 	    if (w == NULL)
3282 		goto onError;
3283 	    PyTuple_SET_ITEM(v,1,w);
3284 
3285 	    w = PyTuple_GET_ITEM(t,1);
3286 	    Py_INCREF(w);
3287 	    PyTuple_SET_ITEM(v,2,w);
3288 
3289 	    if (listitem < listsize)
3290 		PyList_SET_ITEM(joinlist,listitem,v);
3291 	    else {
3292 		PyList_Append(joinlist,v);
3293 		Py_DECREF(v);
3294 	    }
3295 	    listitem++;
3296 	}
3297 
3298 	/* joinlist.append(string) */
3299 	if (listitem < listsize) {
3300 	    register PyObject *v = PyTuple_GET_ITEM(t,0);
3301 	    Py_INCREF(v);
3302 	    PyList_SET_ITEM(joinlist,listitem,v);
3303 	}
3304 	else
3305 	    PyList_Append(joinlist,PyTuple_GET_ITEM(t,0));
3306 	listitem++;
3307 
3308 	pos = right;
3309     }
3310 
3311     if (pos < text_len) { /* joinlist.append((text,pos,text_len)) */
3312 	register PyObject *v;
3313 	register PyObject *w;
3314 
3315 	v = PyTuple_New(3);
3316 	if (v == NULL)
3317 	    goto onError;
3318 
3319 	Py_INCREF(text);
3320 	PyTuple_SET_ITEM(v,0,text);
3321 
3322 	w = PyInt_FromLong(pos);
3323 	if (w == NULL)
3324 	    goto onError;
3325 	PyTuple_SET_ITEM(v,1,w);
3326 
3327 	w = PyInt_FromLong(text_len);
3328 	if (w == NULL)
3329 	    goto onError;
3330 	PyTuple_SET_ITEM(v,2,w);
3331 
3332 	if (listitem < listsize)
3333 	    PyList_SET_ITEM(joinlist,listitem,v);
3334 	else {
3335 	    PyList_Append(joinlist,v);
3336 	    Py_DECREF(v);
3337 	}
3338 	listitem++;
3339     }
3340 
3341     /* Resize list if necessary */
3342     if (listitem < listsize)
3343 	PyList_SetSlice(joinlist,listitem,listsize,(PyObject*)NULL);
3344 
3345     return joinlist;
3346 
3347  onError:
3348 
3349     Py_XDECREF(joinlist);
3350     return NULL;
3351 }
3352 
#ifdef HAVE_UNICODE
/* Split text[start:text_len] at every occurrence of the single
   character separator and return the parts as a new list of Unicode
   objects.

   text and separator are both coerced with PyUnicode_FromObject();
   the coerced objects are released again on all exit paths. The
   separator must have length 1 (TypeError otherwise). The list always
   receives at least one entry; a separator at the very end of the
   slice yields a trailing empty entry.

   Returns NULL with an exception set on error.

*/
static
PyObject *mxTextTools_UnicodeCharSplit(PyObject *text,
                                       PyObject *separator,
                                       Py_ssize_t start,
                                       Py_ssize_t text_len)
{
    PyObject *list = NULL;
    register Py_ssize_t x;
    Py_ssize_t listitem = 0;
    Py_ssize_t listsize = INITIAL_LIST_SIZE;
    Py_UNICODE *tx;
    Py_UNICODE sep;

    text = PyUnicode_FromObject(text);
    if (text == NULL) {
        /* separator is still a borrowed reference here: make sure
           onError does not release it */
        separator = NULL;
        goto onError;
    }
    separator = PyUnicode_FromObject(separator);
    if (separator == NULL)
        goto onError;

    Py_CheckUnicodeSlice(text, start, text_len);

    Py_Assert(PyUnicode_GET_SIZE(separator) == 1,
              PyExc_TypeError,
              "separator must be a single character");

    tx = PyUnicode_AS_UNICODE(text);
    sep = *PyUnicode_AS_UNICODE(separator);

    /* Preallocate listsize slots; they are filled directly and only
       once they are used up are further items appended */
    list = PyList_New(listsize);
    if (!list)
        goto onError;

    x = start;
    while (1) {
        PyObject *s;
        register Py_ssize_t z;

        /* Skip to next separator */
        z = x;
        for (;x < text_len; x++)
            if (tx[x] == sep)
                break;

        /* Append the slice to list */
        s = PyUnicode_FromUnicode(&tx[z], x - z);
        if (!s)
            goto onError;
        if (listitem < listsize)
            PyList_SET_ITEM(list,listitem,s);
        else {
            int rc = PyList_Append(list,s);
            Py_DECREF(s);
            if (rc < 0)
                goto onError;
        }
        listitem++;

        if (x == text_len)
            break;

        /* Skip separator */
        x++;
    }

    /* Cut off the unused preallocated slots */
    if (listitem < listsize &&
        PyList_SetSlice(list,listitem,listsize,(PyObject*)NULL) < 0)
        goto onError;

    Py_DECREF(text);
    Py_DECREF(separator);
    return list;

 onError:
    Py_XDECREF(list);
    Py_XDECREF(text);
    Py_XDECREF(separator);
    return NULL;
}
#endif
3434 
3435 static
mxTextTools_CharSplit(PyObject * text,PyObject * separator,Py_ssize_t start,Py_ssize_t text_len)3436 PyObject *mxTextTools_CharSplit(PyObject *text,
3437 				PyObject *separator,
3438 				Py_ssize_t start,
3439 				Py_ssize_t text_len)
3440 {
3441     PyObject *list = 0;
3442     register Py_ssize_t x;
3443     Py_ssize_t listitem = 0;
3444     Py_ssize_t listsize = INITIAL_LIST_SIZE;
3445     char *tx;
3446     char sep;
3447 
3448 #ifdef HAVE_UNICODE
3449     if (PyUnicode_Check(text) || PyUnicode_Check(separator))
3450 	return mxTextTools_UnicodeCharSplit(text, separator,
3451 					    start, text_len);
3452 #endif
3453 
3454     if (PyString_Check(text) && PyString_Check(separator)) {
3455 	Py_CheckStringSlice(text, start, text_len);
3456     }
3457     else
3458 	Py_Error(PyExc_TypeError,
3459 		 "text and separator must be strings or unicode");
3460 
3461     Py_Assert(PyString_GET_SIZE(separator) == 1,
3462 	      PyExc_TypeError,
3463 	      "separator must be a single character");
3464 
3465     tx = PyString_AS_STRING(text);
3466     sep = *PyString_AS_STRING(separator);
3467 
3468     list = PyList_New(listsize);
3469     if (!list)
3470 	goto onError;
3471 
3472     x = start;
3473     while (1) {
3474 	PyObject *s;
3475 	register Py_ssize_t z;
3476 
3477 	/* Skip to next separator */
3478 	z = x;
3479 	for (;x < text_len; x++)
3480 	    if (tx[x] == sep)
3481 		break;
3482 
3483 	/* Append the slice to list */
3484 	s = PyString_FromStringAndSize(&tx[z], x - z);
3485 	if (!s)
3486 	    goto onError;
3487 	if (listitem < listsize)
3488 	    PyList_SET_ITEM(list,listitem,s);
3489 	else {
3490 	    PyList_Append(list,s);
3491 	    Py_DECREF(s);
3492 	}
3493 	listitem++;
3494 
3495 	if (x == text_len)
3496 	    break;
3497 
3498 	/* Skip separator */
3499 	x++;
3500     }
3501 
3502     /* Resize list if necessary */
3503     if (listitem < listsize)
3504 	PyList_SetSlice(list,listitem,listsize,(PyObject*)NULL);
3505 
3506     return list;
3507 
3508  onError:
3509     Py_XDECREF(list);
3510     return NULL;
3511 }
3512 
#ifdef HAVE_UNICODE
/* Split text[start:text_len] at the nth occurrence of the single
   character separator and return a new 2-tuple (head, tail) of
   Unicode objects; the separator itself is in neither part.

   nth > 0 counts occurrences from the left, nth < 0 from the right;
   nth == 0 raises a ValueError. If the search runs off the right end,
   head is the whole slice and tail is empty; if it runs off the left
   end, head is empty and tail is the whole slice.

   text and separator are coerced with PyUnicode_FromObject() and the
   coerced objects are released on all exit paths. Returns NULL with
   an exception set on error.

*/
static
PyObject *mxTextTools_UnicodeSplitAt(PyObject *text,
                                     PyObject *separator,
                                     Py_ssize_t nth,
                                     Py_ssize_t start,
                                     Py_ssize_t text_len)
{
    PyObject *result = NULL;
    PyObject *part;
    Py_ssize_t cut;
    Py_UNICODE *data;
    Py_UNICODE sepchar;

    text = PyUnicode_FromObject(text);
    if (text == NULL) {
        /* separator is still borrowed; keep onError from touching it */
        separator = NULL;
        goto onError;
    }
    separator = PyUnicode_FromObject(separator);
    if (separator == NULL)
        goto onError;

    Py_CheckUnicodeSlice(text, start, text_len);

    Py_Assert(PyUnicode_GET_SIZE(separator) == 1,
              PyExc_TypeError,
              "separator must be a single character");

    data = PyUnicode_AS_UNICODE(text);
    sepchar = PyUnicode_AS_UNICODE(separator)[0];

    result = PyTuple_New(2);
    if (result == NULL)
        goto onError;

    if (nth == 0) {
        Py_Error(PyExc_ValueError,
                 "nth must be non-zero");
    }
    else if (nth > 0) {
        /* Walk right until the nth separator or the end of the slice */
        cut = start;
        for (;;) {
            while (cut < text_len && data[cut] != sepchar)
                cut++;
            nth--;
            if (nth == 0 || cut == text_len)
                break;
            cut++;
        }
    }
    else {
        /* Walk left until the nth separator or the start of the slice */
        cut = text_len - 1;
        for (;;) {
            while (cut >= start && data[cut] != sepchar)
                cut--;
            nth++;
            if (nth == 0 || cut < start)
                break;
            cut--;
        }
    }

    /* Head: everything in front of the cut position */
    if (cut >= start)
        part = PyUnicode_FromUnicode(&data[start], cut - start);
    else
        part = PyUnicode_FromUnicode((Py_UNICODE *)"", 0);
    if (part == NULL)
        goto onError;
    PyTuple_SET_ITEM(result,0,part);

    /* Step over the separator */
    cut++;

    /* Tail: everything behind the separator */
    if (cut < text_len)
        part = PyUnicode_FromUnicode(&data[cut], text_len - cut);
    else
        part = PyUnicode_FromUnicode((Py_UNICODE *)"", 0);
    if (part == NULL)
        goto onError;
    PyTuple_SET_ITEM(result,1,part);

    Py_DECREF(text);
    Py_DECREF(separator);
    return result;

 onError:
    Py_XDECREF(result);
    Py_XDECREF(text);
    Py_XDECREF(separator);
    return NULL;
}
#endif
3608 
3609 static
mxTextTools_SplitAt(PyObject * text,PyObject * separator,Py_ssize_t nth,Py_ssize_t start,Py_ssize_t text_len)3610 PyObject *mxTextTools_SplitAt(PyObject *text,
3611 			      PyObject *separator,
3612 			      Py_ssize_t nth,
3613 			      Py_ssize_t start,
3614 			      Py_ssize_t text_len)
3615 {
3616     PyObject *tuple = 0;
3617     register Py_ssize_t x;
3618     PyObject *s;
3619     char *tx;
3620     char sep;
3621 
3622 #ifdef HAVE_UNICODE
3623     if (PyUnicode_Check(text) || PyUnicode_Check(separator))
3624 	return mxTextTools_UnicodeSplitAt(text, separator,
3625 					  nth, start, text_len);
3626 #endif
3627 
3628     if (PyString_Check(text) && PyString_Check(separator)) {
3629 	Py_CheckStringSlice(text, start, text_len);
3630     }
3631     else
3632 	Py_Error(PyExc_TypeError,
3633 		 "text and separator must be strings or unicode");
3634 
3635     Py_Assert(PyString_GET_SIZE(separator) == 1,
3636 	      PyExc_TypeError,
3637 	      "separator must be a single character");
3638 
3639     tx = PyString_AS_STRING(text);
3640     sep = *PyString_AS_STRING(separator);
3641 
3642     tuple = PyTuple_New(2);
3643     if (!tuple)
3644 	goto onError;
3645 
3646     if (nth > 0) {
3647 	/* Skip to nth separator from the left */
3648 	x = start;
3649 	while (1) {
3650 	    for (; x < text_len; x++)
3651 		if (tx[x] == sep)
3652 		    break;
3653 	    if (--nth == 0 || x == text_len)
3654 		break;
3655 	    x++;
3656 	}
3657     }
3658     else if (nth < 0) {
3659 	/* Skip to nth separator from the right */
3660 	x = text_len - 1;
3661 	while (1) {
3662 	    for (; x >= start; x--)
3663 		if (tx[x] == sep)
3664 		    break;
3665 	    if (++nth == 0 || x < start)
3666 		break;
3667 	    x--;
3668 	}
3669     }
3670     else
3671 	Py_Error(PyExc_ValueError,
3672 		 "nth must be non-zero");
3673 
3674     /* Add to tuple */
3675     if (x < start)
3676 	s = PyString_FromStringAndSize("",0);
3677     else
3678 	s = PyString_FromStringAndSize(&tx[start], x - start);
3679     if (!s)
3680 	goto onError;
3681     PyTuple_SET_ITEM(tuple,0,s);
3682 
3683     /* Skip separator */
3684     x++;
3685 
3686     if (x >= text_len)
3687 	s = PyString_FromStringAndSize("",0);
3688     else
3689 	s = PyString_FromStringAndSize(&tx[x], text_len - x);
3690     if (!s)
3691 	goto onError;
3692     PyTuple_SET_ITEM(tuple,1,s);
3693 
3694     return tuple;
3695 
3696  onError:
3697     Py_XDECREF(tuple);
3698     return NULL;
3699 }
3700 
#ifdef HAVE_UNICODE
/* Return the first entry of the tuple suffixes with which
   text[start:text_len] ends, or None if there is no such entry.

   text and every tested entry are coerced with PyUnicode_FromObject();
   the matching entry is returned in its coerced form as a new
   reference. translate must be NULL, since translation is not
   implemented for Unicode (TypeError otherwise).

   Returns NULL with an exception set on error.

*/
static
PyObject *mxTextTools_UnicodeSuffix(PyObject *text,
                                    PyObject *suffixes,
                                    Py_ssize_t start,
                                    Py_ssize_t text_len,
                                    PyObject *translate)
{
    Py_ssize_t index;
    Py_ssize_t count;
    Py_UNICODE *data;

    text = PyUnicode_FromObject(text);
    if (text == NULL)
        goto onError;

    if (!PyUnicode_Check(text)) {
        Py_Error(PyExc_TypeError,
                 "expected unicode");
    }
    Py_CheckUnicodeSlice(text, start, text_len);

    Py_Assert(PyTuple_Check(suffixes),
              PyExc_TypeError,
              "suffixes needs to be a tuple of unicode strings");

    /* XXX Add support for translate... */
    Py_Assert(translate == NULL,
              PyExc_TypeError,
              "translate is not supported for Unicode suffix()es");

    data = PyUnicode_AS_UNICODE(text);
    count = PyTuple_GET_SIZE(suffixes);

    for (index = 0; index < count; index++) {
        PyObject *candidate;
        Py_ssize_t offset;

        candidate = PyUnicode_FromObject(PyTuple_GET_ITEM(suffixes,index));
        if (candidate == NULL)
            goto onError;

        /* Position in text at which the candidate would have to begin */
        offset = text_len - PyUnicode_GET_SIZE(candidate);
        if (offset >= start) {
            Py_UNICODE *wanted = PyUnicode_AS_UNICODE(candidate);

            /* Cheap first character test before the full compare */
            if (wanted[0] == data[offset] &&
                memcmp(wanted,
                       &data[offset],
                       PyUnicode_GET_DATA_SIZE(candidate)) == 0) {
                Py_DECREF(text);
                return candidate;
            }
        }

        Py_DECREF(candidate);
    }

    Py_DECREF(text);
    Py_ReturnNone();

 onError:
    Py_XDECREF(text);
    return NULL;
}
#endif
3762 
3763 static
mxTextTools_Suffix(PyObject * text,PyObject * suffixes,Py_ssize_t start,Py_ssize_t text_len,PyObject * translate)3764 PyObject *mxTextTools_Suffix(PyObject *text,
3765 			     PyObject *suffixes,
3766 			     Py_ssize_t start,
3767 			     Py_ssize_t text_len,
3768 			     PyObject *translate)
3769 {
3770     Py_ssize_t i;
3771     char *tx;
3772 
3773 #ifdef HAVE_UNICODE
3774     if (PyUnicode_Check(text))
3775 	return mxTextTools_UnicodeSuffix(text, suffixes,
3776 					 start, text_len,
3777 					 translate);
3778 #endif
3779 
3780     if (PyString_Check(text)) {
3781 	Py_CheckStringSlice(text, start, text_len);
3782     }
3783     else
3784 	Py_Error(PyExc_TypeError,
3785 		 "expected string or unicode");
3786     Py_Assert(PyTuple_Check(suffixes),
3787 	      PyExc_TypeError,
3788 	      "suffixes needs to be a tuple of strings");
3789     tx = PyString_AS_STRING(text);
3790 
3791     if (translate) {
3792 	char *tr;
3793 
3794 	Py_Assert(PyString_Check(translate) &&
3795 		  PyString_GET_SIZE(translate) == 256,
3796 		  PyExc_TypeError,
3797 		  "translate must be a string having 256 characters");
3798 	tr = PyString_AS_STRING(translate);
3799 
3800 	for (i = 0; i < PyTuple_GET_SIZE(suffixes); i++) {
3801 	    PyObject *suffix = PyTuple_GET_ITEM(suffixes, i);
3802 	    Py_ssize_t start_cmp;
3803 	    register char *s;
3804 	    register char *t;
3805 	    register Py_ssize_t j;
3806 
3807 	    Py_AssertWithArg(PyString_Check(suffix),
3808 			     PyExc_TypeError,
3809 			     "tuple entry %d is not a string",(unsigned int)i);
3810 	    start_cmp = text_len - PyString_GET_SIZE(suffix);
3811 	    if (start_cmp < start)
3812 		continue;
3813 
3814 	    /* Do the compare using a translate table */
3815 	    s = PyString_AS_STRING(suffix);
3816 	    t = tx + start_cmp;
3817 	    for (j = start_cmp; j < text_len; j++, s++, t++)
3818 		if (*s != tr[(unsigned char)*t])
3819 		    break;
3820 	    if (j == text_len) {
3821 		Py_INCREF(suffix);
3822 		return suffix;
3823 	    }
3824 	}
3825     }
3826 
3827     else
3828 	for (i = 0; i < PyTuple_GET_SIZE(suffixes); i++) {
3829 	    PyObject *suffix = PyTuple_GET_ITEM(suffixes,i);
3830 	    Py_ssize_t start_cmp;
3831 
3832 	    Py_AssertWithArg(PyString_Check(suffix),
3833 			     PyExc_TypeError,
3834 			     "tuple entry %d is not a string",(unsigned int)i);
3835 	    start_cmp = text_len - PyString_GET_SIZE(suffix);
3836 	    if (start_cmp < start)
3837 		continue;
3838 
3839 	    /* Compare without translate table */
3840 	    if (PyString_AS_STRING(suffix)[0] == tx[start_cmp]
3841 		&&
3842 		strncmp(PyString_AS_STRING(suffix),
3843 			&tx[start_cmp],
3844 			PyString_GET_SIZE(suffix)) == 0) {
3845 		Py_INCREF(suffix);
3846 		return suffix;
3847 	    }
3848 	}
3849 
3850     Py_ReturnNone();
3851 
3852  onError:
3853     return NULL;
3854 }
3855 
#ifdef HAVE_UNICODE
/* Return the first entry of the tuple prefixes with which
   text[start:text_len] begins, or None if there is no such entry.

   text and every tested entry are coerced with PyUnicode_FromObject();
   the matching entry is returned in its coerced form as a new
   reference. translate must be NULL, since translation is not
   implemented for Unicode (TypeError otherwise).

   Returns NULL with an exception set on error.

*/
static
PyObject *mxTextTools_UnicodePrefix(PyObject *text,
                                    PyObject *prefixes,
                                    Py_ssize_t start,
                                    Py_ssize_t text_len,
                                    PyObject *translate)
{
    Py_ssize_t i;
    Py_UNICODE *tx;

    text = PyUnicode_FromObject(text);
    if (text == NULL)
        goto onError;

    if (PyUnicode_Check(text)) {
        Py_CheckUnicodeSlice(text, start, text_len);
    }
    else
        Py_Error(PyExc_TypeError,
                 "expected unicode");
    Py_Assert(PyTuple_Check(prefixes),
              PyExc_TypeError,
              "prefixes needs to be a tuple of unicode strings");

    /* XXX Add support for translate... */
    Py_Assert(translate == NULL,
              PyExc_TypeError,
              "translate is not supported for Unicode prefix()es");

    tx = PyUnicode_AS_UNICODE(text);

    for (i = 0; i < PyTuple_GET_SIZE(prefixes); i++) {
        PyObject *prefix = PyTuple_GET_ITEM(prefixes,i);

        /* From here on prefix is a new reference which we own */
        prefix = PyUnicode_FromObject(prefix);
        if (prefix == NULL)
            goto onError;

        /* Compare without translate table */
        if (start + PyUnicode_GET_SIZE(prefix) <= text_len &&
            PyUnicode_AS_UNICODE(prefix)[0] == tx[start] &&
            memcmp(PyUnicode_AS_UNICODE(prefix),
                   &tx[start],
                   PyUnicode_GET_DATA_SIZE(prefix)) == 0) {
            /* Release the coerced text and hand our reference to
               prefix over to the caller (no additional INCREF
               needed) */
            Py_DECREF(text);
            return prefix;
        }

        Py_DECREF(prefix);
    }

    Py_DECREF(text);
    Py_ReturnNone();

 onError:
    Py_XDECREF(text);
    return NULL;
}
#endif
3916 
3917 static
mxTextTools_Prefix(PyObject * text,PyObject * prefixes,Py_ssize_t start,Py_ssize_t text_len,PyObject * translate)3918 PyObject *mxTextTools_Prefix(PyObject *text,
3919 			     PyObject *prefixes,
3920 			     Py_ssize_t start,
3921 			     Py_ssize_t text_len,
3922 			     PyObject *translate)
3923 {
3924     Py_ssize_t i;
3925     char *tx;
3926 
3927 #ifdef HAVE_UNICODE
3928     if (PyUnicode_Check(text))
3929 	return mxTextTools_UnicodePrefix(text, prefixes,
3930 					 start, text_len,
3931 					 translate);
3932 #endif
3933 
3934     if (PyString_Check(text)) {
3935 	Py_CheckStringSlice(text, start, text_len);
3936     }
3937     else
3938 	Py_Error(PyExc_TypeError,
3939 		 "expected string or unicode");
3940     Py_Assert(PyTuple_Check(prefixes),
3941 	      PyExc_TypeError,
3942 	      "prefixes needs to be a tuple of strings");
3943     tx = PyString_AS_STRING(text);
3944 
3945     if (translate) {
3946 	char *tr;
3947 
3948 	Py_Assert(PyString_Check(translate) &&
3949 		  PyString_GET_SIZE(translate) == 256,
3950 		  PyExc_TypeError,
3951 		  "translate must be a string having 256 characters");
3952 	tr = PyString_AS_STRING(translate);
3953 
3954 	for (i = 0; i < PyTuple_GET_SIZE(prefixes); i++) {
3955 	    PyObject *prefix = PyTuple_GET_ITEM(prefixes,i);
3956 	    Py_ssize_t cmp_len;
3957 	    register char *s;
3958 	    register char *t;
3959 	    register Py_ssize_t j;
3960 
3961 	    Py_AssertWithArg(PyString_Check(prefix),
3962 			     PyExc_TypeError,
3963 			     "tuple entry %d is not a string",(unsigned int)i);
3964 	    cmp_len = PyString_GET_SIZE(prefix);
3965 	    if (start + cmp_len > text_len)
3966 		continue;
3967 
3968 	    /* Do the compare using a translate table */
3969 	    s = PyString_AS_STRING(prefix);
3970 	    t = tx + start;
3971 	    for (j = 0; j < cmp_len; j++, s++, t++)
3972 		if (*s != tr[(unsigned char)*t])
3973 		    break;
3974 	    if (j == cmp_len) {
3975 		Py_INCREF(prefix);
3976 		return prefix;
3977 	    }
3978 	}
3979     }
3980 
3981     else
3982 	for (i = 0; i < PyTuple_GET_SIZE(prefixes); i++) {
3983 	    PyObject *prefix = PyTuple_GET_ITEM(prefixes,i);
3984 
3985 	    Py_AssertWithArg(PyString_Check(prefix),
3986 			     PyExc_TypeError,
3987 			     "tuple entry %d is not a string",(unsigned int)i);
3988 	    if (start + PyString_GET_SIZE(prefix) > text_len)
3989 		continue;
3990 
3991 	    /* Compare without translate table */
3992 	    if (PyString_AS_STRING(prefix)[0] == tx[start] &&
3993 		strncmp(PyString_AS_STRING(prefix),
3994 			&tx[start],
3995 			PyString_GET_SIZE(prefix)) == 0) {
3996 		Py_INCREF(prefix);
3997 		return prefix;
3998 	    }
3999 	}
4000 
4001     Py_ReturnNone();
4002 
4003  onError:
4004     return NULL;
4005 }
4006 
4007 /* Stips off characters appearing in the character set from text[start:stop]
4008    and returns the result as Python string object.
4009 
4010    where indicates the mode:
4011    where < 0: strip left only
4012    where = 0: strip left and right
4013    where > 0: strip right only
4014 
4015 */
4016 static
mxTextTools_SetStrip(char * tx,Py_ssize_t tx_len,char * setstr,Py_ssize_t setstr_len,Py_ssize_t start,Py_ssize_t stop,Py_ssize_t where)4017 PyObject *mxTextTools_SetStrip(char *tx,
4018 			       Py_ssize_t tx_len,
4019 			       char *setstr,
4020 			       Py_ssize_t setstr_len,
4021 			       Py_ssize_t start,
4022 			       Py_ssize_t stop,
4023 			       Py_ssize_t where)
4024 {
4025     Py_ssize_t left, right;
4026 
4027     Py_Assert(setstr_len == 32,
4028 	      PyExc_TypeError,
4029 	      "separator needs to be a set as obtained from set()");
4030     Py_CheckBufferSlice(tx_len, start, stop);
4031 
4032     /* Strip left */
4033     if (where <= 0) {
4034 	register Py_ssize_t x;
4035 	for (x = start; x < stop; x++)
4036 	    if (!Py_CharInSet(tx[x], setstr))
4037 		break;
4038 	left = x;
4039     }
4040     else
4041 	left = start;
4042 
4043     /* Strip right */
4044     if (where >= 0) {
4045 	register Py_ssize_t x;
4046 	for (x = stop - 1; x >= start; x--)
4047 	    if (!Py_CharInSet(tx[x], setstr))
4048 		break;
4049 	right = x + 1;
4050     }
4051     else
4052 	right = stop;
4053 
4054     return PyString_FromStringAndSize(tx + left, max(right - left, 0));
4055 
4056  onError:
4057     return NULL;
4058 }
4059 
4060 static
mxTextTools_SetSplit(char * tx,Py_ssize_t tx_len,char * setstr,Py_ssize_t setstr_len,Py_ssize_t start,Py_ssize_t text_len)4061 PyObject *mxTextTools_SetSplit(char *tx,
4062 			       Py_ssize_t tx_len,
4063 			       char *setstr,
4064 			       Py_ssize_t setstr_len,
4065 			       Py_ssize_t start,
4066 			       Py_ssize_t text_len)
4067 {
4068     PyObject *list = NULL;
4069     register Py_ssize_t x;
4070     Py_ssize_t listitem = 0;
4071     Py_ssize_t listsize = INITIAL_LIST_SIZE;
4072 
4073     Py_Assert(setstr_len == 32,
4074 	      PyExc_TypeError,
4075 	      "separator needs to be a set as obtained from set()");
4076     Py_CheckBufferSlice(tx_len,start,text_len);
4077 
4078     list = PyList_New(listsize);
4079     if (!list)
4080 	goto onError;
4081 
4082     x = start;
4083     while (x < text_len) {
4084 	Py_ssize_t z;
4085 
4086 	/* Skip all text in set */
4087 	for (;x < text_len; x++) {
4088 	    register Py_ssize_t c = (unsigned char)tx[x];
4089 	    register Py_ssize_t block = (unsigned char)setstr[c >> 3];
4090 	    if (!block || ((block & (1 << (c & 7))) == 0))
4091 		break;
4092 	}
4093 
4094 	/* Skip all text not in set */
4095 	z = x;
4096 	for (;x < text_len; x++) {
4097 	    register Py_ssize_t c = (unsigned char)tx[x];
4098 	    register Py_ssize_t block = (unsigned char)setstr[c >> 3];
4099 	    if (block && ((block & (1 << (c & 7))) != 0))
4100 		break;
4101 	}
4102 
4103 	/* Append the slice to list if it is not empty */
4104 	if (x > z) {
4105 	    PyObject *s;
4106 	    s = PyString_FromStringAndSize((char *)&tx[z], x - z);
4107 	    if (!s)
4108 		goto onError;
4109 	    if (listitem < listsize)
4110 		PyList_SET_ITEM(list,listitem,s);
4111 	    else {
4112 		PyList_Append(list,s);
4113 		Py_DECREF(s);
4114 	    }
4115 	    listitem++;
4116 	}
4117     }
4118 
4119     /* Resize list if necessary */
4120     if (listitem < listsize)
4121 	PyList_SetSlice(list,listitem,listsize,(PyObject*)NULL);
4122 
4123     return list;
4124 
4125  onError:
4126     Py_XDECREF(list);
4127     return NULL;
4128 }
4129 
4130 static
mxTextTools_SetSplitX(char * tx,Py_ssize_t tx_len,char * setstr,Py_ssize_t setstr_len,Py_ssize_t start,Py_ssize_t text_len)4131 PyObject *mxTextTools_SetSplitX(char *tx,
4132 				Py_ssize_t tx_len,
4133 				char *setstr,
4134 				Py_ssize_t setstr_len,
4135 				Py_ssize_t start,
4136 				Py_ssize_t text_len)
4137 {
4138     PyObject *list = NULL;
4139     register Py_ssize_t x;
4140     Py_ssize_t listitem = 0;
4141     Py_ssize_t listsize = INITIAL_LIST_SIZE;
4142 
4143     Py_Assert(setstr_len == 32,
4144 	      PyExc_TypeError,
4145 	      "separator needs to be a set as obtained from set()");
4146     Py_CheckBufferSlice(tx_len,start,text_len);
4147 
4148     list = PyList_New(listsize);
4149     if (!list)
4150 	goto onError;
4151 
4152     x = start;
4153     while (x < text_len) {
4154 	PyObject *s;
4155 	register Py_ssize_t z;
4156 
4157 	/* Skip all text not in set */
4158 	z = x;
4159 	for (;x < text_len; x++) {
4160 	    register unsigned int c = (unsigned char)tx[x];
4161 	    register unsigned int block = (unsigned char)setstr[c >> 3];
4162 	    if (block && ((block & (1 << (c & 7))) != 0))
4163 		break;
4164 	}
4165 
4166 	/* Append the slice to list */
4167 	s = PyString_FromStringAndSize((char *)&tx[z], x - z);
4168 	if (!s)
4169 	    goto onError;
4170 	if (listitem < listsize)
4171 	    PyList_SET_ITEM(list,listitem,s);
4172 	else {
4173 	    PyList_Append(list,s);
4174 	    Py_DECREF(s);
4175 	}
4176 	listitem++;
4177 
4178 	if (x >= text_len)
4179 	    break;
4180 
4181 	/* Skip all text in set */
4182 	z = x;
4183 	for (;x < text_len; x++) {
4184 	    register unsigned int c = (unsigned char)tx[x];
4185 	    register unsigned int block = (unsigned char)setstr[c >> 3];
4186 	    if (!block || ((block & (1 << (c & 7))) == 0))
4187 		break;
4188 	}
4189 
4190 	/* Append the slice to list if it is not empty */
4191 	s = PyString_FromStringAndSize((char *)&tx[z], x - z);
4192 	if (!s)
4193 	    goto onError;
4194 	if (listitem < listsize)
4195 	    PyList_SET_ITEM(list,listitem,s);
4196 	else {
4197 	    PyList_Append(list,s);
4198 	    Py_DECREF(s);
4199 	}
4200 	listitem++;
4201     }
4202 
4203     /* Resize list if necessary */
4204     if (listitem < listsize)
4205 	PyList_SetSlice(list,listitem,listsize,(PyObject*)NULL);
4206 
4207     return list;
4208 
4209  onError:
4210     Py_XDECREF(list);
4211     return NULL;
4212 }
4213 
4214 static
mxTextTools_Upper(PyObject * text)4215 PyObject *mxTextTools_Upper(PyObject *text)
4216 {
4217     PyObject *ntext;
4218     register unsigned char *s;
4219     register unsigned char *orig;
4220     register Py_ssize_t i;
4221     unsigned char *tr;
4222     Py_ssize_t len;
4223 
4224     Py_Assert(PyString_Check(text),
4225 	      PyExc_TypeError,
4226 	      "expected a Python string");
4227 
4228     len = PyString_GET_SIZE(text);
4229     ntext = PyString_FromStringAndSize(NULL,len);
4230     if (!ntext)
4231 	goto onError;
4232 
4233     /* Translate */
4234     tr = (unsigned char *)PyString_AS_STRING(mx_ToUpper);
4235     orig = (unsigned char *)PyString_AS_STRING(text);
4236     s = (unsigned char *)PyString_AS_STRING(ntext);
4237     for (i = 0; i < len; i++, s++, orig++)
4238 	*s = tr[*orig];
4239 
4240     return ntext;
4241 
4242  onError:
4243     return NULL;
4244 }
4245 
#ifdef HAVE_UNICODE
/* Return a new Unicode object of the same length as text in which
   every code unit has been mapped through Py_UNICODE_TOUPPER().

   text is coerced with PyUnicode_FromObject(); the coerced object is
   released on all exit paths. Returns NULL with an exception set on
   error.

*/
static
PyObject *mxTextTools_UnicodeUpper(PyObject *text)
{
    PyObject *result;
    Py_UNICODE *src;
    Py_UNICODE *dst;
    Py_ssize_t size;
    Py_ssize_t pos;

    text = PyUnicode_FromObject(text);
    if (text == NULL)
        goto onError;

    size = PyUnicode_GET_SIZE(text);
    result = PyUnicode_FromUnicode(NULL, size);
    if (result == NULL)
        goto onError;

    /* Convert code unit by code unit */
    src = (Py_UNICODE *)PyUnicode_AS_UNICODE(text);
    dst = (Py_UNICODE *)PyUnicode_AS_UNICODE(result);
    for (pos = 0; pos < size; pos++)
        dst[pos] = Py_UNICODE_TOUPPER(src[pos]);

    Py_DECREF(text);
    return result;

 onError:
    Py_XDECREF(text);
    return NULL;
}
#endif
4279 
4280 static
mxTextTools_Lower(PyObject * text)4281 PyObject *mxTextTools_Lower(PyObject *text)
4282 {
4283     PyObject *ntext;
4284     register unsigned char *s;
4285     register unsigned char *orig;
4286     register Py_ssize_t i;
4287     unsigned char *tr;
4288     Py_ssize_t len;
4289 
4290     Py_Assert(PyString_Check(text),
4291 	      PyExc_TypeError,
4292 	      "expected a Python string");
4293 
4294     len = PyString_GET_SIZE(text);
4295     ntext = PyString_FromStringAndSize(NULL,len);
4296     if (!ntext)
4297 	goto onError;
4298 
4299     /* Translate */
4300     tr = (unsigned char *)PyString_AS_STRING(mx_ToLower);
4301     orig = (unsigned char *)PyString_AS_STRING(text);
4302     s = (unsigned char *)PyString_AS_STRING(ntext);
4303     for (i = 0; i < len; i++, s++, orig++)
4304 	*s = tr[*orig];
4305 
4306     return ntext;
4307 
4308  onError:
4309     return NULL;
4310 }
4311 
#ifdef HAVE_UNICODE
/* Return a new Unicode object of the same length as text in which
   every code unit has been mapped through Py_UNICODE_TOLOWER().

   text is coerced with PyUnicode_FromObject(); the coerced object is
   released on all exit paths. Returns NULL with an exception set on
   error.

*/
static
PyObject *mxTextTools_UnicodeLower(PyObject *text)
{
    PyObject *result;
    Py_UNICODE *src;
    Py_UNICODE *dst;
    Py_ssize_t size;
    Py_ssize_t pos;

    text = PyUnicode_FromObject(text);
    if (text == NULL)
        goto onError;

    size = PyUnicode_GET_SIZE(text);
    result = PyUnicode_FromUnicode(NULL, size);
    if (result == NULL)
        goto onError;

    /* Convert code unit by code unit */
    src = (Py_UNICODE *)PyUnicode_AS_UNICODE(text);
    dst = (Py_UNICODE *)PyUnicode_AS_UNICODE(result);
    for (pos = 0; pos < size; pos++)
        dst[pos] = Py_UNICODE_TOLOWER(src[pos]);

    Py_DECREF(text);
    return result;

 onError:
    Py_XDECREF(text);
    return NULL;
}
#endif
4345 
4346 /* --- Module functions ------------------------------------------------*/
4347 
4348 /* Interface to the tagging engine in mxte.c */
4349 
4350 Py_C_Function_WithKeywords(
4351                mxTextTools_tag,
4352 	       "tag(text,tagtable,sliceleft=0,sliceright=len(text),taglist=[],context=None) \n"""
4353 	       "Produce a tag list for a string, given a tag-table\n"
4354 	       "- returns a tuple (success, taglist, nextindex)\n"
4355 	       "- if taglist == None, then no taglist is created"
4356 	       )
4357 {
4358     PyObject *text;
4359     PyObject *tagtable;
4360     Py_ssize_t sliceright = INT_MAX;
4361     Py_ssize_t sliceleft = 0;
4362     PyObject *taglist = 0;
4363     Py_ssize_t taglist_len;
4364     PyObject *context = 0;
4365     Py_ssize_t next, result;
4366     PyObject *res;
4367 
4368     Py_KeywordsGet6Args("OO|iiOO:tag",
4369 			text,tagtable,sliceleft,sliceright,taglist,context);
4370 
4371     if (taglist == NULL) {
4372 	/* not given, so use default: an empty list */
4373 	taglist = PyList_New(0);
4374 	if (taglist == NULL)
4375 	    goto onError;
4376 	taglist_len = 0;
4377     }
4378     else {
4379 	Py_INCREF(taglist);
4380 	Py_Assert(PyList_Check(taglist) || taglist == Py_None,
4381 		  PyExc_TypeError,
4382 		  "taglist must be a list or None");
4383 	if (taglist != Py_None) {
4384 	    taglist_len = PyList_Size(taglist);
4385 	    if (taglist_len < 0)
4386 		goto onError;
4387 	}
4388 	else
4389 	    taglist_len = 0;
4390     }
4391 
4392     Py_Assert(mxTagTable_Check(tagtable) ||
4393 	      PyTuple_Check(tagtable) ||
4394 	      PyList_Check(tagtable),
4395 	      PyExc_TypeError,
4396 	      "tagtable must be a TagTable instance, list or tuple");
4397 
4398     /* Prepare the argument for the Tagging Engine and let it process
4399        the request */
4400     if (PyString_Check(text)) {
4401 
4402 	Py_CheckStringSlice(text, sliceleft, sliceright);
4403 
4404         if (!mxTagTable_Check(tagtable)) {
4405 	    tagtable = mxTagTable_New(tagtable, MXTAGTABLE_STRINGTYPE, 1);
4406 	    if (tagtable == NULL)
4407 		goto onError;
4408 	}
4409 	else if (mxTagTable_Type(tagtable) != MXTAGTABLE_STRINGTYPE) {
4410 	    Py_Error(PyExc_TypeError,
4411 		     "TagTable instance is not intended for parsing strings");
4412 	}
4413 	else
4414 	    Py_INCREF(tagtable);
4415 
4416 	/* Call the Tagging Engine */
4417 	result = mxTextTools_TaggingEngine(text,
4418 					   sliceleft,
4419 					   sliceright,
4420 					   (mxTagTableObject *)tagtable,
4421 					   taglist,
4422 					   context,
4423 					   &next);
4424 	Py_DECREF(tagtable);
4425 
4426     }
4427 #ifdef HAVE_UNICODE
4428     else if (PyUnicode_Check(text)) {
4429 
4430 	Py_CheckUnicodeSlice(text, sliceleft, sliceright);
4431 
4432         if (!mxTagTable_Check(tagtable)) {
4433 	    tagtable = mxTagTable_New(tagtable, 1, 1);
4434 	    if (tagtable == NULL)
4435 		goto onError;
4436 	}
4437 	else if (mxTagTable_Type(tagtable) != MXTAGTABLE_UNICODETYPE) {
4438 	    Py_Error(PyExc_TypeError,
4439 		     "TagTable instance is not intended for parsing Unicode");
4440 	}
4441 	else
4442 	    Py_INCREF(tagtable);
4443 
4444 	/* Call the Tagging Engine */
4445 	result = mxTextTools_UnicodeTaggingEngine(text,
4446 						  sliceleft,
4447 						  sliceright,
4448 						  (mxTagTableObject *)tagtable,
4449 						  taglist,
4450 						  context,
4451 						  &next);
4452 	Py_DECREF(tagtable);
4453 
4454     }
4455 #endif
4456     else
4457 	Py_Error(PyExc_TypeError,
4458 		 "text must be a string or unicode");
4459 
4460     /* Check for exceptions during matching */
4461     if (result == 0)
4462 	goto onError;
4463 
4464     /* Undo changes to taglist in case of a match failure (result == 1) */
4465     if (result == 1 && taglist != Py_None) {
4466 	DPRINTF("  undoing changes: del taglist[%i:%i]\n",
4467 		taglist_len, PyList_Size(taglist));
4468 	if (PyList_SetSlice(taglist,
4469 			    taglist_len,
4470 			    PyList_Size(taglist),
4471 			    NULL))
4472 	    goto onError;
4473     }
4474 
4475     /* Convert result to the documented external values:
4476        0 - no match, 1 - match. */
4477     result--;
4478 
4479     /* Build result tuple */
4480     res = PyTuple_New(3);
4481     if (!res)
4482 	goto onError;
4483     PyTuple_SET_ITEM(res,0,PyInt_FromLong(result));
4484     PyTuple_SET_ITEM(res,1,taglist);
4485     PyTuple_SET_ITEM(res,2,PyInt_FromLong(next));
4486     return res;
4487 
4488  onError:
4489     if (!PyErr_Occurred())
4490 	Py_Error(PyExc_SystemError,
4491 		 "NULL result without error in builtin tag()");
4492     Py_XDECREF(taglist);
4493     return NULL;
4494 }
4495 
4496 /* An extended version of string.join() for taglists: */
4497 
4498 Py_C_Function( mxTextTools_join,
4499 	       "join(joinlist,sep='',start=0,stop=len(joinlist))\n\n"
4500 	       "Copy snippets from different strings together producing a\n"
4501 	       "new string\n"
4502 	       "The first argument must be a list of tuples or strings;\n"
4503 	       "tuples must be of the form (string,l,r[,...]) and turn out\n"
4504 	       "as string[l:r]\n"
4505 	       "NOTE: the syntax used for negative slices is different\n"
4506 	       "than the Python standard: -1 corresponds to the first\n"
4507 	       "character *after* the string, e.g. ('Example',0,-1) gives\n"
4508 	       "'Example' and not 'Exampl', like in Python\n"
4509 	       "sep is an optional separator string, start and stop\n"
4510 	       "define the slice of joinlist that is taken into accont."
4511 	       )
4512 {
4513     PyObject *joinlist = NULL;
4514     Py_ssize_t joinlist_len;
4515     PyObject *separator = NULL;
4516     Py_ssize_t start=0, stop=INT_MAX;
4517 
4518     Py_Get4Args("O|Oii:join",
4519 		joinlist,separator,start,stop);
4520 
4521     Py_Assert(PySequence_Check(joinlist),
4522 	      PyExc_TypeError,
4523 	      "first argument needs to be a sequence");
4524 
4525     joinlist_len = PySequence_Length(joinlist);
4526     Py_Assert(joinlist_len >= 0,
4527 	      PyExc_TypeError,
4528 	      "first argument needs to have a __len__ method");
4529 
4530     Py_CheckSequenceSlice(joinlist_len, start, stop);
4531 
4532     /* Short-cut */
4533     if ((stop - start) <= 0)
4534 	return PyString_FromString("");
4535 
4536     return mxTextTools_Join(joinlist,
4537 			    start, stop,
4538 			    separator);
4539 
4540  onError:
4541     return NULL;
4542 }
4543 
4544 /*
4545    Special compare function for taglist-tuples, comparing
4546    the text-slices given:
4547     - slices starting at a smaller index come first
4548     - for slices starting at the same index, the longer one
4549       wins
4550 */
4551 
4552 Py_C_Function( mxTextTools_cmp,
4553 	       "cmp(a,b)\n\n"
4554 	       "Compare two valid taglist tuples w/r to their slice\n"
4555 	       "position; this is useful for sorting joinlists.")
4556 {
4557     PyObject *v,*w;
4558     short index;
4559     int cmp;
4560 
4561     Py_Get2Args("OO:cmp",v,w);
4562 
4563     Py_Assert(PyTuple_Check(v) && PyTuple_Check(w) &&
4564 	      PyTuple_GET_SIZE(v) >= 3 && PyTuple_GET_SIZE(w) >= 3,
4565 	      PyExc_TypeError,
4566 	      "invalid taglist-tuple");
4567 
4568     for (index = 1; index < 3; index++) {
4569         cmp = PyObject_RichCompareBool(PyTuple_GET_ITEM(v,1),PyTuple_GET_ITEM(w,1),Py_LT);
4570         if (cmp)
4571             return PyInt_FromLong(cmp);
4572         cmp = PyObject_RichCompareBool(PyTuple_GET_ITEM(v,2),PyTuple_GET_ITEM(w,2), Py_GT);
4573         if (cmp)
4574             return PyInt_FromLong(cmp);
4575     }
4576     return PyInt_FromLong(0);
4577 
4578  onError:
4579     return NULL;
4580 }
4581 
4582 Py_C_Function( mxTextTools_joinlist,
4583 	       "joinlist(text,list,start=0,stop=len(text))\n\n"
4584 	       "Takes a list of tuples (replacement,l,r,...) and produces\n"
4585 	       "a taglist suitable for join() which creates a copy\n"
4586 	       "of text where every slice [l:r] is replaced by the\n"
4587 	       "given replacement\n"
4588 	       "- the list must be sorted using cmp() as compare function\n"
4589 	       "- it may not contain overlapping slices\n"
4590 	       "- the slices may not contain negative indices\n"
4591 	       "- if the taglist cannot contain overlapping slices, you can\n"
4592 	       "  give this function the taglist produced by tag() directly\n"
4593 	       "  (sorting is not needed, as the list will already be sorted)\n"
4594 	       "- start and stop set the slice to work in, i.e. text[start:stop]"
4595 )
4596 {
4597     PyObject *list;
4598     PyObject *text;
4599     Py_ssize_t text_len = INT_MAX;
4600     Py_ssize_t pos = 0;
4601 
4602     Py_Get4Args("OO|ii:joinlist",text,list,pos,text_len);
4603 
4604     return mxTextTools_Joinlist(text, list, pos, text_len);
4605 
4606  onError:
4607     return NULL;
4608 }
4609 
4610 Py_C_Function( mxTextTools_charsplit,
4611 	       "charsplit(text,char,start=0,stop=len(text))\n\n"
4612 	       "Split text[start:stop] into substrings at char and\n"
4613 	       "return the result as list of strings."
4614 )
4615 {
4616     PyObject *text, *separator;
4617     Py_ssize_t text_len = INT_MAX;
4618     Py_ssize_t start = 0;
4619 
4620     Py_Get4Args("OO|ii:charsplit",
4621 		text,separator,start,text_len);
4622 
4623     return mxTextTools_CharSplit(text, separator,
4624 				 start, text_len);
4625 
4626  onError:
4627     return NULL;
4628 }
4629 
4630 Py_C_Function( mxTextTools_splitat,
4631 	       "splitat(text,char,nth=1,start=0,stop=len(text))\n\n"
4632 	       "Split text[start:stop] into two substrings at the nth\n"
4633 	       "occurance of char and return the result as 2-tuple. If the\n"
4634 	       "character is not found, the second string is empty. nth may\n"
4635 	       "be negative: the search is then done from the right and the\n"
4636 	       "first string is empty in case the character is not found."
4637 )
4638 {
4639     PyObject *text, *separator;
4640     Py_ssize_t text_len = INT_MAX;
4641     Py_ssize_t start = 0;
4642     Py_ssize_t nth = 1;
4643 
4644     Py_Get5Args("OO|iii:splitat",
4645 		text,separator,nth,start,text_len);
4646 
4647     return mxTextTools_SplitAt(text, separator,
4648 			       nth, start, text_len);
4649  onError:
4650     return NULL;
4651 }
4652 
4653 Py_C_Function( mxTextTools_suffix,
4654 	       "suffix(text,suffixes,start=0,stop=len(text)[,translate])\n\n"
4655 	       "Looks at text[start:stop] and returns the first matching\n"
4656 	       "suffix out of the tuple of strings given in suffixes.\n"
4657 	       "If no suffix is found to be matching, None is returned.\n"
4658 	       "The optional 256 char translate string is used to translate\n"
4659 	       "the text prior to comparing it with the given suffixes."
4660 	       )
4661 {
4662     PyObject *text, *suffixes, *translate = NULL;
4663     Py_ssize_t text_len = INT_MAX;
4664     Py_ssize_t start = 0;
4665 
4666     Py_Get5Args("OO|iiO:suffix",
4667 		text,suffixes,start,text_len,translate);
4668 
4669     return mxTextTools_Suffix(text,
4670 			      suffixes,
4671 			      start, text_len,
4672 			      translate);
4673  onError:
4674     return NULL;
4675 }
4676 
4677 Py_C_Function( mxTextTools_prefix,
4678 	       "prefix(text,prefixes,start=0,stop=len(text)[,translate])\n\n"
4679 	       "Looks at text[start:stop] and returns the first matching\n"
4680 	       "prefix out of the tuple of strings given in prefixes.\n"
4681 	       "If no prefix is found to be matching, None is returned.\n"
4682 	       "The optional 256 char translate string is used to translate\n"
4683 	       "the text prior to comparing it with the given suffixes."
4684 )
4685 {
4686     PyObject *text, *prefixes, *translate = NULL;
4687     Py_ssize_t text_len = INT_MAX;
4688     Py_ssize_t start = 0;
4689 
4690     Py_Get5Args("OO|iiO:prefix",
4691 		text,prefixes,start,text_len,translate);
4692 
4693     return mxTextTools_Prefix(text,
4694 			      prefixes,
4695 			      start, text_len,
4696 			      translate);
4697  onError:
4698     return NULL;
4699 }
4700 
4701 Py_C_Function( mxTextTools_set,
4702 	       "set(string,logic=1)\n\n"
4703 	       "Returns a character set for string: a bit encoded version\n"
4704 	       "of the characters occurring in string.\n"
4705 	       "- logic can be set to 0 if all characters *not* in string\n"
4706 	       "  should go into the set")
4707 {
4708     PyObject *sto;
4709     char *s,*st;
4710     Py_ssize_t len_s;
4711     int logic = 1;
4712     Py_ssize_t i;
4713 
4714     Py_Get3Args("s#|i:set",
4715 		s,len_s,logic);
4716 
4717     sto = PyString_FromStringAndSize(NULL,32);
4718     if (sto == NULL)
4719 	goto onError;
4720 
4721     st = PyString_AS_STRING(sto);
4722 
4723     if (logic) {
4724 	memset(st,0x00,32);
4725 	for (i = 0; i < len_s; i++,s++) {
4726 	    int j = (unsigned char)*s;
4727 
4728 	    st[j >> 3] |= 1 << (j & 7);
4729 	}
4730     }
4731     else {
4732 	memset(st,0xFF,32);
4733 	for (i = 0; i < len_s; i++,s++) {
4734 	    int j = (unsigned char)*s;
4735 
4736 	    st[j >> 3] &= ~(1 << (j & 7));
4737 	}
4738     }
4739     return sto;
4740 
4741  onError:
4742     return NULL;
4743 }
4744 
4745 Py_C_Function( mxTextTools_setfind,
4746 	       "setfind(text,set,start=0,stop=len(text))\n\n"
4747 	       "Find the first occurence of any character from set in\n"
4748 	       "text[start:stop]\n set must be a string obtained with set()\n"
4749 	       "DEPRECATED: use CharSet().search() instead."
4750 )
4751 {
4752     PyObject *text;
4753     PyObject *set;
4754     Py_ssize_t text_len = INT_MAX;
4755     Py_ssize_t start = 0;
4756     register Py_ssize_t x;
4757     register char *tx;
4758     register unsigned char *setstr;
4759 
4760     Py_Get4Args("OO|ii:setfind",text,set,start,text_len);
4761 
4762     Py_Assert(PyString_Check(text),
4763 	      PyExc_TypeError,
4764 	      "first argument needs to be a string");
4765     Py_Assert(PyString_Check(set) && PyString_GET_SIZE(set) == 32,
4766 	      PyExc_TypeError,
4767 	      "second argument needs to be a set");
4768     Py_CheckStringSlice(text,start,text_len);
4769 
4770     x = start;
4771     tx = PyString_AS_STRING(text) + x;
4772     setstr = (unsigned char *)PyString_AS_STRING(set);
4773 
4774     for (;x < text_len; tx++, x++)
4775 	if (Py_CharInSet(*tx,setstr))
4776 	    break;
4777 
4778     if (x == text_len)
4779 	/* Not found */
4780 	return PyInt_FromLong(-1L);
4781     else
4782 	return PyInt_FromLong(x);
4783 
4784  onError:
4785     return NULL;
4786 }
4787 
4788 Py_C_Function( mxTextTools_setstrip,
4789 	       "setstrip(text,set,start=0,stop=len(text),mode=0)\n\n"
4790 	       "Strip all characters in text[start:stop] appearing in set.\n"
4791 	       "mode indicates where to strip (<0: left; =0: left and right;\n"
4792 	       ">0: right). set must be a string obtained with set()\n"
4793 	       "DEPRECATED: use CharSet().strip() instead."
4794 	       )
4795 {
4796     char *tx;
4797     Py_ssize_t tx_len;
4798     char *setstr;
4799     Py_ssize_t setstr_len;
4800     Py_ssize_t start = 0;
4801     Py_ssize_t stop = INT_MAX;
4802     int mode = 0;
4803 
4804     Py_Get7Args("s#s#|iii:setstip",
4805 		tx,tx_len,setstr,setstr_len,start,stop,mode);
4806 
4807     return mxTextTools_SetStrip(tx, tx_len,
4808 				setstr, setstr_len,
4809 				start, stop,
4810 				mode);
4811 
4812  onError:
4813     return NULL;
4814 }
4815 
4816 Py_C_Function( mxTextTools_setsplit,
4817 	       "setsplit(text,set,start=0,stop=len(text))\n\n"
4818 	       "Split text[start:stop] into substrings using set,\n"
4819 	       "omitting the splitting parts and empty substrings.\n"
4820 	       "set must be a string obtained from set()\n"
4821 	       "DEPRECATED: use CharSet().split() instead."
4822 	       )
4823 {
4824     char *tx;
4825     Py_ssize_t tx_len;
4826     char *setstr;
4827     Py_ssize_t setstr_len;
4828     Py_ssize_t start = 0;
4829     Py_ssize_t stop = INT_MAX;
4830 
4831     Py_Get6Args("s#s#|ii:setsplit",
4832 		tx,tx_len,setstr,setstr_len,start,stop);
4833 
4834     return mxTextTools_SetSplit(tx, tx_len,
4835 				setstr, setstr_len,
4836 				start, stop);
4837  onError:
4838     return NULL;
4839 }
4840 
4841 Py_C_Function( mxTextTools_setsplitx,
4842 	       "setsplitx(text,set,start=0,stop=len(text))\n\n"
4843 	       "Split text[start:stop] into substrings using set, so\n"
4844 	       "that every second entry consists only of characters in set.\n"
4845 	       "set must be a string obtained with set()\n"
4846 	       "DEPRECATED: use CharSet().splitx() instead."
4847 	       )
4848 {
4849     Py_ssize_t text_len = INT_MAX;
4850     Py_ssize_t start = 0;
4851     char *tx;
4852     Py_ssize_t tx_len;
4853     char *setstr;
4854     Py_ssize_t setstr_len;
4855 
4856     Py_Get6Args("s#s#|ii:setsplitx",
4857 		tx,tx_len,setstr,setstr_len,start,text_len);
4858 
4859     return mxTextTools_SetSplitX(tx, tx_len,
4860 				 setstr, setstr_len,
4861 				 start, text_len);
4862  onError:
4863     return NULL;
4864 }
4865 
4866 Py_C_Function( mxTextTools_upper,
4867 	       "upper(text)\n\n"
4868 	       "Return text converted to upper case.")
4869 {
4870     PyObject *text;
4871 
4872     Py_GetArgObject(text);
4873     if (PyString_Check(text))
4874 	return mxTextTools_Upper(text);
4875 #ifdef HAVE_UNICODE
4876     else if (PyUnicode_Check(text))
4877 	return mxTextTools_UnicodeUpper(text);
4878 #endif
4879     else
4880 	Py_Error(PyExc_TypeError,
4881 		 "expected string or unicode");
4882 
4883  onError:
4884     return NULL;
4885 }
4886 
4887 Py_C_Function( mxTextTools_lower,
4888 	       "lower(text)\n\n"
4889 	       "Return text converted to lower case.")
4890 {
4891     PyObject *text;
4892 
4893     Py_GetArgObject(text);
4894     if (PyString_Check(text))
4895 	return mxTextTools_Lower(text);
4896 #ifdef HAVE_UNICODE
4897     else if (PyUnicode_Check(text))
4898 	return mxTextTools_UnicodeLower(text);
4899 #endif
4900     else
4901 	Py_Error(PyExc_TypeError,
4902 		 "expected string or unicode");
4903 
4904  onError:
4905     return NULL;
4906 }
4907 
4908 Py_C_Function( mxTextTools_str2hex,
4909 	       "str2hex(text)\n\n"
4910 	       "Return text converted to a string consisting of two byte\n"
4911 	       "HEX values.")
4912 {
4913     char *str;
4914     Py_ssize_t len;
4915 
4916     Py_Get2Args("s#",str,len);
4917 
4918     return mxTextTools_HexStringFromString(str,len);
4919 
4920  onError:
4921     return NULL;
4922 }
4923 
4924 Py_C_Function( mxTextTools_hex2str,
4925 	       "hex2str(text)\n\n"
4926 	       "Return text interpreted as two byte HEX values converted\n"
4927 	       "to a string.")
4928 {
4929     char *str;
4930     Py_ssize_t len;
4931 
4932     Py_Get2Args("s#",str,len);
4933 
4934     return mxTextTools_StringFromHexString(str,len);
4935 
4936  onError:
4937     return NULL;
4938 }
4939 
4940 Py_C_Function( mxTextTools_isascii,
4941 	       "isascii(text,start=0,stop=len(text))\n\n"
4942 	       "Return 1/0 depending on whether text only contains ASCII\n"
4943 	       "characters."
4944 	       )
4945 {
4946     PyObject *text;
4947     Py_ssize_t start=0, stop = INT_MAX;
4948     int rc;
4949 
4950     Py_GetArgObject(text);
4951     rc = mxTextTools_IsASCII(text, start, stop);
4952     if (rc < 0)
4953 	goto onError;
4954     return PyInt_FromLong(rc);
4955 
4956  onError:
4957     return NULL;
4958 }
4959 
4960 /* --- module init --------------------------------------------------------- */
4961 
/* Python Method Table

   Maps the Python level function names of the module to their C
   implementations.  This table is handed to the module creation call
   in mxTextToolsModule_Initialize() (and to mxTextTools_ModuleDef on
   Python 3).  The list is terminated by the {NULL,NULL} sentinel. */

static PyMethodDef Module_methods[] =
{
    /* Tagging engine and joinlist helpers */
    Py_MethodWithKeywordsListEntry("tag",mxTextTools_tag),
    Py_MethodListEntry("join",mxTextTools_join),
    Py_MethodListEntry("cmp",mxTextTools_cmp),
    Py_MethodListEntry("joinlist",mxTextTools_joinlist),
    /* Functions working on 32 byte character set strings */
    Py_MethodListEntry("set",mxTextTools_set),
    Py_MethodListEntry("setfind",mxTextTools_setfind),
    Py_MethodListEntry("setsplit",mxTextTools_setsplit),
    Py_MethodListEntry("setsplitx",mxTextTools_setsplitx),
    Py_MethodListEntry("setstrip",mxTextTools_setstrip),
    /* Object constructors */
    Py_MethodWithKeywordsListEntry("TextSearch",mxTextSearch_TextSearch),
    Py_MethodListEntry("CharSet",mxCharSet_CharSet),
    Py_MethodListEntry("TagTable",mxTagTable_TagTable),
#ifdef HAVE_UNICODE
    Py_MethodListEntry("UnicodeTagTable",mxTagTable_UnicodeTagTable),
#endif
	// Disabled because we don't actually use these functions
	// and they are using a hack that tries to avoid the overhead
	// of the single-value tuple creation/unpacking
    // Py_MethodListEntrySingleArg("upper",mxTextTools_upper),
    // Py_MethodListEntrySingleArg("lower",mxTextTools_lower),
    /* Splitting, prefix/suffix matching and hex conversion */
    Py_MethodListEntry("charsplit",mxTextTools_charsplit),
    Py_MethodListEntry("splitat",mxTextTools_splitat),
    Py_MethodListEntry("suffix",mxTextTools_suffix),
    Py_MethodListEntry("prefix",mxTextTools_prefix),
    Py_MethodListEntry("hex2str",mxTextTools_hex2str),
    Py_MethodListEntry("str2hex",mxTextTools_str2hex),
    // Disabled for the same reason as upper/lower above
    // Py_MethodListEntrySingleArg("isascii",mxTextTools_isascii),
    {NULL,NULL} /* end of list */
};
4995 
4996 /* Cleanup function */
4997 static
mxTextToolsModule_Cleanup(void)4998 void mxTextToolsModule_Cleanup(void)
4999 {
5000     mxTextTools_TagTables = NULL;
5001 
5002     /* Reset mxTextTools_Initialized flag */
5003     mxTextTools_Initialized = 0;
5004 }
5005 
#if PY_MAJOR_VERSION >= 3
/* Module definition used by PyModule_Create() on Python 3; the
   Python 2 build passes the same name, doc-string and method table
   directly to Py_InitModule4() instead. */
static struct PyModuleDef mxTextTools_ModuleDef = {
    PyModuleDef_HEAD_INIT,
    MXTEXTTOOLS_MODULE,		/* module name */
    Module_docstring,		/* module doc-string */
    -1,				/* size of per-module state; -1: none */
    Module_methods		/* method table */
};
#endif
5015 
mxTextToolsModule_Initialize(void)5016 static PyObject* mxTextToolsModule_Initialize(void)
5017 {
5018     PyObject *module;
5019 
5020     if (mxTextTools_Initialized) {
5021         PyErr_SetString(PyExc_SystemError,
5022                 "can't initialize "MXTEXTTOOLS_MODULE" more than once");
5023         return NULL;
5024     }
5025 
5026     /* Init type objects */
5027     if (PyType_Ready(&mxTextSearch_Type) < 0)
5028         return NULL;
5029     if (PyType_Ready(&mxCharSet_Type) < 0)
5030         return NULL;
5031     if (PyType_Ready(&mxTagTable_Type) < 0)
5032         return NULL;
5033 
5034     /* create module */
5035 #if PY_MAJOR_VERSION >= 3
5036     module = PyModule_Create(&mxTextTools_ModuleDef);
5037 #else
5038     module = Py_InitModule4(MXTEXTTOOLS_MODULE, /* Module name */
5039                 Module_methods, /* Method list */
5040                 Module_docstring, /* Module doc-string */
5041                 (PyObject *)NULL, /* always pass this as *self */
5042                 PYTHON_API_VERSION); /* API Version */
5043 #endif
5044     if (!module)
5045         return NULL;
5046 
5047     /* Init TagTable cache */
5048     mxTextTools_TagTables = PyDict_New();
5049     if (!mxTextTools_TagTables)
5050         return NULL;
5051 
5052     /* Register cleanup function */
5053     if (Py_AtExit(mxTextToolsModule_Cleanup) < 0)
5054         return NULL;
5055 
5056     /* Add some symbolic constants to the module */
5057     if (PyModule_AddStringConstant(module, "__version__", VERSION) < 0)
5058         return NULL;
5059     mx_ToUpper = mxTextTools_ToUpper();
5060     if (!mx_ToUpper)
5061         return NULL;
5062     if (PyModule_AddObject(module, "to_upper", mx_ToUpper) < 0)
5063         return NULL;
5064     mx_ToLower = mxTextTools_ToLower();
5065     if (!mx_ToLower)
5066         return NULL;
5067     if (PyModule_AddObject(module, "to_lower", mx_ToLower) < 0)
5068         return NULL;
5069 
5070     /* Let the tag table cache live in the module dictionary; we just
5071        keep a weak reference in mxTextTools_TagTables around. */
5072     if (PyModule_AddObject(module, "tagtable_cache", mxTextTools_TagTables) < 0)
5073         return NULL;
5074     Py_DECREF(mxTextTools_TagTables);
5075 
5076     ADD_INT_CONSTANT("BOYERMOORE", MXTEXTSEARCH_BOYERMOORE);
5077     ADD_INT_CONSTANT("FASTSEARCH", MXTEXTSEARCH_FASTSEARCH);
5078     ADD_INT_CONSTANT("TRIVIAL", MXTEXTSEARCH_TRIVIAL);
5079 
5080     /* Init exceptions */
5081     mxTextTools_Error = PyErr_NewException("mxTextTools.Error", PyExc_Exception, NULL);
5082     if (!mxTextTools_Error)
5083         return NULL;
5084     if (PyModule_AddObject(module, "Error", mxTextTools_Error) < 0)
5085         return NULL;
5086 
5087     /* Type objects */
5088     Py_INCREF(&mxTextSearch_Type);
5089     if (PyModule_AddObject(module, "TextSearchType", (PyObject*) &mxTextSearch_Type) < 0)
5090         return NULL;
5091     Py_INCREF(&mxCharSet_Type);
5092     if (PyModule_AddObject(module, "CharSetType", (PyObject*) &mxCharSet_Type) < 0)
5093         return NULL;
5094     Py_INCREF(&mxTagTable_Type);
5095     if (PyModule_AddObject(module, "TagTableType", (PyObject*) &mxTagTable_Type) < 0)
5096         return NULL;
5097 
5098     /* Tag Table command symbols (these will be exposed via
5099        simpleparse.stt.TextTools.Constants.TagTables) */
5100     ADD_INT_CONSTANT("_const_AllIn", MATCH_ALLIN);
5101     ADD_INT_CONSTANT("_const_AllNotIn", MATCH_ALLNOTIN);
5102     ADD_INT_CONSTANT("_const_Is", MATCH_IS);
5103     ADD_INT_CONSTANT("_const_IsIn", MATCH_ISIN);
5104     ADD_INT_CONSTANT("_const_IsNot", MATCH_ISNOTIN);
5105     ADD_INT_CONSTANT("_const_IsNotIn", MATCH_ISNOTIN);
5106 
5107     ADD_INT_CONSTANT("_const_Word", MATCH_WORD);
5108     ADD_INT_CONSTANT("_const_WordStart", MATCH_WORDSTART);
5109     ADD_INT_CONSTANT("_const_WordEnd", MATCH_WORDEND);
5110 
5111     ADD_INT_CONSTANT("_const_AllInSet", MATCH_ALLINSET);
5112     ADD_INT_CONSTANT("_const_IsInSet", MATCH_ISINSET);
5113     ADD_INT_CONSTANT("_const_AllInCharSet", MATCH_ALLINCHARSET);
5114     ADD_INT_CONSTANT("_const_IsInCharSet", MATCH_ISINCHARSET);
5115 
5116     ADD_INT_CONSTANT("_const_Fail", MATCH_FAIL);
5117     ADD_INT_CONSTANT("_const_Jump", MATCH_JUMP);
5118     ADD_INT_CONSTANT("_const_EOF", MATCH_EOF);
5119     ADD_INT_CONSTANT("_const_Skip", MATCH_SKIP);
5120     ADD_INT_CONSTANT("_const_Move", MATCH_MOVE);
5121 
5122     ADD_INT_CONSTANT("_const_JumpTarget", MATCH_JUMPTARGET);
5123 
5124     ADD_INT_CONSTANT("_const_sWordStart", MATCH_SWORDSTART);
5125     ADD_INT_CONSTANT("_const_sWordEnd", MATCH_SWORDEND);
5126     ADD_INT_CONSTANT("_const_sFindWord", MATCH_SFINDWORD);
5127     ADD_INT_CONSTANT("_const_NoWord", MATCH_NOWORD);
5128 
5129     ADD_INT_CONSTANT("_const_Call", MATCH_CALL);
5130     ADD_INT_CONSTANT("_const_CallArg", MATCH_CALLARG);
5131 
5132     ADD_INT_CONSTANT("_const_Table", MATCH_TABLE);
5133     ADD_INT_CONSTANT("_const_SubTable", MATCH_SUBTABLE);
5134     ADD_INT_CONSTANT("_const_TableInList", MATCH_TABLEINLIST);
5135     ADD_INT_CONSTANT("_const_SubTableInList", MATCH_SUBTABLEINLIST);
5136 
5137     ADD_INT_CONSTANT("_const_Loop", MATCH_LOOP);
5138     ADD_INT_CONSTANT("_const_LoopControl", MATCH_LOOPCONTROL);
5139 
5140     /* Tag Table command flags */
5141     ADD_INT_CONSTANT("_const_CallTag", MATCH_CALLTAG);
5142     ADD_INT_CONSTANT("_const_AppendToTagobj", MATCH_APPENDTAG);
5143     ADD_INT_CONSTANT("_const_AppendTagobj", MATCH_APPENDTAGOBJ);
5144     ADD_INT_CONSTANT("_const_AppendMatch", MATCH_APPENDMATCH);
5145     ADD_INT_CONSTANT("_const_LookAhead", MATCH_LOOKAHEAD);
5146 
5147     /* Tag Table argument integers */
5148     ADD_INT_CONSTANT("_const_To", MATCH_JUMP_TO);
5149     ADD_INT_CONSTANT("_const_MatchOk", MATCH_JUMP_MATCHOK);
5150     ADD_INT_CONSTANT("_const_MatchFail", MATCH_JUMP_MATCHFAIL);
5151     ADD_INT_CONSTANT("_const_ToEOF", MATCH_MOVE_EOF);
5152     ADD_INT_CONSTANT("_const_ToBOF", MATCH_MOVE_BOF);
5153     ADD_INT_CONSTANT("_const_Here", MATCH_FAIL_HERE);
5154 
5155     ADD_INT_CONSTANT("_const_ThisTable", MATCH_THISTABLE);
5156 
5157     ADD_INT_CONSTANT("_const_Break", MATCH_LOOPCONTROL_BREAK);
5158     ADD_INT_CONSTANT("_const_Reset", MATCH_LOOPCONTROL_RESET);
5159 
5160     DPRINTF("sizeof(string_charset)=%i bytes\n", sizeof(string_charset));
5161 #ifdef HAVE_UNICODE
5162     DPRINTF("sizeof(unicode_charset)=%i bytes\n", sizeof(unicode_charset));
5163 #endif
5164 
5165     /* We are now initialized */
5166     mxTextTools_Initialized = 1;
5167 
5168     return module;
5169 }
5170 
5171 #if PY_MAJOR_VERSION >= 3
PyInit_mxTextTools(void)5172 PyMODINIT_FUNC PyInit_mxTextTools(void)
5173 {
5174     return mxTextToolsModule_Initialize();
5175 }
5176 #else
initmxTextTools(void)5177 MX_EXPORT(void) initmxTextTools(void)
5178 {
5179     mxTextToolsModule_Initialize();
5180 }
5181 #endif
5182 
5183