1 /*
2 
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
6 
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9 
10 Copyright (c) Corporation for National Research Initiatives.
11 
12 --------------------------------------------------------------------
13 The original string type implementation is:
14 
15   Copyright (c) 1999 by Secret Labs AB
16   Copyright (c) 1999 by Fredrik Lundh
17 
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
21 
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
29 permission.
30 
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
39 
40 */
41 
42 #define PY_SSIZE_T_CLEAN
43 #include "Python.h"
44 
45 #include "unicodeobject.h"
46 #include "ucnhash.h"
47 
48 #ifdef MS_WINDOWS
49 #include <windows.h>
50 #endif
51 
52 /* Limit for the Unicode object free list */
53 
54 #define PyUnicode_MAXFREELIST       1024
55 
56 /* Limit for the Unicode object free list stay alive optimization.
57 
58    The implementation will keep allocated Unicode memory intact for
59    all objects on the free list having a size less than this
60    limit. This reduces malloc() overhead for small Unicode objects.
61 
62    At worst this will result in PyUnicode_MAXFREELIST *
63    (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
64    malloc()-overhead) bytes of unused garbage.
65 
66    Setting the limit to 0 effectively turns the feature off.
67 
68    Note: This is an experimental feature ! If you get core dumps when
69    using Unicode objects, turn this feature off.
70 
71 */
72 
73 #define KEEPALIVE_SIZE_LIMIT       9
74 
75 /* Endianness switches; defaults to little endian */
76 
77 #ifdef WORDS_BIGENDIAN
78 # define BYTEORDER_IS_BIG_ENDIAN
79 #else
80 # define BYTEORDER_IS_LITTLE_ENDIAN
81 #endif
82 
83 /* --- Globals ------------------------------------------------------------
84 
85    The globals are initialized by the _PyUnicode_Init() API and should
86    not be used before calling that API.
87 
88 */
89 
90 
91 #ifdef __cplusplus
92 extern "C" {
93 #endif
94 
/* Free list for Unicode objects.  free_list points at the most recently
   released object; each entry stores the link to the next entry in its
   own first pointer-sized slot (see _PyUnicode_New / unicode_dealloc).
   numfree counts the objects currently on the list. */
static PyUnicodeObject *free_list;
static int numfree;

/* The empty Unicode object is shared to improve performance.  It is
   NULL until it has been created, so users must check for that. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by the character's ordinal
   (0..255); a NULL slot means that character has not been cached yet. */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
static char unicode_default_encoding[100];
114 
/* Lookup table for fast detection of the most frequent whitespace
   characters.  It is indexed by code point (0x00..0x7F); an entry is 1
   when that character is whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 .. 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 .. 0x0F
     *   0x09 CHARACTER TABULATION
     *   0x0A LINE FEED
     *   0x0B LINE TABULATION
     *   0x0C FORM FEED
     *   0x0D CARRIAGE RETURN
     */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 .. 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 .. 0x1F
     *   0x1C FILE SEPARATOR
     *   0x1D GROUP SEPARATOR
     *   0x1E RECORD SEPARATOR
     *   0x1F UNIT SEPARATOR
     */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 .. 0x27
     *   0x20 SPACE
     */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 .. 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 .. 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
145 
/* Lookup table for fast detection of line break characters in the
   ASCII range.  It is indexed by code point (0x00..0x7F); an entry is 1
   when that character is a line break and 0 otherwise.  The table is
   only ever read (see BLOOM_LINEBREAK), so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
173 
174 
175 Py_UNICODE
PyUnicode_GetMax(void)176 PyUnicode_GetMax(void)
177 {
178 #ifdef Py_UNICODE_WIDE
179     return 0x10FFFF;
180 #else
181     /* This is actually an illegal character, so it should
182        not be passed to unichr. */
183     return 0xFFFF;
184 #endif
185 }
186 
187 /* --- Bloom Filters ----------------------------------------------------- */
188 
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the
   bit index. */
192 
193 /* the linebreak mask is set up by Unicode_Init below */
194 
195 #if LONG_BIT >= 128
196 #define BLOOM_WIDTH 128
197 #elif LONG_BIT >= 64
198 #define BLOOM_WIDTH 64
199 #elif LONG_BIT >= 32
200 #define BLOOM_WIDTH 32
201 #else
202 #error "LONG_BIT is smaller than 32"
203 #endif
204 
205 #define BLOOM_MASK unsigned long
206 
207 static BLOOM_MASK bloom_linebreak;
208 
209 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
210 #define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
211 
212 #define BLOOM_LINEBREAK(ch)                                             \
213     ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
214      (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
215 
make_bloom_mask(Py_UNICODE * ptr,Py_ssize_t len)216 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
217 {
218     /* calculate simple bloom-style bitmask for a given unicode string */
219 
220     BLOOM_MASK mask;
221     Py_ssize_t i;
222 
223     mask = 0;
224     for (i = 0; i < len; i++)
225         BLOOM_ADD(mask, ptr[i]);
226 
227     return mask;
228 }
229 
unicode_member(Py_UNICODE chr,Py_UNICODE * set,Py_ssize_t setlen)230 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
231 {
232     Py_ssize_t i;
233 
234     for (i = 0; i < setlen; i++)
235         if (set[i] == chr)
236             return 1;
237 
238     return 0;
239 }
240 
/* True when chr passes the bloom mask test AND is confirmed by the
   exact scan in unicode_member().  The expansion is fully parenthesized
   so that it composes safely with surrounding operators, e.g.
   !BLOOM_MEMBER(...) or a || BLOOM_MEMBER(...). */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
243 
244 /* --- Unicode Object ----------------------------------------------------- */
245 
246 static
unicode_resize(register PyUnicodeObject * unicode,Py_ssize_t length)247 int unicode_resize(register PyUnicodeObject *unicode,
248                    Py_ssize_t length)
249 {
250     void *oldstr;
251 
252     /* Shortcut if there's nothing much to do. */
253     if (unicode->length == length)
254         goto reset;
255 
256     /* Resizing shared object (unicode_empty or single character
257        objects) in-place is not allowed. Use PyUnicode_Resize()
258        instead ! */
259 
260     if (unicode == unicode_empty ||
261         (unicode->length == 1 &&
262          unicode->str[0] < 256U &&
263          unicode_latin1[unicode->str[0]] == unicode)) {
264         PyErr_SetString(PyExc_SystemError,
265                         "can't resize shared unicode objects");
266         return -1;
267     }
268 
269     /* We allocate one more byte to make sure the string is Ux0000 terminated.
270        The overallocation is also used by fastsearch, which assumes that it's
271        safe to look at str[length] (without making any assumptions about what
272        it contains). */
273 
274     oldstr = unicode->str;
275     unicode->str = PyObject_REALLOC(unicode->str,
276                                     sizeof(Py_UNICODE) * (length + 1));
277     if (!unicode->str) {
278         unicode->str = (Py_UNICODE *)oldstr;
279         PyErr_NoMemory();
280         return -1;
281     }
282     unicode->str[length] = 0;
283     unicode->length = length;
284 
285   reset:
286     /* Reset the object caches */
287     if (unicode->defenc) {
288         Py_CLEAR(unicode->defenc);
289     }
290     unicode->hash = -1;
291 
292     return 0;
293 }
294 
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
302 
/* Create a unicode object with room for `length` characters plus one
   terminating 0 element.  Only str[0] and str[length] are initialized;
   the caller is expected to fill in the rest.

   For length 0 the shared unicode_empty object is returned (with its
   reference count incremented) once it exists.  Otherwise the object is
   taken from the free list when possible, or freshly allocated.

   Returns NULL on failure; PyErr_NoMemory() is raised for oversized
   requests and for buffer allocation failures. */
static
PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
{
    register PyUnicodeObject *unicode;

    /* Optimization for empty strings */
    if (length == 0 && unicode_empty != NULL) {
        Py_INCREF(unicode_empty);
        return unicode_empty;
    }

    /* Ensure we won't overflow the size.
       NOTE(review): the right-hand side is a size_t expression, so a
       negative length is presumably converted to a huge unsigned value
       and rejected here as well -- confirm. */
    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
        return (PyUnicodeObject *)PyErr_NoMemory();
    }

    /* Unicode freelist & memory allocation */
    if (free_list) {
        /* Pop the head of the free list; the link to the next entry is
           stored in the first pointer-sized slot of the object. */
        unicode = free_list;
        free_list = *(PyUnicodeObject **)unicode;
        numfree--;
        if (unicode->str) {
            /* Keep-Alive optimization: we only upsize the buffer,
               never downsize it. */
            if ((unicode->length < length) &&
                unicode_resize(unicode, length) < 0) {
                /* unicode_resize() has already set an exception; drop
                   the buffer so the common check below takes the error
                   path. */
                PyObject_DEL(unicode->str);
                unicode->str = NULL;
            }
        }
        else {
            /* The recycled object has no buffer attached: allocate one. */
            size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
            unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
        }
        PyObject_INIT(unicode, &PyUnicode_Type);
    }
    else {
        /* Free list is empty: allocate both the object and its buffer. */
        size_t new_size;
        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
        if (unicode == NULL)
            return NULL;
        new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
        unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
    }

    /* Common failure check for all three buffer allocation paths. */
    if (!unicode->str) {
        PyErr_NoMemory();
        goto onError;
    }
    /* Initialize the first element to guard against cases where
     * the caller fails before initializing str -- unicode_resize()
     * reads str[0], and the Keep-Alive optimization can keep memory
     * allocated for str alive across a call to unicode_dealloc(unicode).
     * We don't want unicode_resize to read uninitialized memory in
     * that case.
     */
    unicode->str[0] = 0;
    unicode->str[length] = 0;
    unicode->length = length;
    unicode->hash = -1;
    unicode->defenc = NULL;
    return unicode;

  onError:
    /* The object has no buffer at this point; undo the reference
       bookkeeping and release the object itself instead of returning
       it to the free list. */
    /* XXX UNREF/NEWREF interface should be more symmetrical */
    _Py_DEC_REFTOTAL;
    _Py_ForgetReference((PyObject *)unicode);
    PyObject_Del(unicode);
    return NULL;
}
373 
374 static
unicode_dealloc(register PyUnicodeObject * unicode)375 void unicode_dealloc(register PyUnicodeObject *unicode)
376 {
377     if (PyUnicode_CheckExact(unicode) &&
378         numfree < PyUnicode_MAXFREELIST) {
379         /* Keep-Alive optimization */
380         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
381             PyObject_DEL(unicode->str);
382             unicode->str = NULL;
383             unicode->length = 0;
384         }
385         if (unicode->defenc) {
386             Py_CLEAR(unicode->defenc);
387         }
388         /* Add to free list */
389         *(PyUnicodeObject **)unicode = free_list;
390         free_list = unicode;
391         numfree++;
392     }
393     else {
394         PyObject_DEL(unicode->str);
395         Py_XDECREF(unicode->defenc);
396         Py_TYPE(unicode)->tp_free((PyObject *)unicode);
397     }
398 }
399 
400 static
_PyUnicode_Resize(PyUnicodeObject ** unicode,Py_ssize_t length)401 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
402 {
403     register PyUnicodeObject *v;
404 
405     /* Argument checks */
406     if (unicode == NULL) {
407         PyErr_BadInternalCall();
408         return -1;
409     }
410     v = *unicode;
411     if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
412         PyErr_BadInternalCall();
413         return -1;
414     }
415 
416     /* Resizing unicode_empty and single character objects is not
417        possible since these are being shared. We simply return a fresh
418        copy with the same Unicode content. */
419     if (v->length != length &&
420         (v == unicode_empty || v->length == 1)) {
421         PyUnicodeObject *w = _PyUnicode_New(length);
422         if (w == NULL)
423             return -1;
424         Py_UNICODE_COPY(w->str, v->str,
425                         length < v->length ? length : v->length);
426         Py_DECREF(*unicode);
427         *unicode = w;
428         return 0;
429     }
430 
431     /* Note that we don't have to modify *unicode for unshared Unicode
432        objects, since we can modify them in-place. */
433     return unicode_resize(v, length);
434 }
435 
/* Public entry point for resizing a unicode object; forwards to
   _PyUnicode_Resize() after casting the object slot. */
int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
{
    PyUnicodeObject **slot = (PyUnicodeObject **)unicode;

    return _PyUnicode_Resize(slot, length);
}
440 
PyUnicode_FromUnicode(const Py_UNICODE * u,Py_ssize_t size)441 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
442                                 Py_ssize_t size)
443 {
444     PyUnicodeObject *unicode;
445 
446     /* If the Unicode data is known at construction time, we can apply
447        some optimizations which share commonly used objects. */
448     if (u != NULL) {
449 
450         /* Optimization for empty strings */
451         if (size == 0 && unicode_empty != NULL) {
452             Py_INCREF(unicode_empty);
453             return (PyObject *)unicode_empty;
454         }
455 
456         /* Single character Unicode objects in the Latin-1 range are
457            shared when using this constructor */
458         if (size == 1 && *u < 256) {
459             unicode = unicode_latin1[*u];
460             if (!unicode) {
461                 unicode = _PyUnicode_New(1);
462                 if (!unicode)
463                     return NULL;
464                 unicode->str[0] = *u;
465                 unicode_latin1[*u] = unicode;
466             }
467             Py_INCREF(unicode);
468             return (PyObject *)unicode;
469         }
470     }
471 
472     unicode = _PyUnicode_New(size);
473     if (!unicode)
474         return NULL;
475 
476     /* Copy the Unicode data into the new object */
477     if (u != NULL)
478         Py_UNICODE_COPY(unicode->str, u, size);
479 
480     return (PyObject *)unicode;
481 }
482 
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)483 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
484 {
485     PyUnicodeObject *unicode;
486 
487     if (size < 0) {
488         PyErr_SetString(PyExc_SystemError,
489                         "Negative size passed to PyUnicode_FromStringAndSize");
490         return NULL;
491     }
492 
493     /* If the Unicode data is known at construction time, we can apply
494        some optimizations which share commonly used objects.
495        Also, this means the input must be UTF-8, so fall back to the
496        UTF-8 decoder at the end. */
497     if (u != NULL) {
498 
499         /* Optimization for empty strings */
500         if (size == 0 && unicode_empty != NULL) {
501             Py_INCREF(unicode_empty);
502             return (PyObject *)unicode_empty;
503         }
504 
505         /* Single characters are shared when using this constructor.
506            Restrict to ASCII, since the input must be UTF-8. */
507         if (size == 1 && Py_CHARMASK(*u) < 128) {
508             unicode = unicode_latin1[Py_CHARMASK(*u)];
509             if (!unicode) {
510                 unicode = _PyUnicode_New(1);
511                 if (!unicode)
512                     return NULL;
513                 unicode->str[0] = Py_CHARMASK(*u);
514                 unicode_latin1[Py_CHARMASK(*u)] = unicode;
515             }
516             Py_INCREF(unicode);
517             return (PyObject *)unicode;
518         }
519 
520         return PyUnicode_DecodeUTF8(u, size, NULL);
521     }
522 
523     unicode = _PyUnicode_New(size);
524     if (!unicode)
525         return NULL;
526 
527     return (PyObject *)unicode;
528 }
529 
PyUnicode_FromString(const char * u)530 PyObject *PyUnicode_FromString(const char *u)
531 {
532     size_t size = strlen(u);
533     if (size > PY_SSIZE_T_MAX) {
534         PyErr_SetString(PyExc_OverflowError, "input too long");
535         return NULL;
536     }
537 
538     return PyUnicode_FromStringAndSize(u, size);
539 }
540 
541 #ifdef HAVE_WCHAR_H
542 
543 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
544 # define CONVERT_WCHAR_TO_SURROGATES
545 #endif
546 
547 #ifdef CONVERT_WCHAR_TO_SURROGATES
548 
549 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
550    to convert from UTF32 to UTF16. */
551 
PyUnicode_FromWideChar(register const wchar_t * w,Py_ssize_t size)552 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
553                                  Py_ssize_t size)
554 {
555     PyUnicodeObject *unicode;
556     register Py_ssize_t i;
557     Py_ssize_t alloc;
558     const wchar_t *orig_w;
559 
560     if (w == NULL) {
561         PyErr_BadInternalCall();
562         return NULL;
563     }
564 
565     alloc = size;
566     orig_w = w;
567     for (i = size; i > 0; i--) {
568         if (*w > 0xFFFF)
569             alloc++;
570         w++;
571     }
572     w = orig_w;
573     unicode = _PyUnicode_New(alloc);
574     if (!unicode)
575         return NULL;
576 
577     /* Copy the wchar_t data into the new object */
578     {
579         register Py_UNICODE *u;
580         u = PyUnicode_AS_UNICODE(unicode);
581         for (i = size; i > 0; i--) {
582             if (*w > 0xFFFF) {
583                 wchar_t ordinal = *w++;
584                 ordinal -= 0x10000;
585                 *u++ = 0xD800 | (ordinal >> 10);
586                 *u++ = 0xDC00 | (ordinal & 0x3FF);
587             }
588             else
589                 *u++ = *w++;
590         }
591     }
592     return (PyObject *)unicode;
593 }
594 
595 #else
596 
PyUnicode_FromWideChar(register const wchar_t * w,Py_ssize_t size)597 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
598                                  Py_ssize_t size)
599 {
600     PyUnicodeObject *unicode;
601 
602     if (w == NULL) {
603         PyErr_BadInternalCall();
604         return NULL;
605     }
606 
607     unicode = _PyUnicode_New(size);
608     if (!unicode)
609         return NULL;
610 
611     /* Copy the wchar_t data into the new object */
612 #ifdef HAVE_USABLE_WCHAR_T
613     memcpy(unicode->str, w, size * sizeof(wchar_t));
614 #else
615     {
616         register Py_UNICODE *u;
617         register Py_ssize_t i;
618         u = PyUnicode_AS_UNICODE(unicode);
619         for (i = size; i > 0; i--)
620             *u++ = *w++;
621     }
622 #endif
623 
624     return (PyObject *)unicode;
625 }
626 
627 #endif /* CONVERT_WCHAR_TO_SURROGATES */
628 
629 #undef CONVERT_WCHAR_TO_SURROGATES
630 
631 static void
makefmt(char * fmt,int longflag,int size_tflag,int zeropad,int width,int precision,char c)632 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
633 {
634     *fmt++ = '%';
635     if (width) {
636         if (zeropad)
637             *fmt++ = '0';
638         fmt += sprintf(fmt, "%d", width);
639     }
640     if (precision)
641         fmt += sprintf(fmt, ".%d", precision);
642     if (longflag)
643         *fmt++ = 'l';
644     else if (size_tflag) {
645         char *f = PY_FORMAT_SIZE_T;
646         while (*f)
647             *fmt++ = *f++;
648     }
649     *fmt++ = c;
650     *fmt = '\0';
651 }
652 
/* Append the NUL-terminated char string `string` to the output: each
   char is stored through `s`, which is advanced, using `copy` as the
   read cursor.  Both `s` and `copy` must be declared by the code that
   uses the macro.  Note that the expansion is a bare { ... } block, not
   a do { } while (0) wrapper. */
#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
654 
655 PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)656 PyUnicode_FromFormatV(const char *format, va_list vargs)
657 {
658     va_list count;
659     Py_ssize_t callcount = 0;
660     PyObject **callresults = NULL;
661     PyObject **callresult = NULL;
662     Py_ssize_t n = 0;
663     int width = 0;
664     int precision = 0;
665     int zeropad;
666     const char* f;
667     Py_UNICODE *s;
668     PyObject *string;
669     /* used by sprintf */
670     char buffer[21];
671     /* use abuffer instead of buffer, if we need more space
672      * (which can happen if there's a format specifier with width). */
673     char *abuffer = NULL;
674     char *realbuffer;
675     Py_ssize_t abuffersize = 0;
676     char fmt[60]; /* should be enough for %0width.precisionld */
677     const char *copy;
678 
679 #ifdef VA_LIST_IS_ARRAY
680     Py_MEMCPY(count, vargs, sizeof(va_list));
681 #else
682 #ifdef  __va_copy
683     __va_copy(count, vargs);
684 #else
685     count = vargs;
686 #endif
687 #endif
688      /* step 1: count the number of %S/%R/%s format specifications
689       * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
690       * objects once during step 3 and put the result in an array) */
691     for (f = format; *f; f++) {
692          if (*f == '%') {
693              if (*(f+1)=='%')
694                  continue;
695              if (*(f+1)=='S' || *(f+1)=='R')
696                  ++callcount;
697              while (isdigit((unsigned)*f))
698                  width = (width*10) + *f++ - '0';
699              while (*++f && *f != '%' && !isalpha((unsigned)*f))
700                  ;
701              if (*f == 's')
702                  ++callcount;
703          }
704     }
705     /* step 2: allocate memory for the results of
706      * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
707     if (callcount) {
708         callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
709         if (!callresults) {
710             PyErr_NoMemory();
711             return NULL;
712         }
713         callresult = callresults;
714     }
715     /* step 3: figure out how large a buffer we need */
716     for (f = format; *f; f++) {
717         if (*f == '%') {
718             const char* p = f;
719             width = 0;
720             while (isdigit((unsigned)*f))
721                 width = (width*10) + *f++ - '0';
722             while (*++f && *f != '%' && !isalpha((unsigned)*f))
723                 ;
724 
725             /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
726              * they don't affect the amount of space we reserve.
727              */
728             if ((*f == 'l' || *f == 'z') &&
729                 (f[1] == 'd' || f[1] == 'u'))
730                 ++f;
731 
732             switch (*f) {
733             case 'c':
734                 (void)va_arg(count, int);
735                 /* fall through... */
736             case '%':
737                 n++;
738                 break;
739             case 'd': case 'u': case 'i': case 'x':
740                 (void) va_arg(count, int);
741                 /* 20 bytes is enough to hold a 64-bit
742                    integer.  Decimal takes the most space.
743                    This isn't enough for octal.
744                    If a width is specified we need more
745                    (which we allocate later). */
746                 if (width < 20)
747                     width = 20;
748                 n += width;
749                 if (abuffersize < width)
750                     abuffersize = width;
751                 break;
752             case 's':
753             {
754                 /* UTF-8 */
755                 const char *s = va_arg(count, const char*);
756                 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
757                 if (!str)
758                     goto fail;
759                 n += PyUnicode_GET_SIZE(str);
760                 /* Remember the str and switch to the next slot */
761                 *callresult++ = str;
762                 break;
763             }
764             case 'U':
765             {
766                 PyObject *obj = va_arg(count, PyObject *);
767                 assert(obj && PyUnicode_Check(obj));
768                 n += PyUnicode_GET_SIZE(obj);
769                 break;
770             }
771             case 'V':
772             {
773                 PyObject *obj = va_arg(count, PyObject *);
774                 const char *str = va_arg(count, const char *);
775                 assert(obj || str);
776                 assert(!obj || PyUnicode_Check(obj));
777                 if (obj)
778                     n += PyUnicode_GET_SIZE(obj);
779                 else
780                     n += strlen(str);
781                 break;
782             }
783             case 'S':
784             {
785                 PyObject *obj = va_arg(count, PyObject *);
786                 PyObject *str;
787                 assert(obj);
788                 str = PyObject_Str(obj);
789                 if (!str)
790                     goto fail;
791                 n += PyUnicode_GET_SIZE(str);
792                 /* Remember the str and switch to the next slot */
793                 *callresult++ = str;
794                 break;
795             }
796             case 'R':
797             {
798                 PyObject *obj = va_arg(count, PyObject *);
799                 PyObject *repr;
800                 assert(obj);
801                 repr = PyObject_Repr(obj);
802                 if (!repr)
803                     goto fail;
804                 n += PyUnicode_GET_SIZE(repr);
805                 /* Remember the repr and switch to the next slot */
806                 *callresult++ = repr;
807                 break;
808             }
809             case 'p':
810                 (void) va_arg(count, int);
811                 /* maximum 64-bit pointer representation:
812                  * 0xffffffffffffffff
813                  * so 19 characters is enough.
814                  * XXX I count 18 -- what's the extra for?
815                  */
816                 n += 19;
817                 break;
818             default:
819                 /* if we stumble upon an unknown
820                    formatting code, copy the rest of
821                    the format string to the output
822                    string. (we cannot just skip the
823                    code, since there's no way to know
824                    what's in the argument list) */
825                 n += strlen(p);
826                 goto expand;
827             }
828         } else
829             n++;
830     }
831   expand:
832     if (abuffersize > 20) {
833         abuffer = PyObject_Malloc(abuffersize);
834         if (!abuffer) {
835             PyErr_NoMemory();
836             goto fail;
837         }
838         realbuffer = abuffer;
839     }
840     else
841         realbuffer = buffer;
842     /* step 4: fill the buffer */
843     /* Since we've analyzed how much space we need for the worst case,
844        we don't have to resize the string.
845        There can be no errors beyond this point. */
846     string = PyUnicode_FromUnicode(NULL, n);
847     if (!string)
848         goto fail;
849 
850     s = PyUnicode_AS_UNICODE(string);
851     callresult = callresults;
852 
853     for (f = format; *f; f++) {
854         if (*f == '%') {
855             const char* p = f++;
856             int longflag = 0;
857             int size_tflag = 0;
858             zeropad = (*f == '0');
859             /* parse the width.precision part */
860             width = 0;
861             while (isdigit((unsigned)*f))
862                 width = (width*10) + *f++ - '0';
863             precision = 0;
864             if (*f == '.') {
865                 f++;
866                 while (isdigit((unsigned)*f))
867                     precision = (precision*10) + *f++ - '0';
868             }
869             /* handle the long flag, but only for %ld and %lu.
870                others can be added when necessary. */
871             if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
872                 longflag = 1;
873                 ++f;
874             }
875             /* handle the size_t flag. */
876             if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
877                 size_tflag = 1;
878                 ++f;
879             }
880 
881             switch (*f) {
882             case 'c':
883                 *s++ = va_arg(vargs, int);
884                 break;
885             case 'd':
886                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
887                 if (longflag)
888                     sprintf(realbuffer, fmt, va_arg(vargs, long));
889                 else if (size_tflag)
890                     sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
891                 else
892                     sprintf(realbuffer, fmt, va_arg(vargs, int));
893                 appendstring(realbuffer);
894                 break;
895             case 'u':
896                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
897                 if (longflag)
898                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
899                 else if (size_tflag)
900                     sprintf(realbuffer, fmt, va_arg(vargs, size_t));
901                 else
902                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
903                 appendstring(realbuffer);
904                 break;
905             case 'i':
906                 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
907                 sprintf(realbuffer, fmt, va_arg(vargs, int));
908                 appendstring(realbuffer);
909                 break;
910             case 'x':
911                 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
912                 sprintf(realbuffer, fmt, va_arg(vargs, int));
913                 appendstring(realbuffer);
914                 break;
915             case 's':
916             {
917                 /* unused, since we already have the result */
918                 (void) va_arg(vargs, char *);
919                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
920                                 PyUnicode_GET_SIZE(*callresult));
921                 s += PyUnicode_GET_SIZE(*callresult);
922                 /* We're done with the unicode()/repr() => forget it */
923                 Py_DECREF(*callresult);
924                 /* switch to next unicode()/repr() result */
925                 ++callresult;
926                 break;
927             }
928             case 'U':
929             {
930                 PyObject *obj = va_arg(vargs, PyObject *);
931                 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
932                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
933                 s += size;
934                 break;
935             }
936             case 'V':
937             {
938                 PyObject *obj = va_arg(vargs, PyObject *);
939                 const char *str = va_arg(vargs, const char *);
940                 if (obj) {
941                     Py_ssize_t size = PyUnicode_GET_SIZE(obj);
942                     Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
943                     s += size;
944                 } else {
945                     appendstring(str);
946                 }
947                 break;
948             }
949             case 'S':
950             case 'R':
951             {
952                 Py_UNICODE *ucopy;
953                 Py_ssize_t usize;
954                 Py_ssize_t upos;
955                 /* unused, since we already have the result */
956                 (void) va_arg(vargs, PyObject *);
957                 ucopy = PyUnicode_AS_UNICODE(*callresult);
958                 usize = PyUnicode_GET_SIZE(*callresult);
959                 for (upos = 0; upos<usize;)
960                     *s++ = ucopy[upos++];
961                 /* We're done with the unicode()/repr() => forget it */
962                 Py_DECREF(*callresult);
963                 /* switch to next unicode()/repr() result */
964                 ++callresult;
965                 break;
966             }
967             case 'p':
968                 sprintf(buffer, "%p", va_arg(vargs, void*));
969                 /* %p is ill-defined:  ensure leading 0x. */
970                 if (buffer[1] == 'X')
971                     buffer[1] = 'x';
972                 else if (buffer[1] != 'x') {
973                     memmove(buffer+2, buffer, strlen(buffer)+1);
974                     buffer[0] = '0';
975                     buffer[1] = 'x';
976                 }
977                 appendstring(buffer);
978                 break;
979             case '%':
980                 *s++ = '%';
981                 break;
982             default:
983                 appendstring(p);
984                 goto end;
985             }
986         } else
987             *s++ = *f;
988     }
989 
990   end:
991     if (callresults)
992         PyObject_Free(callresults);
993     if (abuffer)
994         PyObject_Free(abuffer);
995     PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
996     return string;
997   fail:
998     if (callresults) {
999         PyObject **callresult2 = callresults;
1000         while (callresult2 < callresult) {
1001             Py_DECREF(*callresult2);
1002             ++callresult2;
1003         }
1004         PyObject_Free(callresults);
1005     }
1006     if (abuffer)
1007         PyObject_Free(abuffer);
1008     return NULL;
1009 }
1010 
1011 #undef appendstring
1012 
1013 PyObject *
PyUnicode_FromFormat(const char * format,...)1014 PyUnicode_FromFormat(const char *format, ...)
1015 {
1016     PyObject* ret;
1017     va_list vargs;
1018 
1019 #ifdef HAVE_STDARG_PROTOTYPES
1020     va_start(vargs, format);
1021 #else
1022     va_start(vargs);
1023 #endif
1024     ret = PyUnicode_FromFormatV(format, vargs);
1025     va_end(vargs);
1026     return ret;
1027 }
1028 
/* Copy the contents of `unicode` into the wchar_t buffer `w`.

   At most `size` elements are written.  When `size` exceeds the length of
   the object, one extra element past the data is copied as well (the
   trailing 0-termination).  Returns the number of characters copied, not
   counting that extra element, or -1 with an exception set when `unicode`
   is NULL. */
Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
                                wchar_t *w,
                                Py_ssize_t size)
{
    Py_ssize_t length;

    if (unicode == NULL) {
        PyErr_BadInternalCall();
        return -1;
    }

    length = PyUnicode_GET_SIZE(unicode);

    /* Room to spare: include the terminator in the copy. */
    if (size > length)
        size = length + 1;

#ifdef HAVE_USABLE_WCHAR_T
    memcpy(w, unicode->str, size * sizeof(wchar_t));
#else
    {
        /* Element types differ, so convert one element at a time. */
        const Py_UNICODE *src = PyUnicode_AS_UNICODE(unicode);
        Py_ssize_t k;
        for (k = 0; k < size; k++)
            w[k] = src[k];
    }
#endif

    /* The terminator, if copied, is not part of the reported count. */
    return (size > length) ? length : size;
}
1059 
1060 #endif
1061 
PyUnicode_FromOrdinal(int ordinal)1062 PyObject *PyUnicode_FromOrdinal(int ordinal)
1063 {
1064     Py_UNICODE s[1];
1065 
1066 #ifdef Py_UNICODE_WIDE
1067     if (ordinal < 0 || ordinal > 0x10ffff) {
1068         PyErr_SetString(PyExc_ValueError,
1069                         "unichr() arg not in range(0x110000) "
1070                         "(wide Python build)");
1071         return NULL;
1072     }
1073 #else
1074     if (ordinal < 0 || ordinal > 0xffff) {
1075         PyErr_SetString(PyExc_ValueError,
1076                         "unichr() arg not in range(0x10000) "
1077                         "(narrow Python build)");
1078         return NULL;
1079     }
1080 #endif
1081 
1082     s[0] = (Py_UNICODE)ordinal;
1083     return PyUnicode_FromUnicode(s, 1);
1084 }
1085 
PyUnicode_FromObject(register PyObject * obj)1086 PyObject *PyUnicode_FromObject(register PyObject *obj)
1087 {
1088     /* XXX Perhaps we should make this API an alias of
1089        PyObject_Unicode() instead ?! */
1090     if (PyUnicode_CheckExact(obj)) {
1091         Py_INCREF(obj);
1092         return obj;
1093     }
1094     if (PyUnicode_Check(obj)) {
1095         /* For a Unicode subtype that's not a Unicode object,
1096            return a true Unicode object with the same data. */
1097         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1098                                      PyUnicode_GET_SIZE(obj));
1099     }
1100     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1101 }
1102 
PyUnicode_FromEncodedObject(register PyObject * obj,const char * encoding,const char * errors)1103 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1104                                       const char *encoding,
1105                                       const char *errors)
1106 {
1107     const char *s = NULL;
1108     Py_ssize_t len;
1109     PyObject *v;
1110 
1111     if (obj == NULL) {
1112         PyErr_BadInternalCall();
1113         return NULL;
1114     }
1115 
1116 #if 0
1117     /* For b/w compatibility we also accept Unicode objects provided
1118        that no encodings is given and then redirect to
1119        PyObject_Unicode() which then applies the additional logic for
1120        Unicode subclasses.
1121 
1122        NOTE: This API should really only be used for object which
1123        represent *encoded* Unicode !
1124 
1125     */
1126     if (PyUnicode_Check(obj)) {
1127         if (encoding) {
1128             PyErr_SetString(PyExc_TypeError,
1129                             "decoding Unicode is not supported");
1130             return NULL;
1131         }
1132         return PyObject_Unicode(obj);
1133     }
1134 #else
1135     if (PyUnicode_Check(obj)) {
1136         PyErr_SetString(PyExc_TypeError,
1137                         "decoding Unicode is not supported");
1138         return NULL;
1139     }
1140 #endif
1141 
1142     /* Coerce object */
1143     if (PyString_Check(obj)) {
1144         s = PyString_AS_STRING(obj);
1145         len = PyString_GET_SIZE(obj);
1146     }
1147     else if (PyByteArray_Check(obj)) {
1148         /* Python 2.x specific */
1149         PyErr_Format(PyExc_TypeError,
1150                      "decoding bytearray is not supported");
1151         return NULL;
1152     }
1153     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1154         /* Overwrite the error message with something more useful in
1155            case of a TypeError. */
1156         if (PyErr_ExceptionMatches(PyExc_TypeError))
1157             PyErr_Format(PyExc_TypeError,
1158                          "coercing to Unicode: need string or buffer, "
1159                          "%.80s found",
1160                          Py_TYPE(obj)->tp_name);
1161         goto onError;
1162     }
1163 
1164     /* Convert to Unicode */
1165     if (len == 0) {
1166         Py_INCREF(unicode_empty);
1167         v = (PyObject *)unicode_empty;
1168     }
1169     else
1170         v = PyUnicode_Decode(s, len, encoding, errors);
1171 
1172     return v;
1173 
1174   onError:
1175     return NULL;
1176 }
1177 
PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)1178 PyObject *PyUnicode_Decode(const char *s,
1179                            Py_ssize_t size,
1180                            const char *encoding,
1181                            const char *errors)
1182 {
1183     PyObject *buffer = NULL, *unicode;
1184 
1185     if (encoding == NULL)
1186         encoding = PyUnicode_GetDefaultEncoding();
1187 
1188     /* Shortcuts for common default encodings */
1189     if (strcmp(encoding, "utf-8") == 0)
1190         return PyUnicode_DecodeUTF8(s, size, errors);
1191     else if (strcmp(encoding, "latin-1") == 0)
1192         return PyUnicode_DecodeLatin1(s, size, errors);
1193 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1194     else if (strcmp(encoding, "mbcs") == 0)
1195         return PyUnicode_DecodeMBCS(s, size, errors);
1196 #endif
1197     else if (strcmp(encoding, "ascii") == 0)
1198         return PyUnicode_DecodeASCII(s, size, errors);
1199 
1200     /* Decode via the codec registry */
1201     buffer = PyBuffer_FromMemory((void *)s, size);
1202     if (buffer == NULL)
1203         goto onError;
1204     unicode = PyCodec_Decode(buffer, encoding, errors);
1205     if (unicode == NULL)
1206         goto onError;
1207     if (!PyUnicode_Check(unicode)) {
1208         PyErr_Format(PyExc_TypeError,
1209                      "decoder did not return an unicode object (type=%.400s)",
1210                      Py_TYPE(unicode)->tp_name);
1211         Py_DECREF(unicode);
1212         goto onError;
1213     }
1214     Py_DECREF(buffer);
1215     return unicode;
1216 
1217   onError:
1218     Py_XDECREF(buffer);
1219     return NULL;
1220 }
1221 
/* Run the Unicode object `unicode` through the decoder of the codec
   named `encoding` (default encoding when NULL) and return whatever
   object the codec produces.  Returns NULL with an exception set when
   `unicode` is not a Unicode object or the codec call fails. */
PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
                                    const char *encoding,
                                    const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (!encoding)
        encoding = PyUnicode_GetDefaultEncoding();

    /* The codec registry result (or its NULL on failure) is the answer. */
    return PyCodec_Decode(unicode, encoding, errors);
}
1245 
PyUnicode_Encode(const Py_UNICODE * s,Py_ssize_t size,const char * encoding,const char * errors)1246 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1247                            Py_ssize_t size,
1248                            const char *encoding,
1249                            const char *errors)
1250 {
1251     PyObject *v, *unicode;
1252 
1253     unicode = PyUnicode_FromUnicode(s, size);
1254     if (unicode == NULL)
1255         return NULL;
1256     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1257     Py_DECREF(unicode);
1258     return v;
1259 }
1260 
/* Encode the Unicode object `unicode` with the codec named `encoding`
   (default encoding when NULL) and return whatever object the codec
   produces; the result type is not checked here.  Returns NULL with an
   exception set when `unicode` is not a Unicode object or the codec
   call fails. */
PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
                                    const char *encoding,
                                    const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (!encoding)
        encoding = PyUnicode_GetDefaultEncoding();

    /* The codec registry result (or its NULL on failure) is the answer. */
    return PyCodec_Encode(unicode, encoding, errors);
}
1284 
/* Encode the Unicode object `unicode` into a string object.

   A NULL `encoding` selects the default encoding.  When `errors` is
   NULL, the names "utf-8", "latin-1", "ascii" (and "mbcs" on Windows
   builds with a usable wchar_t) are dispatched to their built-in
   encoders.  Everything else goes through the codec registry, and a
   codec that returns a non-string object raises TypeError.  Returns NULL
   with an exception set on any failure. */
PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
                                    const char *encoding,
                                    const char *errors)
{
    PyObject *encoded;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (!encoding)
        encoding = PyUnicode_GetDefaultEncoding();

    /* The shortcut encoders take no `errors` argument, so they are only
       usable when none was supplied. */
    if (!errors) {
        if (strcmp(encoding, "utf-8") == 0)
            return PyUnicode_AsUTF8String(unicode);
        if (strcmp(encoding, "latin-1") == 0)
            return PyUnicode_AsLatin1String(unicode);
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
        if (strcmp(encoding, "mbcs") == 0)
            return PyUnicode_AsMBCSString(unicode);
#endif
        if (strcmp(encoding, "ascii") == 0)
            return PyUnicode_AsASCIIString(unicode);
    }

    /* General case: codec registry, with a type check on the result. */
    encoded = PyCodec_Encode(unicode, encoding, errors);
    if (encoded != NULL && !PyString_Check(encoded)) {
        PyErr_Format(PyExc_TypeError,
                     "encoder did not return a string object (type=%.400s)",
                     Py_TYPE(encoded)->tp_name);
        Py_DECREF(encoded);
        encoded = NULL;
    }
    return encoded;
}
1329 
/* Return `unicode` encoded with the default encoding.

   If the object's `defenc` slot is already filled, that cached object is
   returned directly.  Otherwise the object is encoded via
   PyUnicode_AsEncodedString() with a NULL encoding; the result is stored
   in `defenc` only when `errors` is NULL.  Returns NULL if encoding
   fails.  `unicode` is cast to PyUnicodeObject without a type check. */
PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
                                            const char *errors)
{
    PyUnicodeObject *self = (PyUnicodeObject *)unicode;
    PyObject *encoded = self->defenc;

    if (encoded != NULL)
        return encoded;

    encoded = PyUnicode_AsEncodedString(unicode, NULL, errors);
    if (encoded != NULL && errors == NULL)
        self->defenc = encoded;
    return encoded;
}
1342 
PyUnicode_AsUnicode(PyObject * unicode)1343 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1344 {
1345     if (!PyUnicode_Check(unicode)) {
1346         PyErr_BadArgument();
1347         goto onError;
1348     }
1349     return PyUnicode_AS_UNICODE(unicode);
1350 
1351   onError:
1352     return NULL;
1353 }
1354 
PyUnicode_GetSize(PyObject * unicode)1355 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1356 {
1357     if (!PyUnicode_Check(unicode)) {
1358         PyErr_BadArgument();
1359         goto onError;
1360     }
1361     return PyUnicode_GET_SIZE(unicode);
1362 
1363   onError:
1364     return -1;
1365 }
1366 
PyUnicode_GetDefaultEncoding(void)1367 const char *PyUnicode_GetDefaultEncoding(void)
1368 {
1369     return unicode_default_encoding;
1370 }
1371 
PyUnicode_SetDefaultEncoding(const char * encoding)1372 int PyUnicode_SetDefaultEncoding(const char *encoding)
1373 {
1374     PyObject *v;
1375 
1376     /* Make sure the encoding is valid. As side effect, this also
1377        loads the encoding into the codec registry cache. */
1378     v = _PyCodec_Lookup(encoding);
1379     if (v == NULL)
1380         goto onError;
1381     Py_DECREF(v);
1382     strncpy(unicode_default_encoding,
1383             encoding,
1384             sizeof(unicode_default_encoding));
1385     return 0;
1386 
1387   onError:
1388     return -1;
1389 }
1390 
1391 /* error handling callback helper:
1392    build arguments, call the callback and check the arguments,
1393    if no exception occurred, copy the replacement to the output
1394    and adjust various state variables.
1395    return 0 on success, -1 on error
1396 */
1397 
1398 static
unicode_decode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const char * input,Py_ssize_t insize,Py_ssize_t * startinpos,Py_ssize_t * endinpos,PyObject ** exceptionObject,const char ** inptr,PyUnicodeObject ** output,Py_ssize_t * outpos,Py_UNICODE ** outptr)1399 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1400                                      const char *encoding, const char *reason,
1401                                      const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1402                                      Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1403                                      PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1404 {
1405     static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1406 
1407     PyObject *restuple = NULL;
1408     PyObject *repunicode = NULL;
1409     Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1410     Py_ssize_t requiredsize;
1411     Py_ssize_t newpos;
1412     Py_UNICODE *repptr;
1413     Py_ssize_t repsize;
1414     int res = -1;
1415 
1416     if (*errorHandler == NULL) {
1417         *errorHandler = PyCodec_LookupError(errors);
1418         if (*errorHandler == NULL)
1419             goto onError;
1420     }
1421 
1422     if (*exceptionObject == NULL) {
1423         *exceptionObject = PyUnicodeDecodeError_Create(
1424             encoding, input, insize, *startinpos, *endinpos, reason);
1425         if (*exceptionObject == NULL)
1426             goto onError;
1427     }
1428     else {
1429         if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1430             goto onError;
1431         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1432             goto onError;
1433         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1434             goto onError;
1435     }
1436 
1437     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1438     if (restuple == NULL)
1439         goto onError;
1440     if (!PyTuple_Check(restuple)) {
1441         PyErr_SetString(PyExc_TypeError, &argparse[4]);
1442         goto onError;
1443     }
1444     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1445         goto onError;
1446     if (newpos<0)
1447         newpos = insize+newpos;
1448     if (newpos<0 || newpos>insize) {
1449         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1450         goto onError;
1451     }
1452 
1453     /* need more space? (at least enough for what we
1454        have+the replacement+the rest of the string (starting
1455        at the new input position), so we won't have to check space
1456        when there are no errors in the rest of the string) */
1457     repptr = PyUnicode_AS_UNICODE(repunicode);
1458     repsize = PyUnicode_GET_SIZE(repunicode);
1459     requiredsize = *outpos + repsize + insize-newpos;
1460     if (requiredsize > outsize) {
1461         if (requiredsize<2*outsize)
1462             requiredsize = 2*outsize;
1463         if (_PyUnicode_Resize(output, requiredsize) < 0)
1464             goto onError;
1465         *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1466     }
1467     *endinpos = newpos;
1468     *inptr = input + newpos;
1469     Py_UNICODE_COPY(*outptr, repptr, repsize);
1470     *outptr += repsize;
1471     *outpos += repsize;
1472     /* we made it! */
1473     res = 0;
1474 
1475   onError:
1476     Py_XDECREF(restuple);
1477     return res;
1478 }
1479 
1480 /* --- UTF-7 Codec -------------------------------------------------------- */
1481 
1482 /* See RFC2152 for details.  We encode conservatively and decode liberally. */
1483 
1484 /* Three simple macros defining base-64. */
1485 
/* Is c a base-64 character?

   Explicit ASCII ranges are used instead of isalnum(): isalnum() is
   locale-dependent and may report bytes in the 128-255 range as
   alphanumeric, which would make the UTF-7 decoder treat them as
   base-64 digits.  Note that c is evaluated more than once. */

#define IS_BASE64(c)                            \
    (((c) >= 'A' && (c) <= 'Z') ||              \
     ((c) >= 'a' && (c) <= 'z') ||              \
     ((c) >= '0' && (c) <= '9') ||              \
     (c) == '+' || (c) == '/')
1490 
/* Map a base-64 character c to its 6-bit value: 'A'-'Z' -> 0-25,
   'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62, anything else -> 63.
   The caller is expected to have checked that c is a base-64 character;
   c is evaluated more than once. */

#define FROM_BASE64(c)                                                  \
    (((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) == '+') ? 62 : 63)
1498 
/* Return the base-64 character that encodes the low 6 bits of n. */

#define TO_BASE64(n)                            \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"               \
     "abcdefghijklmnopqrstuvwxyz"               \
     "0123456789+/"[(n) & 0x3f])
1503 
/* DECODE_DIRECT: true when a byte found in UTF-7 input decodes as
 * itself.  Decoding is permissive: every value up to 127 qualifies
 * except '+', which opens a base64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) < 128)
1511 
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets, indexed by ASCII code:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1545 
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        -- character code; only values 1..127 can be direct
 * directO  -- non-zero if Set O characters (category 1) are direct
 * directWS -- non-zero if whitespace (category 2) is direct
 *
 * Set D characters (category 0) are always direct.  The flag arguments
 * are parenthesized so that compound expressions such as `a || b` are
 * evaluated as a unit.  c is evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
1557 
PyUnicode_DecodeUTF7(const char * s,Py_ssize_t size,const char * errors)1558 PyObject *PyUnicode_DecodeUTF7(const char *s,
1559                                Py_ssize_t size,
1560                                const char *errors)
1561 {
1562     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1563 }
1564 
1565 /* The decoder.  The only state we preserve is our read position,
1566  * i.e. how many characters we have consumed.  So if we end in the
1567  * middle of a shift sequence we have to back off the read position
1568  * and the output to the beginning of the sequence, otherwise we lose
1569  * all the shift state (seen bits, number of bits seen, high
1570  * surrogate). */
1571 
PyUnicode_DecodeUTF7Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)1572 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1573                                        Py_ssize_t size,
1574                                        const char *errors,
1575                                        Py_ssize_t *consumed)
1576 {
1577     const char *starts = s;
1578     Py_ssize_t startinpos;
1579     Py_ssize_t endinpos;
1580     Py_ssize_t outpos;
1581     const char *e;
1582     PyUnicodeObject *unicode;
1583     Py_UNICODE *p;
1584     const char *errmsg = "";
1585     int inShift = 0;
1586     Py_UNICODE *shiftOutStart;
1587     unsigned int base64bits = 0;
1588     unsigned long base64buffer = 0;
1589     Py_UNICODE surrogate = 0;
1590     PyObject *errorHandler = NULL;
1591     PyObject *exc = NULL;
1592 
1593     unicode = _PyUnicode_New(size);
1594     if (!unicode)
1595         return NULL;
1596     if (size == 0) {
1597         if (consumed)
1598             *consumed = 0;
1599         return (PyObject *)unicode;
1600     }
1601 
1602     p = unicode->str;
1603     shiftOutStart = p;
1604     e = s + size;
1605 
1606     while (s < e) {
1607         Py_UNICODE ch = (unsigned char) *s;
1608 
1609         if (inShift) { /* in a base-64 section */
1610             if (IS_BASE64(ch)) { /* consume a base-64 character */
1611                 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1612                 base64bits += 6;
1613                 s++;
1614                 if (base64bits >= 16) {
1615                     /* we have enough bits for a UTF-16 value */
1616                     Py_UNICODE outCh = (Py_UNICODE)
1617                                        (base64buffer >> (base64bits-16));
1618                     base64bits -= 16;
1619                     base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1620                     if (surrogate) {
1621                         /* expecting a second surrogate */
1622                         if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1623 #ifdef Py_UNICODE_WIDE
1624                             *p++ = (((surrogate & 0x3FF)<<10)
1625                                     | (outCh & 0x3FF)) + 0x10000;
1626 #else
1627                             *p++ = surrogate;
1628                             *p++ = outCh;
1629 #endif
1630                             surrogate = 0;
1631                         }
1632                         else {
1633                             surrogate = 0;
1634                             errmsg = "second surrogate missing";
1635                             goto utf7Error;
1636                         }
1637                     }
1638                     else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1639                         /* first surrogate */
1640                         surrogate = outCh;
1641                     }
1642                     else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1643                         errmsg = "unexpected second surrogate";
1644                         goto utf7Error;
1645                     }
1646                     else {
1647                         *p++ = outCh;
1648                     }
1649                 }
1650             }
1651             else { /* now leaving a base-64 section */
1652                 inShift = 0;
1653                 s++;
1654                 if (surrogate) {
1655                     errmsg = "second surrogate missing at end of shift sequence";
1656                     goto utf7Error;
1657                 }
1658                 if (base64bits > 0) { /* left-over bits */
1659                     if (base64bits >= 6) {
1660                         /* We've seen at least one base-64 character */
1661                         errmsg = "partial character in shift sequence";
1662                         goto utf7Error;
1663                     }
1664                     else {
1665                         /* Some bits remain; they should be zero */
1666                         if (base64buffer != 0) {
1667                             errmsg = "non-zero padding bits in shift sequence";
1668                             goto utf7Error;
1669                         }
1670                     }
1671                 }
1672                 if (ch != '-') {
1673                     /* '-' is absorbed; other terminating
1674                        characters are preserved */
1675                     *p++ = ch;
1676                 }
1677             }
1678         }
1679         else if ( ch == '+' ) {
1680             startinpos = s-starts;
1681             s++; /* consume '+' */
1682             if (s < e && *s == '-') { /* '+-' encodes '+' */
1683                 s++;
1684                 *p++ = '+';
1685             }
1686             else { /* begin base64-encoded section */
1687                 inShift = 1;
1688                 shiftOutStart = p;
1689                 base64bits = 0;
1690             }
1691         }
1692         else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
1693             *p++ = ch;
1694             s++;
1695         }
1696         else {
1697             startinpos = s-starts;
1698             s++;
1699             errmsg = "unexpected special character";
1700             goto utf7Error;
1701         }
1702         continue;
1703 utf7Error:
1704         outpos = p-PyUnicode_AS_UNICODE(unicode);
1705         endinpos = s-starts;
1706         if (unicode_decode_call_errorhandler(
1707                 errors, &errorHandler,
1708                 "utf7", errmsg,
1709                 starts, size, &startinpos, &endinpos, &exc, &s,
1710                 &unicode, &outpos, &p))
1711             goto onError;
1712     }
1713 
1714     /* end of string */
1715 
1716     if (inShift && !consumed) { /* in shift sequence, no more to follow */
1717         /* if we're in an inconsistent state, that's an error */
1718         if (surrogate ||
1719                 (base64bits >= 6) ||
1720                 (base64bits > 0 && base64buffer != 0)) {
1721             outpos = p-PyUnicode_AS_UNICODE(unicode);
1722             endinpos = size;
1723             if (unicode_decode_call_errorhandler(
1724                     errors, &errorHandler,
1725                     "utf7", "unterminated shift sequence",
1726                     starts, size, &startinpos, &endinpos, &exc, &s,
1727                     &unicode, &outpos, &p))
1728                 goto onError;
1729         }
1730     }
1731 
1732     /* return state */
1733     if (consumed) {
1734         if (inShift) {
1735             p = shiftOutStart; /* back off output */
1736             *consumed = startinpos;
1737         }
1738         else {
1739             *consumed = s-starts;
1740         }
1741     }
1742 
1743     if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1744         goto onError;
1745 
1746     Py_XDECREF(errorHandler);
1747     Py_XDECREF(exc);
1748     return (PyObject *)unicode;
1749 
1750   onError:
1751     Py_XDECREF(errorHandler);
1752     Py_XDECREF(exc);
1753     Py_DECREF(unicode);
1754     return NULL;
1755 }
1756 
1757 
PyUnicode_EncodeUTF7(const Py_UNICODE * s,Py_ssize_t size,int base64SetO,int base64WhiteSpace,const char * errors)1758 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1759                                Py_ssize_t size,
1760                                int base64SetO,
1761                                int base64WhiteSpace,
1762                                const char *errors)
1763 {
1764     PyObject *v;
1765     /* It might be possible to tighten this worst case */
1766     Py_ssize_t allocated = 8 * size;
1767     int inShift = 0;
1768     Py_ssize_t i = 0;
1769     unsigned int base64bits = 0;
1770     unsigned long base64buffer = 0;
1771     char * out;
1772     char * start;
1773 
1774     if (allocated / 8 != size)
1775         return PyErr_NoMemory();
1776 
1777     if (size == 0)
1778         return PyString_FromStringAndSize(NULL, 0);
1779 
1780     v = PyString_FromStringAndSize(NULL, allocated);
1781     if (v == NULL)
1782         return NULL;
1783 
1784     start = out = PyString_AS_STRING(v);
1785     for (;i < size; ++i) {
1786         Py_UNICODE ch = s[i];
1787 
1788         if (inShift) {
1789             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1790                 /* shifting out */
1791                 if (base64bits) { /* output remaining bits */
1792                     *out++ = TO_BASE64(base64buffer << (6-base64bits));
1793                     base64buffer = 0;
1794                     base64bits = 0;
1795                 }
1796                 inShift = 0;
1797                 /* Characters not in the BASE64 set implicitly unshift the sequence
1798                    so no '-' is required, except if the character is itself a '-' */
1799                 if (IS_BASE64(ch) || ch == '-') {
1800                     *out++ = '-';
1801                 }
1802                 *out++ = (char) ch;
1803             }
1804             else {
1805                 goto encode_char;
1806             }
1807         }
1808         else { /* not in a shift sequence */
1809             if (ch == '+') {
1810                 *out++ = '+';
1811                         *out++ = '-';
1812             }
1813             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1814                 *out++ = (char) ch;
1815             }
1816             else {
1817                 *out++ = '+';
1818                 inShift = 1;
1819                 goto encode_char;
1820             }
1821         }
1822         continue;
1823 encode_char:
1824 #ifdef Py_UNICODE_WIDE
1825         if (ch >= 0x10000) {
1826             /* code first surrogate */
1827             base64bits += 16;
1828             base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1829             while (base64bits >= 6) {
1830                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1831                 base64bits -= 6;
1832             }
1833             /* prepare second surrogate */
1834             ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
1835         }
1836 #endif
1837         base64bits += 16;
1838         base64buffer = (base64buffer << 16) | ch;
1839         while (base64bits >= 6) {
1840             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1841             base64bits -= 6;
1842         }
1843     }
1844     if (base64bits)
1845         *out++= TO_BASE64(base64buffer << (6-base64bits) );
1846     if (inShift)
1847         *out++ = '-';
1848 
1849     if (_PyString_Resize(&v, out - start))
1850         return NULL;
1851     return v;
1852 }
1853 
1854 #undef IS_BASE64
1855 #undef FROM_BASE64
1856 #undef TO_BASE64
1857 #undef DECODE_DIRECT
1858 #undef ENCODE_DIRECT
1859 
1860 /* --- UTF-8 Codec -------------------------------------------------------- */
1861 
/* Map a UTF-8 encoded prefix (lead) byte to the total length in bytes of
   the sequence it starts.  Zero means illegal prefix.  See RFC 3629 for
   details.  The table is only ever read, so it is declared const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
1883 
PyUnicode_DecodeUTF8(const char * s,Py_ssize_t size,const char * errors)1884 PyObject *PyUnicode_DecodeUTF8(const char *s,
1885                                Py_ssize_t size,
1886                                const char *errors)
1887 {
1888     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1889 }
1890 
PyUnicode_DecodeUTF8Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)1891 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1892                                        Py_ssize_t size,
1893                                        const char *errors,
1894                                        Py_ssize_t *consumed)
1895 {
1896     const char *starts = s;
1897     int n;
1898     int k;
1899     Py_ssize_t startinpos;
1900     Py_ssize_t endinpos;
1901     Py_ssize_t outpos;
1902     const char *e;
1903     PyUnicodeObject *unicode;
1904     Py_UNICODE *p;
1905     const char *errmsg = "";
1906     PyObject *errorHandler = NULL;
1907     PyObject *exc = NULL;
1908 
1909     /* Note: size will always be longer than the resulting Unicode
1910        character count */
1911     unicode = _PyUnicode_New(size);
1912     if (!unicode)
1913         return NULL;
1914     if (size == 0) {
1915         if (consumed)
1916             *consumed = 0;
1917         return (PyObject *)unicode;
1918     }
1919 
1920     /* Unpack UTF-8 encoded data */
1921     p = unicode->str;
1922     e = s + size;
1923 
1924     while (s < e) {
1925         Py_UCS4 ch = (unsigned char)*s;
1926 
1927         if (ch < 0x80) {
1928             *p++ = (Py_UNICODE)ch;
1929             s++;
1930             continue;
1931         }
1932 
1933         n = utf8_code_length[ch];
1934 
1935         if (s + n > e) {
1936             if (consumed)
1937                 break;
1938             else {
1939                 errmsg = "unexpected end of data";
1940                 startinpos = s-starts;
1941                 endinpos = startinpos+1;
1942                 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
1943                     endinpos++;
1944                 goto utf8Error;
1945             }
1946         }
1947 
1948         switch (n) {
1949 
1950         case 0:
1951             errmsg = "invalid start byte";
1952             startinpos = s-starts;
1953             endinpos = startinpos+1;
1954             goto utf8Error;
1955 
1956         case 1:
1957             errmsg = "internal error";
1958             startinpos = s-starts;
1959             endinpos = startinpos+1;
1960             goto utf8Error;
1961 
1962         case 2:
1963             if ((s[1] & 0xc0) != 0x80) {
1964                 errmsg = "invalid continuation byte";
1965                 startinpos = s-starts;
1966                 endinpos = startinpos + 1;
1967                 goto utf8Error;
1968             }
1969             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1970             assert ((ch > 0x007F) && (ch <= 0x07FF));
1971             *p++ = (Py_UNICODE)ch;
1972             break;
1973 
1974         case 3:
1975             /* XXX: surrogates shouldn't be valid UTF-8!
1976                see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
1977                (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
1978                Uncomment the 2 lines below to make them invalid,
1979                codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
1980             if ((s[1] & 0xc0) != 0x80 ||
1981                 (s[2] & 0xc0) != 0x80 ||
1982                 ((unsigned char)s[0] == 0xE0 &&
1983                  (unsigned char)s[1] < 0xA0)/* ||
1984                 ((unsigned char)s[0] == 0xED &&
1985                  (unsigned char)s[1] > 0x9F)*/) {
1986                 errmsg = "invalid continuation byte";
1987                 startinpos = s-starts;
1988                 endinpos = startinpos + 1;
1989 
1990                 /* if s[1] first two bits are 1 and 0, then the invalid
1991                    continuation byte is s[2], so increment endinpos by 1,
1992                    if not, s[1] is invalid and endinpos doesn't need to
1993                    be incremented. */
1994                 if ((s[1] & 0xC0) == 0x80)
1995                     endinpos++;
1996                 goto utf8Error;
1997             }
1998             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1999             assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2000             *p++ = (Py_UNICODE)ch;
2001             break;
2002 
2003         case 4:
2004             if ((s[1] & 0xc0) != 0x80 ||
2005                 (s[2] & 0xc0) != 0x80 ||
2006                 (s[3] & 0xc0) != 0x80 ||
2007                 ((unsigned char)s[0] == 0xF0 &&
2008                  (unsigned char)s[1] < 0x90) ||
2009                 ((unsigned char)s[0] == 0xF4 &&
2010                  (unsigned char)s[1] > 0x8F)) {
2011                 errmsg = "invalid continuation byte";
2012                 startinpos = s-starts;
2013                 endinpos = startinpos + 1;
2014                 if ((s[1] & 0xC0) == 0x80) {
2015                     endinpos++;
2016                     if ((s[2] & 0xC0) == 0x80)
2017                         endinpos++;
2018                 }
2019                 goto utf8Error;
2020             }
2021             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2022                  ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2023             assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2024 
2025 #ifdef Py_UNICODE_WIDE
2026             *p++ = (Py_UNICODE)ch;
2027 #else
2028             /*  compute and append the two surrogates: */
2029 
2030             /*  translate from 10000..10FFFF to 0..FFFF */
2031             ch -= 0x10000;
2032 
2033             /*  high surrogate = top 10 bits added to D800 */
2034             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2035 
2036             /*  low surrogate = bottom 10 bits added to DC00 */
2037             *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2038 #endif
2039             break;
2040         }
2041         s += n;
2042         continue;
2043 
2044       utf8Error:
2045         outpos = p-PyUnicode_AS_UNICODE(unicode);
2046         if (unicode_decode_call_errorhandler(
2047                 errors, &errorHandler,
2048                 "utf8", errmsg,
2049                 starts, size, &startinpos, &endinpos, &exc, &s,
2050                 &unicode, &outpos, &p))
2051             goto onError;
2052     }
2053     if (consumed)
2054         *consumed = s-starts;
2055 
2056     /* Adjust length */
2057     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2058         goto onError;
2059 
2060     Py_XDECREF(errorHandler);
2061     Py_XDECREF(exc);
2062     return (PyObject *)unicode;
2063 
2064   onError:
2065     Py_XDECREF(errorHandler);
2066     Py_XDECREF(exc);
2067     Py_DECREF(unicode);
2068     return NULL;
2069 }
2070 
2071 /* Allocation strategy:  if the string is short, convert into a stack buffer
2072    and allocate exactly as much space needed at the end.  Else allocate the
2073    maximum possible needed (4 result bytes per Unicode character), and return
2074    the excess memory at the end.
2075 */
2076 PyObject *
PyUnicode_EncodeUTF8(const Py_UNICODE * s,Py_ssize_t size,const char * errors)2077 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2078                      Py_ssize_t size,
2079                      const char *errors)
2080 {
2081 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2082 
2083     Py_ssize_t i;           /* index into s of next input byte */
2084     PyObject *v;        /* result string object */
2085     char *p;            /* next free byte in output buffer */
2086     Py_ssize_t nallocated;  /* number of result bytes allocated */
2087     Py_ssize_t nneeded;        /* number of result bytes needed */
2088     char stackbuf[MAX_SHORT_UNICHARS * 4];
2089 
2090     assert(s != NULL);
2091     assert(size >= 0);
2092 
2093     if (size <= MAX_SHORT_UNICHARS) {
2094         /* Write into the stack buffer; nallocated can't overflow.
2095          * At the end, we'll allocate exactly as much heap space as it
2096          * turns out we need.
2097          */
2098         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2099         v = NULL;   /* will allocate after we're done */
2100         p = stackbuf;
2101     }
2102     else {
2103         /* Overallocate on the heap, and give the excess back at the end. */
2104         nallocated = size * 4;
2105         if (nallocated / 4 != size)  /* overflow! */
2106             return PyErr_NoMemory();
2107         v = PyString_FromStringAndSize(NULL, nallocated);
2108         if (v == NULL)
2109             return NULL;
2110         p = PyString_AS_STRING(v);
2111     }
2112 
2113     for (i = 0; i < size;) {
2114         Py_UCS4 ch = s[i++];
2115 
2116         if (ch < 0x80)
2117             /* Encode ASCII */
2118             *p++ = (char) ch;
2119 
2120         else if (ch < 0x0800) {
2121             /* Encode Latin-1 */
2122             *p++ = (char)(0xc0 | (ch >> 6));
2123             *p++ = (char)(0x80 | (ch & 0x3f));
2124         }
2125         else {
2126             /* Encode UCS2 Unicode ordinals */
2127             if (ch < 0x10000) {
2128                 /* Special case: check for high surrogate */
2129                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2130                     Py_UCS4 ch2 = s[i];
2131                     /* Check for low surrogate and combine the two to
2132                        form a UCS4 value */
2133                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2134                         ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2135                         i++;
2136                         goto encodeUCS4;
2137                     }
2138                     /* Fall through: handles isolated high surrogates */
2139                 }
2140                 *p++ = (char)(0xe0 | (ch >> 12));
2141                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2142                 *p++ = (char)(0x80 | (ch & 0x3f));
2143                 continue;
2144             }
2145           encodeUCS4:
2146             /* Encode UCS4 Unicode ordinals */
2147             *p++ = (char)(0xf0 | (ch >> 18));
2148             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2149             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2150             *p++ = (char)(0x80 | (ch & 0x3f));
2151         }
2152     }
2153 
2154     if (v == NULL) {
2155         /* This was stack allocated. */
2156         nneeded = p - stackbuf;
2157         assert(nneeded <= nallocated);
2158         v = PyString_FromStringAndSize(stackbuf, nneeded);
2159     }
2160     else {
2161         /* Cut back to size actually needed. */
2162         nneeded = p - PyString_AS_STRING(v);
2163         assert(nneeded <= nallocated);
2164         if (_PyString_Resize(&v, nneeded))
2165             return NULL;
2166     }
2167     return v;
2168 
2169 #undef MAX_SHORT_UNICHARS
2170 }
2171 
/* Return the UTF-8 encoding of a Unicode object, produced by
   PyUnicode_EncodeUTF8() with a NULL errors argument.  An argument that
   fails PyUnicode_Check() triggers PyErr_BadArgument() and yields NULL. */
PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
                                    PyUnicode_GET_SIZE(unicode),
                                    NULL);
    }

    /* not a Unicode object */
    PyErr_BadArgument();
    return NULL;
}
2182 
2183 /* --- UTF-32 Codec ------------------------------------------------------- */
2184 
2185 PyObject *
PyUnicode_DecodeUTF32(const char * s,Py_ssize_t size,const char * errors,int * byteorder)2186 PyUnicode_DecodeUTF32(const char *s,
2187                       Py_ssize_t size,
2188                       const char *errors,
2189                       int *byteorder)
2190 {
2191     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2192 }
2193 
2194 PyObject *
PyUnicode_DecodeUTF32Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)2195 PyUnicode_DecodeUTF32Stateful(const char *s,
2196                               Py_ssize_t size,
2197                               const char *errors,
2198                               int *byteorder,
2199                               Py_ssize_t *consumed)
2200 {
2201     const char *starts = s;
2202     Py_ssize_t startinpos;
2203     Py_ssize_t endinpos;
2204     Py_ssize_t outpos;
2205     PyUnicodeObject *unicode;
2206     Py_UNICODE *p;
2207 #ifndef Py_UNICODE_WIDE
2208     int pairs = 0;
2209     const unsigned char *qq;
2210 #else
2211     const int pairs = 0;
2212 #endif
2213     const unsigned char *q, *e;
2214     int bo = 0;       /* assume native ordering by default */
2215     const char *errmsg = "";
2216     /* Offsets from q for retrieving bytes in the right order. */
2217 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2218     int iorder[] = {0, 1, 2, 3};
2219 #else
2220     int iorder[] = {3, 2, 1, 0};
2221 #endif
2222     PyObject *errorHandler = NULL;
2223     PyObject *exc = NULL;
2224 
2225     q = (unsigned char *)s;
2226     e = q + size;
2227 
2228     if (byteorder)
2229         bo = *byteorder;
2230 
2231     /* Check for BOM marks (U+FEFF) in the input and adjust current
2232        byte order setting accordingly. In native mode, the leading BOM
2233        mark is skipped, in all other modes, it is copied to the output
2234        stream as-is (giving a ZWNBSP character). */
2235     if (bo == 0) {
2236         if (size >= 4) {
2237             const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2238                 (q[iorder[1]] << 8) | q[iorder[0]];
2239 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2240             if (bom == 0x0000FEFF) {
2241                 q += 4;
2242                 bo = -1;
2243             }
2244             else if (bom == 0xFFFE0000) {
2245                 q += 4;
2246                 bo = 1;
2247             }
2248 #else
2249             if (bom == 0x0000FEFF) {
2250                 q += 4;
2251                 bo = 1;
2252             }
2253             else if (bom == 0xFFFE0000) {
2254                 q += 4;
2255                 bo = -1;
2256             }
2257 #endif
2258         }
2259     }
2260 
2261     if (bo == -1) {
2262         /* force LE */
2263         iorder[0] = 0;
2264         iorder[1] = 1;
2265         iorder[2] = 2;
2266         iorder[3] = 3;
2267     }
2268     else if (bo == 1) {
2269         /* force BE */
2270         iorder[0] = 3;
2271         iorder[1] = 2;
2272         iorder[2] = 1;
2273         iorder[3] = 0;
2274     }
2275 
2276     /* On narrow builds we split characters outside the BMP into two
2277        codepoints => count how much extra space we need. */
2278 #ifndef Py_UNICODE_WIDE
2279     for (qq = q; qq < e; qq += 4)
2280         if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2281             pairs++;
2282 #endif
2283 
2284     /* This might be one to much, because of a BOM */
2285     unicode = _PyUnicode_New((size+3)/4+pairs);
2286     if (!unicode)
2287         return NULL;
2288     if (size == 0)
2289         return (PyObject *)unicode;
2290 
2291     /* Unpack UTF-32 encoded data */
2292     p = unicode->str;
2293 
2294     while (q < e) {
2295         Py_UCS4 ch;
2296         /* remaining bytes at the end? (size should be divisible by 4) */
2297         if (e-q<4) {
2298             if (consumed)
2299                 break;
2300             errmsg = "truncated data";
2301             startinpos = ((const char *)q)-starts;
2302             endinpos = ((const char *)e)-starts;
2303             goto utf32Error;
2304             /* The remaining input chars are ignored if the callback
2305                chooses to skip the input */
2306         }
2307         ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2308             (q[iorder[1]] << 8) | q[iorder[0]];
2309 
2310         if (ch >= 0x110000)
2311         {
2312             errmsg = "codepoint not in range(0x110000)";
2313             startinpos = ((const char *)q)-starts;
2314             endinpos = startinpos+4;
2315             goto utf32Error;
2316         }
2317 #ifndef Py_UNICODE_WIDE
2318         if (ch >= 0x10000)
2319         {
2320             *p++ = 0xD800 | ((ch-0x10000) >> 10);
2321             *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2322         }
2323         else
2324 #endif
2325             *p++ = ch;
2326         q += 4;
2327         continue;
2328       utf32Error:
2329         outpos = p-PyUnicode_AS_UNICODE(unicode);
2330         if (unicode_decode_call_errorhandler(
2331                 errors, &errorHandler,
2332                 "utf32", errmsg,
2333                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2334                 &unicode, &outpos, &p))
2335             goto onError;
2336     }
2337 
2338     if (byteorder)
2339         *byteorder = bo;
2340 
2341     if (consumed)
2342         *consumed = (const char *)q-starts;
2343 
2344     /* Adjust length */
2345     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2346         goto onError;
2347 
2348     Py_XDECREF(errorHandler);
2349     Py_XDECREF(exc);
2350     return (PyObject *)unicode;
2351 
2352   onError:
2353     Py_DECREF(unicode);
2354     Py_XDECREF(errorHandler);
2355     Py_XDECREF(exc);
2356     return NULL;
2357 }
2358 
2359 PyObject *
PyUnicode_EncodeUTF32(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)2360 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2361                       Py_ssize_t size,
2362                       const char *errors,
2363                       int byteorder)
2364 {
2365     PyObject *v;
2366     unsigned char *p;
2367     Py_ssize_t nsize, bytesize;
2368 #ifndef Py_UNICODE_WIDE
2369     Py_ssize_t i, pairs;
2370 #else
2371     const int pairs = 0;
2372 #endif
2373     /* Offsets from p for storing byte pairs in the right order. */
2374 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2375     int iorder[] = {0, 1, 2, 3};
2376 #else
2377     int iorder[] = {3, 2, 1, 0};
2378 #endif
2379 
2380 #define STORECHAR(CH)                           \
2381     do {                                        \
2382         p[iorder[3]] = ((CH) >> 24) & 0xff;     \
2383         p[iorder[2]] = ((CH) >> 16) & 0xff;     \
2384         p[iorder[1]] = ((CH) >> 8) & 0xff;      \
2385         p[iorder[0]] = (CH) & 0xff;             \
2386         p += 4;                                 \
2387     } while(0)
2388 
2389     /* In narrow builds we can output surrogate pairs as one codepoint,
2390        so we need less space. */
2391 #ifndef Py_UNICODE_WIDE
2392     for (i = pairs = 0; i < size-1; i++)
2393         if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2394             0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2395             pairs++;
2396 #endif
2397     nsize = (size - pairs + (byteorder == 0));
2398     bytesize = nsize * 4;
2399     if (bytesize / 4 != nsize)
2400         return PyErr_NoMemory();
2401     v = PyString_FromStringAndSize(NULL, bytesize);
2402     if (v == NULL)
2403         return NULL;
2404 
2405     p = (unsigned char *)PyString_AS_STRING(v);
2406     if (byteorder == 0)
2407         STORECHAR(0xFEFF);
2408     if (size == 0)
2409         return v;
2410 
2411     if (byteorder == -1) {
2412         /* force LE */
2413         iorder[0] = 0;
2414         iorder[1] = 1;
2415         iorder[2] = 2;
2416         iorder[3] = 3;
2417     }
2418     else if (byteorder == 1) {
2419         /* force BE */
2420         iorder[0] = 3;
2421         iorder[1] = 2;
2422         iorder[2] = 1;
2423         iorder[3] = 0;
2424     }
2425 
2426     while (size-- > 0) {
2427         Py_UCS4 ch = *s++;
2428 #ifndef Py_UNICODE_WIDE
2429         if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2430             Py_UCS4 ch2 = *s;
2431             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2432                 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2433                 s++;
2434                 size--;
2435             }
2436         }
2437 #endif
2438         STORECHAR(ch);
2439     }
2440     return v;
2441 #undef STORECHAR
2442 }
2443 
/* Return the UTF-32 encoding of a Unicode object, produced by
   PyUnicode_EncodeUTF32() with a NULL errors argument and byteorder 0.
   An argument that fails PyUnicode_Check() triggers PyErr_BadArgument()
   and yields NULL. */
PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
{
    if (PyUnicode_Check(unicode)) {
        return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
                                     PyUnicode_GET_SIZE(unicode),
                                     NULL,
                                     0);
    }

    /* not a Unicode object */
    PyErr_BadArgument();
    return NULL;
}
2455 
2456 /* --- UTF-16 Codec ------------------------------------------------------- */
2457 
2458 PyObject *
PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)2459 PyUnicode_DecodeUTF16(const char *s,
2460                       Py_ssize_t size,
2461                       const char *errors,
2462                       int *byteorder)
2463 {
2464     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2465 }
2466 
2467 PyObject *
PyUnicode_DecodeUTF16Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)2468 PyUnicode_DecodeUTF16Stateful(const char *s,
2469                               Py_ssize_t size,
2470                               const char *errors,
2471                               int *byteorder,
2472                               Py_ssize_t *consumed)
2473 {
2474     const char *starts = s;
2475     Py_ssize_t startinpos;
2476     Py_ssize_t endinpos;
2477     Py_ssize_t outpos;
2478     PyUnicodeObject *unicode;
2479     Py_UNICODE *p;
2480     const unsigned char *q, *e;
2481     int bo = 0;       /* assume native ordering by default */
2482     const char *errmsg = "";
2483     /* Offsets from q for retrieving byte pairs in the right order. */
2484 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2485     int ihi = 1, ilo = 0;
2486 #else
2487     int ihi = 0, ilo = 1;
2488 #endif
2489     PyObject *errorHandler = NULL;
2490     PyObject *exc = NULL;
2491 
2492     /* Note: size will always be longer than the resulting Unicode
2493        character count */
2494     unicode = _PyUnicode_New(size);
2495     if (!unicode)
2496         return NULL;
2497     if (size == 0)
2498         return (PyObject *)unicode;
2499 
2500     /* Unpack UTF-16 encoded data */
2501     p = unicode->str;
2502     q = (unsigned char *)s;
2503     e = q + size;
2504 
2505     if (byteorder)
2506         bo = *byteorder;
2507 
2508     /* Check for BOM marks (U+FEFF) in the input and adjust current
2509        byte order setting accordingly. In native mode, the leading BOM
2510        mark is skipped, in all other modes, it is copied to the output
2511        stream as-is (giving a ZWNBSP character). */
2512     if (bo == 0) {
2513         if (size >= 2) {
2514             const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2515 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2516             if (bom == 0xFEFF) {
2517                 q += 2;
2518                 bo = -1;
2519             }
2520             else if (bom == 0xFFFE) {
2521                 q += 2;
2522                 bo = 1;
2523             }
2524 #else
2525             if (bom == 0xFEFF) {
2526                 q += 2;
2527                 bo = 1;
2528             }
2529             else if (bom == 0xFFFE) {
2530                 q += 2;
2531                 bo = -1;
2532             }
2533 #endif
2534         }
2535     }
2536 
2537     if (bo == -1) {
2538         /* force LE */
2539         ihi = 1;
2540         ilo = 0;
2541     }
2542     else if (bo == 1) {
2543         /* force BE */
2544         ihi = 0;
2545         ilo = 1;
2546     }
2547 
2548     while (q < e) {
2549         Py_UNICODE ch;
2550         /* remaining bytes at the end? (size should be even) */
2551         if (e-q<2) {
2552             if (consumed)
2553                 break;
2554             errmsg = "truncated data";
2555             startinpos = ((const char *)q)-starts;
2556             endinpos = ((const char *)e)-starts;
2557             goto utf16Error;
2558             /* The remaining input chars are ignored if the callback
2559                chooses to skip the input */
2560         }
2561         ch = (q[ihi] << 8) | q[ilo];
2562 
2563         q += 2;
2564 
2565         if (ch < 0xD800 || ch > 0xDFFF) {
2566             *p++ = ch;
2567             continue;
2568         }
2569 
2570         /* UTF-16 code pair: */
2571         if (q >= e) {
2572             errmsg = "unexpected end of data";
2573             startinpos = (((const char *)q)-2)-starts;
2574             endinpos = ((const char *)e)-starts;
2575             goto utf16Error;
2576         }
2577         if (0xD800 <= ch && ch <= 0xDBFF) {
2578             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2579             q += 2;
2580             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2581 #ifndef Py_UNICODE_WIDE
2582                 *p++ = ch;
2583                 *p++ = ch2;
2584 #else
2585                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2586 #endif
2587                 continue;
2588             }
2589             else {
2590                 errmsg = "illegal UTF-16 surrogate";
2591                 startinpos = (((const char *)q)-4)-starts;
2592                 endinpos = startinpos+2;
2593                 goto utf16Error;
2594             }
2595 
2596         }
2597         errmsg = "illegal encoding";
2598         startinpos = (((const char *)q)-2)-starts;
2599         endinpos = startinpos+2;
2600         /* Fall through to report the error */
2601 
2602       utf16Error:
2603         outpos = p-PyUnicode_AS_UNICODE(unicode);
2604         if (unicode_decode_call_errorhandler(
2605                 errors, &errorHandler,
2606                 "utf16", errmsg,
2607                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2608                 &unicode, &outpos, &p))
2609             goto onError;
2610     }
2611 
2612     if (byteorder)
2613         *byteorder = bo;
2614 
2615     if (consumed)
2616         *consumed = (const char *)q-starts;
2617 
2618     /* Adjust length */
2619     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2620         goto onError;
2621 
2622     Py_XDECREF(errorHandler);
2623     Py_XDECREF(exc);
2624     return (PyObject *)unicode;
2625 
2626   onError:
2627     Py_DECREF(unicode);
2628     Py_XDECREF(errorHandler);
2629     Py_XDECREF(exc);
2630     return NULL;
2631 }
2632 
2633 PyObject *
PyUnicode_EncodeUTF16(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)2634 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2635                       Py_ssize_t size,
2636                       const char *errors,
2637                       int byteorder)
2638 {
2639     PyObject *v;
2640     unsigned char *p;
2641     Py_ssize_t nsize, bytesize;
2642 #ifdef Py_UNICODE_WIDE
2643     Py_ssize_t i, pairs;
2644 #else
2645     const int pairs = 0;
2646 #endif
2647     /* Offsets from p for storing byte pairs in the right order. */
2648 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2649     int ihi = 1, ilo = 0;
2650 #else
2651     int ihi = 0, ilo = 1;
2652 #endif
2653 
2654 #define STORECHAR(CH)                           \
2655     do {                                        \
2656         p[ihi] = ((CH) >> 8) & 0xff;            \
2657         p[ilo] = (CH) & 0xff;                   \
2658         p += 2;                                 \
2659     } while(0)
2660 
2661 #ifdef Py_UNICODE_WIDE
2662     for (i = pairs = 0; i < size; i++)
2663         if (s[i] >= 0x10000)
2664             pairs++;
2665 #endif
2666     /* 2 * (size + pairs + (byteorder == 0)) */
2667     if (size > PY_SSIZE_T_MAX ||
2668         size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2669         return PyErr_NoMemory();
2670     nsize = size + pairs + (byteorder == 0);
2671     bytesize = nsize * 2;
2672     if (bytesize / 2 != nsize)
2673         return PyErr_NoMemory();
2674     v = PyString_FromStringAndSize(NULL, bytesize);
2675     if (v == NULL)
2676         return NULL;
2677 
2678     p = (unsigned char *)PyString_AS_STRING(v);
2679     if (byteorder == 0)
2680         STORECHAR(0xFEFF);
2681     if (size == 0)
2682         return v;
2683 
2684     if (byteorder == -1) {
2685         /* force LE */
2686         ihi = 1;
2687         ilo = 0;
2688     }
2689     else if (byteorder == 1) {
2690         /* force BE */
2691         ihi = 0;
2692         ilo = 1;
2693     }
2694 
2695     while (size-- > 0) {
2696         Py_UNICODE ch = *s++;
2697         Py_UNICODE ch2 = 0;
2698 #ifdef Py_UNICODE_WIDE
2699         if (ch >= 0x10000) {
2700             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2701             ch  = 0xD800 | ((ch-0x10000) >> 10);
2702         }
2703 #endif
2704         STORECHAR(ch);
2705         if (ch2)
2706             STORECHAR(ch2);
2707     }
2708     return v;
2709 #undef STORECHAR
2710 }
2711 
/* Encode a unicode object with PyUnicode_EncodeUTF16(), passing NULL for
   errors and 0 for byteorder.  Any other object type is rejected through
   PyErr_BadArgument() and NULL is returned. */
PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
{
    if (PyUnicode_Check(unicode))
        return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
                                     PyUnicode_GET_SIZE(unicode),
                                     NULL,
                                     0);

    PyErr_BadArgument();
    return NULL;
}
2723 
2724 /* --- Unicode Escape Codec ----------------------------------------------- */
2725 
2726 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2727 
PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)2728 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2729                                         Py_ssize_t size,
2730                                         const char *errors)
2731 {
2732     const char *starts = s;
2733     Py_ssize_t startinpos;
2734     Py_ssize_t endinpos;
2735     Py_ssize_t outpos;
2736     int i;
2737     PyUnicodeObject *v;
2738     Py_UNICODE *p;
2739     const char *end;
2740     char* message;
2741     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2742     PyObject *errorHandler = NULL;
2743     PyObject *exc = NULL;
2744 
2745     /* Escaped strings will always be longer than the resulting
2746        Unicode string, so we start with size here and then reduce the
2747        length after conversion to the true value.
2748        (but if the error callback returns a long replacement string
2749        we'll have to allocate more space) */
2750     v = _PyUnicode_New(size);
2751     if (v == NULL)
2752         goto onError;
2753     if (size == 0)
2754         return (PyObject *)v;
2755 
2756     p = PyUnicode_AS_UNICODE(v);
2757     end = s + size;
2758 
2759     while (s < end) {
2760         unsigned char c;
2761         Py_UNICODE x;
2762         int digits;
2763 
2764         /* Non-escape characters are interpreted as Unicode ordinals */
2765         if (*s != '\\') {
2766             *p++ = (unsigned char) *s++;
2767             continue;
2768         }
2769 
2770         startinpos = s-starts;
2771         /* \ - Escapes */
2772         s++;
2773         c = *s++;
2774         if (s > end)
2775             c = '\0'; /* Invalid after \ */
2776         switch (c) {
2777 
2778             /* \x escapes */
2779         case '\n': break;
2780         case '\\': *p++ = '\\'; break;
2781         case '\'': *p++ = '\''; break;
2782         case '\"': *p++ = '\"'; break;
2783         case 'b': *p++ = '\b'; break;
2784         case 'f': *p++ = '\014'; break; /* FF */
2785         case 't': *p++ = '\t'; break;
2786         case 'n': *p++ = '\n'; break;
2787         case 'r': *p++ = '\r'; break;
2788         case 'v': *p++ = '\013'; break; /* VT */
2789         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2790 
2791             /* \OOO (octal) escapes */
2792         case '0': case '1': case '2': case '3':
2793         case '4': case '5': case '6': case '7':
2794             x = s[-1] - '0';
2795             if (s < end && '0' <= *s && *s <= '7') {
2796                 x = (x<<3) + *s++ - '0';
2797                 if (s < end && '0' <= *s && *s <= '7')
2798                     x = (x<<3) + *s++ - '0';
2799             }
2800             *p++ = x;
2801             break;
2802 
2803             /* hex escapes */
2804             /* \xXX */
2805         case 'x':
2806             digits = 2;
2807             message = "truncated \\xXX escape";
2808             goto hexescape;
2809 
2810             /* \uXXXX */
2811         case 'u':
2812             digits = 4;
2813             message = "truncated \\uXXXX escape";
2814             goto hexescape;
2815 
2816             /* \UXXXXXXXX */
2817         case 'U':
2818             digits = 8;
2819             message = "truncated \\UXXXXXXXX escape";
2820         hexescape:
2821             chr = 0;
2822             outpos = p-PyUnicode_AS_UNICODE(v);
2823             if (s+digits>end) {
2824                 endinpos = size;
2825                 if (unicode_decode_call_errorhandler(
2826                         errors, &errorHandler,
2827                         "unicodeescape", "end of string in escape sequence",
2828                         starts, size, &startinpos, &endinpos, &exc, &s,
2829                         &v, &outpos, &p))
2830                     goto onError;
2831                 goto nextByte;
2832             }
2833             for (i = 0; i < digits; ++i) {
2834                 c = (unsigned char) s[i];
2835                 if (!isxdigit(c)) {
2836                     endinpos = (s+i+1)-starts;
2837                     if (unicode_decode_call_errorhandler(
2838                             errors, &errorHandler,
2839                             "unicodeescape", message,
2840                             starts, size, &startinpos, &endinpos, &exc, &s,
2841                             &v, &outpos, &p))
2842                         goto onError;
2843                     goto nextByte;
2844                 }
2845                 chr = (chr<<4) & ~0xF;
2846                 if (c >= '0' && c <= '9')
2847                     chr += c - '0';
2848                 else if (c >= 'a' && c <= 'f')
2849                     chr += 10 + c - 'a';
2850                 else
2851                     chr += 10 + c - 'A';
2852             }
2853             s += i;
2854             if (chr == 0xffffffff && PyErr_Occurred())
2855                 /* _decoding_error will have already written into the
2856                    target buffer. */
2857                 break;
2858         store:
2859             /* when we get here, chr is a 32-bit unicode character */
2860             if (chr <= 0xffff)
2861                 /* UCS-2 character */
2862                 *p++ = (Py_UNICODE) chr;
2863             else if (chr <= 0x10ffff) {
2864                 /* UCS-4 character. Either store directly, or as
2865                    surrogate pair. */
2866 #ifdef Py_UNICODE_WIDE
2867                 *p++ = chr;
2868 #else
2869                 chr -= 0x10000L;
2870                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2871                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2872 #endif
2873             } else {
2874                 endinpos = s-starts;
2875                 outpos = p-PyUnicode_AS_UNICODE(v);
2876                 if (unicode_decode_call_errorhandler(
2877                         errors, &errorHandler,
2878                         "unicodeescape", "illegal Unicode character",
2879                         starts, size, &startinpos, &endinpos, &exc, &s,
2880                         &v, &outpos, &p))
2881                     goto onError;
2882             }
2883             break;
2884 
2885             /* \N{name} */
2886         case 'N':
2887             message = "malformed \\N character escape";
2888             if (ucnhash_CAPI == NULL) {
2889                 /* load the unicode data module */
2890                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
2891                 if (ucnhash_CAPI == NULL)
2892                     goto ucnhashError;
2893             }
2894             if (*s == '{') {
2895                 const char *start = s+1;
2896                 /* look for the closing brace */
2897                 while (*s != '}' && s < end)
2898                     s++;
2899                 if (s > start && s < end && *s == '}') {
2900                     /* found a name.  look it up in the unicode database */
2901                     message = "unknown Unicode character name";
2902                     s++;
2903                     if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2904                         goto store;
2905                 }
2906             }
2907             endinpos = s-starts;
2908             outpos = p-PyUnicode_AS_UNICODE(v);
2909             if (unicode_decode_call_errorhandler(
2910                     errors, &errorHandler,
2911                     "unicodeescape", message,
2912                     starts, size, &startinpos, &endinpos, &exc, &s,
2913                     &v, &outpos, &p))
2914                 goto onError;
2915             break;
2916 
2917         default:
2918             if (s > end) {
2919                 message = "\\ at end of string";
2920                 s--;
2921                 endinpos = s-starts;
2922                 outpos = p-PyUnicode_AS_UNICODE(v);
2923                 if (unicode_decode_call_errorhandler(
2924                         errors, &errorHandler,
2925                         "unicodeescape", message,
2926                         starts, size, &startinpos, &endinpos, &exc, &s,
2927                         &v, &outpos, &p))
2928                     goto onError;
2929             }
2930             else {
2931                 *p++ = '\\';
2932                 *p++ = (unsigned char)s[-1];
2933             }
2934             break;
2935         }
2936       nextByte:
2937         ;
2938     }
2939     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2940         goto onError;
2941     Py_XDECREF(errorHandler);
2942     Py_XDECREF(exc);
2943     return (PyObject *)v;
2944 
2945   ucnhashError:
2946     PyErr_SetString(
2947         PyExc_UnicodeError,
2948         "\\N escapes not supported (can't load unicodedata module)"
2949         );
2950     Py_XDECREF(v);
2951     Py_XDECREF(errorHandler);
2952     Py_XDECREF(exc);
2953     return NULL;
2954 
2955   onError:
2956     Py_XDECREF(v);
2957     Py_XDECREF(errorHandler);
2958     Py_XDECREF(exc);
2959     return NULL;
2960 }
2961 
/* unicodeescape_string(), defined after the findchar() helper below,
   returns a Unicode-Escape string version of the Unicode data it is given.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
2968 
findchar(const Py_UNICODE * s,Py_ssize_t size,Py_UNICODE ch)2969 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2970                                              Py_ssize_t size,
2971                                              Py_UNICODE ch)
2972 {
2973     /* like wcschr, but doesn't stop at NULL characters */
2974 
2975     while (size-- > 0) {
2976         if (*s == ch)
2977             return s;
2978         s++;
2979     }
2980 
2981     return NULL;
2982 }
2983 
2984 static
unicodeescape_string(const Py_UNICODE * s,Py_ssize_t size,int quotes)2985 PyObject *unicodeescape_string(const Py_UNICODE *s,
2986                                Py_ssize_t size,
2987                                int quotes)
2988 {
2989     PyObject *repr;
2990     char *p;
2991 
2992     static const char *hexdigit = "0123456789abcdef";
2993 #ifdef Py_UNICODE_WIDE
2994     const Py_ssize_t expandsize = 10;
2995 #else
2996     const Py_ssize_t expandsize = 6;
2997 #endif
2998 
2999     /* XXX(nnorwitz): rather than over-allocating, it would be
3000        better to choose a different scheme.  Perhaps scan the
3001        first N-chars of the string and allocate based on that size.
3002     */
3003     /* Initial allocation is based on the longest-possible unichr
3004        escape.
3005 
3006        In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3007        unichr, so in this case it's the longest unichr escape. In
3008        narrow (UTF-16) builds this is five chars per source unichr
3009        since there are two unichrs in the surrogate pair, so in narrow
3010        (UTF-16) builds it's not the longest unichr escape.
3011 
3012        In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3013        so in the narrow (UTF-16) build case it's the longest unichr
3014        escape.
3015     */
3016 
3017     if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3018         return PyErr_NoMemory();
3019 
3020     repr = PyString_FromStringAndSize(NULL,
3021                                       2
3022                                       + expandsize*size
3023                                       + 1);
3024     if (repr == NULL)
3025         return NULL;
3026 
3027     p = PyString_AS_STRING(repr);
3028 
3029     if (quotes) {
3030         *p++ = 'u';
3031         *p++ = (findchar(s, size, '\'') &&
3032                 !findchar(s, size, '"')) ? '"' : '\'';
3033     }
3034     while (size-- > 0) {
3035         Py_UNICODE ch = *s++;
3036 
3037         /* Escape quotes and backslashes */
3038         if ((quotes &&
3039              ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
3040             *p++ = '\\';
3041             *p++ = (char) ch;
3042             continue;
3043         }
3044 
3045 #ifdef Py_UNICODE_WIDE
3046         /* Map 21-bit characters to '\U00xxxxxx' */
3047         else if (ch >= 0x10000) {
3048             *p++ = '\\';
3049             *p++ = 'U';
3050             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3051             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3052             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3053             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3054             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3055             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3056             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
3057             *p++ = hexdigit[ch & 0x0000000F];
3058             continue;
3059         }
3060 #else
3061         /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3062         else if (ch >= 0xD800 && ch < 0xDC00) {
3063             Py_UNICODE ch2;
3064             Py_UCS4 ucs;
3065 
3066             ch2 = *s++;
3067             size--;
3068             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3069                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3070                 *p++ = '\\';
3071                 *p++ = 'U';
3072                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3073                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3074                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3075                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3076                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3077                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3078                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3079                 *p++ = hexdigit[ucs & 0x0000000F];
3080                 continue;
3081             }
3082             /* Fall through: isolated surrogates are copied as-is */
3083             s--;
3084             size++;
3085         }
3086 #endif
3087 
3088         /* Map 16-bit characters to '\uxxxx' */
3089         if (ch >= 256) {
3090             *p++ = '\\';
3091             *p++ = 'u';
3092             *p++ = hexdigit[(ch >> 12) & 0x000F];
3093             *p++ = hexdigit[(ch >> 8) & 0x000F];
3094             *p++ = hexdigit[(ch >> 4) & 0x000F];
3095             *p++ = hexdigit[ch & 0x000F];
3096         }
3097 
3098         /* Map special whitespace to '\t', \n', '\r' */
3099         else if (ch == '\t') {
3100             *p++ = '\\';
3101             *p++ = 't';
3102         }
3103         else if (ch == '\n') {
3104             *p++ = '\\';
3105             *p++ = 'n';
3106         }
3107         else if (ch == '\r') {
3108             *p++ = '\\';
3109             *p++ = 'r';
3110         }
3111 
3112         /* Map non-printable US ASCII to '\xhh' */
3113         else if (ch < ' ' || ch >= 0x7F) {
3114             *p++ = '\\';
3115             *p++ = 'x';
3116             *p++ = hexdigit[(ch >> 4) & 0x000F];
3117             *p++ = hexdigit[ch & 0x000F];
3118         }
3119 
3120         /* Copy everything else as-is */
3121         else
3122             *p++ = (char) ch;
3123     }
3124     if (quotes)
3125         *p++ = PyString_AS_STRING(repr)[1];
3126 
3127     *p = '\0';
3128     if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3129         return NULL;
3130     return repr;
3131 }
3132 
PyUnicode_EncodeUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)3133 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3134                                         Py_ssize_t size)
3135 {
3136     return unicodeescape_string(s, size, 0);
3137 }
3138 
/* Encode a unicode object with PyUnicode_EncodeUnicodeEscape().  Any
   other object type is rejected through PyErr_BadArgument() and NULL is
   returned. */
PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
{
    if (PyUnicode_Check(unicode))
        return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
                                             PyUnicode_GET_SIZE(unicode));

    PyErr_BadArgument();
    return NULL;
}
3148 
3149 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3150 
PyUnicode_DecodeRawUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)3151 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3152                                            Py_ssize_t size,
3153                                            const char *errors)
3154 {
3155     const char *starts = s;
3156     Py_ssize_t startinpos;
3157     Py_ssize_t endinpos;
3158     Py_ssize_t outpos;
3159     PyUnicodeObject *v;
3160     Py_UNICODE *p;
3161     const char *end;
3162     const char *bs;
3163     PyObject *errorHandler = NULL;
3164     PyObject *exc = NULL;
3165 
3166     /* Escaped strings will always be longer than the resulting
3167        Unicode string, so we start with size here and then reduce the
3168        length after conversion to the true value. (But decoding error
3169        handler might have to resize the string) */
3170     v = _PyUnicode_New(size);
3171     if (v == NULL)
3172         goto onError;
3173     if (size == 0)
3174         return (PyObject *)v;
3175     p = PyUnicode_AS_UNICODE(v);
3176     end = s + size;
3177     while (s < end) {
3178         unsigned char c;
3179         Py_UCS4 x;
3180         int i;
3181         int count;
3182 
3183         /* Non-escape characters are interpreted as Unicode ordinals */
3184         if (*s != '\\') {
3185             *p++ = (unsigned char)*s++;
3186             continue;
3187         }
3188         startinpos = s-starts;
3189 
3190         /* \u-escapes are only interpreted iff the number of leading
3191            backslashes if odd */
3192         bs = s;
3193         for (;s < end;) {
3194             if (*s != '\\')
3195                 break;
3196             *p++ = (unsigned char)*s++;
3197         }
3198         if (((s - bs) & 1) == 0 ||
3199             s >= end ||
3200             (*s != 'u' && *s != 'U')) {
3201             continue;
3202         }
3203         p--;
3204         count = *s=='u' ? 4 : 8;
3205         s++;
3206 
3207         /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3208         outpos = p-PyUnicode_AS_UNICODE(v);
3209         for (x = 0, i = 0; i < count; ++i, ++s) {
3210             c = (unsigned char)*s;
3211             if (!isxdigit(c)) {
3212                 endinpos = s-starts;
3213                 if (unicode_decode_call_errorhandler(
3214                         errors, &errorHandler,
3215                         "rawunicodeescape", "truncated \\uXXXX",
3216                         starts, size, &startinpos, &endinpos, &exc, &s,
3217                         &v, &outpos, &p))
3218                     goto onError;
3219                 goto nextByte;
3220             }
3221             x = (x<<4) & ~0xF;
3222             if (c >= '0' && c <= '9')
3223                 x += c - '0';
3224             else if (c >= 'a' && c <= 'f')
3225                 x += 10 + c - 'a';
3226             else
3227                 x += 10 + c - 'A';
3228         }
3229         if (x <= 0xffff)
3230             /* UCS-2 character */
3231             *p++ = (Py_UNICODE) x;
3232         else if (x <= 0x10ffff) {
3233             /* UCS-4 character. Either store directly, or as
3234                surrogate pair. */
3235 #ifdef Py_UNICODE_WIDE
3236             *p++ = (Py_UNICODE) x;
3237 #else
3238             x -= 0x10000L;
3239             *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3240             *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3241 #endif
3242         } else {
3243             endinpos = s-starts;
3244             outpos = p-PyUnicode_AS_UNICODE(v);
3245             if (unicode_decode_call_errorhandler(
3246                     errors, &errorHandler,
3247                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
3248                     starts, size, &startinpos, &endinpos, &exc, &s,
3249                     &v, &outpos, &p))
3250                 goto onError;
3251         }
3252       nextByte:
3253         ;
3254     }
3255     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3256         goto onError;
3257     Py_XDECREF(errorHandler);
3258     Py_XDECREF(exc);
3259     return (PyObject *)v;
3260 
3261   onError:
3262     Py_XDECREF(v);
3263     Py_XDECREF(errorHandler);
3264     Py_XDECREF(exc);
3265     return NULL;
3266 }
3267 
PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)3268 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3269                                            Py_ssize_t size)
3270 {
3271     PyObject *repr;
3272     char *p;
3273     char *q;
3274 
3275     static const char *hexdigit = "0123456789abcdef";
3276 #ifdef Py_UNICODE_WIDE
3277     const Py_ssize_t expandsize = 10;
3278 #else
3279     const Py_ssize_t expandsize = 6;
3280 #endif
3281 
3282     if (size > PY_SSIZE_T_MAX / expandsize)
3283         return PyErr_NoMemory();
3284 
3285     repr = PyString_FromStringAndSize(NULL, expandsize * size);
3286     if (repr == NULL)
3287         return NULL;
3288     if (size == 0)
3289         return repr;
3290 
3291     p = q = PyString_AS_STRING(repr);
3292     while (size-- > 0) {
3293         Py_UNICODE ch = *s++;
3294 #ifdef Py_UNICODE_WIDE
3295         /* Map 32-bit characters to '\Uxxxxxxxx' */
3296         if (ch >= 0x10000) {
3297             *p++ = '\\';
3298             *p++ = 'U';
3299             *p++ = hexdigit[(ch >> 28) & 0xf];
3300             *p++ = hexdigit[(ch >> 24) & 0xf];
3301             *p++ = hexdigit[(ch >> 20) & 0xf];
3302             *p++ = hexdigit[(ch >> 16) & 0xf];
3303             *p++ = hexdigit[(ch >> 12) & 0xf];
3304             *p++ = hexdigit[(ch >> 8) & 0xf];
3305             *p++ = hexdigit[(ch >> 4) & 0xf];
3306             *p++ = hexdigit[ch & 15];
3307         }
3308         else
3309 #else
3310             /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3311             if (ch >= 0xD800 && ch < 0xDC00) {
3312                 Py_UNICODE ch2;
3313                 Py_UCS4 ucs;
3314 
3315                 ch2 = *s++;
3316                 size--;
3317                 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3318                     ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3319                     *p++ = '\\';
3320                     *p++ = 'U';
3321                     *p++ = hexdigit[(ucs >> 28) & 0xf];
3322                     *p++ = hexdigit[(ucs >> 24) & 0xf];
3323                     *p++ = hexdigit[(ucs >> 20) & 0xf];
3324                     *p++ = hexdigit[(ucs >> 16) & 0xf];
3325                     *p++ = hexdigit[(ucs >> 12) & 0xf];
3326                     *p++ = hexdigit[(ucs >> 8) & 0xf];
3327                     *p++ = hexdigit[(ucs >> 4) & 0xf];
3328                     *p++ = hexdigit[ucs & 0xf];
3329                     continue;
3330                 }
3331                 /* Fall through: isolated surrogates are copied as-is */
3332                 s--;
3333                 size++;
3334             }
3335 #endif
3336         /* Map 16-bit characters to '\uxxxx' */
3337         if (ch >= 256) {
3338             *p++ = '\\';
3339             *p++ = 'u';
3340             *p++ = hexdigit[(ch >> 12) & 0xf];
3341             *p++ = hexdigit[(ch >> 8) & 0xf];
3342             *p++ = hexdigit[(ch >> 4) & 0xf];
3343             *p++ = hexdigit[ch & 15];
3344         }
3345         /* Copy everything else as-is */
3346         else
3347             *p++ = (char) ch;
3348     }
3349     *p = '\0';
3350     if (_PyString_Resize(&repr, p - q))
3351         return NULL;
3352     return repr;
3353 }
3354 
/* Encode a unicode object with PyUnicode_EncodeRawUnicodeEscape().  Any
   other object type is rejected through PyErr_BadArgument() and NULL is
   returned. */
PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    if (PyUnicode_Check(unicode))
        return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
                                                PyUnicode_GET_SIZE(unicode));

    PyErr_BadArgument();
    return NULL;
}
3364 
3365 /* --- Unicode Internal Codec ------------------------------------------- */
3366 
_PyUnicode_DecodeUnicodeInternal(const char * s,Py_ssize_t size,const char * errors)3367 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3368                                            Py_ssize_t size,
3369                                            const char *errors)
3370 {
3371     const char *starts = s;
3372     Py_ssize_t startinpos;
3373     Py_ssize_t endinpos;
3374     Py_ssize_t outpos;
3375     PyUnicodeObject *v;
3376     Py_UNICODE *p;
3377     const char *end;
3378     const char *reason;
3379     PyObject *errorHandler = NULL;
3380     PyObject *exc = NULL;
3381 
3382 #ifdef Py_UNICODE_WIDE
3383     Py_UNICODE unimax = PyUnicode_GetMax();
3384 #endif
3385 
3386     /* XXX overflow detection missing */
3387     v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3388     if (v == NULL)
3389         goto onError;
3390     if (PyUnicode_GetSize((PyObject *)v) == 0)
3391         return (PyObject *)v;
3392     p = PyUnicode_AS_UNICODE(v);
3393     end = s + size;
3394 
3395     while (s < end) {
3396         memcpy(p, s, sizeof(Py_UNICODE));
3397         /* We have to sanity check the raw data, otherwise doom looms for
3398            some malformed UCS-4 data. */
3399         if (
3400 #ifdef Py_UNICODE_WIDE
3401             *p > unimax || *p < 0 ||
3402 #endif
3403             end-s < Py_UNICODE_SIZE
3404             )
3405         {
3406             startinpos = s - starts;
3407             if (end-s < Py_UNICODE_SIZE) {
3408                 endinpos = end-starts;
3409                 reason = "truncated input";
3410             }
3411             else {
3412                 endinpos = s - starts + Py_UNICODE_SIZE;
3413                 reason = "illegal code point (> 0x10FFFF)";
3414             }
3415             outpos = p - PyUnicode_AS_UNICODE(v);
3416             if (unicode_decode_call_errorhandler(
3417                     errors, &errorHandler,
3418                     "unicode_internal", reason,
3419                     starts, size, &startinpos, &endinpos, &exc, &s,
3420                     &v, &outpos, &p)) {
3421                 goto onError;
3422             }
3423         }
3424         else {
3425             p++;
3426             s += Py_UNICODE_SIZE;
3427         }
3428     }
3429 
3430     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3431         goto onError;
3432     Py_XDECREF(errorHandler);
3433     Py_XDECREF(exc);
3434     return (PyObject *)v;
3435 
3436   onError:
3437     Py_XDECREF(v);
3438     Py_XDECREF(errorHandler);
3439     Py_XDECREF(exc);
3440     return NULL;
3441 }
3442 
3443 /* --- Latin-1 Codec ------------------------------------------------------ */
3444 
PyUnicode_DecodeLatin1(const char * s,Py_ssize_t size,const char * errors)3445 PyObject *PyUnicode_DecodeLatin1(const char *s,
3446                                  Py_ssize_t size,
3447                                  const char *errors)
3448 {
3449     PyUnicodeObject *v;
3450     Py_UNICODE *p;
3451 
3452     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3453     if (size == 1) {
3454         Py_UNICODE r = *(unsigned char*)s;
3455         return PyUnicode_FromUnicode(&r, 1);
3456     }
3457 
3458     v = _PyUnicode_New(size);
3459     if (v == NULL)
3460         goto onError;
3461     if (size == 0)
3462         return (PyObject *)v;
3463     p = PyUnicode_AS_UNICODE(v);
3464     while (size-- > 0)
3465         *p++ = (unsigned char)*s++;
3466     return (PyObject *)v;
3467 
3468   onError:
3469     Py_XDECREF(v);
3470     return NULL;
3471 }
3472 
3473 /* create or adjust a UnicodeEncodeError */
make_encode_exception(PyObject ** exceptionObject,const char * encoding,const Py_UNICODE * unicode,Py_ssize_t size,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)3474 static void make_encode_exception(PyObject **exceptionObject,
3475                                   const char *encoding,
3476                                   const Py_UNICODE *unicode, Py_ssize_t size,
3477                                   Py_ssize_t startpos, Py_ssize_t endpos,
3478                                   const char *reason)
3479 {
3480     if (*exceptionObject == NULL) {
3481         *exceptionObject = PyUnicodeEncodeError_Create(
3482             encoding, unicode, size, startpos, endpos, reason);
3483     }
3484     else {
3485         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3486             goto onError;
3487         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3488             goto onError;
3489         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3490             goto onError;
3491         return;
3492       onError:
3493         Py_DECREF(*exceptionObject);
3494         *exceptionObject = NULL;
3495     }
3496 }
3497 
3498 /* raises a UnicodeEncodeError */
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,const Py_UNICODE * unicode,Py_ssize_t size,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)3499 static void raise_encode_exception(PyObject **exceptionObject,
3500                                    const char *encoding,
3501                                    const Py_UNICODE *unicode, Py_ssize_t size,
3502                                    Py_ssize_t startpos, Py_ssize_t endpos,
3503                                    const char *reason)
3504 {
3505     make_encode_exception(exceptionObject,
3506                           encoding, unicode, size, startpos, endpos, reason);
3507     if (*exceptionObject != NULL)
3508         PyCodec_StrictErrors(*exceptionObject);
3509 }
3510 
3511 /* error handling callback helper:
3512    build arguments, call the callback and check the arguments,
3513    put the result into newpos and return the replacement string, which
3514    has to be freed by the caller */
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const Py_UNICODE * unicode,Py_ssize_t size,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)3515 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3516                                                   PyObject **errorHandler,
3517                                                   const char *encoding, const char *reason,
3518                                                   const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3519                                                   Py_ssize_t startpos, Py_ssize_t endpos,
3520                                                   Py_ssize_t *newpos)
3521 {
3522     static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3523 
3524     PyObject *restuple;
3525     PyObject *resunicode;
3526 
3527     if (*errorHandler == NULL) {
3528         *errorHandler = PyCodec_LookupError(errors);
3529         if (*errorHandler == NULL)
3530             return NULL;
3531     }
3532 
3533     make_encode_exception(exceptionObject,
3534                           encoding, unicode, size, startpos, endpos, reason);
3535     if (*exceptionObject == NULL)
3536         return NULL;
3537 
3538     restuple = PyObject_CallFunctionObjArgs(
3539         *errorHandler, *exceptionObject, NULL);
3540     if (restuple == NULL)
3541         return NULL;
3542     if (!PyTuple_Check(restuple)) {
3543         PyErr_SetString(PyExc_TypeError, &argparse[4]);
3544         Py_DECREF(restuple);
3545         return NULL;
3546     }
3547     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3548                           &resunicode, newpos)) {
3549         Py_DECREF(restuple);
3550         return NULL;
3551     }
3552     if (*newpos<0)
3553         *newpos = size+*newpos;
3554     if (*newpos<0 || *newpos>size) {
3555         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3556         Py_DECREF(restuple);
3557         return NULL;
3558     }
3559     Py_INCREF(resunicode);
3560     Py_DECREF(restuple);
3561     return resunicode;
3562 }
3563 
unicode_encode_ucs1(const Py_UNICODE * p,Py_ssize_t size,const char * errors,int limit)3564 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3565                                      Py_ssize_t size,
3566                                      const char *errors,
3567                                      int limit)
3568 {
3569     /* output object */
3570     PyObject *res;
3571     /* pointers to the beginning and end+1 of input */
3572     const Py_UNICODE *startp = p;
3573     const Py_UNICODE *endp = p + size;
3574     /* pointer to the beginning of the unencodable characters */
3575     /* const Py_UNICODE *badp = NULL; */
3576     /* pointer into the output */
3577     char *str;
3578     /* current output position */
3579     Py_ssize_t respos = 0;
3580     Py_ssize_t ressize;
3581     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3582     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3583     PyObject *errorHandler = NULL;
3584     PyObject *exc = NULL;
3585     /* the following variable is used for caching string comparisons
3586      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3587     int known_errorHandler = -1;
3588 
3589     /* allocate enough for a simple encoding without
3590        replacements, if we need more, we'll resize */
3591     res = PyString_FromStringAndSize(NULL, size);
3592     if (res == NULL)
3593         goto onError;
3594     if (size == 0)
3595         return res;
3596     str = PyString_AS_STRING(res);
3597     ressize = size;
3598 
3599     while (p<endp) {
3600         Py_UNICODE c = *p;
3601 
3602         /* can we encode this? */
3603         if (c<limit) {
3604             /* no overflow check, because we know that the space is enough */
3605             *str++ = (char)c;
3606             ++p;
3607         }
3608         else {
3609             Py_ssize_t unicodepos = p-startp;
3610             Py_ssize_t requiredsize;
3611             PyObject *repunicode;
3612             Py_ssize_t repsize;
3613             Py_ssize_t newpos;
3614             Py_ssize_t respos;
3615             Py_UNICODE *uni2;
3616             /* startpos for collecting unencodable chars */
3617             const Py_UNICODE *collstart = p;
3618             const Py_UNICODE *collend = p;
3619             /* find all unecodable characters */
3620             while ((collend < endp) && ((*collend)>=limit))
3621                 ++collend;
3622             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3623             if (known_errorHandler==-1) {
3624                 if ((errors==NULL) || (!strcmp(errors, "strict")))
3625                     known_errorHandler = 1;
3626                 else if (!strcmp(errors, "replace"))
3627                     known_errorHandler = 2;
3628                 else if (!strcmp(errors, "ignore"))
3629                     known_errorHandler = 3;
3630                 else if (!strcmp(errors, "xmlcharrefreplace"))
3631                     known_errorHandler = 4;
3632                 else
3633                     known_errorHandler = 0;
3634             }
3635             switch (known_errorHandler) {
3636             case 1: /* strict */
3637                 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3638                 goto onError;
3639             case 2: /* replace */
3640                 while (collstart++<collend)
3641                     *str++ = '?'; /* fall through */
3642             case 3: /* ignore */
3643                 p = collend;
3644                 break;
3645             case 4: /* xmlcharrefreplace */
3646                 respos = str-PyString_AS_STRING(res);
3647                 /* determine replacement size (temporarily (mis)uses p) */
3648                 for (p = collstart, repsize = 0; p < collend; ++p) {
3649                     if (*p<10)
3650                         repsize += 2+1+1;
3651                     else if (*p<100)
3652                         repsize += 2+2+1;
3653                     else if (*p<1000)
3654                         repsize += 2+3+1;
3655                     else if (*p<10000)
3656                         repsize += 2+4+1;
3657 #ifndef Py_UNICODE_WIDE
3658                     else
3659                         repsize += 2+5+1;
3660 #else
3661                     else if (*p<100000)
3662                         repsize += 2+5+1;
3663                     else if (*p<1000000)
3664                         repsize += 2+6+1;
3665                     else
3666                         repsize += 2+7+1;
3667 #endif
3668                 }
3669                 requiredsize = respos+repsize+(endp-collend);
3670                 if (requiredsize > ressize) {
3671                     if (requiredsize<2*ressize)
3672                         requiredsize = 2*ressize;
3673                     if (_PyString_Resize(&res, requiredsize))
3674                         goto onError;
3675                     str = PyString_AS_STRING(res) + respos;
3676                     ressize = requiredsize;
3677                 }
3678                 /* generate replacement (temporarily (mis)uses p) */
3679                 for (p = collstart; p < collend; ++p) {
3680                     str += sprintf(str, "&#%d;", (int)*p);
3681                 }
3682                 p = collend;
3683                 break;
3684             default:
3685                 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3686                                                               encoding, reason, startp, size, &exc,
3687                                                               collstart-startp, collend-startp, &newpos);
3688                 if (repunicode == NULL)
3689                     goto onError;
3690                 /* need more space? (at least enough for what we have+the
3691                    replacement+the rest of the string, so we won't have to
3692                    check space for encodable characters) */
3693                 respos = str-PyString_AS_STRING(res);
3694                 repsize = PyUnicode_GET_SIZE(repunicode);
3695                 requiredsize = respos+repsize+(endp-collend);
3696                 if (requiredsize > ressize) {
3697                     if (requiredsize<2*ressize)
3698                         requiredsize = 2*ressize;
3699                     if (_PyString_Resize(&res, requiredsize)) {
3700                         Py_DECREF(repunicode);
3701                         goto onError;
3702                     }
3703                     str = PyString_AS_STRING(res) + respos;
3704                     ressize = requiredsize;
3705                 }
3706                 /* check if there is anything unencodable in the replacement
3707                    and copy it to the output */
3708                 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3709                     c = *uni2;
3710                     if (c >= limit) {
3711                         raise_encode_exception(&exc, encoding, startp, size,
3712                                                unicodepos, unicodepos+1, reason);
3713                         Py_DECREF(repunicode);
3714                         goto onError;
3715                     }
3716                     *str = (char)c;
3717                 }
3718                 p = startp + newpos;
3719                 Py_DECREF(repunicode);
3720             }
3721         }
3722     }
3723     /* Resize if we allocated to much */
3724     respos = str-PyString_AS_STRING(res);
3725     if (respos<ressize)
3726         /* If this falls res will be NULL */
3727         _PyString_Resize(&res, respos);
3728     Py_XDECREF(errorHandler);
3729     Py_XDECREF(exc);
3730     return res;
3731 
3732   onError:
3733     Py_XDECREF(res);
3734     Py_XDECREF(errorHandler);
3735     Py_XDECREF(exc);
3736     return NULL;
3737 }
3738 
PyUnicode_EncodeLatin1(const Py_UNICODE * p,Py_ssize_t size,const char * errors)3739 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3740                                  Py_ssize_t size,
3741                                  const char *errors)
3742 {
3743     return unicode_encode_ucs1(p, size, errors, 256);
3744 }
3745 
/* Encode a unicode object as Latin-1 with the default error handling
   (errors == NULL).  Reports a bad argument and returns NULL when
   `unicode` is not a unicode object. */
PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
{
    const Py_UNICODE *data;
    Py_ssize_t length;

    if (PyUnicode_Check(unicode)) {
        data = PyUnicode_AS_UNICODE(unicode);
        length = PyUnicode_GET_SIZE(unicode);
        return PyUnicode_EncodeLatin1(data, length, NULL);
    }

    PyErr_BadArgument();
    return NULL;
}
3756 
3757 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3758 
PyUnicode_DecodeASCII(const char * s,Py_ssize_t size,const char * errors)3759 PyObject *PyUnicode_DecodeASCII(const char *s,
3760                                 Py_ssize_t size,
3761                                 const char *errors)
3762 {
3763     const char *starts = s;
3764     PyUnicodeObject *v;
3765     Py_UNICODE *p;
3766     Py_ssize_t startinpos;
3767     Py_ssize_t endinpos;
3768     Py_ssize_t outpos;
3769     const char *e;
3770     PyObject *errorHandler = NULL;
3771     PyObject *exc = NULL;
3772 
3773     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3774     if (size == 1 && *(unsigned char*)s < 128) {
3775         Py_UNICODE r = *(unsigned char*)s;
3776         return PyUnicode_FromUnicode(&r, 1);
3777     }
3778 
3779     v = _PyUnicode_New(size);
3780     if (v == NULL)
3781         goto onError;
3782     if (size == 0)
3783         return (PyObject *)v;
3784     p = PyUnicode_AS_UNICODE(v);
3785     e = s + size;
3786     while (s < e) {
3787         register unsigned char c = (unsigned char)*s;
3788         if (c < 128) {
3789             *p++ = c;
3790             ++s;
3791         }
3792         else {
3793             startinpos = s-starts;
3794             endinpos = startinpos + 1;
3795             outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3796             if (unicode_decode_call_errorhandler(
3797                     errors, &errorHandler,
3798                     "ascii", "ordinal not in range(128)",
3799                     starts, size, &startinpos, &endinpos, &exc, &s,
3800                     &v, &outpos, &p))
3801                 goto onError;
3802         }
3803     }
3804     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3805         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3806             goto onError;
3807     Py_XDECREF(errorHandler);
3808     Py_XDECREF(exc);
3809     return (PyObject *)v;
3810 
3811   onError:
3812     Py_XDECREF(v);
3813     Py_XDECREF(errorHandler);
3814     Py_XDECREF(exc);
3815     return NULL;
3816 }
3817 
PyUnicode_EncodeASCII(const Py_UNICODE * p,Py_ssize_t size,const char * errors)3818 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3819                                 Py_ssize_t size,
3820                                 const char *errors)
3821 {
3822     return unicode_encode_ucs1(p, size, errors, 128);
3823 }
3824 
/* Encode a unicode object as ASCII with the default error handling
   (errors == NULL).  Reports a bad argument and returns NULL when
   `unicode` is not a unicode object. */
PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
{
    const Py_UNICODE *data;
    Py_ssize_t length;

    if (PyUnicode_Check(unicode)) {
        data = PyUnicode_AS_UNICODE(unicode);
        length = PyUnicode_GET_SIZE(unicode);
        return PyUnicode_EncodeASCII(data, length, NULL);
    }

    PyErr_BadArgument();
    return NULL;
}
3835 
3836 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3837 
3838 /* --- MBCS codecs for Windows -------------------------------------------- */
3839 
3840 #if SIZEOF_INT < SIZEOF_SIZE_T
3841 #define NEED_RETRY
3842 #endif
3843 
3844 /* XXX This code is limited to "true" double-byte encodings, as
3845    a) it assumes an incomplete character consists of a single byte, and
3846    b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3847    encodings, see IsDBCSLeadByteEx documentation. */
3848 
/* Return 1 if the byte at s[offset] acts as a DBCS lead byte, 0 otherwise.
   A byte that IsDBCSLeadByte() accepts is only counted when the character
   before it (found with CharPrev) does not swallow it as a trail byte. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    /* CharPrev did not move: nothing precedes curr */
    if (prev == curr)
        return 1;
    /* the preceding character does not start with a lead byte */
    if (!IsDBCSLeadByte(*prev))
        return 1;
    /* the preceding character is a complete two-byte character */
    return curr - prev == 2;
}
3859 
3860 /*
3861  * Decode MBCS string into unicode object. If 'final' is set, converts
3862  * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3863  */
decode_mbcs(PyUnicodeObject ** v,const char * s,int size,int final)3864 static int decode_mbcs(PyUnicodeObject **v,
3865                        const char *s, /* MBCS string */
3866                        int size, /* sizeof MBCS string */
3867                        int final)
3868 {
3869     Py_UNICODE *p;
3870     Py_ssize_t n = 0;
3871     int usize = 0;
3872 
3873     assert(size >= 0);
3874 
3875     /* Skip trailing lead-byte unless 'final' is set */
3876     if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3877         --size;
3878 
3879     /* First get the size of the result */
3880     if (size > 0) {
3881         usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3882         if (usize == 0) {
3883             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3884             return -1;
3885         }
3886     }
3887 
3888     if (*v == NULL) {
3889         /* Create unicode object */
3890         *v = _PyUnicode_New(usize);
3891         if (*v == NULL)
3892             return -1;
3893     }
3894     else {
3895         /* Extend unicode object */
3896         n = PyUnicode_GET_SIZE(*v);
3897         if (_PyUnicode_Resize(v, n + usize) < 0)
3898             return -1;
3899     }
3900 
3901     /* Do the conversion */
3902     if (size > 0) {
3903         p = PyUnicode_AS_UNICODE(*v) + n;
3904         if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3905             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3906             return -1;
3907         }
3908     }
3909 
3910     return size;
3911 }
3912 
PyUnicode_DecodeMBCSStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)3913 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3914                                        Py_ssize_t size,
3915                                        const char *errors,
3916                                        Py_ssize_t *consumed)
3917 {
3918     PyUnicodeObject *v = NULL;
3919     int done;
3920 
3921     if (consumed)
3922         *consumed = 0;
3923 
3924 #ifdef NEED_RETRY
3925   retry:
3926     if (size > INT_MAX)
3927         done = decode_mbcs(&v, s, INT_MAX, 0);
3928     else
3929 #endif
3930         done = decode_mbcs(&v, s, (int)size, !consumed);
3931 
3932     if (done < 0) {
3933         Py_XDECREF(v);
3934         return NULL;
3935     }
3936 
3937     if (consumed)
3938         *consumed += done;
3939 
3940 #ifdef NEED_RETRY
3941     if (size > INT_MAX) {
3942         s += done;
3943         size -= done;
3944         goto retry;
3945     }
3946 #endif
3947 
3948     return (PyObject *)v;
3949 }
3950 
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)3951 PyObject *PyUnicode_DecodeMBCS(const char *s,
3952                                Py_ssize_t size,
3953                                const char *errors)
3954 {
3955     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3956 }
3957 
3958 /*
3959  * Convert unicode into string object (MBCS).
3960  * Returns 0 if succeed, -1 otherwise.
3961  */
encode_mbcs(PyObject ** repr,const Py_UNICODE * p,int size)3962 static int encode_mbcs(PyObject **repr,
3963                        const Py_UNICODE *p, /* unicode */
3964                        int size) /* size of unicode */
3965 {
3966     int mbcssize = 0;
3967     Py_ssize_t n = 0;
3968 
3969     assert(size >= 0);
3970 
3971     /* First get the size of the result */
3972     if (size > 0) {
3973         mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3974         if (mbcssize == 0) {
3975             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3976             return -1;
3977         }
3978     }
3979 
3980     if (*repr == NULL) {
3981         /* Create string object */
3982         *repr = PyString_FromStringAndSize(NULL, mbcssize);
3983         if (*repr == NULL)
3984             return -1;
3985     }
3986     else {
3987         /* Extend string object */
3988         n = PyString_Size(*repr);
3989         if (_PyString_Resize(repr, n + mbcssize) < 0)
3990             return -1;
3991     }
3992 
3993     /* Do the conversion */
3994     if (size > 0) {
3995         char *s = PyString_AS_STRING(*repr) + n;
3996         if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3997             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3998             return -1;
3999         }
4000     }
4001 
4002     return 0;
4003 }
4004 
PyUnicode_EncodeMBCS(const Py_UNICODE * p,Py_ssize_t size,const char * errors)4005 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4006                                Py_ssize_t size,
4007                                const char *errors)
4008 {
4009     PyObject *repr = NULL;
4010     int ret;
4011 
4012 #ifdef NEED_RETRY
4013   retry:
4014     if (size > INT_MAX)
4015         ret = encode_mbcs(&repr, p, INT_MAX);
4016     else
4017 #endif
4018         ret = encode_mbcs(&repr, p, (int)size);
4019 
4020     if (ret < 0) {
4021         Py_XDECREF(repr);
4022         return NULL;
4023     }
4024 
4025 #ifdef NEED_RETRY
4026     if (size > INT_MAX) {
4027         p += INT_MAX;
4028         size -= INT_MAX;
4029         goto retry;
4030     }
4031 #endif
4032 
4033     return repr;
4034 }
4035 
/* Encode a unicode object as MBCS, passing errors == NULL.  Reports a bad
   argument and returns NULL when `unicode` is not a unicode object. */
PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
{
    const Py_UNICODE *data;
    Py_ssize_t length;

    if (PyUnicode_Check(unicode)) {
        data = PyUnicode_AS_UNICODE(unicode);
        length = PyUnicode_GET_SIZE(unicode);
        return PyUnicode_EncodeMBCS(data, length, NULL);
    }

    PyErr_BadArgument();
    return NULL;
}
4046 
4047 #undef NEED_RETRY
4048 
4049 #endif /* MS_WINDOWS */
4050 
4051 /* --- Character Mapping Codec -------------------------------------------- */
4052 
/* Decode `size` bytes at `s` through a character mapping.

   `mapping` may be
     - NULL: the input is decoded as Latin-1;
     - an exact unicode object: byte b maps to mapping[b]; a byte past the
       end of the string, or one that maps to 0xFFFE, is undefined;
     - any other object: it is indexed with the byte value and must yield
       an integer in range(65536), a unicode string (of any length,
       including empty) or None; None and a LookupError both mean
       "undefined".
   Undefined bytes are reported to the error handler named by `errors`.

   Returns a new unicode object, or NULL with an exception set. */
PyObject *PyUnicode_DecodeCharmap(const char *s,
                                  Py_ssize_t size,
                                  PyObject *mapping,
                                  const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UNICODE *mapstring = NULL;
    Py_ssize_t maplen = 0;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_DecodeLatin1(s, size, errors);

    /* Start with one output slot per input byte. */
    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    if (PyUnicode_CheckExact(mapping)) {
        /* Fast path: the mapping is a plain decoding table. */
        mapstring = PyUnicode_AS_UNICODE(mapping);
        maplen = PyUnicode_GET_SIZE(mapping);
        while (s < e) {
            unsigned char ch = *s;
            Py_UNICODE x = 0xfffe; /* illegal value */

            if (ch < maplen)
                x = mapstring[ch];

            if (x == 0xfffe) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        starts, size, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p)) {
                    goto onError;
                }
                continue;
            }
            *p++ = x;
            ++s;
        }
    }
    else {
        while (s < e) {
            unsigned char ch = *s;
            PyObject *w, *x;

            /* Get mapping (char ordinal -> integer, Unicode char or None) */
            w = PyInt_FromLong((long)ch);
            if (w == NULL)
                goto onError;
            x = PyObject_GetItem(mapping, w);
            Py_DECREF(w);
            if (x == NULL) {
                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                    /* No mapping found means: mapping is undefined. */
                    PyErr_Clear();
                    x = Py_None;
                    Py_INCREF(x);
                } else
                    goto onError;
            }

            /* Apply mapping */
            if (PyInt_Check(x)) {
                long value = PyInt_AS_LONG(x);
                if (value < 0 || value > 65535) {
                    PyErr_SetString(PyExc_TypeError,
                                    "character mapping must be in range(65536)");
                    Py_DECREF(x);
                    goto onError;
                }
                *p++ = (Py_UNICODE)value;
            }
            else if (x == Py_None) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        starts, size, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p)) {
                    Py_DECREF(x);
                    goto onError;
                }
                Py_DECREF(x);
                continue;
            }
            else if (PyUnicode_Check(x)) {
                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);

                if (targetsize == 1)
                    /* 1-1 mapping */
                    *p++ = *PyUnicode_AS_UNICODE(x);

                else if (targetsize > 1) {
                    /* 1-n mapping */
                    Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                    Py_ssize_t cursize = PyUnicode_GET_SIZE(v);
                    /* Spare output slots beyond the one reserved for each
                       input byte still to be processed (this one included).
                       Derived from the real buffer state on every use: the
                       error handler above is given &v, &outpos and &p and
                       presumably resizes/fills the buffer itself, which a
                       running counter kept here would not notice. */
                    Py_ssize_t extrachars = (cursize - oldpos) - (e - s);

                    if (targetsize > extrachars) {
                        /* resize first: cover the shortfall and add room
                           for four more replacements of this length */
                        Py_ssize_t shortfall = targetsize - extrachars;
                        Py_ssize_t needed;

                        if (targetsize > (PY_SSIZE_T_MAX - shortfall) / 4 ||
                            shortfall + (targetsize << 2) >
                                PY_SSIZE_T_MAX - cursize) {
                            /* the new length would overflow Py_ssize_t */
                            PyErr_NoMemory();
                            Py_DECREF(x);
                            goto onError;
                        }
                        needed = shortfall + (targetsize << 2);
                        if (_PyUnicode_Resize(&v, cursize + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    Py_UNICODE_COPY(p,
                                    PyUnicode_AS_UNICODE(x),
                                    targetsize);
                    p += targetsize;
                }
                /* 1-0 mapping: skip the character */
            }
            else {
                /* wrong return value */
                PyErr_SetString(PyExc_TypeError,
                                "character mapping must return integer, None or unicode");
                Py_DECREF(x);
                goto onError;
            }
            Py_DECREF(x);
            ++s;
        }
    }
    /* Trim the result if fewer characters were written than allocated. */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
            goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)v;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
4213 
4214 /* Charmap encoding: the lookup table */
4215 
4216 struct encoding_map{
4217     PyObject_HEAD
4218     unsigned char level1[32];
4219     int count2, count3;
4220     unsigned char level23[1];
4221 };
4222 
4223 static PyObject*
encoding_map_size(PyObject * obj,PyObject * args)4224 encoding_map_size(PyObject *obj, PyObject* args)
4225 {
4226     struct encoding_map *map = (struct encoding_map*)obj;
4227     return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4228                           128*map->count3);
4229 }
4230 
4231 static PyMethodDef encoding_map_methods[] = {
4232     {"size", encoding_map_size, METH_NOARGS,
4233      PyDoc_STR("Return the size (in bytes) of this object") },
4234     { 0 }
4235 };
4236 
/* tp_dealloc slot of the EncodingMap type: hands the object's memory
   block to PyObject_FREE.  Nothing else is released here. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
4242 
/* Type object for the charmap encoding lookup table ("EncodingMap").
   Only the deallocator and the method table are filled in; every other
   slot, including tp_new and tp_itemsize, is left 0.  The variable-sized
   tail of struct encoding_map is therefore not described by the type
   itself. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_compare*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
4286 
4287 PyObject*
PyUnicode_BuildEncodingMap(PyObject * string)4288 PyUnicode_BuildEncodingMap(PyObject* string)
4289 {
4290     Py_UNICODE *decode;
4291     PyObject *result;
4292     struct encoding_map *mresult;
4293     int i;
4294     int need_dict = 0;
4295     unsigned char level1[32];
4296     unsigned char level2[512];
4297     unsigned char *mlevel1, *mlevel2, *mlevel3;
4298     int count2 = 0, count3 = 0;
4299 
4300     if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4301         PyErr_BadArgument();
4302         return NULL;
4303     }
4304     decode = PyUnicode_AS_UNICODE(string);
4305     memset(level1, 0xFF, sizeof level1);
4306     memset(level2, 0xFF, sizeof level2);
4307 
4308     /* If there isn't a one-to-one mapping of NULL to \0,
4309        or if there are non-BMP characters, we need to use
4310        a mapping dictionary. */
4311     if (decode[0] != 0)
4312         need_dict = 1;
4313     for (i = 1; i < 256; i++) {
4314         int l1, l2;
4315         if (decode[i] == 0
4316 #ifdef Py_UNICODE_WIDE
4317             || decode[i] > 0xFFFF
4318 #endif
4319             ) {
4320             need_dict = 1;
4321             break;
4322         }
4323         if (decode[i] == 0xFFFE)
4324             /* unmapped character */
4325             continue;
4326         l1 = decode[i] >> 11;
4327         l2 = decode[i] >> 7;
4328         if (level1[l1] == 0xFF)
4329             level1[l1] = count2++;
4330         if (level2[l2] == 0xFF)
4331             level2[l2] = count3++;
4332     }
4333 
4334     if (count2 >= 0xFF || count3 >= 0xFF)
4335         need_dict = 1;
4336 
4337     if (need_dict) {
4338         PyObject *result = PyDict_New();
4339         PyObject *key, *value;
4340         if (!result)
4341             return NULL;
4342         for (i = 0; i < 256; i++) {
4343             value = NULL;
4344             key = PyInt_FromLong(decode[i]);
4345             value = PyInt_FromLong(i);
4346             if (!key || !value)
4347                 goto failed1;
4348             if (PyDict_SetItem(result, key, value) == -1)
4349                 goto failed1;
4350             Py_DECREF(key);
4351             Py_DECREF(value);
4352         }
4353         return result;
4354       failed1:
4355         Py_XDECREF(key);
4356         Py_XDECREF(value);
4357         Py_DECREF(result);
4358         return NULL;
4359     }
4360 
4361     /* Create a three-level trie */
4362     result = PyObject_MALLOC(sizeof(struct encoding_map) +
4363                              16*count2 + 128*count3 - 1);
4364     if (!result)
4365         return PyErr_NoMemory();
4366     PyObject_Init(result, &EncodingMapType);
4367     mresult = (struct encoding_map*)result;
4368     mresult->count2 = count2;
4369     mresult->count3 = count3;
4370     mlevel1 = mresult->level1;
4371     mlevel2 = mresult->level23;
4372     mlevel3 = mresult->level23 + 16*count2;
4373     memcpy(mlevel1, level1, 32);
4374     memset(mlevel2, 0xFF, 16*count2);
4375     memset(mlevel3, 0, 128*count3);
4376     count3 = 0;
4377     for (i = 1; i < 256; i++) {
4378         int o1, o2, o3, i2, i3;
4379         if (decode[i] == 0xFFFE)
4380             /* unmapped character */
4381             continue;
4382         o1 = decode[i]>>11;
4383         o2 = (decode[i]>>7) & 0xF;
4384         i2 = 16*mlevel1[o1] + o2;
4385         if (mlevel2[i2] == 0xFF)
4386             mlevel2[i2] = count3++;
4387         o3 = decode[i] & 0x7F;
4388         i3 = 128*mlevel2[i2] + o3;
4389         mlevel3[i3] = i;
4390     }
4391     return result;
4392 }
4393 
4394 static int
encoding_map_lookup(Py_UNICODE c,PyObject * mapping)4395 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4396 {
4397     struct encoding_map *map = (struct encoding_map*)mapping;
4398     int l1 = c>>11;
4399     int l2 = (c>>7) & 0xF;
4400     int l3 = c & 0x7F;
4401     int i;
4402 
4403 #ifdef Py_UNICODE_WIDE
4404     if (c > 0xFFFF) {
4405         return -1;
4406     }
4407 #endif
4408     if (c == 0)
4409         return 0;
4410     /* level 1*/
4411     i = map->level1[l1];
4412     if (i == 0xFF) {
4413         return -1;
4414     }
4415     /* level 2*/
4416     i = map->level23[16*i+l2];
4417     if (i == 0xFF) {
4418         return -1;
4419     }
4420     /* level 3 */
4421     i = map->level23[16*map->count2 + 128*i + l3];
4422     if (i == 0) {
4423         return -1;
4424     }
4425     return i;
4426 }
4427 
4428 /* Lookup the character ch in the mapping. If the character
4429    can't be found, Py_None is returned (or NULL, if another
4430    error occurred). */
charmapencode_lookup(Py_UNICODE c,PyObject * mapping)4431 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4432 {
4433     PyObject *w = PyInt_FromLong((long)c);
4434     PyObject *x;
4435 
4436     if (w == NULL)
4437         return NULL;
4438     x = PyObject_GetItem(mapping, w);
4439     Py_DECREF(w);
4440     if (x == NULL) {
4441         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4442             /* No mapping found means: mapping is undefined. */
4443             PyErr_Clear();
4444             x = Py_None;
4445             Py_INCREF(x);
4446             return x;
4447         } else
4448             return NULL;
4449     }
4450     else if (x == Py_None)
4451         return x;
4452     else if (PyInt_Check(x)) {
4453         long value = PyInt_AS_LONG(x);
4454         if (value < 0 || value > 255) {
4455             PyErr_SetString(PyExc_TypeError,
4456                             "character mapping must be in range(256)");
4457             Py_DECREF(x);
4458             return NULL;
4459         }
4460         return x;
4461     }
4462     else if (PyString_Check(x))
4463         return x;
4464     else {
4465         /* wrong return value */
4466         PyErr_SetString(PyExc_TypeError,
4467                         "character mapping must return integer, None or str");
4468         Py_DECREF(x);
4469         return NULL;
4470     }
4471 }
4472 
4473 static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)4474 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4475 {
4476     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4477     /* exponentially overallocate to minimize reallocations */
4478     if (requiredsize < 2*outsize)
4479         requiredsize = 2*outsize;
4480     if (_PyString_Resize(outobj, requiredsize)) {
4481         return 0;
4482     }
4483     return 1;
4484 }
4485 
/* Result codes for the charmap encoder's per-character output step. */
typedef enum charmapencode_result {
    enc_SUCCESS,
    enc_FAILED,
    enc_EXCEPTION
} charmapencode_result;
4489 /* lookup the character, put the result in the output string and adjust
4490    various state variables. Reallocate the output string if not enough
4491    space is available. Return a new reference to the object that
4492    was put in the output buffer, or Py_None, if the mapping was undefined
4493    (in which case no character was written) or NULL, if a
4494    reallocation error occurred. The caller must decref the result */
4495 static
charmapencode_output(Py_UNICODE c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)4496 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4497                                           PyObject **outobj, Py_ssize_t *outpos)
4498 {
4499     PyObject *rep;
4500     char *outstart;
4501     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4502 
4503     if (Py_TYPE(mapping) == &EncodingMapType) {
4504         int res = encoding_map_lookup(c, mapping);
4505         Py_ssize_t requiredsize = *outpos+1;
4506         if (res == -1)
4507             return enc_FAILED;
4508         if (outsize<requiredsize)
4509             if (!charmapencode_resize(outobj, outpos, requiredsize))
4510                 return enc_EXCEPTION;
4511         outstart = PyString_AS_STRING(*outobj);
4512         outstart[(*outpos)++] = (char)res;
4513         return enc_SUCCESS;
4514     }
4515 
4516     rep = charmapencode_lookup(c, mapping);
4517     if (rep==NULL)
4518         return enc_EXCEPTION;
4519     else if (rep==Py_None) {
4520         Py_DECREF(rep);
4521         return enc_FAILED;
4522     } else {
4523         if (PyInt_Check(rep)) {
4524             Py_ssize_t requiredsize = *outpos+1;
4525             if (outsize<requiredsize)
4526                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4527                     Py_DECREF(rep);
4528                     return enc_EXCEPTION;
4529                 }
4530             outstart = PyString_AS_STRING(*outobj);
4531             outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4532         }
4533         else {
4534             const char *repchars = PyString_AS_STRING(rep);
4535             Py_ssize_t repsize = PyString_GET_SIZE(rep);
4536             Py_ssize_t requiredsize = *outpos+repsize;
4537             if (outsize<requiredsize)
4538                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4539                     Py_DECREF(rep);
4540                     return enc_EXCEPTION;
4541                 }
4542             outstart = PyString_AS_STRING(*outobj);
4543             memcpy(outstart + *outpos, repchars, repsize);
4544             *outpos += repsize;
4545         }
4546     }
4547     Py_DECREF(rep);
4548     return enc_SUCCESS;
4549 }
4550 
4551 /* handle an error in PyUnicode_EncodeCharmap
4552    Return 0 on success, -1 on error */
4553 static
charmap_encoding_error(const Py_UNICODE * p,Py_ssize_t size,Py_ssize_t * inpos,PyObject * mapping,PyObject ** exceptionObject,int * known_errorHandler,PyObject ** errorHandler,const char * errors,PyObject ** res,Py_ssize_t * respos)4554 int charmap_encoding_error(
4555     const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4556     PyObject **exceptionObject,
4557     int *known_errorHandler, PyObject **errorHandler, const char *errors,
4558     PyObject **res, Py_ssize_t *respos)
4559 {
4560     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4561     Py_ssize_t repsize;
4562     Py_ssize_t newpos;
4563     Py_UNICODE *uni2;
4564     /* startpos for collecting unencodable chars */
4565     Py_ssize_t collstartpos = *inpos;
4566     Py_ssize_t collendpos = *inpos+1;
4567     Py_ssize_t collpos;
4568     char *encoding = "charmap";
4569     char *reason = "character maps to <undefined>";
4570     charmapencode_result x;
4571 
4572     /* find all unencodable characters */
4573     while (collendpos < size) {
4574         PyObject *rep;
4575         if (Py_TYPE(mapping) == &EncodingMapType) {
4576             int res = encoding_map_lookup(p[collendpos], mapping);
4577             if (res != -1)
4578                 break;
4579             ++collendpos;
4580             continue;
4581         }
4582 
4583         rep = charmapencode_lookup(p[collendpos], mapping);
4584         if (rep==NULL)
4585             return -1;
4586         else if (rep!=Py_None) {
4587             Py_DECREF(rep);
4588             break;
4589         }
4590         Py_DECREF(rep);
4591         ++collendpos;
4592     }
4593     /* cache callback name lookup
4594      * (if not done yet, i.e. it's the first error) */
4595     if (*known_errorHandler==-1) {
4596         if ((errors==NULL) || (!strcmp(errors, "strict")))
4597             *known_errorHandler = 1;
4598         else if (!strcmp(errors, "replace"))
4599             *known_errorHandler = 2;
4600         else if (!strcmp(errors, "ignore"))
4601             *known_errorHandler = 3;
4602         else if (!strcmp(errors, "xmlcharrefreplace"))
4603             *known_errorHandler = 4;
4604         else
4605             *known_errorHandler = 0;
4606     }
4607     switch (*known_errorHandler) {
4608     case 1: /* strict */
4609         raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4610         return -1;
4611     case 2: /* replace */
4612         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4613             x = charmapencode_output('?', mapping, res, respos);
4614             if (x==enc_EXCEPTION) {
4615                 return -1;
4616             }
4617             else if (x==enc_FAILED) {
4618                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4619                 return -1;
4620             }
4621         }
4622         /* fall through */
4623     case 3: /* ignore */
4624         *inpos = collendpos;
4625         break;
4626     case 4: /* xmlcharrefreplace */
4627         /* generate replacement (temporarily (mis)uses p) */
4628         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4629             char buffer[2+29+1+1];
4630             char *cp;
4631             sprintf(buffer, "&#%d;", (int)p[collpos]);
4632             for (cp = buffer; *cp; ++cp) {
4633                 x = charmapencode_output(*cp, mapping, res, respos);
4634                 if (x==enc_EXCEPTION)
4635                     return -1;
4636                 else if (x==enc_FAILED) {
4637                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4638                     return -1;
4639                 }
4640             }
4641         }
4642         *inpos = collendpos;
4643         break;
4644     default:
4645         repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4646                                                       encoding, reason, p, size, exceptionObject,
4647                                                       collstartpos, collendpos, &newpos);
4648         if (repunicode == NULL)
4649             return -1;
4650         /* generate replacement  */
4651         repsize = PyUnicode_GET_SIZE(repunicode);
4652         for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4653             x = charmapencode_output(*uni2, mapping, res, respos);
4654             if (x==enc_EXCEPTION) {
4655                 return -1;
4656             }
4657             else if (x==enc_FAILED) {
4658                 Py_DECREF(repunicode);
4659                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4660                 return -1;
4661             }
4662         }
4663         *inpos = newpos;
4664         Py_DECREF(repunicode);
4665     }
4666     return 0;
4667 }
4668 
/* Encode the `size` characters at `p` into a byte string using `mapping`.

   With mapping == NULL the input is encoded as Latin-1 instead.
   Characters the mapping cannot encode are handled by
   charmap_encoding_error() according to `errors`.
   Returns a new string object, or NULL with an exception set. */
PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
                                  Py_ssize_t size,
                                  PyObject *mapping,
                                  const char *errors)
{
    PyObject *res = NULL;           /* output object */
    Py_ssize_t inpos = 0;           /* current input position */
    Py_ssize_t respos = 0;          /* current output position */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* cache for the decoded `errors` name:
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_EncodeLatin1(p, size, errors);

    /* Start with room for one byte per character; the output helpers
       grow the string when replacements need more. */
    res = PyString_FromStringAndSize(NULL, size);
    if (res == NULL)
        return NULL;
    if (size == 0)
        return res;

    while (inpos < size) {
        charmapencode_result outcome =
            charmapencode_output(p[inpos], mapping, &res, &respos);

        switch (outcome) {
        case enc_EXCEPTION:
            goto onError;
        case enc_FAILED:
            /* unencodable character: the handler advances inpos itself */
            if (charmap_encoding_error(p, size, &inpos, mapping,
                                       &exc,
                                       &known_errorHandler, &errorHandler, errors,
                                       &res, &respos))
                goto onError;
            break;
        default:
            /* done with this character => adjust input position */
            ++inpos;
            break;
        }
    }

    /* Trim the result if we allocated too much */
    if (respos < PyString_GET_SIZE(res) && _PyString_Resize(&res, respos))
        goto onError;
    goto done;

  onError:
    Py_XDECREF(res);
    res = NULL;
  done:
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return res;
}
4732 
/* Encode the unicode object `unicode` with `mapping`, passing NULL as the
   error mode to PyUnicode_EncodeCharmap().  Raises via PyErr_BadArgument()
   and returns NULL if `unicode` is not a unicode object or `mapping` is
   NULL. */
PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
                                    PyObject *mapping)
{
    if (PyUnicode_Check(unicode) && mapping != NULL) {
        return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
                                       PyUnicode_GET_SIZE(unicode),
                                       mapping,
                                       NULL);
    }
    PyErr_BadArgument();
    return NULL;
}
4745 
4746 /* create or adjust a UnicodeTranslateError */
make_translate_exception(PyObject ** exceptionObject,const Py_UNICODE * unicode,Py_ssize_t size,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4747 static void make_translate_exception(PyObject **exceptionObject,
4748                                      const Py_UNICODE *unicode, Py_ssize_t size,
4749                                      Py_ssize_t startpos, Py_ssize_t endpos,
4750                                      const char *reason)
4751 {
4752     if (*exceptionObject == NULL) {
4753         *exceptionObject = PyUnicodeTranslateError_Create(
4754             unicode, size, startpos, endpos, reason);
4755     }
4756     else {
4757         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4758             goto onError;
4759         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4760             goto onError;
4761         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4762             goto onError;
4763         return;
4764       onError:
4765         Py_DECREF(*exceptionObject);
4766         *exceptionObject = NULL;
4767     }
4768 }
4769 
4770 /* raises a UnicodeTranslateError */
raise_translate_exception(PyObject ** exceptionObject,const Py_UNICODE * unicode,Py_ssize_t size,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4771 static void raise_translate_exception(PyObject **exceptionObject,
4772                                       const Py_UNICODE *unicode, Py_ssize_t size,
4773                                       Py_ssize_t startpos, Py_ssize_t endpos,
4774                                       const char *reason)
4775 {
4776     make_translate_exception(exceptionObject,
4777                              unicode, size, startpos, endpos, reason);
4778     if (*exceptionObject != NULL)
4779         PyCodec_StrictErrors(*exceptionObject);
4780 }
4781 
4782 /* error handling callback helper:
4783    build arguments, call the callback and check the arguments,
4784    put the result into newpos and return the replacement string, which
4785    has to be freed by the caller */
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,const Py_UNICODE * unicode,Py_ssize_t size,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)4786 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4787                                                      PyObject **errorHandler,
4788                                                      const char *reason,
4789                                                      const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4790                                                      Py_ssize_t startpos, Py_ssize_t endpos,
4791                                                      Py_ssize_t *newpos)
4792 {
4793     static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4794 
4795     Py_ssize_t i_newpos;
4796     PyObject *restuple;
4797     PyObject *resunicode;
4798 
4799     if (*errorHandler == NULL) {
4800         *errorHandler = PyCodec_LookupError(errors);
4801         if (*errorHandler == NULL)
4802             return NULL;
4803     }
4804 
4805     make_translate_exception(exceptionObject,
4806                              unicode, size, startpos, endpos, reason);
4807     if (*exceptionObject == NULL)
4808         return NULL;
4809 
4810     restuple = PyObject_CallFunctionObjArgs(
4811         *errorHandler, *exceptionObject, NULL);
4812     if (restuple == NULL)
4813         return NULL;
4814     if (!PyTuple_Check(restuple)) {
4815         PyErr_SetString(PyExc_TypeError, &argparse[4]);
4816         Py_DECREF(restuple);
4817         return NULL;
4818     }
4819     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4820                           &resunicode, &i_newpos)) {
4821         Py_DECREF(restuple);
4822         return NULL;
4823     }
4824     if (i_newpos<0)
4825         *newpos = size+i_newpos;
4826     else
4827         *newpos = i_newpos;
4828     if (*newpos<0 || *newpos>size) {
4829         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4830         Py_DECREF(restuple);
4831         return NULL;
4832     }
4833     Py_INCREF(resunicode);
4834     Py_DECREF(restuple);
4835     return resunicode;
4836 }
4837 
4838 /* Lookup the character ch in the mapping and put the result in result,
4839    which must be decrefed by the caller.
4840    Return 0 on success, -1 on error */
4841 static
charmaptranslate_lookup(Py_UNICODE c,PyObject * mapping,PyObject ** result)4842 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4843 {
4844     PyObject *w = PyInt_FromLong((long)c);
4845     PyObject *x;
4846 
4847     if (w == NULL)
4848         return -1;
4849     x = PyObject_GetItem(mapping, w);
4850     Py_DECREF(w);
4851     if (x == NULL) {
4852         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4853             /* No mapping found means: use 1:1 mapping. */
4854             PyErr_Clear();
4855             *result = NULL;
4856             return 0;
4857         } else
4858             return -1;
4859     }
4860     else if (x == Py_None) {
4861         *result = x;
4862         return 0;
4863     }
4864     else if (PyInt_Check(x)) {
4865         long value = PyInt_AS_LONG(x);
4866         long max = PyUnicode_GetMax();
4867         if (value < 0 || value > max) {
4868             PyErr_Format(PyExc_TypeError,
4869                          "character mapping must be in range(0x%lx)", max+1);
4870             Py_DECREF(x);
4871             return -1;
4872         }
4873         *result = x;
4874         return 0;
4875     }
4876     else if (PyUnicode_Check(x)) {
4877         *result = x;
4878         return 0;
4879     }
4880     else {
4881         /* wrong return value */
4882         PyErr_SetString(PyExc_TypeError,
4883                         "character mapping must return integer, None or unicode");
4884         Py_DECREF(x);
4885         return -1;
4886     }
4887 }
4888 /* ensure that *outobj is at least requiredsize characters long,
4889    if not reallocate and adjust various state variables.
4890    Return 0 on success, -1 on error */
4891 static
charmaptranslate_makespace(PyObject ** outobj,Py_UNICODE ** outp,Py_ssize_t requiredsize)4892 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4893                                Py_ssize_t requiredsize)
4894 {
4895     Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4896     if (requiredsize > oldsize) {
4897         /* remember old output position */
4898         Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4899         /* exponentially overallocate to minimize reallocations */
4900         if (requiredsize < 2 * oldsize)
4901             requiredsize = 2 * oldsize;
4902         if (PyUnicode_Resize(outobj, requiredsize) < 0)
4903             return -1;
4904         *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4905     }
4906     return 0;
4907 }
4908 /* lookup the character, put the result in the output string and adjust
4909    various state variables. Return a new reference to the object that
4910    was put in the output buffer in *result, or Py_None, if the mapping was
4911    undefined (in which case no character was written).
4912    The called must decref result.
4913    Return 0 on success, -1 on error. */
4914 static
charmaptranslate_output(const Py_UNICODE * startinp,const Py_UNICODE * curinp,Py_ssize_t insize,PyObject * mapping,PyObject ** outobj,Py_UNICODE ** outp,PyObject ** res)4915 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4916                             Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4917                             PyObject **res)
4918 {
4919     if (charmaptranslate_lookup(*curinp, mapping, res))
4920         return -1;
4921     if (*res==NULL) {
4922         /* not found => default to 1:1 mapping */
4923         *(*outp)++ = *curinp;
4924     }
4925     else if (*res==Py_None)
4926         ;
4927     else if (PyInt_Check(*res)) {
4928         /* no overflow check, because we know that the space is enough */
4929         *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4930     }
4931     else if (PyUnicode_Check(*res)) {
4932         Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4933         if (repsize==1) {
4934             /* no overflow check, because we know that the space is enough */
4935             *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4936         }
4937         else if (repsize!=0) {
4938             /* more than one character */
4939             Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4940                 (insize - (curinp-startinp)) +
4941                 repsize - 1;
4942             if (charmaptranslate_makespace(outobj, outp, requiredsize))
4943                 return -1;
4944             memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4945             *outp += repsize;
4946         }
4947     }
4948     else
4949         return -1;
4950     return 0;
4951 }
4952 
/* Translate the `size` characters at `p` through `mapping`.

   Each character is looked up as mapping[int(ch)]: a missing entry copies
   the character unchanged, an int or unicode result is written in its
   place, and None marks the character as untranslatable.  Runs of
   untranslatable characters are handled according to `errors`
   (NULL/"strict", "replace", "ignore", "xmlcharrefreplace", or the name
   of a registered error handler).

   Raises via PyErr_BadArgument() if `mapping` is NULL.
   Returns a new unicode object, or NULL with an exception set. */
PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
                                     Py_ssize_t size,
                                     PyObject *mapping,
                                     const char *errors)
{
    /* output object */
    PyObject *res = NULL;
    /* pointers to the beginning and end+1 of input */
    const Py_UNICODE *startp = p;
    const Py_UNICODE *endp = p + size;
    /* pointer into the output */
    Py_UNICODE *str;
    /* current output position */
    Py_ssize_t respos = 0;
    char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    res = PyUnicode_FromUnicode(NULL, size);
    if (res == NULL)
        goto onError;
    if (size == 0)
        return res;
    str = PyUnicode_AS_UNICODE(res);

    /* Invariant at the top of the loop: the output buffer has room for at
       least one character per remaining input character, which is what
       lets the 1:1 writes skip the space check. */
    while (p<endp) {
        /* try to encode it */
        PyObject *x = NULL;
        if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
            Py_XDECREF(x);
            goto onError;
        }
        Py_XDECREF(x);
        /* x is only compared by identity from here on */
        if (x!=Py_None) /* it worked => adjust input pointer */
            ++p;
        else { /* untranslatable character */
            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
            Py_ssize_t repsize;
            Py_ssize_t newpos;
            Py_UNICODE *uni2;
            /* startpos for collecting untranslatable chars */
            const Py_UNICODE *collstart = p;
            const Py_UNICODE *collend = p+1;
            const Py_UNICODE *coll;

            /* find all untranslatable characters */
            while (collend < endp) {
                if (charmaptranslate_lookup(*collend, mapping, &x))
                    goto onError;
                Py_XDECREF(x);
                if (x!=Py_None)
                    break;
                ++collend;
            }
            /* cache callback name lookup
             * (if not done yet, i.e. it's the first error) */
            if (known_errorHandler==-1) {
                if ((errors==NULL) || (!strcmp(errors, "strict")))
                    known_errorHandler = 1;
                else if (!strcmp(errors, "replace"))
                    known_errorHandler = 2;
                else if (!strcmp(errors, "ignore"))
                    known_errorHandler = 3;
                else if (!strcmp(errors, "xmlcharrefreplace"))
                    known_errorHandler = 4;
                else
                    known_errorHandler = 0;
            }
            switch (known_errorHandler) {
            case 1: /* strict */
                raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
                goto onError;
            case 2: /* replace */
                /* No need to check for space, this is a 1:1 replacement */
                for (coll = collstart; coll<collend; ++coll)
                    *str++ = '?';
                /* fall through */
            case 3: /* ignore */
                p = collend;
                break;
            case 4: /* xmlcharrefreplace */
                /* generate replacement (temporarily (mis)uses p) */
                for (p = collstart; p < collend; ++p) {
                    char buffer[2+29+1+1];
                    char *cp;
                    sprintf(buffer, "&#%d;", (int)*p);
                    if (charmaptranslate_makespace(&res, &str,
                                                   (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
                        goto onError;
                    for (cp = buffer; *cp; ++cp)
                        *str++ = *cp;
                }
                p = collend;
                break;
            default:
                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
                                                                 reason, startp, size, &exc,
                                                                 collstart-startp, collend-startp, &newpos);
                if (repunicode == NULL)
                    goto onError;
                /* generate replacement  */
                repsize = PyUnicode_GET_SIZE(repunicode);
                /* Translation resumes at newpos, which the handler may
                   place before collend.  Reserve room for the input from
                   newpos onwards (newpos is validated to lie in
                   [0, size]) so the unchecked 1:1 writes that follow
                   cannot run past the end of the buffer. */
                if (charmaptranslate_makespace(&res, &str,
                                               (str-PyUnicode_AS_UNICODE(res))+repsize+(size-newpos))) {
                    Py_DECREF(repunicode);
                    goto onError;
                }
                for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
                    *str++ = *uni2;
                p = startp + newpos;
                Py_DECREF(repunicode);
            }
        }
    }
    /* Resize if we allocated too much */
    respos = str-PyUnicode_AS_UNICODE(res);
    if (respos<PyUnicode_GET_SIZE(res)) {
        if (PyUnicode_Resize(&res, respos) < 0)
            goto onError;
    }
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return res;

  onError:
    Py_XDECREF(res);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
5094 
/* Translate `str` through `mapping`.

   `str` is first converted with PyUnicode_FromObject(); the resulting
   unicode object is translated with PyUnicode_TranslateCharmap() using
   the error mode `errors`.
   Returns a new reference, or NULL with an exception set. */
PyObject *PyUnicode_Translate(PyObject *str,
                              PyObject *mapping,
                              const char *errors)
{
    PyObject *translated;
    PyObject *ustr = PyUnicode_FromObject(str);

    if (ustr == NULL)
        return NULL;

    translated = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(ustr),
                                            PyUnicode_GET_SIZE(ustr),
                                            mapping,
                                            errors);
    /* the temporary is ours whether or not the translation succeeded */
    Py_DECREF(ustr);
    return translated;
}
5115 
5116 /* --- Decimal Encoder ---------------------------------------------------- */
5117 
/* Encode a Py_UNICODE buffer of the given length into the char buffer
   output, writing a terminating '\0'.

   Per character:
     - whitespace (Py_UNICODE_ISSPACE) is written as ' ';
     - characters with a decimal value (Py_UNICODE_TODECIMAL >= 0) are
       written as the ASCII digit '0' + value;
     - other characters in the range 1..255 are copied unchanged;
     - everything else is unencodable and handled according to errors
       ("strict"/NULL, "replace", "ignore", "xmlcharrefreplace", or a
       registered error handler looked up by name).

   A run of consecutive unencodable characters is collected and reported
   to the error handling code as a single range [collstart, collend).

   No bounds checking is done on output; the caller must supply a buffer
   that is large enough (note that "xmlcharrefreplace" and custom
   handlers can emit more than one char per input character).

   Returns 0 on success, -1 with an exception set on failure. */
int PyUnicode_EncodeDecimal(Py_UNICODE *s,
                            Py_ssize_t length,
                            char *output,
                            const char *errors)
{
    Py_UNICODE *p, *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    const char *encoding = "decimal";
    const char *reason = "invalid decimal Unicode string";
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    if (output == NULL) {
        PyErr_BadArgument();
        return -1;
    }

    p = s;
    end = s + length;
    while (p < end) {
        register Py_UNICODE ch = *p;
        int decimal;
        PyObject *repunicode;
        Py_ssize_t repsize;
        Py_ssize_t newpos;
        Py_UNICODE *uni2;
        Py_UNICODE *collstart;
        Py_UNICODE *collend;

        if (Py_UNICODE_ISSPACE(ch)) {
            *output++ = ' ';
            ++p;
            continue;
        }
        decimal = Py_UNICODE_TODECIMAL(ch);
        if (decimal >= 0) {
            *output++ = '0' + decimal;
            ++p;
            continue;
        }
        if (0 < ch && ch < 256) {
            *output++ = (char)ch;
            ++p;
            continue;
        }
        /* All other characters are considered unencodable.  Extend the
           range over every following character that is unencodable too,
           i.e. stop at the first one that any of the three fast paths
           above would accept. */
        collstart = p;
        for (collend = p + 1; collend < end; collend++) {
            if ((0 < *collend && *collend < 256) ||
                Py_UNICODE_ISSPACE(*collend) ||
                Py_UNICODE_TODECIMAL(*collend) >= 0)
                break;
        }
        /* cache callback name lookup
         * (if not done yet, i.e. it's the first error) */
        if (known_errorHandler==-1) {
            if ((errors==NULL) || (!strcmp(errors, "strict")))
                known_errorHandler = 1;
            else if (!strcmp(errors, "replace"))
                known_errorHandler = 2;
            else if (!strcmp(errors, "ignore"))
                known_errorHandler = 3;
            else if (!strcmp(errors, "xmlcharrefreplace"))
                known_errorHandler = 4;
            else
                known_errorHandler = 0;
        }
        switch (known_errorHandler) {
        case 1: /* strict */
            raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
            goto onError;
        case 2: /* replace */
            for (p = collstart; p < collend; ++p)
                *output++ = '?';
            /* fall through */
        case 3: /* ignore */
            p = collend;
            break;
        case 4: /* xmlcharrefreplace */
            /* generate replacement (temporarily (mis)uses p) */
            for (p = collstart; p < collend; ++p)
                output += sprintf(output, "&#%d;", (int)*p);
            p = collend;
            break;
        default:
            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
                                                          encoding, reason, s, length, &exc,
                                                          collstart-s, collend-s, &newpos);
            if (repunicode == NULL)
                goto onError;
            /* generate replacement: the handler's output must itself be
               encodable by the same rules, otherwise it is an error */
            repsize = PyUnicode_GET_SIZE(repunicode);
            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
                Py_UNICODE rch = *uni2;
                if (Py_UNICODE_ISSPACE(rch))
                    *output++ = ' ';
                else {
                    decimal = Py_UNICODE_TODECIMAL(rch);
                    if (decimal >= 0)
                        *output++ = '0' + decimal;
                    else if (0 < rch && rch < 256)
                        *output++ = (char)rch;
                    else {
                        Py_DECREF(repunicode);
                        raise_encode_exception(&exc, encoding,
                                               s, length, collstart-s, collend-s, reason);
                        goto onError;
                    }
                }
            }
            /* resume at the position chosen by the error handler */
            p = s + newpos;
            Py_DECREF(repunicode);
        }
    }
    /* 0-terminate the output string */
    *output++ = '\0';
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return 0;

  onError:
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return -1;
}
5246 
5247 /* --- Helpers ------------------------------------------------------------ */
5248 
5249 #include "stringlib/unicodedefs.h"
5250 #include "stringlib/fastsearch.h"
5251 
5252 #include "stringlib/count.h"
5253 #include "stringlib/find.h"
5254 #include "stringlib/partition.h"
5255 #include "stringlib/split.h"
5256 
/* helper macro to fixup start/end slice values
 *
 * Clamps `end` to at most `len`; a negative `end` or `start` is taken
 * relative to `len` (len is added) and then clamped to 0 if still
 * negative.  `start` is not clamped against `len` or `end`, so callers
 * must cope with start > end afterwards.
 *
 * `start` and `end` are assigned to, so they must be modifiable lvalues;
 * all three arguments are evaluated more than once.
 *
 * NOTE(review): the expansion is a bare sequence of if-statements (no
 * do { ... } while (0) wrapper), so the macro must only be used as a
 * complete statement inside a braced block, never as the unbraced body
 * of an if/else.
 */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
5271 
/* Count occurrences of substr in str[start:end].

   Both objects are coerced with PyUnicode_FromObject().  start/end are
   normalised with ADJUST_INDICES against the length of the coerced
   string.  Returns the value produced by stringlib_count(), or -1 if
   either coercion fails. */
Py_ssize_t PyUnicode_Count(PyObject *str,
                           PyObject *substr,
                           Py_ssize_t start,
                           Py_ssize_t end)
{
    Py_ssize_t count = -1;
    PyUnicodeObject *haystack;
    PyUnicodeObject *needle;

    haystack = (PyUnicodeObject *) PyUnicode_FromObject(str);
    if (haystack == NULL)
        return -1;

    needle = (PyUnicodeObject *) PyUnicode_FromObject(substr);
    if (needle != NULL) {
        ADJUST_INDICES(start, end, haystack->length);
        count = stringlib_count(haystack->str + start, end - start,
                                needle->str, needle->length,
                                PY_SSIZE_T_MAX);
        Py_DECREF(needle);
    }

    Py_DECREF(haystack);
    return count;
}
5301 
/* Search for sub in str[start:end].

   A positive direction searches with stringlib_find_slice(), any other
   value with stringlib_rfind_slice().  Both arguments are coerced with
   PyUnicode_FromObject() first.  Returns the search result, or -2 if a
   coercion fails. */
Py_ssize_t PyUnicode_Find(PyObject *str,
                          PyObject *sub,
                          Py_ssize_t start,
                          Py_ssize_t end,
                          int direction)
{
    PyObject *haystack;
    PyObject *needle;
    Py_ssize_t pos;

    haystack = PyUnicode_FromObject(str);
    if (haystack == NULL)
        return -2;

    needle = PyUnicode_FromObject(sub);
    if (needle == NULL) {
        Py_DECREF(haystack);
        return -2;
    }

    if (direction > 0) {
        /* forward search */
        pos = stringlib_find_slice(
            PyUnicode_AS_UNICODE(haystack), PyUnicode_GET_SIZE(haystack),
            PyUnicode_AS_UNICODE(needle), PyUnicode_GET_SIZE(needle),
            start, end);
    }
    else {
        /* backward search */
        pos = stringlib_rfind_slice(
            PyUnicode_AS_UNICODE(haystack), PyUnicode_GET_SIZE(haystack),
            PyUnicode_AS_UNICODE(needle), PyUnicode_GET_SIZE(needle),
            start, end);
    }

    Py_DECREF(haystack);
    Py_DECREF(needle);
    return pos;
}
5337 
5338 static
tailmatch(PyUnicodeObject * self,PyUnicodeObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)5339 int tailmatch(PyUnicodeObject *self,
5340               PyUnicodeObject *substring,
5341               Py_ssize_t start,
5342               Py_ssize_t end,
5343               int direction)
5344 {
5345     if (substring->length == 0)
5346         return 1;
5347 
5348     ADJUST_INDICES(start, end, self->length);
5349     end -= substring->length;
5350     if (end < start)
5351         return 0;
5352 
5353     if (direction > 0) {
5354         if (Py_UNICODE_MATCH(self, end, substring))
5355             return 1;
5356     } else {
5357         if (Py_UNICODE_MATCH(self, start, substring))
5358             return 1;
5359     }
5360 
5361     return 0;
5362 }
5363 
/* Public wrapper around tailmatch().

   Coerces str and substr with PyUnicode_FromObject() and forwards
   start, end and direction unchanged.  Returns tailmatch()'s result
   (1 or 0), or -1 if a coercion fails. */
Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
                               PyObject *substr,
                               Py_ssize_t start,
                               Py_ssize_t end,
                               int direction)
{
    PyObject *text;
    PyObject *affix;
    Py_ssize_t matched;

    text = PyUnicode_FromObject(str);
    if (text == NULL)
        return -1;

    affix = PyUnicode_FromObject(substr);
    if (affix == NULL) {
        Py_DECREF(text);
        return -1;
    }

    matched = tailmatch((PyUnicodeObject *)text,
                        (PyUnicodeObject *)affix,
                        start, end, direction);

    Py_DECREF(text);
    Py_DECREF(affix);
    return matched;
}
5388 
5389 /* Apply fixfct filter to the Unicode object self and return a
5390    reference to the modified object */
5391 
5392 static
fixup(PyUnicodeObject * self,int (* fixfct)(PyUnicodeObject * s))5393 PyObject *fixup(PyUnicodeObject *self,
5394                 int (*fixfct)(PyUnicodeObject *s))
5395 {
5396 
5397     PyUnicodeObject *u;
5398 
5399     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5400     if (u == NULL)
5401         return NULL;
5402 
5403     Py_UNICODE_COPY(u->str, self->str, self->length);
5404 
5405     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5406         /* fixfct should return TRUE if it modified the buffer. If
5407            FALSE, return a reference to the original buffer instead
5408            (to save space, not time) */
5409         Py_INCREF(self);
5410         Py_DECREF(u);
5411         return (PyObject*) self;
5412     }
5413     return (PyObject*) u;
5414 }
5415 
5416 static
fixupper(PyUnicodeObject * self)5417 int fixupper(PyUnicodeObject *self)
5418 {
5419     Py_ssize_t len = self->length;
5420     Py_UNICODE *s = self->str;
5421     int status = 0;
5422 
5423     while (len-- > 0) {
5424         register Py_UNICODE ch;
5425 
5426         ch = Py_UNICODE_TOUPPER(*s);
5427         if (ch != *s) {
5428             status = 1;
5429             *s = ch;
5430         }
5431         s++;
5432     }
5433 
5434     return status;
5435 }
5436 
5437 static
fixlower(PyUnicodeObject * self)5438 int fixlower(PyUnicodeObject *self)
5439 {
5440     Py_ssize_t len = self->length;
5441     Py_UNICODE *s = self->str;
5442     int status = 0;
5443 
5444     while (len-- > 0) {
5445         register Py_UNICODE ch;
5446 
5447         ch = Py_UNICODE_TOLOWER(*s);
5448         if (ch != *s) {
5449             status = 1;
5450             *s = ch;
5451         }
5452         s++;
5453     }
5454 
5455     return status;
5456 }
5457 
5458 static
fixswapcase(PyUnicodeObject * self)5459 int fixswapcase(PyUnicodeObject *self)
5460 {
5461     Py_ssize_t len = self->length;
5462     Py_UNICODE *s = self->str;
5463     int status = 0;
5464 
5465     while (len-- > 0) {
5466         if (Py_UNICODE_ISUPPER(*s)) {
5467             *s = Py_UNICODE_TOLOWER(*s);
5468             status = 1;
5469         } else if (Py_UNICODE_ISLOWER(*s)) {
5470             *s = Py_UNICODE_TOUPPER(*s);
5471             status = 1;
5472         }
5473         s++;
5474     }
5475 
5476     return status;
5477 }
5478 
5479 static
fixcapitalize(PyUnicodeObject * self)5480 int fixcapitalize(PyUnicodeObject *self)
5481 {
5482     Py_ssize_t len = self->length;
5483     Py_UNICODE *s = self->str;
5484     int status = 0;
5485 
5486     if (len == 0)
5487         return 0;
5488     if (Py_UNICODE_ISLOWER(*s)) {
5489         *s = Py_UNICODE_TOUPPER(*s);
5490         status = 1;
5491     }
5492     s++;
5493     while (--len > 0) {
5494         if (Py_UNICODE_ISUPPER(*s)) {
5495             *s = Py_UNICODE_TOLOWER(*s);
5496             status = 1;
5497         }
5498         s++;
5499     }
5500     return status;
5501 }
5502 
5503 static
fixtitle(PyUnicodeObject * self)5504 int fixtitle(PyUnicodeObject *self)
5505 {
5506     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5507     register Py_UNICODE *e;
5508     int previous_is_cased;
5509 
5510     /* Shortcut for single character strings */
5511     if (PyUnicode_GET_SIZE(self) == 1) {
5512         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5513         if (*p != ch) {
5514             *p = ch;
5515             return 1;
5516         }
5517         else
5518             return 0;
5519     }
5520 
5521     e = p + PyUnicode_GET_SIZE(self);
5522     previous_is_cased = 0;
5523     for (; p < e; p++) {
5524         register const Py_UNICODE ch = *p;
5525 
5526         if (previous_is_cased)
5527             *p = Py_UNICODE_TOLOWER(ch);
5528         else
5529             *p = Py_UNICODE_TOTITLE(ch);
5530 
5531         if (Py_UNICODE_ISLOWER(ch) ||
5532             Py_UNICODE_ISUPPER(ch) ||
5533             Py_UNICODE_ISTITLE(ch))
5534             previous_is_cased = 1;
5535         else
5536             previous_is_cased = 0;
5537     }
5538     return 1;
5539 }
5540 
5541 PyObject *
PyUnicode_Join(PyObject * separator,PyObject * seq)5542 PyUnicode_Join(PyObject *separator, PyObject *seq)
5543 {
5544     PyObject *internal_separator = NULL;
5545     const Py_UNICODE blank = ' ';
5546     const Py_UNICODE *sep = &blank;
5547     Py_ssize_t seplen = 1;
5548     PyUnicodeObject *res = NULL; /* the result */
5549     Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
5550     Py_ssize_t res_used;         /* # used bytes */
5551     Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
5552     PyObject *fseq;          /* PySequence_Fast(seq) */
5553     Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
5554     PyObject *item;
5555     Py_ssize_t i;
5556 
5557     fseq = PySequence_Fast(seq, "");
5558     if (fseq == NULL) {
5559         return NULL;
5560     }
5561 
5562     /* Grrrr.  A codec may be invoked to convert str objects to
5563      * Unicode, and so it's possible to call back into Python code
5564      * during PyUnicode_FromObject(), and so it's possible for a sick
5565      * codec to change the size of fseq (if seq is a list).  Therefore
5566      * we have to keep refetching the size -- can't assume seqlen
5567      * is invariant.
5568      */
5569     seqlen = PySequence_Fast_GET_SIZE(fseq);
5570     /* If empty sequence, return u"". */
5571     if (seqlen == 0) {
5572         res = _PyUnicode_New(0);  /* empty sequence; return u"" */
5573         goto Done;
5574     }
5575     /* If singleton sequence with an exact Unicode, return that. */
5576     if (seqlen == 1) {
5577         item = PySequence_Fast_GET_ITEM(fseq, 0);
5578         if (PyUnicode_CheckExact(item)) {
5579             Py_INCREF(item);
5580             res = (PyUnicodeObject *)item;
5581             goto Done;
5582         }
5583     }
5584 
5585     /* At least two items to join, or one that isn't exact Unicode. */
5586     if (seqlen > 1) {
5587         /* Set up sep and seplen -- they're needed. */
5588         if (separator == NULL) {
5589             sep = &blank;
5590             seplen = 1;
5591         }
5592         else {
5593             internal_separator = PyUnicode_FromObject(separator);
5594             if (internal_separator == NULL)
5595                 goto onError;
5596             sep = PyUnicode_AS_UNICODE(internal_separator);
5597             seplen = PyUnicode_GET_SIZE(internal_separator);
5598             /* In case PyUnicode_FromObject() mutated seq. */
5599             seqlen = PySequence_Fast_GET_SIZE(fseq);
5600         }
5601     }
5602 
5603     /* Get space. */
5604     res = _PyUnicode_New(res_alloc);
5605     if (res == NULL)
5606         goto onError;
5607     res_p = PyUnicode_AS_UNICODE(res);
5608     res_used = 0;
5609 
5610     for (i = 0; i < seqlen; ++i) {
5611         Py_ssize_t itemlen;
5612         Py_ssize_t new_res_used;
5613 
5614         item = PySequence_Fast_GET_ITEM(fseq, i);
5615         /* Convert item to Unicode. */
5616         if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5617             PyErr_Format(PyExc_TypeError,
5618                          "sequence item %zd: expected string or Unicode,"
5619                          " %.80s found",
5620                          i, Py_TYPE(item)->tp_name);
5621             goto onError;
5622         }
5623         item = PyUnicode_FromObject(item);
5624         if (item == NULL)
5625             goto onError;
5626         /* We own a reference to item from here on. */
5627 
5628         /* In case PyUnicode_FromObject() mutated seq. */
5629         seqlen = PySequence_Fast_GET_SIZE(fseq);
5630 
5631         /* Make sure we have enough space for the separator and the item. */
5632         itemlen = PyUnicode_GET_SIZE(item);
5633         new_res_used = res_used + itemlen;
5634         if (new_res_used < 0)
5635             goto Overflow;
5636         if (i < seqlen - 1) {
5637             new_res_used += seplen;
5638             if (new_res_used < 0)
5639                 goto Overflow;
5640         }
5641         if (new_res_used > res_alloc) {
5642             /* double allocated size until it's big enough */
5643             do {
5644                 res_alloc += res_alloc;
5645                 if (res_alloc <= 0)
5646                     goto Overflow;
5647             } while (new_res_used > res_alloc);
5648             if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5649                 Py_DECREF(item);
5650                 goto onError;
5651             }
5652             res_p = PyUnicode_AS_UNICODE(res) + res_used;
5653         }
5654 
5655         /* Copy item, and maybe the separator. */
5656         Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5657         res_p += itemlen;
5658         if (i < seqlen - 1) {
5659             Py_UNICODE_COPY(res_p, sep, seplen);
5660             res_p += seplen;
5661         }
5662         Py_DECREF(item);
5663         res_used = new_res_used;
5664     }
5665 
5666     /* Shrink res to match the used area; this probably can't fail,
5667      * but it's cheap to check.
5668      */
5669     if (_PyUnicode_Resize(&res, res_used) < 0)
5670         goto onError;
5671 
5672   Done:
5673     Py_XDECREF(internal_separator);
5674     Py_DECREF(fseq);
5675     return (PyObject *)res;
5676 
5677   Overflow:
5678     PyErr_SetString(PyExc_OverflowError,
5679                     "join() result is too long for a Python string");
5680     Py_DECREF(item);
5681     /* fall through */
5682 
5683   onError:
5684     Py_XDECREF(internal_separator);
5685     Py_DECREF(fseq);
5686     Py_XDECREF(res);
5687     return NULL;
5688 }
5689 
5690 static
pad(PyUnicodeObject * self,Py_ssize_t left,Py_ssize_t right,Py_UNICODE fill)5691 PyUnicodeObject *pad(PyUnicodeObject *self,
5692                      Py_ssize_t left,
5693                      Py_ssize_t right,
5694                      Py_UNICODE fill)
5695 {
5696     PyUnicodeObject *u;
5697 
5698     if (left < 0)
5699         left = 0;
5700     if (right < 0)
5701         right = 0;
5702 
5703     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5704         Py_INCREF(self);
5705         return self;
5706     }
5707 
5708     if (left > PY_SSIZE_T_MAX - self->length ||
5709         right > PY_SSIZE_T_MAX - (left + self->length)) {
5710         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5711         return NULL;
5712     }
5713     u = _PyUnicode_New(left + self->length + right);
5714     if (u) {
5715         if (left)
5716             Py_UNICODE_FILL(u->str, fill, left);
5717         Py_UNICODE_COPY(u->str + left, self->str, self->length);
5718         if (right)
5719             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5720     }
5721 
5722     return u;
5723 }
5724 
/* Split string into lines.

   string is coerced with PyUnicode_FromObject() and passed, together
   with keepends, to stringlib_splitlines().  Returns that function's
   result, or NULL if the coercion fails. */
PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
{
    PyObject *unicode;
    PyObject *lines;

    unicode = PyUnicode_FromObject(string);
    if (unicode == NULL)
        return NULL;

    lines = stringlib_splitlines(
        (PyObject*) unicode, PyUnicode_AS_UNICODE(unicode),
        PyUnicode_GET_SIZE(unicode), keepends);

    Py_DECREF(unicode);
    return lines;
}
5740 
5741 static
split(PyUnicodeObject * self,PyUnicodeObject * substring,Py_ssize_t maxcount)5742 PyObject *split(PyUnicodeObject *self,
5743                 PyUnicodeObject *substring,
5744                 Py_ssize_t maxcount)
5745 {
5746     if (maxcount < 0)
5747         maxcount = PY_SSIZE_T_MAX;
5748 
5749     if (substring == NULL)
5750         return stringlib_split_whitespace(
5751             (PyObject*) self,  self->str, self->length, maxcount
5752             );
5753 
5754     return stringlib_split(
5755         (PyObject*) self,  self->str, self->length,
5756         substring->str, substring->length,
5757         maxcount
5758         );
5759 }
5760 
5761 static
rsplit(PyUnicodeObject * self,PyUnicodeObject * substring,Py_ssize_t maxcount)5762 PyObject *rsplit(PyUnicodeObject *self,
5763                  PyUnicodeObject *substring,
5764                  Py_ssize_t maxcount)
5765 {
5766     if (maxcount < 0)
5767         maxcount = PY_SSIZE_T_MAX;
5768 
5769     if (substring == NULL)
5770         return stringlib_rsplit_whitespace(
5771             (PyObject*) self,  self->str, self->length, maxcount
5772             );
5773 
5774     return stringlib_rsplit(
5775         (PyObject*) self,  self->str, self->length,
5776         substring->str, substring->length,
5777         maxcount
5778         );
5779 }
5780 
/* Return a copy of self in which occurrences of str1 are replaced by
   str2, performing at most maxcount replacements (a negative maxcount
   is turned into PY_SSIZE_T_MAX).

   Three strategies are used:
     - str1 and str2 have the same length: copy self and overwrite the
       matches in place (with a dedicated loop for length 1);
     - lengths differ and str1 is non-empty: count the matches, allocate
       the exact result size and copy unchanged parts and substitutions
       alternately;
     - str1 is empty (and str2 is not): interleave str2 with the
       characters of self.

   When there is nothing to replace, self itself is returned if it is an
   exact unicode object, otherwise a plain copy of its contents.
   Returns NULL on allocation failure, or with OverflowError set if the
   result size check fails. */
static
PyObject *replace(PyUnicodeObject *self,
                  PyUnicodeObject *str1,
                  PyUnicodeObject *str2,
                  Py_ssize_t maxcount)
{
    PyUnicodeObject *u;

    /* Note: the empty-self shortcut is only taken when maxcount >= 0. */
    if (maxcount < 0)
        maxcount = PY_SSIZE_T_MAX;
    else if (maxcount == 0 || self->length == 0)
        goto nothing;

    if (str1->length == str2->length) {
        Py_ssize_t i;
        /* same length */
        if (str1->length == 0)
            goto nothing;
        if (str1->length == 1) {
            /* replace characters */
            Py_UNICODE u1, u2;
            /* findchar() is defined elsewhere in this file; a false
               result is treated here as "character does not occur" */
            if (!findchar(self->str, self->length, str1->str[0]))
                goto nothing;
            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
            if (!u)
                return NULL;
            Py_UNICODE_COPY(u->str, self->str, self->length);
            u1 = str1->str[0];
            u2 = str2->str[0];
            for (i = 0; i < u->length; i++)
                if (u->str[i] == u1) {
                    /* stop once maxcount replacements have been made */
                    if (--maxcount < 0)
                        break;
                    u->str[i] = u2;
                }
        } else {
            /* locate the first match before paying for the copy */
            i = stringlib_find(
                self->str, self->length, str1->str, str1->length, 0
                );
            if (i < 0)
                goto nothing;
            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
            if (!u)
                return NULL;
            Py_UNICODE_COPY(u->str, self->str, self->length);

            /* change everything in-place, starting with this one */
            Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
            i += str1->length;

            /* the first replacement is already done, hence pre-decrement */
            while ( --maxcount > 0) {
                /* the result is used as an absolute index into u->str, so
                   the last argument is presumably an offset that
                   stringlib_find() adds to the position it finds --
                   TODO confirm against stringlib/find.h */
                i = stringlib_find(self->str+i, self->length-i,
                                   str1->str, str1->length,
                                   i);
                if (i == -1)
                    break;
                Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
                i += str1->length;
            }
        }
    } else {

        Py_ssize_t n, i, j;
        Py_ssize_t product, new_size, delta;
        Py_UNICODE *p;

        /* replace strings */
        /* n = number of replacements to perform (bounded by maxcount) */
        n = stringlib_count(self->str, self->length, str1->str, str1->length,
                            maxcount);
        if (n == 0)
            goto nothing;
        /* new_size = self->length + n * (str2->length - str1->length)); */
        delta = (str2->length - str1->length);
        if (delta == 0) {
            new_size = self->length;
        } else {
            /* NOTE(review): both overflow checks below inspect the result
               after the arithmetic has been performed, which relies on
               signed overflow wrapping around. */
            product = n * (str2->length - str1->length);
            if ((product / (str2->length - str1->length)) != n) {
                PyErr_SetString(PyExc_OverflowError,
                                "replace string is too long");
                return NULL;
            }
            new_size = self->length + product;
            if (new_size < 0) {
                PyErr_SetString(PyExc_OverflowError,
                                "replace string is too long");
                return NULL;
            }
        }
        u = _PyUnicode_New(new_size);
        if (!u)
            return NULL;
        i = 0;          /* read position in self */
        p = u->str;     /* write position in the result */
        if (str1->length > 0) {
            while (n-- > 0) {
                /* look for next match */
                j = stringlib_find(self->str+i, self->length-i,
                                   str1->str, str1->length,
                                   i);
                if (j == -1)
                    break;
                else if (j > i) {
                    /* copy unchanged part [i:j] */
                    Py_UNICODE_COPY(p, self->str+i, j-i);
                    p += j - i;
                }
                /* copy substitution string */
                if (str2->length > 0) {
                    Py_UNICODE_COPY(p, str2->str, str2->length);
                    p += str2->length;
                }
                i = j + str1->length;
            }
            if (i < self->length)
                /* copy tail [i:] */
                Py_UNICODE_COPY(p, self->str+i, self->length-i);
        } else {
            /* interleave */
            /* empty str1: emit str2, then one character of self, and so
               on, ending with a str2 once n copies have been written */
            while (n > 0) {
                Py_UNICODE_COPY(p, str2->str, str2->length);
                p += str2->length;
                if (--n <= 0)
                    break;
                *p++ = self->str[i++];
            }
            /* whatever is left of self follows the last insertion */
            Py_UNICODE_COPY(p, self->str+i, self->length-i);
        }
    }
    return (PyObject *) u;

  nothing:
    /* nothing to replace; return original string (when possible) */
    if (PyUnicode_CheckExact(self)) {
        Py_INCREF(self);
        return (PyObject *) self;
    }
    return PyUnicode_FromUnicode(self->str, self->length);
}
5920 
5921 /* --- Unicode Object Methods --------------------------------------------- */
5922 
PyDoc_STRVAR(title__doc__,
             "S.title() -> unicode\n\
\n\
Return a titlecased version of S, i.e. words start with title case\n\
characters, all remaining cased characters have lower case.");

/* Implementation of S.title(): applies the fixtitle filter to self via
   fixup() and returns fixup()'s result (a new reference, or NULL on
   failure). */
static PyObject*
unicode_title(PyUnicodeObject *self)
{
    return fixup(self, fixtitle);
}
5934 
PyDoc_STRVAR(capitalize__doc__,
             "S.capitalize() -> unicode\n\
\n\
Return a capitalized version of S, i.e. make the first character\n\
have upper case and the rest lower case.");

/* Implementation of S.capitalize(): applies the fixcapitalize filter to
   self via fixup() and returns fixup()'s result (a new reference, or
   NULL on failure). */
static PyObject*
unicode_capitalize(PyUnicodeObject *self)
{
    return fixup(self, fixcapitalize);
}
5946 
/* NOTE: everything up to the matching #endif is compiled out. */
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize each word with
   fixup(..., fixcapitalize) and join the words again with
   PyUnicode_Join(NULL, ...), i.e. with a single blank.
   Returns NULL if the split or a fixup() call fails; otherwise returns
   whatever PyUnicode_Join() returns. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
                     fixcapitalize);
        /* on failure item is NULL, which is what onError returns */
        if (item == NULL)
            goto onError;
        /* drop the old word, then store the new one in its slot */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

  onError:
    /* shared exit path for success and failure */
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif
5984 
5985 /* Argument converter.  Coerces to a single unicode character */
5986 
5987 static int
convert_uc(PyObject * obj,void * addr)5988 convert_uc(PyObject *obj, void *addr)
5989 {
5990     Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5991     PyObject *uniobj;
5992     Py_UNICODE *unistr;
5993 
5994     uniobj = PyUnicode_FromObject(obj);
5995     if (uniobj == NULL) {
5996         PyErr_SetString(PyExc_TypeError,
5997                         "The fill character cannot be converted to Unicode");
5998         return 0;
5999     }
6000     if (PyUnicode_GET_SIZE(uniobj) != 1) {
6001         PyErr_SetString(PyExc_TypeError,
6002                         "The fill character must be exactly one character long");
6003         Py_DECREF(uniobj);
6004         return 0;
6005     }
6006     unistr = PyUnicode_AS_UNICODE(uniobj);
6007     *fillcharloc = unistr[0];
6008     Py_DECREF(uniobj);
6009     return 1;
6010 }
6011 
6012 PyDoc_STRVAR(center__doc__,
6013              "S.center(width[, fillchar]) -> unicode\n\
6014 \n\
6015 Return S centered in a Unicode string of length width. Padding is\n\
6016 done using the specified fill character (default is a space)");
6017 
6018 static PyObject *
unicode_center(PyUnicodeObject * self,PyObject * args)6019 unicode_center(PyUnicodeObject *self, PyObject *args)
6020 {
6021     Py_ssize_t marg, left;
6022     Py_ssize_t width;
6023     Py_UNICODE fillchar = ' ';
6024 
6025     if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6026         return NULL;
6027 
6028     if (self->length >= width && PyUnicode_CheckExact(self)) {
6029         Py_INCREF(self);
6030         return (PyObject*) self;
6031     }
6032 
6033     marg = width - self->length;
6034     left = marg / 2 + (marg & width & 1);
6035 
6036     return (PyObject*) pad(self, left, marg - left, fillchar);
6037 }
6038 
6039 #if 0
6040 
6041 /* This code should go into some future Unicode collation support
6042    module. The basic comparison should compare ordinals on a naive
6043    basis (this is what Java does and thus Jython too). */
6044 
6045 /* speedy UTF-16 code point order comparison */
6046 /* gleaned from: */
6047 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6048 
6049 static short utf16Fixup[32] =
6050 {
6051     0, 0, 0, 0, 0, 0, 0, 0,
6052     0, 0, 0, 0, 0, 0, 0, 0,
6053     0, 0, 0, 0, 0, 0, 0, 0,
6054     0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6055 };
6056 
6057 static int
6058 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6059 {
6060     Py_ssize_t len1, len2;
6061 
6062     Py_UNICODE *s1 = str1->str;
6063     Py_UNICODE *s2 = str2->str;
6064 
6065     len1 = str1->length;
6066     len2 = str2->length;
6067 
6068     while (len1 > 0 && len2 > 0) {
6069         Py_UNICODE c1, c2;
6070 
6071         c1 = *s1++;
6072         c2 = *s2++;
6073 
6074         if (c1 > (1<<11) * 26)
6075             c1 += utf16Fixup[c1>>11];
6076         if (c2 > (1<<11) * 26)
6077             c2 += utf16Fixup[c2>>11];
6078         /* now c1 and c2 are in UTF-32-compatible order */
6079 
6080         if (c1 != c2)
6081             return (c1 < c2) ? -1 : 1;
6082 
6083         len1--; len2--;
6084     }
6085 
6086     return (len1 < len2) ? -1 : (len1 != len2);
6087 }
6088 
6089 #else
6090 
6091 static int
unicode_compare(PyUnicodeObject * str1,PyUnicodeObject * str2)6092 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6093 {
6094     register Py_ssize_t len1, len2;
6095 
6096     Py_UNICODE *s1 = str1->str;
6097     Py_UNICODE *s2 = str2->str;
6098 
6099     len1 = str1->length;
6100     len2 = str2->length;
6101 
6102     while (len1 > 0 && len2 > 0) {
6103         Py_UNICODE c1, c2;
6104 
6105         c1 = *s1++;
6106         c2 = *s2++;
6107 
6108         if (c1 != c2)
6109             return (c1 < c2) ? -1 : 1;
6110 
6111         len1--; len2--;
6112     }
6113 
6114     return (len1 < len2) ? -1 : (len1 != len2);
6115 }
6116 
6117 #endif
6118 
/* Coerce both arguments to Unicode and compare them.  Returns -1, 0 or
   1; -1 is also returned when either coercion fails, so callers must
   check PyErr_Occurred() to tell the two apart. */
int PyUnicode_Compare(PyObject *left,
                      PyObject *right)
{
    PyUnicodeObject *lhs;
    PyUnicodeObject *rhs;
    int cmp;

    /* Coerce the two arguments */
    lhs = (PyUnicodeObject *)PyUnicode_FromObject(left);
    if (lhs == NULL)
        return -1;
    rhs = (PyUnicodeObject *)PyUnicode_FromObject(right);
    if (rhs == NULL) {
        Py_DECREF(lhs);
        return -1;
    }

    /* Identical objects (empty or interned strings) need no scan */
    cmp = (lhs == rhs) ? 0 : unicode_compare(lhs, rhs);

    Py_DECREF(lhs);
    Py_DECREF(rhs);
    return cmp;
}
6151 
/* Rich comparison built on PyUnicode_Compare().  Returns a new reference
   to a bool, to Py_NotImplemented, or NULL with an exception set. */
PyObject *PyUnicode_RichCompare(PyObject *left,
                                PyObject *right,
                                int op)
{
    const char *message;
    int cmp;
    int truth;

    cmp = PyUnicode_Compare(left, right);

    if (!(cmp == -1 && PyErr_Occurred())) {
        /* Successful comparison: map the three-way result onto op */
        truth = cmp;
        switch (op) {
        case Py_EQ:
            truth = (cmp == 0);
            break;
        case Py_NE:
            truth = (cmp != 0);
            break;
        case Py_LE:
            truth = (cmp <= 0);
            break;
        case Py_GE:
            truth = (cmp >= 0);
            break;
        case Py_LT:
            truth = (cmp == -1);
            break;
        case Py_GT:
            truth = (cmp == 1);
            break;
        }
        return PyBool_FromLong(truth);
    }

    /* Standard failure case.

       A TypeError means PyUnicode_FromObject() could not convert one of
       the arguments (usually the right hand side) to Unicode, i.e. this
       type cannot handle the request.  The other object may still know
       how to compare, so it is given a chance via Py_NotImplemented. */
    if (PyErr_ExceptionMatches(PyExc_TypeError)) {
        PyErr_Clear();
        Py_INCREF(Py_NotImplemented);
        return Py_NotImplemented;
    }

    /* Only a UnicodeDecodeError raised during an (in)equality test is
       softened below; everything else propagates unchanged. */
    if ((op != Py_EQ && op != Py_NE) ||
        !PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
        return NULL;

    /* Equality special case: replace the decode error with a
       UnicodeWarning and treat the operands as unequal. */
    PyErr_Clear();
    if (op == Py_EQ)
        message = "Unicode equal comparison "
                  "failed to convert both arguments to Unicode - "
                  "interpreting them as being unequal";
    else
        message = "Unicode unequal comparison "
                  "failed to convert both arguments to Unicode - "
                  "interpreting them as being unequal";
    if (PyErr_Warn(PyExc_UnicodeWarning, message) < 0)
        return NULL;

    return PyBool_FromLong(op == Py_NE);
}
6227 
/* Membership test: coerces element and container to Unicode and returns
   the result of stringlib_contains_obj(), or -1 if a coercion fails. */
int PyUnicode_Contains(PyObject *container,
                       PyObject *element)
{
    PyObject *haystack;
    PyObject *needle;
    int found = -1;

    /* The element is coerced first, then the container */
    needle = PyUnicode_FromObject(element);
    if (needle == NULL)
        return -1;

    haystack = PyUnicode_FromObject(container);
    if (haystack != NULL) {
        found = stringlib_contains_obj(haystack, needle);
        Py_DECREF(haystack);
    }

    Py_DECREF(needle);
    return found;
}
6253 
6254 /* Concat to string or Unicode object giving a new Unicode object. */
6255 
PyUnicode_Concat(PyObject * left,PyObject * right)6256 PyObject *PyUnicode_Concat(PyObject *left,
6257                            PyObject *right)
6258 {
6259     PyUnicodeObject *u = NULL, *v = NULL, *w;
6260 
6261     /* Coerce the two arguments */
6262     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6263     if (u == NULL)
6264         goto onError;
6265     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6266     if (v == NULL)
6267         goto onError;
6268 
6269     /* Shortcuts */
6270     if (v == unicode_empty) {
6271         Py_DECREF(v);
6272         return (PyObject *)u;
6273     }
6274     if (u == unicode_empty) {
6275         Py_DECREF(u);
6276         return (PyObject *)v;
6277     }
6278 
6279     /* Concat the two Unicode strings */
6280     w = _PyUnicode_New(u->length + v->length);
6281     if (w == NULL)
6282         goto onError;
6283     Py_UNICODE_COPY(w->str, u->str, u->length);
6284     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6285 
6286     Py_DECREF(u);
6287     Py_DECREF(v);
6288     return (PyObject *)w;
6289 
6290   onError:
6291     Py_XDECREF(u);
6292     Py_XDECREF(v);
6293     return NULL;
6294 }
6295 
6296 PyDoc_STRVAR(count__doc__,
6297              "S.count(sub[, start[, end]]) -> int\n\
6298 \n\
6299 Return the number of non-overlapping occurrences of substring sub in\n\
6300 Unicode string S[start:end].  Optional arguments start and end are\n\
6301 interpreted as in slice notation.");
6302 
6303 static PyObject *
unicode_count(PyUnicodeObject * self,PyObject * args)6304 unicode_count(PyUnicodeObject *self, PyObject *args)
6305 {
6306     PyUnicodeObject *substring;
6307     Py_ssize_t start = 0;
6308     Py_ssize_t end = PY_SSIZE_T_MAX;
6309     PyObject *result;
6310 
6311     if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6312                                             &start, &end))
6313         return NULL;
6314 
6315     ADJUST_INDICES(start, end, self->length);
6316     result = PyInt_FromSsize_t(
6317         stringlib_count(self->str + start, end - start,
6318                         substring->str, substring->length,
6319                         PY_SSIZE_T_MAX)
6320         );
6321 
6322     Py_DECREF(substring);
6323 
6324     return result;
6325 }
6326 
6327 PyDoc_STRVAR(encode__doc__,
6328              "S.encode([encoding[,errors]]) -> string or unicode\n\
6329 \n\
6330 Encodes S using the codec registered for encoding. encoding defaults\n\
6331 to the default encoding. errors may be given to set a different error\n\
6332 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6333 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6334 'xmlcharrefreplace' as well as any other name registered with\n\
6335 codecs.register_error that can handle UnicodeEncodeErrors.");
6336 
6337 static PyObject *
unicode_encode(PyUnicodeObject * self,PyObject * args,PyObject * kwargs)6338 unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6339 {
6340     static char *kwlist[] = {"encoding", "errors", 0};
6341     char *encoding = NULL;
6342     char *errors = NULL;
6343     PyObject *v;
6344 
6345     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6346                                      kwlist, &encoding, &errors))
6347         return NULL;
6348     v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6349     if (v == NULL)
6350         goto onError;
6351     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6352         PyErr_Format(PyExc_TypeError,
6353                      "encoder did not return a string/unicode object "
6354                      "(type=%.400s)",
6355                      Py_TYPE(v)->tp_name);
6356         Py_DECREF(v);
6357         return NULL;
6358     }
6359     return v;
6360 
6361   onError:
6362     return NULL;
6363 }
6364 
6365 PyDoc_STRVAR(decode__doc__,
6366              "S.decode([encoding[,errors]]) -> string or unicode\n\
6367 \n\
6368 Decodes S using the codec registered for encoding. encoding defaults\n\
6369 to the default encoding. errors may be given to set a different error\n\
6370 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6371 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6372 as well as any other name registerd with codecs.register_error that is\n\
6373 able to handle UnicodeDecodeErrors.");
6374 
6375 static PyObject *
unicode_decode(PyUnicodeObject * self,PyObject * args,PyObject * kwargs)6376 unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6377 {
6378     static char *kwlist[] = {"encoding", "errors", 0};
6379     char *encoding = NULL;
6380     char *errors = NULL;
6381     PyObject *v;
6382 
6383     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6384                                      kwlist, &encoding, &errors))
6385         return NULL;
6386     v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6387     if (v == NULL)
6388         goto onError;
6389     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6390         PyErr_Format(PyExc_TypeError,
6391                      "decoder did not return a string/unicode object "
6392                      "(type=%.400s)",
6393                      Py_TYPE(v)->tp_name);
6394         Py_DECREF(v);
6395         return NULL;
6396     }
6397     return v;
6398 
6399   onError:
6400     return NULL;
6401 }
6402 
6403 PyDoc_STRVAR(expandtabs__doc__,
6404              "S.expandtabs([tabsize]) -> unicode\n\
6405 \n\
6406 Return a copy of S where all tab characters are expanded using spaces.\n\
6407 If tabsize is not given, a tab size of 8 characters is assumed.");
6408 
6409 static PyObject*
unicode_expandtabs(PyUnicodeObject * self,PyObject * args)6410 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6411 {
6412     Py_UNICODE *e;
6413     Py_UNICODE *p;
6414     Py_UNICODE *q;
6415     Py_UNICODE *qe;
6416     Py_ssize_t i, j, incr;
6417     PyUnicodeObject *u;
6418     int tabsize = 8;
6419 
6420     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6421         return NULL;
6422 
6423     /* First pass: determine size of output string */
6424     i = 0; /* chars up to and including most recent \n or \r */
6425     j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6426     e = self->str + self->length; /* end of input */
6427     for (p = self->str; p < e; p++)
6428         if (*p == '\t') {
6429             if (tabsize > 0) {
6430                 incr = tabsize - (j % tabsize); /* cannot overflow */
6431                 if (j > PY_SSIZE_T_MAX - incr)
6432                     goto overflow1;
6433                 j += incr;
6434             }
6435         }
6436         else {
6437             if (j > PY_SSIZE_T_MAX - 1)
6438                 goto overflow1;
6439             j++;
6440             if (*p == '\n' || *p == '\r') {
6441                 if (i > PY_SSIZE_T_MAX - j)
6442                     goto overflow1;
6443                 i += j;
6444                 j = 0;
6445             }
6446         }
6447 
6448     if (i > PY_SSIZE_T_MAX - j)
6449         goto overflow1;
6450 
6451     /* Second pass: create output string and fill it */
6452     u = _PyUnicode_New(i + j);
6453     if (!u)
6454         return NULL;
6455 
6456     j = 0; /* same as in first pass */
6457     q = u->str; /* next output char */
6458     qe = u->str + u->length; /* end of output */
6459 
6460     for (p = self->str; p < e; p++)
6461         if (*p == '\t') {
6462             if (tabsize > 0) {
6463                 i = tabsize - (j % tabsize);
6464                 j += i;
6465                 while (i--) {
6466                     if (q >= qe)
6467                         goto overflow2;
6468                     *q++ = ' ';
6469                 }
6470             }
6471         }
6472         else {
6473             if (q >= qe)
6474                 goto overflow2;
6475             *q++ = *p;
6476             j++;
6477             if (*p == '\n' || *p == '\r')
6478                 j = 0;
6479         }
6480 
6481     return (PyObject*) u;
6482 
6483   overflow2:
6484     Py_DECREF(u);
6485   overflow1:
6486     PyErr_SetString(PyExc_OverflowError, "new string is too long");
6487     return NULL;
6488 }
6489 
6490 PyDoc_STRVAR(find__doc__,
6491              "S.find(sub [,start [,end]]) -> int\n\
6492 \n\
6493 Return the lowest index in S where substring sub is found,\n\
6494 such that sub is contained within s[start:end].  Optional\n\
6495 arguments start and end are interpreted as in slice notation.\n\
6496 \n\
6497 Return -1 on failure.");
6498 
6499 static PyObject *
unicode_find(PyUnicodeObject * self,PyObject * args)6500 unicode_find(PyUnicodeObject *self, PyObject *args)
6501 {
6502     PyUnicodeObject *substring;
6503     Py_ssize_t start;
6504     Py_ssize_t end;
6505     Py_ssize_t result;
6506 
6507     if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6508                                             &start, &end))
6509         return NULL;
6510 
6511     result = stringlib_find_slice(
6512         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6513         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6514         start, end
6515         );
6516 
6517     Py_DECREF(substring);
6518 
6519     return PyInt_FromSsize_t(result);
6520 }
6521 
6522 static PyObject *
unicode_getitem(PyUnicodeObject * self,Py_ssize_t index)6523 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6524 {
6525     if (index < 0 || index >= self->length) {
6526         PyErr_SetString(PyExc_IndexError, "string index out of range");
6527         return NULL;
6528     }
6529 
6530     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6531 }
6532 
6533 static long
unicode_hash(PyUnicodeObject * self)6534 unicode_hash(PyUnicodeObject *self)
6535 {
6536     /* Since Unicode objects compare equal to their ASCII string
6537        counterparts, they should use the individual character values
6538        as basis for their hash value.  This is needed to assure that
6539        strings and Unicode objects behave in the same way as
6540        dictionary keys. */
6541 
6542     register Py_ssize_t len;
6543     register Py_UNICODE *p;
6544     register long x;
6545 
6546     if (self->hash != -1)
6547         return self->hash;
6548     len = PyUnicode_GET_SIZE(self);
6549     p = PyUnicode_AS_UNICODE(self);
6550     x = *p << 7;
6551     while (--len >= 0)
6552         x = (1000003*x) ^ *p++;
6553     x ^= PyUnicode_GET_SIZE(self);
6554     if (x == -1)
6555         x = -2;
6556     self->hash = x;
6557     return x;
6558 }
6559 
6560 PyDoc_STRVAR(index__doc__,
6561              "S.index(sub [,start [,end]]) -> int\n\
6562 \n\
6563 Like S.find() but raise ValueError when the substring is not found.");
6564 
6565 static PyObject *
unicode_index(PyUnicodeObject * self,PyObject * args)6566 unicode_index(PyUnicodeObject *self, PyObject *args)
6567 {
6568     Py_ssize_t result;
6569     PyUnicodeObject *substring;
6570     Py_ssize_t start;
6571     Py_ssize_t end;
6572 
6573     if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6574                                             &start, &end))
6575         return NULL;
6576 
6577     result = stringlib_find_slice(
6578         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6579         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6580         start, end
6581         );
6582 
6583     Py_DECREF(substring);
6584 
6585     if (result < 0) {
6586         PyErr_SetString(PyExc_ValueError, "substring not found");
6587         return NULL;
6588     }
6589 
6590     return PyInt_FromSsize_t(result);
6591 }
6592 
6593 PyDoc_STRVAR(islower__doc__,
6594              "S.islower() -> bool\n\
6595 \n\
6596 Return True if all cased characters in S are lowercase and there is\n\
6597 at least one cased character in S, False otherwise.");
6598 
6599 static PyObject*
unicode_islower(PyUnicodeObject * self)6600 unicode_islower(PyUnicodeObject *self)
6601 {
6602     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6603     register const Py_UNICODE *e;
6604     int cased;
6605 
6606     /* Shortcut for single character strings */
6607     if (PyUnicode_GET_SIZE(self) == 1)
6608         return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6609 
6610     /* Special case for empty strings */
6611     if (PyUnicode_GET_SIZE(self) == 0)
6612         return PyBool_FromLong(0);
6613 
6614     e = p + PyUnicode_GET_SIZE(self);
6615     cased = 0;
6616     for (; p < e; p++) {
6617         register const Py_UNICODE ch = *p;
6618 
6619         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6620             return PyBool_FromLong(0);
6621         else if (!cased && Py_UNICODE_ISLOWER(ch))
6622             cased = 1;
6623     }
6624     return PyBool_FromLong(cased);
6625 }
6626 
6627 PyDoc_STRVAR(isupper__doc__,
6628              "S.isupper() -> bool\n\
6629 \n\
6630 Return True if all cased characters in S are uppercase and there is\n\
6631 at least one cased character in S, False otherwise.");
6632 
6633 static PyObject*
unicode_isupper(PyUnicodeObject * self)6634 unicode_isupper(PyUnicodeObject *self)
6635 {
6636     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6637     register const Py_UNICODE *e;
6638     int cased;
6639 
6640     /* Shortcut for single character strings */
6641     if (PyUnicode_GET_SIZE(self) == 1)
6642         return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6643 
6644     /* Special case for empty strings */
6645     if (PyUnicode_GET_SIZE(self) == 0)
6646         return PyBool_FromLong(0);
6647 
6648     e = p + PyUnicode_GET_SIZE(self);
6649     cased = 0;
6650     for (; p < e; p++) {
6651         register const Py_UNICODE ch = *p;
6652 
6653         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6654             return PyBool_FromLong(0);
6655         else if (!cased && Py_UNICODE_ISUPPER(ch))
6656             cased = 1;
6657     }
6658     return PyBool_FromLong(cased);
6659 }
6660 
6661 PyDoc_STRVAR(istitle__doc__,
6662              "S.istitle() -> bool\n\
6663 \n\
6664 Return True if S is a titlecased string and there is at least one\n\
6665 character in S, i.e. upper- and titlecase characters may only\n\
6666 follow uncased characters and lowercase characters only cased ones.\n\
6667 Return False otherwise.");
6668 
6669 static PyObject*
unicode_istitle(PyUnicodeObject * self)6670 unicode_istitle(PyUnicodeObject *self)
6671 {
6672     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6673     register const Py_UNICODE *e;
6674     int cased, previous_is_cased;
6675 
6676     /* Shortcut for single character strings */
6677     if (PyUnicode_GET_SIZE(self) == 1)
6678         return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6679                                (Py_UNICODE_ISUPPER(*p) != 0));
6680 
6681     /* Special case for empty strings */
6682     if (PyUnicode_GET_SIZE(self) == 0)
6683         return PyBool_FromLong(0);
6684 
6685     e = p + PyUnicode_GET_SIZE(self);
6686     cased = 0;
6687     previous_is_cased = 0;
6688     for (; p < e; p++) {
6689         register const Py_UNICODE ch = *p;
6690 
6691         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6692             if (previous_is_cased)
6693                 return PyBool_FromLong(0);
6694             previous_is_cased = 1;
6695             cased = 1;
6696         }
6697         else if (Py_UNICODE_ISLOWER(ch)) {
6698             if (!previous_is_cased)
6699                 return PyBool_FromLong(0);
6700             previous_is_cased = 1;
6701             cased = 1;
6702         }
6703         else
6704             previous_is_cased = 0;
6705     }
6706     return PyBool_FromLong(cased);
6707 }
6708 
6709 PyDoc_STRVAR(isspace__doc__,
6710              "S.isspace() -> bool\n\
6711 \n\
6712 Return True if all characters in S are whitespace\n\
6713 and there is at least one character in S, False otherwise.");
6714 
6715 static PyObject*
unicode_isspace(PyUnicodeObject * self)6716 unicode_isspace(PyUnicodeObject *self)
6717 {
6718     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6719     register const Py_UNICODE *e;
6720 
6721     /* Shortcut for single character strings */
6722     if (PyUnicode_GET_SIZE(self) == 1 &&
6723         Py_UNICODE_ISSPACE(*p))
6724         return PyBool_FromLong(1);
6725 
6726     /* Special case for empty strings */
6727     if (PyUnicode_GET_SIZE(self) == 0)
6728         return PyBool_FromLong(0);
6729 
6730     e = p + PyUnicode_GET_SIZE(self);
6731     for (; p < e; p++) {
6732         if (!Py_UNICODE_ISSPACE(*p))
6733             return PyBool_FromLong(0);
6734     }
6735     return PyBool_FromLong(1);
6736 }
6737 
6738 PyDoc_STRVAR(isalpha__doc__,
6739              "S.isalpha() -> bool\n\
6740 \n\
6741 Return True if all characters in S are alphabetic\n\
6742 and there is at least one character in S, False otherwise.");
6743 
6744 static PyObject*
unicode_isalpha(PyUnicodeObject * self)6745 unicode_isalpha(PyUnicodeObject *self)
6746 {
6747     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6748     register const Py_UNICODE *e;
6749 
6750     /* Shortcut for single character strings */
6751     if (PyUnicode_GET_SIZE(self) == 1 &&
6752         Py_UNICODE_ISALPHA(*p))
6753         return PyBool_FromLong(1);
6754 
6755     /* Special case for empty strings */
6756     if (PyUnicode_GET_SIZE(self) == 0)
6757         return PyBool_FromLong(0);
6758 
6759     e = p + PyUnicode_GET_SIZE(self);
6760     for (; p < e; p++) {
6761         if (!Py_UNICODE_ISALPHA(*p))
6762             return PyBool_FromLong(0);
6763     }
6764     return PyBool_FromLong(1);
6765 }
6766 
6767 PyDoc_STRVAR(isalnum__doc__,
6768              "S.isalnum() -> bool\n\
6769 \n\
6770 Return True if all characters in S are alphanumeric\n\
6771 and there is at least one character in S, False otherwise.");
6772 
6773 static PyObject*
unicode_isalnum(PyUnicodeObject * self)6774 unicode_isalnum(PyUnicodeObject *self)
6775 {
6776     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6777     register const Py_UNICODE *e;
6778 
6779     /* Shortcut for single character strings */
6780     if (PyUnicode_GET_SIZE(self) == 1 &&
6781         Py_UNICODE_ISALNUM(*p))
6782         return PyBool_FromLong(1);
6783 
6784     /* Special case for empty strings */
6785     if (PyUnicode_GET_SIZE(self) == 0)
6786         return PyBool_FromLong(0);
6787 
6788     e = p + PyUnicode_GET_SIZE(self);
6789     for (; p < e; p++) {
6790         if (!Py_UNICODE_ISALNUM(*p))
6791             return PyBool_FromLong(0);
6792     }
6793     return PyBool_FromLong(1);
6794 }
6795 
6796 PyDoc_STRVAR(isdecimal__doc__,
6797              "S.isdecimal() -> bool\n\
6798 \n\
6799 Return True if there are only decimal characters in S,\n\
6800 False otherwise.");
6801 
6802 static PyObject*
unicode_isdecimal(PyUnicodeObject * self)6803 unicode_isdecimal(PyUnicodeObject *self)
6804 {
6805     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6806     register const Py_UNICODE *e;
6807 
6808     /* Shortcut for single character strings */
6809     if (PyUnicode_GET_SIZE(self) == 1 &&
6810         Py_UNICODE_ISDECIMAL(*p))
6811         return PyBool_FromLong(1);
6812 
6813     /* Special case for empty strings */
6814     if (PyUnicode_GET_SIZE(self) == 0)
6815         return PyBool_FromLong(0);
6816 
6817     e = p + PyUnicode_GET_SIZE(self);
6818     for (; p < e; p++) {
6819         if (!Py_UNICODE_ISDECIMAL(*p))
6820             return PyBool_FromLong(0);
6821     }
6822     return PyBool_FromLong(1);
6823 }
6824 
6825 PyDoc_STRVAR(isdigit__doc__,
6826              "S.isdigit() -> bool\n\
6827 \n\
6828 Return True if all characters in S are digits\n\
6829 and there is at least one character in S, False otherwise.");
6830 
6831 static PyObject*
unicode_isdigit(PyUnicodeObject * self)6832 unicode_isdigit(PyUnicodeObject *self)
6833 {
6834     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6835     register const Py_UNICODE *e;
6836 
6837     /* Shortcut for single character strings */
6838     if (PyUnicode_GET_SIZE(self) == 1 &&
6839         Py_UNICODE_ISDIGIT(*p))
6840         return PyBool_FromLong(1);
6841 
6842     /* Special case for empty strings */
6843     if (PyUnicode_GET_SIZE(self) == 0)
6844         return PyBool_FromLong(0);
6845 
6846     e = p + PyUnicode_GET_SIZE(self);
6847     for (; p < e; p++) {
6848         if (!Py_UNICODE_ISDIGIT(*p))
6849             return PyBool_FromLong(0);
6850     }
6851     return PyBool_FromLong(1);
6852 }
6853 
6854 PyDoc_STRVAR(isnumeric__doc__,
6855              "S.isnumeric() -> bool\n\
6856 \n\
6857 Return True if there are only numeric characters in S,\n\
6858 False otherwise.");
6859 
6860 static PyObject*
unicode_isnumeric(PyUnicodeObject * self)6861 unicode_isnumeric(PyUnicodeObject *self)
6862 {
6863     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6864     register const Py_UNICODE *e;
6865 
6866     /* Shortcut for single character strings */
6867     if (PyUnicode_GET_SIZE(self) == 1 &&
6868         Py_UNICODE_ISNUMERIC(*p))
6869         return PyBool_FromLong(1);
6870 
6871     /* Special case for empty strings */
6872     if (PyUnicode_GET_SIZE(self) == 0)
6873         return PyBool_FromLong(0);
6874 
6875     e = p + PyUnicode_GET_SIZE(self);
6876     for (; p < e; p++) {
6877         if (!Py_UNICODE_ISNUMERIC(*p))
6878             return PyBool_FromLong(0);
6879     }
6880     return PyBool_FromLong(1);
6881 }
6882 
6883 PyDoc_STRVAR(join__doc__,
6884              "S.join(iterable) -> unicode\n\
6885 \n\
6886 Return a string which is the concatenation of the strings in the\n\
6887 iterable.  The separator between elements is S.");
6888 
6889 static PyObject*
unicode_join(PyObject * self,PyObject * data)6890 unicode_join(PyObject *self, PyObject *data)
6891 {
6892     return PyUnicode_Join(self, data);
6893 }
6894 
6895 static Py_ssize_t
unicode_length(PyUnicodeObject * self)6896 unicode_length(PyUnicodeObject *self)
6897 {
6898     return self->length;
6899 }
6900 
6901 PyDoc_STRVAR(ljust__doc__,
6902              "S.ljust(width[, fillchar]) -> int\n\
6903 \n\
6904 Return S left-justified in a Unicode string of length width. Padding is\n\
6905 done using the specified fill character (default is a space).");
6906 
6907 static PyObject *
unicode_ljust(PyUnicodeObject * self,PyObject * args)6908 unicode_ljust(PyUnicodeObject *self, PyObject *args)
6909 {
6910     Py_ssize_t width;
6911     Py_UNICODE fillchar = ' ';
6912 
6913     if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
6914         return NULL;
6915 
6916     if (self->length >= width && PyUnicode_CheckExact(self)) {
6917         Py_INCREF(self);
6918         return (PyObject*) self;
6919     }
6920 
6921     return (PyObject*) pad(self, 0, width - self->length, fillchar);
6922 }
6923 
6924 PyDoc_STRVAR(lower__doc__,
6925              "S.lower() -> unicode\n\
6926 \n\
6927 Return a copy of the string S converted to lowercase.");
6928 
6929 static PyObject*
unicode_lower(PyUnicodeObject * self)6930 unicode_lower(PyUnicodeObject *self)
6931 {
6932     return fixup(self, fixlower);
6933 }
6934 
/* Strip modes; the values double as indices into stripformat[] */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string minus its "|O:" prefix */
#define STRIPNAME(i) (stripformat[(i)] + 3)
6943 
6944 /* externally visible for str.strip(unicode) */
6945 PyObject *
_PyUnicode_XStrip(PyUnicodeObject * self,int striptype,PyObject * sepobj)6946 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6947 {
6948     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6949     Py_ssize_t len = PyUnicode_GET_SIZE(self);
6950     Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6951     Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6952     Py_ssize_t i, j;
6953 
6954     BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6955 
6956     i = 0;
6957     if (striptype != RIGHTSTRIP) {
6958         while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6959             i++;
6960         }
6961     }
6962 
6963     j = len;
6964     if (striptype != LEFTSTRIP) {
6965         do {
6966             j--;
6967         } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6968         j++;
6969     }
6970 
6971     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6972         Py_INCREF(self);
6973         return (PyObject*)self;
6974     }
6975     else
6976         return PyUnicode_FromUnicode(s+i, j-i);
6977 }
6978 
6979 
6980 static PyObject *
do_strip(PyUnicodeObject * self,int striptype)6981 do_strip(PyUnicodeObject *self, int striptype)
6982 {
6983     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6984     Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
6985 
6986     i = 0;
6987     if (striptype != RIGHTSTRIP) {
6988         while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6989             i++;
6990         }
6991     }
6992 
6993     j = len;
6994     if (striptype != LEFTSTRIP) {
6995         do {
6996             j--;
6997         } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6998         j++;
6999     }
7000 
7001     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7002         Py_INCREF(self);
7003         return (PyObject*)self;
7004     }
7005     else
7006         return PyUnicode_FromUnicode(s+i, j-i);
7007 }
7008 
7009 
7010 static PyObject *
do_argstrip(PyUnicodeObject * self,int striptype,PyObject * args)7011 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7012 {
7013     PyObject *sep = NULL;
7014 
7015     if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7016         return NULL;
7017 
7018     if (sep != NULL && sep != Py_None) {
7019         if (PyUnicode_Check(sep))
7020             return _PyUnicode_XStrip(self, striptype, sep);
7021         else if (PyString_Check(sep)) {
7022             PyObject *res;
7023             sep = PyUnicode_FromObject(sep);
7024             if (sep==NULL)
7025                 return NULL;
7026             res = _PyUnicode_XStrip(self, striptype, sep);
7027             Py_DECREF(sep);
7028             return res;
7029         }
7030         else {
7031             PyErr_Format(PyExc_TypeError,
7032                          "%s arg must be None, unicode or str",
7033                          STRIPNAME(striptype));
7034             return NULL;
7035         }
7036     }
7037 
7038     return do_strip(self, striptype);
7039 }
7040 
7041 
7042 PyDoc_STRVAR(strip__doc__,
7043              "S.strip([chars]) -> unicode\n\
7044 \n\
7045 Return a copy of the string S with leading and trailing\n\
7046 whitespace removed.\n\
7047 If chars is given and not None, remove characters in chars instead.\n\
7048 If chars is a str, it will be converted to unicode before stripping");
7049 
7050 static PyObject *
unicode_strip(PyUnicodeObject * self,PyObject * args)7051 unicode_strip(PyUnicodeObject *self, PyObject *args)
7052 {
7053     if (PyTuple_GET_SIZE(args) == 0)
7054         return do_strip(self, BOTHSTRIP); /* Common case */
7055     else
7056         return do_argstrip(self, BOTHSTRIP, args);
7057 }
7058 
7059 
7060 PyDoc_STRVAR(lstrip__doc__,
7061              "S.lstrip([chars]) -> unicode\n\
7062 \n\
7063 Return a copy of the string S with leading whitespace removed.\n\
7064 If chars is given and not None, remove characters in chars instead.\n\
7065 If chars is a str, it will be converted to unicode before stripping");
7066 
7067 static PyObject *
unicode_lstrip(PyUnicodeObject * self,PyObject * args)7068 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7069 {
7070     if (PyTuple_GET_SIZE(args) == 0)
7071         return do_strip(self, LEFTSTRIP); /* Common case */
7072     else
7073         return do_argstrip(self, LEFTSTRIP, args);
7074 }
7075 
7076 
7077 PyDoc_STRVAR(rstrip__doc__,
7078              "S.rstrip([chars]) -> unicode\n\
7079 \n\
7080 Return a copy of the string S with trailing whitespace removed.\n\
7081 If chars is given and not None, remove characters in chars instead.\n\
7082 If chars is a str, it will be converted to unicode before stripping");
7083 
7084 static PyObject *
unicode_rstrip(PyUnicodeObject * self,PyObject * args)7085 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7086 {
7087     if (PyTuple_GET_SIZE(args) == 0)
7088         return do_strip(self, RIGHTSTRIP); /* Common case */
7089     else
7090         return do_argstrip(self, RIGHTSTRIP, args);
7091 }
7092 
7093 
7094 static PyObject*
unicode_repeat(PyUnicodeObject * str,Py_ssize_t len)7095 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7096 {
7097     PyUnicodeObject *u;
7098     Py_UNICODE *p;
7099     Py_ssize_t nchars;
7100     size_t nbytes;
7101 
7102     if (len < 0)
7103         len = 0;
7104 
7105     if (len == 1 && PyUnicode_CheckExact(str)) {
7106         /* no repeat, return original string */
7107         Py_INCREF(str);
7108         return (PyObject*) str;
7109     }
7110 
7111     /* ensure # of chars needed doesn't overflow int and # of bytes
7112      * needed doesn't overflow size_t
7113      */
7114     nchars = len * str->length;
7115     if (len && nchars / len != str->length) {
7116         PyErr_SetString(PyExc_OverflowError,
7117                         "repeated string is too long");
7118         return NULL;
7119     }
7120     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7121     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7122         PyErr_SetString(PyExc_OverflowError,
7123                         "repeated string is too long");
7124         return NULL;
7125     }
7126     u = _PyUnicode_New(nchars);
7127     if (!u)
7128         return NULL;
7129 
7130     p = u->str;
7131 
7132     if (str->length == 1 && len > 0) {
7133         Py_UNICODE_FILL(p, str->str[0], len);
7134     } else {
7135         Py_ssize_t done = 0; /* number of characters copied this far */
7136         if (done < nchars) {
7137             Py_UNICODE_COPY(p, str->str, str->length);
7138             done = str->length;
7139         }
7140         while (done < nchars) {
7141             Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7142             Py_UNICODE_COPY(p+done, p, n);
7143             done += n;
7144         }
7145     }
7146 
7147     return (PyObject*) u;
7148 }
7149 
/* Public API: coerce obj, subobj and replobj with PyUnicode_FromObject()
   and pass them, with maxcount, to replace().  Returns the result of
   replace(), or NULL if any coercion fails.  The temporary coerced
   objects are released before returning. */
PyObject *PyUnicode_Replace(PyObject *obj,
                            PyObject *subobj,
                            PyObject *replobj,
                            Py_ssize_t maxcount)
{
    PyObject *text;
    PyObject *old = NULL;
    PyObject *repl = NULL;
    PyObject *result = NULL;

    text = PyUnicode_FromObject(obj);
    if (text == NULL)
        return NULL;

    old = PyUnicode_FromObject(subobj);
    if (old == NULL)
        goto cleanup;

    repl = PyUnicode_FromObject(replobj);
    if (repl == NULL)
        goto cleanup;

    result = replace((PyUnicodeObject *)text,
                     (PyUnicodeObject *)old,
                     (PyUnicodeObject *)repl,
                     maxcount);

  cleanup:
    Py_DECREF(text);
    Py_XDECREF(old);
    Py_XDECREF(repl);
    return result;
}
7183 
7184 PyDoc_STRVAR(replace__doc__,
7185              "S.replace(old, new[, count]) -> unicode\n\
7186 \n\
7187 Return a copy of S with all occurrences of substring\n\
7188 old replaced by new.  If the optional argument count is\n\
7189 given, only the first count occurrences are replaced.");
7190 
7191 static PyObject*
unicode_replace(PyUnicodeObject * self,PyObject * args)7192 unicode_replace(PyUnicodeObject *self, PyObject *args)
7193 {
7194     PyUnicodeObject *str1;
7195     PyUnicodeObject *str2;
7196     Py_ssize_t maxcount = -1;
7197     PyObject *result;
7198 
7199     if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7200         return NULL;
7201     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7202     if (str1 == NULL)
7203         return NULL;
7204     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7205     if (str2 == NULL) {
7206         Py_DECREF(str1);
7207         return NULL;
7208     }
7209 
7210     result = replace(self, str1, str2, maxcount);
7211 
7212     Py_DECREF(str1);
7213     Py_DECREF(str2);
7214     return result;
7215 }
7216 
7217 static
unicode_repr(PyObject * unicode)7218 PyObject *unicode_repr(PyObject *unicode)
7219 {
7220     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7221                                 PyUnicode_GET_SIZE(unicode),
7222                                 1);
7223 }
7224 
7225 PyDoc_STRVAR(rfind__doc__,
7226              "S.rfind(sub [,start [,end]]) -> int\n\
7227 \n\
7228 Return the highest index in S where substring sub is found,\n\
7229 such that sub is contained within s[start:end].  Optional\n\
7230 arguments start and end are interpreted as in slice notation.\n\
7231 \n\
7232 Return -1 on failure.");
7233 
7234 static PyObject *
unicode_rfind(PyUnicodeObject * self,PyObject * args)7235 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7236 {
7237     PyUnicodeObject *substring;
7238     Py_ssize_t start;
7239     Py_ssize_t end;
7240     Py_ssize_t result;
7241 
7242     if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7243                                             &start, &end))
7244         return NULL;
7245 
7246     result = stringlib_rfind_slice(
7247         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7248         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7249         start, end
7250         );
7251 
7252     Py_DECREF(substring);
7253 
7254     return PyInt_FromSsize_t(result);
7255 }
7256 
7257 PyDoc_STRVAR(rindex__doc__,
7258              "S.rindex(sub [,start [,end]]) -> int\n\
7259 \n\
7260 Like S.rfind() but raise ValueError when the substring is not found.");
7261 
7262 static PyObject *
unicode_rindex(PyUnicodeObject * self,PyObject * args)7263 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7264 {
7265     PyUnicodeObject *substring;
7266     Py_ssize_t start;
7267     Py_ssize_t end;
7268     Py_ssize_t result;
7269 
7270     if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7271                                             &start, &end))
7272         return NULL;
7273 
7274     result = stringlib_rfind_slice(
7275         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7276         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7277         start, end
7278         );
7279 
7280     Py_DECREF(substring);
7281 
7282     if (result < 0) {
7283         PyErr_SetString(PyExc_ValueError, "substring not found");
7284         return NULL;
7285     }
7286     return PyInt_FromSsize_t(result);
7287 }
7288 
7289 PyDoc_STRVAR(rjust__doc__,
7290              "S.rjust(width[, fillchar]) -> unicode\n\
7291 \n\
7292 Return S right-justified in a Unicode string of length width. Padding is\n\
7293 done using the specified fill character (default is a space).");
7294 
7295 static PyObject *
unicode_rjust(PyUnicodeObject * self,PyObject * args)7296 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7297 {
7298     Py_ssize_t width;
7299     Py_UNICODE fillchar = ' ';
7300 
7301     if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7302         return NULL;
7303 
7304     if (self->length >= width && PyUnicode_CheckExact(self)) {
7305         Py_INCREF(self);
7306         return (PyObject*) self;
7307     }
7308 
7309     return (PyObject*) pad(self, width - self->length, 0, fillchar);
7310 }
7311 
7312 static PyObject*
unicode_slice(PyUnicodeObject * self,Py_ssize_t start,Py_ssize_t end)7313 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7314 {
7315     /* standard clamping */
7316     if (start < 0)
7317         start = 0;
7318     if (end < 0)
7319         end = 0;
7320     if (end > self->length)
7321         end = self->length;
7322     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7323         /* full slice, return original string */
7324         Py_INCREF(self);
7325         return (PyObject*) self;
7326     }
7327     if (start > end)
7328         start = end;
7329     /* copy slice */
7330     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7331                                              end - start);
7332 }
7333 
/* Public API: coerce s (and sep, when it is not NULL) with
   PyUnicode_FromObject() and call split().  A NULL sep is passed on to
   split() unchanged.  Returns NULL if a coercion fails. */
PyObject *PyUnicode_Split(PyObject *s,
                          PyObject *sep,
                          Py_ssize_t maxsplit)
{
    PyObject *text;
    PyObject *delim = NULL;
    PyObject *result;

    text = PyUnicode_FromObject(s);
    if (text == NULL)
        return NULL;

    if (sep != NULL) {
        delim = PyUnicode_FromObject(sep);
        if (delim == NULL) {
            Py_DECREF(text);
            return NULL;
        }
    }

    result = split((PyUnicodeObject *)text, (PyUnicodeObject *)delim,
                   maxsplit);

    Py_DECREF(text);
    Py_XDECREF(delim);
    return result;
}
7357 
7358 PyDoc_STRVAR(split__doc__,
7359              "S.split([sep [,maxsplit]]) -> list of strings\n\
7360 \n\
7361 Return a list of the words in S, using sep as the\n\
7362 delimiter string.  If maxsplit is given, at most maxsplit\n\
7363 splits are done. If sep is not specified or is None, any\n\
7364 whitespace string is a separator and empty strings are\n\
7365 removed from the result.");
7366 
7367 static PyObject*
unicode_split(PyUnicodeObject * self,PyObject * args)7368 unicode_split(PyUnicodeObject *self, PyObject *args)
7369 {
7370     PyObject *substring = Py_None;
7371     Py_ssize_t maxcount = -1;
7372 
7373     if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7374         return NULL;
7375 
7376     if (substring == Py_None)
7377         return split(self, NULL, maxcount);
7378     else if (PyUnicode_Check(substring))
7379         return split(self, (PyUnicodeObject *)substring, maxcount);
7380     else
7381         return PyUnicode_Split((PyObject *)self, substring, maxcount);
7382 }
7383 
7384 PyObject *
PyUnicode_Partition(PyObject * str_in,PyObject * sep_in)7385 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7386 {
7387     PyObject* str_obj;
7388     PyObject* sep_obj;
7389     PyObject* out;
7390 
7391     str_obj = PyUnicode_FromObject(str_in);
7392     if (!str_obj)
7393         return NULL;
7394     sep_obj = PyUnicode_FromObject(sep_in);
7395     if (!sep_obj) {
7396         Py_DECREF(str_obj);
7397         return NULL;
7398     }
7399 
7400     out = stringlib_partition(
7401         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7402         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7403         );
7404 
7405     Py_DECREF(sep_obj);
7406     Py_DECREF(str_obj);
7407 
7408     return out;
7409 }
7410 
7411 
7412 PyObject *
PyUnicode_RPartition(PyObject * str_in,PyObject * sep_in)7413 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7414 {
7415     PyObject* str_obj;
7416     PyObject* sep_obj;
7417     PyObject* out;
7418 
7419     str_obj = PyUnicode_FromObject(str_in);
7420     if (!str_obj)
7421         return NULL;
7422     sep_obj = PyUnicode_FromObject(sep_in);
7423     if (!sep_obj) {
7424         Py_DECREF(str_obj);
7425         return NULL;
7426     }
7427 
7428     out = stringlib_rpartition(
7429         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7430         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7431         );
7432 
7433     Py_DECREF(sep_obj);
7434     Py_DECREF(str_obj);
7435 
7436     return out;
7437 }
7438 
7439 PyDoc_STRVAR(partition__doc__,
7440              "S.partition(sep) -> (head, sep, tail)\n\
7441 \n\
7442 Search for the separator sep in S, and return the part before it,\n\
7443 the separator itself, and the part after it.  If the separator is not\n\
7444 found, return S and two empty strings.");
7445 
7446 static PyObject*
unicode_partition(PyUnicodeObject * self,PyObject * separator)7447 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7448 {
7449     return PyUnicode_Partition((PyObject *)self, separator);
7450 }
7451 
7452 PyDoc_STRVAR(rpartition__doc__,
7453              "S.rpartition(sep) -> (head, sep, tail)\n\
7454 \n\
7455 Search for the separator sep in S, starting at the end of S, and return\n\
7456 the part before it, the separator itself, and the part after it.  If the\n\
7457 separator is not found, return two empty strings and S.");
7458 
7459 static PyObject*
unicode_rpartition(PyUnicodeObject * self,PyObject * separator)7460 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7461 {
7462     return PyUnicode_RPartition((PyObject *)self, separator);
7463 }
7464 
/* Public API: coerce s (and sep, when it is not NULL) with
   PyUnicode_FromObject() and call rsplit().  A NULL sep is passed on to
   rsplit() unchanged.  Returns NULL if a coercion fails. */
PyObject *PyUnicode_RSplit(PyObject *s,
                           PyObject *sep,
                           Py_ssize_t maxsplit)
{
    PyObject *text;
    PyObject *delim = NULL;
    PyObject *result;

    text = PyUnicode_FromObject(s);
    if (text == NULL)
        return NULL;

    if (sep != NULL) {
        delim = PyUnicode_FromObject(sep);
        if (delim == NULL) {
            Py_DECREF(text);
            return NULL;
        }
    }

    result = rsplit((PyUnicodeObject *)text, (PyUnicodeObject *)delim,
                    maxsplit);

    Py_DECREF(text);
    Py_XDECREF(delim);
    return result;
}
7488 
7489 PyDoc_STRVAR(rsplit__doc__,
7490              "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7491 \n\
7492 Return a list of the words in S, using sep as the\n\
7493 delimiter string, starting at the end of the string and\n\
7494 working to the front.  If maxsplit is given, at most maxsplit\n\
7495 splits are done. If sep is not specified, any whitespace string\n\
7496 is a separator.");
7497 
7498 static PyObject*
unicode_rsplit(PyUnicodeObject * self,PyObject * args)7499 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7500 {
7501     PyObject *substring = Py_None;
7502     Py_ssize_t maxcount = -1;
7503 
7504     if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7505         return NULL;
7506 
7507     if (substring == Py_None)
7508         return rsplit(self, NULL, maxcount);
7509     else if (PyUnicode_Check(substring))
7510         return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7511     else
7512         return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7513 }
7514 
7515 PyDoc_STRVAR(splitlines__doc__,
7516              "S.splitlines([keepends]) -> list of strings\n\
7517 \n\
7518 Return a list of the lines in S, breaking at line boundaries.\n\
7519 Line breaks are not included in the resulting list unless keepends\n\
7520 is given and true.");
7521 
7522 static PyObject*
unicode_splitlines(PyUnicodeObject * self,PyObject * args)7523 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7524 {
7525     int keepends = 0;
7526 
7527     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7528         return NULL;
7529 
7530     return PyUnicode_Splitlines((PyObject *)self, keepends);
7531 }
7532 
7533 static
unicode_str(PyUnicodeObject * self)7534 PyObject *unicode_str(PyUnicodeObject *self)
7535 {
7536     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7537 }
7538 
7539 PyDoc_STRVAR(swapcase__doc__,
7540              "S.swapcase() -> unicode\n\
7541 \n\
7542 Return a copy of S with uppercase characters converted to lowercase\n\
7543 and vice versa.");
7544 
7545 static PyObject*
unicode_swapcase(PyUnicodeObject * self)7546 unicode_swapcase(PyUnicodeObject *self)
7547 {
7548     return fixup(self, fixswapcase);
7549 }
7550 
7551 PyDoc_STRVAR(translate__doc__,
7552              "S.translate(table) -> unicode\n\
7553 \n\
7554 Return a copy of the string S, where all characters have been mapped\n\
7555 through the given translation table, which must be a mapping of\n\
7556 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7557 Unmapped characters are left untouched. Characters mapped to None\n\
7558 are deleted.");
7559 
7560 static PyObject*
unicode_translate(PyUnicodeObject * self,PyObject * table)7561 unicode_translate(PyUnicodeObject *self, PyObject *table)
7562 {
7563     return PyUnicode_TranslateCharmap(self->str,
7564                                       self->length,
7565                                       table,
7566                                       "ignore");
7567 }
7568 
7569 PyDoc_STRVAR(upper__doc__,
7570              "S.upper() -> unicode\n\
7571 \n\
7572 Return a copy of S converted to uppercase.");
7573 
7574 static PyObject*
unicode_upper(PyUnicodeObject * self)7575 unicode_upper(PyUnicodeObject *self)
7576 {
7577     return fixup(self, fixupper);
7578 }
7579 
7580 PyDoc_STRVAR(zfill__doc__,
7581              "S.zfill(width) -> unicode\n\
7582 \n\
7583 Pad a numeric string S with zeros on the left, to fill a field\n\
7584 of the specified width. The string S is never truncated.");
7585 
7586 static PyObject *
unicode_zfill(PyUnicodeObject * self,PyObject * args)7587 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7588 {
7589     Py_ssize_t fill;
7590     PyUnicodeObject *u;
7591 
7592     Py_ssize_t width;
7593     if (!PyArg_ParseTuple(args, "n:zfill", &width))
7594         return NULL;
7595 
7596     if (self->length >= width) {
7597         if (PyUnicode_CheckExact(self)) {
7598             Py_INCREF(self);
7599             return (PyObject*) self;
7600         }
7601         else
7602             return PyUnicode_FromUnicode(
7603                 PyUnicode_AS_UNICODE(self),
7604                 PyUnicode_GET_SIZE(self)
7605                 );
7606     }
7607 
7608     fill = width - self->length;
7609 
7610     u = pad(self, fill, 0, '0');
7611 
7612     if (u == NULL)
7613         return NULL;
7614 
7615     if (u->str[fill] == '+' || u->str[fill] == '-') {
7616         /* move sign to beginning of string */
7617         u->str[0] = u->str[fill];
7618         u->str[fill] = '0';
7619     }
7620 
7621     return (PyObject*) u;
7622 }
7623 
#if 0
/* Compiled out.  Returns the file-level numfree counter as a Python int;
   the matching "freelistsize" method-table entry is disabled as well. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(numfree);
}
#endif
7631 
7632 PyDoc_STRVAR(startswith__doc__,
7633              "S.startswith(prefix[, start[, end]]) -> bool\n\
7634 \n\
7635 Return True if S starts with the specified prefix, False otherwise.\n\
7636 With optional start, test S beginning at that position.\n\
7637 With optional end, stop comparing S at that position.\n\
7638 prefix can also be a tuple of strings to try.");
7639 
7640 static PyObject *
unicode_startswith(PyUnicodeObject * self,PyObject * args)7641 unicode_startswith(PyUnicodeObject *self,
7642                    PyObject *args)
7643 {
7644     PyObject *subobj;
7645     PyUnicodeObject *substring;
7646     Py_ssize_t start = 0;
7647     Py_ssize_t end = PY_SSIZE_T_MAX;
7648     int result;
7649 
7650     if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
7651         return NULL;
7652     if (PyTuple_Check(subobj)) {
7653         Py_ssize_t i;
7654         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7655             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7656                 PyTuple_GET_ITEM(subobj, i));
7657             if (substring == NULL)
7658                 return NULL;
7659             result = tailmatch(self, substring, start, end, -1);
7660             Py_DECREF(substring);
7661             if (result) {
7662                 Py_RETURN_TRUE;
7663             }
7664         }
7665         /* nothing matched */
7666         Py_RETURN_FALSE;
7667     }
7668     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7669     if (substring == NULL) {
7670         if (PyErr_ExceptionMatches(PyExc_TypeError))
7671             PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7672                          "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
7673         return NULL;
7674     }
7675     result = tailmatch(self, substring, start, end, -1);
7676     Py_DECREF(substring);
7677     return PyBool_FromLong(result);
7678 }
7679 
7680 
7681 PyDoc_STRVAR(endswith__doc__,
7682              "S.endswith(suffix[, start[, end]]) -> bool\n\
7683 \n\
7684 Return True if S ends with the specified suffix, False otherwise.\n\
7685 With optional start, test S beginning at that position.\n\
7686 With optional end, stop comparing S at that position.\n\
7687 suffix can also be a tuple of strings to try.");
7688 
7689 static PyObject *
unicode_endswith(PyUnicodeObject * self,PyObject * args)7690 unicode_endswith(PyUnicodeObject *self,
7691                  PyObject *args)
7692 {
7693     PyObject *subobj;
7694     PyUnicodeObject *substring;
7695     Py_ssize_t start = 0;
7696     Py_ssize_t end = PY_SSIZE_T_MAX;
7697     int result;
7698 
7699     if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
7700         return NULL;
7701     if (PyTuple_Check(subobj)) {
7702         Py_ssize_t i;
7703         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7704             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7705                 PyTuple_GET_ITEM(subobj, i));
7706             if (substring == NULL)
7707                 return NULL;
7708             result = tailmatch(self, substring, start, end, +1);
7709             Py_DECREF(substring);
7710             if (result) {
7711                 Py_RETURN_TRUE;
7712             }
7713         }
7714         Py_RETURN_FALSE;
7715     }
7716     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7717     if (substring == NULL) {
7718         if (PyErr_ExceptionMatches(PyExc_TypeError))
7719             PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7720                          "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
7721         return NULL;
7722     }
7723     result = tailmatch(self, substring, start, end, +1);
7724     Py_DECREF(substring);
7725     return PyBool_FromLong(result);
7726 }
7727 
7728 
7729 /* Implements do_string_format, which is unicode because of stringlib */
7730 #include "stringlib/string_format.h"
7731 
/* Docstring for the "format" method, which the unicode_methods table
   binds to do_string_format (not to unicode__format__ defined below). */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> unicode\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");
7737 
7738 static PyObject *
unicode__format__(PyObject * self,PyObject * args)7739 unicode__format__(PyObject *self, PyObject *args)
7740 {
7741     PyObject *format_spec;
7742     PyObject *result = NULL;
7743     PyObject *tmp = NULL;
7744 
7745     /* If 2.x, convert format_spec to the same type as value */
7746     /* This is to allow things like u''.format('') */
7747     if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7748         goto done;
7749     if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7750         PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7751                      "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7752         goto done;
7753     }
7754     tmp = PyObject_Unicode(format_spec);
7755     if (tmp == NULL)
7756         goto done;
7757     format_spec = tmp;
7758 
7759     result = _PyUnicode_FormatAdvanced(self,
7760                                        PyUnicode_AS_UNICODE(format_spec),
7761                                        PyUnicode_GET_SIZE(format_spec));
7762   done:
7763     Py_XDECREF(tmp);
7764     return result;
7765 }
7766 
7767 PyDoc_STRVAR(p_format__doc__,
7768              "S.__format__(format_spec) -> unicode\n\
7769 \n\
7770 Return a formatted version of S as described by format_spec.");
7771 
7772 static PyObject *
unicode__sizeof__(PyUnicodeObject * v)7773 unicode__sizeof__(PyUnicodeObject *v)
7774 {
7775     return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7776                              sizeof(Py_UNICODE) * (v->length + 1));
7777 }
7778 
7779 PyDoc_STRVAR(sizeof__doc__,
7780              "S.__sizeof__() -> size of S in memory, in bytes\n\
7781 \n\
7782 ");
7783 
7784 static PyObject *
unicode_getnewargs(PyUnicodeObject * v)7785 unicode_getnewargs(PyUnicodeObject *v)
7786 {
7787     return Py_BuildValue("(u#)", v->str, v->length);
7788 }
7789 
7790 
/* Method table for the unicode type.  Each entry gives the Python-level
   name, the C function, its calling-convention flags and, where
   provided, its docstring. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* The two _formatter_* entries below supply no docstring field. */
    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
#endif

    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    /* End-of-table marker. */
    {NULL, NULL}
};
7853 
7854 static PyObject *
unicode_mod(PyObject * v,PyObject * w)7855 unicode_mod(PyObject *v, PyObject *w)
7856 {
7857     if (!PyUnicode_Check(v)) {
7858         Py_INCREF(Py_NotImplemented);
7859         return Py_NotImplemented;
7860     }
7861     return PyUnicode_Format(v, w);
7862 }
7863 
7864 static PyNumberMethods unicode_as_number = {
7865     0,              /*nb_add*/
7866     0,              /*nb_subtract*/
7867     0,              /*nb_multiply*/
7868     0,              /*nb_divide*/
7869     unicode_mod,            /*nb_remainder*/
7870 };
7871 
7872 static PySequenceMethods unicode_as_sequence = {
7873     (lenfunc) unicode_length,       /* sq_length */
7874     PyUnicode_Concat,           /* sq_concat */
7875     (ssizeargfunc) unicode_repeat,  /* sq_repeat */
7876     (ssizeargfunc) unicode_getitem,     /* sq_item */
7877     (ssizessizeargfunc) unicode_slice,  /* sq_slice */
7878     0,                  /* sq_ass_item */
7879     0,                  /* sq_ass_slice */
7880     PyUnicode_Contains,         /* sq_contains */
7881 };
7882 
7883 static PyObject*
unicode_subscript(PyUnicodeObject * self,PyObject * item)7884 unicode_subscript(PyUnicodeObject* self, PyObject* item)
7885 {
7886     if (PyIndex_Check(item)) {
7887         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
7888         if (i == -1 && PyErr_Occurred())
7889             return NULL;
7890         if (i < 0)
7891             i += PyUnicode_GET_SIZE(self);
7892         return unicode_getitem(self, i);
7893     } else if (PySlice_Check(item)) {
7894         Py_ssize_t start, stop, step, slicelength, cur, i;
7895         Py_UNICODE* source_buf;
7896         Py_UNICODE* result_buf;
7897         PyObject* result;
7898 
7899         if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
7900                                  &start, &stop, &step, &slicelength) < 0) {
7901             return NULL;
7902         }
7903 
7904         if (slicelength <= 0) {
7905             return PyUnicode_FromUnicode(NULL, 0);
7906         } else if (start == 0 && step == 1 && slicelength == self->length &&
7907                    PyUnicode_CheckExact(self)) {
7908             Py_INCREF(self);
7909             return (PyObject *)self;
7910         } else if (step == 1) {
7911             return PyUnicode_FromUnicode(self->str + start, slicelength);
7912         } else {
7913             source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
7914             result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7915                                                        sizeof(Py_UNICODE));
7916 
7917             if (result_buf == NULL)
7918                 return PyErr_NoMemory();
7919 
7920             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7921                 result_buf[i] = source_buf[cur];
7922             }
7923 
7924             result = PyUnicode_FromUnicode(result_buf, slicelength);
7925             PyObject_FREE(result_buf);
7926             return result;
7927         }
7928     } else {
7929         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7930         return NULL;
7931     }
7932 }
7933 
7934 static PyMappingMethods unicode_as_mapping = {
7935     (lenfunc)unicode_length,        /* mp_length */
7936     (binaryfunc)unicode_subscript,  /* mp_subscript */
7937     (objobjargproc)0,           /* mp_ass_subscript */
7938 };
7939 
7940 static Py_ssize_t
unicode_buffer_getreadbuf(PyUnicodeObject * self,Py_ssize_t index,const void ** ptr)7941 unicode_buffer_getreadbuf(PyUnicodeObject *self,
7942                           Py_ssize_t index,
7943                           const void **ptr)
7944 {
7945     if (index != 0) {
7946         PyErr_SetString(PyExc_SystemError,
7947                         "accessing non-existent unicode segment");
7948         return -1;
7949     }
7950     *ptr = (void *) self->str;
7951     return PyUnicode_GET_DATA_SIZE(self);
7952 }
7953 
7954 static Py_ssize_t
unicode_buffer_getwritebuf(PyUnicodeObject * self,Py_ssize_t index,const void ** ptr)7955 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
7956                            const void **ptr)
7957 {
7958     PyErr_SetString(PyExc_TypeError,
7959                     "cannot use unicode as modifiable buffer");
7960     return -1;
7961 }
7962 
7963 static int
unicode_buffer_getsegcount(PyUnicodeObject * self,Py_ssize_t * lenp)7964 unicode_buffer_getsegcount(PyUnicodeObject *self,
7965                            Py_ssize_t *lenp)
7966 {
7967     if (lenp)
7968         *lenp = PyUnicode_GET_DATA_SIZE(self);
7969     return 1;
7970 }
7971 
7972 static Py_ssize_t
unicode_buffer_getcharbuf(PyUnicodeObject * self,Py_ssize_t index,const void ** ptr)7973 unicode_buffer_getcharbuf(PyUnicodeObject *self,
7974                           Py_ssize_t index,
7975                           const void **ptr)
7976 {
7977     PyObject *str;
7978 
7979     if (index != 0) {
7980         PyErr_SetString(PyExc_SystemError,
7981                         "accessing non-existent unicode segment");
7982         return -1;
7983     }
7984     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
7985     if (str == NULL)
7986         return -1;
7987     *ptr = (void *) PyString_AS_STRING(str);
7988     return PyString_GET_SIZE(str);
7989 }
7990 
7991 /* Helpers for PyUnicode_Format() */
7992 
7993 static PyObject *
getnextarg(PyObject * args,Py_ssize_t arglen,Py_ssize_t * p_argidx)7994 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
7995 {
7996     Py_ssize_t argidx = *p_argidx;
7997     if (argidx < arglen) {
7998         (*p_argidx)++;
7999         if (arglen < 0)
8000             return args;
8001         else
8002             return PyTuple_GetItem(args, argidx);
8003     }
8004     PyErr_SetString(PyExc_TypeError,
8005                     "not enough arguments for format string");
8006     return NULL;
8007 }
8008 
/* Bit flags collected while parsing a '%' conversion specifier. */
#define F_LJUST (1 << 0)    /* '-' */
#define F_SIGN  (1 << 1)    /* '+' */
#define F_BLANK (1 << 2)    /* ' ' */
#define F_ALT   (1 << 3)    /* '#' */
#define F_ZERO  (1 << 4)    /* '0' */
8014 
8015 static Py_ssize_t
strtounicode(Py_UNICODE * buffer,const char * charbuffer)8016 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8017 {
8018     register Py_ssize_t i;
8019     Py_ssize_t len = strlen(charbuffer);
8020     for (i = len - 1; i >= 0; i--)
8021         buffer[i] = (Py_UNICODE) charbuffer[i];
8022 
8023     return len;
8024 }
8025 
8026 static int
longtounicode(Py_UNICODE * buffer,size_t len,const char * format,long x)8027 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8028 {
8029     Py_ssize_t result;
8030 
8031     PyOS_snprintf((char *)buffer, len, format, x);
8032     result = strtounicode(buffer, (char *)buffer);
8033     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8034 }
8035 
8036 /* XXX To save some code duplication, formatfloat/long/int could have been
8037    shared with stringobject.c, converting from 8-bit to Unicode after the
8038    formatting is done. */
8039 
8040 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
8041 
8042 static PyObject *
formatfloat(PyObject * v,int flags,int prec,int type)8043 formatfloat(PyObject *v, int flags, int prec, int type)
8044 {
8045     char *p;
8046     PyObject *result;
8047     double x;
8048 
8049     x = PyFloat_AsDouble(v);
8050     if (x == -1.0 && PyErr_Occurred())
8051         return NULL;
8052 
8053     if (prec < 0)
8054         prec = 6;
8055 
8056     p = PyOS_double_to_string(x, type, prec,
8057                               (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8058     if (p == NULL)
8059         return NULL;
8060     result = PyUnicode_FromStringAndSize(p, strlen(p));
8061     PyMem_Free(p);
8062     return result;
8063 }
8064 
8065 static PyObject*
formatlong(PyObject * val,int flags,int prec,int type)8066 formatlong(PyObject *val, int flags, int prec, int type)
8067 {
8068     char *buf;
8069     int i, len;
8070     PyObject *str; /* temporary string object. */
8071     PyUnicodeObject *result;
8072 
8073     str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8074     if (!str)
8075         return NULL;
8076     result = _PyUnicode_New(len);
8077     if (!result) {
8078         Py_DECREF(str);
8079         return NULL;
8080     }
8081     for (i = 0; i < len; i++)
8082         result->str[i] = buf[i];
8083     result->str[len] = 0;
8084     Py_DECREF(str);
8085     return (PyObject*)result;
8086 }
8087 
8088 static int
formatint(Py_UNICODE * buf,size_t buflen,int flags,int prec,int type,PyObject * v)8089 formatint(Py_UNICODE *buf,
8090           size_t buflen,
8091           int flags,
8092           int prec,
8093           int type,
8094           PyObject *v)
8095 {
8096     /* fmt = '%#.' + `prec` + 'l' + `type`
8097      * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8098      *                     + 1 + 1
8099      *                   = 24
8100      */
8101     char fmt[64]; /* plenty big enough! */
8102     char *sign;
8103     long x;
8104 
8105     x = PyInt_AsLong(v);
8106     if (x == -1 && PyErr_Occurred())
8107         return -1;
8108     if (x < 0 && type == 'u') {
8109         type = 'd';
8110     }
8111     if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8112         sign = "-";
8113     else
8114         sign = "";
8115     if (prec < 0)
8116         prec = 1;
8117 
8118     /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8119      * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8120      */
8121     if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8122         PyErr_SetString(PyExc_OverflowError,
8123                         "formatted integer is too long (precision too large?)");
8124         return -1;
8125     }
8126 
8127     if ((flags & F_ALT) &&
8128         (type == 'x' || type == 'X')) {
8129         /* When converting under %#x or %#X, there are a number
8130          * of issues that cause pain:
8131          * - when 0 is being converted, the C standard leaves off
8132          *   the '0x' or '0X', which is inconsistent with other
8133          *   %#x/%#X conversions and inconsistent with Python's
8134          *   hex() function
8135          * - there are platforms that violate the standard and
8136          *   convert 0 with the '0x' or '0X'
8137          *   (Metrowerks, Compaq Tru64)
8138          * - there are platforms that give '0x' when converting
8139          *   under %#X, but convert 0 in accordance with the
8140          *   standard (OS/2 EMX)
8141          *
8142          * We can achieve the desired consistency by inserting our
8143          * own '0x' or '0X' prefix, and substituting %x/%X in place
8144          * of %#x/%#X.
8145          *
8146          * Note that this is the same approach as used in
8147          * formatint() in stringobject.c
8148          */
8149         PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8150                       sign, type, prec, type);
8151     }
8152     else {
8153         PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8154                       sign, (flags&F_ALT) ? "#" : "",
8155                       prec, type);
8156     }
8157     if (sign[0])
8158         return longtounicode(buf, buflen, fmt, -x);
8159     else
8160         return longtounicode(buf, buflen, fmt, x);
8161 }
8162 
8163 static int
formatchar(Py_UNICODE * buf,size_t buflen,PyObject * v)8164 formatchar(Py_UNICODE *buf,
8165            size_t buflen,
8166            PyObject *v)
8167 {
8168     PyObject *unistr;
8169     char *str;
8170     /* presume that the buffer is at least 2 characters long */
8171     if (PyUnicode_Check(v)) {
8172         if (PyUnicode_GET_SIZE(v) != 1)
8173             goto onError;
8174         buf[0] = PyUnicode_AS_UNICODE(v)[0];
8175     }
8176 
8177     else if (PyString_Check(v)) {
8178         if (PyString_GET_SIZE(v) != 1)
8179             goto onError;
8180         /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8181            with a UnicodeDecodeError if 'char' is not decodable with the
8182            default encoding (usually ASCII, but it might be something else) */
8183         str = PyString_AS_STRING(v);
8184         if ((unsigned char)str[0] > 0x7F) {
8185             /* the char is not ASCII; try to decode the string using the
8186                default encoding and return -1 to let the UnicodeDecodeError
8187                be raised if the string can't be decoded */
8188             unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8189             if (unistr == NULL)
8190                 return -1;
8191             buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8192             Py_DECREF(unistr);
8193         }
8194         else
8195             buf[0] = (Py_UNICODE)str[0];
8196     }
8197 
8198     else {
8199         /* Integer input truncated to a character */
8200         long x;
8201         x = PyInt_AsLong(v);
8202         if (x == -1 && PyErr_Occurred())
8203             goto onError;
8204 #ifdef Py_UNICODE_WIDE
8205         if (x < 0 || x > 0x10ffff) {
8206             PyErr_SetString(PyExc_OverflowError,
8207                             "%c arg not in range(0x110000) "
8208                             "(wide Python build)");
8209             return -1;
8210         }
8211 #else
8212         if (x < 0 || x > 0xffff) {
8213             PyErr_SetString(PyExc_OverflowError,
8214                             "%c arg not in range(0x10000) "
8215                             "(narrow Python build)");
8216             return -1;
8217         }
8218 #endif
8219         buf[0] = (Py_UNICODE) x;
8220     }
8221     buf[1] = '\0';
8222     return 1;
8223 
8224   onError:
8225     PyErr_SetString(PyExc_TypeError,
8226                     "%c requires int or char");
8227     return -1;
8228 }
8229 
8230 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8231 
8232    FORMATBUFLEN is the length of the buffer in which the ints &
8233    chars are formatted. XXX This is a magic number. Each formatting
8234    routine does bounds checking to ensure no overflow, but a better
8235    solution may be to malloc a buffer of appropriate size for each
8236    format. For now, the current solution is sufficient.
8237 */
/* Parenthesized so the cast expression stays one unit wherever the macro
   is expanded (e.g. as an operand of sizeof or of a postfix operator). */
#define FORMATBUFLEN ((size_t)120)
8239 
/* Implementation of the unicode "format % args" operation.

   format: the format string; it is converted with PyUnicode_FromObject().
   args:   a tuple of values, a single non-tuple value, or a mapping used
           for "%(key)s" style specifiers.

   Returns a new reference to the formatted unicode object, or NULL with an
   exception set.

   The result is built in a unicode object that is grown on demand:
   reslen is its current allocated length and rescnt the number of unused
   characters at its end, so reslen - rescnt characters have been written.

   Width and precision values are range-checked before any arithmetic is
   done on them, so neither literal digits nor '*' arguments can overflow
   the C variables that hold them. */
PyObject *PyUnicode_Format(PyObject *format,
                           PyObject *args)
{
    Py_UNICODE *fmt, *res;
    Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
    int args_owned = 0;
    PyUnicodeObject *result = NULL;
    PyObject *dict = NULL;
    PyObject *uformat;

    if (format == NULL || args == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }
    uformat = PyUnicode_FromObject(format);
    if (uformat == NULL)
        return NULL;
    fmt = PyUnicode_AS_UNICODE(uformat);
    fmtcnt = PyUnicode_GET_SIZE(uformat);

    reslen = rescnt = fmtcnt + 100;
    result = _PyUnicode_New(reslen);
    if (result == NULL)
        goto onError;
    res = PyUnicode_AS_UNICODE(result);

    if (PyTuple_Check(args)) {
        arglen = PyTuple_Size(args);
        argidx = 0;
    }
    else {
        /* Single value: getnextarg() hands it out exactly once. */
        arglen = -1;
        argidx = -2;
    }
    if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
        !PyObject_TypeCheck(args, &PyBaseString_Type))
        dict = args;

    while (--fmtcnt >= 0) {
        if (*fmt != '%') {
            /* Literal character: copy it, growing the result if needed. */
            if (--rescnt < 0) {
                rescnt = fmtcnt + 100;
                reslen += rescnt;
                if (_PyUnicode_Resize(&result, reslen) < 0)
                    goto onError;
                res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
                --rescnt;
            }
            *res++ = *fmt++;
        }
        else {
            /* Got a format specifier */
            int flags = 0;
            Py_ssize_t width = -1;
            int prec = -1;
            Py_UNICODE c = '\0';
            Py_UNICODE fill;
            int isnumok;
            PyObject *v       = NULL;
            PyObject *temp    = NULL;
            Py_UNICODE *pbuf  = NULL;
            Py_UNICODE sign;
            Py_ssize_t len;
            Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */

            fmt++;
            if (*fmt == '(') {
                Py_UNICODE *keystart;
                Py_ssize_t keylen;
                PyObject *key;
                int pcount = 1;

                if (dict == NULL) {
                    PyErr_SetString(PyExc_TypeError,
                                    "format requires a mapping");
                    goto onError;
                }
                ++fmt;
                --fmtcnt;
                keystart = fmt;
                /* Skip over balanced parentheses */
                while (pcount > 0 && --fmtcnt >= 0) {
                    if (*fmt == ')')
                        --pcount;
                    else if (*fmt == '(')
                        ++pcount;
                    fmt++;
                }
                keylen = fmt - keystart - 1;
                if (fmtcnt < 0 || pcount > 0) {
                    PyErr_SetString(PyExc_ValueError,
                                    "incomplete format key");
                    goto onError;
                }
                /* The key is looked up as a unicode object. */
                key = PyUnicode_FromUnicode(keystart, keylen);
                if (key == NULL)
                    goto onError;
                if (args_owned) {
                    Py_DECREF(args);
                    args_owned = 0;
                }
                args = PyObject_GetItem(dict, key);
                Py_DECREF(key);
                if (args == NULL) {
                    goto onError;
                }
                /* From here on args is the looked-up value, owned by us. */
                args_owned = 1;
                arglen = -1;
                argidx = -2;
            }
            /* Conversion flags */
            while (--fmtcnt >= 0) {
                switch (c = *fmt++) {
                case '-': flags |= F_LJUST; continue;
                case '+': flags |= F_SIGN; continue;
                case ' ': flags |= F_BLANK; continue;
                case '#': flags |= F_ALT; continue;
                case '0': flags |= F_ZERO; continue;
                }
                break;
            }
            /* Field width */
            if (c == '*') {
                long lwidth;

                v = getnextarg(args, arglen, &argidx);
                if (v == NULL)
                    goto onError;
                if (!PyInt_Check(v)) {
                    PyErr_SetString(PyExc_TypeError,
                                    "* wants int");
                    goto onError;
                }
                lwidth = PyInt_AsLong(v);
                if (lwidth == -1 && PyErr_Occurred())
                    goto onError;
                /* Reject values whose magnitude does not fit in a
                   Py_ssize_t, so that the negation below cannot
                   overflow. */
                if (lwidth < -PY_SSIZE_T_MAX || lwidth > PY_SSIZE_T_MAX) {
                    PyErr_SetString(PyExc_OverflowError,
                                    "width too big");
                    goto onError;
                }
                width = (Py_ssize_t)lwidth;
                if (width < 0) {
                    flags |= F_LJUST;
                    width = -width;
                }
                if (--fmtcnt >= 0)
                    c = *fmt++;
            }
            else if (c >= '0' && c <= '9') {
                width = c - '0';
                while (--fmtcnt >= 0) {
                    c = *fmt++;
                    if (c < '0' || c > '9')
                        break;
                    /* Check before multiplying: signed overflow is
                       undefined and cannot be detected afterwards. */
                    if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
                        PyErr_SetString(PyExc_ValueError,
                                        "width too big");
                        goto onError;
                    }
                    width = width*10 + (c - '0');
                }
            }
            /* Precision */
            if (c == '.') {
                prec = 0;
                if (--fmtcnt >= 0)
                    c = *fmt++;
                if (c == '*') {
                    long lprec;

                    v = getnextarg(args, arglen, &argidx);
                    if (v == NULL)
                        goto onError;
                    if (!PyInt_Check(v)) {
                        PyErr_SetString(PyExc_TypeError,
                                        "* wants int");
                        goto onError;
                    }
                    lprec = PyInt_AsLong(v);
                    if (lprec == -1 && PyErr_Occurred())
                        goto onError;
                    /* prec is an int; refuse values that would be
                       silently truncated. */
                    if (lprec > INT_MAX) {
                        PyErr_SetString(PyExc_OverflowError,
                                        "prec too big");
                        goto onError;
                    }
                    prec = (lprec < 0) ? 0 : (int)lprec;
                    if (--fmtcnt >= 0)
                        c = *fmt++;
                }
                else if (c >= '0' && c <= '9') {
                    prec = c - '0';
                    while (--fmtcnt >= 0) {
                        c = *fmt++;
                        if (c < '0' || c > '9')
                            break;
                        if (prec > (INT_MAX - ((int)c - '0')) / 10) {
                            PyErr_SetString(PyExc_ValueError,
                                            "prec too big");
                            goto onError;
                        }
                        prec = prec*10 + (c - '0');
                    }
                }
            } /* prec */
            /* Length modifiers are accepted and ignored. */
            if (fmtcnt >= 0) {
                if (c == 'h' || c == 'l' || c == 'L') {
                    if (--fmtcnt >= 0)
                        c = *fmt++;
                }
            }
            if (fmtcnt < 0) {
                PyErr_SetString(PyExc_ValueError,
                                "incomplete format");
                goto onError;
            }
            if (c != '%') {
                v = getnextarg(args, arglen, &argidx);
                if (v == NULL)
                    goto onError;
            }
            sign = 0;
            fill = ' ';
            switch (c) {

            case '%':
                pbuf = formatbuf;
                /* presume that buffer length is at least 1 */
                pbuf[0] = '%';
                len = 1;
                break;

            case 's':
            case 'r':
                if (PyUnicode_CheckExact(v) && c == 's') {
                    temp = v;
                    Py_INCREF(temp);
                }
                else {
                    PyObject *unicode;
                    if (c == 's')
                        temp = PyObject_Unicode(v);
                    else
                        temp = PyObject_Repr(v);
                    if (temp == NULL)
                        goto onError;
                    if (PyUnicode_Check(temp))
                        /* nothing to do */;
                    else if (PyString_Check(temp)) {
                        /* convert to string to Unicode */
                        unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
                                                   PyString_GET_SIZE(temp),
                                                   NULL,
                                                   "strict");
                        Py_DECREF(temp);
                        temp = unicode;
                        if (temp == NULL)
                            goto onError;
                    }
                    else {
                        Py_DECREF(temp);
                        PyErr_SetString(PyExc_TypeError,
                                        "%s argument has non-string str()");
                        goto onError;
                    }
                }
                pbuf = PyUnicode_AS_UNICODE(temp);
                len = PyUnicode_GET_SIZE(temp);
                if (prec >= 0 && len > prec)
                    len = prec;
                break;

            case 'i':
            case 'd':
            case 'u':
            case 'o':
            case 'x':
            case 'X':
                if (c == 'i')
                    c = 'd';
                isnumok = 0;
                if (PyNumber_Check(v)) {
                    PyObject *iobj=NULL;

                    if (PyInt_Check(v) || (PyLong_Check(v))) {
                        iobj = v;
                        Py_INCREF(iobj);
                    }
                    else {
                        iobj = PyNumber_Int(v);
                        if (iobj==NULL) {
                            /* Do not call back into the number protocol
                               with an exception still pending. */
                            PyErr_Clear();
                            iobj = PyNumber_Long(v);
                        }
                    }
                    if (iobj!=NULL) {
                        if (PyInt_Check(iobj)) {
                            isnumok = 1;
                            pbuf = formatbuf;
                            len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
                                            flags, prec, c, iobj);
                            Py_DECREF(iobj);
                            if (len < 0)
                                goto onError;
                            sign = 1;
                        }
                        else if (PyLong_Check(iobj)) {
                            isnumok = 1;
                            temp = formatlong(iobj, flags, prec, c);
                            Py_DECREF(iobj);
                            if (!temp)
                                goto onError;
                            pbuf = PyUnicode_AS_UNICODE(temp);
                            len = PyUnicode_GET_SIZE(temp);
                            sign = 1;
                        }
                        else {
                            Py_DECREF(iobj);
                        }
                    }
                }
                if (!isnumok) {
                    PyErr_Format(PyExc_TypeError,
                                 "%%%c format: a number is required, "
                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
                    goto onError;
                }
                if (flags & F_ZERO)
                    fill = '0';
                break;

            case 'e':
            case 'E':
            case 'f':
            case 'F':
            case 'g':
            case 'G':
                temp = formatfloat(v, flags, prec, c);
                if (temp == NULL)
                    goto onError;
                pbuf = PyUnicode_AS_UNICODE(temp);
                len = PyUnicode_GET_SIZE(temp);
                sign = 1;
                if (flags & F_ZERO)
                    fill = '0';
                break;

            case 'c':
                pbuf = formatbuf;
                len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
                if (len < 0)
                    goto onError;
                break;

            default:
                PyErr_Format(PyExc_ValueError,
                             "unsupported format character '%c' (0x%x) "
                             "at index %zd",
                             (31<=c && c<=126) ? (char)c : '?',
                             (int)c,
                             (Py_ssize_t)(fmt - 1 -
                                          PyUnicode_AS_UNICODE(uformat)));
                goto onError;
            }
            /* For numeric conversions, split a leading sign off the
               digits so it can be placed before any zero padding. */
            if (sign) {
                if (*pbuf == '-' || *pbuf == '+') {
                    sign = *pbuf++;
                    len--;
                }
                else if (flags & F_SIGN)
                    sign = '+';
                else if (flags & F_BLANK)
                    sign = ' ';
                else
                    sign = 0;
            }
            if (width < len)
                width = len;
            if (rescnt - (sign != 0) < width) {
                /* Not enough room left: grow the result.  The new size
                   is checked before it is computed so the addition
                   cannot overflow. */
                Py_ssize_t used = reslen - rescnt;

                if (width > PY_SSIZE_T_MAX - 100 - fmtcnt - used) {
                    Py_XDECREF(temp);
                    PyErr_NoMemory();
                    goto onError;
                }
                rescnt = width + fmtcnt + 100;
                reslen = used + rescnt;
                if (_PyUnicode_Resize(&result, reslen) < 0) {
                    Py_XDECREF(temp);
                    goto onError;
                }
                res = PyUnicode_AS_UNICODE(result)
                    + reslen - rescnt;
            }
            if (sign) {
                /* With zero fill the sign precedes the padding; with
                   space fill it is written after the padding below. */
                if (fill != ' ')
                    *res++ = sign;
                rescnt--;
                if (width > len)
                    width--;
            }
            if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
                assert(pbuf[0] == '0');
                assert(pbuf[1] == c);
                if (fill != ' ') {
                    *res++ = *pbuf++;
                    *res++ = *pbuf++;
                }
                rescnt -= 2;
                width -= 2;
                if (width < 0)
                    width = 0;
                len -= 2;
            }
            /* Left padding (right-justified output) */
            if (width > len && !(flags & F_LJUST)) {
                do {
                    --rescnt;
                    *res++ = fill;
                } while (--width > len);
            }
            if (fill == ' ') {
                if (sign)
                    *res++ = sign;
                if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
                    assert(pbuf[0] == '0');
                    assert(pbuf[1] == c);
                    *res++ = *pbuf++;
                    *res++ = *pbuf++;
                }
            }
            Py_UNICODE_COPY(res, pbuf, len);
            res += len;
            rescnt -= len;
            /* Right padding (left-justified output) */
            while (--width >= len) {
                --rescnt;
                *res++ = ' ';
            }
            if (dict && (argidx < arglen) && c != '%') {
                PyErr_SetString(PyExc_TypeError,
                                "not all arguments converted during string formatting");
                Py_XDECREF(temp);
                goto onError;
            }
            Py_XDECREF(temp);
        } /* '%' */
    } /* until end */
    if (argidx < arglen && !dict) {
        PyErr_SetString(PyExc_TypeError,
                        "not all arguments converted during string formatting");
        goto onError;
    }

    /* Trim the unused tail of the result. */
    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
        goto onError;
    if (args_owned) {
        Py_DECREF(args);
    }
    Py_DECREF(uformat);
    return (PyObject *)result;

  onError:
    Py_XDECREF(result);
    Py_DECREF(uformat);
    if (args_owned) {
        Py_DECREF(args);
    }
    return NULL;
}
8695 
/* Buffer-interface slot table for the unicode type; PyUnicode_Type refers
   to it through its tp_as_buffer slot.  Each handler is cast to the
   function-pointer type of the slot it fills, in the order: read buffer,
   write buffer, segment count, character buffer. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,
    (writebufferproc) unicode_buffer_getwritebuf,
    (segcountproc) unicode_buffer_getsegcount,
    (charbufferproc) unicode_buffer_getcharbuf,
};
8702 
/* Forward declaration: unicode_new() calls unicode_subtype_new() before the
   latter's definition appears in the file. */
static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8705 
8706 static PyObject *
unicode_new(PyTypeObject * type,PyObject * args,PyObject * kwds)8707 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8708 {
8709     PyObject *x = NULL;
8710     static char *kwlist[] = {"string", "encoding", "errors", 0};
8711     char *encoding = NULL;
8712     char *errors = NULL;
8713 
8714     if (type != &PyUnicode_Type)
8715         return unicode_subtype_new(type, args, kwds);
8716     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8717                                      kwlist, &x, &encoding, &errors))
8718         return NULL;
8719     if (x == NULL)
8720         return (PyObject *)_PyUnicode_New(0);
8721     if (encoding == NULL && errors == NULL)
8722         return PyObject_Unicode(x);
8723     else
8724         return PyUnicode_FromEncodedObject(x, encoding, errors);
8725 }
8726 
8727 static PyObject *
unicode_subtype_new(PyTypeObject * type,PyObject * args,PyObject * kwds)8728 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8729 {
8730     PyUnicodeObject *tmp, *pnew;
8731     Py_ssize_t n;
8732 
8733     assert(PyType_IsSubtype(type, &PyUnicode_Type));
8734     tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8735     if (tmp == NULL)
8736         return NULL;
8737     assert(PyUnicode_Check(tmp));
8738     pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8739     if (pnew == NULL) {
8740         Py_DECREF(tmp);
8741         return NULL;
8742     }
8743     pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8744     if (pnew->str == NULL) {
8745         _Py_ForgetReference((PyObject *)pnew);
8746         PyObject_Del(pnew);
8747         Py_DECREF(tmp);
8748         return PyErr_NoMemory();
8749     }
8750     Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8751     pnew->length = n;
8752     pnew->hash = tmp->hash;
8753     Py_DECREF(tmp);
8754     return (PyObject *)pnew;
8755 }
8756 
/* Docstring of the unicode type; PyUnicode_Type installs it as tp_doc. */
PyDoc_STRVAR(unicode_doc,
             "unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8763 
/* Type object for "unicode".

   The type derives from PyBaseString_Type, may itself be subclassed
   (Py_TPFLAGS_BASETYPE) and is flagged as a unicode subclass
   (Py_TPFLAGS_UNICODE_SUBCLASS).  Instances are created through
   unicode_new(), released through unicode_dealloc(), and their memory is
   returned with PyObject_Del.  tp_alloc is left 0 here. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "unicode",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_compare */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash */
    0,                  /* tp_call */
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    &unicode_as_buffer,         /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
    Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    0,                  /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseString_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
8807 
8808 /* Initialize the Unicode implementation */
8809 
_PyUnicode_Init(void)8810 void _PyUnicode_Init(void)
8811 {
8812     int i;
8813 
8814     /* XXX - move this array to unicodectype.c ? */
8815     Py_UNICODE linebreak[] = {
8816         0x000A, /* LINE FEED */
8817         0x000D, /* CARRIAGE RETURN */
8818         0x001C, /* FILE SEPARATOR */
8819         0x001D, /* GROUP SEPARATOR */
8820         0x001E, /* RECORD SEPARATOR */
8821         0x0085, /* NEXT LINE */
8822         0x2028, /* LINE SEPARATOR */
8823         0x2029, /* PARAGRAPH SEPARATOR */
8824     };
8825 
8826     /* Init the implementation */
8827     free_list = NULL;
8828     numfree = 0;
8829     unicode_empty = _PyUnicode_New(0);
8830     if (!unicode_empty)
8831         return;
8832 
8833     strcpy(unicode_default_encoding, "ascii");
8834     for (i = 0; i < 256; i++)
8835         unicode_latin1[i] = NULL;
8836     if (PyType_Ready(&PyUnicode_Type) < 0)
8837         Py_FatalError("Can't initialize 'unicode'");
8838 
8839     /* initialize the linebreak bloom filter */
8840     bloom_linebreak = make_bloom_mask(
8841         linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8842         );
8843 
8844     PyType_Ready(&EncodingMapType);
8845 }
8846 
8847 /* Finalize the Unicode implementation */
8848 
8849 int
PyUnicode_ClearFreeList(void)8850 PyUnicode_ClearFreeList(void)
8851 {
8852     int freelist_size = numfree;
8853     PyUnicodeObject *u;
8854 
8855     for (u = free_list; u != NULL;) {
8856         PyUnicodeObject *v = u;
8857         u = *(PyUnicodeObject **)u;
8858         if (v->str)
8859             PyObject_DEL(v->str);
8860         Py_XDECREF(v->defenc);
8861         PyObject_Del(v);
8862         numfree--;
8863     }
8864     free_list = NULL;
8865     assert(numfree == 0);
8866     return freelist_size;
8867 }
8868 
8869 void
_PyUnicode_Fini(void)8870 _PyUnicode_Fini(void)
8871 {
8872     int i;
8873 
8874     Py_XDECREF(unicode_empty);
8875     unicode_empty = NULL;
8876 
8877     for (i = 0; i < 256; i++) {
8878         if (unicode_latin1[i]) {
8879             Py_DECREF(unicode_latin1[i]);
8880             unicode_latin1[i] = NULL;
8881         }
8882     }
8883     (void)PyUnicode_ClearFreeList();
8884 }
8885 
8886 #ifdef __cplusplus
8887 }
8888 #endif
8889