1 /*
2 
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com>.
5 
6 Major speed upgrades to the method implementations at the Reykjavik
7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8 
9 Copyright (c) Corporation for National Research Initiatives.
10 
11 --------------------------------------------------------------------
12 The original string type implementation is:
13 
14   Copyright (c) 1999 by Secret Labs AB
15   Copyright (c) 1999 by Fredrik Lundh
16 
17 By obtaining, using, and/or copying this software and/or its
18 associated documentation, you agree that you have read, understood,
19 and will comply with the following terms and conditions:
20 
21 Permission to use, copy, modify, and distribute this software and its
22 associated documentation for any purpose and without fee is hereby
23 granted, provided that the above copyright notice appears in all
24 copies, and that both that copyright notice and this permission notice
25 appear in supporting documentation, and that the name of Secret Labs
26 AB or the author not be used in advertising or publicity pertaining to
27 distribution of the software without specific, written prior
28 permission.
29 
30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37 --------------------------------------------------------------------
38 
39 */
40 
41 #define PY_SSIZE_T_CLEAN
42 #include "Python.h"
43 #include "internal/pystate.h"
44 #include "ucnhash.h"
45 #include "bytes_methods.h"
46 #include "stringlib/eq.h"
47 
48 #ifdef MS_WINDOWS
49 #include <windows.h>
50 #endif
51 
52 /*[clinic input]
53 class str "PyObject *" "&PyUnicode_Type"
54 [clinic start generated code]*/
55 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
56 
57 /*[python input]
58 class Py_UCS4_converter(CConverter):
59     type = 'Py_UCS4'
60     converter = 'convert_uc'
61 
62     def converter_init(self):
63         if self.default is not unspecified:
64             self.c_default = ascii(self.default)
65             if len(self.c_default) > 4 or self.c_default[0] != "'":
66                 self.c_default = hex(ord(self.default))
67 
68 [python start generated code]*/
69 /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
70 
71 /* --- Globals ------------------------------------------------------------
72 
73 NOTE: In the interpreter's initialization phase, some globals are currently
74       initialized dynamically as needed. In the process Unicode objects may
75       be created before the Unicode type is ready.
76 
77 */
78 
79 
80 #ifdef __cplusplus
81 extern "C" {
82 #endif
83 
84 /* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
85 #define MAX_UNICODE 0x10ffff
86 
/* Validity check used by the checked accessor macros in this file: the
   consistency checker (without content verification) in debug builds, a
   plain type check otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Unchecked access to the utf8 field (op is viewed as a
   PyCompactUnicodeObject). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked access to the UTF-8 data of a ready string: for a compact ASCII
   string this is the memory directly following the PyASCIIObject header,
   otherwise the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Unchecked access to the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length of a ready string: the length field for a compact
   ASCII string, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Unchecked access to the wstr, wstr_length, length, state and hash
   fields. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Access to state.kind and length that first asserts _PyUnicode_CHECK(op);
   unlike the PyUnicode_UTF8* macros above, readiness is not asserted. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Unchecked access to the data.any pointer (op is viewed as a
   PyUnicodeObject). */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* File-local replacement for PyUnicode_READY: evaluates to 0 when the string
   is already ready, otherwise to the result of _PyUnicode_Ready(op).  The
   _PyUnicode_CHECK assertion runs first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
134 
/* True if the UTF-8 pointer of `op` is the very same pointer as its character
   data, i.e. the object holds no separate UTF-8 buffer.  Must not be used on
   compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is the very same pointer as its character
   data.  The expansion refers only to the macro parameter, so the result does
   not depend on what the caller's variable happens to be called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
142 
/* True if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, its utf8
   pointer is non-NULL, and that pointer differs from the character data
   pointer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is non-NULL and either the
   string is not ready yet or wstr differs from the character data pointer.
   PyUnicode_DATA is only evaluated for ready strings. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
156 
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Every source element is converted with a plain cast to to_type; no range
   check is performed, so the caller must ensure the values fit.  The first
   loop converts four elements per iteration, the second loop converts
   whatever remains.  begin, end and to are each evaluated exactly once, into
   local pointers.  (_Py_SIZE_ROUND_DOWN is defined elsewhere; it is used here
   to round the element count down to a multiple of four.) */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
180 
/* Platform-dependent over-allocation setting.  NOTE(review): the users of
   this constant are outside this excerpt; judging by the percentages quoted
   below, the extra space is presumably size / OVERALLOCATE_FACTOR -- confirm
   at the call sites. */
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor
      (every non-Windows platform takes this branch) */
#  define OVERALLOCATE_FACTOR 4
#endif
188 
189 /* This dictionary holds all interned unicode strings.  Note that references
190    to strings in this dictionary are *not* counted in the string's ob_refcnt.
191    When the interned string reaches a refcnt of 0 the string deallocation
192    function will delete the reference from this dictionary.
193 
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
196 */
197 static PyObject *interned = NULL;
198 
199 /* The empty Unicode object is shared to improve performance. */
200 static PyObject *unicode_empty = NULL;
201 
/* Take a new reference to the shared empty string, creating it on first use.
   When the singleton already exists it is simply INCREF'ed.  Otherwise it is
   created with PyUnicode_New(0, 0) and INCREF'ed once more, so that the
   global pointer and the caller each hold a reference.  If the creation
   fails, unicode_empty is left NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function.  This expands to a `return` statement; the returned value is
   NULL when the empty string could not be created. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
220 
/* Store `value` into `length` consecutive characters of the character buffer
   `data`, beginning at index `start`.

   kind   -- PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
             PyUnicode_4BYTE_KIND; selects the element type of `data`
             (PyUnicode_WCHAR_KIND is rejected by an assertion).
   data   -- pointer to the first character of the buffer.
   value  -- character to store; asserted to fit into the selected kind.
   start  -- non-negative index of the first character to overwrite.
   length -- number of characters to overwrite.

   Every argument is parenthesized in the expansion and the temporaries carry
   a `_fill_` prefix, so arbitrary expressions as well as caller variables
   named `ch`, `to` or `end` can be passed safely. */
#define FILL(kind, data, value, start, length) \
    do { \
        assert(0 <= (start)); \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch (kind) { \
        case PyUnicode_1BYTE_KIND: { \
            assert((value) <= 0xff); \
            Py_UCS1 _fill_ch = (unsigned char)(value); \
            Py_UCS1 *_fill_to = (Py_UCS1 *)(data) + (start); \
            memset(_fill_to, _fill_ch, (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            assert((value) <= 0xffff); \
            Py_UCS2 _fill_ch = (Py_UCS2)(value); \
            Py_UCS2 *_fill_to = (Py_UCS2 *)(data) + (start); \
            const Py_UCS2 *_fill_end = _fill_to + (length); \
            for (; _fill_to < _fill_end; ++_fill_to) *_fill_to = _fill_ch; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            assert((value) <= MAX_UNICODE); \
            Py_UCS4 _fill_ch = (value); \
            Py_UCS4 *_fill_to = (Py_UCS4 *)(data) + (start); \
            const Py_UCS4 *_fill_end = _fill_to + (length); \
            for (; _fill_to < _fill_end; ++_fill_to) *_fill_to = _fill_ch; \
            break; \
        } \
        default: Py_UNREACHABLE(); \
        } \
    } while (0)
252 
253 
254 /* Forward declaration */
255 static inline int
256 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
257 
258 /* List of static strings. */
259 static _Py_Identifier *static_strings = NULL;
260 
261 /* Single character Unicode strings in the Latin-1 range are being
262    shared as well. */
263 static PyObject *unicode_latin1[256] = {NULL};
264 
/* Fast detection of the most frequent whitespace characters.  The table has
   one entry per ASCII code point (0..127); an entry is 1 for the characters
   listed below and 0 for every other code point. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1      /* SPACE */
};
295 
296 /* forward */
297 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
298 static PyObject* get_latin1_char(unsigned char ch);
299 static int unicode_modifiable(PyObject *unicode);
300 
301 
302 static PyObject *
303 _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
304 static PyObject *
305 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
306 static PyObject *
307 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
308 
309 static PyObject *
310 unicode_encode_call_errorhandler(const char *errors,
311        PyObject **errorHandler,const char *encoding, const char *reason,
312        PyObject *unicode, PyObject **exceptionObject,
313        Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
314 
315 static void
316 raise_encode_exception(PyObject **exceptionObject,
317                        const char *encoding,
318                        PyObject *unicode,
319                        Py_ssize_t startpos, Py_ssize_t endpos,
320                        const char *reason);
321 
/* Fast detection of line break characters in the ASCII range.  The table has
   one entry per ASCII code point (0..127); an entry is 1 for the characters
   listed below and 0 for every other code point. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1      /* RECORD SEPARATOR */
};
349 
350 static int convert_uc(PyObject *obj, void *addr);
351 
352 #include "clinic/unicodeobject.c.h"
353 
/* Codec error handlers identified by name. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,            /* not one of the values returned below */
    _Py_ERROR_STRICT,               /* "strict", or no name given */
    _Py_ERROR_SURROGATEESCAPE,      /* "surrogateescape" */
    _Py_ERROR_REPLACE,              /* "replace" */
    _Py_ERROR_IGNORE,               /* "ignore" */
    _Py_ERROR_BACKSLASHREPLACE,     /* "backslashreplace" */
    _Py_ERROR_SURROGATEPASS,        /* "surrogatepass" */
    _Py_ERROR_XMLCHARREFREPLACE,    /* "xmlcharrefreplace" */
    _Py_ERROR_OTHER                 /* any other name */
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler value.

   errors -- NUL-terminated handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL, the matching enumerator for one of the
   seven names compared below (exact, case-sensitive comparison) and
   _Py_ERROR_OTHER for everything else. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t idx;

    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (idx = 0; idx < sizeof(known) / sizeof(known[0]); idx++) {
        if (strcmp(errors, known[idx].name) == 0) {
            return known[idx].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
392 
393 /* The max unicode value is always 0x10FFFF while using the PEP-393 API.
394    This function is kept for backward compatibility with the old API. */
395 Py_UNICODE
PyUnicode_GetMax(void)396 PyUnicode_GetMax(void)
397 {
398 #ifdef Py_UNICODE_WIDE
399     return 0x10FFFF;
400 #else
401     /* This is actually an illegal character, so it should
402        not be passed to unichr. */
403     return 0xFFFF;
404 #endif
405 }
406 
#ifdef Py_DEBUG
/* Debug-build sanity check of a Unicode object's internal fields.

   op            -- object to check; must be a Unicode object (asserted).
   check_content -- if non-zero (and the string is not of the wchar kind),
                    additionally scan the characters to verify that the
                    narrowest possible kind is used and that the data is
                    followed by a null character.

   Every violated invariant trips an assert().  The function always returns
   1, which allows it to be used inside assert() itself. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Compact ASCII: must be a ready 1-byte string; nothing else to check
       on the header. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* Compact non-ASCII: the characters start right after the
               PyCompactUnicodeObject header and never double as the UTF-8
               buffer. */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* Not ready yet: only the wstr buffer exists; every other
                   field must still hold its initial value. */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* Ready, non-compact: the data lives in a separate buffer;
                   an ASCII string shares that buffer as its UTF-8
                   representation, any other string must not. */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        /* For ready strings, wstr must alias the character data exactly when
           the kind has the same width as wchar_t, and must not alias it
           otherwise. */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* A missing cached representation must have a zero length. */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        /* Find the largest character stored in the string. */
        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* The largest character must fall into the range that requires
           exactly this kind (and, for 1-byte strings, this ascii flag). */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* The character data must be null-terminated. */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
522 
523 static PyObject*
unicode_result_wchar(PyObject * unicode)524 unicode_result_wchar(PyObject *unicode)
525 {
526 #ifndef Py_DEBUG
527     Py_ssize_t len;
528 
529     len = _PyUnicode_WSTR_LENGTH(unicode);
530     if (len == 0) {
531         Py_DECREF(unicode);
532         _Py_RETURN_UNICODE_EMPTY();
533     }
534 
535     if (len == 1) {
536         wchar_t ch = _PyUnicode_WSTR(unicode)[0];
537         if ((Py_UCS4)ch < 256) {
538             PyObject *latin1_char = get_latin1_char((unsigned char)ch);
539             Py_DECREF(unicode);
540             return latin1_char;
541         }
542     }
543 
544     if (_PyUnicode_Ready(unicode) < 0) {
545         Py_DECREF(unicode);
546         return NULL;
547     }
548 #else
549     assert(Py_REFCNT(unicode) == 1);
550 
551     /* don't make the result ready in debug mode to ensure that the caller
552        makes the string ready before using it */
553     assert(_PyUnicode_CheckConsistency(unicode, 1));
554 #endif
555     return unicode;
556 }
557 
558 static PyObject*
unicode_result_ready(PyObject * unicode)559 unicode_result_ready(PyObject *unicode)
560 {
561     Py_ssize_t length;
562 
563     length = PyUnicode_GET_LENGTH(unicode);
564     if (length == 0) {
565         if (unicode != unicode_empty) {
566             Py_DECREF(unicode);
567             _Py_RETURN_UNICODE_EMPTY();
568         }
569         return unicode_empty;
570     }
571 
572     if (length == 1) {
573         void *data = PyUnicode_DATA(unicode);
574         int kind = PyUnicode_KIND(unicode);
575         Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
576         if (ch < 256) {
577             PyObject *latin1_char = unicode_latin1[ch];
578             if (latin1_char != NULL) {
579                 if (unicode != latin1_char) {
580                     Py_INCREF(latin1_char);
581                     Py_DECREF(unicode);
582                 }
583                 return latin1_char;
584             }
585             else {
586                 assert(_PyUnicode_CheckConsistency(unicode, 1));
587                 Py_INCREF(unicode);
588                 unicode_latin1[ch] = unicode;
589                 return unicode;
590             }
591         }
592     }
593 
594     assert(_PyUnicode_CheckConsistency(unicode, 1));
595     return unicode;
596 }
597 
598 static PyObject*
unicode_result(PyObject * unicode)599 unicode_result(PyObject *unicode)
600 {
601     assert(_PyUnicode_CHECK(unicode));
602     if (PyUnicode_IS_READY(unicode))
603         return unicode_result_ready(unicode);
604     else
605         return unicode_result_wchar(unicode);
606 }
607 
608 static PyObject*
unicode_result_unchanged(PyObject * unicode)609 unicode_result_unchanged(PyObject *unicode)
610 {
611     if (PyUnicode_CheckExact(unicode)) {
612         if (PyUnicode_READY(unicode) == -1)
613             return NULL;
614         Py_INCREF(unicode);
615         return unicode;
616     }
617     else
618         /* Subtype -- return genuine unicode string with the same value. */
619         return _PyUnicode_Copy(unicode);
620 }
621 
622 /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
623    ASCII, Latin1, UTF-8, etc. */
624 static char*
backslashreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)625 backslashreplace(_PyBytesWriter *writer, char *str,
626                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
627 {
628     Py_ssize_t size, i;
629     Py_UCS4 ch;
630     enum PyUnicode_Kind kind;
631     void *data;
632 
633     assert(PyUnicode_IS_READY(unicode));
634     kind = PyUnicode_KIND(unicode);
635     data = PyUnicode_DATA(unicode);
636 
637     size = 0;
638     /* determine replacement size */
639     for (i = collstart; i < collend; ++i) {
640         Py_ssize_t incr;
641 
642         ch = PyUnicode_READ(kind, data, i);
643         if (ch < 0x100)
644             incr = 2+2;
645         else if (ch < 0x10000)
646             incr = 2+4;
647         else {
648             assert(ch <= MAX_UNICODE);
649             incr = 2+8;
650         }
651         if (size > PY_SSIZE_T_MAX - incr) {
652             PyErr_SetString(PyExc_OverflowError,
653                             "encoded result is too long for a Python string");
654             return NULL;
655         }
656         size += incr;
657     }
658 
659     str = _PyBytesWriter_Prepare(writer, str, size);
660     if (str == NULL)
661         return NULL;
662 
663     /* generate replacement */
664     for (i = collstart; i < collend; ++i) {
665         ch = PyUnicode_READ(kind, data, i);
666         *str++ = '\\';
667         if (ch >= 0x00010000) {
668             *str++ = 'U';
669             *str++ = Py_hexdigits[(ch>>28)&0xf];
670             *str++ = Py_hexdigits[(ch>>24)&0xf];
671             *str++ = Py_hexdigits[(ch>>20)&0xf];
672             *str++ = Py_hexdigits[(ch>>16)&0xf];
673             *str++ = Py_hexdigits[(ch>>12)&0xf];
674             *str++ = Py_hexdigits[(ch>>8)&0xf];
675         }
676         else if (ch >= 0x100) {
677             *str++ = 'u';
678             *str++ = Py_hexdigits[(ch>>12)&0xf];
679             *str++ = Py_hexdigits[(ch>>8)&0xf];
680         }
681         else
682             *str++ = 'x';
683         *str++ = Py_hexdigits[(ch>>4)&0xf];
684         *str++ = Py_hexdigits[ch&0xf];
685     }
686     return str;
687 }
688 
689 /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
690    ASCII, Latin1, UTF-8, etc. */
691 static char*
xmlcharrefreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)692 xmlcharrefreplace(_PyBytesWriter *writer, char *str,
693                   PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
694 {
695     Py_ssize_t size, i;
696     Py_UCS4 ch;
697     enum PyUnicode_Kind kind;
698     void *data;
699 
700     assert(PyUnicode_IS_READY(unicode));
701     kind = PyUnicode_KIND(unicode);
702     data = PyUnicode_DATA(unicode);
703 
704     size = 0;
705     /* determine replacement size */
706     for (i = collstart; i < collend; ++i) {
707         Py_ssize_t incr;
708 
709         ch = PyUnicode_READ(kind, data, i);
710         if (ch < 10)
711             incr = 2+1+1;
712         else if (ch < 100)
713             incr = 2+2+1;
714         else if (ch < 1000)
715             incr = 2+3+1;
716         else if (ch < 10000)
717             incr = 2+4+1;
718         else if (ch < 100000)
719             incr = 2+5+1;
720         else if (ch < 1000000)
721             incr = 2+6+1;
722         else {
723             assert(ch <= MAX_UNICODE);
724             incr = 2+7+1;
725         }
726         if (size > PY_SSIZE_T_MAX - incr) {
727             PyErr_SetString(PyExc_OverflowError,
728                             "encoded result is too long for a Python string");
729             return NULL;
730         }
731         size += incr;
732     }
733 
734     str = _PyBytesWriter_Prepare(writer, str, size);
735     if (str == NULL)
736         return NULL;
737 
738     /* generate replacement */
739     for (i = collstart; i < collend; ++i) {
740         str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
741     }
742     return str;
743 }
744 
745 /* --- Bloom Filters ----------------------------------------------------- */
746 
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the lowest
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
750 
751 /* the linebreak mask is set up by Unicode_Init below */
752 
/* Number of bits used by the bloom filter: the largest of 128, 64 and 32
   that does not exceed LONG_BIT.  Compilation is refused when LONG_BIT is
   below 32. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif
762 
/* Integer type holding a bloom filter bitmask. */
#define BLOOM_MASK unsigned long

/* Filter for line break characters; starts with every bit set, so that every
   character passes the filter until a real mask is stored. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by `ch` (its value modulo BLOOM_WIDTH) is set
   in `mask`.  A zero result means `ch` is definitely not in the set the mask
   was built from; a non-zero result means it may be.  Both arguments are
   parenthesized, so `mask` may be an arbitrary expression. */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: a table lookup for characters below 128, otherwise the
   bloom filter as a quick rejection followed by Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
772 
773 static inline BLOOM_MASK
make_bloom_mask(int kind,void * ptr,Py_ssize_t len)774 make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
775 {
776 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
777     do {                                               \
778         TYPE *data = (TYPE *)PTR;                      \
779         TYPE *end = data + LEN;                        \
780         Py_UCS4 ch;                                    \
781         for (; data != end; data++) {                  \
782             ch = *data;                                \
783             MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
784         }                                              \
785         break;                                         \
786     } while (0)
787 
788     /* calculate simple bloom-style bitmask for a given unicode string */
789 
790     BLOOM_MASK mask;
791 
792     mask = 0;
793     switch (kind) {
794     case PyUnicode_1BYTE_KIND:
795         BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
796         break;
797     case PyUnicode_2BYTE_KIND:
798         BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
799         break;
800     case PyUnicode_4BYTE_KIND:
801         BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
802         break;
803     default:
804         Py_UNREACHABLE();
805     }
806     return mask;
807 
808 #undef BLOOM_UPDATE
809 }
810 
811 static int
ensure_unicode(PyObject * obj)812 ensure_unicode(PyObject *obj)
813 {
814     if (!PyUnicode_Check(obj)) {
815         PyErr_Format(PyExc_TypeError,
816                      "must be str, not %.100s",
817                      Py_TYPE(obj)->tp_name);
818         return -1;
819     }
820     return PyUnicode_READY(obj);
821 }
822 
823 /* Compilation of templated routines */
824 
825 #include "stringlib/asciilib.h"
826 #include "stringlib/fastsearch.h"
827 #include "stringlib/partition.h"
828 #include "stringlib/split.h"
829 #include "stringlib/count.h"
830 #include "stringlib/find.h"
831 #include "stringlib/find_max_char.h"
832 #include "stringlib/undef.h"
833 
834 #include "stringlib/ucs1lib.h"
835 #include "stringlib/fastsearch.h"
836 #include "stringlib/partition.h"
837 #include "stringlib/split.h"
838 #include "stringlib/count.h"
839 #include "stringlib/find.h"
840 #include "stringlib/replace.h"
841 #include "stringlib/find_max_char.h"
842 #include "stringlib/undef.h"
843 
844 #include "stringlib/ucs2lib.h"
845 #include "stringlib/fastsearch.h"
846 #include "stringlib/partition.h"
847 #include "stringlib/split.h"
848 #include "stringlib/count.h"
849 #include "stringlib/find.h"
850 #include "stringlib/replace.h"
851 #include "stringlib/find_max_char.h"
852 #include "stringlib/undef.h"
853 
854 #include "stringlib/ucs4lib.h"
855 #include "stringlib/fastsearch.h"
856 #include "stringlib/partition.h"
857 #include "stringlib/split.h"
858 #include "stringlib/count.h"
859 #include "stringlib/find.h"
860 #include "stringlib/replace.h"
861 #include "stringlib/find_max_char.h"
862 #include "stringlib/undef.h"
863 
864 #include "stringlib/unicodedefs.h"
865 #include "stringlib/fastsearch.h"
866 #include "stringlib/count.h"
867 #include "stringlib/find.h"
868 #include "stringlib/undef.h"
869 
870 /* --- Unicode Object ----------------------------------------------------- */
871 
872 static inline Py_ssize_t
findchar(const void * s,int kind,Py_ssize_t size,Py_UCS4 ch,int direction)873 findchar(const void *s, int kind,
874          Py_ssize_t size, Py_UCS4 ch,
875          int direction)
876 {
877     switch (kind) {
878     case PyUnicode_1BYTE_KIND:
879         if ((Py_UCS1) ch != ch)
880             return -1;
881         if (direction > 0)
882             return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
883         else
884             return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
885     case PyUnicode_2BYTE_KIND:
886         if ((Py_UCS2) ch != ch)
887             return -1;
888         if (direction > 0)
889             return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
890         else
891             return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
892     case PyUnicode_4BYTE_KIND:
893         if (direction > 0)
894             return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
895         else
896             return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
897     default:
898         Py_UNREACHABLE();
899     }
900 }
901 
902 #ifdef Py_DEBUG
903 /* Fill the data of a Unicode string with invalid characters to detect bugs
904    earlier.
905 
906    _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
907    ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
908    invalid character in Unicode 6.0. */
909 static void
unicode_fill_invalid(PyObject * unicode,Py_ssize_t old_length)910 unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
911 {
912     int kind = PyUnicode_KIND(unicode);
913     Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
914     Py_ssize_t length = _PyUnicode_LENGTH(unicode);
915     if (length <= old_length)
916         return;
917     memset(data + old_length * kind, 0xff, (length - old_length) * kind);
918 }
919 #endif
920 
921 static PyObject*
resize_compact(PyObject * unicode,Py_ssize_t length)922 resize_compact(PyObject *unicode, Py_ssize_t length)
923 {
924     Py_ssize_t char_size;
925     Py_ssize_t struct_size;
926     Py_ssize_t new_size;
927     int share_wstr;
928     PyObject *new_unicode;
929 #ifdef Py_DEBUG
930     Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
931 #endif
932 
933     assert(unicode_modifiable(unicode));
934     assert(PyUnicode_IS_READY(unicode));
935     assert(PyUnicode_IS_COMPACT(unicode));
936 
937     char_size = PyUnicode_KIND(unicode);
938     if (PyUnicode_IS_ASCII(unicode))
939         struct_size = sizeof(PyASCIIObject);
940     else
941         struct_size = sizeof(PyCompactUnicodeObject);
942     share_wstr = _PyUnicode_SHARE_WSTR(unicode);
943 
944     if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
945         PyErr_NoMemory();
946         return NULL;
947     }
948     new_size = (struct_size + (length + 1) * char_size);
949 
950     if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
951         PyObject_DEL(_PyUnicode_UTF8(unicode));
952         _PyUnicode_UTF8(unicode) = NULL;
953         _PyUnicode_UTF8_LENGTH(unicode) = 0;
954     }
955     _Py_DEC_REFTOTAL;
956     _Py_ForgetReference(unicode);
957 
958     new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
959     if (new_unicode == NULL) {
960         _Py_NewReference(unicode);
961         PyErr_NoMemory();
962         return NULL;
963     }
964     unicode = new_unicode;
965     _Py_NewReference(unicode);
966 
967     _PyUnicode_LENGTH(unicode) = length;
968     if (share_wstr) {
969         _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
970         if (!PyUnicode_IS_ASCII(unicode))
971             _PyUnicode_WSTR_LENGTH(unicode) = length;
972     }
973     else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
974         PyObject_DEL(_PyUnicode_WSTR(unicode));
975         _PyUnicode_WSTR(unicode) = NULL;
976         if (!PyUnicode_IS_ASCII(unicode))
977             _PyUnicode_WSTR_LENGTH(unicode) = 0;
978     }
979 #ifdef Py_DEBUG
980     unicode_fill_invalid(unicode, old_length);
981 #endif
982     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
983                     length, 0);
984     assert(_PyUnicode_CheckConsistency(unicode, 0));
985     return unicode;
986 }
987 
988 static int
resize_inplace(PyObject * unicode,Py_ssize_t length)989 resize_inplace(PyObject *unicode, Py_ssize_t length)
990 {
991     wchar_t *wstr;
992     Py_ssize_t new_size;
993     assert(!PyUnicode_IS_COMPACT(unicode));
994     assert(Py_REFCNT(unicode) == 1);
995 
996     if (PyUnicode_IS_READY(unicode)) {
997         Py_ssize_t char_size;
998         int share_wstr, share_utf8;
999         void *data;
1000 #ifdef Py_DEBUG
1001         Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1002 #endif
1003 
1004         data = _PyUnicode_DATA_ANY(unicode);
1005         char_size = PyUnicode_KIND(unicode);
1006         share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1007         share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1008 
1009         if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1010             PyErr_NoMemory();
1011             return -1;
1012         }
1013         new_size = (length + 1) * char_size;
1014 
1015         if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1016         {
1017             PyObject_DEL(_PyUnicode_UTF8(unicode));
1018             _PyUnicode_UTF8(unicode) = NULL;
1019             _PyUnicode_UTF8_LENGTH(unicode) = 0;
1020         }
1021 
1022         data = (PyObject *)PyObject_REALLOC(data, new_size);
1023         if (data == NULL) {
1024             PyErr_NoMemory();
1025             return -1;
1026         }
1027         _PyUnicode_DATA_ANY(unicode) = data;
1028         if (share_wstr) {
1029             _PyUnicode_WSTR(unicode) = data;
1030             _PyUnicode_WSTR_LENGTH(unicode) = length;
1031         }
1032         if (share_utf8) {
1033             _PyUnicode_UTF8(unicode) = data;
1034             _PyUnicode_UTF8_LENGTH(unicode) = length;
1035         }
1036         _PyUnicode_LENGTH(unicode) = length;
1037         PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1038 #ifdef Py_DEBUG
1039         unicode_fill_invalid(unicode, old_length);
1040 #endif
1041         if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
1042             assert(_PyUnicode_CheckConsistency(unicode, 0));
1043             return 0;
1044         }
1045     }
1046     assert(_PyUnicode_WSTR(unicode) != NULL);
1047 
1048     /* check for integer overflow */
1049     if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1050         PyErr_NoMemory();
1051         return -1;
1052     }
1053     new_size = sizeof(wchar_t) * (length + 1);
1054     wstr =  _PyUnicode_WSTR(unicode);
1055     wstr = PyObject_REALLOC(wstr, new_size);
1056     if (!wstr) {
1057         PyErr_NoMemory();
1058         return -1;
1059     }
1060     _PyUnicode_WSTR(unicode) = wstr;
1061     _PyUnicode_WSTR(unicode)[length] = 0;
1062     _PyUnicode_WSTR_LENGTH(unicode) = length;
1063     assert(_PyUnicode_CheckConsistency(unicode, 0));
1064     return 0;
1065 }
1066 
1067 static PyObject*
resize_copy(PyObject * unicode,Py_ssize_t length)1068 resize_copy(PyObject *unicode, Py_ssize_t length)
1069 {
1070     Py_ssize_t copy_length;
1071     if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1072         PyObject *copy;
1073 
1074         assert(PyUnicode_IS_READY(unicode));
1075 
1076         copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1077         if (copy == NULL)
1078             return NULL;
1079 
1080         copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1081         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1082         return copy;
1083     }
1084     else {
1085         PyObject *w;
1086 
1087         w = (PyObject*)_PyUnicode_New(length);
1088         if (w == NULL)
1089             return NULL;
1090         copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1091         copy_length = Py_MIN(copy_length, length);
1092         memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1093                   copy_length * sizeof(wchar_t));
1094         return w;
1095     }
1096 }
1097 
1098 /* We allocate one more byte to make sure the string is
1099    Ux0000 terminated; some code (e.g. new_identifier)
1100    relies on that.
1101 
1102    XXX This allocator could further be enhanced by assuring that the
1103    free list never reduces its size below 1.
1104 
1105 */
1106 
1107 static PyUnicodeObject *
_PyUnicode_New(Py_ssize_t length)1108 _PyUnicode_New(Py_ssize_t length)
1109 {
1110     PyUnicodeObject *unicode;
1111     size_t new_size;
1112 
1113     /* Optimization for empty strings */
1114     if (length == 0 && unicode_empty != NULL) {
1115         Py_INCREF(unicode_empty);
1116         return (PyUnicodeObject*)unicode_empty;
1117     }
1118 
1119     /* Ensure we won't overflow the size. */
1120     if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
1121         return (PyUnicodeObject *)PyErr_NoMemory();
1122     }
1123     if (length < 0) {
1124         PyErr_SetString(PyExc_SystemError,
1125                         "Negative size passed to _PyUnicode_New");
1126         return NULL;
1127     }
1128 
1129     unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1130     if (unicode == NULL)
1131         return NULL;
1132     new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
1133 
1134     _PyUnicode_WSTR_LENGTH(unicode) = length;
1135     _PyUnicode_HASH(unicode) = -1;
1136     _PyUnicode_STATE(unicode).interned = 0;
1137     _PyUnicode_STATE(unicode).kind = 0;
1138     _PyUnicode_STATE(unicode).compact = 0;
1139     _PyUnicode_STATE(unicode).ready = 0;
1140     _PyUnicode_STATE(unicode).ascii = 0;
1141     _PyUnicode_DATA_ANY(unicode) = NULL;
1142     _PyUnicode_LENGTH(unicode) = 0;
1143     _PyUnicode_UTF8(unicode) = NULL;
1144     _PyUnicode_UTF8_LENGTH(unicode) = 0;
1145 
1146     _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1147     if (!_PyUnicode_WSTR(unicode)) {
1148         Py_DECREF(unicode);
1149         PyErr_NoMemory();
1150         return NULL;
1151     }
1152 
1153     /* Initialize the first element to guard against cases where
1154      * the caller fails before initializing str -- unicode_resize()
1155      * reads str[0], and the Keep-Alive optimization can keep memory
1156      * allocated for str alive across a call to unicode_dealloc(unicode).
1157      * We don't want unicode_resize to read uninitialized memory in
1158      * that case.
1159      */
1160     _PyUnicode_WSTR(unicode)[0] = 0;
1161     _PyUnicode_WSTR(unicode)[length] = 0;
1162 
1163     assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
1164     return unicode;
1165 }
1166 
1167 static const char*
unicode_kind_name(PyObject * unicode)1168 unicode_kind_name(PyObject *unicode)
1169 {
1170     /* don't check consistency: unicode_kind_name() is called from
1171        _PyUnicode_Dump() */
1172     if (!PyUnicode_IS_COMPACT(unicode))
1173     {
1174         if (!PyUnicode_IS_READY(unicode))
1175             return "wstr";
1176         switch (PyUnicode_KIND(unicode))
1177         {
1178         case PyUnicode_1BYTE_KIND:
1179             if (PyUnicode_IS_ASCII(unicode))
1180                 return "legacy ascii";
1181             else
1182                 return "legacy latin1";
1183         case PyUnicode_2BYTE_KIND:
1184             return "legacy UCS2";
1185         case PyUnicode_4BYTE_KIND:
1186             return "legacy UCS4";
1187         default:
1188             return "<legacy invalid kind>";
1189         }
1190     }
1191     assert(PyUnicode_IS_READY(unicode));
1192     switch (PyUnicode_KIND(unicode)) {
1193     case PyUnicode_1BYTE_KIND:
1194         if (PyUnicode_IS_ASCII(unicode))
1195             return "ascii";
1196         else
1197             return "latin1";
1198     case PyUnicode_2BYTE_KIND:
1199         return "UCS2";
1200     case PyUnicode_4BYTE_KIND:
1201         return "UCS4";
1202     default:
1203         return "<invalid compact kind>";
1204     }
1205 }
1206 
1207 #ifdef Py_DEBUG
/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1212 
/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *chars = _PyUnicode_COMPACT_DATA(unicode);
    return chars;
}
_PyUnicode_data(void * unicode)1216 void *_PyUnicode_data(void *unicode){
1217     printf("obj %p\n", unicode);
1218     printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1219     printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1220     printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1221     printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1222     printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1223     return PyUnicode_DATA(unicode);
1224 }
1225 
1226 void
_PyUnicode_Dump(PyObject * op)1227 _PyUnicode_Dump(PyObject *op)
1228 {
1229     PyASCIIObject *ascii = (PyASCIIObject *)op;
1230     PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1231     PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1232     void *data;
1233 
1234     if (ascii->state.compact)
1235     {
1236         if (ascii->state.ascii)
1237             data = (ascii + 1);
1238         else
1239             data = (compact + 1);
1240     }
1241     else
1242         data = unicode->data.any;
1243     printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1244            unicode_kind_name(op), ascii->length);
1245 
1246     if (ascii->wstr == data)
1247         printf("shared ");
1248     printf("wstr=%p", ascii->wstr);
1249 
1250     if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1251         printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
1252         if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1253             printf("shared ");
1254         printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1255                compact->utf8, compact->utf8_length);
1256     }
1257     printf(", data=%p\n", data);
1258 }
1259 #endif
1260 
1261 PyObject *
PyUnicode_New(Py_ssize_t size,Py_UCS4 maxchar)1262 PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1263 {
1264     PyObject *obj;
1265     PyCompactUnicodeObject *unicode;
1266     void *data;
1267     enum PyUnicode_Kind kind;
1268     int is_sharing, is_ascii;
1269     Py_ssize_t char_size;
1270     Py_ssize_t struct_size;
1271 
1272     /* Optimization for empty strings */
1273     if (size == 0 && unicode_empty != NULL) {
1274         Py_INCREF(unicode_empty);
1275         return unicode_empty;
1276     }
1277 
1278     is_ascii = 0;
1279     is_sharing = 0;
1280     struct_size = sizeof(PyCompactUnicodeObject);
1281     if (maxchar < 128) {
1282         kind = PyUnicode_1BYTE_KIND;
1283         char_size = 1;
1284         is_ascii = 1;
1285         struct_size = sizeof(PyASCIIObject);
1286     }
1287     else if (maxchar < 256) {
1288         kind = PyUnicode_1BYTE_KIND;
1289         char_size = 1;
1290     }
1291     else if (maxchar < 65536) {
1292         kind = PyUnicode_2BYTE_KIND;
1293         char_size = 2;
1294         if (sizeof(wchar_t) == 2)
1295             is_sharing = 1;
1296     }
1297     else {
1298         if (maxchar > MAX_UNICODE) {
1299             PyErr_SetString(PyExc_SystemError,
1300                             "invalid maximum character passed to PyUnicode_New");
1301             return NULL;
1302         }
1303         kind = PyUnicode_4BYTE_KIND;
1304         char_size = 4;
1305         if (sizeof(wchar_t) == 4)
1306             is_sharing = 1;
1307     }
1308 
1309     /* Ensure we won't overflow the size. */
1310     if (size < 0) {
1311         PyErr_SetString(PyExc_SystemError,
1312                         "Negative size passed to PyUnicode_New");
1313         return NULL;
1314     }
1315     if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1316         return PyErr_NoMemory();
1317 
1318     /* Duplicated allocation code from _PyObject_New() instead of a call to
1319      * PyObject_New() so we are able to allocate space for the object and
1320      * it's data buffer.
1321      */
1322     obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1323     if (obj == NULL)
1324         return PyErr_NoMemory();
1325     obj = PyObject_INIT(obj, &PyUnicode_Type);
1326     if (obj == NULL)
1327         return NULL;
1328 
1329     unicode = (PyCompactUnicodeObject *)obj;
1330     if (is_ascii)
1331         data = ((PyASCIIObject*)obj) + 1;
1332     else
1333         data = unicode + 1;
1334     _PyUnicode_LENGTH(unicode) = size;
1335     _PyUnicode_HASH(unicode) = -1;
1336     _PyUnicode_STATE(unicode).interned = 0;
1337     _PyUnicode_STATE(unicode).kind = kind;
1338     _PyUnicode_STATE(unicode).compact = 1;
1339     _PyUnicode_STATE(unicode).ready = 1;
1340     _PyUnicode_STATE(unicode).ascii = is_ascii;
1341     if (is_ascii) {
1342         ((char*)data)[size] = 0;
1343         _PyUnicode_WSTR(unicode) = NULL;
1344     }
1345     else if (kind == PyUnicode_1BYTE_KIND) {
1346         ((char*)data)[size] = 0;
1347         _PyUnicode_WSTR(unicode) = NULL;
1348         _PyUnicode_WSTR_LENGTH(unicode) = 0;
1349         unicode->utf8 = NULL;
1350         unicode->utf8_length = 0;
1351     }
1352     else {
1353         unicode->utf8 = NULL;
1354         unicode->utf8_length = 0;
1355         if (kind == PyUnicode_2BYTE_KIND)
1356             ((Py_UCS2*)data)[size] = 0;
1357         else /* kind == PyUnicode_4BYTE_KIND */
1358             ((Py_UCS4*)data)[size] = 0;
1359         if (is_sharing) {
1360             _PyUnicode_WSTR_LENGTH(unicode) = size;
1361             _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1362         }
1363         else {
1364             _PyUnicode_WSTR_LENGTH(unicode) = 0;
1365             _PyUnicode_WSTR(unicode) = NULL;
1366         }
1367     }
1368 #ifdef Py_DEBUG
1369     unicode_fill_invalid((PyObject*)unicode, 0);
1370 #endif
1371     assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1372     return obj;
1373 }
1374 
1375 #if SIZEOF_WCHAR_T == 2
1376 /* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1377    will decode surrogate pairs, the other conversions are implemented as macros
1378    for efficiency.
1379 
1380    This function assumes that unicode can hold one more code point than wstr
1381    characters for a terminating null character. */
1382 static void
unicode_convert_wchar_to_ucs4(const wchar_t * begin,const wchar_t * end,PyObject * unicode)1383 unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1384                               PyObject *unicode)
1385 {
1386     const wchar_t *iter;
1387     Py_UCS4 *ucs4_out;
1388 
1389     assert(unicode != NULL);
1390     assert(_PyUnicode_CHECK(unicode));
1391     assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1392     ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1393 
1394     for (iter = begin; iter < end; ) {
1395         assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1396                            _PyUnicode_GET_LENGTH(unicode)));
1397         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1398             && (iter+1) < end
1399             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1400         {
1401             *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1402             iter += 2;
1403         }
1404         else {
1405             *ucs4_out++ = *iter;
1406             iter++;
1407         }
1408     }
1409     assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1410                         _PyUnicode_GET_LENGTH(unicode)));
1411 
1412 }
1413 #endif
1414 
1415 static int
unicode_check_modifiable(PyObject * unicode)1416 unicode_check_modifiable(PyObject *unicode)
1417 {
1418     if (!unicode_modifiable(unicode)) {
1419         PyErr_SetString(PyExc_SystemError,
1420                         "Cannot modify a string currently used");
1421         return -1;
1422     }
1423     return 0;
1424 }
1425 
1426 static int
_copy_characters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many,int check_maxchar)1427 _copy_characters(PyObject *to, Py_ssize_t to_start,
1428                  PyObject *from, Py_ssize_t from_start,
1429                  Py_ssize_t how_many, int check_maxchar)
1430 {
1431     unsigned int from_kind, to_kind;
1432     void *from_data, *to_data;
1433 
1434     assert(0 <= how_many);
1435     assert(0 <= from_start);
1436     assert(0 <= to_start);
1437     assert(PyUnicode_Check(from));
1438     assert(PyUnicode_IS_READY(from));
1439     assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1440 
1441     assert(PyUnicode_Check(to));
1442     assert(PyUnicode_IS_READY(to));
1443     assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1444 
1445     if (how_many == 0)
1446         return 0;
1447 
1448     from_kind = PyUnicode_KIND(from);
1449     from_data = PyUnicode_DATA(from);
1450     to_kind = PyUnicode_KIND(to);
1451     to_data = PyUnicode_DATA(to);
1452 
1453 #ifdef Py_DEBUG
1454     if (!check_maxchar
1455         && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1456     {
1457         const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1458         Py_UCS4 ch;
1459         Py_ssize_t i;
1460         for (i=0; i < how_many; i++) {
1461             ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1462             assert(ch <= to_maxchar);
1463         }
1464     }
1465 #endif
1466 
1467     if (from_kind == to_kind) {
1468         if (check_maxchar
1469             && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1470         {
1471             /* Writing Latin-1 characters into an ASCII string requires to
1472                check that all written characters are pure ASCII */
1473             Py_UCS4 max_char;
1474             max_char = ucs1lib_find_max_char(from_data,
1475                                              (Py_UCS1*)from_data + how_many);
1476             if (max_char >= 128)
1477                 return -1;
1478         }
1479         memcpy((char*)to_data + to_kind * to_start,
1480                   (char*)from_data + from_kind * from_start,
1481                   to_kind * how_many);
1482     }
1483     else if (from_kind == PyUnicode_1BYTE_KIND
1484              && to_kind == PyUnicode_2BYTE_KIND)
1485     {
1486         _PyUnicode_CONVERT_BYTES(
1487             Py_UCS1, Py_UCS2,
1488             PyUnicode_1BYTE_DATA(from) + from_start,
1489             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1490             PyUnicode_2BYTE_DATA(to) + to_start
1491             );
1492     }
1493     else if (from_kind == PyUnicode_1BYTE_KIND
1494              && to_kind == PyUnicode_4BYTE_KIND)
1495     {
1496         _PyUnicode_CONVERT_BYTES(
1497             Py_UCS1, Py_UCS4,
1498             PyUnicode_1BYTE_DATA(from) + from_start,
1499             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1500             PyUnicode_4BYTE_DATA(to) + to_start
1501             );
1502     }
1503     else if (from_kind == PyUnicode_2BYTE_KIND
1504              && to_kind == PyUnicode_4BYTE_KIND)
1505     {
1506         _PyUnicode_CONVERT_BYTES(
1507             Py_UCS2, Py_UCS4,
1508             PyUnicode_2BYTE_DATA(from) + from_start,
1509             PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1510             PyUnicode_4BYTE_DATA(to) + to_start
1511             );
1512     }
1513     else {
1514         assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1515 
1516         if (!check_maxchar) {
1517             if (from_kind == PyUnicode_2BYTE_KIND
1518                 && to_kind == PyUnicode_1BYTE_KIND)
1519             {
1520                 _PyUnicode_CONVERT_BYTES(
1521                     Py_UCS2, Py_UCS1,
1522                     PyUnicode_2BYTE_DATA(from) + from_start,
1523                     PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1524                     PyUnicode_1BYTE_DATA(to) + to_start
1525                     );
1526             }
1527             else if (from_kind == PyUnicode_4BYTE_KIND
1528                      && to_kind == PyUnicode_1BYTE_KIND)
1529             {
1530                 _PyUnicode_CONVERT_BYTES(
1531                     Py_UCS4, Py_UCS1,
1532                     PyUnicode_4BYTE_DATA(from) + from_start,
1533                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1534                     PyUnicode_1BYTE_DATA(to) + to_start
1535                     );
1536             }
1537             else if (from_kind == PyUnicode_4BYTE_KIND
1538                      && to_kind == PyUnicode_2BYTE_KIND)
1539             {
1540                 _PyUnicode_CONVERT_BYTES(
1541                     Py_UCS4, Py_UCS2,
1542                     PyUnicode_4BYTE_DATA(from) + from_start,
1543                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1544                     PyUnicode_2BYTE_DATA(to) + to_start
1545                     );
1546             }
1547             else {
1548                 Py_UNREACHABLE();
1549             }
1550         }
1551         else {
1552             const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1553             Py_UCS4 ch;
1554             Py_ssize_t i;
1555 
1556             for (i=0; i < how_many; i++) {
1557                 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1558                 if (ch > to_maxchar)
1559                     return -1;
1560                 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1561             }
1562         }
1563     }
1564     return 0;
1565 }
1566 
1567 void
_PyUnicode_FastCopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1568 _PyUnicode_FastCopyCharacters(
1569     PyObject *to, Py_ssize_t to_start,
1570     PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1571 {
1572     (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1573 }
1574 
1575 Py_ssize_t
PyUnicode_CopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1576 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1577                          PyObject *from, Py_ssize_t from_start,
1578                          Py_ssize_t how_many)
1579 {
1580     int err;
1581 
1582     if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1583         PyErr_BadInternalCall();
1584         return -1;
1585     }
1586 
1587     if (PyUnicode_READY(from) == -1)
1588         return -1;
1589     if (PyUnicode_READY(to) == -1)
1590         return -1;
1591 
1592     if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1593         PyErr_SetString(PyExc_IndexError, "string index out of range");
1594         return -1;
1595     }
1596     if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1597         PyErr_SetString(PyExc_IndexError, "string index out of range");
1598         return -1;
1599     }
1600     if (how_many < 0) {
1601         PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1602         return -1;
1603     }
1604     how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1605     if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1606         PyErr_Format(PyExc_SystemError,
1607                      "Cannot write %zi characters at %zi "
1608                      "in a string of %zi characters",
1609                      how_many, to_start, PyUnicode_GET_LENGTH(to));
1610         return -1;
1611     }
1612 
1613     if (how_many == 0)
1614         return 0;
1615 
1616     if (unicode_check_modifiable(to))
1617         return -1;
1618 
1619     err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1620     if (err) {
1621         PyErr_Format(PyExc_SystemError,
1622                      "Cannot copy %s characters "
1623                      "into a string of %s characters",
1624                      unicode_kind_name(from),
1625                      unicode_kind_name(to));
1626         return -1;
1627     }
1628     return how_many;
1629 }
1630 
1631 /* Find the maximum code point and count the number of surrogate pairs so a
1632    correct string length can be computed before converting a string to UCS4.
1633    This function counts single surrogates as a character and not as a pair.
1634 
1635    Return 0 on success, or -1 on error. */
1636 static int
find_maxchar_surrogates(const wchar_t * begin,const wchar_t * end,Py_UCS4 * maxchar,Py_ssize_t * num_surrogates)1637 find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1638                         Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1639 {
1640     const wchar_t *iter;
1641     Py_UCS4 ch;
1642 
1643     assert(num_surrogates != NULL && maxchar != NULL);
1644     *num_surrogates = 0;
1645     *maxchar = 0;
1646 
1647     for (iter = begin; iter < end; ) {
1648 #if SIZEOF_WCHAR_T == 2
1649         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1650             && (iter+1) < end
1651             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1652         {
1653             ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1654             ++(*num_surrogates);
1655             iter += 2;
1656         }
1657         else
1658 #endif
1659         {
1660             ch = *iter;
1661             iter++;
1662         }
1663         if (ch > *maxchar) {
1664             *maxchar = ch;
1665             if (*maxchar > MAX_UNICODE) {
1666                 PyErr_Format(PyExc_ValueError,
1667                              "character U+%x is not in range [U+0000; U+10ffff]",
1668                              ch);
1669                 return -1;
1670             }
1671         }
1672     }
1673     return 0;
1674 }
1675 
1676 int
_PyUnicode_Ready(PyObject * unicode)1677 _PyUnicode_Ready(PyObject *unicode)
1678 {
1679     wchar_t *end;
1680     Py_UCS4 maxchar = 0;
1681     Py_ssize_t num_surrogates;
1682 #if SIZEOF_WCHAR_T == 2
1683     Py_ssize_t length_wo_surrogates;
1684 #endif
1685 
1686     /* _PyUnicode_Ready() is only intended for old-style API usage where
1687        strings were created using _PyObject_New() and where no canonical
1688        representation (the str field) has been set yet aka strings
1689        which are not yet ready. */
1690     assert(_PyUnicode_CHECK(unicode));
1691     assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1692     assert(_PyUnicode_WSTR(unicode) != NULL);
1693     assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1694     assert(_PyUnicode_UTF8(unicode) == NULL);
1695     /* Actually, it should neither be interned nor be anything else: */
1696     assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1697 
1698     end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1699     if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1700                                 &maxchar, &num_surrogates) == -1)
1701         return -1;
1702 
1703     if (maxchar < 256) {
1704         _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1705         if (!_PyUnicode_DATA_ANY(unicode)) {
1706             PyErr_NoMemory();
1707             return -1;
1708         }
1709         _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1710                                 _PyUnicode_WSTR(unicode), end,
1711                                 PyUnicode_1BYTE_DATA(unicode));
1712         PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1713         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1714         _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1715         if (maxchar < 128) {
1716             _PyUnicode_STATE(unicode).ascii = 1;
1717             _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1718             _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1719         }
1720         else {
1721             _PyUnicode_STATE(unicode).ascii = 0;
1722             _PyUnicode_UTF8(unicode) = NULL;
1723             _PyUnicode_UTF8_LENGTH(unicode) = 0;
1724         }
1725         PyObject_FREE(_PyUnicode_WSTR(unicode));
1726         _PyUnicode_WSTR(unicode) = NULL;
1727         _PyUnicode_WSTR_LENGTH(unicode) = 0;
1728     }
1729     /* In this case we might have to convert down from 4-byte native
1730        wchar_t to 2-byte unicode. */
1731     else if (maxchar < 65536) {
1732         assert(num_surrogates == 0 &&
1733                "FindMaxCharAndNumSurrogatePairs() messed up");
1734 
1735 #if SIZEOF_WCHAR_T == 2
1736         /* We can share representations and are done. */
1737         _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1738         PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1739         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1740         _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1741         _PyUnicode_UTF8(unicode) = NULL;
1742         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1743 #else
1744         /* sizeof(wchar_t) == 4 */
1745         _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1746             2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1747         if (!_PyUnicode_DATA_ANY(unicode)) {
1748             PyErr_NoMemory();
1749             return -1;
1750         }
1751         _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1752                                 _PyUnicode_WSTR(unicode), end,
1753                                 PyUnicode_2BYTE_DATA(unicode));
1754         PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1755         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1756         _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1757         _PyUnicode_UTF8(unicode) = NULL;
1758         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1759         PyObject_FREE(_PyUnicode_WSTR(unicode));
1760         _PyUnicode_WSTR(unicode) = NULL;
1761         _PyUnicode_WSTR_LENGTH(unicode) = 0;
1762 #endif
1763     }
1764     /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1765     else {
1766 #if SIZEOF_WCHAR_T == 2
1767         /* in case the native representation is 2-bytes, we need to allocate a
1768            new normalized 4-byte version. */
1769         length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1770         if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1771             PyErr_NoMemory();
1772             return -1;
1773         }
1774         _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1775         if (!_PyUnicode_DATA_ANY(unicode)) {
1776             PyErr_NoMemory();
1777             return -1;
1778         }
1779         _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1780         _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1781         _PyUnicode_UTF8(unicode) = NULL;
1782         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1783         /* unicode_convert_wchar_to_ucs4() requires a ready string */
1784         _PyUnicode_STATE(unicode).ready = 1;
1785         unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1786         PyObject_FREE(_PyUnicode_WSTR(unicode));
1787         _PyUnicode_WSTR(unicode) = NULL;
1788         _PyUnicode_WSTR_LENGTH(unicode) = 0;
1789 #else
1790         assert(num_surrogates == 0);
1791 
1792         _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1793         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1794         _PyUnicode_UTF8(unicode) = NULL;
1795         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1796         _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1797 #endif
1798         PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1799     }
1800     _PyUnicode_STATE(unicode).ready = 1;
1801     assert(_PyUnicode_CheckConsistency(unicode, 1));
1802     return 0;
1803 }
1804 
1805 static void
unicode_dealloc(PyObject * unicode)1806 unicode_dealloc(PyObject *unicode)
1807 {
1808     switch (PyUnicode_CHECK_INTERNED(unicode)) {
1809     case SSTATE_NOT_INTERNED:
1810         break;
1811 
1812     case SSTATE_INTERNED_MORTAL:
1813         /* revive dead object temporarily for DelItem */
1814         Py_REFCNT(unicode) = 3;
1815         if (PyDict_DelItem(interned, unicode) != 0)
1816             Py_FatalError(
1817                 "deletion of interned string failed");
1818         break;
1819 
1820     case SSTATE_INTERNED_IMMORTAL:
1821         Py_FatalError("Immortal interned string died.");
1822         /* fall through */
1823 
1824     default:
1825         Py_FatalError("Inconsistent interned string state.");
1826     }
1827 
1828     if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1829         PyObject_DEL(_PyUnicode_WSTR(unicode));
1830     if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1831         PyObject_DEL(_PyUnicode_UTF8(unicode));
1832     if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1833         PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1834 
1835     Py_TYPE(unicode)->tp_free(unicode);
1836 }
1837 
#ifdef Py_DEBUG
/* Debug helper: return 1 if unicode is the shared empty string or one of
   the cached single-character strings in unicode_latin1[], else 0. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii;
    Py_UCS4 ch;

    if (unicode == unicode_empty) {
        return 1;
    }

    ascii = (PyASCIIObject *)unicode;
    /* Only a string of exactly one character that already has a canonical
       kind can live in the latin1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1) {
        return 0;
    }

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1854 
1855 static int
unicode_modifiable(PyObject * unicode)1856 unicode_modifiable(PyObject *unicode)
1857 {
1858     assert(_PyUnicode_CHECK(unicode));
1859     if (Py_REFCNT(unicode) != 1)
1860         return 0;
1861     if (_PyUnicode_HASH(unicode) != -1)
1862         return 0;
1863     if (PyUnicode_CHECK_INTERNED(unicode))
1864         return 0;
1865     if (!PyUnicode_CheckExact(unicode))
1866         return 0;
1867 #ifdef Py_DEBUG
1868     /* singleton refcount is greater than 1 */
1869     assert(!unicode_is_singleton(unicode));
1870 #endif
1871     return 1;
1872 }
1873 
1874 static int
unicode_resize(PyObject ** p_unicode,Py_ssize_t length)1875 unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1876 {
1877     PyObject *unicode;
1878     Py_ssize_t old_length;
1879 
1880     assert(p_unicode != NULL);
1881     unicode = *p_unicode;
1882 
1883     assert(unicode != NULL);
1884     assert(PyUnicode_Check(unicode));
1885     assert(0 <= length);
1886 
1887     if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1888         old_length = PyUnicode_WSTR_LENGTH(unicode);
1889     else
1890         old_length = PyUnicode_GET_LENGTH(unicode);
1891     if (old_length == length)
1892         return 0;
1893 
1894     if (length == 0) {
1895         _Py_INCREF_UNICODE_EMPTY();
1896         if (!unicode_empty)
1897             return -1;
1898         Py_SETREF(*p_unicode, unicode_empty);
1899         return 0;
1900     }
1901 
1902     if (!unicode_modifiable(unicode)) {
1903         PyObject *copy = resize_copy(unicode, length);
1904         if (copy == NULL)
1905             return -1;
1906         Py_SETREF(*p_unicode, copy);
1907         return 0;
1908     }
1909 
1910     if (PyUnicode_IS_COMPACT(unicode)) {
1911         PyObject *new_unicode = resize_compact(unicode, length);
1912         if (new_unicode == NULL)
1913             return -1;
1914         *p_unicode = new_unicode;
1915         return 0;
1916     }
1917     return resize_inplace(unicode, length);
1918 }
1919 
1920 int
PyUnicode_Resize(PyObject ** p_unicode,Py_ssize_t length)1921 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1922 {
1923     PyObject *unicode;
1924     if (p_unicode == NULL) {
1925         PyErr_BadInternalCall();
1926         return -1;
1927     }
1928     unicode = *p_unicode;
1929     if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1930     {
1931         PyErr_BadInternalCall();
1932         return -1;
1933     }
1934     return unicode_resize(p_unicode, length);
1935 }
1936 
1937 /* Copy an ASCII or latin1 char* string into a Python Unicode string.
1938 
1939    WARNING: The function doesn't copy the terminating null character and
1940    doesn't check the maximum character (may write a latin1 character in an
1941    ASCII string). */
1942 static void
unicode_write_cstr(PyObject * unicode,Py_ssize_t index,const char * str,Py_ssize_t len)1943 unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1944                    const char *str, Py_ssize_t len)
1945 {
1946     enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1947     void *data = PyUnicode_DATA(unicode);
1948     const char *end = str + len;
1949 
1950     switch (kind) {
1951     case PyUnicode_1BYTE_KIND: {
1952         assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1953 #ifdef Py_DEBUG
1954         if (PyUnicode_IS_ASCII(unicode)) {
1955             Py_UCS4 maxchar = ucs1lib_find_max_char(
1956                 (const Py_UCS1*)str,
1957                 (const Py_UCS1*)str + len);
1958             assert(maxchar < 128);
1959         }
1960 #endif
1961         memcpy((char *) data + index, str, len);
1962         break;
1963     }
1964     case PyUnicode_2BYTE_KIND: {
1965         Py_UCS2 *start = (Py_UCS2 *)data + index;
1966         Py_UCS2 *ucs2 = start;
1967         assert(index <= PyUnicode_GET_LENGTH(unicode));
1968 
1969         for (; str < end; ++ucs2, ++str)
1970             *ucs2 = (Py_UCS2)*str;
1971 
1972         assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1973         break;
1974     }
1975     default: {
1976         Py_UCS4 *start = (Py_UCS4 *)data + index;
1977         Py_UCS4 *ucs4 = start;
1978         assert(kind == PyUnicode_4BYTE_KIND);
1979         assert(index <= PyUnicode_GET_LENGTH(unicode));
1980 
1981         for (; str < end; ++ucs4, ++str)
1982             *ucs4 = (Py_UCS4)*str;
1983 
1984         assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1985     }
1986     }
1987 }
1988 
1989 static PyObject*
get_latin1_char(unsigned char ch)1990 get_latin1_char(unsigned char ch)
1991 {
1992     PyObject *unicode = unicode_latin1[ch];
1993     if (!unicode) {
1994         unicode = PyUnicode_New(1, ch);
1995         if (!unicode)
1996             return NULL;
1997         PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1998         assert(_PyUnicode_CheckConsistency(unicode, 1));
1999         unicode_latin1[ch] = unicode;
2000     }
2001     Py_INCREF(unicode);
2002     return unicode;
2003 }
2004 
2005 static PyObject*
unicode_char(Py_UCS4 ch)2006 unicode_char(Py_UCS4 ch)
2007 {
2008     PyObject *unicode;
2009 
2010     assert(ch <= MAX_UNICODE);
2011 
2012     if (ch < 256)
2013         return get_latin1_char(ch);
2014 
2015     unicode = PyUnicode_New(1, ch);
2016     if (unicode == NULL)
2017         return NULL;
2018 
2019     assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2020     if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
2021         PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
2022     } else {
2023         assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2024         PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2025     }
2026     assert(_PyUnicode_CheckConsistency(unicode, 1));
2027     return unicode;
2028 }
2029 
2030 PyObject *
PyUnicode_FromUnicode(const Py_UNICODE * u,Py_ssize_t size)2031 PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
2032 {
2033     if (u == NULL)
2034         return (PyObject*)_PyUnicode_New(size);
2035 
2036     if (size < 0) {
2037         PyErr_BadInternalCall();
2038         return NULL;
2039     }
2040 
2041     return PyUnicode_FromWideChar(u, size);
2042 }
2043 
2044 PyObject *
PyUnicode_FromWideChar(const wchar_t * u,Py_ssize_t size)2045 PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2046 {
2047     PyObject *unicode;
2048     Py_UCS4 maxchar = 0;
2049     Py_ssize_t num_surrogates;
2050 
2051     if (u == NULL && size != 0) {
2052         PyErr_BadInternalCall();
2053         return NULL;
2054     }
2055 
2056     if (size == -1) {
2057         size = wcslen(u);
2058     }
2059 
2060     /* If the Unicode data is known at construction time, we can apply
2061        some optimizations which share commonly used objects. */
2062 
2063     /* Optimization for empty strings */
2064     if (size == 0)
2065         _Py_RETURN_UNICODE_EMPTY();
2066 
2067     /* Single character Unicode objects in the Latin-1 range are
2068        shared when using this constructor */
2069     if (size == 1 && (Py_UCS4)*u < 256)
2070         return get_latin1_char((unsigned char)*u);
2071 
2072     /* If not empty and not single character, copy the Unicode data
2073        into the new object */
2074     if (find_maxchar_surrogates(u, u + size,
2075                                 &maxchar, &num_surrogates) == -1)
2076         return NULL;
2077 
2078     unicode = PyUnicode_New(size - num_surrogates, maxchar);
2079     if (!unicode)
2080         return NULL;
2081 
2082     switch (PyUnicode_KIND(unicode)) {
2083     case PyUnicode_1BYTE_KIND:
2084         _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
2085                                 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2086         break;
2087     case PyUnicode_2BYTE_KIND:
2088 #if Py_UNICODE_SIZE == 2
2089         memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2090 #else
2091         _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
2092                                 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2093 #endif
2094         break;
2095     case PyUnicode_4BYTE_KIND:
2096 #if SIZEOF_WCHAR_T == 2
2097         /* This is the only case which has to process surrogates, thus
2098            a simple copy loop is not enough and we need a function. */
2099         unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2100 #else
2101         assert(num_surrogates == 0);
2102         memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2103 #endif
2104         break;
2105     default:
2106         Py_UNREACHABLE();
2107     }
2108 
2109     return unicode_result(unicode);
2110 }
2111 
2112 PyObject *
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)2113 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2114 {
2115     if (size < 0) {
2116         PyErr_SetString(PyExc_SystemError,
2117                         "Negative size passed to PyUnicode_FromStringAndSize");
2118         return NULL;
2119     }
2120     if (u != NULL)
2121         return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2122     else
2123         return (PyObject *)_PyUnicode_New(size);
2124 }
2125 
2126 PyObject *
PyUnicode_FromString(const char * u)2127 PyUnicode_FromString(const char *u)
2128 {
2129     size_t size = strlen(u);
2130     if (size > PY_SSIZE_T_MAX) {
2131         PyErr_SetString(PyExc_OverflowError, "input too long");
2132         return NULL;
2133     }
2134     return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2135 }
2136 
2137 PyObject *
_PyUnicode_FromId(_Py_Identifier * id)2138 _PyUnicode_FromId(_Py_Identifier *id)
2139 {
2140     if (!id->object) {
2141         id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2142                                                   strlen(id->string),
2143                                                   NULL, NULL);
2144         if (!id->object)
2145             return NULL;
2146         PyUnicode_InternInPlace(&id->object);
2147         assert(!id->next);
2148         id->next = static_strings;
2149         static_strings = id;
2150     }
2151     return id->object;
2152 }
2153 
2154 void
_PyUnicode_ClearStaticStrings()2155 _PyUnicode_ClearStaticStrings()
2156 {
2157     _Py_Identifier *tmp, *s = static_strings;
2158     while (s) {
2159         Py_CLEAR(s->object);
2160         tmp = s->next;
2161         s->next = NULL;
2162         s = tmp;
2163     }
2164     static_strings = NULL;
2165 }
2166 
2167 /* Internal function, doesn't check maximum character */
2168 
2169 PyObject*
_PyUnicode_FromASCII(const char * buffer,Py_ssize_t size)2170 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2171 {
2172     const unsigned char *s = (const unsigned char *)buffer;
2173     PyObject *unicode;
2174     if (size == 1) {
2175 #ifdef Py_DEBUG
2176         assert((unsigned char)s[0] < 128);
2177 #endif
2178         return get_latin1_char(s[0]);
2179     }
2180     unicode = PyUnicode_New(size, 127);
2181     if (!unicode)
2182         return NULL;
2183     memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2184     assert(_PyUnicode_CheckConsistency(unicode, 1));
2185     return unicode;
2186 }
2187 
2188 static Py_UCS4
kind_maxchar_limit(unsigned int kind)2189 kind_maxchar_limit(unsigned int kind)
2190 {
2191     switch (kind) {
2192     case PyUnicode_1BYTE_KIND:
2193         return 0x80;
2194     case PyUnicode_2BYTE_KIND:
2195         return 0x100;
2196     case PyUnicode_4BYTE_KIND:
2197         return 0x10000;
2198     default:
2199         Py_UNREACHABLE();
2200     }
2201 }
2202 
2203 static PyObject*
_PyUnicode_FromUCS1(const Py_UCS1 * u,Py_ssize_t size)2204 _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2205 {
2206     PyObject *res;
2207     unsigned char max_char;
2208 
2209     if (size == 0)
2210         _Py_RETURN_UNICODE_EMPTY();
2211     assert(size > 0);
2212     if (size == 1)
2213         return get_latin1_char(u[0]);
2214 
2215     max_char = ucs1lib_find_max_char(u, u + size);
2216     res = PyUnicode_New(size, max_char);
2217     if (!res)
2218         return NULL;
2219     memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2220     assert(_PyUnicode_CheckConsistency(res, 1));
2221     return res;
2222 }
2223 
2224 static PyObject*
_PyUnicode_FromUCS2(const Py_UCS2 * u,Py_ssize_t size)2225 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2226 {
2227     PyObject *res;
2228     Py_UCS2 max_char;
2229 
2230     if (size == 0)
2231         _Py_RETURN_UNICODE_EMPTY();
2232     assert(size > 0);
2233     if (size == 1)
2234         return unicode_char(u[0]);
2235 
2236     max_char = ucs2lib_find_max_char(u, u + size);
2237     res = PyUnicode_New(size, max_char);
2238     if (!res)
2239         return NULL;
2240     if (max_char >= 256)
2241         memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2242     else {
2243         _PyUnicode_CONVERT_BYTES(
2244             Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2245     }
2246     assert(_PyUnicode_CheckConsistency(res, 1));
2247     return res;
2248 }
2249 
2250 static PyObject*
_PyUnicode_FromUCS4(const Py_UCS4 * u,Py_ssize_t size)2251 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2252 {
2253     PyObject *res;
2254     Py_UCS4 max_char;
2255 
2256     if (size == 0)
2257         _Py_RETURN_UNICODE_EMPTY();
2258     assert(size > 0);
2259     if (size == 1)
2260         return unicode_char(u[0]);
2261 
2262     max_char = ucs4lib_find_max_char(u, u + size);
2263     res = PyUnicode_New(size, max_char);
2264     if (!res)
2265         return NULL;
2266     if (max_char < 256)
2267         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2268                                  PyUnicode_1BYTE_DATA(res));
2269     else if (max_char < 0x10000)
2270         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2271                                  PyUnicode_2BYTE_DATA(res));
2272     else
2273         memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2274     assert(_PyUnicode_CheckConsistency(res, 1));
2275     return res;
2276 }
2277 
2278 PyObject*
PyUnicode_FromKindAndData(int kind,const void * buffer,Py_ssize_t size)2279 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2280 {
2281     if (size < 0) {
2282         PyErr_SetString(PyExc_ValueError, "size must be positive");
2283         return NULL;
2284     }
2285     switch (kind) {
2286     case PyUnicode_1BYTE_KIND:
2287         return _PyUnicode_FromUCS1(buffer, size);
2288     case PyUnicode_2BYTE_KIND:
2289         return _PyUnicode_FromUCS2(buffer, size);
2290     case PyUnicode_4BYTE_KIND:
2291         return _PyUnicode_FromUCS4(buffer, size);
2292     default:
2293         PyErr_SetString(PyExc_SystemError, "invalid kind");
2294         return NULL;
2295     }
2296 }
2297 
2298 Py_UCS4
_PyUnicode_FindMaxChar(PyObject * unicode,Py_ssize_t start,Py_ssize_t end)2299 _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2300 {
2301     enum PyUnicode_Kind kind;
2302     void *startptr, *endptr;
2303 
2304     assert(PyUnicode_IS_READY(unicode));
2305     assert(0 <= start);
2306     assert(end <= PyUnicode_GET_LENGTH(unicode));
2307     assert(start <= end);
2308 
2309     if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2310         return PyUnicode_MAX_CHAR_VALUE(unicode);
2311 
2312     if (start == end)
2313         return 127;
2314 
2315     if (PyUnicode_IS_ASCII(unicode))
2316         return 127;
2317 
2318     kind = PyUnicode_KIND(unicode);
2319     startptr = PyUnicode_DATA(unicode);
2320     endptr = (char *)startptr + end * kind;
2321     startptr = (char *)startptr + start * kind;
2322     switch(kind) {
2323     case PyUnicode_1BYTE_KIND:
2324         return ucs1lib_find_max_char(startptr, endptr);
2325     case PyUnicode_2BYTE_KIND:
2326         return ucs2lib_find_max_char(startptr, endptr);
2327     case PyUnicode_4BYTE_KIND:
2328         return ucs4lib_find_max_char(startptr, endptr);
2329     default:
2330         Py_UNREACHABLE();
2331     }
2332 }
2333 
2334 /* Ensure that a string uses the most efficient storage, if it is not the
2335    case: create a new string with of the right kind. Write NULL into *p_unicode
2336    on error. */
2337 static void
unicode_adjust_maxchar(PyObject ** p_unicode)2338 unicode_adjust_maxchar(PyObject **p_unicode)
2339 {
2340     PyObject *unicode, *copy;
2341     Py_UCS4 max_char;
2342     Py_ssize_t len;
2343     unsigned int kind;
2344 
2345     assert(p_unicode != NULL);
2346     unicode = *p_unicode;
2347     assert(PyUnicode_IS_READY(unicode));
2348     if (PyUnicode_IS_ASCII(unicode))
2349         return;
2350 
2351     len = PyUnicode_GET_LENGTH(unicode);
2352     kind = PyUnicode_KIND(unicode);
2353     if (kind == PyUnicode_1BYTE_KIND) {
2354         const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2355         max_char = ucs1lib_find_max_char(u, u + len);
2356         if (max_char >= 128)
2357             return;
2358     }
2359     else if (kind == PyUnicode_2BYTE_KIND) {
2360         const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2361         max_char = ucs2lib_find_max_char(u, u + len);
2362         if (max_char >= 256)
2363             return;
2364     }
2365     else {
2366         const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2367         assert(kind == PyUnicode_4BYTE_KIND);
2368         max_char = ucs4lib_find_max_char(u, u + len);
2369         if (max_char >= 0x10000)
2370             return;
2371     }
2372     copy = PyUnicode_New(len, max_char);
2373     if (copy != NULL)
2374         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2375     Py_DECREF(unicode);
2376     *p_unicode = copy;
2377 }
2378 
2379 PyObject*
_PyUnicode_Copy(PyObject * unicode)2380 _PyUnicode_Copy(PyObject *unicode)
2381 {
2382     Py_ssize_t length;
2383     PyObject *copy;
2384 
2385     if (!PyUnicode_Check(unicode)) {
2386         PyErr_BadInternalCall();
2387         return NULL;
2388     }
2389     if (PyUnicode_READY(unicode) == -1)
2390         return NULL;
2391 
2392     length = PyUnicode_GET_LENGTH(unicode);
2393     copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2394     if (!copy)
2395         return NULL;
2396     assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2397 
2398     memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2399               length * PyUnicode_KIND(unicode));
2400     assert(_PyUnicode_CheckConsistency(copy, 1));
2401     return copy;
2402 }
2403 
2404 
2405 /* Widen Unicode objects to larger buffers. Don't write terminating null
2406    character. Return NULL on error. */
2407 
2408 void*
_PyUnicode_AsKind(PyObject * s,unsigned int kind)2409 _PyUnicode_AsKind(PyObject *s, unsigned int kind)
2410 {
2411     Py_ssize_t len;
2412     void *result;
2413     unsigned int skind;
2414 
2415     if (PyUnicode_READY(s) == -1)
2416         return NULL;
2417 
2418     len = PyUnicode_GET_LENGTH(s);
2419     skind = PyUnicode_KIND(s);
2420     if (skind >= kind) {
2421         PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2422         return NULL;
2423     }
2424     switch (kind) {
2425     case PyUnicode_2BYTE_KIND:
2426         result = PyMem_New(Py_UCS2, len);
2427         if (!result)
2428             return PyErr_NoMemory();
2429         assert(skind == PyUnicode_1BYTE_KIND);
2430         _PyUnicode_CONVERT_BYTES(
2431             Py_UCS1, Py_UCS2,
2432             PyUnicode_1BYTE_DATA(s),
2433             PyUnicode_1BYTE_DATA(s) + len,
2434             result);
2435         return result;
2436     case PyUnicode_4BYTE_KIND:
2437         result = PyMem_New(Py_UCS4, len);
2438         if (!result)
2439             return PyErr_NoMemory();
2440         if (skind == PyUnicode_2BYTE_KIND) {
2441             _PyUnicode_CONVERT_BYTES(
2442                 Py_UCS2, Py_UCS4,
2443                 PyUnicode_2BYTE_DATA(s),
2444                 PyUnicode_2BYTE_DATA(s) + len,
2445                 result);
2446         }
2447         else {
2448             assert(skind == PyUnicode_1BYTE_KIND);
2449             _PyUnicode_CONVERT_BYTES(
2450                 Py_UCS1, Py_UCS4,
2451                 PyUnicode_1BYTE_DATA(s),
2452                 PyUnicode_1BYTE_DATA(s) + len,
2453                 result);
2454         }
2455         return result;
2456     default:
2457         break;
2458     }
2459     PyErr_SetString(PyExc_SystemError, "invalid kind");
2460     return NULL;
2461 }
2462 
2463 static Py_UCS4*
as_ucs4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2464 as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2465         int copy_null)
2466 {
2467     int kind;
2468     void *data;
2469     Py_ssize_t len, targetlen;
2470     if (PyUnicode_READY(string) == -1)
2471         return NULL;
2472     kind = PyUnicode_KIND(string);
2473     data = PyUnicode_DATA(string);
2474     len = PyUnicode_GET_LENGTH(string);
2475     targetlen = len;
2476     if (copy_null)
2477         targetlen++;
2478     if (!target) {
2479         target = PyMem_New(Py_UCS4, targetlen);
2480         if (!target) {
2481             PyErr_NoMemory();
2482             return NULL;
2483         }
2484     }
2485     else {
2486         if (targetsize < targetlen) {
2487             PyErr_Format(PyExc_SystemError,
2488                          "string is longer than the buffer");
2489             if (copy_null && 0 < targetsize)
2490                 target[0] = 0;
2491             return NULL;
2492         }
2493     }
2494     if (kind == PyUnicode_1BYTE_KIND) {
2495         Py_UCS1 *start = (Py_UCS1 *) data;
2496         _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2497     }
2498     else if (kind == PyUnicode_2BYTE_KIND) {
2499         Py_UCS2 *start = (Py_UCS2 *) data;
2500         _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2501     }
2502     else {
2503         assert(kind == PyUnicode_4BYTE_KIND);
2504         memcpy(target, data, len * sizeof(Py_UCS4));
2505     }
2506     if (copy_null)
2507         target[len] = 0;
2508     return target;
2509 }
2510 
2511 Py_UCS4*
PyUnicode_AsUCS4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2512 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2513                  int copy_null)
2514 {
2515     if (target == NULL || targetsize < 0) {
2516         PyErr_BadInternalCall();
2517         return NULL;
2518     }
2519     return as_ucs4(string, target, targetsize, copy_null);
2520 }
2521 
2522 Py_UCS4*
PyUnicode_AsUCS4Copy(PyObject * string)2523 PyUnicode_AsUCS4Copy(PyObject *string)
2524 {
2525     return as_ucs4(string, NULL, 0, 1);
2526 }
2527 
2528 /* maximum number of characters required for output of %lld or %p.
2529    We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2530    plus 1 for the sign.  53/22 is an upper bound for log10(256). */
2531 #define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2532 
2533 static int
unicode_fromformat_write_str(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t width,Py_ssize_t precision)2534 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2535                              Py_ssize_t width, Py_ssize_t precision)
2536 {
2537     Py_ssize_t length, fill, arglen;
2538     Py_UCS4 maxchar;
2539 
2540     if (PyUnicode_READY(str) == -1)
2541         return -1;
2542 
2543     length = PyUnicode_GET_LENGTH(str);
2544     if ((precision == -1 || precision >= length)
2545         && width <= length)
2546         return _PyUnicodeWriter_WriteStr(writer, str);
2547 
2548     if (precision != -1)
2549         length = Py_MIN(precision, length);
2550 
2551     arglen = Py_MAX(length, width);
2552     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2553         maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2554     else
2555         maxchar = writer->maxchar;
2556 
2557     if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2558         return -1;
2559 
2560     if (width > length) {
2561         fill = width - length;
2562         if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2563             return -1;
2564         writer->pos += fill;
2565     }
2566 
2567     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2568                                   str, 0, length);
2569     writer->pos += length;
2570     return 0;
2571 }
2572 
2573 static int
unicode_fromformat_write_cstr(_PyUnicodeWriter * writer,const char * str,Py_ssize_t width,Py_ssize_t precision)2574 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2575                               Py_ssize_t width, Py_ssize_t precision)
2576 {
2577     /* UTF-8 */
2578     Py_ssize_t length;
2579     PyObject *unicode;
2580     int res;
2581 
2582     if (precision == -1) {
2583         length = strlen(str);
2584     }
2585     else {
2586         length = 0;
2587         while (length < precision && str[length]) {
2588             length++;
2589         }
2590     }
2591     unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2592     if (unicode == NULL)
2593         return -1;
2594 
2595     res = unicode_fromformat_write_str(writer, unicode, width, -1);
2596     Py_DECREF(unicode);
2597     return res;
2598 }
2599 
2600 static const char*
unicode_fromformat_arg(_PyUnicodeWriter * writer,const char * f,va_list * vargs)2601 unicode_fromformat_arg(_PyUnicodeWriter *writer,
2602                        const char *f, va_list *vargs)
2603 {
2604     const char *p;
2605     Py_ssize_t len;
2606     int zeropad;
2607     Py_ssize_t width;
2608     Py_ssize_t precision;
2609     int longflag;
2610     int longlongflag;
2611     int size_tflag;
2612     Py_ssize_t fill;
2613 
2614     p = f;
2615     f++;
2616     zeropad = 0;
2617     if (*f == '0') {
2618         zeropad = 1;
2619         f++;
2620     }
2621 
2622     /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2623     width = -1;
2624     if (Py_ISDIGIT((unsigned)*f)) {
2625         width = *f - '0';
2626         f++;
2627         while (Py_ISDIGIT((unsigned)*f)) {
2628             if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2629                 PyErr_SetString(PyExc_ValueError,
2630                                 "width too big");
2631                 return NULL;
2632             }
2633             width = (width * 10) + (*f - '0');
2634             f++;
2635         }
2636     }
2637     precision = -1;
2638     if (*f == '.') {
2639         f++;
2640         if (Py_ISDIGIT((unsigned)*f)) {
2641             precision = (*f - '0');
2642             f++;
2643             while (Py_ISDIGIT((unsigned)*f)) {
2644                 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2645                     PyErr_SetString(PyExc_ValueError,
2646                                     "precision too big");
2647                     return NULL;
2648                 }
2649                 precision = (precision * 10) + (*f - '0');
2650                 f++;
2651             }
2652         }
2653         if (*f == '%') {
2654             /* "%.3%s" => f points to "3" */
2655             f--;
2656         }
2657     }
2658     if (*f == '\0') {
2659         /* bogus format "%.123" => go backward, f points to "3" */
2660         f--;
2661     }
2662 
2663     /* Handle %ld, %lu, %lld and %llu. */
2664     longflag = 0;
2665     longlongflag = 0;
2666     size_tflag = 0;
2667     if (*f == 'l') {
2668         if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
2669             longflag = 1;
2670             ++f;
2671         }
2672         else if (f[1] == 'l' &&
2673                  (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
2674             longlongflag = 1;
2675             f += 2;
2676         }
2677     }
2678     /* handle the size_t flag. */
2679     else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
2680         size_tflag = 1;
2681         ++f;
2682     }
2683 
2684     if (f[1] == '\0')
2685         writer->overallocate = 0;
2686 
2687     switch (*f) {
2688     case 'c':
2689     {
2690         int ordinal = va_arg(*vargs, int);
2691         if (ordinal < 0 || ordinal > MAX_UNICODE) {
2692             PyErr_SetString(PyExc_OverflowError,
2693                             "character argument not in range(0x110000)");
2694             return NULL;
2695         }
2696         if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2697             return NULL;
2698         break;
2699     }
2700 
2701     case 'i':
2702     case 'd':
2703     case 'u':
2704     case 'x':
2705     {
2706         /* used by sprintf */
2707         char buffer[MAX_LONG_LONG_CHARS];
2708         Py_ssize_t arglen;
2709 
2710         if (*f == 'u') {
2711             if (longflag)
2712                 len = sprintf(buffer, "%lu",
2713                         va_arg(*vargs, unsigned long));
2714             else if (longlongflag)
2715                 len = sprintf(buffer, "%llu",
2716                         va_arg(*vargs, unsigned long long));
2717             else if (size_tflag)
2718                 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
2719                         va_arg(*vargs, size_t));
2720             else
2721                 len = sprintf(buffer, "%u",
2722                         va_arg(*vargs, unsigned int));
2723         }
2724         else if (*f == 'x') {
2725             len = sprintf(buffer, "%x", va_arg(*vargs, int));
2726         }
2727         else {
2728             if (longflag)
2729                 len = sprintf(buffer, "%li",
2730                         va_arg(*vargs, long));
2731             else if (longlongflag)
2732                 len = sprintf(buffer, "%lli",
2733                         va_arg(*vargs, long long));
2734             else if (size_tflag)
2735                 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
2736                         va_arg(*vargs, Py_ssize_t));
2737             else
2738                 len = sprintf(buffer, "%i",
2739                         va_arg(*vargs, int));
2740         }
2741         assert(len >= 0);
2742 
2743         if (precision < len)
2744             precision = len;
2745 
2746         arglen = Py_MAX(precision, width);
2747         if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2748             return NULL;
2749 
2750         if (width > precision) {
2751             Py_UCS4 fillchar;
2752             fill = width - precision;
2753             fillchar = zeropad?'0':' ';
2754             if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2755                 return NULL;
2756             writer->pos += fill;
2757         }
2758         if (precision > len) {
2759             fill = precision - len;
2760             if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2761                 return NULL;
2762             writer->pos += fill;
2763         }
2764 
2765         if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2766             return NULL;
2767         break;
2768     }
2769 
2770     case 'p':
2771     {
2772         char number[MAX_LONG_LONG_CHARS];
2773 
2774         len = sprintf(number, "%p", va_arg(*vargs, void*));
2775         assert(len >= 0);
2776 
2777         /* %p is ill-defined:  ensure leading 0x. */
2778         if (number[1] == 'X')
2779             number[1] = 'x';
2780         else if (number[1] != 'x') {
2781             memmove(number + 2, number,
2782                     strlen(number) + 1);
2783             number[0] = '0';
2784             number[1] = 'x';
2785             len += 2;
2786         }
2787 
2788         if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2789             return NULL;
2790         break;
2791     }
2792 
2793     case 's':
2794     {
2795         /* UTF-8 */
2796         const char *s = va_arg(*vargs, const char*);
2797         if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
2798             return NULL;
2799         break;
2800     }
2801 
2802     case 'U':
2803     {
2804         PyObject *obj = va_arg(*vargs, PyObject *);
2805         assert(obj && _PyUnicode_CHECK(obj));
2806 
2807         if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2808             return NULL;
2809         break;
2810     }
2811 
2812     case 'V':
2813     {
2814         PyObject *obj = va_arg(*vargs, PyObject *);
2815         const char *str = va_arg(*vargs, const char *);
2816         if (obj) {
2817             assert(_PyUnicode_CHECK(obj));
2818             if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2819                 return NULL;
2820         }
2821         else {
2822             assert(str != NULL);
2823             if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
2824                 return NULL;
2825         }
2826         break;
2827     }
2828 
2829     case 'S':
2830     {
2831         PyObject *obj = va_arg(*vargs, PyObject *);
2832         PyObject *str;
2833         assert(obj);
2834         str = PyObject_Str(obj);
2835         if (!str)
2836             return NULL;
2837         if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
2838             Py_DECREF(str);
2839             return NULL;
2840         }
2841         Py_DECREF(str);
2842         break;
2843     }
2844 
2845     case 'R':
2846     {
2847         PyObject *obj = va_arg(*vargs, PyObject *);
2848         PyObject *repr;
2849         assert(obj);
2850         repr = PyObject_Repr(obj);
2851         if (!repr)
2852             return NULL;
2853         if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
2854             Py_DECREF(repr);
2855             return NULL;
2856         }
2857         Py_DECREF(repr);
2858         break;
2859     }
2860 
2861     case 'A':
2862     {
2863         PyObject *obj = va_arg(*vargs, PyObject *);
2864         PyObject *ascii;
2865         assert(obj);
2866         ascii = PyObject_ASCII(obj);
2867         if (!ascii)
2868             return NULL;
2869         if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
2870             Py_DECREF(ascii);
2871             return NULL;
2872         }
2873         Py_DECREF(ascii);
2874         break;
2875     }
2876 
2877     case '%':
2878         if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2879             return NULL;
2880         break;
2881 
2882     default:
2883         /* if we stumble upon an unknown formatting code, copy the rest
2884            of the format string to the output string. (we cannot just
2885            skip the code, since there's no way to know what's in the
2886            argument list) */
2887         len = strlen(p);
2888         if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
2889             return NULL;
2890         f = p+len;
2891         return f;
2892     }
2893 
2894     f++;
2895     return f;
2896 }
2897 
2898 PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)2899 PyUnicode_FromFormatV(const char *format, va_list vargs)
2900 {
2901     va_list vargs2;
2902     const char *f;
2903     _PyUnicodeWriter writer;
2904 
2905     _PyUnicodeWriter_Init(&writer);
2906     writer.min_length = strlen(format) + 100;
2907     writer.overallocate = 1;
2908 
2909     // Copy varags to be able to pass a reference to a subfunction.
2910     va_copy(vargs2, vargs);
2911 
2912     for (f = format; *f; ) {
2913         if (*f == '%') {
2914             f = unicode_fromformat_arg(&writer, f, &vargs2);
2915             if (f == NULL)
2916                 goto fail;
2917         }
2918         else {
2919             const char *p;
2920             Py_ssize_t len;
2921 
2922             p = f;
2923             do
2924             {
2925                 if ((unsigned char)*p > 127) {
2926                     PyErr_Format(PyExc_ValueError,
2927                         "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2928                         "string, got a non-ASCII byte: 0x%02x",
2929                         (unsigned char)*p);
2930                     goto fail;
2931                 }
2932                 p++;
2933             }
2934             while (*p != '\0' && *p != '%');
2935             len = p - f;
2936 
2937             if (*p == '\0')
2938                 writer.overallocate = 0;
2939 
2940             if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
2941                 goto fail;
2942 
2943             f = p;
2944         }
2945     }
2946     va_end(vargs2);
2947     return _PyUnicodeWriter_Finish(&writer);
2948 
2949   fail:
2950     va_end(vargs2);
2951     _PyUnicodeWriter_Dealloc(&writer);
2952     return NULL;
2953 }
2954 
2955 PyObject *
PyUnicode_FromFormat(const char * format,...)2956 PyUnicode_FromFormat(const char *format, ...)
2957 {
2958     PyObject* ret;
2959     va_list vargs;
2960 
2961 #ifdef HAVE_STDARG_PROTOTYPES
2962     va_start(vargs, format);
2963 #else
2964     va_start(vargs);
2965 #endif
2966     ret = PyUnicode_FromFormatV(format, vargs);
2967     va_end(vargs);
2968     return ret;
2969 }
2970 
2971 #ifdef HAVE_WCHAR_H
2972 
2973 /* Convert a Unicode object to a wide character string.
2974 
2975    - If w is NULL: return the number of wide characters (including the null
2976      character) required to convert the unicode object. Ignore size argument.
2977 
2978    - Otherwise: return the number of wide characters (excluding the null
2979      character) written into w. Write at most size wide characters (including
2980      the null character). */
2981 Py_ssize_t
PyUnicode_AsWideChar(PyObject * unicode,wchar_t * w,Py_ssize_t size)2982 PyUnicode_AsWideChar(PyObject *unicode,
2983                      wchar_t *w,
2984                      Py_ssize_t size)
2985 {
2986     Py_ssize_t res;
2987     const wchar_t *wstr;
2988 
2989     if (unicode == NULL) {
2990         PyErr_BadInternalCall();
2991         return -1;
2992     }
2993     wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2994     if (wstr == NULL)
2995         return -1;
2996 
2997     if (w != NULL) {
2998         if (size > res)
2999             size = res + 1;
3000         else
3001             res = size;
3002         memcpy(w, wstr, size * sizeof(wchar_t));
3003         return res;
3004     }
3005     else
3006         return res + 1;
3007 }
3008 
3009 wchar_t*
PyUnicode_AsWideCharString(PyObject * unicode,Py_ssize_t * size)3010 PyUnicode_AsWideCharString(PyObject *unicode,
3011                            Py_ssize_t *size)
3012 {
3013     const wchar_t *wstr;
3014     wchar_t *buffer;
3015     Py_ssize_t buflen;
3016 
3017     if (unicode == NULL) {
3018         PyErr_BadInternalCall();
3019         return NULL;
3020     }
3021 
3022     wstr = PyUnicode_AsUnicodeAndSize(unicode, &buflen);
3023     if (wstr == NULL) {
3024         return NULL;
3025     }
3026     if (size == NULL && wcslen(wstr) != (size_t)buflen) {
3027         PyErr_SetString(PyExc_ValueError,
3028                         "embedded null character");
3029         return NULL;
3030     }
3031 
3032     buffer = PyMem_NEW(wchar_t, buflen + 1);
3033     if (buffer == NULL) {
3034         PyErr_NoMemory();
3035         return NULL;
3036     }
3037     memcpy(buffer, wstr, (buflen + 1) * sizeof(wchar_t));
3038     if (size != NULL)
3039         *size = buflen;
3040     return buffer;
3041 }
3042 
3043 #endif /* HAVE_WCHAR_H */
3044 
3045 PyObject *
PyUnicode_FromOrdinal(int ordinal)3046 PyUnicode_FromOrdinal(int ordinal)
3047 {
3048     if (ordinal < 0 || ordinal > MAX_UNICODE) {
3049         PyErr_SetString(PyExc_ValueError,
3050                         "chr() arg not in range(0x110000)");
3051         return NULL;
3052     }
3053 
3054     return unicode_char((Py_UCS4)ordinal);
3055 }
3056 
3057 PyObject *
PyUnicode_FromObject(PyObject * obj)3058 PyUnicode_FromObject(PyObject *obj)
3059 {
3060     /* XXX Perhaps we should make this API an alias of
3061        PyObject_Str() instead ?! */
3062     if (PyUnicode_CheckExact(obj)) {
3063         if (PyUnicode_READY(obj) == -1)
3064             return NULL;
3065         Py_INCREF(obj);
3066         return obj;
3067     }
3068     if (PyUnicode_Check(obj)) {
3069         /* For a Unicode subtype that's not a Unicode object,
3070            return a true Unicode object with the same data. */
3071         return _PyUnicode_Copy(obj);
3072     }
3073     PyErr_Format(PyExc_TypeError,
3074                  "Can't convert '%.100s' object to str implicitly",
3075                  Py_TYPE(obj)->tp_name);
3076     return NULL;
3077 }
3078 
3079 PyObject *
PyUnicode_FromEncodedObject(PyObject * obj,const char * encoding,const char * errors)3080 PyUnicode_FromEncodedObject(PyObject *obj,
3081                             const char *encoding,
3082                             const char *errors)
3083 {
3084     Py_buffer buffer;
3085     PyObject *v;
3086 
3087     if (obj == NULL) {
3088         PyErr_BadInternalCall();
3089         return NULL;
3090     }
3091 
3092     /* Decoding bytes objects is the most common case and should be fast */
3093     if (PyBytes_Check(obj)) {
3094         if (PyBytes_GET_SIZE(obj) == 0)
3095             _Py_RETURN_UNICODE_EMPTY();
3096         v = PyUnicode_Decode(
3097                 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3098                 encoding, errors);
3099         return v;
3100     }
3101 
3102     if (PyUnicode_Check(obj)) {
3103         PyErr_SetString(PyExc_TypeError,
3104                         "decoding str is not supported");
3105         return NULL;
3106     }
3107 
3108     /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3109     if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3110         PyErr_Format(PyExc_TypeError,
3111                      "decoding to str: need a bytes-like object, %.80s found",
3112                      Py_TYPE(obj)->tp_name);
3113         return NULL;
3114     }
3115 
3116     if (buffer.len == 0) {
3117         PyBuffer_Release(&buffer);
3118         _Py_RETURN_UNICODE_EMPTY();
3119     }
3120 
3121     v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3122     PyBuffer_Release(&buffer);
3123     return v;
3124 }
3125 
/* Write a normalized, lowercase copy of an encoding name into lower.

   Alphanumeric characters and '.' are kept (lowercased with Py_TOLOWER);
   every run of other characters found between two kept characters is
   collapsed into a single '_'.  Runs at the very beginning or end of the
   name produce no output.  This is similar to
   encodings.normalize_encoding(), with the conversion to lowercase added.

   Return 1 on success, or 0 if the normalized name does not fit into
   lower_len-1 characters. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    char *const limit = &lower[lower_len - 1];
    int separator_pending = 0;

    assert(encoding != NULL);

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the separator; it is only emitted once another
               kept character follows. */
            separator_pending = 1;
            continue;
        }

        if (separator_pending && dst != lower) {
            if (dst == limit) {
                return 0;
            }
            *dst++ = '_';
        }
        separator_pending = 0;

        if (dst == limit) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }

    *dst = '\0';
    return 1;
}
3174 
3175 PyObject *
PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)3176 PyUnicode_Decode(const char *s,
3177                  Py_ssize_t size,
3178                  const char *encoding,
3179                  const char *errors)
3180 {
3181     PyObject *buffer = NULL, *unicode;
3182     Py_buffer info;
3183     char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3184 
3185     if (encoding == NULL) {
3186         return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3187     }
3188 
3189     /* Shortcuts for common default encodings */
3190     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3191         char *lower = buflower;
3192 
3193         /* Fast paths */
3194         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3195             lower += 3;
3196             if (*lower == '_') {
3197                 /* Match "utf8" and "utf_8" */
3198                 lower++;
3199             }
3200 
3201             if (lower[0] == '8' && lower[1] == 0) {
3202                 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3203             }
3204             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3205                 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3206             }
3207             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3208                 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3209             }
3210         }
3211         else {
3212             if (strcmp(lower, "ascii") == 0
3213                 || strcmp(lower, "us_ascii") == 0) {
3214                 return PyUnicode_DecodeASCII(s, size, errors);
3215             }
3216     #ifdef MS_WINDOWS
3217             else if (strcmp(lower, "mbcs") == 0) {
3218                 return PyUnicode_DecodeMBCS(s, size, errors);
3219             }
3220     #endif
3221             else if (strcmp(lower, "latin1") == 0
3222                      || strcmp(lower, "latin_1") == 0
3223                      || strcmp(lower, "iso_8859_1") == 0
3224                      || strcmp(lower, "iso8859_1") == 0) {
3225                 return PyUnicode_DecodeLatin1(s, size, errors);
3226             }
3227         }
3228     }
3229 
3230     /* Decode via the codec registry */
3231     buffer = NULL;
3232     if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3233         goto onError;
3234     buffer = PyMemoryView_FromBuffer(&info);
3235     if (buffer == NULL)
3236         goto onError;
3237     unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3238     if (unicode == NULL)
3239         goto onError;
3240     if (!PyUnicode_Check(unicode)) {
3241         PyErr_Format(PyExc_TypeError,
3242                      "'%.400s' decoder returned '%.400s' instead of 'str'; "
3243                      "use codecs.decode() to decode to arbitrary types",
3244                      encoding,
3245                      Py_TYPE(unicode)->tp_name);
3246         Py_DECREF(unicode);
3247         goto onError;
3248     }
3249     Py_DECREF(buffer);
3250     return unicode_result(unicode);
3251 
3252   onError:
3253     Py_XDECREF(buffer);
3254     return NULL;
3255 }
3256 
3257 PyObject *
PyUnicode_AsDecodedObject(PyObject * unicode,const char * encoding,const char * errors)3258 PyUnicode_AsDecodedObject(PyObject *unicode,
3259                           const char *encoding,
3260                           const char *errors)
3261 {
3262     if (!PyUnicode_Check(unicode)) {
3263         PyErr_BadArgument();
3264         return NULL;
3265     }
3266 
3267     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3268                      "PyUnicode_AsDecodedObject() is deprecated; "
3269                      "use PyCodec_Decode() to decode from str", 1) < 0)
3270         return NULL;
3271 
3272     if (encoding == NULL)
3273         encoding = PyUnicode_GetDefaultEncoding();
3274 
3275     /* Decode via the codec registry */
3276     return PyCodec_Decode(unicode, encoding, errors);
3277 }
3278 
3279 PyObject *
PyUnicode_AsDecodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3280 PyUnicode_AsDecodedUnicode(PyObject *unicode,
3281                            const char *encoding,
3282                            const char *errors)
3283 {
3284     PyObject *v;
3285 
3286     if (!PyUnicode_Check(unicode)) {
3287         PyErr_BadArgument();
3288         goto onError;
3289     }
3290 
3291     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3292                      "PyUnicode_AsDecodedUnicode() is deprecated; "
3293                      "use PyCodec_Decode() to decode from str to str", 1) < 0)
3294         return NULL;
3295 
3296     if (encoding == NULL)
3297         encoding = PyUnicode_GetDefaultEncoding();
3298 
3299     /* Decode via the codec registry */
3300     v = PyCodec_Decode(unicode, encoding, errors);
3301     if (v == NULL)
3302         goto onError;
3303     if (!PyUnicode_Check(v)) {
3304         PyErr_Format(PyExc_TypeError,
3305                      "'%.400s' decoder returned '%.400s' instead of 'str'; "
3306                      "use codecs.decode() to decode to arbitrary types",
3307                      encoding,
3308                      Py_TYPE(unicode)->tp_name);
3309         Py_DECREF(v);
3310         goto onError;
3311     }
3312     return unicode_result(v);
3313 
3314   onError:
3315     return NULL;
3316 }
3317 
3318 PyObject *
PyUnicode_Encode(const Py_UNICODE * s,Py_ssize_t size,const char * encoding,const char * errors)3319 PyUnicode_Encode(const Py_UNICODE *s,
3320                  Py_ssize_t size,
3321                  const char *encoding,
3322                  const char *errors)
3323 {
3324     PyObject *v, *unicode;
3325 
3326     unicode = PyUnicode_FromWideChar(s, size);
3327     if (unicode == NULL)
3328         return NULL;
3329     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3330     Py_DECREF(unicode);
3331     return v;
3332 }
3333 
3334 PyObject *
PyUnicode_AsEncodedObject(PyObject * unicode,const char * encoding,const char * errors)3335 PyUnicode_AsEncodedObject(PyObject *unicode,
3336                           const char *encoding,
3337                           const char *errors)
3338 {
3339     PyObject *v;
3340 
3341     if (!PyUnicode_Check(unicode)) {
3342         PyErr_BadArgument();
3343         goto onError;
3344     }
3345 
3346     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3347                      "PyUnicode_AsEncodedObject() is deprecated; "
3348                      "use PyUnicode_AsEncodedString() to encode from str to bytes "
3349                      "or PyCodec_Encode() for generic encoding", 1) < 0)
3350         return NULL;
3351 
3352     if (encoding == NULL)
3353         encoding = PyUnicode_GetDefaultEncoding();
3354 
3355     /* Encode via the codec registry */
3356     v = PyCodec_Encode(unicode, encoding, errors);
3357     if (v == NULL)
3358         goto onError;
3359     return v;
3360 
3361   onError:
3362     return NULL;
3363 }
3364 
3365 static int
locale_error_handler(const char * errors,int * surrogateescape)3366 locale_error_handler(const char *errors, int *surrogateescape)
3367 {
3368     _Py_error_handler error_handler = get_error_handler(errors);
3369     switch (error_handler)
3370     {
3371     case _Py_ERROR_STRICT:
3372         *surrogateescape = 0;
3373         return 0;
3374     case _Py_ERROR_SURROGATEESCAPE:
3375         *surrogateescape = 1;
3376         return 0;
3377     default:
3378         PyErr_Format(PyExc_ValueError,
3379                      "only 'strict' and 'surrogateescape' error handlers "
3380                      "are supported, not '%s'",
3381                      errors);
3382         return -1;
3383     }
3384 }
3385 
3386 static PyObject *
unicode_encode_locale(PyObject * unicode,const char * errors,int current_locale)3387 unicode_encode_locale(PyObject *unicode, const char *errors,
3388                       int current_locale)
3389 {
3390     int surrogateescape;
3391     if (locale_error_handler(errors, &surrogateescape) < 0)
3392         return NULL;
3393 
3394     Py_ssize_t wlen;
3395     wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3396     if (wstr == NULL) {
3397         return NULL;
3398     }
3399 
3400     if ((size_t)wlen != wcslen(wstr)) {
3401         PyErr_SetString(PyExc_ValueError, "embedded null character");
3402         PyMem_Free(wstr);
3403         return NULL;
3404     }
3405 
3406     char *str;
3407     size_t error_pos;
3408     const char *reason;
3409     int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3410                                  current_locale, surrogateescape);
3411     PyMem_Free(wstr);
3412 
3413     if (res != 0) {
3414         if (res == -2) {
3415             PyObject *exc;
3416             exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3417                     "locale", unicode,
3418                     (Py_ssize_t)error_pos,
3419                     (Py_ssize_t)(error_pos+1),
3420                     reason);
3421             if (exc != NULL) {
3422                 PyCodec_StrictErrors(exc);
3423                 Py_DECREF(exc);
3424             }
3425         }
3426         else {
3427             PyErr_NoMemory();
3428         }
3429         return NULL;
3430     }
3431 
3432     PyObject *bytes = PyBytes_FromString(str);
3433     PyMem_RawFree(str);
3434     return bytes;
3435 }
3436 
3437 PyObject *
PyUnicode_EncodeLocale(PyObject * unicode,const char * errors)3438 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3439 {
3440     return unicode_encode_locale(unicode, errors, 1);
3441 }
3442 
3443 PyObject *
PyUnicode_EncodeFSDefault(PyObject * unicode)3444 PyUnicode_EncodeFSDefault(PyObject *unicode)
3445 {
3446 #if defined(__APPLE__)
3447     return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
3448 #else
3449     PyInterpreterState *interp = PyThreadState_GET()->interp;
3450     /* Bootstrap check: if the filesystem codec is implemented in Python, we
3451        cannot use it to encode and decode filenames before it is loaded. Load
3452        the Python codec requires to encode at least its own filename. Use the C
3453        version of the locale codec until the codec registry is initialized and
3454        the Python codec is loaded.
3455 
3456        Py_FileSystemDefaultEncoding is shared between all interpreters, we
3457        cannot only rely on it: check also interp->fscodec_initialized for
3458        subinterpreters. */
3459     if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3460         return PyUnicode_AsEncodedString(unicode,
3461                                          Py_FileSystemDefaultEncoding,
3462                                          Py_FileSystemDefaultEncodeErrors);
3463     }
3464     else {
3465         return unicode_encode_locale(unicode,
3466                                      Py_FileSystemDefaultEncodeErrors, 0);
3467     }
3468 #endif
3469 }
3470 
3471 PyObject *
PyUnicode_AsEncodedString(PyObject * unicode,const char * encoding,const char * errors)3472 PyUnicode_AsEncodedString(PyObject *unicode,
3473                           const char *encoding,
3474                           const char *errors)
3475 {
3476     PyObject *v;
3477     char buflower[11];   /* strlen("iso_8859_1\0") == 11, longest shortcut */
3478 
3479     if (!PyUnicode_Check(unicode)) {
3480         PyErr_BadArgument();
3481         return NULL;
3482     }
3483 
3484     if (encoding == NULL) {
3485         return _PyUnicode_AsUTF8String(unicode, errors);
3486     }
3487 
3488     /* Shortcuts for common default encodings */
3489     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3490         char *lower = buflower;
3491 
3492         /* Fast paths */
3493         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3494             lower += 3;
3495             if (*lower == '_') {
3496                 /* Match "utf8" and "utf_8" */
3497                 lower++;
3498             }
3499 
3500             if (lower[0] == '8' && lower[1] == 0) {
3501                 return _PyUnicode_AsUTF8String(unicode, errors);
3502             }
3503             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3504                 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3505             }
3506             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3507                 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3508             }
3509         }
3510         else {
3511             if (strcmp(lower, "ascii") == 0
3512                 || strcmp(lower, "us_ascii") == 0) {
3513                 return _PyUnicode_AsASCIIString(unicode, errors);
3514             }
3515 #ifdef MS_WINDOWS
3516             else if (strcmp(lower, "mbcs") == 0) {
3517                 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3518             }
3519 #endif
3520             else if (strcmp(lower, "latin1") == 0 ||
3521                      strcmp(lower, "latin_1") == 0 ||
3522                      strcmp(lower, "iso_8859_1") == 0 ||
3523                      strcmp(lower, "iso8859_1") == 0) {
3524                 return _PyUnicode_AsLatin1String(unicode, errors);
3525             }
3526         }
3527     }
3528 
3529     /* Encode via the codec registry */
3530     v = _PyCodec_EncodeText(unicode, encoding, errors);
3531     if (v == NULL)
3532         return NULL;
3533 
3534     /* The normal path */
3535     if (PyBytes_Check(v))
3536         return v;
3537 
3538     /* If the codec returns a buffer, raise a warning and convert to bytes */
3539     if (PyByteArray_Check(v)) {
3540         int error;
3541         PyObject *b;
3542 
3543         error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3544             "encoder %s returned bytearray instead of bytes; "
3545             "use codecs.encode() to encode to arbitrary types",
3546             encoding);
3547         if (error) {
3548             Py_DECREF(v);
3549             return NULL;
3550         }
3551 
3552         b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3553                                       PyByteArray_GET_SIZE(v));
3554         Py_DECREF(v);
3555         return b;
3556     }
3557 
3558     PyErr_Format(PyExc_TypeError,
3559                  "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3560                  "use codecs.encode() to encode to arbitrary types",
3561                  encoding,
3562                  Py_TYPE(v)->tp_name);
3563     Py_DECREF(v);
3564     return NULL;
3565 }
3566 
3567 PyObject *
PyUnicode_AsEncodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3568 PyUnicode_AsEncodedUnicode(PyObject *unicode,
3569                            const char *encoding,
3570                            const char *errors)
3571 {
3572     PyObject *v;
3573 
3574     if (!PyUnicode_Check(unicode)) {
3575         PyErr_BadArgument();
3576         goto onError;
3577     }
3578 
3579     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3580                      "PyUnicode_AsEncodedUnicode() is deprecated; "
3581                      "use PyCodec_Encode() to encode from str to str", 1) < 0)
3582         return NULL;
3583 
3584     if (encoding == NULL)
3585         encoding = PyUnicode_GetDefaultEncoding();
3586 
3587     /* Encode via the codec registry */
3588     v = PyCodec_Encode(unicode, encoding, errors);
3589     if (v == NULL)
3590         goto onError;
3591     if (!PyUnicode_Check(v)) {
3592         PyErr_Format(PyExc_TypeError,
3593                      "'%.400s' encoder returned '%.400s' instead of 'str'; "
3594                      "use codecs.encode() to encode to arbitrary types",
3595                      encoding,
3596                      Py_TYPE(v)->tp_name);
3597         Py_DECREF(v);
3598         goto onError;
3599     }
3600     return v;
3601 
3602   onError:
3603     return NULL;
3604 }
3605 
3606 static PyObject*
unicode_decode_locale(const char * str,Py_ssize_t len,const char * errors,int current_locale)3607 unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
3608                       int current_locale)
3609 {
3610     int surrogateescape;
3611     if (locale_error_handler(errors, &surrogateescape) < 0)
3612         return NULL;
3613 
3614     if (str[len] != '\0' || (size_t)len != strlen(str))  {
3615         PyErr_SetString(PyExc_ValueError, "embedded null byte");
3616         return NULL;
3617     }
3618 
3619     wchar_t *wstr;
3620     size_t wlen;
3621     const char *reason;
3622     int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3623                                  current_locale, surrogateescape);
3624     if (res != 0) {
3625         if (res == -2) {
3626             PyObject *exc;
3627             exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3628                                         "locale", str, len,
3629                                         (Py_ssize_t)wlen,
3630                                         (Py_ssize_t)(wlen + 1),
3631                                         reason);
3632             if (exc != NULL) {
3633                 PyCodec_StrictErrors(exc);
3634                 Py_DECREF(exc);
3635             }
3636         }
3637         else {
3638             PyErr_NoMemory();
3639         }
3640         return NULL;
3641     }
3642 
3643     PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3644     PyMem_RawFree(wstr);
3645     return unicode;
3646 }
3647 
3648 PyObject*
PyUnicode_DecodeLocaleAndSize(const char * str,Py_ssize_t len,const char * errors)3649 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3650                               const char *errors)
3651 {
3652     return unicode_decode_locale(str, len, errors, 1);
3653 }
3654 
3655 PyObject*
PyUnicode_DecodeLocale(const char * str,const char * errors)3656 PyUnicode_DecodeLocale(const char *str, const char *errors)
3657 {
3658     Py_ssize_t size = (Py_ssize_t)strlen(str);
3659     return unicode_decode_locale(str, size, errors, 1);
3660 }
3661 
3662 
3663 PyObject*
PyUnicode_DecodeFSDefault(const char * s)3664 PyUnicode_DecodeFSDefault(const char *s) {
3665     Py_ssize_t size = (Py_ssize_t)strlen(s);
3666     return PyUnicode_DecodeFSDefaultAndSize(s, size);
3667 }
3668 
3669 PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char * s,Py_ssize_t size)3670 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3671 {
3672 #if defined(__APPLE__)
3673     return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
3674 #else
3675     PyInterpreterState *interp = PyThreadState_GET()->interp;
3676     /* Bootstrap check: if the filesystem codec is implemented in Python, we
3677        cannot use it to encode and decode filenames before it is loaded. Load
3678        the Python codec requires to encode at least its own filename. Use the C
3679        version of the locale codec until the codec registry is initialized and
3680        the Python codec is loaded.
3681 
3682        Py_FileSystemDefaultEncoding is shared between all interpreters, we
3683        cannot only rely on it: check also interp->fscodec_initialized for
3684        subinterpreters. */
3685     if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3686         return PyUnicode_Decode(s, size,
3687                                 Py_FileSystemDefaultEncoding,
3688                                 Py_FileSystemDefaultEncodeErrors);
3689     }
3690     else {
3691         return unicode_decode_locale(s, size,
3692                                      Py_FileSystemDefaultEncodeErrors, 0);
3693     }
3694 #endif
3695 }
3696 
3697 
3698 int
PyUnicode_FSConverter(PyObject * arg,void * addr)3699 PyUnicode_FSConverter(PyObject* arg, void* addr)
3700 {
3701     PyObject *path = NULL;
3702     PyObject *output = NULL;
3703     Py_ssize_t size;
3704     void *data;
3705     if (arg == NULL) {
3706         Py_DECREF(*(PyObject**)addr);
3707         *(PyObject**)addr = NULL;
3708         return 1;
3709     }
3710     path = PyOS_FSPath(arg);
3711     if (path == NULL) {
3712         return 0;
3713     }
3714     if (PyBytes_Check(path)) {
3715         output = path;
3716     }
3717     else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
3718         output = PyUnicode_EncodeFSDefault(path);
3719         Py_DECREF(path);
3720         if (!output) {
3721             return 0;
3722         }
3723         assert(PyBytes_Check(output));
3724     }
3725 
3726     size = PyBytes_GET_SIZE(output);
3727     data = PyBytes_AS_STRING(output);
3728     if ((size_t)size != strlen(data)) {
3729         PyErr_SetString(PyExc_ValueError, "embedded null byte");
3730         Py_DECREF(output);
3731         return 0;
3732     }
3733     *(PyObject**)addr = output;
3734     return Py_CLEANUP_SUPPORTED;
3735 }
3736 
3737 
3738 int
PyUnicode_FSDecoder(PyObject * arg,void * addr)3739 PyUnicode_FSDecoder(PyObject* arg, void* addr)
3740 {
3741     int is_buffer = 0;
3742     PyObject *path = NULL;
3743     PyObject *output = NULL;
3744     if (arg == NULL) {
3745         Py_DECREF(*(PyObject**)addr);
3746         *(PyObject**)addr = NULL;
3747         return 1;
3748     }
3749 
3750     is_buffer = PyObject_CheckBuffer(arg);
3751     if (!is_buffer) {
3752         path = PyOS_FSPath(arg);
3753         if (path == NULL) {
3754             return 0;
3755         }
3756     }
3757     else {
3758         path = arg;
3759         Py_INCREF(arg);
3760     }
3761 
3762     if (PyUnicode_Check(path)) {
3763         if (PyUnicode_READY(path) == -1) {
3764             Py_DECREF(path);
3765             return 0;
3766         }
3767         output = path;
3768     }
3769     else if (PyBytes_Check(path) || is_buffer) {
3770         PyObject *path_bytes = NULL;
3771 
3772         if (!PyBytes_Check(path) &&
3773             PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3774             "path should be string, bytes, or os.PathLike, not %.200s",
3775             Py_TYPE(arg)->tp_name)) {
3776                 Py_DECREF(path);
3777             return 0;
3778         }
3779         path_bytes = PyBytes_FromObject(path);
3780         Py_DECREF(path);
3781         if (!path_bytes) {
3782             return 0;
3783         }
3784         output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3785                                                   PyBytes_GET_SIZE(path_bytes));
3786         Py_DECREF(path_bytes);
3787         if (!output) {
3788             return 0;
3789         }
3790     }
3791     else {
3792         PyErr_Format(PyExc_TypeError,
3793                      "path should be string, bytes, or os.PathLike, not %.200s",
3794                      Py_TYPE(arg)->tp_name);
3795         Py_DECREF(path);
3796         return 0;
3797     }
3798     if (PyUnicode_READY(output) == -1) {
3799         Py_DECREF(output);
3800         return 0;
3801     }
3802     if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3803                  PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3804         PyErr_SetString(PyExc_ValueError, "embedded null character");
3805         Py_DECREF(output);
3806         return 0;
3807     }
3808     *(PyObject**)addr = output;
3809     return Py_CLEANUP_SUPPORTED;
3810 }
3811 
3812 
3813 const char *
PyUnicode_AsUTF8AndSize(PyObject * unicode,Py_ssize_t * psize)3814 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3815 {
3816     PyObject *bytes;
3817 
3818     if (!PyUnicode_Check(unicode)) {
3819         PyErr_BadArgument();
3820         return NULL;
3821     }
3822     if (PyUnicode_READY(unicode) == -1)
3823         return NULL;
3824 
3825     if (PyUnicode_UTF8(unicode) == NULL) {
3826         assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3827         bytes = _PyUnicode_AsUTF8String(unicode, NULL);
3828         if (bytes == NULL)
3829             return NULL;
3830         _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3831         if (_PyUnicode_UTF8(unicode) == NULL) {
3832             PyErr_NoMemory();
3833             Py_DECREF(bytes);
3834             return NULL;
3835         }
3836         _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3837         memcpy(_PyUnicode_UTF8(unicode),
3838                   PyBytes_AS_STRING(bytes),
3839                   _PyUnicode_UTF8_LENGTH(unicode) + 1);
3840         Py_DECREF(bytes);
3841     }
3842 
3843     if (psize)
3844         *psize = PyUnicode_UTF8_LENGTH(unicode);
3845     return PyUnicode_UTF8(unicode);
3846 }
3847 
3848 const char *
PyUnicode_AsUTF8(PyObject * unicode)3849 PyUnicode_AsUTF8(PyObject *unicode)
3850 {
3851     return PyUnicode_AsUTF8AndSize(unicode, NULL);
3852 }
3853 
/* Return the wchar_t (Py_UNICODE) buffer of unicode, building it on first
   use and storing it in the object's wstr slot so later calls reuse it.

   If size is not NULL, *size receives PyUnicode_WSTR_LENGTH(unicode).
   Returns NULL with an exception set if unicode is not a str object or if
   the buffer cannot be allocated.

   The buffer is owned by the object; this function hands out the stored
   pointer, not a copy. */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    const unsigned char *one_byte;
    /* Only the source width that differs from wchar_t needs a converting
       loop, so the helper variables depend on sizeof(wchar_t). */
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (_PyUnicode_WSTR(unicode) == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(unicode) != 0);
        assert(PyUnicode_IS_READY(unicode));

        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
            num_surrogates = 0;

            /* First pass: count the code points above 0xFFFF; each of them
               takes two wchar_t units (a surrogate pair) in the output. */
            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* One unit per code point, one extra per surrogate pair, plus
               the terminating null. */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;

            /* Second pass: copy, splitting large code points into pairs. */
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    assert(*four_bytes <= MAX_UNICODE);
                    /* encode surrogate pair in this case */
                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
                    *w   = Py_UNICODE_LOW_SURROGATE(*four_bytes);
                }
                else
                    *w = *four_bytes;

                /* Sanity check: the write cursor must stay inside the
                   buffer sized by the first pass. */
                if (w > wchar_end) {
                    Py_UNREACHABLE();
                }
            }
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* 1-byte or 2-byte data: one wchar_t per character.  Refuse
               lengths for which the byte count of the allocation below
               would exceed PY_SSIZE_T_MAX. */
            if ((size_t)_PyUnicode_LENGTH(unicode) >
                    PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(unicode) + 1));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* The wstr length is only stored for objects that are not
               compact ASCII. */
            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_LENGTH(unicode);

            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
                /* Widen each byte to a wchar_t. */
                one_byte = PyUnicode_1BYTE_DATA(unicode);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                /* Widen each 16-bit unit to a wchar_t. */
                two_bytes = PyUnicode_2BYTE_DATA(unicode);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                /* Release the buffer allocated above before aborting. */
                PyObject_FREE(_PyUnicode_WSTR(unicode));
                _PyUnicode_WSTR(unicode) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                Py_UNREACHABLE();
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(unicode);
    return _PyUnicode_WSTR(unicode);
}
3970 
3971 Py_UNICODE *
PyUnicode_AsUnicode(PyObject * unicode)3972 PyUnicode_AsUnicode(PyObject *unicode)
3973 {
3974     return PyUnicode_AsUnicodeAndSize(unicode, NULL);
3975 }
3976 
3977 const Py_UNICODE *
_PyUnicode_AsUnicode(PyObject * unicode)3978 _PyUnicode_AsUnicode(PyObject *unicode)
3979 {
3980     Py_ssize_t size;
3981     const Py_UNICODE *wstr;
3982 
3983     wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
3984     if (wstr && wcslen(wstr) != (size_t)size) {
3985         PyErr_SetString(PyExc_ValueError, "embedded null character");
3986         return NULL;
3987     }
3988     return wstr;
3989 }
3990 
3991 
3992 Py_ssize_t
PyUnicode_GetSize(PyObject * unicode)3993 PyUnicode_GetSize(PyObject *unicode)
3994 {
3995     if (!PyUnicode_Check(unicode)) {
3996         PyErr_BadArgument();
3997         goto onError;
3998     }
3999     if (_PyUnicode_WSTR(unicode) == NULL) {
4000         if (PyUnicode_AsUnicode(unicode) == NULL)
4001             goto onError;
4002     }
4003     return PyUnicode_WSTR_LENGTH(unicode);
4004 
4005   onError:
4006     return -1;
4007 }
4008 
4009 Py_ssize_t
PyUnicode_GetLength(PyObject * unicode)4010 PyUnicode_GetLength(PyObject *unicode)
4011 {
4012     if (!PyUnicode_Check(unicode)) {
4013         PyErr_BadArgument();
4014         return -1;
4015     }
4016     if (PyUnicode_READY(unicode) == -1)
4017         return -1;
4018     return PyUnicode_GET_LENGTH(unicode);
4019 }
4020 
4021 Py_UCS4
PyUnicode_ReadChar(PyObject * unicode,Py_ssize_t index)4022 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4023 {
4024     void *data;
4025     int kind;
4026 
4027     if (!PyUnicode_Check(unicode)) {
4028         PyErr_BadArgument();
4029         return (Py_UCS4)-1;
4030     }
4031     if (PyUnicode_READY(unicode) == -1) {
4032         return (Py_UCS4)-1;
4033     }
4034     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4035         PyErr_SetString(PyExc_IndexError, "string index out of range");
4036         return (Py_UCS4)-1;
4037     }
4038     data = PyUnicode_DATA(unicode);
4039     kind = PyUnicode_KIND(unicode);
4040     return PyUnicode_READ(kind, data, index);
4041 }
4042 
4043 int
PyUnicode_WriteChar(PyObject * unicode,Py_ssize_t index,Py_UCS4 ch)4044 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4045 {
4046     if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4047         PyErr_BadArgument();
4048         return -1;
4049     }
4050     assert(PyUnicode_IS_READY(unicode));
4051     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4052         PyErr_SetString(PyExc_IndexError, "string index out of range");
4053         return -1;
4054     }
4055     if (unicode_check_modifiable(unicode))
4056         return -1;
4057     if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4058         PyErr_SetString(PyExc_ValueError, "character out of range");
4059         return -1;
4060     }
4061     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4062                     index, ch);
4063     return 0;
4064 }
4065 
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4071 
4072 /* create or adjust a UnicodeDecodeError */
4073 static void
make_decode_exception(PyObject ** exceptionObject,const char * encoding,const char * input,Py_ssize_t length,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4074 make_decode_exception(PyObject **exceptionObject,
4075                       const char *encoding,
4076                       const char *input, Py_ssize_t length,
4077                       Py_ssize_t startpos, Py_ssize_t endpos,
4078                       const char *reason)
4079 {
4080     if (*exceptionObject == NULL) {
4081         *exceptionObject = PyUnicodeDecodeError_Create(
4082             encoding, input, length, startpos, endpos, reason);
4083     }
4084     else {
4085         if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4086             goto onError;
4087         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4088             goto onError;
4089         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4090             goto onError;
4091     }
4092     return;
4093 
4094 onError:
4095     Py_CLEAR(*exceptionObject);
4096 }
4097 
#ifdef MS_WINDOWS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   Variant for decoders that write into a wchar_t-backed output string
   (*output must be of PyUnicode_WCHAR_KIND, see the assert below).

   In/out state:
     *errorHandler     looked up from `errors` on first use and kept for
                       later calls
     *exceptionObject  created or updated via make_decode_exception()
     *input, *inend    reloaded from the exception's object after the
                       callback returns
     *endinpos, *inptr set to the position returned by the callback
     *output, *outpos  resized if needed, then advanced past the replacement
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* "Un" is the PyArg_ParseTuple format; the text after ';' is the error
       message used when the tuple does not match. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[3] skips the "Un;" prefix, leaving only the message. */
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    /* repunicode points into restuple, which is held until the end. */
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position is taken relative to the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    /* Each addition is checked against PY_SSIZE_T_MAX before it is made. */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow to at least twice the current size when doubling cannot
           overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* falls through to the common cleanup below */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
4210 
4211 static int
unicode_decode_call_errorhandler_writer(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const char ** input,const char ** inend,Py_ssize_t * startinpos,Py_ssize_t * endinpos,PyObject ** exceptionObject,const char ** inptr,_PyUnicodeWriter * writer)4212 unicode_decode_call_errorhandler_writer(
4213     const char *errors, PyObject **errorHandler,
4214     const char *encoding, const char *reason,
4215     const char **input, const char **inend, Py_ssize_t *startinpos,
4216     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4217     _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4218 {
4219     static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4220 
4221     PyObject *restuple = NULL;
4222     PyObject *repunicode = NULL;
4223     Py_ssize_t insize;
4224     Py_ssize_t newpos;
4225     Py_ssize_t replen;
4226     Py_ssize_t remain;
4227     PyObject *inputobj = NULL;
4228     int need_to_grow = 0;
4229     const char *new_inptr;
4230 
4231     if (*errorHandler == NULL) {
4232         *errorHandler = PyCodec_LookupError(errors);
4233         if (*errorHandler == NULL)
4234             goto onError;
4235     }
4236 
4237     make_decode_exception(exceptionObject,
4238         encoding,
4239         *input, *inend - *input,
4240         *startinpos, *endinpos,
4241         reason);
4242     if (*exceptionObject == NULL)
4243         goto onError;
4244 
4245     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4246     if (restuple == NULL)
4247         goto onError;
4248     if (!PyTuple_Check(restuple)) {
4249         PyErr_SetString(PyExc_TypeError, &argparse[3]);
4250         goto onError;
4251     }
4252     if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4253         goto onError;
4254 
4255     /* Copy back the bytes variables, which might have been modified by the
4256        callback */
4257     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4258     if (!inputobj)
4259         goto onError;
4260     remain = *inend - *input - *endinpos;
4261     *input = PyBytes_AS_STRING(inputobj);
4262     insize = PyBytes_GET_SIZE(inputobj);
4263     *inend = *input + insize;
4264     /* we can DECREF safely, as the exception has another reference,
4265        so the object won't go away. */
4266     Py_DECREF(inputobj);
4267 
4268     if (newpos<0)
4269         newpos = insize+newpos;
4270     if (newpos<0 || newpos>insize) {
4271         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4272         goto onError;
4273     }
4274 
4275     replen = PyUnicode_GET_LENGTH(repunicode);
4276     if (replen > 1) {
4277         writer->min_length += replen - 1;
4278         need_to_grow = 1;
4279     }
4280     new_inptr = *input + newpos;
4281     if (*inend - new_inptr > remain) {
4282         /* We don't know the decoding algorithm here so we make the worst
4283            assumption that one byte decodes to one unicode character.
4284            If unfortunately one byte could decode to more unicode characters,
4285            the decoder may write out-of-bound then.  Is it possible for the
4286            algorithms using this function? */
4287         writer->min_length += *inend - new_inptr - remain;
4288         need_to_grow = 1;
4289     }
4290     if (need_to_grow) {
4291         writer->overallocate = 1;
4292         if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4293                             PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4294             goto onError;
4295     }
4296     if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4297         goto onError;
4298 
4299     *endinpos = newpos;
4300     *inptr = new_inptr;
4301 
4302     /* we made it! */
4303     Py_DECREF(restuple);
4304     return 0;
4305 
4306   onError:
4307     Py_XDECREF(restuple);
4308     return -1;
4309 }
4310 
4311 /* --- UTF-7 Codec -------------------------------------------------------- */
4312 
4313 /* See RFC2152 for details.  We encode conservatively and decode liberally. */
4314 
/* Three simple macros defining base-64.
 *
 * All of them are plain textual macros that evaluate their argument more
 * than once (TO_BASE64 excepted), so do not pass expressions with side
 * effects. */

/* Is c a base-64 character?  True for A-Z, a-z, 0-9, '+' and '/'. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
 * A-Z map to 0-25, a-z to 26-51, 0-9 to 52-61, '+' to 62.  Every other
 * input yields 63, which is only meaningful for '/': callers must check
 * IS_BASE64(c) first. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n?
 * Higher bits of n are masked off with 0x3f before indexing the alphabet. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  Values above 127 are never decoded directly. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
4345 
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code (0..127) and is a read-only lookup
 * table, hence const: it is never modified at run time.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4379 
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - character code; only values 1..127 can be direct (0 and
 *            anything >= 128 always yield false, which also keeps the
 *            utf7_category index in range)
 * directO  - non-zero to emit category 1 ("Set O") characters directly
 * directWS - non-zero to emit category 2 (whitespace) characters directly
 *
 * Category 0 ("Set D") characters are always direct.  The macro evaluates
 * c several times, so do not pass an expression with side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
4391 
4392 PyObject *
PyUnicode_DecodeUTF7(const char * s,Py_ssize_t size,const char * errors)4393 PyUnicode_DecodeUTF7(const char *s,
4394                      Py_ssize_t size,
4395                      const char *errors)
4396 {
4397     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4398 }
4399 
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size:  the UTF-7 input bytes and their count.
 * errors:   error-handler name, forwarded unchanged to
 *           unicode_decode_call_errorhandler_writer().
 * consumed: if NULL, the input is treated as complete: an inconsistent
 *           shift sequence left open at the end (pending surrogate, a
 *           partial character, or non-zero padding bits) is passed to the
 *           error handler.  If non-NULL, it receives the number of input
 *           bytes consumed; when the input ends inside a shift sequence
 *           this is the offset of the '+' that opened the sequence and
 *           the output written since that '+' is discarded.
 *
 * Returns the decoded object, or NULL on failure. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                  /* non-zero while inside a "+..." section */
    Py_ssize_t shiftOutStart;         /* writer.pos when the current shift began */
    unsigned int base64bits = 0;      /* number of not-yet-used bits in base64buffer */
    unsigned long base64buffer = 0;   /* accumulated base-64 payload bits */
    Py_UCS4 surrogate = 0;            /* pending high surrogate, or 0 */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* Re-entry point: the end-of-input check below jumps back here when
           its error-handler call leaves s before e. */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* not a pair: emit the lone high surrogate as-is,
                               then handle outCh normally below */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                /* In the two error branches below startinpos still holds the
                   offset of the '+' that opened this section, so the
                   reported range covers the whole shift sequence. */
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A pending high surrogate is written out on its own only
                   when the terminating character is directly decodable;
                   otherwise it is dropped. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* Report input range [startinpos, endinpos) with errmsg; a non-zero
           return from the handler call aborts decoding. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* the handler call left input to process: resume the loop */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* Input ended inside a shift sequence: report everything up to
               the opening '+' as consumed. */
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* Build the result from only the first shiftOutStart
                   characters instead of finishing the writer.
                   NOTE(review): presumably so that non-ASCII characters
                   from the discarded shift do not influence the result's
                   representation -- confirm. */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    /* common failure exit: release helper objects and the partial output */
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4598 
4599 
4600 PyObject *
_PyUnicode_EncodeUTF7(PyObject * str,int base64SetO,int base64WhiteSpace,const char * errors)4601 _PyUnicode_EncodeUTF7(PyObject *str,
4602                       int base64SetO,
4603                       int base64WhiteSpace,
4604                       const char *errors)
4605 {
4606     int kind;
4607     void *data;
4608     Py_ssize_t len;
4609     PyObject *v;
4610     int inShift = 0;
4611     Py_ssize_t i;
4612     unsigned int base64bits = 0;
4613     unsigned long base64buffer = 0;
4614     char * out;
4615     char * start;
4616 
4617     if (PyUnicode_READY(str) == -1)
4618         return NULL;
4619     kind = PyUnicode_KIND(str);
4620     data = PyUnicode_DATA(str);
4621     len = PyUnicode_GET_LENGTH(str);
4622 
4623     if (len == 0)
4624         return PyBytes_FromStringAndSize(NULL, 0);
4625 
4626     /* It might be possible to tighten this worst case */
4627     if (len > PY_SSIZE_T_MAX / 8)
4628         return PyErr_NoMemory();
4629     v = PyBytes_FromStringAndSize(NULL, len * 8);
4630     if (v == NULL)
4631         return NULL;
4632 
4633     start = out = PyBytes_AS_STRING(v);
4634     for (i = 0; i < len; ++i) {
4635         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4636 
4637         if (inShift) {
4638             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4639                 /* shifting out */
4640                 if (base64bits) { /* output remaining bits */
4641                     *out++ = TO_BASE64(base64buffer << (6-base64bits));
4642                     base64buffer = 0;
4643                     base64bits = 0;
4644                 }
4645                 inShift = 0;
4646                 /* Characters not in the BASE64 set implicitly unshift the sequence
4647                    so no '-' is required, except if the character is itself a '-' */
4648                 if (IS_BASE64(ch) || ch == '-') {
4649                     *out++ = '-';
4650                 }
4651                 *out++ = (char) ch;
4652             }
4653             else {
4654                 goto encode_char;
4655             }
4656         }
4657         else { /* not in a shift sequence */
4658             if (ch == '+') {
4659                 *out++ = '+';
4660                         *out++ = '-';
4661             }
4662             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4663                 *out++ = (char) ch;
4664             }
4665             else {
4666                 *out++ = '+';
4667                 inShift = 1;
4668                 goto encode_char;
4669             }
4670         }
4671         continue;
4672 encode_char:
4673         if (ch >= 0x10000) {
4674             assert(ch <= MAX_UNICODE);
4675 
4676             /* code first surrogate */
4677             base64bits += 16;
4678             base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4679             while (base64bits >= 6) {
4680                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4681                 base64bits -= 6;
4682             }
4683             /* prepare second surrogate */
4684             ch = Py_UNICODE_LOW_SURROGATE(ch);
4685         }
4686         base64bits += 16;
4687         base64buffer = (base64buffer << 16) | ch;
4688         while (base64bits >= 6) {
4689             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4690             base64bits -= 6;
4691         }
4692     }
4693     if (base64bits)
4694         *out++= TO_BASE64(base64buffer << (6-base64bits) );
4695     if (inShift)
4696         *out++ = '-';
4697     if (_PyBytes_Resize(&v, out - start) < 0)
4698         return NULL;
4699     return v;
4700 }
4701 PyObject *
PyUnicode_EncodeUTF7(const Py_UNICODE * s,Py_ssize_t size,int base64SetO,int base64WhiteSpace,const char * errors)4702 PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4703                      Py_ssize_t size,
4704                      int base64SetO,
4705                      int base64WhiteSpace,
4706                      const char *errors)
4707 {
4708     PyObject *result;
4709     PyObject *tmp = PyUnicode_FromWideChar(s, size);
4710     if (tmp == NULL)
4711         return NULL;
4712     result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4713                                    base64WhiteSpace, errors);
4714     Py_DECREF(tmp);
4715     return result;
4716 }
4717 
/* Undefine the helper macros used by the UTF-7 codec above. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
4723 
4724 /* --- UTF-8 Codec -------------------------------------------------------- */
4725 
4726 PyObject *
PyUnicode_DecodeUTF8(const char * s,Py_ssize_t size,const char * errors)4727 PyUnicode_DecodeUTF8(const char *s,
4728                      Py_ssize_t size,
4729                      const char *errors)
4730 {
4731     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4732 }
4733 
4734 #include "stringlib/asciilib.h"
4735 #include "stringlib/codecs.h"
4736 #include "stringlib/undef.h"
4737 
4738 #include "stringlib/ucs1lib.h"
4739 #include "stringlib/codecs.h"
4740 #include "stringlib/undef.h"
4741 
4742 #include "stringlib/ucs2lib.h"
4743 #include "stringlib/codecs.h"
4744 #include "stringlib/undef.h"
4745 
4746 #include "stringlib/ucs4lib.h"
4747 #include "stringlib/codecs.h"
4748 #include "stringlib/undef.h"
4749 
/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char.  Every byte of the mask is 0x80, so
   `value & ASCII_CHAR_MASK` is non-zero exactly when at least one byte of
   `value` has its high bit set.  Only 4- and 8-byte longs are supported;
   anything else is a compile-time error. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif
4759 
/* Copy the leading run of ASCII bytes (bit 0x80 clear) from [start, end)
   into dest and return the number of bytes copied.  Scanning stops at the
   first byte with bit 0x80 set, or at end; at most end - start bytes are
   written to dest. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;
    /* upper limit for the word-at-a-time loops */
    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);

    /*
     * Issue #17237: m68k is a bit different from most architectures in
     * that objects do not use "natural alignment" - for example, int and
     * long are only aligned at 2-byte boundaries.  Therefore the assert()
     * won't work; also, tests have shown that skipping the "optimised
     * version" will even speed up m68k.
     */
#if !defined(__m68k__)
#if SIZEOF_LONG <= SIZEOF_VOID_P
    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Help allocation */
        const char *_p = p;
        Py_UCS1 * q = dest;
        /* check and copy one long at a time while every byte is ASCII */
        while (_p < aligned_end) {
            unsigned long value = *(const unsigned long *) _p;
            if (value & ASCII_CHAR_MASK)
                break;
            *((unsigned long *)q) = value;
            _p += SIZEOF_LONG;
            q += SIZEOF_LONG;
        }
        p = _p;
        /* finish byte by byte: the unaligned tail, or the word that
           contained a non-ASCII byte */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
#endif
    /* General path: only scan here (word-wise whenever p is aligned,
       byte-wise otherwise); the bytes are copied in one memcpy below. */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
            /* Help allocation */
            const char *_p = p;
            while (_p < aligned_end) {
                unsigned long value = *(unsigned long *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_LONG;
            }
            p = _p;
            /* the whole input was consumed word-wise: nothing left to test */
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}
4823 
/* Decode `size` bytes of UTF-8 starting at `s`.

   errors:   error-handler name.  It is looked up lazily, on the first
             decoding error; the ignore, replace and surrogateescape
             handlers are applied inline, any other handler goes through
             unicode_decode_call_errorhandler_writer().
   consumed: if non-NULL, receives the number of input bytes decoded.  In
             that case an incomplete sequence at the end of the input is
             left undecoded instead of being reported as an error.

   Returns the decoded object, or NULL on failure. */
PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    _PyUnicodeWriter writer;
    const char *starts = s;
    const char *end = s + size;

    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *errmsg = "";
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        if (consumed)
            *consumed = 1;
        return get_latin1_char((unsigned char)s[0]);
    }

    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    /* copy the leading ASCII run straight into the output buffer */
    writer.pos = ascii_decode(s, end, writer.data);
    s += writer.pos;
    while (s < end) {
        Py_UCS4 ch;
        int kind = writer.kind;

        /* Run the decoder variant that matches the writer's current
           storage kind.  The value it returns is interpreted by the
           switch below: 0-4 are status codes, anything else is a decoded
           character that still has to be written. */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (PyUnicode_IS_ASCII(writer.buffer))
                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
            else
                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else if (kind == PyUnicode_2BYTE_KIND) {
            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else {
            assert(kind == PyUnicode_4BYTE_KIND);
            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
        }

        switch (ch) {
        case 0:
            /* Either all input was decoded, or (stateful mode) the rest is
               left for a later call; otherwise the tail is reported. */
            if (s == end || consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = s - starts;
            endinpos = end - starts;
            break;
        case 1:
            errmsg = "invalid start byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            break;
        case 2:
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
            {
                /* Truncated surrogate code in range D800-DFFF */
                goto End;
            }
            /* fall through */
        case 3:
        case 4:
            /* the error range covers ch - 1 bytes starting at s */
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
            endinpos = startinpos + ch - 1;
            break;
        default:
            /* a decoded character */
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        /* An error was found: resolve the handler name once, on first use. */
        if (error_handler == _Py_ERROR_UNKNOWN)
            error_handler = get_error_handler(errors);

        switch (error_handler) {
        case _Py_ERROR_IGNORE:
            /* skip the offending bytes */
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_REPLACE:
            /* write U+FFFD once for the whole offending range */
            if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
                goto onError;
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_SURROGATEESCAPE:
        {
            Py_ssize_t i;

            /* each offending byte b is written as the code point
               0xdc00 + b, which needs 2-byte storage */
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
                goto onError;
            for (i=startinpos; i<endinpos; i++) {
                ch = (Py_UCS4)(unsigned char)(starts[i]);
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
                                ch + 0xdc00);
                writer.pos++;
            }
            s += (endinpos - startinpos);
            break;
        }

        default:
            /* any other handler: use the generic error-handler call */
            if (unicode_decode_call_errorhandler_writer(
                    errors, &error_handler_obj,
                    "utf-8", errmsg,
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
        }
    }

End:
    if (consumed)
        *consumed = s - starts;

    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

onError:
    /* common failure exit: release helper objects and the partial output */
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4964 
4965 
4966 /* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
4967    non-zero, use strict error handler otherwise.
4968 
4969    On success, write a pointer to a newly allocated wide character string into
4970    *wstr (use PyMem_RawFree() to free the memory) and write the output length
4971    (in number of wchar_t units) into *wlen (if wlen is set).
4972 
4973    On memory allocation failure, return -1.
4974 
4975    On decoding error (if surrogateescape is zero), return -2. If wlen is
4976    non-NULL, write the start of the illegal byte sequence into *wlen. If reason
4977    is not NULL, write the decoding error message into *reason. */
4978 int
_Py_DecodeUTF8Ex(const char * s,Py_ssize_t size,wchar_t ** wstr,size_t * wlen,const char ** reason,int surrogateescape)4979 _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
4980                  const char **reason, int surrogateescape)
4981 {
4982     const char *orig_s = s;
4983     const char *e;
4984     wchar_t *unicode;
4985     Py_ssize_t outpos;
4986 
4987     /* Note: size will always be longer than the resulting Unicode
4988        character count */
4989     if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
4990         return -1;
4991     }
4992 
4993     unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
4994     if (!unicode) {
4995         return -1;
4996     }
4997 
4998     /* Unpack UTF-8 encoded data */
4999     e = s + size;
5000     outpos = 0;
5001     while (s < e) {
5002         Py_UCS4 ch;
5003 #if SIZEOF_WCHAR_T == 4
5004         ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5005 #else
5006         ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5007 #endif
5008         if (ch > 0xFF) {
5009 #if SIZEOF_WCHAR_T == 4
5010             Py_UNREACHABLE();
5011 #else
5012             assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5013             /* write a surrogate pair */
5014             unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5015             unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5016 #endif
5017         }
5018         else {
5019             if (!ch && s == e)
5020                 break;
5021             if (!surrogateescape) {
5022                 PyMem_RawFree(unicode );
5023                 if (reason != NULL) {
5024                     switch (ch) {
5025                     case 0:
5026                         *reason = "unexpected end of data";
5027                         break;
5028                     case 1:
5029                         *reason = "invalid start byte";
5030                         break;
5031                     /* 2, 3, 4 */
5032                     default:
5033                         *reason = "invalid continuation byte";
5034                         break;
5035                     }
5036                 }
5037                 if (wlen != NULL) {
5038                     *wlen = s - orig_s;
5039                 }
5040                 return -2;
5041             }
5042             /* surrogateescape */
5043             unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5044         }
5045     }
5046     unicode[outpos] = L'\0';
5047     if (wlen) {
5048         *wlen = outpos;
5049     }
5050     *wstr = unicode;
5051     return 0;
5052 }
5053 
5054 wchar_t*
_Py_DecodeUTF8_surrogateescape(const char * arg,Py_ssize_t arglen)5055 _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
5056 {
5057     wchar_t *wstr;
5058     int res = _Py_DecodeUTF8Ex(arg, arglen, &wstr, NULL, NULL, 1);
5059     if (res != 0) {
5060         return NULL;
5061     }
5062     return wstr;
5063 }
5064 
5065 
5066 /* UTF-8 encoder using the surrogateescape error handler .
5067 
5068    On success, return 0 and write the newly allocated character string (use
5069    PyMem_Free() to free the memory) into *str.
5070 
5071    On encoding failure, return -2 and write the position of the invalid
5072    surrogate character into *error_pos (if error_pos is set) and the decoding
5073    error message into *reason (if reason is set).
5074 
5075    On memory allocation failure, return -1. */
5076 int
_Py_EncodeUTF8Ex(const wchar_t * text,char ** str,size_t * error_pos,const char ** reason,int raw_malloc,int surrogateescape)5077 _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5078                  const char **reason, int raw_malloc, int surrogateescape)
5079 {
5080     const Py_ssize_t max_char_size = 4;
5081     Py_ssize_t len = wcslen(text);
5082 
5083     assert(len >= 0);
5084 
5085     if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5086         return -1;
5087     }
5088     char *bytes;
5089     if (raw_malloc) {
5090         bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5091     }
5092     else {
5093         bytes = PyMem_Malloc((len + 1) * max_char_size);
5094     }
5095     if (bytes == NULL) {
5096         return -1;
5097     }
5098 
5099     char *p = bytes;
5100     Py_ssize_t i;
5101     for (i = 0; i < len; i++) {
5102         Py_UCS4 ch = text[i];
5103 
5104         if (ch < 0x80) {
5105             /* Encode ASCII */
5106             *p++ = (char) ch;
5107 
5108         }
5109         else if (ch < 0x0800) {
5110             /* Encode Latin-1 */
5111             *p++ = (char)(0xc0 | (ch >> 6));
5112             *p++ = (char)(0x80 | (ch & 0x3f));
5113         }
5114         else if (Py_UNICODE_IS_SURROGATE(ch)) {
5115             /* surrogateescape error handler */
5116             if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5117                 if (error_pos != NULL) {
5118                     *error_pos = (size_t)i;
5119                 }
5120                 if (reason != NULL) {
5121                     *reason = "encoding error";
5122                 }
5123                 if (raw_malloc) {
5124                     PyMem_RawFree(bytes);
5125                 }
5126                 else {
5127                     PyMem_Free(bytes);
5128                 }
5129                 return -2;
5130             }
5131             *p++ = (char)(ch & 0xff);
5132         }
5133         else if (ch < 0x10000) {
5134             *p++ = (char)(0xe0 | (ch >> 12));
5135             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5136             *p++ = (char)(0x80 | (ch & 0x3f));
5137         }
5138         else {  /* ch >= 0x10000 */
5139             assert(ch <= MAX_UNICODE);
5140             /* Encode UCS4 Unicode ordinals */
5141             *p++ = (char)(0xf0 | (ch >> 18));
5142             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5143             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5144             *p++ = (char)(0x80 | (ch & 0x3f));
5145         }
5146     }
5147     *p++ = '\0';
5148 
5149     size_t final_size = (p - bytes);
5150     char *bytes2;
5151     if (raw_malloc) {
5152         bytes2 = PyMem_RawRealloc(bytes, final_size);
5153     }
5154     else {
5155         bytes2 = PyMem_Realloc(bytes, final_size);
5156     }
5157     if (bytes2 == NULL) {
5158         if (error_pos != NULL) {
5159             *error_pos = (size_t)-1;
5160         }
5161         if (raw_malloc) {
5162             PyMem_RawFree(bytes);
5163         }
5164         else {
5165             PyMem_Free(bytes);
5166         }
5167         return -1;
5168     }
5169     *str = bytes2;
5170     return 0;
5171 }
5172 
5173 
5174 /* Primary internal function which creates utf8 encoded bytes objects.
5175 
5176    Allocation strategy:  if the string is short, convert into a stack buffer
5177    and allocate exactly as much space needed at the end.  Else allocate the
5178    maximum possible needed (4 result bytes per Unicode character), and return
5179    the excess memory at the end.
5180 */
5181 PyObject *
_PyUnicode_AsUTF8String(PyObject * unicode,const char * errors)5182 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5183 {
5184     enum PyUnicode_Kind kind;
5185     void *data;
5186     Py_ssize_t size;
5187 
5188     if (!PyUnicode_Check(unicode)) {
5189         PyErr_BadArgument();
5190         return NULL;
5191     }
5192 
5193     if (PyUnicode_READY(unicode) == -1)
5194         return NULL;
5195 
5196     if (PyUnicode_UTF8(unicode))
5197         return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5198                                          PyUnicode_UTF8_LENGTH(unicode));
5199 
5200     kind = PyUnicode_KIND(unicode);
5201     data = PyUnicode_DATA(unicode);
5202     size = PyUnicode_GET_LENGTH(unicode);
5203 
5204     switch (kind) {
5205     default:
5206         Py_UNREACHABLE();
5207     case PyUnicode_1BYTE_KIND:
5208         /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5209         assert(!PyUnicode_IS_ASCII(unicode));
5210         return ucs1lib_utf8_encoder(unicode, data, size, errors);
5211     case PyUnicode_2BYTE_KIND:
5212         return ucs2lib_utf8_encoder(unicode, data, size, errors);
5213     case PyUnicode_4BYTE_KIND:
5214         return ucs4lib_utf8_encoder(unicode, data, size, errors);
5215     }
5216 }
5217 
5218 PyObject *
PyUnicode_EncodeUTF8(const Py_UNICODE * s,Py_ssize_t size,const char * errors)5219 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5220                      Py_ssize_t size,
5221                      const char *errors)
5222 {
5223     PyObject *v, *unicode;
5224 
5225     unicode = PyUnicode_FromWideChar(s, size);
5226     if (unicode == NULL)
5227         return NULL;
5228     v = _PyUnicode_AsUTF8String(unicode, errors);
5229     Py_DECREF(unicode);
5230     return v;
5231 }
5232 
5233 PyObject *
PyUnicode_AsUTF8String(PyObject * unicode)5234 PyUnicode_AsUTF8String(PyObject *unicode)
5235 {
5236     return _PyUnicode_AsUTF8String(unicode, NULL);
5237 }
5238 
5239 /* --- UTF-32 Codec ------------------------------------------------------- */
5240 
5241 PyObject *
PyUnicode_DecodeUTF32(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5242 PyUnicode_DecodeUTF32(const char *s,
5243                       Py_ssize_t size,
5244                       const char *errors,
5245                       int *byteorder)
5246 {
5247     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5248 }
5249 
/* Decode `size` bytes at `s` as UTF-32 into a new unicode object.

   byteorder: optional in/out byte-order flag.  When it is NULL or points
       at 0, the first four bytes are checked for a BOM; a BOM is skipped
       and selects little-endian (-1) or big-endian (1).  The resulting
       value is stored back through `byteorder` when that check ran.
       Without a BOM, 0 means the platform's native order.
   errors: error-handler name, forwarded to
       unicode_decode_call_errorhandler_writer() for surrogate code
       points, code points >= 0x110000 and truncated trailing data.
   consumed: when non-NULL, trailing bytes that do not form a full 4-byte
       unit are not an error; decoding stops there and the number of
       bytes processed is stored in *consumed.

   Returns the decoded string, or NULL after a failed writer operation or
   a failing error handler. */
PyObject *
PyUnicode_DecodeUTF32Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;
    int le, bo = 0;       /* assume native ordering by default */
    const char *encoding;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* q is the read cursor, e is one past the last input byte. */
    q = (unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 4) {
        /* The first unit is read as little-endian: 0x0000FEFF is then a
           little-endian BOM and 0xFFFE0000 a byte-swapped (big-endian) one. */
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
        if (bom == 0x0000FEFF) {
            bo = -1;
            q += 4;
        }
        else if (bom == 0xFFFE0000) {
            bo = 1;
            q += 4;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* Nothing left after the optional BOM: the result is the empty string. */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* bo == 0 (no explicit order, no BOM) resolves to the native order. */
#ifdef WORDS_BIGENDIAN
    le = bo < 0;
#else
    le = bo <= 0;
#endif
    encoding = le ? "utf-32-le" : "utf-32-be";

    /* One output character per 4 input bytes, rounded up. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 3) / 4;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        /* ch stays 0 when fewer than 4 bytes remain, which routes control
           to the "ch <= maxch" branch below (end of input or truncation). */
        Py_UCS4 ch = 0;
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);

        if (e - q >= 4) {
            /* Fast path: store code points straight into the writer's
               buffer for as long as they fit its current maxch and are
               not surrogates.  When the loop breaks, ch holds the
               offending code point and q still points at it. */
            enum PyUnicode_Kind kind = writer.kind;
            void *data = writer.data;
            const unsigned char *last = e - 4;
            Py_ssize_t pos = writer.pos;
            if (le) {
                do {
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
                    if (ch > maxch)
                        break;
                    /* NOTE(review): for 1-byte buffers a surrogate
                       presumably already fails the ch > maxch test above. */
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            else {
                do {
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            writer.pos = pos;
        }

        /* Classify why the fast path stopped (or never ran). */
        if (Py_UNICODE_IS_SURROGATE(ch)) {
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }
        else if (ch <= maxch) {
            /* The last unit read (if any) was written successfully, so the
               loop ended because fewer than 4 bytes remain.  That is normal
               termination when the input is exhausted or when the caller
               asked for the consumed count. */
            if (q == e || consumed)
                break;
            /* remaining bytes at the end? (size should be divisible by 4) */
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
        }
        else {
            /* ch does not fit the buffer's current maxch.  If it is below
               0x110000, write it through the writer API instead of the
               raw buffer and resume the fast path. */
            if (ch < 0x110000) {
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                    goto onError;
                q += 4;
                continue;
            }
            errmsg = "code point not in range(0x110000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }

        /* The remaining input chars are ignored if the callback
           chooses to skip the input */
        /* starts, e and q are passed by address, so the call may change
           them; a non-zero return means failure. */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                encoding, errmsg,
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
                &writer))
            goto onError;
    }

    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    /* Failure path: discard the partial result and drop handler state. */
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
5394 
5395 PyObject *
_PyUnicode_EncodeUTF32(PyObject * str,const char * errors,int byteorder)5396 _PyUnicode_EncodeUTF32(PyObject *str,
5397                        const char *errors,
5398                        int byteorder)
5399 {
5400     enum PyUnicode_Kind kind;
5401     const void *data;
5402     Py_ssize_t len;
5403     PyObject *v;
5404     uint32_t *out;
5405 #if PY_LITTLE_ENDIAN
5406     int native_ordering = byteorder <= 0;
5407 #else
5408     int native_ordering = byteorder >= 0;
5409 #endif
5410     const char *encoding;
5411     Py_ssize_t nsize, pos;
5412     PyObject *errorHandler = NULL;
5413     PyObject *exc = NULL;
5414     PyObject *rep = NULL;
5415 
5416     if (!PyUnicode_Check(str)) {
5417         PyErr_BadArgument();
5418         return NULL;
5419     }
5420     if (PyUnicode_READY(str) == -1)
5421         return NULL;
5422     kind = PyUnicode_KIND(str);
5423     data = PyUnicode_DATA(str);
5424     len = PyUnicode_GET_LENGTH(str);
5425 
5426     if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5427         return PyErr_NoMemory();
5428     nsize = len + (byteorder == 0);
5429     v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5430     if (v == NULL)
5431         return NULL;
5432 
5433     /* output buffer is 4-bytes aligned */
5434     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5435     out = (uint32_t *)PyBytes_AS_STRING(v);
5436     if (byteorder == 0)
5437         *out++ = 0xFEFF;
5438     if (len == 0)
5439         goto done;
5440 
5441     if (byteorder == -1)
5442         encoding = "utf-32-le";
5443     else if (byteorder == 1)
5444         encoding = "utf-32-be";
5445     else
5446         encoding = "utf-32";
5447 
5448     if (kind == PyUnicode_1BYTE_KIND) {
5449         ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5450         goto done;
5451     }
5452 
5453     pos = 0;
5454     while (pos < len) {
5455         Py_ssize_t repsize, moreunits;
5456 
5457         if (kind == PyUnicode_2BYTE_KIND) {
5458             pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5459                                         &out, native_ordering);
5460         }
5461         else {
5462             assert(kind == PyUnicode_4BYTE_KIND);
5463             pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5464                                         &out, native_ordering);
5465         }
5466         if (pos == len)
5467             break;
5468 
5469         rep = unicode_encode_call_errorhandler(
5470                 errors, &errorHandler,
5471                 encoding, "surrogates not allowed",
5472                 str, &exc, pos, pos + 1, &pos);
5473         if (!rep)
5474             goto error;
5475 
5476         if (PyBytes_Check(rep)) {
5477             repsize = PyBytes_GET_SIZE(rep);
5478             if (repsize & 3) {
5479                 raise_encode_exception(&exc, encoding,
5480                                        str, pos - 1, pos,
5481                                        "surrogates not allowed");
5482                 goto error;
5483             }
5484             moreunits = repsize / 4;
5485         }
5486         else {
5487             assert(PyUnicode_Check(rep));
5488             if (PyUnicode_READY(rep) < 0)
5489                 goto error;
5490             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5491             if (!PyUnicode_IS_ASCII(rep)) {
5492                 raise_encode_exception(&exc, encoding,
5493                                        str, pos - 1, pos,
5494                                        "surrogates not allowed");
5495                 goto error;
5496             }
5497         }
5498 
5499         /* four bytes are reserved for each surrogate */
5500         if (moreunits > 1) {
5501             Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
5502             if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
5503                 /* integer overflow */
5504                 PyErr_NoMemory();
5505                 goto error;
5506             }
5507             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
5508                 goto error;
5509             out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
5510         }
5511 
5512         if (PyBytes_Check(rep)) {
5513             memcpy(out, PyBytes_AS_STRING(rep), repsize);
5514             out += moreunits;
5515         } else /* rep is unicode */ {
5516             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5517             ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5518                                  &out, native_ordering);
5519         }
5520 
5521         Py_CLEAR(rep);
5522     }
5523 
5524     /* Cut back to size actually needed. This is necessary for, for example,
5525        encoding of a string containing isolated surrogates and the 'ignore'
5526        handler is used. */
5527     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5528     if (nsize != PyBytes_GET_SIZE(v))
5529       _PyBytes_Resize(&v, nsize);
5530     Py_XDECREF(errorHandler);
5531     Py_XDECREF(exc);
5532   done:
5533     return v;
5534   error:
5535     Py_XDECREF(rep);
5536     Py_XDECREF(errorHandler);
5537     Py_XDECREF(exc);
5538     Py_XDECREF(v);
5539     return NULL;
5540 }
5541 
5542 PyObject *
PyUnicode_EncodeUTF32(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)5543 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5544                       Py_ssize_t size,
5545                       const char *errors,
5546                       int byteorder)
5547 {
5548     PyObject *result;
5549     PyObject *tmp = PyUnicode_FromWideChar(s, size);
5550     if (tmp == NULL)
5551         return NULL;
5552     result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5553     Py_DECREF(tmp);
5554     return result;
5555 }
5556 
5557 PyObject *
PyUnicode_AsUTF32String(PyObject * unicode)5558 PyUnicode_AsUTF32String(PyObject *unicode)
5559 {
5560     return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5561 }
5562 
5563 /* --- UTF-16 Codec ------------------------------------------------------- */
5564 
5565 PyObject *
PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5566 PyUnicode_DecodeUTF16(const char *s,
5567                       Py_ssize_t size,
5568                       const char *errors,
5569                       int *byteorder)
5570 {
5571     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5572 }
5573 
/* Decode `size` bytes at `s` as UTF-16 into a new unicode object.

   byteorder: optional in/out byte-order flag.  When it is NULL or points
       at 0, the first two bytes are checked for a BOM; a BOM is skipped
       and selects little-endian (-1) or big-endian (1).  The resulting
       value is stored back through `byteorder` when that check ran.
       Without a BOM, 0 means the platform's native order.
   errors: error-handler name, forwarded to
       unicode_decode_call_errorhandler_writer() for the error conditions
       distinguished in the switch below.
   consumed: when non-NULL, incomplete data at the end of the input is not
       an error; decoding stops there and the number of bytes processed is
       stored in *consumed.

   Returns the decoded string, or NULL after a failed writer operation or
   a failing error handler. */
PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;
    int bo = 0;       /* assume native ordering by default */
    int native_ordering;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    const char *encoding;

    /* q is the read cursor, e is one past the last input byte. */
    q = (unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 2) {
        /* The first unit is read as little-endian: 0xFEFF is then a
           little-endian BOM and 0xFFFE a byte-swapped (big-endian) one. */
        const Py_UCS4 bom = (q[1] << 8) | q[0];
        if (bom == 0xFEFF) {
            q += 2;
            bo = -1;
        }
        else if (bom == 0xFFFE) {
            q += 2;
            bo = 1;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* Nothing left after the optional BOM: the result is the empty string. */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* bo == 0 (no explicit order, no BOM) resolves to the native order. */
#if PY_LITTLE_ENDIAN
    native_ordering = bo <= 0;
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
#else
    native_ordering = bo >= 0;
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
#endif

    /* Note: size will always be longer than the resulting Unicode
       character count normally.  Error handler will take care of
       resizing when needed. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 1) / 2;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        /* ch stays 0 when fewer than 2 bytes remain, which selects the
           "case 0" handling below. */
        Py_UCS4 ch = 0;
        if (e - q >= 2) {
            /* Bulk-decode with the stringlib helper matching the writer's
               current buffer kind.  The helper advances q and writer.pos;
               its return value is interpreted by the switch below. */
            int kind = writer.kind;
            if (kind == PyUnicode_1BYTE_KIND) {
                if (PyUnicode_IS_ASCII(writer.buffer))
                    ch = asciilib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
                else
                    ch = ucs1lib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
            } else if (kind == PyUnicode_2BYTE_KIND) {
                ch = ucs2lib_utf16_decode(&q, e,
                        (Py_UCS2*)writer.data, &writer.pos,
                        native_ordering);
            } else {
                assert(kind == PyUnicode_4BYTE_KIND);
                ch = ucs4lib_utf16_decode(&q, e,
                        (Py_UCS4*)writer.data, &writer.pos,
                        native_ordering);
            }
        }

        /* Values 0-3 are status codes; any other value is a decoded
           character that still has to be written (see default).
           NOTE(review): presumably the helper returns such a character
           when it does not fit the buffer kind it was called for --
           confirm against the stringlib decoder. */
        switch (ch)
        {
        case 0:
            /* remaining byte at the end? (size should be even) */
            if (q == e || consumed)
                goto End;
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
            /* The remaining input chars are ignored if the callback
               chooses to skip the input */
        case 1:
            /* Step q back over the unit that started the incomplete
               sequence, so that neither *consumed nor the reported error
               range skips it. */
            q -= 2;
            if (consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
        case 2:
            /* The error range is the single unit just before q. */
            errmsg = "illegal encoding";
            startinpos = ((const char *)q) - 2 - starts;
            endinpos = startinpos + 2;
            break;
        case 3:
            /* The error range is the first of the two units before q. */
            errmsg = "illegal UTF-16 surrogate";
            startinpos = ((const char *)q) - 4 - starts;
            endinpos = startinpos + 2;
            break;
        default:
            /* Write the character through the writer API and resume bulk
               decoding. */
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        /* starts, e and q are passed by address, so the call may change
           them; a non-zero return means failure. */
        if (unicode_decode_call_errorhandler_writer(
                errors,
                &errorHandler,
                encoding, errmsg,
                &starts,
                (const char **)&e,
                &startinpos,
                &endinpos,
                &exc,
                (const char **)&q,
                &writer))
            goto onError;
    }

End:
    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    /* Failure path: discard the partial result and drop handler state. */
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
5728 
5729 PyObject *
_PyUnicode_EncodeUTF16(PyObject * str,const char * errors,int byteorder)5730 _PyUnicode_EncodeUTF16(PyObject *str,
5731                        const char *errors,
5732                        int byteorder)
5733 {
5734     enum PyUnicode_Kind kind;
5735     const void *data;
5736     Py_ssize_t len;
5737     PyObject *v;
5738     unsigned short *out;
5739     Py_ssize_t pairs;
5740 #if PY_BIG_ENDIAN
5741     int native_ordering = byteorder >= 0;
5742 #else
5743     int native_ordering = byteorder <= 0;
5744 #endif
5745     const char *encoding;
5746     Py_ssize_t nsize, pos;
5747     PyObject *errorHandler = NULL;
5748     PyObject *exc = NULL;
5749     PyObject *rep = NULL;
5750 
5751     if (!PyUnicode_Check(str)) {
5752         PyErr_BadArgument();
5753         return NULL;
5754     }
5755     if (PyUnicode_READY(str) == -1)
5756         return NULL;
5757     kind = PyUnicode_KIND(str);
5758     data = PyUnicode_DATA(str);
5759     len = PyUnicode_GET_LENGTH(str);
5760 
5761     pairs = 0;
5762     if (kind == PyUnicode_4BYTE_KIND) {
5763         const Py_UCS4 *in = (const Py_UCS4 *)data;
5764         const Py_UCS4 *end = in + len;
5765         while (in < end) {
5766             if (*in++ >= 0x10000) {
5767                 pairs++;
5768             }
5769         }
5770     }
5771     if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
5772         return PyErr_NoMemory();
5773     }
5774     nsize = len + pairs + (byteorder == 0);
5775     v = PyBytes_FromStringAndSize(NULL, nsize * 2);
5776     if (v == NULL) {
5777         return NULL;
5778     }
5779 
5780     /* output buffer is 2-bytes aligned */
5781     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5782     out = (unsigned short *)PyBytes_AS_STRING(v);
5783     if (byteorder == 0) {
5784         *out++ = 0xFEFF;
5785     }
5786     if (len == 0) {
5787         goto done;
5788     }
5789 
5790     if (kind == PyUnicode_1BYTE_KIND) {
5791         ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5792         goto done;
5793     }
5794 
5795     if (byteorder < 0) {
5796         encoding = "utf-16-le";
5797     }
5798     else if (byteorder > 0) {
5799         encoding = "utf-16-be";
5800     }
5801     else {
5802         encoding = "utf-16";
5803     }
5804 
5805     pos = 0;
5806     while (pos < len) {
5807         Py_ssize_t repsize, moreunits;
5808 
5809         if (kind == PyUnicode_2BYTE_KIND) {
5810             pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5811                                         &out, native_ordering);
5812         }
5813         else {
5814             assert(kind == PyUnicode_4BYTE_KIND);
5815             pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5816                                         &out, native_ordering);
5817         }
5818         if (pos == len)
5819             break;
5820 
5821         rep = unicode_encode_call_errorhandler(
5822                 errors, &errorHandler,
5823                 encoding, "surrogates not allowed",
5824                 str, &exc, pos, pos + 1, &pos);
5825         if (!rep)
5826             goto error;
5827 
5828         if (PyBytes_Check(rep)) {
5829             repsize = PyBytes_GET_SIZE(rep);
5830             if (repsize & 1) {
5831                 raise_encode_exception(&exc, encoding,
5832                                        str, pos - 1, pos,
5833                                        "surrogates not allowed");
5834                 goto error;
5835             }
5836             moreunits = repsize / 2;
5837         }
5838         else {
5839             assert(PyUnicode_Check(rep));
5840             if (PyUnicode_READY(rep) < 0)
5841                 goto error;
5842             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5843             if (!PyUnicode_IS_ASCII(rep)) {
5844                 raise_encode_exception(&exc, encoding,
5845                                        str, pos - 1, pos,
5846                                        "surrogates not allowed");
5847                 goto error;
5848             }
5849         }
5850 
5851         /* two bytes are reserved for each surrogate */
5852         if (moreunits > 1) {
5853             Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5854             if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
5855                 /* integer overflow */
5856                 PyErr_NoMemory();
5857                 goto error;
5858             }
5859             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
5860                 goto error;
5861             out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5862         }
5863 
5864         if (PyBytes_Check(rep)) {
5865             memcpy(out, PyBytes_AS_STRING(rep), repsize);
5866             out += moreunits;
5867         } else /* rep is unicode */ {
5868             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5869             ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5870                                  &out, native_ordering);
5871         }
5872 
5873         Py_CLEAR(rep);
5874     }
5875 
5876     /* Cut back to size actually needed. This is necessary for, for example,
5877     encoding of a string containing isolated surrogates and the 'ignore' handler
5878     is used. */
5879     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5880     if (nsize != PyBytes_GET_SIZE(v))
5881       _PyBytes_Resize(&v, nsize);
5882     Py_XDECREF(errorHandler);
5883     Py_XDECREF(exc);
5884   done:
5885     return v;
5886   error:
5887     Py_XDECREF(rep);
5888     Py_XDECREF(errorHandler);
5889     Py_XDECREF(exc);
5890     Py_XDECREF(v);
5891     return NULL;
5892 #undef STORECHAR
5893 }
5894 
5895 PyObject *
PyUnicode_EncodeUTF16(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)5896 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5897                       Py_ssize_t size,
5898                       const char *errors,
5899                       int byteorder)
5900 {
5901     PyObject *result;
5902     PyObject *tmp = PyUnicode_FromWideChar(s, size);
5903     if (tmp == NULL)
5904         return NULL;
5905     result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5906     Py_DECREF(tmp);
5907     return result;
5908 }
5909 
5910 PyObject *
PyUnicode_AsUTF16String(PyObject * unicode)5911 PyUnicode_AsUTF16String(PyObject *unicode)
5912 {
5913     return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
5914 }
5915 
5916 /* --- Unicode Escape Codec ----------------------------------------------- */
5917 
/* Cached pointer to the character-name lookup API used for \N{name}
   escapes.  It starts out NULL and is filled in by
   _PyUnicode_DecodeUnicodeEscape() via PyCapsule_Import() the first time
   such an escape is decoded. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5919 
/* Decode `size` bytes at `s` containing backslash escapes into a new
   unicode object.

   Bytes other than a backslash are written as the code point with the same
   value.  Recognised escapes are: backslash-newline (dropped), \\ \' \"
   \b \f \t \n \r \v \a, one to three octal digits, \xXX, \uXXXX,
   \UXXXXXXXX and \N{name}.

   errors: error-handler name, forwarded to
       unicode_decode_call_errorhandler_writer() for a trailing backslash,
       truncated hex escapes, code points above MAX_UNICODE and malformed
       or unknown \N{...} names.
   first_invalid_escape: required out-parameter.  Set to NULL on entry; if
       an unrecognised escape is met, it is written to the output verbatim
       (backslash plus character) and the first such occurrence is
       recorded here as a pointer to the character after the backslash.

   Returns the decoded string, or NULL with an exception set. */
PyObject *
_PyUnicode_DecodeUnicodeEscape(const char *s,
                               Py_ssize_t size,
                               const char *errors,
                               const char **first_invalid_escape)
{
    const char *starts = s;
    _PyUnicodeWriter writer;
    const char *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    // so we can remember if we've seen an invalid escape char or not
    *first_invalid_escape = NULL;

    if (size == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value.
       (but if the error callback returns a long replacement string
       we'll have to allocate more space) */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
        goto onError;
    }

    end = s + size;
    while (s < end) {
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
        Py_ssize_t startinpos;
        Py_ssize_t endinpos;
        const char *message;

/* Store an ASCII character directly in the writer's buffer; relies on the
   space reserved up front (checked by the asserts only). */
#define WRITE_ASCII_CHAR(ch)                                                  \
            do {                                                              \
                assert(ch <= 127);                                            \
                assert(writer.pos < writer.size);                             \
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
            } while(0)

/* Store any character: directly when it fits the writer's current maxchar,
   otherwise through _PyUnicodeWriter_WriteCharInline(), jumping to onError
   if that fails. */
#define WRITE_CHAR(ch)                                                        \
            do {                                                              \
                if (ch <= writer.maxchar) {                                   \
                    assert(writer.pos < writer.size);                         \
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
                }                                                             \
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
                    goto onError;                                             \
                }                                                             \
            } while(0)

        /* Non-escape characters are interpreted as Unicode ordinals */
        if (c != '\\') {
            WRITE_CHAR(c);
            continue;
        }

        /* Offset of the backslash itself; start of any error range. */
        startinpos = s - starts - 1;
        /* \ - Escapes */
        if (s >= end) {
            message = "\\ at end of string";
            goto error;
        }
        c = (unsigned char) *s++;

        assert(writer.pos < writer.size);
        switch (c) {

            /* \x escapes */
        case '\n': continue;
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
        case '\'': WRITE_ASCII_CHAR('\''); continue;
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
        /* FF */
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
        case 't': WRITE_ASCII_CHAR('\t'); continue;
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
        /* VT */
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
        /* BEL, not classic C */
        case 'a': WRITE_ASCII_CHAR('\007'); continue;

            /* \OOO (octal) escapes */
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
            /* One digit is already in c; take up to two more. */
            ch = c - '0';
            if (s < end && '0' <= *s && *s <= '7') {
                ch = (ch<<3) + *s++ - '0';
                if (s < end && '0' <= *s && *s <= '7') {
                    ch = (ch<<3) + *s++ - '0';
                }
            }
            WRITE_CHAR(ch);
            continue;

            /* hex escapes */
            /* \xXX */
        case 'x':
            count = 2;
            message = "truncated \\xXX escape";
            goto hexescape;

            /* \uXXXX */
        case 'u':
            count = 4;
            message = "truncated \\uXXXX escape";
            goto hexescape;

            /* \UXXXXXXXX */
        case 'U':
            count = 8;
            message = "truncated \\UXXXXXXXX escape";
        hexescape:
            /* Accumulate exactly `count` hex digits into ch; stop early at
               the end of input or at the first non-hex character. */
            for (ch = 0; count && s < end; ++s, --count) {
                c = (unsigned char)*s;
                ch <<= 4;
                if (c >= '0' && c <= '9') {
                    ch += c - '0';
                }
                else if (c >= 'a' && c <= 'f') {
                    ch += c - ('a' - 10);
                }
                else if (c >= 'A' && c <= 'F') {
                    ch += c - ('A' - 10);
                }
                else {
                    break;
                }
            }
            /* A non-zero count means fewer digits were found than required;
               `message` still holds the matching "truncated" text. */
            if (count) {
                goto error;
            }

            /* when we get here, ch is a 32-bit unicode character */
            if (ch > MAX_UNICODE) {
                message = "illegal Unicode character";
                goto error;
            }

            WRITE_CHAR(ch);
            continue;

            /* \N{name} */
        case 'N':
            if (ucnhash_CAPI == NULL) {
                /* load the unicode data module */
                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
                                                PyUnicodeData_CAPSULE_NAME, 1);
                if (ucnhash_CAPI == NULL) {
                    /* The import failure is reported as a UnicodeError and
                       aborts decoding without consulting the error
                       handler. */
                    PyErr_SetString(
                        PyExc_UnicodeError,
                        "\\N escapes not supported (can't load unicodedata module)"
                        );
                    goto onError;
                }
            }

            message = "malformed \\N character escape";
            if (s < end && *s == '{') {
                const char *start = ++s;
                size_t namelen;
                /* look for the closing brace */
                while (s < end && *s != '}')
                    s++;
                namelen = s - start;
                /* Require a non-empty name and a closing brace; otherwise
                   fall through to the "malformed" error. */
                if (namelen && s < end) {
                    /* found a name.  look it up in the unicode database */
                    s++;
                    ch = 0xffffffff; /* in case 'getcode' messes up */
                    if (namelen <= INT_MAX &&
                        ucnhash_CAPI->getcode(NULL, start, (int)namelen,
                                              &ch, 0)) {
                        assert(ch <= MAX_UNICODE);
                        WRITE_CHAR(ch);
                        continue;
                    }
                    message = "unknown Unicode character name";
                }
            }
            goto error;

        default:
            /* Unrecognised escape: remember the first one for the caller
               and emit it unchanged. */
            if (*first_invalid_escape == NULL) {
                *first_invalid_escape = s-1; /* Back up one char, since we've
                                                already incremented s. */
            }
            WRITE_ASCII_CHAR('\\');
            WRITE_CHAR(c);
            continue;
        }

      error:
        /* Shared error path: the error range runs from the backslash up to
           the current read position. */
        endinpos = s-starts;
        /* Remaining input plus what has been written so far. */
        writer.min_length = end - s + writer.pos;
        /* starts, end and s are passed by address, so the call may change
           them; a non-zero return means failure. */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "unicodeescape", message,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer)) {
            goto onError;
        }
        /* The direct buffer writes above rely on this much free space. */
        assert(end - s <= writer.size - writer.pos);

#undef WRITE_ASCII_CHAR
#undef WRITE_CHAR
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    /* Failure path: discard the partial result and drop handler state. */
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6144 
6145 PyObject *
PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6146 PyUnicode_DecodeUnicodeEscape(const char *s,
6147                               Py_ssize_t size,
6148                               const char *errors)
6149 {
6150     const char *first_invalid_escape;
6151     PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6152                                                       &first_invalid_escape);
6153     if (result == NULL)
6154         return NULL;
6155     if (first_invalid_escape != NULL) {
6156         if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6157                              "invalid escape sequence '\\%c'",
6158                              (unsigned char)*first_invalid_escape) < 0) {
6159             Py_DECREF(result);
6160             return NULL;
6161         }
6162     }
6163     return result;
6164 }
6165 
6166 /* Return a Unicode-Escape string version of the Unicode object. */
6167 
6168 PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject * unicode)6169 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6170 {
6171     Py_ssize_t i, len;
6172     PyObject *repr;
6173     char *p;
6174     enum PyUnicode_Kind kind;
6175     void *data;
6176     Py_ssize_t expandsize;
6177 
6178     /* Initial allocation is based on the longest-possible character
6179        escape.
6180 
6181        For UCS1 strings it's '\xxx', 4 bytes per source character.
6182        For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6183        For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6184     */
6185 
6186     if (!PyUnicode_Check(unicode)) {
6187         PyErr_BadArgument();
6188         return NULL;
6189     }
6190     if (PyUnicode_READY(unicode) == -1) {
6191         return NULL;
6192     }
6193 
6194     len = PyUnicode_GET_LENGTH(unicode);
6195     if (len == 0) {
6196         return PyBytes_FromStringAndSize(NULL, 0);
6197     }
6198 
6199     kind = PyUnicode_KIND(unicode);
6200     data = PyUnicode_DATA(unicode);
6201     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6202        bytes, and 1 byte characters 4. */
6203     expandsize = kind * 2 + 2;
6204     if (len > PY_SSIZE_T_MAX / expandsize) {
6205         return PyErr_NoMemory();
6206     }
6207     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6208     if (repr == NULL) {
6209         return NULL;
6210     }
6211 
6212     p = PyBytes_AS_STRING(repr);
6213     for (i = 0; i < len; i++) {
6214         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6215 
6216         /* U+0000-U+00ff range */
6217         if (ch < 0x100) {
6218             if (ch >= ' ' && ch < 127) {
6219                 if (ch != '\\') {
6220                     /* Copy printable US ASCII as-is */
6221                     *p++ = (char) ch;
6222                 }
6223                 /* Escape backslashes */
6224                 else {
6225                     *p++ = '\\';
6226                     *p++ = '\\';
6227                 }
6228             }
6229 
6230             /* Map special whitespace to '\t', \n', '\r' */
6231             else if (ch == '\t') {
6232                 *p++ = '\\';
6233                 *p++ = 't';
6234             }
6235             else if (ch == '\n') {
6236                 *p++ = '\\';
6237                 *p++ = 'n';
6238             }
6239             else if (ch == '\r') {
6240                 *p++ = '\\';
6241                 *p++ = 'r';
6242             }
6243 
6244             /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6245             else {
6246                 *p++ = '\\';
6247                 *p++ = 'x';
6248                 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6249                 *p++ = Py_hexdigits[ch & 0x000F];
6250             }
6251         }
6252         /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6253         else if (ch < 0x10000) {
6254             *p++ = '\\';
6255             *p++ = 'u';
6256             *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6257             *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6258             *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6259             *p++ = Py_hexdigits[ch & 0x000F];
6260         }
6261         /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6262         else {
6263 
6264             /* Make sure that the first two digits are zero */
6265             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6266             *p++ = '\\';
6267             *p++ = 'U';
6268             *p++ = '0';
6269             *p++ = '0';
6270             *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6271             *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6272             *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6273             *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6274             *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6275             *p++ = Py_hexdigits[ch & 0x0000000F];
6276         }
6277     }
6278 
6279     assert(p - PyBytes_AS_STRING(repr) > 0);
6280     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6281         return NULL;
6282     }
6283     return repr;
6284 }
6285 
6286 PyObject *
PyUnicode_EncodeUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)6287 PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6288                               Py_ssize_t size)
6289 {
6290     PyObject *result;
6291     PyObject *tmp = PyUnicode_FromWideChar(s, size);
6292     if (tmp == NULL) {
6293         return NULL;
6294     }
6295 
6296     result = PyUnicode_AsUnicodeEscapeString(tmp);
6297     Py_DECREF(tmp);
6298     return result;
6299 }
6300 
6301 /* --- Raw Unicode Escape Codec ------------------------------------------- */
6302 
6303 PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6304 PyUnicode_DecodeRawUnicodeEscape(const char *s,
6305                                  Py_ssize_t size,
6306                                  const char *errors)
6307 {
6308     const char *starts = s;
6309     _PyUnicodeWriter writer;
6310     const char *end;
6311     PyObject *errorHandler = NULL;
6312     PyObject *exc = NULL;
6313 
6314     if (size == 0) {
6315         _Py_RETURN_UNICODE_EMPTY();
6316     }
6317 
6318     /* Escaped strings will always be longer than the resulting
6319        Unicode string, so we start with size here and then reduce the
6320        length after conversion to the true value. (But decoding error
6321        handler might have to resize the string) */
6322     _PyUnicodeWriter_Init(&writer);
6323      writer.min_length = size;
6324     if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6325         goto onError;
6326     }
6327 
6328     end = s + size;
6329     while (s < end) {
6330         unsigned char c = (unsigned char) *s++;
6331         Py_UCS4 ch;
6332         int count;
6333         Py_ssize_t startinpos;
6334         Py_ssize_t endinpos;
6335         const char *message;
6336 
6337 #define WRITE_CHAR(ch)                                                        \
6338             do {                                                              \
6339                 if (ch <= writer.maxchar) {                                   \
6340                     assert(writer.pos < writer.size);                         \
6341                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6342                 }                                                             \
6343                 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6344                     goto onError;                                             \
6345                 }                                                             \
6346             } while(0)
6347 
6348         /* Non-escape characters are interpreted as Unicode ordinals */
6349         if (c != '\\' || s >= end) {
6350             WRITE_CHAR(c);
6351             continue;
6352         }
6353 
6354         c = (unsigned char) *s++;
6355         if (c == 'u') {
6356             count = 4;
6357             message = "truncated \\uXXXX escape";
6358         }
6359         else if (c == 'U') {
6360             count = 8;
6361             message = "truncated \\UXXXXXXXX escape";
6362         }
6363         else {
6364             assert(writer.pos < writer.size);
6365             PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6366             WRITE_CHAR(c);
6367             continue;
6368         }
6369         startinpos = s - starts - 2;
6370 
6371         /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6372         for (ch = 0; count && s < end; ++s, --count) {
6373             c = (unsigned char)*s;
6374             ch <<= 4;
6375             if (c >= '0' && c <= '9') {
6376                 ch += c - '0';
6377             }
6378             else if (c >= 'a' && c <= 'f') {
6379                 ch += c - ('a' - 10);
6380             }
6381             else if (c >= 'A' && c <= 'F') {
6382                 ch += c - ('A' - 10);
6383             }
6384             else {
6385                 break;
6386             }
6387         }
6388         if (!count) {
6389             if (ch <= MAX_UNICODE) {
6390                 WRITE_CHAR(ch);
6391                 continue;
6392             }
6393             message = "\\Uxxxxxxxx out of range";
6394         }
6395 
6396         endinpos = s-starts;
6397         writer.min_length = end - s + writer.pos;
6398         if (unicode_decode_call_errorhandler_writer(
6399                 errors, &errorHandler,
6400                 "rawunicodeescape", message,
6401                 &starts, &end, &startinpos, &endinpos, &exc, &s,
6402                 &writer)) {
6403             goto onError;
6404         }
6405         assert(end - s <= writer.size - writer.pos);
6406 
6407 #undef WRITE_CHAR
6408     }
6409     Py_XDECREF(errorHandler);
6410     Py_XDECREF(exc);
6411     return _PyUnicodeWriter_Finish(&writer);
6412 
6413   onError:
6414     _PyUnicodeWriter_Dealloc(&writer);
6415     Py_XDECREF(errorHandler);
6416     Py_XDECREF(exc);
6417     return NULL;
6418 
6419 }
6420 
6421 
6422 PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject * unicode)6423 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6424 {
6425     PyObject *repr;
6426     char *p;
6427     Py_ssize_t expandsize, pos;
6428     int kind;
6429     void *data;
6430     Py_ssize_t len;
6431 
6432     if (!PyUnicode_Check(unicode)) {
6433         PyErr_BadArgument();
6434         return NULL;
6435     }
6436     if (PyUnicode_READY(unicode) == -1) {
6437         return NULL;
6438     }
6439     kind = PyUnicode_KIND(unicode);
6440     data = PyUnicode_DATA(unicode);
6441     len = PyUnicode_GET_LENGTH(unicode);
6442     if (kind == PyUnicode_1BYTE_KIND) {
6443         return PyBytes_FromStringAndSize(data, len);
6444     }
6445 
6446     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6447        bytes, and 1 byte characters 4. */
6448     expandsize = kind * 2 + 2;
6449 
6450     if (len > PY_SSIZE_T_MAX / expandsize) {
6451         return PyErr_NoMemory();
6452     }
6453     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6454     if (repr == NULL) {
6455         return NULL;
6456     }
6457     if (len == 0) {
6458         return repr;
6459     }
6460 
6461     p = PyBytes_AS_STRING(repr);
6462     for (pos = 0; pos < len; pos++) {
6463         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6464 
6465         /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6466         if (ch < 0x100) {
6467             *p++ = (char) ch;
6468         }
6469         /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6470         else if (ch < 0x10000) {
6471             *p++ = '\\';
6472             *p++ = 'u';
6473             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6474             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6475             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6476             *p++ = Py_hexdigits[ch & 15];
6477         }
6478         /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6479         else {
6480             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6481             *p++ = '\\';
6482             *p++ = 'U';
6483             *p++ = '0';
6484             *p++ = '0';
6485             *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6486             *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6487             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6488             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6489             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6490             *p++ = Py_hexdigits[ch & 15];
6491         }
6492     }
6493 
6494     assert(p > PyBytes_AS_STRING(repr));
6495     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6496         return NULL;
6497     }
6498     return repr;
6499 }
6500 
6501 PyObject *
PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)6502 PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6503                                  Py_ssize_t size)
6504 {
6505     PyObject *result;
6506     PyObject *tmp = PyUnicode_FromWideChar(s, size);
6507     if (tmp == NULL)
6508         return NULL;
6509     result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6510     Py_DECREF(tmp);
6511     return result;
6512 }
6513 
6514 /* --- Unicode Internal Codec ------------------------------------------- */
6515 
6516 PyObject *
_PyUnicode_DecodeUnicodeInternal(const char * s,Py_ssize_t size,const char * errors)6517 _PyUnicode_DecodeUnicodeInternal(const char *s,
6518                                  Py_ssize_t size,
6519                                  const char *errors)
6520 {
6521     const char *starts = s;
6522     Py_ssize_t startinpos;
6523     Py_ssize_t endinpos;
6524     _PyUnicodeWriter writer;
6525     const char *end;
6526     const char *reason;
6527     PyObject *errorHandler = NULL;
6528     PyObject *exc = NULL;
6529 
6530     if (PyErr_WarnEx(PyExc_DeprecationWarning,
6531                      "unicode_internal codec has been deprecated",
6532                      1))
6533         return NULL;
6534 
6535     if (size < 0) {
6536         PyErr_BadInternalCall();
6537         return NULL;
6538     }
6539     if (size == 0)
6540         _Py_RETURN_UNICODE_EMPTY();
6541 
6542     _PyUnicodeWriter_Init(&writer);
6543     if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6544         PyErr_NoMemory();
6545         goto onError;
6546     }
6547     writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
6548 
6549     end = s + size;
6550     while (s < end) {
6551         Py_UNICODE uch;
6552         Py_UCS4 ch;
6553         if (end - s < Py_UNICODE_SIZE) {
6554             endinpos = end-starts;
6555             reason = "truncated input";
6556             goto error;
6557         }
6558         /* We copy the raw representation one byte at a time because the
6559            pointer may be unaligned (see test_codeccallbacks). */
6560         ((char *) &uch)[0] = s[0];
6561         ((char *) &uch)[1] = s[1];
6562 #ifdef Py_UNICODE_WIDE
6563         ((char *) &uch)[2] = s[2];
6564         ((char *) &uch)[3] = s[3];
6565 #endif
6566         ch = uch;
6567 #ifdef Py_UNICODE_WIDE
6568         /* We have to sanity check the raw data, otherwise doom looms for
6569            some malformed UCS-4 data. */
6570         if (ch > 0x10ffff) {
6571             endinpos = s - starts + Py_UNICODE_SIZE;
6572             reason = "illegal code point (> 0x10FFFF)";
6573             goto error;
6574         }
6575 #endif
6576         s += Py_UNICODE_SIZE;
6577 #ifndef Py_UNICODE_WIDE
6578         if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
6579         {
6580             Py_UNICODE uch2;
6581             ((char *) &uch2)[0] = s[0];
6582             ((char *) &uch2)[1] = s[1];
6583             if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
6584             {
6585                 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
6586                 s += Py_UNICODE_SIZE;
6587             }
6588         }
6589 #endif
6590 
6591         if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6592             goto onError;
6593         continue;
6594 
6595   error:
6596         startinpos = s - starts;
6597         if (unicode_decode_call_errorhandler_writer(
6598                 errors, &errorHandler,
6599                 "unicode_internal", reason,
6600                 &starts, &end, &startinpos, &endinpos, &exc, &s,
6601                 &writer))
6602             goto onError;
6603     }
6604 
6605     Py_XDECREF(errorHandler);
6606     Py_XDECREF(exc);
6607     return _PyUnicodeWriter_Finish(&writer);
6608 
6609   onError:
6610     _PyUnicodeWriter_Dealloc(&writer);
6611     Py_XDECREF(errorHandler);
6612     Py_XDECREF(exc);
6613     return NULL;
6614 }
6615 
6616 /* --- Latin-1 Codec ------------------------------------------------------ */
6617 
6618 PyObject *
PyUnicode_DecodeLatin1(const char * s,Py_ssize_t size,const char * errors)6619 PyUnicode_DecodeLatin1(const char *s,
6620                        Py_ssize_t size,
6621                        const char *errors)
6622 {
6623     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6624     return _PyUnicode_FromUCS1((unsigned char*)s, size);
6625 }
6626 
6627 /* create or adjust a UnicodeEncodeError */
6628 static void
make_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6629 make_encode_exception(PyObject **exceptionObject,
6630                       const char *encoding,
6631                       PyObject *unicode,
6632                       Py_ssize_t startpos, Py_ssize_t endpos,
6633                       const char *reason)
6634 {
6635     if (*exceptionObject == NULL) {
6636         *exceptionObject = PyObject_CallFunction(
6637             PyExc_UnicodeEncodeError, "sOnns",
6638             encoding, unicode, startpos, endpos, reason);
6639     }
6640     else {
6641         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6642             goto onError;
6643         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6644             goto onError;
6645         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6646             goto onError;
6647         return;
6648       onError:
6649         Py_CLEAR(*exceptionObject);
6650     }
6651 }
6652 
6653 /* raises a UnicodeEncodeError */
6654 static void
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6655 raise_encode_exception(PyObject **exceptionObject,
6656                        const char *encoding,
6657                        PyObject *unicode,
6658                        Py_ssize_t startpos, Py_ssize_t endpos,
6659                        const char *reason)
6660 {
6661     make_encode_exception(exceptionObject,
6662                           encoding, unicode, startpos, endpos, reason);
6663     if (*exceptionObject != NULL)
6664         PyCodec_StrictErrors(*exceptionObject);
6665 }
6666 
6667 /* error handling callback helper:
6668    build arguments, call the callback and check the arguments,
6669    put the result into newpos and return the replacement string, which
6670    has to be freed by the caller */
6671 static PyObject *
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)6672 unicode_encode_call_errorhandler(const char *errors,
6673                                  PyObject **errorHandler,
6674                                  const char *encoding, const char *reason,
6675                                  PyObject *unicode, PyObject **exceptionObject,
6676                                  Py_ssize_t startpos, Py_ssize_t endpos,
6677                                  Py_ssize_t *newpos)
6678 {
6679     static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6680     Py_ssize_t len;
6681     PyObject *restuple;
6682     PyObject *resunicode;
6683 
6684     if (*errorHandler == NULL) {
6685         *errorHandler = PyCodec_LookupError(errors);
6686         if (*errorHandler == NULL)
6687             return NULL;
6688     }
6689 
6690     if (PyUnicode_READY(unicode) == -1)
6691         return NULL;
6692     len = PyUnicode_GET_LENGTH(unicode);
6693 
6694     make_encode_exception(exceptionObject,
6695                           encoding, unicode, startpos, endpos, reason);
6696     if (*exceptionObject == NULL)
6697         return NULL;
6698 
6699     restuple = PyObject_CallFunctionObjArgs(
6700         *errorHandler, *exceptionObject, NULL);
6701     if (restuple == NULL)
6702         return NULL;
6703     if (!PyTuple_Check(restuple)) {
6704         PyErr_SetString(PyExc_TypeError, &argparse[3]);
6705         Py_DECREF(restuple);
6706         return NULL;
6707     }
6708     if (!PyArg_ParseTuple(restuple, argparse,
6709                           &resunicode, newpos)) {
6710         Py_DECREF(restuple);
6711         return NULL;
6712     }
6713     if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6714         PyErr_SetString(PyExc_TypeError, &argparse[3]);
6715         Py_DECREF(restuple);
6716         return NULL;
6717     }
6718     if (*newpos<0)
6719         *newpos = len + *newpos;
6720     if (*newpos<0 || *newpos>len) {
6721         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6722         Py_DECREF(restuple);
6723         return NULL;
6724     }
6725     Py_INCREF(resunicode);
6726     Py_DECREF(restuple);
6727     return resunicode;
6728 }
6729 
6730 static PyObject *
unicode_encode_ucs1(PyObject * unicode,const char * errors,const Py_UCS4 limit)6731 unicode_encode_ucs1(PyObject *unicode,
6732                     const char *errors,
6733                     const Py_UCS4 limit)
6734 {
6735     /* input state */
6736     Py_ssize_t pos=0, size;
6737     int kind;
6738     void *data;
6739     /* pointer into the output */
6740     char *str;
6741     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6742     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6743     PyObject *error_handler_obj = NULL;
6744     PyObject *exc = NULL;
6745     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6746     PyObject *rep = NULL;
6747     /* output object */
6748     _PyBytesWriter writer;
6749 
6750     if (PyUnicode_READY(unicode) == -1)
6751         return NULL;
6752     size = PyUnicode_GET_LENGTH(unicode);
6753     kind = PyUnicode_KIND(unicode);
6754     data = PyUnicode_DATA(unicode);
6755     /* allocate enough for a simple encoding without
6756        replacements, if we need more, we'll resize */
6757     if (size == 0)
6758         return PyBytes_FromStringAndSize(NULL, 0);
6759 
6760     _PyBytesWriter_Init(&writer);
6761     str = _PyBytesWriter_Alloc(&writer, size);
6762     if (str == NULL)
6763         return NULL;
6764 
6765     while (pos < size) {
6766         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6767 
6768         /* can we encode this? */
6769         if (ch < limit) {
6770             /* no overflow check, because we know that the space is enough */
6771             *str++ = (char)ch;
6772             ++pos;
6773         }
6774         else {
6775             Py_ssize_t newpos, i;
6776             /* startpos for collecting unencodable chars */
6777             Py_ssize_t collstart = pos;
6778             Py_ssize_t collend = collstart + 1;
6779             /* find all unecodable characters */
6780 
6781             while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
6782                 ++collend;
6783 
6784             /* Only overallocate the buffer if it's not the last write */
6785             writer.overallocate = (collend < size);
6786 
6787             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6788             if (error_handler == _Py_ERROR_UNKNOWN)
6789                 error_handler = get_error_handler(errors);
6790 
6791             switch (error_handler) {
6792             case _Py_ERROR_STRICT:
6793                 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6794                 goto onError;
6795 
6796             case _Py_ERROR_REPLACE:
6797                 memset(str, '?', collend - collstart);
6798                 str += (collend - collstart);
6799                 /* fall through */
6800             case _Py_ERROR_IGNORE:
6801                 pos = collend;
6802                 break;
6803 
6804             case _Py_ERROR_BACKSLASHREPLACE:
6805                 /* subtract preallocated bytes */
6806                 writer.min_size -= (collend - collstart);
6807                 str = backslashreplace(&writer, str,
6808                                        unicode, collstart, collend);
6809                 if (str == NULL)
6810                     goto onError;
6811                 pos = collend;
6812                 break;
6813 
6814             case _Py_ERROR_XMLCHARREFREPLACE:
6815                 /* subtract preallocated bytes */
6816                 writer.min_size -= (collend - collstart);
6817                 str = xmlcharrefreplace(&writer, str,
6818                                         unicode, collstart, collend);
6819                 if (str == NULL)
6820                     goto onError;
6821                 pos = collend;
6822                 break;
6823 
6824             case _Py_ERROR_SURROGATEESCAPE:
6825                 for (i = collstart; i < collend; ++i) {
6826                     ch = PyUnicode_READ(kind, data, i);
6827                     if (ch < 0xdc80 || 0xdcff < ch) {
6828                         /* Not a UTF-8b surrogate */
6829                         break;
6830                     }
6831                     *str++ = (char)(ch - 0xdc00);
6832                     ++pos;
6833                 }
6834                 if (i >= collend)
6835                     break;
6836                 collstart = pos;
6837                 assert(collstart != collend);
6838                 /* fall through */
6839 
6840             default:
6841                 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6842                                                        encoding, reason, unicode, &exc,
6843                                                        collstart, collend, &newpos);
6844                 if (rep == NULL)
6845                     goto onError;
6846 
6847                 /* subtract preallocated bytes */
6848                 writer.min_size -= newpos - collstart;
6849 
6850                 if (PyBytes_Check(rep)) {
6851                     /* Directly copy bytes result to output. */
6852                     str = _PyBytesWriter_WriteBytes(&writer, str,
6853                                                     PyBytes_AS_STRING(rep),
6854                                                     PyBytes_GET_SIZE(rep));
6855                 }
6856                 else {
6857                     assert(PyUnicode_Check(rep));
6858 
6859                     if (PyUnicode_READY(rep) < 0)
6860                         goto onError;
6861 
6862                     if (limit == 256 ?
6863                         PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6864                         !PyUnicode_IS_ASCII(rep))
6865                     {
6866                         /* Not all characters are smaller than limit */
6867                         raise_encode_exception(&exc, encoding, unicode,
6868                                                collstart, collend, reason);
6869                         goto onError;
6870                     }
6871                     assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6872                     str = _PyBytesWriter_WriteBytes(&writer, str,
6873                                                     PyUnicode_DATA(rep),
6874                                                     PyUnicode_GET_LENGTH(rep));
6875                 }
6876                 if (str == NULL)
6877                     goto onError;
6878 
6879                 pos = newpos;
6880                 Py_CLEAR(rep);
6881             }
6882 
6883             /* If overallocation was disabled, ensure that it was the last
6884                write. Otherwise, we missed an optimization */
6885             assert(writer.overallocate || pos == size);
6886         }
6887     }
6888 
6889     Py_XDECREF(error_handler_obj);
6890     Py_XDECREF(exc);
6891     return _PyBytesWriter_Finish(&writer, str);
6892 
6893   onError:
6894     Py_XDECREF(rep);
6895     _PyBytesWriter_Dealloc(&writer);
6896     Py_XDECREF(error_handler_obj);
6897     Py_XDECREF(exc);
6898     return NULL;
6899 }
6900 
6901 /* Deprecated */
6902 PyObject *
PyUnicode_EncodeLatin1(const Py_UNICODE * p,Py_ssize_t size,const char * errors)6903 PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6904                        Py_ssize_t size,
6905                        const char *errors)
6906 {
6907     PyObject *result;
6908     PyObject *unicode = PyUnicode_FromWideChar(p, size);
6909     if (unicode == NULL)
6910         return NULL;
6911     result = unicode_encode_ucs1(unicode, errors, 256);
6912     Py_DECREF(unicode);
6913     return result;
6914 }
6915 
6916 PyObject *
_PyUnicode_AsLatin1String(PyObject * unicode,const char * errors)6917 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6918 {
6919     if (!PyUnicode_Check(unicode)) {
6920         PyErr_BadArgument();
6921         return NULL;
6922     }
6923     if (PyUnicode_READY(unicode) == -1)
6924         return NULL;
6925     /* Fast path: if it is a one-byte string, construct
6926        bytes object directly. */
6927     if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6928         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6929                                          PyUnicode_GET_LENGTH(unicode));
6930     /* Non-Latin-1 characters present. Defer to above function to
6931        raise the exception. */
6932     return unicode_encode_ucs1(unicode, errors, 256);
6933 }
6934 
6935 PyObject*
PyUnicode_AsLatin1String(PyObject * unicode)6936 PyUnicode_AsLatin1String(PyObject *unicode)
6937 {
6938     return _PyUnicode_AsLatin1String(unicode, NULL);
6939 }
6940 
6941 /* --- 7-bit ASCII Codec -------------------------------------------------- */
6942 
6943 PyObject *
PyUnicode_DecodeASCII(const char * s,Py_ssize_t size,const char * errors)6944 PyUnicode_DecodeASCII(const char *s,
6945                       Py_ssize_t size,
6946                       const char *errors)
6947 {
6948     const char *starts = s;
6949     _PyUnicodeWriter writer;
6950     int kind;
6951     void *data;
6952     Py_ssize_t startinpos;
6953     Py_ssize_t endinpos;
6954     Py_ssize_t outpos;
6955     const char *e;
6956     PyObject *error_handler_obj = NULL;
6957     PyObject *exc = NULL;
6958     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6959 
6960     if (size == 0)
6961         _Py_RETURN_UNICODE_EMPTY();
6962 
6963     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6964     if (size == 1 && (unsigned char)s[0] < 128)
6965         return get_latin1_char((unsigned char)s[0]);
6966 
6967     _PyUnicodeWriter_Init(&writer);
6968     writer.min_length = size;
6969     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
6970         return NULL;
6971 
6972     e = s + size;
6973     data = writer.data;
6974     outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6975     writer.pos = outpos;
6976     if (writer.pos == size)
6977         return _PyUnicodeWriter_Finish(&writer);
6978 
6979     s += writer.pos;
6980     kind = writer.kind;
6981     while (s < e) {
6982         unsigned char c = (unsigned char)*s;
6983         if (c < 128) {
6984             PyUnicode_WRITE(kind, data, writer.pos, c);
6985             writer.pos++;
6986             ++s;
6987             continue;
6988         }
6989 
6990         /* byte outsize range 0x00..0x7f: call the error handler */
6991 
6992         if (error_handler == _Py_ERROR_UNKNOWN)
6993             error_handler = get_error_handler(errors);
6994 
6995         switch (error_handler)
6996         {
6997         case _Py_ERROR_REPLACE:
6998         case _Py_ERROR_SURROGATEESCAPE:
6999             /* Fast-path: the error handler only writes one character,
7000                but we may switch to UCS2 at the first write */
7001             if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7002                 goto onError;
7003             kind = writer.kind;
7004             data = writer.data;
7005 
7006             if (error_handler == _Py_ERROR_REPLACE)
7007                 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7008             else
7009                 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7010             writer.pos++;
7011             ++s;
7012             break;
7013 
7014         case _Py_ERROR_IGNORE:
7015             ++s;
7016             break;
7017 
7018         default:
7019             startinpos = s-starts;
7020             endinpos = startinpos + 1;
7021             if (unicode_decode_call_errorhandler_writer(
7022                     errors, &error_handler_obj,
7023                     "ascii", "ordinal not in range(128)",
7024                     &starts, &e, &startinpos, &endinpos, &exc, &s,
7025                     &writer))
7026                 goto onError;
7027             kind = writer.kind;
7028             data = writer.data;
7029         }
7030     }
7031     Py_XDECREF(error_handler_obj);
7032     Py_XDECREF(exc);
7033     return _PyUnicodeWriter_Finish(&writer);
7034 
7035   onError:
7036     _PyUnicodeWriter_Dealloc(&writer);
7037     Py_XDECREF(error_handler_obj);
7038     Py_XDECREF(exc);
7039     return NULL;
7040 }
7041 
7042 /* Deprecated */
7043 PyObject *
PyUnicode_EncodeASCII(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7044 PyUnicode_EncodeASCII(const Py_UNICODE *p,
7045                       Py_ssize_t size,
7046                       const char *errors)
7047 {
7048     PyObject *result;
7049     PyObject *unicode = PyUnicode_FromWideChar(p, size);
7050     if (unicode == NULL)
7051         return NULL;
7052     result = unicode_encode_ucs1(unicode, errors, 128);
7053     Py_DECREF(unicode);
7054     return result;
7055 }
7056 
7057 PyObject *
_PyUnicode_AsASCIIString(PyObject * unicode,const char * errors)7058 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7059 {
7060     if (!PyUnicode_Check(unicode)) {
7061         PyErr_BadArgument();
7062         return NULL;
7063     }
7064     if (PyUnicode_READY(unicode) == -1)
7065         return NULL;
7066     /* Fast path: if it is an ASCII-only string, construct bytes object
7067        directly. Else defer to above function to raise the exception. */
7068     if (PyUnicode_IS_ASCII(unicode))
7069         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7070                                          PyUnicode_GET_LENGTH(unicode));
7071     return unicode_encode_ucs1(unicode, errors, 128);
7072 }
7073 
7074 PyObject *
PyUnicode_AsASCIIString(PyObject * unicode)7075 PyUnicode_AsASCIIString(PyObject *unicode)
7076 {
7077     return _PyUnicode_AsASCIIString(unicode, NULL);
7078 }
7079 
7080 #ifdef MS_WINDOWS
7081 
7082 /* --- MBCS codecs for Windows -------------------------------------------- */
7083 
/* NEED_RETRY enables the chunked loops in decode_code_page_stateful() and
   encode_code_page() below: when int is narrower than size_t, inputs longer
   than DECODING_CHUNK_SIZE are processed in several passes because the
   per-chunk helpers take an int length. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows. */
#define DECODING_CHUNK_SIZE (INT_MAX/4)

/* Fallback definition for headers that do not provide
   WC_ERR_INVALID_CHARS; it is used by encode_code_page_flags() for
   CP_UTF8. */
#ifndef WC_ERR_INVALID_CHARS
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7097 
7098 static const char*
code_page_name(UINT code_page,PyObject ** obj)7099 code_page_name(UINT code_page, PyObject **obj)
7100 {
7101     *obj = NULL;
7102     if (code_page == CP_ACP)
7103         return "mbcs";
7104     if (code_page == CP_UTF7)
7105         return "CP_UTF7";
7106     if (code_page == CP_UTF8)
7107         return "CP_UTF8";
7108 
7109     *obj = PyBytes_FromFormat("cp%u", code_page);
7110     if (*obj == NULL)
7111         return NULL;
7112     return PyBytes_AS_STRING(*obj);
7113 }
7114 
7115 static DWORD
decode_code_page_flags(UINT code_page)7116 decode_code_page_flags(UINT code_page)
7117 {
7118     if (code_page == CP_UTF7) {
7119         /* The CP_UTF7 decoder only supports flags=0 */
7120         return 0;
7121     }
7122     else
7123         return MB_ERR_INVALID_CHARS;
7124 }
7125 
7126 /*
7127  * Decode a byte string from a Windows code page into unicode object in strict
7128  * mode.
7129  *
7130  * Returns consumed size if succeed, returns -2 on decode error, or raise an
7131  * OSError and returns -1 on other error.
7132  */
7133 static int
decode_code_page_strict(UINT code_page,PyObject ** v,const char * in,int insize)7134 decode_code_page_strict(UINT code_page,
7135                         PyObject **v,
7136                         const char *in,
7137                         int insize)
7138 {
7139     DWORD flags = MB_ERR_INVALID_CHARS;
7140     wchar_t *out;
7141     DWORD outsize;
7142 
7143     /* First get the size of the result */
7144     assert(insize > 0);
7145     while ((outsize = MultiByteToWideChar(code_page, flags,
7146                                           in, insize, NULL, 0)) <= 0)
7147     {
7148         if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7149             goto error;
7150         }
7151         /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7152         flags = 0;
7153     }
7154 
7155     if (*v == NULL) {
7156         /* Create unicode object */
7157         /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
7158         *v = (PyObject*)_PyUnicode_New(outsize);
7159         if (*v == NULL)
7160             return -1;
7161         out = PyUnicode_AS_UNICODE(*v);
7162     }
7163     else {
7164         /* Extend unicode object */
7165         Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7166         if (unicode_resize(v, n + outsize) < 0)
7167             return -1;
7168         out = PyUnicode_AS_UNICODE(*v) + n;
7169     }
7170 
7171     /* Do the conversion */
7172     outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7173     if (outsize <= 0)
7174         goto error;
7175     return insize;
7176 
7177 error:
7178     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7179         return -2;
7180     PyErr_SetFromWindowsErr(0);
7181     return -1;
7182 }
7183 
/*
 * Decode a byte string from a code page into unicode object with an error
 * handler.
 *
 * The input is decoded one character at a time; sequences that cannot be
 * decoded are passed to the error handler named by `errors`.  The result is
 * stored in *v: a new object is created when *v is NULL, otherwise the
 * existing object is extended.
 *
 * When `final` is false, an undecodable sequence that reaches the end of
 * the input is left unconsumed instead of being reported.
 *
 * Returns consumed size if succeed, or raise an OSError or
 * UnicodeDecodeError exception and returns -1 on error.
 */
static int
decode_code_page_errors(UINT code_page,
                        PyObject **v,
                        const char *in, const int size,
                        const char *errors, int final)
{
    const char *startin = in;
    const char *endin = in + size;
    DWORD flags = MB_ERR_INVALID_CHARS;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "No mapping for the Unicode character exists "
                         "in the target code page.";
    /* each step cannot decode more than 1 character, but a character can be
       represented as a surrogate pair */
    wchar_t buffer[2], *out;
    int insize;
    Py_ssize_t outsize;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    DWORD err;
    int ret = -1;

    assert(size > 0);

    /* encoding_obj, when set, owns the buffer `encoding` points to and is
       released at the `error` label. */
    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
           UnicodeDecodeError. */
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_CLEAR(exc);
        }
        goto error;
    }

    /* Reserve Py_ARRAY_LENGTH(buffer) wchar_t per input byte, guarding the
       multiplication against Py_ssize_t overflow. */
    if (*v == NULL) {
        /* Create unicode object */
        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
            PyErr_NoMemory();
            goto error;
        }
        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
        *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
        if (*v == NULL)
            goto error;
        out = PyUnicode_AS_UNICODE(*v);
    }
    else {
        /* Extend unicode object */
        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
            PyErr_NoMemory();
            goto error;
        }
        if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
            goto error;
        out = PyUnicode_AS_UNICODE(*v) + n;
    }

    /* Decode the byte string character per character */
    while (in < endin)
    {
        /* Decode a character: try 1 byte, then grow the candidate sequence
           one byte at a time until the conversion succeeds or the limits
           in the loop condition are reached. */
        insize = 1;
        do
        {
            outsize = MultiByteToWideChar(code_page, flags,
                                          in, insize,
                                          buffer, Py_ARRAY_LENGTH(buffer));
            if (outsize > 0)
                break;
            err = GetLastError();
            if (err == ERROR_INVALID_FLAGS && flags) {
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
                flags = 0;
                continue;
            }
            /* Any failure other than these two is reported as an OSError. */
            if (err != ERROR_NO_UNICODE_TRANSLATION
                && err != ERROR_INSUFFICIENT_BUFFER)
            {
                PyErr_SetFromWindowsErr(0);
                goto error;
            }
            insize++;
        }
        /* 4=maximum length of a UTF-8 sequence */
        while (insize <= 4 && (in + insize) <= endin);

        if (outsize <= 0) {
            Py_ssize_t startinpos, endinpos, outpos;

            /* last character in partial decode? */
            if (in + insize >= endin && !final)
                break;

            /* Report a single undecodable byte at the current position. */
            startinpos = in - startin;
            endinpos = startinpos + 1;
            outpos = out - PyUnicode_AS_UNICODE(*v);
            if (unicode_decode_call_errorhandler_wchar(
                    errors, &errorHandler,
                    encoding, reason,
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
                    v, &outpos))
            {
                goto error;
            }
            /* Recompute the write pointer from *v and the updated outpos. */
            out = PyUnicode_AS_UNICODE(*v) + outpos;
        }
        else {
            in += insize;
            memcpy(out, buffer, outsize * sizeof(wchar_t));
            out += outsize;
        }
    }

    /* write a NUL character at the end */
    *out = 0;

    /* Resize the unicode object to the number of wchar_t actually written
       (the assert checks that this never exceeds the current length). */
    outsize = out - PyUnicode_AS_UNICODE(*v);
    assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
    if (unicode_resize(v, outsize) < 0)
        goto error;
    /* (in - startin) <= size and size is an int */
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);

    /* Shared exit path: reached on success as well as on failure. */
error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7330 
7331 static PyObject *
decode_code_page_stateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7332 decode_code_page_stateful(int code_page,
7333                           const char *s, Py_ssize_t size,
7334                           const char *errors, Py_ssize_t *consumed)
7335 {
7336     PyObject *v = NULL;
7337     int chunk_size, final, converted, done;
7338 
7339     if (code_page < 0) {
7340         PyErr_SetString(PyExc_ValueError, "invalid code page number");
7341         return NULL;
7342     }
7343     if (size < 0) {
7344         PyErr_BadInternalCall();
7345         return NULL;
7346     }
7347 
7348     if (consumed)
7349         *consumed = 0;
7350 
7351     do
7352     {
7353 #ifdef NEED_RETRY
7354         if (size > DECODING_CHUNK_SIZE) {
7355             chunk_size = DECODING_CHUNK_SIZE;
7356             final = 0;
7357             done = 0;
7358         }
7359         else
7360 #endif
7361         {
7362             chunk_size = (int)size;
7363             final = (consumed == NULL);
7364             done = 1;
7365         }
7366 
7367         if (chunk_size == 0 && done) {
7368             if (v != NULL)
7369                 break;
7370             _Py_RETURN_UNICODE_EMPTY();
7371         }
7372 
7373         converted = decode_code_page_strict(code_page, &v,
7374                                             s, chunk_size);
7375         if (converted == -2)
7376             converted = decode_code_page_errors(code_page, &v,
7377                                                 s, chunk_size,
7378                                                 errors, final);
7379         assert(converted != 0 || done);
7380 
7381         if (converted < 0) {
7382             Py_XDECREF(v);
7383             return NULL;
7384         }
7385 
7386         if (consumed)
7387             *consumed += converted;
7388 
7389         s += converted;
7390         size -= converted;
7391     } while (!done);
7392 
7393     return unicode_result(v);
7394 }
7395 
7396 PyObject *
PyUnicode_DecodeCodePageStateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7397 PyUnicode_DecodeCodePageStateful(int code_page,
7398                                  const char *s,
7399                                  Py_ssize_t size,
7400                                  const char *errors,
7401                                  Py_ssize_t *consumed)
7402 {
7403     return decode_code_page_stateful(code_page, s, size, errors, consumed);
7404 }
7405 
7406 PyObject *
PyUnicode_DecodeMBCSStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7407 PyUnicode_DecodeMBCSStateful(const char *s,
7408                              Py_ssize_t size,
7409                              const char *errors,
7410                              Py_ssize_t *consumed)
7411 {
7412     return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7413 }
7414 
7415 PyObject *
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)7416 PyUnicode_DecodeMBCS(const char *s,
7417                      Py_ssize_t size,
7418                      const char *errors)
7419 {
7420     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7421 }
7422 
7423 static DWORD
encode_code_page_flags(UINT code_page,const char * errors)7424 encode_code_page_flags(UINT code_page, const char *errors)
7425 {
7426     if (code_page == CP_UTF8) {
7427         return WC_ERR_INVALID_CHARS;
7428     }
7429     else if (code_page == CP_UTF7) {
7430         /* CP_UTF7 only supports flags=0 */
7431         return 0;
7432     }
7433     else {
7434         if (errors != NULL && strcmp(errors, "replace") == 0)
7435             return 0;
7436         else
7437             return WC_NO_BEST_FIT_CHARS;
7438     }
7439 }
7440 
7441 /*
7442  * Encode a Unicode string to a Windows code page into a byte string in strict
7443  * mode.
7444  *
7445  * Returns consumed characters if succeed, returns -2 on encode error, or raise
7446  * an OSError and returns -1 on other error.
7447  */
7448 static int
encode_code_page_strict(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t offset,int len,const char * errors)7449 encode_code_page_strict(UINT code_page, PyObject **outbytes,
7450                         PyObject *unicode, Py_ssize_t offset, int len,
7451                         const char* errors)
7452 {
7453     BOOL usedDefaultChar = FALSE;
7454     BOOL *pusedDefaultChar = &usedDefaultChar;
7455     int outsize;
7456     wchar_t *p;
7457     Py_ssize_t size;
7458     const DWORD flags = encode_code_page_flags(code_page, NULL);
7459     char *out;
7460     /* Create a substring so that we can get the UTF-16 representation
7461        of just the slice under consideration. */
7462     PyObject *substring;
7463 
7464     assert(len > 0);
7465 
7466     if (code_page != CP_UTF8 && code_page != CP_UTF7)
7467         pusedDefaultChar = &usedDefaultChar;
7468     else
7469         pusedDefaultChar = NULL;
7470 
7471     substring = PyUnicode_Substring(unicode, offset, offset+len);
7472     if (substring == NULL)
7473         return -1;
7474     p = PyUnicode_AsUnicodeAndSize(substring, &size);
7475     if (p == NULL) {
7476         Py_DECREF(substring);
7477         return -1;
7478     }
7479     assert(size <= INT_MAX);
7480 
7481     /* First get the size of the result */
7482     outsize = WideCharToMultiByte(code_page, flags,
7483                                   p, (int)size,
7484                                   NULL, 0,
7485                                   NULL, pusedDefaultChar);
7486     if (outsize <= 0)
7487         goto error;
7488     /* If we used a default char, then we failed! */
7489     if (pusedDefaultChar && *pusedDefaultChar) {
7490         Py_DECREF(substring);
7491         return -2;
7492     }
7493 
7494     if (*outbytes == NULL) {
7495         /* Create string object */
7496         *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7497         if (*outbytes == NULL) {
7498             Py_DECREF(substring);
7499             return -1;
7500         }
7501         out = PyBytes_AS_STRING(*outbytes);
7502     }
7503     else {
7504         /* Extend string object */
7505         const Py_ssize_t n = PyBytes_Size(*outbytes);
7506         if (outsize > PY_SSIZE_T_MAX - n) {
7507             PyErr_NoMemory();
7508             Py_DECREF(substring);
7509             return -1;
7510         }
7511         if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7512             Py_DECREF(substring);
7513             return -1;
7514         }
7515         out = PyBytes_AS_STRING(*outbytes) + n;
7516     }
7517 
7518     /* Do the conversion */
7519     outsize = WideCharToMultiByte(code_page, flags,
7520                                   p, (int)size,
7521                                   out, outsize,
7522                                   NULL, pusedDefaultChar);
7523     Py_CLEAR(substring);
7524     if (outsize <= 0)
7525         goto error;
7526     if (pusedDefaultChar && *pusedDefaultChar)
7527         return -2;
7528     return 0;
7529 
7530 error:
7531     Py_XDECREF(substring);
7532     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7533         return -2;
7534     PyErr_SetFromWindowsErr(0);
7535     return -1;
7536 }
7537 
/*
 * Encode a Unicode string to a Windows code page into a byte string using an
 * error handler.
 *
 * The characters unicode[unicode_offset:unicode_offset+insize] are encoded
 * one at a time; characters that cannot be encoded are passed to the error
 * handler named by `errors`.  The result is stored in *outbytes: a new bytes
 * object is created when *outbytes is NULL, otherwise the existing object is
 * extended.
 *
 * Returns 0 on success, or raises an exception and returns -1 on error.
 */
static int
encode_code_page_errors(UINT code_page, PyObject **outbytes,
                        PyObject *unicode, Py_ssize_t unicode_offset,
                        Py_ssize_t insize, const char* errors)
{
    const DWORD flags = encode_code_page_flags(code_page, errors);
    Py_ssize_t pos = unicode_offset;
    Py_ssize_t endin = unicode_offset + insize;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "invalid character";
    /* 4=maximum length of a UTF-8 sequence */
    char buffer[4];
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
    Py_ssize_t outsize;
    char *out;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    Py_ssize_t newpos, newoutsize;
    PyObject *rep;
    int ret = -1;

    assert(insize > 0);

    /* encoding_obj, when set, owns the buffer `encoding` points to. */
    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if (errors == NULL || strcmp(errors, "strict") == 0) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
           then we raise a UnicodeEncodeError. */
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
        Py_XDECREF(encoding_obj);
        return -1;
    }

    /* No "used default char" pointer is passed for CP_UTF8 / CP_UTF7. */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    /* Reserve Py_ARRAY_LENGTH(buffer) bytes per input character, guarding
       the multiplication against Py_ssize_t overflow. */
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
        PyErr_NoMemory();
        goto error;
    }
    outsize = insize * Py_ARRAY_LENGTH(buffer);

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL)
            goto error;
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        Py_ssize_t n = PyBytes_Size(*outbytes);
        if (n > PY_SSIZE_T_MAX - outsize) {
            PyErr_NoMemory();
            goto error;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
            goto error;
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Encode the string character per character */
    while (pos < endin)
    {
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
        wchar_t chars[2];
        int charsize;
        /* Characters at or above 0x10000 are passed as a surrogate pair. */
        if (ch < 0x10000) {
            chars[0] = (wchar_t)ch;
            charsize = 1;
        }
        else {
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
            charsize = 2;
        }

        outsize = WideCharToMultiByte(code_page, flags,
                                      chars, charsize,
                                      buffer, Py_ARRAY_LENGTH(buffer),
                                      NULL, pusedDefaultChar);
        if (outsize > 0) {
            /* Accept the bytes unless a default character was used; in
               that case fall through to the error handler below. */
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
            {
                pos++;
                memcpy(out, buffer, outsize);
                out += outsize;
                continue;
            }
        }
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
            PyErr_SetFromWindowsErr(0);
            goto error;
        }

        /* Ask the error handler for a replacement for the character at
           pos; newpos receives the position at which to resume. */
        rep = unicode_encode_call_errorhandler(
                  errors, &errorHandler, encoding, reason,
                  unicode, &exc,
                  pos, pos + 1, &newpos);
        if (rep == NULL)
            goto error;
        pos = newpos;

        if (PyBytes_Check(rep)) {
            /* A bytes replacement is copied to the output verbatim. */
            outsize = PyBytes_GET_SIZE(rep);
            if (outsize != 1) {
                /* Adjust the output size by (outsize - 1) and recompute
                   `out` from the saved offset after the resize. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
            out += outsize;
        }
        else {
            /* Otherwise the replacement is read as a str; every one of
               its characters must be <= 127. */
            Py_ssize_t i;
            enum PyUnicode_Kind kind;
            void *data;

            if (PyUnicode_READY(rep) == -1) {
                Py_DECREF(rep);
                goto error;
            }

            outsize = PyUnicode_GET_LENGTH(rep);
            if (outsize != 1) {
                /* Same size adjustment as for a bytes replacement. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            kind = PyUnicode_KIND(rep);
            data = PyUnicode_DATA(rep);
            for (i=0; i < outsize; i++) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (ch > 127) {
                    /* Note: pos has already been set to newpos here. */
                    raise_encode_exception(&exc,
                        encoding, unicode,
                        pos, pos + 1,
                        "unable to encode error handler result to ASCII");
                    Py_DECREF(rep);
                    goto error;
                }
                *out = (unsigned char)ch;
                out++;
            }
        }
        Py_DECREF(rep);
    }
    /* write a NUL byte */
    *out = 0;
    /* Resize the bytes object to the number of bytes actually written. */
    outsize = out - PyBytes_AS_STRING(*outbytes);
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
    if (_PyBytes_Resize(outbytes, outsize) < 0)
        goto error;
    ret = 0;

    /* Shared exit path: reached on success as well as on failure. */
error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7725 
7726 static PyObject *
encode_code_page(int code_page,PyObject * unicode,const char * errors)7727 encode_code_page(int code_page,
7728                  PyObject *unicode,
7729                  const char *errors)
7730 {
7731     Py_ssize_t len;
7732     PyObject *outbytes = NULL;
7733     Py_ssize_t offset;
7734     int chunk_len, ret, done;
7735 
7736     if (!PyUnicode_Check(unicode)) {
7737         PyErr_BadArgument();
7738         return NULL;
7739     }
7740 
7741     if (PyUnicode_READY(unicode) == -1)
7742         return NULL;
7743     len = PyUnicode_GET_LENGTH(unicode);
7744 
7745     if (code_page < 0) {
7746         PyErr_SetString(PyExc_ValueError, "invalid code page number");
7747         return NULL;
7748     }
7749 
7750     if (len == 0)
7751         return PyBytes_FromStringAndSize(NULL, 0);
7752 
7753     offset = 0;
7754     do
7755     {
7756 #ifdef NEED_RETRY
7757         if (len > DECODING_CHUNK_SIZE) {
7758             chunk_len = DECODING_CHUNK_SIZE;
7759             done = 0;
7760         }
7761         else
7762 #endif
7763         {
7764             chunk_len = (int)len;
7765             done = 1;
7766         }
7767 
7768         ret = encode_code_page_strict(code_page, &outbytes,
7769                                       unicode, offset, chunk_len,
7770                                       errors);
7771         if (ret == -2)
7772             ret = encode_code_page_errors(code_page, &outbytes,
7773                                           unicode, offset,
7774                                           chunk_len, errors);
7775         if (ret < 0) {
7776             Py_XDECREF(outbytes);
7777             return NULL;
7778         }
7779 
7780         offset += chunk_len;
7781         len -= chunk_len;
7782     } while (!done);
7783 
7784     return outbytes;
7785 }
7786 
7787 PyObject *
PyUnicode_EncodeMBCS(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7788 PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7789                      Py_ssize_t size,
7790                      const char *errors)
7791 {
7792     PyObject *unicode, *res;
7793     unicode = PyUnicode_FromWideChar(p, size);
7794     if (unicode == NULL)
7795         return NULL;
7796     res = encode_code_page(CP_ACP, unicode, errors);
7797     Py_DECREF(unicode);
7798     return res;
7799 }
7800 
7801 PyObject *
PyUnicode_EncodeCodePage(int code_page,PyObject * unicode,const char * errors)7802 PyUnicode_EncodeCodePage(int code_page,
7803                          PyObject *unicode,
7804                          const char *errors)
7805 {
7806     return encode_code_page(code_page, unicode, errors);
7807 }
7808 
7809 PyObject *
PyUnicode_AsMBCSString(PyObject * unicode)7810 PyUnicode_AsMBCSString(PyObject *unicode)
7811 {
7812     return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7813 }
7814 
7815 #undef NEED_RETRY
7816 
7817 #endif /* MS_WINDOWS */
7818 
7819 /* --- Character Mapping Codec -------------------------------------------- */
7820 
7821 static int
charmap_decode_string(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)7822 charmap_decode_string(const char *s,
7823                       Py_ssize_t size,
7824                       PyObject *mapping,
7825                       const char *errors,
7826                       _PyUnicodeWriter *writer)
7827 {
7828     const char *starts = s;
7829     const char *e;
7830     Py_ssize_t startinpos, endinpos;
7831     PyObject *errorHandler = NULL, *exc = NULL;
7832     Py_ssize_t maplen;
7833     enum PyUnicode_Kind mapkind;
7834     void *mapdata;
7835     Py_UCS4 x;
7836     unsigned char ch;
7837 
7838     if (PyUnicode_READY(mapping) == -1)
7839         return -1;
7840 
7841     maplen = PyUnicode_GET_LENGTH(mapping);
7842     mapdata = PyUnicode_DATA(mapping);
7843     mapkind = PyUnicode_KIND(mapping);
7844 
7845     e = s + size;
7846 
7847     if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7848         /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7849          * is disabled in encoding aliases, latin1 is preferred because
7850          * its implementation is faster. */
7851         Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7852         Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7853         Py_UCS4 maxchar = writer->maxchar;
7854 
7855         assert (writer->kind == PyUnicode_1BYTE_KIND);
7856         while (s < e) {
7857             ch = *s;
7858             x = mapdata_ucs1[ch];
7859             if (x > maxchar) {
7860                 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7861                     goto onError;
7862                 maxchar = writer->maxchar;
7863                 outdata = (Py_UCS1 *)writer->data;
7864             }
7865             outdata[writer->pos] = x;
7866             writer->pos++;
7867             ++s;
7868         }
7869         return 0;
7870     }
7871 
7872     while (s < e) {
7873         if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7874             enum PyUnicode_Kind outkind = writer->kind;
7875             Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7876             if (outkind == PyUnicode_1BYTE_KIND) {
7877                 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7878                 Py_UCS4 maxchar = writer->maxchar;
7879                 while (s < e) {
7880                     ch = *s;
7881                     x = mapdata_ucs2[ch];
7882                     if (x > maxchar)
7883                         goto Error;
7884                     outdata[writer->pos] = x;
7885                     writer->pos++;
7886                     ++s;
7887                 }
7888                 break;
7889             }
7890             else if (outkind == PyUnicode_2BYTE_KIND) {
7891                 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7892                 while (s < e) {
7893                     ch = *s;
7894                     x = mapdata_ucs2[ch];
7895                     if (x == 0xFFFE)
7896                         goto Error;
7897                     outdata[writer->pos] = x;
7898                     writer->pos++;
7899                     ++s;
7900                 }
7901                 break;
7902             }
7903         }
7904         ch = *s;
7905 
7906         if (ch < maplen)
7907             x = PyUnicode_READ(mapkind, mapdata, ch);
7908         else
7909             x = 0xfffe; /* invalid value */
7910 Error:
7911         if (x == 0xfffe)
7912         {
7913             /* undefined mapping */
7914             startinpos = s-starts;
7915             endinpos = startinpos+1;
7916             if (unicode_decode_call_errorhandler_writer(
7917                     errors, &errorHandler,
7918                     "charmap", "character maps to <undefined>",
7919                     &starts, &e, &startinpos, &endinpos, &exc, &s,
7920                     writer)) {
7921                 goto onError;
7922             }
7923             continue;
7924         }
7925 
7926         if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7927             goto onError;
7928         ++s;
7929     }
7930     Py_XDECREF(errorHandler);
7931     Py_XDECREF(exc);
7932     return 0;
7933 
7934 onError:
7935     Py_XDECREF(errorHandler);
7936     Py_XDECREF(exc);
7937     return -1;
7938 }
7939 
7940 static int
charmap_decode_mapping(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)7941 charmap_decode_mapping(const char *s,
7942                        Py_ssize_t size,
7943                        PyObject *mapping,
7944                        const char *errors,
7945                        _PyUnicodeWriter *writer)
7946 {
7947     const char *starts = s;
7948     const char *e;
7949     Py_ssize_t startinpos, endinpos;
7950     PyObject *errorHandler = NULL, *exc = NULL;
7951     unsigned char ch;
7952     PyObject *key, *item = NULL;
7953 
7954     e = s + size;
7955 
7956     while (s < e) {
7957         ch = *s;
7958 
7959         /* Get mapping (char ordinal -> integer, Unicode char or None) */
7960         key = PyLong_FromLong((long)ch);
7961         if (key == NULL)
7962             goto onError;
7963 
7964         item = PyObject_GetItem(mapping, key);
7965         Py_DECREF(key);
7966         if (item == NULL) {
7967             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7968                 /* No mapping found means: mapping is undefined. */
7969                 PyErr_Clear();
7970                 goto Undefined;
7971             } else
7972                 goto onError;
7973         }
7974 
7975         /* Apply mapping */
7976         if (item == Py_None)
7977             goto Undefined;
7978         if (PyLong_Check(item)) {
7979             long value = PyLong_AS_LONG(item);
7980             if (value == 0xFFFE)
7981                 goto Undefined;
7982             if (value < 0 || value > MAX_UNICODE) {
7983                 PyErr_Format(PyExc_TypeError,
7984                              "character mapping must be in range(0x%lx)",
7985                              (unsigned long)MAX_UNICODE + 1);
7986                 goto onError;
7987             }
7988 
7989             if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7990                 goto onError;
7991         }
7992         else if (PyUnicode_Check(item)) {
7993             if (PyUnicode_READY(item) == -1)
7994                 goto onError;
7995             if (PyUnicode_GET_LENGTH(item) == 1) {
7996                 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7997                 if (value == 0xFFFE)
7998                     goto Undefined;
7999                 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8000                     goto onError;
8001             }
8002             else {
8003                 writer->overallocate = 1;
8004                 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8005                     goto onError;
8006             }
8007         }
8008         else {
8009             /* wrong return value */
8010             PyErr_SetString(PyExc_TypeError,
8011                             "character mapping must return integer, None or str");
8012             goto onError;
8013         }
8014         Py_CLEAR(item);
8015         ++s;
8016         continue;
8017 
8018 Undefined:
8019         /* undefined mapping */
8020         Py_CLEAR(item);
8021         startinpos = s-starts;
8022         endinpos = startinpos+1;
8023         if (unicode_decode_call_errorhandler_writer(
8024                 errors, &errorHandler,
8025                 "charmap", "character maps to <undefined>",
8026                 &starts, &e, &startinpos, &endinpos, &exc, &s,
8027                 writer)) {
8028             goto onError;
8029         }
8030     }
8031     Py_XDECREF(errorHandler);
8032     Py_XDECREF(exc);
8033     return 0;
8034 
8035 onError:
8036     Py_XDECREF(item);
8037     Py_XDECREF(errorHandler);
8038     Py_XDECREF(exc);
8039     return -1;
8040 }
8041 
8042 PyObject *
PyUnicode_DecodeCharmap(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors)8043 PyUnicode_DecodeCharmap(const char *s,
8044                         Py_ssize_t size,
8045                         PyObject *mapping,
8046                         const char *errors)
8047 {
8048     _PyUnicodeWriter writer;
8049 
8050     /* Default to Latin-1 */
8051     if (mapping == NULL)
8052         return PyUnicode_DecodeLatin1(s, size, errors);
8053 
8054     if (size == 0)
8055         _Py_RETURN_UNICODE_EMPTY();
8056     _PyUnicodeWriter_Init(&writer);
8057     writer.min_length = size;
8058     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8059         goto onError;
8060 
8061     if (PyUnicode_CheckExact(mapping)) {
8062         if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8063             goto onError;
8064     }
8065     else {
8066         if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8067             goto onError;
8068     }
8069     return _PyUnicodeWriter_Finish(&writer);
8070 
8071   onError:
8072     _PyUnicodeWriter_Dealloc(&writer);
8073     return NULL;
8074 }
8075 
8076 /* Charmap encoding: the lookup table */
8077 
8078 struct encoding_map {
8079     PyObject_HEAD
8080     unsigned char level1[32];
8081     int count2, count3;
8082     unsigned char level23[1];
8083 };
8084 
8085 static PyObject*
encoding_map_size(PyObject * obj,PyObject * args)8086 encoding_map_size(PyObject *obj, PyObject* args)
8087 {
8088     struct encoding_map *map = (struct encoding_map*)obj;
8089     return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
8090                            128*map->count3);
8091 }
8092 
8093 static PyMethodDef encoding_map_methods[] = {
8094     {"size", encoding_map_size, METH_NOARGS,
8095      PyDoc_STR("Return the size (in bytes) of this object") },
8096     { 0 }
8097 };
8098 
8099 static void
encoding_map_dealloc(PyObject * o)8100 encoding_map_dealloc(PyObject* o)
8101 {
8102     PyObject_FREE(o);
8103 }
8104 
8105 static PyTypeObject EncodingMapType = {
8106     PyVarObject_HEAD_INIT(NULL, 0)
8107     "EncodingMap",          /*tp_name*/
8108     sizeof(struct encoding_map),   /*tp_basicsize*/
8109     0,                      /*tp_itemsize*/
8110     /* methods */
8111     encoding_map_dealloc,   /*tp_dealloc*/
8112     0,                      /*tp_print*/
8113     0,                      /*tp_getattr*/
8114     0,                      /*tp_setattr*/
8115     0,                      /*tp_reserved*/
8116     0,                      /*tp_repr*/
8117     0,                      /*tp_as_number*/
8118     0,                      /*tp_as_sequence*/
8119     0,                      /*tp_as_mapping*/
8120     0,                      /*tp_hash*/
8121     0,                      /*tp_call*/
8122     0,                      /*tp_str*/
8123     0,                      /*tp_getattro*/
8124     0,                      /*tp_setattro*/
8125     0,                      /*tp_as_buffer*/
8126     Py_TPFLAGS_DEFAULT,     /*tp_flags*/
8127     0,                      /*tp_doc*/
8128     0,                      /*tp_traverse*/
8129     0,                      /*tp_clear*/
8130     0,                      /*tp_richcompare*/
8131     0,                      /*tp_weaklistoffset*/
8132     0,                      /*tp_iter*/
8133     0,                      /*tp_iternext*/
8134     encoding_map_methods,   /*tp_methods*/
8135     0,                      /*tp_members*/
8136     0,                      /*tp_getset*/
8137     0,                      /*tp_base*/
8138     0,                      /*tp_dict*/
8139     0,                      /*tp_descr_get*/
8140     0,                      /*tp_descr_set*/
8141     0,                      /*tp_dictoffset*/
8142     0,                      /*tp_init*/
8143     0,                      /*tp_alloc*/
8144     0,                      /*tp_new*/
8145     0,                      /*tp_free*/
8146     0,                      /*tp_is_gc*/
8147 };
8148 
8149 PyObject*
PyUnicode_BuildEncodingMap(PyObject * string)8150 PyUnicode_BuildEncodingMap(PyObject* string)
8151 {
8152     PyObject *result;
8153     struct encoding_map *mresult;
8154     int i;
8155     int need_dict = 0;
8156     unsigned char level1[32];
8157     unsigned char level2[512];
8158     unsigned char *mlevel1, *mlevel2, *mlevel3;
8159     int count2 = 0, count3 = 0;
8160     int kind;
8161     void *data;
8162     Py_ssize_t length;
8163     Py_UCS4 ch;
8164 
8165     if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8166         PyErr_BadArgument();
8167         return NULL;
8168     }
8169     kind = PyUnicode_KIND(string);
8170     data = PyUnicode_DATA(string);
8171     length = PyUnicode_GET_LENGTH(string);
8172     length = Py_MIN(length, 256);
8173     memset(level1, 0xFF, sizeof level1);
8174     memset(level2, 0xFF, sizeof level2);
8175 
8176     /* If there isn't a one-to-one mapping of NULL to \0,
8177        or if there are non-BMP characters, we need to use
8178        a mapping dictionary. */
8179     if (PyUnicode_READ(kind, data, 0) != 0)
8180         need_dict = 1;
8181     for (i = 1; i < length; i++) {
8182         int l1, l2;
8183         ch = PyUnicode_READ(kind, data, i);
8184         if (ch == 0 || ch > 0xFFFF) {
8185             need_dict = 1;
8186             break;
8187         }
8188         if (ch == 0xFFFE)
8189             /* unmapped character */
8190             continue;
8191         l1 = ch >> 11;
8192         l2 = ch >> 7;
8193         if (level1[l1] == 0xFF)
8194             level1[l1] = count2++;
8195         if (level2[l2] == 0xFF)
8196             level2[l2] = count3++;
8197     }
8198 
8199     if (count2 >= 0xFF || count3 >= 0xFF)
8200         need_dict = 1;
8201 
8202     if (need_dict) {
8203         PyObject *result = PyDict_New();
8204         PyObject *key, *value;
8205         if (!result)
8206             return NULL;
8207         for (i = 0; i < length; i++) {
8208             key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
8209             value = PyLong_FromLong(i);
8210             if (!key || !value)
8211                 goto failed1;
8212             if (PyDict_SetItem(result, key, value) == -1)
8213                 goto failed1;
8214             Py_DECREF(key);
8215             Py_DECREF(value);
8216         }
8217         return result;
8218       failed1:
8219         Py_XDECREF(key);
8220         Py_XDECREF(value);
8221         Py_DECREF(result);
8222         return NULL;
8223     }
8224 
8225     /* Create a three-level trie */
8226     result = PyObject_MALLOC(sizeof(struct encoding_map) +
8227                              16*count2 + 128*count3 - 1);
8228     if (!result)
8229         return PyErr_NoMemory();
8230     PyObject_Init(result, &EncodingMapType);
8231     mresult = (struct encoding_map*)result;
8232     mresult->count2 = count2;
8233     mresult->count3 = count3;
8234     mlevel1 = mresult->level1;
8235     mlevel2 = mresult->level23;
8236     mlevel3 = mresult->level23 + 16*count2;
8237     memcpy(mlevel1, level1, 32);
8238     memset(mlevel2, 0xFF, 16*count2);
8239     memset(mlevel3, 0, 128*count3);
8240     count3 = 0;
8241     for (i = 1; i < length; i++) {
8242         int o1, o2, o3, i2, i3;
8243         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8244         if (ch == 0xFFFE)
8245             /* unmapped character */
8246             continue;
8247         o1 = ch>>11;
8248         o2 = (ch>>7) & 0xF;
8249         i2 = 16*mlevel1[o1] + o2;
8250         if (mlevel2[i2] == 0xFF)
8251             mlevel2[i2] = count3++;
8252         o3 = ch & 0x7F;
8253         i3 = 128*mlevel2[i2] + o3;
8254         mlevel3[i3] = i;
8255     }
8256     return result;
8257 }
8258 
8259 static int
encoding_map_lookup(Py_UCS4 c,PyObject * mapping)8260 encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8261 {
8262     struct encoding_map *map = (struct encoding_map*)mapping;
8263     int l1 = c>>11;
8264     int l2 = (c>>7) & 0xF;
8265     int l3 = c & 0x7F;
8266     int i;
8267 
8268     if (c > 0xFFFF)
8269         return -1;
8270     if (c == 0)
8271         return 0;
8272     /* level 1*/
8273     i = map->level1[l1];
8274     if (i == 0xFF) {
8275         return -1;
8276     }
8277     /* level 2*/
8278     i = map->level23[16*i+l2];
8279     if (i == 0xFF) {
8280         return -1;
8281     }
8282     /* level 3 */
8283     i = map->level23[16*map->count2 + 128*i + l3];
8284     if (i == 0) {
8285         return -1;
8286     }
8287     return i;
8288 }
8289 
8290 /* Lookup the character ch in the mapping. If the character
8291    can't be found, Py_None is returned (or NULL, if another
8292    error occurred). */
8293 static PyObject *
charmapencode_lookup(Py_UCS4 c,PyObject * mapping)8294 charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8295 {
8296     PyObject *w = PyLong_FromLong((long)c);
8297     PyObject *x;
8298 
8299     if (w == NULL)
8300         return NULL;
8301     x = PyObject_GetItem(mapping, w);
8302     Py_DECREF(w);
8303     if (x == NULL) {
8304         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8305             /* No mapping found means: mapping is undefined. */
8306             PyErr_Clear();
8307             Py_RETURN_NONE;
8308         } else
8309             return NULL;
8310     }
8311     else if (x == Py_None)
8312         return x;
8313     else if (PyLong_Check(x)) {
8314         long value = PyLong_AS_LONG(x);
8315         if (value < 0 || value > 255) {
8316             PyErr_SetString(PyExc_TypeError,
8317                             "character mapping must be in range(256)");
8318             Py_DECREF(x);
8319             return NULL;
8320         }
8321         return x;
8322     }
8323     else if (PyBytes_Check(x))
8324         return x;
8325     else {
8326         /* wrong return value */
8327         PyErr_Format(PyExc_TypeError,
8328                      "character mapping must return integer, bytes or None, not %.400s",
8329                      x->ob_type->tp_name);
8330         Py_DECREF(x);
8331         return NULL;
8332     }
8333 }
8334 
8335 static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)8336 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8337 {
8338     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8339     /* exponentially overallocate to minimize reallocations */
8340     if (requiredsize < 2*outsize)
8341         requiredsize = 2*outsize;
8342     if (_PyBytes_Resize(outobj, requiredsize))
8343         return -1;
8344     return 0;
8345 }
8346 
/* Outcome of encoding one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
8350 /* lookup the character, put the result in the output string and adjust
8351    various state variables. Resize the output bytes object if not enough
8352    space is available. Return a new reference to the object that
8353    was put in the output buffer, or Py_None, if the mapping was undefined
8354    (in which case no character was written) or NULL, if a
8355    reallocation error occurred. The caller must decref the result */
8356 static charmapencode_result
charmapencode_output(Py_UCS4 c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)8357 charmapencode_output(Py_UCS4 c, PyObject *mapping,
8358                      PyObject **outobj, Py_ssize_t *outpos)
8359 {
8360     PyObject *rep;
8361     char *outstart;
8362     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8363 
8364     if (Py_TYPE(mapping) == &EncodingMapType) {
8365         int res = encoding_map_lookup(c, mapping);
8366         Py_ssize_t requiredsize = *outpos+1;
8367         if (res == -1)
8368             return enc_FAILED;
8369         if (outsize<requiredsize)
8370             if (charmapencode_resize(outobj, outpos, requiredsize))
8371                 return enc_EXCEPTION;
8372         outstart = PyBytes_AS_STRING(*outobj);
8373         outstart[(*outpos)++] = (char)res;
8374         return enc_SUCCESS;
8375     }
8376 
8377     rep = charmapencode_lookup(c, mapping);
8378     if (rep==NULL)
8379         return enc_EXCEPTION;
8380     else if (rep==Py_None) {
8381         Py_DECREF(rep);
8382         return enc_FAILED;
8383     } else {
8384         if (PyLong_Check(rep)) {
8385             Py_ssize_t requiredsize = *outpos+1;
8386             if (outsize<requiredsize)
8387                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8388                     Py_DECREF(rep);
8389                     return enc_EXCEPTION;
8390                 }
8391             outstart = PyBytes_AS_STRING(*outobj);
8392             outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8393         }
8394         else {
8395             const char *repchars = PyBytes_AS_STRING(rep);
8396             Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8397             Py_ssize_t requiredsize = *outpos+repsize;
8398             if (outsize<requiredsize)
8399                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8400                     Py_DECREF(rep);
8401                     return enc_EXCEPTION;
8402                 }
8403             outstart = PyBytes_AS_STRING(*outobj);
8404             memcpy(outstart + *outpos, repchars, repsize);
8405             *outpos += repsize;
8406         }
8407     }
8408     Py_DECREF(rep);
8409     return enc_SUCCESS;
8410 }
8411 
8412 /* handle an error in PyUnicode_EncodeCharmap
8413    Return 0 on success, -1 on error */
8414 static int
charmap_encoding_error(PyObject * unicode,Py_ssize_t * inpos,PyObject * mapping,PyObject ** exceptionObject,_Py_error_handler * error_handler,PyObject ** error_handler_obj,const char * errors,PyObject ** res,Py_ssize_t * respos)8415 charmap_encoding_error(
8416     PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8417     PyObject **exceptionObject,
8418     _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8419     PyObject **res, Py_ssize_t *respos)
8420 {
8421     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8422     Py_ssize_t size, repsize;
8423     Py_ssize_t newpos;
8424     enum PyUnicode_Kind kind;
8425     void *data;
8426     Py_ssize_t index;
8427     /* startpos for collecting unencodable chars */
8428     Py_ssize_t collstartpos = *inpos;
8429     Py_ssize_t collendpos = *inpos+1;
8430     Py_ssize_t collpos;
8431     const char *encoding = "charmap";
8432     const char *reason = "character maps to <undefined>";
8433     charmapencode_result x;
8434     Py_UCS4 ch;
8435     int val;
8436 
8437     if (PyUnicode_READY(unicode) == -1)
8438         return -1;
8439     size = PyUnicode_GET_LENGTH(unicode);
8440     /* find all unencodable characters */
8441     while (collendpos < size) {
8442         PyObject *rep;
8443         if (Py_TYPE(mapping) == &EncodingMapType) {
8444             ch = PyUnicode_READ_CHAR(unicode, collendpos);
8445             val = encoding_map_lookup(ch, mapping);
8446             if (val != -1)
8447                 break;
8448             ++collendpos;
8449             continue;
8450         }
8451 
8452         ch = PyUnicode_READ_CHAR(unicode, collendpos);
8453         rep = charmapencode_lookup(ch, mapping);
8454         if (rep==NULL)
8455             return -1;
8456         else if (rep!=Py_None) {
8457             Py_DECREF(rep);
8458             break;
8459         }
8460         Py_DECREF(rep);
8461         ++collendpos;
8462     }
8463     /* cache callback name lookup
8464      * (if not done yet, i.e. it's the first error) */
8465     if (*error_handler == _Py_ERROR_UNKNOWN)
8466         *error_handler = get_error_handler(errors);
8467 
8468     switch (*error_handler) {
8469     case _Py_ERROR_STRICT:
8470         raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8471         return -1;
8472 
8473     case _Py_ERROR_REPLACE:
8474         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8475             x = charmapencode_output('?', mapping, res, respos);
8476             if (x==enc_EXCEPTION) {
8477                 return -1;
8478             }
8479             else if (x==enc_FAILED) {
8480                 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8481                 return -1;
8482             }
8483         }
8484         /* fall through */
8485     case _Py_ERROR_IGNORE:
8486         *inpos = collendpos;
8487         break;
8488 
8489     case _Py_ERROR_XMLCHARREFREPLACE:
8490         /* generate replacement (temporarily (mis)uses p) */
8491         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8492             char buffer[2+29+1+1];
8493             char *cp;
8494             sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8495             for (cp = buffer; *cp; ++cp) {
8496                 x = charmapencode_output(*cp, mapping, res, respos);
8497                 if (x==enc_EXCEPTION)
8498                     return -1;
8499                 else if (x==enc_FAILED) {
8500                     raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8501                     return -1;
8502                 }
8503             }
8504         }
8505         *inpos = collendpos;
8506         break;
8507 
8508     default:
8509         repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8510                                                       encoding, reason, unicode, exceptionObject,
8511                                                       collstartpos, collendpos, &newpos);
8512         if (repunicode == NULL)
8513             return -1;
8514         if (PyBytes_Check(repunicode)) {
8515             /* Directly copy bytes result to output. */
8516             Py_ssize_t outsize = PyBytes_Size(*res);
8517             Py_ssize_t requiredsize;
8518             repsize = PyBytes_Size(repunicode);
8519             requiredsize = *respos + repsize;
8520             if (requiredsize > outsize)
8521                 /* Make room for all additional bytes. */
8522                 if (charmapencode_resize(res, respos, requiredsize)) {
8523                     Py_DECREF(repunicode);
8524                     return -1;
8525                 }
8526             memcpy(PyBytes_AsString(*res) + *respos,
8527                    PyBytes_AsString(repunicode),  repsize);
8528             *respos += repsize;
8529             *inpos = newpos;
8530             Py_DECREF(repunicode);
8531             break;
8532         }
8533         /* generate replacement  */
8534         if (PyUnicode_READY(repunicode) == -1) {
8535             Py_DECREF(repunicode);
8536             return -1;
8537         }
8538         repsize = PyUnicode_GET_LENGTH(repunicode);
8539         data = PyUnicode_DATA(repunicode);
8540         kind = PyUnicode_KIND(repunicode);
8541         for (index = 0; index < repsize; index++) {
8542             Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8543             x = charmapencode_output(repch, mapping, res, respos);
8544             if (x==enc_EXCEPTION) {
8545                 Py_DECREF(repunicode);
8546                 return -1;
8547             }
8548             else if (x==enc_FAILED) {
8549                 Py_DECREF(repunicode);
8550                 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8551                 return -1;
8552             }
8553         }
8554         *inpos = newpos;
8555         Py_DECREF(repunicode);
8556     }
8557     return 0;
8558 }
8559 
8560 PyObject *
_PyUnicode_EncodeCharmap(PyObject * unicode,PyObject * mapping,const char * errors)8561 _PyUnicode_EncodeCharmap(PyObject *unicode,
8562                          PyObject *mapping,
8563                          const char *errors)
8564 {
8565     /* output object */
8566     PyObject *res = NULL;
8567     /* current input position */
8568     Py_ssize_t inpos = 0;
8569     Py_ssize_t size;
8570     /* current output position */
8571     Py_ssize_t respos = 0;
8572     PyObject *error_handler_obj = NULL;
8573     PyObject *exc = NULL;
8574     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8575     void *data;
8576     int kind;
8577 
8578     if (PyUnicode_READY(unicode) == -1)
8579         return NULL;
8580     size = PyUnicode_GET_LENGTH(unicode);
8581     data = PyUnicode_DATA(unicode);
8582     kind = PyUnicode_KIND(unicode);
8583 
8584     /* Default to Latin-1 */
8585     if (mapping == NULL)
8586         return unicode_encode_ucs1(unicode, errors, 256);
8587 
8588     /* allocate enough for a simple encoding without
8589        replacements, if we need more, we'll resize */
8590     res = PyBytes_FromStringAndSize(NULL, size);
8591     if (res == NULL)
8592         goto onError;
8593     if (size == 0)
8594         return res;
8595 
8596     while (inpos<size) {
8597         Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8598         /* try to encode it */
8599         charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8600         if (x==enc_EXCEPTION) /* error */
8601             goto onError;
8602         if (x==enc_FAILED) { /* unencodable character */
8603             if (charmap_encoding_error(unicode, &inpos, mapping,
8604                                        &exc,
8605                                        &error_handler, &error_handler_obj, errors,
8606                                        &res, &respos)) {
8607                 goto onError;
8608             }
8609         }
8610         else
8611             /* done with this character => adjust input position */
8612             ++inpos;
8613     }
8614 
8615     /* Resize if we allocated to much */
8616     if (respos<PyBytes_GET_SIZE(res))
8617         if (_PyBytes_Resize(&res, respos) < 0)
8618             goto onError;
8619 
8620     Py_XDECREF(exc);
8621     Py_XDECREF(error_handler_obj);
8622     return res;
8623 
8624   onError:
8625     Py_XDECREF(res);
8626     Py_XDECREF(exc);
8627     Py_XDECREF(error_handler_obj);
8628     return NULL;
8629 }
8630 
8631 /* Deprecated */
8632 PyObject *
PyUnicode_EncodeCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)8633 PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8634                         Py_ssize_t size,
8635                         PyObject *mapping,
8636                         const char *errors)
8637 {
8638     PyObject *result;
8639     PyObject *unicode = PyUnicode_FromWideChar(p, size);
8640     if (unicode == NULL)
8641         return NULL;
8642     result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8643     Py_DECREF(unicode);
8644     return result;
8645 }
8646 
8647 PyObject *
PyUnicode_AsCharmapString(PyObject * unicode,PyObject * mapping)8648 PyUnicode_AsCharmapString(PyObject *unicode,
8649                           PyObject *mapping)
8650 {
8651     if (!PyUnicode_Check(unicode) || mapping == NULL) {
8652         PyErr_BadArgument();
8653         return NULL;
8654     }
8655     return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8656 }
8657 
8658 /* create or adjust a UnicodeTranslateError */
8659 static void
make_translate_exception(PyObject ** exceptionObject,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)8660 make_translate_exception(PyObject **exceptionObject,
8661                          PyObject *unicode,
8662                          Py_ssize_t startpos, Py_ssize_t endpos,
8663                          const char *reason)
8664 {
8665     if (*exceptionObject == NULL) {
8666         *exceptionObject = _PyUnicodeTranslateError_Create(
8667             unicode, startpos, endpos, reason);
8668     }
8669     else {
8670         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8671             goto onError;
8672         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8673             goto onError;
8674         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8675             goto onError;
8676         return;
8677       onError:
8678         Py_CLEAR(*exceptionObject);
8679     }
8680 }
8681 
8682 /* error handling callback helper:
8683    build arguments, call the callback and check the arguments,
8684    put the result into newpos and return the replacement string, which
8685    has to be freed by the caller */
8686 static PyObject *
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)8687 unicode_translate_call_errorhandler(const char *errors,
8688                                     PyObject **errorHandler,
8689                                     const char *reason,
8690                                     PyObject *unicode, PyObject **exceptionObject,
8691                                     Py_ssize_t startpos, Py_ssize_t endpos,
8692                                     Py_ssize_t *newpos)
8693 {
8694     static const char *argparse = "Un;translating error handler must return (str, int) tuple";
8695 
8696     Py_ssize_t i_newpos;
8697     PyObject *restuple;
8698     PyObject *resunicode;
8699 
8700     if (*errorHandler == NULL) {
8701         *errorHandler = PyCodec_LookupError(errors);
8702         if (*errorHandler == NULL)
8703             return NULL;
8704     }
8705 
8706     make_translate_exception(exceptionObject,
8707                              unicode, startpos, endpos, reason);
8708     if (*exceptionObject == NULL)
8709         return NULL;
8710 
8711     restuple = PyObject_CallFunctionObjArgs(
8712         *errorHandler, *exceptionObject, NULL);
8713     if (restuple == NULL)
8714         return NULL;
8715     if (!PyTuple_Check(restuple)) {
8716         PyErr_SetString(PyExc_TypeError, &argparse[3]);
8717         Py_DECREF(restuple);
8718         return NULL;
8719     }
8720     if (!PyArg_ParseTuple(restuple, argparse,
8721                           &resunicode, &i_newpos)) {
8722         Py_DECREF(restuple);
8723         return NULL;
8724     }
8725     if (i_newpos<0)
8726         *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8727     else
8728         *newpos = i_newpos;
8729     if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8730         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8731         Py_DECREF(restuple);
8732         return NULL;
8733     }
8734     Py_INCREF(resunicode);
8735     Py_DECREF(restuple);
8736     return resunicode;
8737 }
8738 
8739 /* Lookup the character ch in the mapping and put the result in result,
8740    which must be decrefed by the caller.
8741    Return 0 on success, -1 on error */
8742 static int
charmaptranslate_lookup(Py_UCS4 c,PyObject * mapping,PyObject ** result)8743 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8744 {
8745     PyObject *w = PyLong_FromLong((long)c);
8746     PyObject *x;
8747 
8748     if (w == NULL)
8749         return -1;
8750     x = PyObject_GetItem(mapping, w);
8751     Py_DECREF(w);
8752     if (x == NULL) {
8753         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8754             /* No mapping found means: use 1:1 mapping. */
8755             PyErr_Clear();
8756             *result = NULL;
8757             return 0;
8758         } else
8759             return -1;
8760     }
8761     else if (x == Py_None) {
8762         *result = x;
8763         return 0;
8764     }
8765     else if (PyLong_Check(x)) {
8766         long value = PyLong_AS_LONG(x);
8767         if (value < 0 || value > MAX_UNICODE) {
8768             PyErr_Format(PyExc_ValueError,
8769                          "character mapping must be in range(0x%x)",
8770                          MAX_UNICODE+1);
8771             Py_DECREF(x);
8772             return -1;
8773         }
8774         *result = x;
8775         return 0;
8776     }
8777     else if (PyUnicode_Check(x)) {
8778         *result = x;
8779         return 0;
8780     }
8781     else {
8782         /* wrong return value */
8783         PyErr_SetString(PyExc_TypeError,
8784                         "character mapping must return integer, None or str");
8785         Py_DECREF(x);
8786         return -1;
8787     }
8788 }
8789 
8790 /* lookup the character, write the result into the writer.
8791    Return 1 if the result was written into the writer, return 0 if the mapping
8792    was undefined, raise an exception return -1 on error. */
8793 static int
charmaptranslate_output(Py_UCS4 ch,PyObject * mapping,_PyUnicodeWriter * writer)8794 charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8795                         _PyUnicodeWriter *writer)
8796 {
8797     PyObject *item;
8798 
8799     if (charmaptranslate_lookup(ch, mapping, &item))
8800         return -1;
8801 
8802     if (item == NULL) {
8803         /* not found => default to 1:1 mapping */
8804         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8805             return -1;
8806         }
8807         return 1;
8808     }
8809 
8810     if (item == Py_None) {
8811         Py_DECREF(item);
8812         return 0;
8813     }
8814 
8815     if (PyLong_Check(item)) {
8816         long ch = (Py_UCS4)PyLong_AS_LONG(item);
8817         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8818            used it */
8819         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8820             Py_DECREF(item);
8821             return -1;
8822         }
8823         Py_DECREF(item);
8824         return 1;
8825     }
8826 
8827     if (!PyUnicode_Check(item)) {
8828         Py_DECREF(item);
8829         return -1;
8830     }
8831 
8832     if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8833         Py_DECREF(item);
8834         return -1;
8835     }
8836 
8837     Py_DECREF(item);
8838     return 1;
8839 }
8840 
8841 static int
unicode_fast_translate_lookup(PyObject * mapping,Py_UCS1 ch,Py_UCS1 * translate)8842 unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8843                               Py_UCS1 *translate)
8844 {
8845     PyObject *item = NULL;
8846     int ret = 0;
8847 
8848     if (charmaptranslate_lookup(ch, mapping, &item)) {
8849         return -1;
8850     }
8851 
8852     if (item == Py_None) {
8853         /* deletion */
8854         translate[ch] = 0xfe;
8855     }
8856     else if (item == NULL) {
8857         /* not found => default to 1:1 mapping */
8858         translate[ch] = ch;
8859         return 1;
8860     }
8861     else if (PyLong_Check(item)) {
8862         long replace = PyLong_AS_LONG(item);
8863         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8864            used it */
8865         if (127 < replace) {
8866             /* invalid character or character outside ASCII:
8867                skip the fast translate */
8868             goto exit;
8869         }
8870         translate[ch] = (Py_UCS1)replace;
8871     }
8872     else if (PyUnicode_Check(item)) {
8873         Py_UCS4 replace;
8874 
8875         if (PyUnicode_READY(item) == -1) {
8876             Py_DECREF(item);
8877             return -1;
8878         }
8879         if (PyUnicode_GET_LENGTH(item) != 1)
8880             goto exit;
8881 
8882         replace = PyUnicode_READ_CHAR(item, 0);
8883         if (replace > 127)
8884             goto exit;
8885         translate[ch] = (Py_UCS1)replace;
8886     }
8887     else {
8888         /* not None, NULL, long or unicode */
8889         goto exit;
8890     }
8891     ret = 1;
8892 
8893   exit:
8894     Py_DECREF(item);
8895     return ret;
8896 }
8897 
8898 /* Fast path for ascii => ascii translation. Return 1 if the whole string
8899    was translated into writer, return 0 if the input string was partially
8900    translated into writer, raise an exception and return -1 on error. */
8901 static int
unicode_fast_translate(PyObject * input,PyObject * mapping,_PyUnicodeWriter * writer,int ignore,Py_ssize_t * input_pos)8902 unicode_fast_translate(PyObject *input, PyObject *mapping,
8903                        _PyUnicodeWriter *writer, int ignore,
8904                        Py_ssize_t *input_pos)
8905 {
8906     Py_UCS1 ascii_table[128], ch, ch2;
8907     Py_ssize_t len;
8908     Py_UCS1 *in, *end, *out;
8909     int res = 0;
8910 
8911     len = PyUnicode_GET_LENGTH(input);
8912 
8913     memset(ascii_table, 0xff, 128);
8914 
8915     in = PyUnicode_1BYTE_DATA(input);
8916     end = in + len;
8917 
8918     assert(PyUnicode_IS_ASCII(writer->buffer));
8919     assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8920     out = PyUnicode_1BYTE_DATA(writer->buffer);
8921 
8922     for (; in < end; in++) {
8923         ch = *in;
8924         ch2 = ascii_table[ch];
8925         if (ch2 == 0xff) {
8926             int translate = unicode_fast_translate_lookup(mapping, ch,
8927                                                           ascii_table);
8928             if (translate < 0)
8929                 return -1;
8930             if (translate == 0)
8931                 goto exit;
8932             ch2 = ascii_table[ch];
8933         }
8934         if (ch2 == 0xfe) {
8935             if (ignore)
8936                 continue;
8937             goto exit;
8938         }
8939         assert(ch2 < 128);
8940         *out = ch2;
8941         out++;
8942     }
8943     res = 1;
8944 
8945 exit:
8946     writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8947     *input_pos = in - PyUnicode_1BYTE_DATA(input);
8948     return res;
8949 }
8950 
8951 static PyObject *
_PyUnicode_TranslateCharmap(PyObject * input,PyObject * mapping,const char * errors)8952 _PyUnicode_TranslateCharmap(PyObject *input,
8953                             PyObject *mapping,
8954                             const char *errors)
8955 {
8956     /* input object */
8957     char *data;
8958     Py_ssize_t size, i;
8959     int kind;
8960     /* output buffer */
8961     _PyUnicodeWriter writer;
8962     /* error handler */
8963     const char *reason = "character maps to <undefined>";
8964     PyObject *errorHandler = NULL;
8965     PyObject *exc = NULL;
8966     int ignore;
8967     int res;
8968 
8969     if (mapping == NULL) {
8970         PyErr_BadArgument();
8971         return NULL;
8972     }
8973 
8974     if (PyUnicode_READY(input) == -1)
8975         return NULL;
8976     data = (char*)PyUnicode_DATA(input);
8977     kind = PyUnicode_KIND(input);
8978     size = PyUnicode_GET_LENGTH(input);
8979 
8980     if (size == 0)
8981         return PyUnicode_FromObject(input);
8982 
8983     /* allocate enough for a simple 1:1 translation without
8984        replacements, if we need more, we'll resize */
8985     _PyUnicodeWriter_Init(&writer);
8986     if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
8987         goto onError;
8988 
8989     ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8990 
8991     if (PyUnicode_READY(input) == -1)
8992         return NULL;
8993     if (PyUnicode_IS_ASCII(input)) {
8994         res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8995         if (res < 0) {
8996             _PyUnicodeWriter_Dealloc(&writer);
8997             return NULL;
8998         }
8999         if (res == 1)
9000             return _PyUnicodeWriter_Finish(&writer);
9001     }
9002     else {
9003         i = 0;
9004     }
9005 
9006     while (i<size) {
9007         /* try to encode it */
9008         int translate;
9009         PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9010         Py_ssize_t newpos;
9011         /* startpos for collecting untranslatable chars */
9012         Py_ssize_t collstart;
9013         Py_ssize_t collend;
9014         Py_UCS4 ch;
9015 
9016         ch = PyUnicode_READ(kind, data, i);
9017         translate = charmaptranslate_output(ch, mapping, &writer);
9018         if (translate < 0)
9019             goto onError;
9020 
9021         if (translate != 0) {
9022             /* it worked => adjust input pointer */
9023             ++i;
9024             continue;
9025         }
9026 
9027         /* untranslatable character */
9028         collstart = i;
9029         collend = i+1;
9030 
9031         /* find all untranslatable characters */
9032         while (collend < size) {
9033             PyObject *x;
9034             ch = PyUnicode_READ(kind, data, collend);
9035             if (charmaptranslate_lookup(ch, mapping, &x))
9036                 goto onError;
9037             Py_XDECREF(x);
9038             if (x != Py_None)
9039                 break;
9040             ++collend;
9041         }
9042 
9043         if (ignore) {
9044             i = collend;
9045         }
9046         else {
9047             repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9048                                                              reason, input, &exc,
9049                                                              collstart, collend, &newpos);
9050             if (repunicode == NULL)
9051                 goto onError;
9052             if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9053                 Py_DECREF(repunicode);
9054                 goto onError;
9055             }
9056             Py_DECREF(repunicode);
9057             i = newpos;
9058         }
9059     }
9060     Py_XDECREF(exc);
9061     Py_XDECREF(errorHandler);
9062     return _PyUnicodeWriter_Finish(&writer);
9063 
9064   onError:
9065     _PyUnicodeWriter_Dealloc(&writer);
9066     Py_XDECREF(exc);
9067     Py_XDECREF(errorHandler);
9068     return NULL;
9069 }
9070 
9071 /* Deprecated. Use PyUnicode_Translate instead. */
9072 PyObject *
PyUnicode_TranslateCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)9073 PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9074                            Py_ssize_t size,
9075                            PyObject *mapping,
9076                            const char *errors)
9077 {
9078     PyObject *result;
9079     PyObject *unicode = PyUnicode_FromWideChar(p, size);
9080     if (!unicode)
9081         return NULL;
9082     result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9083     Py_DECREF(unicode);
9084     return result;
9085 }
9086 
9087 PyObject *
PyUnicode_Translate(PyObject * str,PyObject * mapping,const char * errors)9088 PyUnicode_Translate(PyObject *str,
9089                     PyObject *mapping,
9090                     const char *errors)
9091 {
9092     if (ensure_unicode(str) < 0)
9093         return NULL;
9094     return _PyUnicode_TranslateCharmap(str, mapping, errors);
9095 }
9096 
9097 PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject * unicode)9098 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9099 {
9100     if (!PyUnicode_Check(unicode)) {
9101         PyErr_BadInternalCall();
9102         return NULL;
9103     }
9104     if (PyUnicode_READY(unicode) == -1)
9105         return NULL;
9106     if (PyUnicode_IS_ASCII(unicode)) {
9107         /* If the string is already ASCII, just return the same string */
9108         Py_INCREF(unicode);
9109         return unicode;
9110     }
9111 
9112     Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9113     PyObject *result = PyUnicode_New(len, 127);
9114     if (result == NULL) {
9115         return NULL;
9116     }
9117 
9118     Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9119     int kind = PyUnicode_KIND(unicode);
9120     const void *data = PyUnicode_DATA(unicode);
9121     Py_ssize_t i;
9122     for (i = 0; i < len; ++i) {
9123         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9124         if (ch < 127) {
9125             out[i] = ch;
9126         }
9127         else if (Py_UNICODE_ISSPACE(ch)) {
9128             out[i] = ' ';
9129         }
9130         else {
9131             int decimal = Py_UNICODE_TODECIMAL(ch);
9132             if (decimal < 0) {
9133                 out[i] = '?';
9134                 out[i+1] = '\0';
9135                 _PyUnicode_LENGTH(result) = i + 1;
9136                 break;
9137             }
9138             out[i] = '0' + decimal;
9139         }
9140     }
9141 
9142     assert(_PyUnicode_CheckConsistency(result, 1));
9143     return result;
9144 }
9145 
9146 PyObject *
PyUnicode_TransformDecimalToASCII(Py_UNICODE * s,Py_ssize_t length)9147 PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9148                                   Py_ssize_t length)
9149 {
9150     PyObject *decimal;
9151     Py_ssize_t i;
9152     Py_UCS4 maxchar;
9153     enum PyUnicode_Kind kind;
9154     void *data;
9155 
9156     maxchar = 127;
9157     for (i = 0; i < length; i++) {
9158         Py_UCS4 ch = s[i];
9159         if (ch > 127) {
9160             int decimal = Py_UNICODE_TODECIMAL(ch);
9161             if (decimal >= 0)
9162                 ch = '0' + decimal;
9163             maxchar = Py_MAX(maxchar, ch);
9164         }
9165     }
9166 
9167     /* Copy to a new string */
9168     decimal = PyUnicode_New(length, maxchar);
9169     if (decimal == NULL)
9170         return decimal;
9171     kind = PyUnicode_KIND(decimal);
9172     data = PyUnicode_DATA(decimal);
9173     /* Iterate over code points */
9174     for (i = 0; i < length; i++) {
9175         Py_UCS4 ch = s[i];
9176         if (ch > 127) {
9177             int decimal = Py_UNICODE_TODECIMAL(ch);
9178             if (decimal >= 0)
9179                 ch = '0' + decimal;
9180         }
9181         PyUnicode_WRITE(kind, data, i, ch);
9182     }
9183     return unicode_result(decimal);
9184 }
9185 /* --- Decimal Encoder ---------------------------------------------------- */
9186 
9187 int
PyUnicode_EncodeDecimal(Py_UNICODE * s,Py_ssize_t length,char * output,const char * errors)9188 PyUnicode_EncodeDecimal(Py_UNICODE *s,
9189                         Py_ssize_t length,
9190                         char *output,
9191                         const char *errors)
9192 {
9193     PyObject *unicode;
9194     Py_ssize_t i;
9195     enum PyUnicode_Kind kind;
9196     void *data;
9197 
9198     if (output == NULL) {
9199         PyErr_BadArgument();
9200         return -1;
9201     }
9202 
9203     unicode = PyUnicode_FromWideChar(s, length);
9204     if (unicode == NULL)
9205         return -1;
9206 
9207     kind = PyUnicode_KIND(unicode);
9208     data = PyUnicode_DATA(unicode);
9209 
9210     for (i=0; i < length; ) {
9211         PyObject *exc;
9212         Py_UCS4 ch;
9213         int decimal;
9214         Py_ssize_t startpos;
9215 
9216         ch = PyUnicode_READ(kind, data, i);
9217 
9218         if (Py_UNICODE_ISSPACE(ch)) {
9219             *output++ = ' ';
9220             i++;
9221             continue;
9222         }
9223         decimal = Py_UNICODE_TODECIMAL(ch);
9224         if (decimal >= 0) {
9225             *output++ = '0' + decimal;
9226             i++;
9227             continue;
9228         }
9229         if (0 < ch && ch < 256) {
9230             *output++ = (char)ch;
9231             i++;
9232             continue;
9233         }
9234 
9235         startpos = i;
9236         exc = NULL;
9237         raise_encode_exception(&exc, "decimal", unicode,
9238                                startpos, startpos+1,
9239                                "invalid decimal Unicode string");
9240         Py_XDECREF(exc);
9241         Py_DECREF(unicode);
9242         return -1;
9243     }
9244     /* 0-terminate the output string */
9245     *output++ = '\0';
9246     Py_DECREF(unicode);
9247     return 0;
9248 }
9249 
9250 /* --- Helpers ------------------------------------------------------------ */
9251 
/* Helper macro to fix up start/end slice values in place.

   end is clipped to len; a negative end or start has len added to it and
   is then clamped to 0 if still negative.  start and end must be
   modifiable lvalues; every argument may be evaluated more than once.

   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement, e.g. as the unbraced body of an if/else.  Use it with
   a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9266 
9267 static Py_ssize_t
any_find_slice(PyObject * s1,PyObject * s2,Py_ssize_t start,Py_ssize_t end,int direction)9268 any_find_slice(PyObject* s1, PyObject* s2,
9269                Py_ssize_t start,
9270                Py_ssize_t end,
9271                int direction)
9272 {
9273     int kind1, kind2;
9274     void *buf1, *buf2;
9275     Py_ssize_t len1, len2, result;
9276 
9277     kind1 = PyUnicode_KIND(s1);
9278     kind2 = PyUnicode_KIND(s2);
9279     if (kind1 < kind2)
9280         return -1;
9281 
9282     len1 = PyUnicode_GET_LENGTH(s1);
9283     len2 = PyUnicode_GET_LENGTH(s2);
9284     ADJUST_INDICES(start, end, len1);
9285     if (end - start < len2)
9286         return -1;
9287 
9288     buf1 = PyUnicode_DATA(s1);
9289     buf2 = PyUnicode_DATA(s2);
9290     if (len2 == 1) {
9291         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9292         result = findchar((const char *)buf1 + kind1*start,
9293                           kind1, end - start, ch, direction);
9294         if (result == -1)
9295             return -1;
9296         else
9297             return start + result;
9298     }
9299 
9300     if (kind2 != kind1) {
9301         buf2 = _PyUnicode_AsKind(s2, kind1);
9302         if (!buf2)
9303             return -2;
9304     }
9305 
9306     if (direction > 0) {
9307         switch (kind1) {
9308         case PyUnicode_1BYTE_KIND:
9309             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9310                 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9311             else
9312                 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9313             break;
9314         case PyUnicode_2BYTE_KIND:
9315             result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9316             break;
9317         case PyUnicode_4BYTE_KIND:
9318             result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9319             break;
9320         default:
9321             Py_UNREACHABLE();
9322         }
9323     }
9324     else {
9325         switch (kind1) {
9326         case PyUnicode_1BYTE_KIND:
9327             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9328                 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9329             else
9330                 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9331             break;
9332         case PyUnicode_2BYTE_KIND:
9333             result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9334             break;
9335         case PyUnicode_4BYTE_KIND:
9336             result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9337             break;
9338         default:
9339             Py_UNREACHABLE();
9340         }
9341     }
9342 
9343     if (kind2 != kind1)
9344         PyMem_Free(buf2);
9345 
9346     return result;
9347 }
9348 
9349 /* _PyUnicode_InsertThousandsGrouping() helper functions */
9350 #include "stringlib/localeutil.h"
9351 
/**
 * InsertThousandsGrouping:
 * @writer: Unicode writer, or NULL to select counting mode.
 * @n_buffer: Initial output position: added to writer->pos in filling
 *            mode, used as is in counting mode.
 * @digits: Digits we're reading from. Must be non-NULL in filling mode
 *          and NULL in counting mode (both are asserted).
 * @d_pos: Start of digits string.
 * @n_digits: The number of digits in the string, in which we want
 *            to put the grouping chars.
 * @min_width: The minimum width of the digits in the output string.
 *             Output will be zero-padded on the left to fill.
 *             Negative values are treated as 0.
 * @grouping: see definition in localeconv().
 * @thousands_sep: see definition in localeconv().
 * @maxchar: Must be NULL in filling mode and non-NULL in counting mode
 *           (both are asserted). In counting mode it is initialized to
 *           127 and then passed on to InsertThousandsGrouping_fill().
 *
 * There are 2 modes: counting and filling. If @writer is NULL,
 *  we are in counting mode, else filling mode.
 * If counting, the required buffer size is returned.
 * If filling, we know the buffer will be large enough, so we don't
 *  need to pass in the buffer size.
 * Inserts thousand grouping characters (as defined by grouping and
 *  thousands_sep) into @writer.
 *
 * Return value: -1 on error, number of characters otherwise.
 **/
Py_ssize_t
_PyUnicode_InsertThousandsGrouping(
    _PyUnicodeWriter *writer,
    Py_ssize_t n_buffer,
    PyObject *digits,
    Py_ssize_t d_pos,
    Py_ssize_t n_digits,
    Py_ssize_t min_width,
    const char *grouping,
    PyObject *thousands_sep,
    Py_UCS4 *maxchar)
{
    min_width = Py_MAX(0, min_width);
    /* filling mode needs digits and no maxchar; counting mode the reverse */
    if (writer) {
        assert(digits != NULL);
        assert(maxchar == NULL);
    }
    else {
        assert(digits == NULL);
        assert(maxchar != NULL);
    }
    assert(0 <= d_pos);
    assert(0 <= n_digits);
    assert(grouping != NULL);

    if (digits != NULL) {
        if (PyUnicode_READY(digits) == -1) {
            return -1;
        }
    }
    if (PyUnicode_READY(thousands_sep) == -1) {
        return -1;
    }

    Py_ssize_t count = 0;
    Py_ssize_t n_zeros;
    int loop_broken = 0;
    int use_separator = 0; /* First time through, don't append the
                              separator. They only go between
                              groups. */
    Py_ssize_t buffer_pos;
    Py_ssize_t digits_pos;
    Py_ssize_t len;
    Py_ssize_t n_chars;
    Py_ssize_t remaining = n_digits; /* Number of chars remaining to
                                        be looked at */
    /* A generator that returns all of the grouping widths, until it
       returns 0. */
    GroupGenerator groupgen;
    GroupGenerator_init(&groupgen, grouping);
    const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);

    /* if digits are not grouped, thousands separator
       should be an empty string */
    assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));

    /* Both positions start just past the end of their region; presumably
       InsertThousandsGrouping_fill() moves them backwards -- confirm in
       stringlib/localeutil.h. */
    digits_pos = d_pos + n_digits;
    if (writer) {
        buffer_pos = writer->pos + n_buffer;
        assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
        assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
    }
    else {
        buffer_pos = n_buffer;
    }

    if (!writer) {
        *maxchar = 127;
    }

    while ((len = GroupGenerator_next(&groupgen)) > 0) {
        /* Clip the group width to what is still needed (at least 1). */
        len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
        n_zeros = Py_MAX(0, len - remaining);
        n_chars = Py_MAX(0, Py_MIN(remaining, len));

        /* Use n_zeros zeros and n_chars chars */

        /* Add this group (and its separator, if any) to the total. */
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;

        /* Copy into the writer. */
        InsertThousandsGrouping_fill(writer, &buffer_pos,
                                     digits, &digits_pos,
                                     n_chars, n_zeros,
                                     use_separator ? thousands_sep : NULL,
                                     thousands_sep_len, maxchar);

        /* Use a separator next time. */
        use_separator = 1;

        remaining -= n_chars;
        min_width -= len;

        /* Done once all digits are consumed and the width is satisfied. */
        if (remaining <= 0 && min_width <= 0) {
            loop_broken = 1;
            break;
        }
        /* The next group will be preceded by a separator, which also
           counts towards the minimum width. */
        min_width -= thousands_sep_len;
    }
    if (!loop_broken) {
        /* We left the loop without using a break statement:
           the generator ran out of widths, so everything that is left
           goes into one final group. */

        len = Py_MAX(Py_MAX(remaining, min_width), 1);
        n_zeros = Py_MAX(0, len - remaining);
        n_chars = Py_MAX(0, Py_MIN(remaining, len));

        /* Use n_zeros zeros and n_chars chars */
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;

        /* Copy into the writer. */
        InsertThousandsGrouping_fill(writer, &buffer_pos,
                                     digits, &digits_pos,
                                     n_chars, n_zeros,
                                     use_separator ? thousands_sep : NULL,
                                     thousands_sep_len, maxchar);
    }
    return count;
}
9493 
9494 
9495 Py_ssize_t
PyUnicode_Count(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)9496 PyUnicode_Count(PyObject *str,
9497                 PyObject *substr,
9498                 Py_ssize_t start,
9499                 Py_ssize_t end)
9500 {
9501     Py_ssize_t result;
9502     int kind1, kind2;
9503     void *buf1 = NULL, *buf2 = NULL;
9504     Py_ssize_t len1, len2;
9505 
9506     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9507         return -1;
9508 
9509     kind1 = PyUnicode_KIND(str);
9510     kind2 = PyUnicode_KIND(substr);
9511     if (kind1 < kind2)
9512         return 0;
9513 
9514     len1 = PyUnicode_GET_LENGTH(str);
9515     len2 = PyUnicode_GET_LENGTH(substr);
9516     ADJUST_INDICES(start, end, len1);
9517     if (end - start < len2)
9518         return 0;
9519 
9520     buf1 = PyUnicode_DATA(str);
9521     buf2 = PyUnicode_DATA(substr);
9522     if (kind2 != kind1) {
9523         buf2 = _PyUnicode_AsKind(substr, kind1);
9524         if (!buf2)
9525             goto onError;
9526     }
9527 
9528     switch (kind1) {
9529     case PyUnicode_1BYTE_KIND:
9530         if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
9531             result = asciilib_count(
9532                 ((Py_UCS1*)buf1) + start, end - start,
9533                 buf2, len2, PY_SSIZE_T_MAX
9534                 );
9535         else
9536             result = ucs1lib_count(
9537                 ((Py_UCS1*)buf1) + start, end - start,
9538                 buf2, len2, PY_SSIZE_T_MAX
9539                 );
9540         break;
9541     case PyUnicode_2BYTE_KIND:
9542         result = ucs2lib_count(
9543             ((Py_UCS2*)buf1) + start, end - start,
9544             buf2, len2, PY_SSIZE_T_MAX
9545             );
9546         break;
9547     case PyUnicode_4BYTE_KIND:
9548         result = ucs4lib_count(
9549             ((Py_UCS4*)buf1) + start, end - start,
9550             buf2, len2, PY_SSIZE_T_MAX
9551             );
9552         break;
9553     default:
9554         Py_UNREACHABLE();
9555     }
9556 
9557     if (kind2 != kind1)
9558         PyMem_Free(buf2);
9559 
9560     return result;
9561   onError:
9562     if (kind2 != kind1 && buf2)
9563         PyMem_Free(buf2);
9564     return -1;
9565 }
9566 
9567 Py_ssize_t
PyUnicode_Find(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9568 PyUnicode_Find(PyObject *str,
9569                PyObject *substr,
9570                Py_ssize_t start,
9571                Py_ssize_t end,
9572                int direction)
9573 {
9574     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9575         return -2;
9576 
9577     return any_find_slice(str, substr, start, end, direction);
9578 }
9579 
9580 Py_ssize_t
PyUnicode_FindChar(PyObject * str,Py_UCS4 ch,Py_ssize_t start,Py_ssize_t end,int direction)9581 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9582                    Py_ssize_t start, Py_ssize_t end,
9583                    int direction)
9584 {
9585     int kind;
9586     Py_ssize_t len, result;
9587     if (PyUnicode_READY(str) == -1)
9588         return -2;
9589     len = PyUnicode_GET_LENGTH(str);
9590     ADJUST_INDICES(start, end, len);
9591     if (end - start < 1)
9592         return -1;
9593     kind = PyUnicode_KIND(str);
9594     result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9595                       kind, end-start, ch, direction);
9596     if (result == -1)
9597         return -1;
9598     else
9599         return start + result;
9600 }
9601 
9602 static int
tailmatch(PyObject * self,PyObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)9603 tailmatch(PyObject *self,
9604           PyObject *substring,
9605           Py_ssize_t start,
9606           Py_ssize_t end,
9607           int direction)
9608 {
9609     int kind_self;
9610     int kind_sub;
9611     void *data_self;
9612     void *data_sub;
9613     Py_ssize_t offset;
9614     Py_ssize_t i;
9615     Py_ssize_t end_sub;
9616 
9617     if (PyUnicode_READY(self) == -1 ||
9618         PyUnicode_READY(substring) == -1)
9619         return -1;
9620 
9621     ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9622     end -= PyUnicode_GET_LENGTH(substring);
9623     if (end < start)
9624         return 0;
9625 
9626     if (PyUnicode_GET_LENGTH(substring) == 0)
9627         return 1;
9628 
9629     kind_self = PyUnicode_KIND(self);
9630     data_self = PyUnicode_DATA(self);
9631     kind_sub = PyUnicode_KIND(substring);
9632     data_sub = PyUnicode_DATA(substring);
9633     end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9634 
9635     if (direction > 0)
9636         offset = end;
9637     else
9638         offset = start;
9639 
9640     if (PyUnicode_READ(kind_self, data_self, offset) ==
9641         PyUnicode_READ(kind_sub, data_sub, 0) &&
9642         PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9643         PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9644         /* If both are of the same kind, memcmp is sufficient */
9645         if (kind_self == kind_sub) {
9646             return ! memcmp((char *)data_self +
9647                                 (offset * PyUnicode_KIND(substring)),
9648                             data_sub,
9649                             PyUnicode_GET_LENGTH(substring) *
9650                                 PyUnicode_KIND(substring));
9651         }
9652         /* otherwise we have to compare each character by first accessing it */
9653         else {
9654             /* We do not need to compare 0 and len(substring)-1 because
9655                the if statement above ensured already that they are equal
9656                when we end up here. */
9657             for (i = 1; i < end_sub; ++i) {
9658                 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9659                     PyUnicode_READ(kind_sub, data_sub, i))
9660                     return 0;
9661             }
9662             return 1;
9663         }
9664     }
9665 
9666     return 0;
9667 }
9668 
9669 Py_ssize_t
PyUnicode_Tailmatch(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9670 PyUnicode_Tailmatch(PyObject *str,
9671                     PyObject *substr,
9672                     Py_ssize_t start,
9673                     Py_ssize_t end,
9674                     int direction)
9675 {
9676     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9677         return -1;
9678 
9679     return tailmatch(str, substr, start, end, direction);
9680 }
9681 
9682 static PyObject *
ascii_upper_or_lower(PyObject * self,int lower)9683 ascii_upper_or_lower(PyObject *self, int lower)
9684 {
9685     Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9686     char *resdata, *data = PyUnicode_DATA(self);
9687     PyObject *res;
9688 
9689     res = PyUnicode_New(len, 127);
9690     if (res == NULL)
9691         return NULL;
9692     resdata = PyUnicode_DATA(res);
9693     if (lower)
9694         _Py_bytes_lower(resdata, data, len);
9695     else
9696         _Py_bytes_upper(resdata, data, len);
9697     return res;
9698 }
9699 
9700 static Py_UCS4
handle_capital_sigma(int kind,void * data,Py_ssize_t length,Py_ssize_t i)9701 handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
9702 {
9703     Py_ssize_t j;
9704     int final_sigma;
9705     Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9706     /* U+03A3 is in the Final_Sigma context when, it is found like this:
9707 
9708      \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9709 
9710     where ! is a negation and \p{xxx} is a character with property xxx.
9711     */
9712     for (j = i - 1; j >= 0; j--) {
9713         c = PyUnicode_READ(kind, data, j);
9714         if (!_PyUnicode_IsCaseIgnorable(c))
9715             break;
9716     }
9717     final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9718     if (final_sigma) {
9719         for (j = i + 1; j < length; j++) {
9720             c = PyUnicode_READ(kind, data, j);
9721             if (!_PyUnicode_IsCaseIgnorable(c))
9722                 break;
9723         }
9724         final_sigma = j == length || !_PyUnicode_IsCased(c);
9725     }
9726     return (final_sigma) ? 0x3C2 : 0x3C3;
9727 }
9728 
9729 static int
lower_ucs4(int kind,void * data,Py_ssize_t length,Py_ssize_t i,Py_UCS4 c,Py_UCS4 * mapped)9730 lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9731            Py_UCS4 c, Py_UCS4 *mapped)
9732 {
9733     /* Obscure special case. */
9734     if (c == 0x3A3) {
9735         mapped[0] = handle_capital_sigma(kind, data, length, i);
9736         return 1;
9737     }
9738     return _PyUnicode_ToLowerFull(c, mapped);
9739 }
9740 
9741 static Py_ssize_t
do_capitalize(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9742 do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9743 {
9744     Py_ssize_t i, k = 0;
9745     int n_res, j;
9746     Py_UCS4 c, mapped[3];
9747 
9748     c = PyUnicode_READ(kind, data, 0);
9749     n_res = _PyUnicode_ToUpperFull(c, mapped);
9750     for (j = 0; j < n_res; j++) {
9751         *maxchar = Py_MAX(*maxchar, mapped[j]);
9752         res[k++] = mapped[j];
9753     }
9754     for (i = 1; i < length; i++) {
9755         c = PyUnicode_READ(kind, data, i);
9756         n_res = lower_ucs4(kind, data, length, i, c, mapped);
9757         for (j = 0; j < n_res; j++) {
9758             *maxchar = Py_MAX(*maxchar, mapped[j]);
9759             res[k++] = mapped[j];
9760         }
9761     }
9762     return k;
9763 }
9764 
9765 static Py_ssize_t
do_swapcase(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9766 do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9767     Py_ssize_t i, k = 0;
9768 
9769     for (i = 0; i < length; i++) {
9770         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9771         int n_res, j;
9772         if (Py_UNICODE_ISUPPER(c)) {
9773             n_res = lower_ucs4(kind, data, length, i, c, mapped);
9774         }
9775         else if (Py_UNICODE_ISLOWER(c)) {
9776             n_res = _PyUnicode_ToUpperFull(c, mapped);
9777         }
9778         else {
9779             n_res = 1;
9780             mapped[0] = c;
9781         }
9782         for (j = 0; j < n_res; j++) {
9783             *maxchar = Py_MAX(*maxchar, mapped[j]);
9784             res[k++] = mapped[j];
9785         }
9786     }
9787     return k;
9788 }
9789 
9790 static Py_ssize_t
do_upper_or_lower(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar,int lower)9791 do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9792                   Py_UCS4 *maxchar, int lower)
9793 {
9794     Py_ssize_t i, k = 0;
9795 
9796     for (i = 0; i < length; i++) {
9797         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9798         int n_res, j;
9799         if (lower)
9800             n_res = lower_ucs4(kind, data, length, i, c, mapped);
9801         else
9802             n_res = _PyUnicode_ToUpperFull(c, mapped);
9803         for (j = 0; j < n_res; j++) {
9804             *maxchar = Py_MAX(*maxchar, mapped[j]);
9805             res[k++] = mapped[j];
9806         }
9807     }
9808     return k;
9809 }
9810 
9811 static Py_ssize_t
do_upper(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9812 do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9813 {
9814     return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9815 }
9816 
9817 static Py_ssize_t
do_lower(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9818 do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9819 {
9820     return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9821 }
9822 
9823 static Py_ssize_t
do_casefold(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9824 do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9825 {
9826     Py_ssize_t i, k = 0;
9827 
9828     for (i = 0; i < length; i++) {
9829         Py_UCS4 c = PyUnicode_READ(kind, data, i);
9830         Py_UCS4 mapped[3];
9831         int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9832         for (j = 0; j < n_res; j++) {
9833             *maxchar = Py_MAX(*maxchar, mapped[j]);
9834             res[k++] = mapped[j];
9835         }
9836     }
9837     return k;
9838 }
9839 
9840 static Py_ssize_t
do_title(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9841 do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9842 {
9843     Py_ssize_t i, k = 0;
9844     int previous_is_cased;
9845 
9846     previous_is_cased = 0;
9847     for (i = 0; i < length; i++) {
9848         const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9849         Py_UCS4 mapped[3];
9850         int n_res, j;
9851 
9852         if (previous_is_cased)
9853             n_res = lower_ucs4(kind, data, length, i, c, mapped);
9854         else
9855             n_res = _PyUnicode_ToTitleFull(c, mapped);
9856 
9857         for (j = 0; j < n_res; j++) {
9858             *maxchar = Py_MAX(*maxchar, mapped[j]);
9859             res[k++] = mapped[j];
9860         }
9861 
9862         previous_is_cased = _PyUnicode_IsCased(c);
9863     }
9864     return k;
9865 }
9866 
9867 static PyObject *
case_operation(PyObject * self,Py_ssize_t (* perform)(int,void *,Py_ssize_t,Py_UCS4 *,Py_UCS4 *))9868 case_operation(PyObject *self,
9869                Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9870 {
9871     PyObject *res = NULL;
9872     Py_ssize_t length, newlength = 0;
9873     int kind, outkind;
9874     void *data, *outdata;
9875     Py_UCS4 maxchar = 0, *tmp, *tmpend;
9876 
9877     assert(PyUnicode_IS_READY(self));
9878 
9879     kind = PyUnicode_KIND(self);
9880     data = PyUnicode_DATA(self);
9881     length = PyUnicode_GET_LENGTH(self);
9882     if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9883         PyErr_SetString(PyExc_OverflowError, "string is too long");
9884         return NULL;
9885     }
9886     tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9887     if (tmp == NULL)
9888         return PyErr_NoMemory();
9889     newlength = perform(kind, data, length, tmp, &maxchar);
9890     res = PyUnicode_New(newlength, maxchar);
9891     if (res == NULL)
9892         goto leave;
9893     tmpend = tmp + newlength;
9894     outdata = PyUnicode_DATA(res);
9895     outkind = PyUnicode_KIND(res);
9896     switch (outkind) {
9897     case PyUnicode_1BYTE_KIND:
9898         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9899         break;
9900     case PyUnicode_2BYTE_KIND:
9901         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9902         break;
9903     case PyUnicode_4BYTE_KIND:
9904         memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9905         break;
9906     default:
9907         Py_UNREACHABLE();
9908     }
9909   leave:
9910     PyMem_FREE(tmp);
9911     return res;
9912 }
9913 
9914 PyObject *
PyUnicode_Join(PyObject * separator,PyObject * seq)9915 PyUnicode_Join(PyObject *separator, PyObject *seq)
9916 {
9917     PyObject *res;
9918     PyObject *fseq;
9919     Py_ssize_t seqlen;
9920     PyObject **items;
9921 
9922     fseq = PySequence_Fast(seq, "can only join an iterable");
9923     if (fseq == NULL) {
9924         return NULL;
9925     }
9926 
9927     /* NOTE: the following code can't call back into Python code,
9928      * so we are sure that fseq won't be mutated.
9929      */
9930 
9931     items = PySequence_Fast_ITEMS(fseq);
9932     seqlen = PySequence_Fast_GET_SIZE(fseq);
9933     res = _PyUnicode_JoinArray(separator, items, seqlen);
9934     Py_DECREF(fseq);
9935     return res;
9936 }
9937 
9938 PyObject *
_PyUnicode_JoinArray(PyObject * separator,PyObject * const * items,Py_ssize_t seqlen)9939 _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
9940 {
9941     PyObject *res = NULL; /* the result */
9942     PyObject *sep = NULL;
9943     Py_ssize_t seplen;
9944     PyObject *item;
9945     Py_ssize_t sz, i, res_offset;
9946     Py_UCS4 maxchar;
9947     Py_UCS4 item_maxchar;
9948     int use_memcpy;
9949     unsigned char *res_data = NULL, *sep_data = NULL;
9950     PyObject *last_obj;
9951     unsigned int kind = 0;
9952 
9953     /* If empty sequence, return u"". */
9954     if (seqlen == 0) {
9955         _Py_RETURN_UNICODE_EMPTY();
9956     }
9957 
9958     /* If singleton sequence with an exact Unicode, return that. */
9959     last_obj = NULL;
9960     if (seqlen == 1) {
9961         if (PyUnicode_CheckExact(items[0])) {
9962             res = items[0];
9963             Py_INCREF(res);
9964             return res;
9965         }
9966         seplen = 0;
9967         maxchar = 0;
9968     }
9969     else {
9970         /* Set up sep and seplen */
9971         if (separator == NULL) {
9972             /* fall back to a blank space separator */
9973             sep = PyUnicode_FromOrdinal(' ');
9974             if (!sep)
9975                 goto onError;
9976             seplen = 1;
9977             maxchar = 32;
9978         }
9979         else {
9980             if (!PyUnicode_Check(separator)) {
9981                 PyErr_Format(PyExc_TypeError,
9982                              "separator: expected str instance,"
9983                              " %.80s found",
9984                              Py_TYPE(separator)->tp_name);
9985                 goto onError;
9986             }
9987             if (PyUnicode_READY(separator))
9988                 goto onError;
9989             sep = separator;
9990             seplen = PyUnicode_GET_LENGTH(separator);
9991             maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9992             /* inc refcount to keep this code path symmetric with the
9993                above case of a blank separator */
9994             Py_INCREF(sep);
9995         }
9996         last_obj = sep;
9997     }
9998 
9999     /* There are at least two things to join, or else we have a subclass
10000      * of str in the sequence.
10001      * Do a pre-pass to figure out the total amount of space we'll
10002      * need (sz), and see whether all argument are strings.
10003      */
10004     sz = 0;
10005 #ifdef Py_DEBUG
10006     use_memcpy = 0;
10007 #else
10008     use_memcpy = 1;
10009 #endif
10010     for (i = 0; i < seqlen; i++) {
10011         size_t add_sz;
10012         item = items[i];
10013         if (!PyUnicode_Check(item)) {
10014             PyErr_Format(PyExc_TypeError,
10015                          "sequence item %zd: expected str instance,"
10016                          " %.80s found",
10017                          i, Py_TYPE(item)->tp_name);
10018             goto onError;
10019         }
10020         if (PyUnicode_READY(item) == -1)
10021             goto onError;
10022         add_sz = PyUnicode_GET_LENGTH(item);
10023         item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10024         maxchar = Py_MAX(maxchar, item_maxchar);
10025         if (i != 0) {
10026             add_sz += seplen;
10027         }
10028         if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10029             PyErr_SetString(PyExc_OverflowError,
10030                             "join() result is too long for a Python string");
10031             goto onError;
10032         }
10033         sz += add_sz;
10034         if (use_memcpy && last_obj != NULL) {
10035             if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10036                 use_memcpy = 0;
10037         }
10038         last_obj = item;
10039     }
10040 
10041     res = PyUnicode_New(sz, maxchar);
10042     if (res == NULL)
10043         goto onError;
10044 
10045     /* Catenate everything. */
10046 #ifdef Py_DEBUG
10047     use_memcpy = 0;
10048 #else
10049     if (use_memcpy) {
10050         res_data = PyUnicode_1BYTE_DATA(res);
10051         kind = PyUnicode_KIND(res);
10052         if (seplen != 0)
10053             sep_data = PyUnicode_1BYTE_DATA(sep);
10054     }
10055 #endif
10056     if (use_memcpy) {
10057         for (i = 0; i < seqlen; ++i) {
10058             Py_ssize_t itemlen;
10059             item = items[i];
10060 
10061             /* Copy item, and maybe the separator. */
10062             if (i && seplen != 0) {
10063                 memcpy(res_data,
10064                           sep_data,
10065                           kind * seplen);
10066                 res_data += kind * seplen;
10067             }
10068 
10069             itemlen = PyUnicode_GET_LENGTH(item);
10070             if (itemlen != 0) {
10071                 memcpy(res_data,
10072                           PyUnicode_DATA(item),
10073                           kind * itemlen);
10074                 res_data += kind * itemlen;
10075             }
10076         }
10077         assert(res_data == PyUnicode_1BYTE_DATA(res)
10078                            + kind * PyUnicode_GET_LENGTH(res));
10079     }
10080     else {
10081         for (i = 0, res_offset = 0; i < seqlen; ++i) {
10082             Py_ssize_t itemlen;
10083             item = items[i];
10084 
10085             /* Copy item, and maybe the separator. */
10086             if (i && seplen != 0) {
10087                 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10088                 res_offset += seplen;
10089             }
10090 
10091             itemlen = PyUnicode_GET_LENGTH(item);
10092             if (itemlen != 0) {
10093                 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10094                 res_offset += itemlen;
10095             }
10096         }
10097         assert(res_offset == PyUnicode_GET_LENGTH(res));
10098     }
10099 
10100     Py_XDECREF(sep);
10101     assert(_PyUnicode_CheckConsistency(res, 1));
10102     return res;
10103 
10104   onError:
10105     Py_XDECREF(sep);
10106     Py_XDECREF(res);
10107     return NULL;
10108 }
10109 
10110 void
_PyUnicode_FastFill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10111 _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10112                     Py_UCS4 fill_char)
10113 {
10114     const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10115     void *data = PyUnicode_DATA(unicode);
10116     assert(PyUnicode_IS_READY(unicode));
10117     assert(unicode_modifiable(unicode));
10118     assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10119     assert(start >= 0);
10120     assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10121     FILL(kind, data, fill_char, start, length);
10122 }
10123 
10124 Py_ssize_t
PyUnicode_Fill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10125 PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10126                Py_UCS4 fill_char)
10127 {
10128     Py_ssize_t maxlen;
10129 
10130     if (!PyUnicode_Check(unicode)) {
10131         PyErr_BadInternalCall();
10132         return -1;
10133     }
10134     if (PyUnicode_READY(unicode) == -1)
10135         return -1;
10136     if (unicode_check_modifiable(unicode))
10137         return -1;
10138 
10139     if (start < 0) {
10140         PyErr_SetString(PyExc_IndexError, "string index out of range");
10141         return -1;
10142     }
10143     if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10144         PyErr_SetString(PyExc_ValueError,
10145                          "fill character is bigger than "
10146                          "the string maximum character");
10147         return -1;
10148     }
10149 
10150     maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10151     length = Py_MIN(maxlen, length);
10152     if (length <= 0)
10153         return 0;
10154 
10155     _PyUnicode_FastFill(unicode, start, length, fill_char);
10156     return length;
10157 }
10158 
10159 static PyObject *
pad(PyObject * self,Py_ssize_t left,Py_ssize_t right,Py_UCS4 fill)10160 pad(PyObject *self,
10161     Py_ssize_t left,
10162     Py_ssize_t right,
10163     Py_UCS4 fill)
10164 {
10165     PyObject *u;
10166     Py_UCS4 maxchar;
10167     int kind;
10168     void *data;
10169 
10170     if (left < 0)
10171         left = 0;
10172     if (right < 0)
10173         right = 0;
10174 
10175     if (left == 0 && right == 0)
10176         return unicode_result_unchanged(self);
10177 
10178     if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10179         right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10180         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10181         return NULL;
10182     }
10183     maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10184     maxchar = Py_MAX(maxchar, fill);
10185     u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10186     if (!u)
10187         return NULL;
10188 
10189     kind = PyUnicode_KIND(u);
10190     data = PyUnicode_DATA(u);
10191     if (left)
10192         FILL(kind, data, fill, 0, left);
10193     if (right)
10194         FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10195     _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10196     assert(_PyUnicode_CheckConsistency(u, 1));
10197     return u;
10198 }
10199 
10200 PyObject *
PyUnicode_Splitlines(PyObject * string,int keepends)10201 PyUnicode_Splitlines(PyObject *string, int keepends)
10202 {
10203     PyObject *list;
10204 
10205     if (ensure_unicode(string) < 0)
10206         return NULL;
10207 
10208     switch (PyUnicode_KIND(string)) {
10209     case PyUnicode_1BYTE_KIND:
10210         if (PyUnicode_IS_ASCII(string))
10211             list = asciilib_splitlines(
10212                 string, PyUnicode_1BYTE_DATA(string),
10213                 PyUnicode_GET_LENGTH(string), keepends);
10214         else
10215             list = ucs1lib_splitlines(
10216                 string, PyUnicode_1BYTE_DATA(string),
10217                 PyUnicode_GET_LENGTH(string), keepends);
10218         break;
10219     case PyUnicode_2BYTE_KIND:
10220         list = ucs2lib_splitlines(
10221             string, PyUnicode_2BYTE_DATA(string),
10222             PyUnicode_GET_LENGTH(string), keepends);
10223         break;
10224     case PyUnicode_4BYTE_KIND:
10225         list = ucs4lib_splitlines(
10226             string, PyUnicode_4BYTE_DATA(string),
10227             PyUnicode_GET_LENGTH(string), keepends);
10228         break;
10229     default:
10230         Py_UNREACHABLE();
10231     }
10232     return list;
10233 }
10234 
10235 static PyObject *
split(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10236 split(PyObject *self,
10237       PyObject *substring,
10238       Py_ssize_t maxcount)
10239 {
10240     int kind1, kind2;
10241     void *buf1, *buf2;
10242     Py_ssize_t len1, len2;
10243     PyObject* out;
10244 
10245     if (maxcount < 0)
10246         maxcount = PY_SSIZE_T_MAX;
10247 
10248     if (PyUnicode_READY(self) == -1)
10249         return NULL;
10250 
10251     if (substring == NULL)
10252         switch (PyUnicode_KIND(self)) {
10253         case PyUnicode_1BYTE_KIND:
10254             if (PyUnicode_IS_ASCII(self))
10255                 return asciilib_split_whitespace(
10256                     self,  PyUnicode_1BYTE_DATA(self),
10257                     PyUnicode_GET_LENGTH(self), maxcount
10258                     );
10259             else
10260                 return ucs1lib_split_whitespace(
10261                     self,  PyUnicode_1BYTE_DATA(self),
10262                     PyUnicode_GET_LENGTH(self), maxcount
10263                     );
10264         case PyUnicode_2BYTE_KIND:
10265             return ucs2lib_split_whitespace(
10266                 self,  PyUnicode_2BYTE_DATA(self),
10267                 PyUnicode_GET_LENGTH(self), maxcount
10268                 );
10269         case PyUnicode_4BYTE_KIND:
10270             return ucs4lib_split_whitespace(
10271                 self,  PyUnicode_4BYTE_DATA(self),
10272                 PyUnicode_GET_LENGTH(self), maxcount
10273                 );
10274         default:
10275             Py_UNREACHABLE();
10276         }
10277 
10278     if (PyUnicode_READY(substring) == -1)
10279         return NULL;
10280 
10281     kind1 = PyUnicode_KIND(self);
10282     kind2 = PyUnicode_KIND(substring);
10283     len1 = PyUnicode_GET_LENGTH(self);
10284     len2 = PyUnicode_GET_LENGTH(substring);
10285     if (kind1 < kind2 || len1 < len2) {
10286         out = PyList_New(1);
10287         if (out == NULL)
10288             return NULL;
10289         Py_INCREF(self);
10290         PyList_SET_ITEM(out, 0, self);
10291         return out;
10292     }
10293     buf1 = PyUnicode_DATA(self);
10294     buf2 = PyUnicode_DATA(substring);
10295     if (kind2 != kind1) {
10296         buf2 = _PyUnicode_AsKind(substring, kind1);
10297         if (!buf2)
10298             return NULL;
10299     }
10300 
10301     switch (kind1) {
10302     case PyUnicode_1BYTE_KIND:
10303         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10304             out = asciilib_split(
10305                 self,  buf1, len1, buf2, len2, maxcount);
10306         else
10307             out = ucs1lib_split(
10308                 self,  buf1, len1, buf2, len2, maxcount);
10309         break;
10310     case PyUnicode_2BYTE_KIND:
10311         out = ucs2lib_split(
10312             self,  buf1, len1, buf2, len2, maxcount);
10313         break;
10314     case PyUnicode_4BYTE_KIND:
10315         out = ucs4lib_split(
10316             self,  buf1, len1, buf2, len2, maxcount);
10317         break;
10318     default:
10319         out = NULL;
10320     }
10321     if (kind2 != kind1)
10322         PyMem_Free(buf2);
10323     return out;
10324 }
10325 
10326 static PyObject *
rsplit(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10327 rsplit(PyObject *self,
10328        PyObject *substring,
10329        Py_ssize_t maxcount)
10330 {
10331     int kind1, kind2;
10332     void *buf1, *buf2;
10333     Py_ssize_t len1, len2;
10334     PyObject* out;
10335 
10336     if (maxcount < 0)
10337         maxcount = PY_SSIZE_T_MAX;
10338 
10339     if (PyUnicode_READY(self) == -1)
10340         return NULL;
10341 
10342     if (substring == NULL)
10343         switch (PyUnicode_KIND(self)) {
10344         case PyUnicode_1BYTE_KIND:
10345             if (PyUnicode_IS_ASCII(self))
10346                 return asciilib_rsplit_whitespace(
10347                     self,  PyUnicode_1BYTE_DATA(self),
10348                     PyUnicode_GET_LENGTH(self), maxcount
10349                     );
10350             else
10351                 return ucs1lib_rsplit_whitespace(
10352                     self,  PyUnicode_1BYTE_DATA(self),
10353                     PyUnicode_GET_LENGTH(self), maxcount
10354                     );
10355         case PyUnicode_2BYTE_KIND:
10356             return ucs2lib_rsplit_whitespace(
10357                 self,  PyUnicode_2BYTE_DATA(self),
10358                 PyUnicode_GET_LENGTH(self), maxcount
10359                 );
10360         case PyUnicode_4BYTE_KIND:
10361             return ucs4lib_rsplit_whitespace(
10362                 self,  PyUnicode_4BYTE_DATA(self),
10363                 PyUnicode_GET_LENGTH(self), maxcount
10364                 );
10365         default:
10366             Py_UNREACHABLE();
10367         }
10368 
10369     if (PyUnicode_READY(substring) == -1)
10370         return NULL;
10371 
10372     kind1 = PyUnicode_KIND(self);
10373     kind2 = PyUnicode_KIND(substring);
10374     len1 = PyUnicode_GET_LENGTH(self);
10375     len2 = PyUnicode_GET_LENGTH(substring);
10376     if (kind1 < kind2 || len1 < len2) {
10377         out = PyList_New(1);
10378         if (out == NULL)
10379             return NULL;
10380         Py_INCREF(self);
10381         PyList_SET_ITEM(out, 0, self);
10382         return out;
10383     }
10384     buf1 = PyUnicode_DATA(self);
10385     buf2 = PyUnicode_DATA(substring);
10386     if (kind2 != kind1) {
10387         buf2 = _PyUnicode_AsKind(substring, kind1);
10388         if (!buf2)
10389             return NULL;
10390     }
10391 
10392     switch (kind1) {
10393     case PyUnicode_1BYTE_KIND:
10394         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10395             out = asciilib_rsplit(
10396                 self,  buf1, len1, buf2, len2, maxcount);
10397         else
10398             out = ucs1lib_rsplit(
10399                 self,  buf1, len1, buf2, len2, maxcount);
10400         break;
10401     case PyUnicode_2BYTE_KIND:
10402         out = ucs2lib_rsplit(
10403             self,  buf1, len1, buf2, len2, maxcount);
10404         break;
10405     case PyUnicode_4BYTE_KIND:
10406         out = ucs4lib_rsplit(
10407             self,  buf1, len1, buf2, len2, maxcount);
10408         break;
10409     default:
10410         out = NULL;
10411     }
10412     if (kind2 != kind1)
10413         PyMem_Free(buf2);
10414     return out;
10415 }
10416 
10417 static Py_ssize_t
anylib_find(int kind,PyObject * str1,void * buf1,Py_ssize_t len1,PyObject * str2,void * buf2,Py_ssize_t len2,Py_ssize_t offset)10418 anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10419             PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10420 {
10421     switch (kind) {
10422     case PyUnicode_1BYTE_KIND:
10423         if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10424             return asciilib_find(buf1, len1, buf2, len2, offset);
10425         else
10426             return ucs1lib_find(buf1, len1, buf2, len2, offset);
10427     case PyUnicode_2BYTE_KIND:
10428         return ucs2lib_find(buf1, len1, buf2, len2, offset);
10429     case PyUnicode_4BYTE_KIND:
10430         return ucs4lib_find(buf1, len1, buf2, len2, offset);
10431     }
10432     Py_UNREACHABLE();
10433 }
10434 
10435 static Py_ssize_t
anylib_count(int kind,PyObject * sstr,void * sbuf,Py_ssize_t slen,PyObject * str1,void * buf1,Py_ssize_t len1,Py_ssize_t maxcount)10436 anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10437              PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10438 {
10439     switch (kind) {
10440     case PyUnicode_1BYTE_KIND:
10441         if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10442             return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10443         else
10444             return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10445     case PyUnicode_2BYTE_KIND:
10446         return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10447     case PyUnicode_4BYTE_KIND:
10448         return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10449     }
10450     Py_UNREACHABLE();
10451 }
10452 
10453 static void
replace_1char_inplace(PyObject * u,Py_ssize_t pos,Py_UCS4 u1,Py_UCS4 u2,Py_ssize_t maxcount)10454 replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10455                       Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10456 {
10457     int kind = PyUnicode_KIND(u);
10458     void *data = PyUnicode_DATA(u);
10459     Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10460     if (kind == PyUnicode_1BYTE_KIND) {
10461         ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10462                                       (Py_UCS1 *)data + len,
10463                                       u1, u2, maxcount);
10464     }
10465     else if (kind == PyUnicode_2BYTE_KIND) {
10466         ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10467                                       (Py_UCS2 *)data + len,
10468                                       u1, u2, maxcount);
10469     }
10470     else {
10471         assert(kind == PyUnicode_4BYTE_KIND);
10472         ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10473                                       (Py_UCS4 *)data + len,
10474                                       u1, u2, maxcount);
10475     }
10476 }
10477 
/* Replace occurrences of str1 in self with str2, performing at most maxcount
   replacements (a negative maxcount is turned into PY_SSIZE_T_MAX).

   Three strategies are used:
     - len1 == len2 == 1: copy self into a new string and patch single
       characters in place;
     - len1 == len2 > 1: copy self into a new string and overwrite every
       match in place;
     - len1 != len2: count the matches first, allocate a result of the exact
       final size and stitch unchanged pieces and replacements together
       (when str1 is empty, str2 is interleaved with the characters of self).

   sbuf/buf1/buf2 initially point into the objects themselves.  When the
   operands use different kinds, widened copies are obtained from
   _PyUnicode_AsKind(); srelease/release1/release2 record which of the three
   pointers are such temporaries and therefore have to be freed on exit.

   Returns the result string, the value of unicode_result_unchanged(self)
   when there is nothing to replace, or NULL on error.

   NOTE(review): the accessor macros are applied without PyUnicode_READY(),
   so the caller presumably passes ready strings -- confirm. */
static PyObject *
replace(PyObject *self, PyObject *str1,
        PyObject *str2, Py_ssize_t maxcount)
{
    PyObject *u;
    char *sbuf = PyUnicode_DATA(self);
    char *buf1 = PyUnicode_DATA(str1);
    char *buf2 = PyUnicode_DATA(str2);
    int srelease = 0, release1 = 0, release2 = 0;
    int skind = PyUnicode_KIND(self);
    int kind1 = PyUnicode_KIND(str1);
    int kind2 = PyUnicode_KIND(str2);
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
    int mayshrink;
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;

    /* NOTE(review): the slen == 0 shortcut is only reached for a
       non-negative maxcount, so an empty self with an empty str1 is handled
       differently depending on whether maxcount is negative -- confirm this
       is intended. */
    if (maxcount < 0)
        maxcount = PY_SSIZE_T_MAX;
    else if (maxcount == 0 || slen == 0)
        goto nothing;

    /* Replacing a string object with itself cannot change anything. */
    if (str1 == str2)
        goto nothing;

    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
    if (maxchar < maxchar_str1)
        /* substring too wide to be present */
        goto nothing;
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
    /* Replacing str1 with str2 may cause a maxchar reduction in the
       result string. */
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
    /* The result must be able to hold both self and the replacement. */
    maxchar = Py_MAX(maxchar, maxchar_str2);

    if (len1 == len2) {
        /* same length */
        if (len1 == 0)
            goto nothing;
        if (len1 == 1) {
            /* replace characters */
            Py_UCS4 u1, u2;
            Py_ssize_t pos;

            u1 = PyUnicode_READ(kind1, buf1, 0);
            /* Locate the first occurrence so that a string without any match
               is returned without allocating a copy. */
            pos = findchar(sbuf, skind, slen, u1, 1);
            if (pos < 0)
                goto nothing;
            u2 = PyUnicode_READ(kind2, buf2, 0);
            u = PyUnicode_New(slen, maxchar);
            if (!u)
                goto error;

            /* Copy everything, then patch the copy starting at the first
               match. */
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
            replace_1char_inplace(u, pos, u1, u2, maxcount);
        }
        else {
            /* rkind is the kind all three buffers are brought to; it starts
               as self's kind and is raised to kind2 if str2 is wider. */
            int rkind = skind;
            char *res;
            Py_ssize_t i;

            if (kind1 < rkind) {
                /* widen substring */
                buf1 = _PyUnicode_AsKind(str1, rkind);
                if (!buf1) goto error;
                release1 = 1;
            }
            /* Find the first match before allocating anything else. */
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
            if (i < 0)
                goto nothing;
            if (rkind > kind2) {
                /* widen replacement */
                buf2 = _PyUnicode_AsKind(str2, rkind);
                if (!buf2) goto error;
                release2 = 1;
            }
            else if (rkind < kind2) {
                /* widen self and buf1 */
                rkind = kind2;
                /* A copy of str1 widened to the old rkind is useless now;
                   drop it and clear the flag before anything can fail. */
                if (release1) PyMem_Free(buf1);
                release1 = 0;
                sbuf = _PyUnicode_AsKind(self, rkind);
                if (!sbuf) goto error;
                srelease = 1;
                buf1 = _PyUnicode_AsKind(str1, rkind);
                if (!buf1) goto error;
                release1 = 1;
            }
            u = PyUnicode_New(slen, maxchar);
            if (!u)
                goto error;
            assert(PyUnicode_KIND(u) == rkind);
            res = PyUnicode_DATA(u);

            /* Offsets below are in bytes: character index times rkind. */
            memcpy(res, sbuf, rkind * slen);
            /* change everything in-place, starting with this one */
            memcpy(res + rkind * i,
                   buf2,
                   rkind * len2);
            i += len1;

            /* One replacement is already done, hence the pre-decrement.  The
               search resumes at character i; the value returned is used as
               an index relative to the start of sbuf. */
            while ( --maxcount > 0) {
                i = anylib_find(rkind, self,
                                sbuf+rkind*i, slen-i,
                                str1, buf1, len1, i);
                if (i == -1)
                    break;
                memcpy(res + rkind * i,
                       buf2,
                       rkind * len2);
                i += len1;
            }
        }
    }
    else {
        /* n: number of replacements to perform; i: read position in sbuf;
           j: position of the current match; ires: write position in res
           (all counted in characters). */
        Py_ssize_t n, i, j, ires;
        Py_ssize_t new_size;
        int rkind = skind;
        char *res;

        if (kind1 < rkind) {
            /* widen substring */
            buf1 = _PyUnicode_AsKind(str1, rkind);
            if (!buf1) goto error;
            release1 = 1;
        }
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
        if (n == 0)
            goto nothing;
        if (kind2 < rkind) {
            /* widen replacement */
            buf2 = _PyUnicode_AsKind(str2, rkind);
            if (!buf2) goto error;
            release2 = 1;
        }
        else if (kind2 > rkind) {
            /* widen self and buf1 */
            rkind = kind2;
            sbuf = _PyUnicode_AsKind(self, rkind);
            if (!sbuf) goto error;
            srelease = 1;
            /* Drop a copy of str1 that was widened to the old rkind. */
            if (release1) PyMem_Free(buf1);
            release1 = 0;
            buf1 = _PyUnicode_AsKind(str1, rkind);
            if (!buf1) goto error;
            release1 = 1;
        }
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
           PyUnicode_GET_LENGTH(str1))); */
        /* Only a growing result can overflow; the test is arranged so that
           the product n * (len2 - len1) is never computed before it is known
           to fit. */
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
                PyErr_SetString(PyExc_OverflowError,
                                "replace string is too long");
                goto error;
        }
        new_size = slen + n * (len2 - len1);
        if (new_size == 0) {
            /* Everything was replaced by nothing: hand out the empty string
               object instead of allocating a new one. */
            _Py_INCREF_UNICODE_EMPTY();
            if (!unicode_empty)
                goto error;
            u = unicode_empty;
            goto done;
        }
        /* Guard the byte counts (rkind * characters) used by memcpy below. */
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
            PyErr_SetString(PyExc_OverflowError,
                            "replace string is too long");
            goto error;
        }
        u = PyUnicode_New(new_size, maxchar);
        if (!u)
            goto error;
        assert(PyUnicode_KIND(u) == rkind);
        res = PyUnicode_DATA(u);
        ires = i = 0;
        if (len1 > 0) {
            while (n-- > 0) {
                /* look for next match */
                j = anylib_find(rkind, self,
                                sbuf + rkind * i, slen-i,
                                str1, buf1, len1, i);
                if (j == -1)
                    break;
                else if (j > i) {
                    /* copy unchanged part [i:j] */
                    memcpy(res + rkind * ires,
                           sbuf + rkind * i,
                           rkind * (j-i));
                    ires += j - i;
                }
                /* copy substitution string */
                if (len2 > 0) {
                    memcpy(res + rkind * ires,
                           buf2,
                           rkind * len2);
                    ires += len2;
                }
                /* Continue searching right after the match just handled. */
                i = j + len1;
            }
            if (i < slen)
                /* copy tail [i:] */
                memcpy(res + rkind * ires,
                       sbuf + rkind * i,
                       rkind * (slen-i));
        }
        else {
            /* interleave */
            /* str1 is empty: emit str2, then one character of self, and
               repeat; the loop ends right after the n-th copy of str2. */
            while (n > 0) {
                memcpy(res + rkind * ires,
                       buf2,
                       rkind * len2);
                ires += len2;
                if (--n <= 0)
                    break;
                memcpy(res + rkind * ires,
                       sbuf + rkind * i,
                       rkind);
                ires++;
                i++;
            }
            /* Append whatever part of self has not been consumed yet. */
            memcpy(res + rkind * ires,
                   sbuf + rkind * i,
                   rkind * (slen-i));
        }
    }

    if (mayshrink) {
        /* unicode_adjust_maxchar() may replace u; a NULL u afterwards is
           treated as an error. */
        unicode_adjust_maxchar(&u);
        if (u == NULL)
            goto error;
    }

  done:
    /* Success: free the temporary widened buffers and return the result. */
    if (srelease)
        PyMem_FREE(sbuf);
    if (release1)
        PyMem_FREE(buf1);
    if (release2)
        PyMem_FREE(buf2);
    assert(_PyUnicode_CheckConsistency(u, 1));
    return u;

  nothing:
    /* nothing to replace; return original string (when possible) */
    if (srelease)
        PyMem_FREE(sbuf);
    if (release1)
        PyMem_FREE(buf1);
    if (release2)
        PyMem_FREE(buf2);
    return unicode_result_unchanged(self);

  error:
    /* Failure: free the temporary widened buffers and report the error. */
    if (srelease && sbuf)
        PyMem_FREE(sbuf);
    if (release1 && buf1)
        PyMem_FREE(buf1);
    if (release2 && buf2)
        PyMem_FREE(buf2);
    return NULL;
}
10739 
10740 /* --- Unicode Object Methods --------------------------------------------- */
10741 
10742 /*[clinic input]
10743 str.title as unicode_title
10744 
10745 Return a version of the string where each word is titlecased.
10746 
10747 More specifically, words start with uppercased characters and all remaining
10748 cased characters have lower case.
10749 [clinic start generated code]*/
10750 
10751 static PyObject *
unicode_title_impl(PyObject * self)10752 unicode_title_impl(PyObject *self)
10753 /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
10754 {
10755     if (PyUnicode_READY(self) == -1)
10756         return NULL;
10757     return case_operation(self, do_title);
10758 }
10759 
10760 /*[clinic input]
10761 str.capitalize as unicode_capitalize
10762 
10763 Return a capitalized version of the string.
10764 
10765 More specifically, make the first character have upper case and the rest lower
10766 case.
10767 [clinic start generated code]*/
10768 
10769 static PyObject *
unicode_capitalize_impl(PyObject * self)10770 unicode_capitalize_impl(PyObject *self)
10771 /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
10772 {
10773     if (PyUnicode_READY(self) == -1)
10774         return NULL;
10775     if (PyUnicode_GET_LENGTH(self) == 0)
10776         return unicode_result_unchanged(self);
10777     return case_operation(self, do_capitalize);
10778 }
10779 
10780 /*[clinic input]
10781 str.casefold as unicode_casefold
10782 
10783 Return a version of the string suitable for caseless comparisons.
10784 [clinic start generated code]*/
10785 
10786 static PyObject *
unicode_casefold_impl(PyObject * self)10787 unicode_casefold_impl(PyObject *self)
10788 /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
10789 {
10790     if (PyUnicode_READY(self) == -1)
10791         return NULL;
10792     if (PyUnicode_IS_ASCII(self))
10793         return ascii_upper_or_lower(self, 1);
10794     return case_operation(self, do_casefold);
10795 }
10796 
10797 
10798 /* Argument converter. Accepts a single Unicode character. */
10799 
10800 static int
convert_uc(PyObject * obj,void * addr)10801 convert_uc(PyObject *obj, void *addr)
10802 {
10803     Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10804 
10805     if (!PyUnicode_Check(obj)) {
10806         PyErr_Format(PyExc_TypeError,
10807                      "The fill character must be a unicode character, "
10808                      "not %.100s", Py_TYPE(obj)->tp_name);
10809         return 0;
10810     }
10811     if (PyUnicode_READY(obj) < 0)
10812         return 0;
10813     if (PyUnicode_GET_LENGTH(obj) != 1) {
10814         PyErr_SetString(PyExc_TypeError,
10815                         "The fill character must be exactly one character long");
10816         return 0;
10817     }
10818     *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10819     return 1;
10820 }
10821 
10822 /*[clinic input]
10823 str.center as unicode_center
10824 
10825     width: Py_ssize_t
10826     fillchar: Py_UCS4 = ' '
10827     /
10828 
10829 Return a centered string of length width.
10830 
10831 Padding is done using the specified fill character (default is a space).
10832 [clinic start generated code]*/
10833 
10834 static PyObject *
unicode_center_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)10835 unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10836 /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10837 {
10838     Py_ssize_t marg, left;
10839 
10840     if (PyUnicode_READY(self) == -1)
10841         return NULL;
10842 
10843     if (PyUnicode_GET_LENGTH(self) >= width)
10844         return unicode_result_unchanged(self);
10845 
10846     marg = width - PyUnicode_GET_LENGTH(self);
10847     left = marg / 2 + (marg & width & 1);
10848 
10849     return pad(self, left, marg - left, fillchar);
10850 }
10851 
10852 /* This function assumes that str1 and str2 are readied by the caller. */
10853 
10854 static int
unicode_compare(PyObject * str1,PyObject * str2)10855 unicode_compare(PyObject *str1, PyObject *str2)
10856 {
10857 #define COMPARE(TYPE1, TYPE2) \
10858     do { \
10859         TYPE1* p1 = (TYPE1 *)data1; \
10860         TYPE2* p2 = (TYPE2 *)data2; \
10861         TYPE1* end = p1 + len; \
10862         Py_UCS4 c1, c2; \
10863         for (; p1 != end; p1++, p2++) { \
10864             c1 = *p1; \
10865             c2 = *p2; \
10866             if (c1 != c2) \
10867                 return (c1 < c2) ? -1 : 1; \
10868         } \
10869     } \
10870     while (0)
10871 
10872     int kind1, kind2;
10873     void *data1, *data2;
10874     Py_ssize_t len1, len2, len;
10875 
10876     kind1 = PyUnicode_KIND(str1);
10877     kind2 = PyUnicode_KIND(str2);
10878     data1 = PyUnicode_DATA(str1);
10879     data2 = PyUnicode_DATA(str2);
10880     len1 = PyUnicode_GET_LENGTH(str1);
10881     len2 = PyUnicode_GET_LENGTH(str2);
10882     len = Py_MIN(len1, len2);
10883 
10884     switch(kind1) {
10885     case PyUnicode_1BYTE_KIND:
10886     {
10887         switch(kind2) {
10888         case PyUnicode_1BYTE_KIND:
10889         {
10890             int cmp = memcmp(data1, data2, len);
10891             /* normalize result of memcmp() into the range [-1; 1] */
10892             if (cmp < 0)
10893                 return -1;
10894             if (cmp > 0)
10895                 return 1;
10896             break;
10897         }
10898         case PyUnicode_2BYTE_KIND:
10899             COMPARE(Py_UCS1, Py_UCS2);
10900             break;
10901         case PyUnicode_4BYTE_KIND:
10902             COMPARE(Py_UCS1, Py_UCS4);
10903             break;
10904         default:
10905             Py_UNREACHABLE();
10906         }
10907         break;
10908     }
10909     case PyUnicode_2BYTE_KIND:
10910     {
10911         switch(kind2) {
10912         case PyUnicode_1BYTE_KIND:
10913             COMPARE(Py_UCS2, Py_UCS1);
10914             break;
10915         case PyUnicode_2BYTE_KIND:
10916         {
10917             COMPARE(Py_UCS2, Py_UCS2);
10918             break;
10919         }
10920         case PyUnicode_4BYTE_KIND:
10921             COMPARE(Py_UCS2, Py_UCS4);
10922             break;
10923         default:
10924             Py_UNREACHABLE();
10925         }
10926         break;
10927     }
10928     case PyUnicode_4BYTE_KIND:
10929     {
10930         switch(kind2) {
10931         case PyUnicode_1BYTE_KIND:
10932             COMPARE(Py_UCS4, Py_UCS1);
10933             break;
10934         case PyUnicode_2BYTE_KIND:
10935             COMPARE(Py_UCS4, Py_UCS2);
10936             break;
10937         case PyUnicode_4BYTE_KIND:
10938         {
10939 #if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10940             int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10941             /* normalize result of wmemcmp() into the range [-1; 1] */
10942             if (cmp < 0)
10943                 return -1;
10944             if (cmp > 0)
10945                 return 1;
10946 #else
10947             COMPARE(Py_UCS4, Py_UCS4);
10948 #endif
10949             break;
10950         }
10951         default:
10952             Py_UNREACHABLE();
10953         }
10954         break;
10955     }
10956     default:
10957         Py_UNREACHABLE();
10958     }
10959 
10960     if (len1 == len2)
10961         return 0;
10962     if (len1 < len2)
10963         return -1;
10964     else
10965         return 1;
10966 
10967 #undef COMPARE
10968 }
10969 
10970 static int
unicode_compare_eq(PyObject * str1,PyObject * str2)10971 unicode_compare_eq(PyObject *str1, PyObject *str2)
10972 {
10973     int kind;
10974     void *data1, *data2;
10975     Py_ssize_t len;
10976     int cmp;
10977 
10978     len = PyUnicode_GET_LENGTH(str1);
10979     if (PyUnicode_GET_LENGTH(str2) != len)
10980         return 0;
10981     kind = PyUnicode_KIND(str1);
10982     if (PyUnicode_KIND(str2) != kind)
10983         return 0;
10984     data1 = PyUnicode_DATA(str1);
10985     data2 = PyUnicode_DATA(str2);
10986 
10987     cmp = memcmp(data1, data2, len * kind);
10988     return (cmp == 0);
10989 }
10990 
10991 
10992 int
PyUnicode_Compare(PyObject * left,PyObject * right)10993 PyUnicode_Compare(PyObject *left, PyObject *right)
10994 {
10995     if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10996         if (PyUnicode_READY(left) == -1 ||
10997             PyUnicode_READY(right) == -1)
10998             return -1;
10999 
11000         /* a string is equal to itself */
11001         if (left == right)
11002             return 0;
11003 
11004         return unicode_compare(left, right);
11005     }
11006     PyErr_Format(PyExc_TypeError,
11007                  "Can't compare %.100s and %.100s",
11008                  left->ob_type->tp_name,
11009                  right->ob_type->tp_name);
11010     return -1;
11011 }
11012 
11013 int
PyUnicode_CompareWithASCIIString(PyObject * uni,const char * str)11014 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11015 {
11016     Py_ssize_t i;
11017     int kind;
11018     Py_UCS4 chr;
11019     const unsigned char *ustr = (const unsigned char *)str;
11020 
11021     assert(_PyUnicode_CHECK(uni));
11022     if (!PyUnicode_IS_READY(uni)) {
11023         const wchar_t *ws = _PyUnicode_WSTR(uni);
11024         /* Compare Unicode string and source character set string */
11025         for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11026             if (chr != ustr[i])
11027                 return (chr < ustr[i]) ? -1 : 1;
11028         }
11029         /* This check keeps Python strings that end in '\0' from comparing equal
11030          to C strings identical up to that point. */
11031         if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11032             return 1; /* uni is longer */
11033         if (ustr[i])
11034             return -1; /* str is longer */
11035         return 0;
11036     }
11037     kind = PyUnicode_KIND(uni);
11038     if (kind == PyUnicode_1BYTE_KIND) {
11039         const void *data = PyUnicode_1BYTE_DATA(uni);
11040         size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11041         size_t len, len2 = strlen(str);
11042         int cmp;
11043 
11044         len = Py_MIN(len1, len2);
11045         cmp = memcmp(data, str, len);
11046         if (cmp != 0) {
11047             if (cmp < 0)
11048                 return -1;
11049             else
11050                 return 1;
11051         }
11052         if (len1 > len2)
11053             return 1; /* uni is longer */
11054         if (len1 < len2)
11055             return -1; /* str is longer */
11056         return 0;
11057     }
11058     else {
11059         void *data = PyUnicode_DATA(uni);
11060         /* Compare Unicode string and source character set string */
11061         for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11062             if (chr != (unsigned char)str[i])
11063                 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11064         /* This check keeps Python strings that end in '\0' from comparing equal
11065          to C strings identical up to that point. */
11066         if (PyUnicode_GET_LENGTH(uni) != i || chr)
11067             return 1; /* uni is longer */
11068         if (str[i])
11069             return -1; /* str is longer */
11070         return 0;
11071     }
11072 }
11073 
11074 static int
non_ready_unicode_equal_to_ascii_string(PyObject * unicode,const char * str)11075 non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11076 {
11077     size_t i, len;
11078     const wchar_t *p;
11079     len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11080     if (strlen(str) != len)
11081         return 0;
11082     p = _PyUnicode_WSTR(unicode);
11083     assert(p);
11084     for (i = 0; i < len; i++) {
11085         unsigned char c = (unsigned char)str[i];
11086         if (c >= 128 || p[i] != (wchar_t)c)
11087             return 0;
11088     }
11089     return 1;
11090 }
11091 
11092 int
_PyUnicode_EqualToASCIIString(PyObject * unicode,const char * str)11093 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11094 {
11095     size_t len;
11096     assert(_PyUnicode_CHECK(unicode));
11097     assert(str);
11098 #ifndef NDEBUG
11099     for (const char *p = str; *p; p++) {
11100         assert((unsigned char)*p < 128);
11101     }
11102 #endif
11103     if (PyUnicode_READY(unicode) == -1) {
11104         /* Memory error or bad data */
11105         PyErr_Clear();
11106         return non_ready_unicode_equal_to_ascii_string(unicode, str);
11107     }
11108     if (!PyUnicode_IS_ASCII(unicode))
11109         return 0;
11110     len = (size_t)PyUnicode_GET_LENGTH(unicode);
11111     return strlen(str) == len &&
11112            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11113 }
11114 
11115 int
_PyUnicode_EqualToASCIIId(PyObject * left,_Py_Identifier * right)11116 _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11117 {
11118     PyObject *right_uni;
11119     Py_hash_t hash;
11120 
11121     assert(_PyUnicode_CHECK(left));
11122     assert(right->string);
11123 #ifndef NDEBUG
11124     for (const char *p = right->string; *p; p++) {
11125         assert((unsigned char)*p < 128);
11126     }
11127 #endif
11128 
11129     if (PyUnicode_READY(left) == -1) {
11130         /* memory error or bad data */
11131         PyErr_Clear();
11132         return non_ready_unicode_equal_to_ascii_string(left, right->string);
11133     }
11134 
11135     if (!PyUnicode_IS_ASCII(left))
11136         return 0;
11137 
11138     right_uni = _PyUnicode_FromId(right);       /* borrowed */
11139     if (right_uni == NULL) {
11140         /* memory error or bad data */
11141         PyErr_Clear();
11142         return _PyUnicode_EqualToASCIIString(left, right->string);
11143     }
11144 
11145     if (left == right_uni)
11146         return 1;
11147 
11148     if (PyUnicode_CHECK_INTERNED(left))
11149         return 0;
11150 
11151     assert(_PyUnicode_HASH(right_uni) != -1);
11152     hash = _PyUnicode_HASH(left);
11153     if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11154         return 0;
11155 
11156     return unicode_compare_eq(left, right_uni);
11157 }
11158 
11159 PyObject *
PyUnicode_RichCompare(PyObject * left,PyObject * right,int op)11160 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11161 {
11162     int result;
11163 
11164     if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11165         Py_RETURN_NOTIMPLEMENTED;
11166 
11167     if (PyUnicode_READY(left) == -1 ||
11168         PyUnicode_READY(right) == -1)
11169         return NULL;
11170 
11171     if (left == right) {
11172         switch (op) {
11173         case Py_EQ:
11174         case Py_LE:
11175         case Py_GE:
11176             /* a string is equal to itself */
11177             Py_RETURN_TRUE;
11178         case Py_NE:
11179         case Py_LT:
11180         case Py_GT:
11181             Py_RETURN_FALSE;
11182         default:
11183             PyErr_BadArgument();
11184             return NULL;
11185         }
11186     }
11187     else if (op == Py_EQ || op == Py_NE) {
11188         result = unicode_compare_eq(left, right);
11189         result ^= (op == Py_NE);
11190         return PyBool_FromLong(result);
11191     }
11192     else {
11193         result = unicode_compare(left, right);
11194         Py_RETURN_RICHCOMPARE(result, 0, op);
11195     }
11196 }
11197 
11198 int
_PyUnicode_EQ(PyObject * aa,PyObject * bb)11199 _PyUnicode_EQ(PyObject *aa, PyObject *bb)
11200 {
11201     return unicode_eq(aa, bb);
11202 }
11203 
11204 int
PyUnicode_Contains(PyObject * str,PyObject * substr)11205 PyUnicode_Contains(PyObject *str, PyObject *substr)
11206 {
11207     int kind1, kind2;
11208     void *buf1, *buf2;
11209     Py_ssize_t len1, len2;
11210     int result;
11211 
11212     if (!PyUnicode_Check(substr)) {
11213         PyErr_Format(PyExc_TypeError,
11214                      "'in <string>' requires string as left operand, not %.100s",
11215                      Py_TYPE(substr)->tp_name);
11216         return -1;
11217     }
11218     if (PyUnicode_READY(substr) == -1)
11219         return -1;
11220     if (ensure_unicode(str) < 0)
11221         return -1;
11222 
11223     kind1 = PyUnicode_KIND(str);
11224     kind2 = PyUnicode_KIND(substr);
11225     if (kind1 < kind2)
11226         return 0;
11227     len1 = PyUnicode_GET_LENGTH(str);
11228     len2 = PyUnicode_GET_LENGTH(substr);
11229     if (len1 < len2)
11230         return 0;
11231     buf1 = PyUnicode_DATA(str);
11232     buf2 = PyUnicode_DATA(substr);
11233     if (len2 == 1) {
11234         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11235         result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11236         return result;
11237     }
11238     if (kind2 != kind1) {
11239         buf2 = _PyUnicode_AsKind(substr, kind1);
11240         if (!buf2)
11241             return -1;
11242     }
11243 
11244     switch (kind1) {
11245     case PyUnicode_1BYTE_KIND:
11246         result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11247         break;
11248     case PyUnicode_2BYTE_KIND:
11249         result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11250         break;
11251     case PyUnicode_4BYTE_KIND:
11252         result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11253         break;
11254     default:
11255         Py_UNREACHABLE();
11256     }
11257 
11258     if (kind2 != kind1)
11259         PyMem_Free(buf2);
11260 
11261     return result;
11262 }
11263 
11264 /* Concat to string or Unicode object giving a new Unicode object. */
11265 
11266 PyObject *
PyUnicode_Concat(PyObject * left,PyObject * right)11267 PyUnicode_Concat(PyObject *left, PyObject *right)
11268 {
11269     PyObject *result;
11270     Py_UCS4 maxchar, maxchar2;
11271     Py_ssize_t left_len, right_len, new_len;
11272 
11273     if (ensure_unicode(left) < 0)
11274         return NULL;
11275 
11276     if (!PyUnicode_Check(right)) {
11277         PyErr_Format(PyExc_TypeError,
11278                      "can only concatenate str (not \"%.200s\") to str",
11279                      right->ob_type->tp_name);
11280         return NULL;
11281     }
11282     if (PyUnicode_READY(right) < 0)
11283         return NULL;
11284 
11285     /* Shortcuts */
11286     if (left == unicode_empty)
11287         return PyUnicode_FromObject(right);
11288     if (right == unicode_empty)
11289         return PyUnicode_FromObject(left);
11290 
11291     left_len = PyUnicode_GET_LENGTH(left);
11292     right_len = PyUnicode_GET_LENGTH(right);
11293     if (left_len > PY_SSIZE_T_MAX - right_len) {
11294         PyErr_SetString(PyExc_OverflowError,
11295                         "strings are too large to concat");
11296         return NULL;
11297     }
11298     new_len = left_len + right_len;
11299 
11300     maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11301     maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11302     maxchar = Py_MAX(maxchar, maxchar2);
11303 
11304     /* Concat the two Unicode strings */
11305     result = PyUnicode_New(new_len, maxchar);
11306     if (result == NULL)
11307         return NULL;
11308     _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11309     _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11310     assert(_PyUnicode_CheckConsistency(result, 1));
11311     return result;
11312 }
11313 
11314 void
PyUnicode_Append(PyObject ** p_left,PyObject * right)11315 PyUnicode_Append(PyObject **p_left, PyObject *right)
11316 {
11317     PyObject *left, *res;
11318     Py_UCS4 maxchar, maxchar2;
11319     Py_ssize_t left_len, right_len, new_len;
11320 
11321     if (p_left == NULL) {
11322         if (!PyErr_Occurred())
11323             PyErr_BadInternalCall();
11324         return;
11325     }
11326     left = *p_left;
11327     if (right == NULL || left == NULL
11328         || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11329         if (!PyErr_Occurred())
11330             PyErr_BadInternalCall();
11331         goto error;
11332     }
11333 
11334     if (PyUnicode_READY(left) == -1)
11335         goto error;
11336     if (PyUnicode_READY(right) == -1)
11337         goto error;
11338 
11339     /* Shortcuts */
11340     if (left == unicode_empty) {
11341         Py_DECREF(left);
11342         Py_INCREF(right);
11343         *p_left = right;
11344         return;
11345     }
11346     if (right == unicode_empty)
11347         return;
11348 
11349     left_len = PyUnicode_GET_LENGTH(left);
11350     right_len = PyUnicode_GET_LENGTH(right);
11351     if (left_len > PY_SSIZE_T_MAX - right_len) {
11352         PyErr_SetString(PyExc_OverflowError,
11353                         "strings are too large to concat");
11354         goto error;
11355     }
11356     new_len = left_len + right_len;
11357 
11358     if (unicode_modifiable(left)
11359         && PyUnicode_CheckExact(right)
11360         && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11361         /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11362            to change the structure size, but characters are stored just after
11363            the structure, and so it requires to move all characters which is
11364            not so different than duplicating the string. */
11365         && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11366     {
11367         /* append inplace */
11368         if (unicode_resize(p_left, new_len) != 0)
11369             goto error;
11370 
11371         /* copy 'right' into the newly allocated area of 'left' */
11372         _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11373     }
11374     else {
11375         maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11376         maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11377         maxchar = Py_MAX(maxchar, maxchar2);
11378 
11379         /* Concat the two Unicode strings */
11380         res = PyUnicode_New(new_len, maxchar);
11381         if (res == NULL)
11382             goto error;
11383         _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11384         _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11385         Py_DECREF(left);
11386         *p_left = res;
11387     }
11388     assert(_PyUnicode_CheckConsistency(*p_left, 1));
11389     return;
11390 
11391 error:
11392     Py_CLEAR(*p_left);
11393 }
11394 
/* Same as PyUnicode_Append(), but additionally releases one reference to
   'right' afterwards.  'right' may be NULL (Py_XDECREF tolerates it). */
void
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
{
    PyUnicode_Append(pleft, right);
    /* The reference is dropped whether or not the append succeeded. */
    Py_XDECREF(right);
}
11401 
11402 /*
11403 Wraps stringlib_parse_args_finds() and additionally ensures that the
11404 first argument is a unicode object.
11405 */
11406 
11407 static inline int
parse_args_finds_unicode(const char * function_name,PyObject * args,PyObject ** substring,Py_ssize_t * start,Py_ssize_t * end)11408 parse_args_finds_unicode(const char * function_name, PyObject *args,
11409                          PyObject **substring,
11410                          Py_ssize_t *start, Py_ssize_t *end)
11411 {
11412     if(stringlib_parse_args_finds(function_name, args, substring,
11413                                   start, end)) {
11414         if (ensure_unicode(*substring) < 0)
11415             return 0;
11416         return 1;
11417     }
11418     return 0;
11419 }
11420 
11421 PyDoc_STRVAR(count__doc__,
11422              "S.count(sub[, start[, end]]) -> int\n\
11423 \n\
11424 Return the number of non-overlapping occurrences of substring sub in\n\
11425 string S[start:end].  Optional arguments start and end are\n\
11426 interpreted as in slice notation.");
11427 
11428 static PyObject *
unicode_count(PyObject * self,PyObject * args)11429 unicode_count(PyObject *self, PyObject *args)
11430 {
11431     PyObject *substring = NULL;   /* initialize to fix a compiler warning */
11432     Py_ssize_t start = 0;
11433     Py_ssize_t end = PY_SSIZE_T_MAX;
11434     PyObject *result;
11435     int kind1, kind2;
11436     void *buf1, *buf2;
11437     Py_ssize_t len1, len2, iresult;
11438 
11439     if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
11440         return NULL;
11441 
11442     kind1 = PyUnicode_KIND(self);
11443     kind2 = PyUnicode_KIND(substring);
11444     if (kind1 < kind2)
11445         return PyLong_FromLong(0);
11446 
11447     len1 = PyUnicode_GET_LENGTH(self);
11448     len2 = PyUnicode_GET_LENGTH(substring);
11449     ADJUST_INDICES(start, end, len1);
11450     if (end - start < len2)
11451         return PyLong_FromLong(0);
11452 
11453     buf1 = PyUnicode_DATA(self);
11454     buf2 = PyUnicode_DATA(substring);
11455     if (kind2 != kind1) {
11456         buf2 = _PyUnicode_AsKind(substring, kind1);
11457         if (!buf2)
11458             return NULL;
11459     }
11460     switch (kind1) {
11461     case PyUnicode_1BYTE_KIND:
11462         iresult = ucs1lib_count(
11463             ((Py_UCS1*)buf1) + start, end - start,
11464             buf2, len2, PY_SSIZE_T_MAX
11465             );
11466         break;
11467     case PyUnicode_2BYTE_KIND:
11468         iresult = ucs2lib_count(
11469             ((Py_UCS2*)buf1) + start, end - start,
11470             buf2, len2, PY_SSIZE_T_MAX
11471             );
11472         break;
11473     case PyUnicode_4BYTE_KIND:
11474         iresult = ucs4lib_count(
11475             ((Py_UCS4*)buf1) + start, end - start,
11476             buf2, len2, PY_SSIZE_T_MAX
11477             );
11478         break;
11479     default:
11480         Py_UNREACHABLE();
11481     }
11482 
11483     result = PyLong_FromSsize_t(iresult);
11484 
11485     if (kind2 != kind1)
11486         PyMem_Free(buf2);
11487 
11488     return result;
11489 }
11490 
11491 /*[clinic input]
11492 str.encode as unicode_encode
11493 
11494     encoding: str(c_default="NULL") = 'utf-8'
11495         The encoding in which to encode the string.
11496     errors: str(c_default="NULL") = 'strict'
11497         The error handling scheme to use for encoding errors.
11498         The default is 'strict' meaning that encoding errors raise a
11499         UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11500         'xmlcharrefreplace' as well as any other name registered with
11501         codecs.register_error that can handle UnicodeEncodeErrors.
11502 
11503 Encode the string using the codec registered for encoding.
11504 [clinic start generated code]*/
11505 
static PyObject *
unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
{
    /* Pure delegation.  Per the clinic declaration above, 'encoding' and
       'errors' are NULL when the caller omitted them, and they are passed
       through unchanged. */
    return PyUnicode_AsEncodedString(self, encoding, errors);
}
11512 
11513 /*[clinic input]
11514 str.expandtabs as unicode_expandtabs
11515 
11516     tabsize: int = 8
11517 
11518 Return a copy where all tab characters are expanded using spaces.
11519 
11520 If tabsize is not given, a tab size of 8 characters is assumed.
11521 [clinic start generated code]*/
11522 
11523 static PyObject *
unicode_expandtabs_impl(PyObject * self,int tabsize)11524 unicode_expandtabs_impl(PyObject *self, int tabsize)
11525 /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
11526 {
11527     Py_ssize_t i, j, line_pos, src_len, incr;
11528     Py_UCS4 ch;
11529     PyObject *u;
11530     void *src_data, *dest_data;
11531     int kind;
11532     int found;
11533 
11534     if (PyUnicode_READY(self) == -1)
11535         return NULL;
11536 
11537     /* First pass: determine size of output string */
11538     src_len = PyUnicode_GET_LENGTH(self);
11539     i = j = line_pos = 0;
11540     kind = PyUnicode_KIND(self);
11541     src_data = PyUnicode_DATA(self);
11542     found = 0;
11543     for (; i < src_len; i++) {
11544         ch = PyUnicode_READ(kind, src_data, i);
11545         if (ch == '\t') {
11546             found = 1;
11547             if (tabsize > 0) {
11548                 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11549                 if (j > PY_SSIZE_T_MAX - incr)
11550                     goto overflow;
11551                 line_pos += incr;
11552                 j += incr;
11553             }
11554         }
11555         else {
11556             if (j > PY_SSIZE_T_MAX - 1)
11557                 goto overflow;
11558             line_pos++;
11559             j++;
11560             if (ch == '\n' || ch == '\r')
11561                 line_pos = 0;
11562         }
11563     }
11564     if (!found)
11565         return unicode_result_unchanged(self);
11566 
11567     /* Second pass: create output string and fill it */
11568     u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11569     if (!u)
11570         return NULL;
11571     dest_data = PyUnicode_DATA(u);
11572 
11573     i = j = line_pos = 0;
11574 
11575     for (; i < src_len; i++) {
11576         ch = PyUnicode_READ(kind, src_data, i);
11577         if (ch == '\t') {
11578             if (tabsize > 0) {
11579                 incr = tabsize - (line_pos % tabsize);
11580                 line_pos += incr;
11581                 FILL(kind, dest_data, ' ', j, incr);
11582                 j += incr;
11583             }
11584         }
11585         else {
11586             line_pos++;
11587             PyUnicode_WRITE(kind, dest_data, j, ch);
11588             j++;
11589             if (ch == '\n' || ch == '\r')
11590                 line_pos = 0;
11591         }
11592     }
11593     assert (j == PyUnicode_GET_LENGTH(u));
11594     return unicode_result(u);
11595 
11596   overflow:
11597     PyErr_SetString(PyExc_OverflowError, "new string is too long");
11598     return NULL;
11599 }
11600 
11601 PyDoc_STRVAR(find__doc__,
11602              "S.find(sub[, start[, end]]) -> int\n\
11603 \n\
11604 Return the lowest index in S where substring sub is found,\n\
11605 such that sub is contained within S[start:end].  Optional\n\
11606 arguments start and end are interpreted as in slice notation.\n\
11607 \n\
11608 Return -1 on failure.");
11609 
11610 static PyObject *
unicode_find(PyObject * self,PyObject * args)11611 unicode_find(PyObject *self, PyObject *args)
11612 {
11613     /* initialize variables to prevent gcc warning */
11614     PyObject *substring = NULL;
11615     Py_ssize_t start = 0;
11616     Py_ssize_t end = 0;
11617     Py_ssize_t result;
11618 
11619     if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
11620         return NULL;
11621 
11622     if (PyUnicode_READY(self) == -1)
11623         return NULL;
11624 
11625     result = any_find_slice(self, substring, start, end, 1);
11626 
11627     if (result == -2)
11628         return NULL;
11629 
11630     return PyLong_FromSsize_t(result);
11631 }
11632 
11633 static PyObject *
unicode_getitem(PyObject * self,Py_ssize_t index)11634 unicode_getitem(PyObject *self, Py_ssize_t index)
11635 {
11636     void *data;
11637     enum PyUnicode_Kind kind;
11638     Py_UCS4 ch;
11639 
11640     if (!PyUnicode_Check(self)) {
11641         PyErr_BadArgument();
11642         return NULL;
11643     }
11644     if (PyUnicode_READY(self) == -1) {
11645         return NULL;
11646     }
11647     if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11648         PyErr_SetString(PyExc_IndexError, "string index out of range");
11649         return NULL;
11650     }
11651     kind = PyUnicode_KIND(self);
11652     data = PyUnicode_DATA(self);
11653     ch = PyUnicode_READ(kind, data, index);
11654     return unicode_char(ch);
11655 }
11656 
11657 /* Believe it or not, this produces the same value for ASCII strings
11658    as bytes_hash(). */
11659 static Py_hash_t
unicode_hash(PyObject * self)11660 unicode_hash(PyObject *self)
11661 {
11662     Py_ssize_t len;
11663     Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11664 
11665 #ifdef Py_DEBUG
11666     assert(_Py_HashSecret_Initialized);
11667 #endif
11668     if (_PyUnicode_HASH(self) != -1)
11669         return _PyUnicode_HASH(self);
11670     if (PyUnicode_READY(self) == -1)
11671         return -1;
11672     len = PyUnicode_GET_LENGTH(self);
11673     /*
11674       We make the hash of the empty string be 0, rather than using
11675       (prefix ^ suffix), since this slightly obfuscates the hash secret
11676     */
11677     if (len == 0) {
11678         _PyUnicode_HASH(self) = 0;
11679         return 0;
11680     }
11681     x = _Py_HashBytes(PyUnicode_DATA(self),
11682                       PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11683     _PyUnicode_HASH(self) = x;
11684     return x;
11685 }
11686 
11687 PyDoc_STRVAR(index__doc__,
11688              "S.index(sub[, start[, end]]) -> int\n\
11689 \n\
11690 Return the lowest index in S where substring sub is found, \n\
11691 such that sub is contained within S[start:end].  Optional\n\
11692 arguments start and end are interpreted as in slice notation.\n\
11693 \n\
11694 Raises ValueError when the substring is not found.");
11695 
11696 static PyObject *
unicode_index(PyObject * self,PyObject * args)11697 unicode_index(PyObject *self, PyObject *args)
11698 {
11699     /* initialize variables to prevent gcc warning */
11700     Py_ssize_t result;
11701     PyObject *substring = NULL;
11702     Py_ssize_t start = 0;
11703     Py_ssize_t end = 0;
11704 
11705     if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
11706         return NULL;
11707 
11708     if (PyUnicode_READY(self) == -1)
11709         return NULL;
11710 
11711     result = any_find_slice(self, substring, start, end, 1);
11712 
11713     if (result == -2)
11714         return NULL;
11715 
11716     if (result < 0) {
11717         PyErr_SetString(PyExc_ValueError, "substring not found");
11718         return NULL;
11719     }
11720 
11721     return PyLong_FromSsize_t(result);
11722 }
11723 
11724 /*[clinic input]
11725 str.isascii as unicode_isascii
11726 
11727 Return True if all characters in the string are ASCII, False otherwise.
11728 
11729 ASCII characters have code points in the range U+0000-U+007F.
11730 Empty string is ASCII too.
11731 [clinic start generated code]*/
11732 
11733 static PyObject *
unicode_isascii_impl(PyObject * self)11734 unicode_isascii_impl(PyObject *self)
11735 /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11736 {
11737     if (PyUnicode_READY(self) == -1) {
11738         return NULL;
11739     }
11740     return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11741 }
11742 
11743 /*[clinic input]
11744 str.islower as unicode_islower
11745 
11746 Return True if the string is a lowercase string, False otherwise.
11747 
11748 A string is lowercase if all cased characters in the string are lowercase and
11749 there is at least one cased character in the string.
11750 [clinic start generated code]*/
11751 
11752 static PyObject *
unicode_islower_impl(PyObject * self)11753 unicode_islower_impl(PyObject *self)
11754 /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
11755 {
11756     Py_ssize_t i, length;
11757     int kind;
11758     void *data;
11759     int cased;
11760 
11761     if (PyUnicode_READY(self) == -1)
11762         return NULL;
11763     length = PyUnicode_GET_LENGTH(self);
11764     kind = PyUnicode_KIND(self);
11765     data = PyUnicode_DATA(self);
11766 
11767     /* Shortcut for single character strings */
11768     if (length == 1)
11769         return PyBool_FromLong(
11770             Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11771 
11772     /* Special case for empty strings */
11773     if (length == 0)
11774         Py_RETURN_FALSE;
11775 
11776     cased = 0;
11777     for (i = 0; i < length; i++) {
11778         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11779 
11780         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11781             Py_RETURN_FALSE;
11782         else if (!cased && Py_UNICODE_ISLOWER(ch))
11783             cased = 1;
11784     }
11785     return PyBool_FromLong(cased);
11786 }
11787 
11788 /*[clinic input]
11789 str.isupper as unicode_isupper
11790 
11791 Return True if the string is an uppercase string, False otherwise.
11792 
11793 A string is uppercase if all cased characters in the string are uppercase and
11794 there is at least one cased character in the string.
11795 [clinic start generated code]*/
11796 
11797 static PyObject *
unicode_isupper_impl(PyObject * self)11798 unicode_isupper_impl(PyObject *self)
11799 /*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
11800 {
11801     Py_ssize_t i, length;
11802     int kind;
11803     void *data;
11804     int cased;
11805 
11806     if (PyUnicode_READY(self) == -1)
11807         return NULL;
11808     length = PyUnicode_GET_LENGTH(self);
11809     kind = PyUnicode_KIND(self);
11810     data = PyUnicode_DATA(self);
11811 
11812     /* Shortcut for single character strings */
11813     if (length == 1)
11814         return PyBool_FromLong(
11815             Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11816 
11817     /* Special case for empty strings */
11818     if (length == 0)
11819         Py_RETURN_FALSE;
11820 
11821     cased = 0;
11822     for (i = 0; i < length; i++) {
11823         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11824 
11825         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11826             Py_RETURN_FALSE;
11827         else if (!cased && Py_UNICODE_ISUPPER(ch))
11828             cased = 1;
11829     }
11830     return PyBool_FromLong(cased);
11831 }
11832 
11833 /*[clinic input]
11834 str.istitle as unicode_istitle
11835 
11836 Return True if the string is a title-cased string, False otherwise.
11837 
11838 In a title-cased string, upper- and title-case characters may only
11839 follow uncased characters and lowercase characters only cased ones.
11840 [clinic start generated code]*/
11841 
11842 static PyObject *
unicode_istitle_impl(PyObject * self)11843 unicode_istitle_impl(PyObject *self)
11844 /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
11845 {
11846     Py_ssize_t i, length;
11847     int kind;
11848     void *data;
11849     int cased, previous_is_cased;
11850 
11851     if (PyUnicode_READY(self) == -1)
11852         return NULL;
11853     length = PyUnicode_GET_LENGTH(self);
11854     kind = PyUnicode_KIND(self);
11855     data = PyUnicode_DATA(self);
11856 
11857     /* Shortcut for single character strings */
11858     if (length == 1) {
11859         Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11860         return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11861                                (Py_UNICODE_ISUPPER(ch) != 0));
11862     }
11863 
11864     /* Special case for empty strings */
11865     if (length == 0)
11866         Py_RETURN_FALSE;
11867 
11868     cased = 0;
11869     previous_is_cased = 0;
11870     for (i = 0; i < length; i++) {
11871         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11872 
11873         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11874             if (previous_is_cased)
11875                 Py_RETURN_FALSE;
11876             previous_is_cased = 1;
11877             cased = 1;
11878         }
11879         else if (Py_UNICODE_ISLOWER(ch)) {
11880             if (!previous_is_cased)
11881                 Py_RETURN_FALSE;
11882             previous_is_cased = 1;
11883             cased = 1;
11884         }
11885         else
11886             previous_is_cased = 0;
11887     }
11888     return PyBool_FromLong(cased);
11889 }
11890 
11891 /*[clinic input]
11892 str.isspace as unicode_isspace
11893 
11894 Return True if the string is a whitespace string, False otherwise.
11895 
11896 A string is whitespace if all characters in the string are whitespace and there
11897 is at least one character in the string.
11898 [clinic start generated code]*/
11899 
11900 static PyObject *
unicode_isspace_impl(PyObject * self)11901 unicode_isspace_impl(PyObject *self)
11902 /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
11903 {
11904     Py_ssize_t i, length;
11905     int kind;
11906     void *data;
11907 
11908     if (PyUnicode_READY(self) == -1)
11909         return NULL;
11910     length = PyUnicode_GET_LENGTH(self);
11911     kind = PyUnicode_KIND(self);
11912     data = PyUnicode_DATA(self);
11913 
11914     /* Shortcut for single character strings */
11915     if (length == 1)
11916         return PyBool_FromLong(
11917             Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11918 
11919     /* Special case for empty strings */
11920     if (length == 0)
11921         Py_RETURN_FALSE;
11922 
11923     for (i = 0; i < length; i++) {
11924         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11925         if (!Py_UNICODE_ISSPACE(ch))
11926             Py_RETURN_FALSE;
11927     }
11928     Py_RETURN_TRUE;
11929 }
11930 
11931 /*[clinic input]
11932 str.isalpha as unicode_isalpha
11933 
11934 Return True if the string is an alphabetic string, False otherwise.
11935 
11936 A string is alphabetic if all characters in the string are alphabetic and there
11937 is at least one character in the string.
11938 [clinic start generated code]*/
11939 
11940 static PyObject *
unicode_isalpha_impl(PyObject * self)11941 unicode_isalpha_impl(PyObject *self)
11942 /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
11943 {
11944     Py_ssize_t i, length;
11945     int kind;
11946     void *data;
11947 
11948     if (PyUnicode_READY(self) == -1)
11949         return NULL;
11950     length = PyUnicode_GET_LENGTH(self);
11951     kind = PyUnicode_KIND(self);
11952     data = PyUnicode_DATA(self);
11953 
11954     /* Shortcut for single character strings */
11955     if (length == 1)
11956         return PyBool_FromLong(
11957             Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11958 
11959     /* Special case for empty strings */
11960     if (length == 0)
11961         Py_RETURN_FALSE;
11962 
11963     for (i = 0; i < length; i++) {
11964         if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11965             Py_RETURN_FALSE;
11966     }
11967     Py_RETURN_TRUE;
11968 }
11969 
11970 /*[clinic input]
11971 str.isalnum as unicode_isalnum
11972 
11973 Return True if the string is an alpha-numeric string, False otherwise.
11974 
11975 A string is alpha-numeric if all characters in the string are alpha-numeric and
11976 there is at least one character in the string.
11977 [clinic start generated code]*/
11978 
11979 static PyObject *
unicode_isalnum_impl(PyObject * self)11980 unicode_isalnum_impl(PyObject *self)
11981 /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
11982 {
11983     int kind;
11984     void *data;
11985     Py_ssize_t len, i;
11986 
11987     if (PyUnicode_READY(self) == -1)
11988         return NULL;
11989 
11990     kind = PyUnicode_KIND(self);
11991     data = PyUnicode_DATA(self);
11992     len = PyUnicode_GET_LENGTH(self);
11993 
11994     /* Shortcut for single character strings */
11995     if (len == 1) {
11996         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11997         return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11998     }
11999 
12000     /* Special case for empty strings */
12001     if (len == 0)
12002         Py_RETURN_FALSE;
12003 
12004     for (i = 0; i < len; i++) {
12005         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12006         if (!Py_UNICODE_ISALNUM(ch))
12007             Py_RETURN_FALSE;
12008     }
12009     Py_RETURN_TRUE;
12010 }
12011 
12012 /*[clinic input]
12013 str.isdecimal as unicode_isdecimal
12014 
12015 Return True if the string is a decimal string, False otherwise.
12016 
12017 A string is a decimal string if all characters in the string are decimal and
12018 there is at least one character in the string.
12019 [clinic start generated code]*/
12020 
12021 static PyObject *
unicode_isdecimal_impl(PyObject * self)12022 unicode_isdecimal_impl(PyObject *self)
12023 /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
12024 {
12025     Py_ssize_t i, length;
12026     int kind;
12027     void *data;
12028 
12029     if (PyUnicode_READY(self) == -1)
12030         return NULL;
12031     length = PyUnicode_GET_LENGTH(self);
12032     kind = PyUnicode_KIND(self);
12033     data = PyUnicode_DATA(self);
12034 
12035     /* Shortcut for single character strings */
12036     if (length == 1)
12037         return PyBool_FromLong(
12038             Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12039 
12040     /* Special case for empty strings */
12041     if (length == 0)
12042         Py_RETURN_FALSE;
12043 
12044     for (i = 0; i < length; i++) {
12045         if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12046             Py_RETURN_FALSE;
12047     }
12048     Py_RETURN_TRUE;
12049 }
12050 
12051 /*[clinic input]
12052 str.isdigit as unicode_isdigit
12053 
12054 Return True if the string is a digit string, False otherwise.
12055 
12056 A string is a digit string if all characters in the string are digits and there
12057 is at least one character in the string.
12058 [clinic start generated code]*/
12059 
12060 static PyObject *
unicode_isdigit_impl(PyObject * self)12061 unicode_isdigit_impl(PyObject *self)
12062 /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
12063 {
12064     Py_ssize_t i, length;
12065     int kind;
12066     void *data;
12067 
12068     if (PyUnicode_READY(self) == -1)
12069         return NULL;
12070     length = PyUnicode_GET_LENGTH(self);
12071     kind = PyUnicode_KIND(self);
12072     data = PyUnicode_DATA(self);
12073 
12074     /* Shortcut for single character strings */
12075     if (length == 1) {
12076         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12077         return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12078     }
12079 
12080     /* Special case for empty strings */
12081     if (length == 0)
12082         Py_RETURN_FALSE;
12083 
12084     for (i = 0; i < length; i++) {
12085         if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12086             Py_RETURN_FALSE;
12087     }
12088     Py_RETURN_TRUE;
12089 }
12090 
12091 /*[clinic input]
12092 str.isnumeric as unicode_isnumeric
12093 
12094 Return True if the string is a numeric string, False otherwise.
12095 
12096 A string is numeric if all characters in the string are numeric and there is at
12097 least one character in the string.
12098 [clinic start generated code]*/
12099 
12100 static PyObject *
unicode_isnumeric_impl(PyObject * self)12101 unicode_isnumeric_impl(PyObject *self)
12102 /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
12103 {
12104     Py_ssize_t i, length;
12105     int kind;
12106     void *data;
12107 
12108     if (PyUnicode_READY(self) == -1)
12109         return NULL;
12110     length = PyUnicode_GET_LENGTH(self);
12111     kind = PyUnicode_KIND(self);
12112     data = PyUnicode_DATA(self);
12113 
12114     /* Shortcut for single character strings */
12115     if (length == 1)
12116         return PyBool_FromLong(
12117             Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12118 
12119     /* Special case for empty strings */
12120     if (length == 0)
12121         Py_RETURN_FALSE;
12122 
12123     for (i = 0; i < length; i++) {
12124         if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12125             Py_RETURN_FALSE;
12126     }
12127     Py_RETURN_TRUE;
12128 }
12129 
12130 int
PyUnicode_IsIdentifier(PyObject * self)12131 PyUnicode_IsIdentifier(PyObject *self)
12132 {
12133     int kind;
12134     void *data;
12135     Py_ssize_t i;
12136     Py_UCS4 first;
12137 
12138     if (PyUnicode_READY(self) == -1) {
12139         Py_FatalError("identifier not ready");
12140         return 0;
12141     }
12142 
12143     /* Special case for empty strings */
12144     if (PyUnicode_GET_LENGTH(self) == 0)
12145         return 0;
12146     kind = PyUnicode_KIND(self);
12147     data = PyUnicode_DATA(self);
12148 
12149     /* PEP 3131 says that the first character must be in
12150        XID_Start and subsequent characters in XID_Continue,
12151        and for the ASCII range, the 2.x rules apply (i.e
12152        start with letters and underscore, continue with
12153        letters, digits, underscore). However, given the current
12154        definition of XID_Start and XID_Continue, it is sufficient
12155        to check just for these, except that _ must be allowed
12156        as starting an identifier.  */
12157     first = PyUnicode_READ(kind, data, 0);
12158     if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
12159         return 0;
12160 
12161     for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
12162         if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
12163             return 0;
12164     return 1;
12165 }
12166 
12167 /*[clinic input]
12168 str.isidentifier as unicode_isidentifier
12169 
12170 Return True if the string is a valid Python identifier, False otherwise.
12171 
12172 Use keyword.iskeyword() to test for reserved identifiers such as "def" and
12173 "class".
12174 [clinic start generated code]*/
12175 
static PyObject *
unicode_isidentifier_impl(PyObject *self)
/*[clinic end generated code: output=fe585a9666572905 input=916b0a3c9f57e919]*/
{
    /* Convert the C-level int answer of PyUnicode_IsIdentifier() into a
       Python bool. */
    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
}
12182 
12183 /*[clinic input]
12184 str.isprintable as unicode_isprintable
12185 
12186 Return True if the string is printable, False otherwise.
12187 
12188 A string is printable if all of its characters are considered printable in
12189 repr() or if it is empty.
12190 [clinic start generated code]*/
12191 
12192 static PyObject *
unicode_isprintable_impl(PyObject * self)12193 unicode_isprintable_impl(PyObject *self)
12194 /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
12195 {
12196     Py_ssize_t i, length;
12197     int kind;
12198     void *data;
12199 
12200     if (PyUnicode_READY(self) == -1)
12201         return NULL;
12202     length = PyUnicode_GET_LENGTH(self);
12203     kind = PyUnicode_KIND(self);
12204     data = PyUnicode_DATA(self);
12205 
12206     /* Shortcut for single character strings */
12207     if (length == 1)
12208         return PyBool_FromLong(
12209             Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12210 
12211     for (i = 0; i < length; i++) {
12212         if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12213             Py_RETURN_FALSE;
12214         }
12215     }
12216     Py_RETURN_TRUE;
12217 }
12218 
12219 /*[clinic input]
12220 str.join as unicode_join
12221 
12222     iterable: object
12223     /
12224 
12225 Concatenate any number of strings.
12226 
12227 The string whose method is called is inserted in between each given string.
12228 The result is returned as a new string.
12229 
12230 Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12231 [clinic start generated code]*/
12232 
static PyObject *
unicode_join(PyObject *self, PyObject *iterable)
/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
{
    /* Method entry point only: 'self' is the separator, and all the work
       is done by the public PyUnicode_Join(). */
    return PyUnicode_Join(self, iterable);
}
12239 
12240 static Py_ssize_t
unicode_length(PyObject * self)12241 unicode_length(PyObject *self)
12242 {
12243     if (PyUnicode_READY(self) == -1)
12244         return -1;
12245     return PyUnicode_GET_LENGTH(self);
12246 }
12247 
12248 /*[clinic input]
12249 str.ljust as unicode_ljust
12250 
12251     width: Py_ssize_t
12252     fillchar: Py_UCS4 = ' '
12253     /
12254 
12255 Return a left-justified string of length width.
12256 
12257 Padding is done using the specified fill character (default is a space).
12258 [clinic start generated code]*/
12259 
12260 static PyObject *
unicode_ljust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12261 unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12262 /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12263 {
12264     if (PyUnicode_READY(self) == -1)
12265         return NULL;
12266 
12267     if (PyUnicode_GET_LENGTH(self) >= width)
12268         return unicode_result_unchanged(self);
12269 
12270     return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12271 }
12272 
12273 /*[clinic input]
12274 str.lower as unicode_lower
12275 
12276 Return a copy of the string converted to lowercase.
12277 [clinic start generated code]*/
12278 
12279 static PyObject *
unicode_lower_impl(PyObject * self)12280 unicode_lower_impl(PyObject *self)
12281 /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12282 {
12283     if (PyUnicode_READY(self) == -1)
12284         return NULL;
12285     if (PyUnicode_IS_ASCII(self))
12286         return ascii_upper_or_lower(self, 1);
12287     return case_operation(self, do_lower);
12288 }
12289 
/* Strip modes; _PyUnicode_XStrip() receives one of these as its
   'striptype' argument. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Method name for strip mode i; the index is not range-checked. */
#define STRIPNAME(i) (stripfuncnames[i])
12298 
12299 /* externally visible for str.strip(unicode) */
12300 PyObject *
_PyUnicode_XStrip(PyObject * self,int striptype,PyObject * sepobj)12301 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12302 {
12303     void *data;
12304     int kind;
12305     Py_ssize_t i, j, len;
12306     BLOOM_MASK sepmask;
12307     Py_ssize_t seplen;
12308 
12309     if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12310         return NULL;
12311 
12312     kind = PyUnicode_KIND(self);
12313     data = PyUnicode_DATA(self);
12314     len = PyUnicode_GET_LENGTH(self);
12315     seplen = PyUnicode_GET_LENGTH(sepobj);
12316     sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12317                               PyUnicode_DATA(sepobj),
12318                               seplen);
12319 
12320     i = 0;
12321     if (striptype != RIGHTSTRIP) {
12322         while (i < len) {
12323             Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12324             if (!BLOOM(sepmask, ch))
12325                 break;
12326             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12327                 break;
12328             i++;
12329         }
12330     }
12331 
12332     j = len;
12333     if (striptype != LEFTSTRIP) {
12334         j--;
12335         while (j >= i) {
12336             Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12337             if (!BLOOM(sepmask, ch))
12338                 break;
12339             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12340                 break;
12341             j--;
12342         }
12343 
12344         j++;
12345     }
12346 
12347     return PyUnicode_Substring(self, i, j);
12348 }
12349 
12350 PyObject*
PyUnicode_Substring(PyObject * self,Py_ssize_t start,Py_ssize_t end)12351 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12352 {
12353     unsigned char *data;
12354     int kind;
12355     Py_ssize_t length;
12356 
12357     if (PyUnicode_READY(self) == -1)
12358         return NULL;
12359 
12360     length = PyUnicode_GET_LENGTH(self);
12361     end = Py_MIN(end, length);
12362 
12363     if (start == 0 && end == length)
12364         return unicode_result_unchanged(self);
12365 
12366     if (start < 0 || end < 0) {
12367         PyErr_SetString(PyExc_IndexError, "string index out of range");
12368         return NULL;
12369     }
12370     if (start >= length || end < start)
12371         _Py_RETURN_UNICODE_EMPTY();
12372 
12373     length = end - start;
12374     if (PyUnicode_IS_ASCII(self)) {
12375         data = PyUnicode_1BYTE_DATA(self);
12376         return _PyUnicode_FromASCII((char*)(data + start), length);
12377     }
12378     else {
12379         kind = PyUnicode_KIND(self);
12380         data = PyUnicode_1BYTE_DATA(self);
12381         return PyUnicode_FromKindAndData(kind,
12382                                          data + kind * start,
12383                                          length);
12384     }
12385 }
12386 
12387 static PyObject *
do_strip(PyObject * self,int striptype)12388 do_strip(PyObject *self, int striptype)
12389 {
12390     Py_ssize_t len, i, j;
12391 
12392     if (PyUnicode_READY(self) == -1)
12393         return NULL;
12394 
12395     len = PyUnicode_GET_LENGTH(self);
12396 
12397     if (PyUnicode_IS_ASCII(self)) {
12398         Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12399 
12400         i = 0;
12401         if (striptype != RIGHTSTRIP) {
12402             while (i < len) {
12403                 Py_UCS1 ch = data[i];
12404                 if (!_Py_ascii_whitespace[ch])
12405                     break;
12406                 i++;
12407             }
12408         }
12409 
12410         j = len;
12411         if (striptype != LEFTSTRIP) {
12412             j--;
12413             while (j >= i) {
12414                 Py_UCS1 ch = data[j];
12415                 if (!_Py_ascii_whitespace[ch])
12416                     break;
12417                 j--;
12418             }
12419             j++;
12420         }
12421     }
12422     else {
12423         int kind = PyUnicode_KIND(self);
12424         void *data = PyUnicode_DATA(self);
12425 
12426         i = 0;
12427         if (striptype != RIGHTSTRIP) {
12428             while (i < len) {
12429                 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12430                 if (!Py_UNICODE_ISSPACE(ch))
12431                     break;
12432                 i++;
12433             }
12434         }
12435 
12436         j = len;
12437         if (striptype != LEFTSTRIP) {
12438             j--;
12439             while (j >= i) {
12440                 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12441                 if (!Py_UNICODE_ISSPACE(ch))
12442                     break;
12443                 j--;
12444             }
12445             j++;
12446         }
12447     }
12448 
12449     return PyUnicode_Substring(self, i, j);
12450 }
12451 
12452 
12453 static PyObject *
do_argstrip(PyObject * self,int striptype,PyObject * sep)12454 do_argstrip(PyObject *self, int striptype, PyObject *sep)
12455 {
12456     if (sep != NULL && sep != Py_None) {
12457         if (PyUnicode_Check(sep))
12458             return _PyUnicode_XStrip(self, striptype, sep);
12459         else {
12460             PyErr_Format(PyExc_TypeError,
12461                          "%s arg must be None or str",
12462                          STRIPNAME(striptype));
12463             return NULL;
12464         }
12465     }
12466 
12467     return do_strip(self, striptype);
12468 }
12469 
12470 
12471 /*[clinic input]
12472 str.strip as unicode_strip
12473 
12474     chars: object = None
12475     /
12476 
12477 Return a copy of the string with leading and trailing whitespace removed.
12478 
12479 If chars is given and not None, remove characters in chars instead.
12480 [clinic start generated code]*/
12481 
12482 static PyObject *
unicode_strip_impl(PyObject * self,PyObject * chars)12483 unicode_strip_impl(PyObject *self, PyObject *chars)
12484 /*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
12485 {
12486     return do_argstrip(self, BOTHSTRIP, chars);
12487 }
12488 
12489 
12490 /*[clinic input]
12491 str.lstrip as unicode_lstrip
12492 
12493     chars: object = NULL
12494     /
12495 
12496 Return a copy of the string with leading whitespace removed.
12497 
12498 If chars is given and not None, remove characters in chars instead.
12499 [clinic start generated code]*/
12500 
12501 static PyObject *
unicode_lstrip_impl(PyObject * self,PyObject * chars)12502 unicode_lstrip_impl(PyObject *self, PyObject *chars)
12503 /*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
12504 {
12505     return do_argstrip(self, LEFTSTRIP, chars);
12506 }
12507 
12508 
12509 /*[clinic input]
12510 str.rstrip as unicode_rstrip
12511 
12512     chars: object = NULL
12513     /
12514 
12515 Return a copy of the string with trailing whitespace removed.
12516 
12517 If chars is given and not None, remove characters in chars instead.
12518 [clinic start generated code]*/
12519 
12520 static PyObject *
unicode_rstrip_impl(PyObject * self,PyObject * chars)12521 unicode_rstrip_impl(PyObject *self, PyObject *chars)
12522 /*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
12523 {
12524     return do_argstrip(self, RIGHTSTRIP, chars);
12525 }
12526 
12527 
12528 static PyObject*
unicode_repeat(PyObject * str,Py_ssize_t len)12529 unicode_repeat(PyObject *str, Py_ssize_t len)
12530 {
12531     PyObject *u;
12532     Py_ssize_t nchars, n;
12533 
12534     if (len < 1)
12535         _Py_RETURN_UNICODE_EMPTY();
12536 
12537     /* no repeat, return original string */
12538     if (len == 1)
12539         return unicode_result_unchanged(str);
12540 
12541     if (PyUnicode_READY(str) == -1)
12542         return NULL;
12543 
12544     if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12545         PyErr_SetString(PyExc_OverflowError,
12546                         "repeated string is too long");
12547         return NULL;
12548     }
12549     nchars = len * PyUnicode_GET_LENGTH(str);
12550 
12551     u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12552     if (!u)
12553         return NULL;
12554     assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12555 
12556     if (PyUnicode_GET_LENGTH(str) == 1) {
12557         const int kind = PyUnicode_KIND(str);
12558         const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12559         if (kind == PyUnicode_1BYTE_KIND) {
12560             void *to = PyUnicode_DATA(u);
12561             memset(to, (unsigned char)fill_char, len);
12562         }
12563         else if (kind == PyUnicode_2BYTE_KIND) {
12564             Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12565             for (n = 0; n < len; ++n)
12566                 ucs2[n] = fill_char;
12567         } else {
12568             Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12569             assert(kind == PyUnicode_4BYTE_KIND);
12570             for (n = 0; n < len; ++n)
12571                 ucs4[n] = fill_char;
12572         }
12573     }
12574     else {
12575         /* number of characters copied this far */
12576         Py_ssize_t done = PyUnicode_GET_LENGTH(str);
12577         const Py_ssize_t char_size = PyUnicode_KIND(str);
12578         char *to = (char *) PyUnicode_DATA(u);
12579         memcpy(to, PyUnicode_DATA(str),
12580                   PyUnicode_GET_LENGTH(str) * char_size);
12581         while (done < nchars) {
12582             n = (done <= nchars-done) ? done : nchars-done;
12583             memcpy(to + (done * char_size), to, n * char_size);
12584             done += n;
12585         }
12586     }
12587 
12588     assert(_PyUnicode_CheckConsistency(u, 1));
12589     return u;
12590 }
12591 
12592 PyObject *
PyUnicode_Replace(PyObject * str,PyObject * substr,PyObject * replstr,Py_ssize_t maxcount)12593 PyUnicode_Replace(PyObject *str,
12594                   PyObject *substr,
12595                   PyObject *replstr,
12596                   Py_ssize_t maxcount)
12597 {
12598     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12599             ensure_unicode(replstr) < 0)
12600         return NULL;
12601     return replace(str, substr, replstr, maxcount);
12602 }
12603 
12604 /*[clinic input]
12605 str.replace as unicode_replace
12606 
12607     old: unicode
12608     new: unicode
12609     count: Py_ssize_t = -1
12610         Maximum number of occurrences to replace.
12611         -1 (the default value) means replace all occurrences.
12612     /
12613 
12614 Return a copy with all occurrences of substring old replaced by new.
12615 
12616 If the optional argument count is given, only the first count occurrences are
12617 replaced.
12618 [clinic start generated code]*/
12619 
12620 static PyObject *
unicode_replace_impl(PyObject * self,PyObject * old,PyObject * new,Py_ssize_t count)12621 unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12622                      Py_ssize_t count)
12623 /*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
12624 {
12625     if (PyUnicode_READY(self) == -1)
12626         return NULL;
12627     return replace(self, old, new, count);
12628 }
12629 
12630 static PyObject *
unicode_repr(PyObject * unicode)12631 unicode_repr(PyObject *unicode)
12632 {
12633     PyObject *repr;
12634     Py_ssize_t isize;
12635     Py_ssize_t osize, squote, dquote, i, o;
12636     Py_UCS4 max, quote;
12637     int ikind, okind, unchanged;
12638     void *idata, *odata;
12639 
12640     if (PyUnicode_READY(unicode) == -1)
12641         return NULL;
12642 
12643     isize = PyUnicode_GET_LENGTH(unicode);
12644     idata = PyUnicode_DATA(unicode);
12645 
12646     /* Compute length of output, quote characters, and
12647        maximum character */
12648     osize = 0;
12649     max = 127;
12650     squote = dquote = 0;
12651     ikind = PyUnicode_KIND(unicode);
12652     for (i = 0; i < isize; i++) {
12653         Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12654         Py_ssize_t incr = 1;
12655         switch (ch) {
12656         case '\'': squote++; break;
12657         case '"':  dquote++; break;
12658         case '\\': case '\t': case '\r': case '\n':
12659             incr = 2;
12660             break;
12661         default:
12662             /* Fast-path ASCII */
12663             if (ch < ' ' || ch == 0x7f)
12664                 incr = 4; /* \xHH */
12665             else if (ch < 0x7f)
12666                 ;
12667             else if (Py_UNICODE_ISPRINTABLE(ch))
12668                 max = ch > max ? ch : max;
12669             else if (ch < 0x100)
12670                 incr = 4; /* \xHH */
12671             else if (ch < 0x10000)
12672                 incr = 6; /* \uHHHH */
12673             else
12674                 incr = 10; /* \uHHHHHHHH */
12675         }
12676         if (osize > PY_SSIZE_T_MAX - incr) {
12677             PyErr_SetString(PyExc_OverflowError,
12678                             "string is too long to generate repr");
12679             return NULL;
12680         }
12681         osize += incr;
12682     }
12683 
12684     quote = '\'';
12685     unchanged = (osize == isize);
12686     if (squote) {
12687         unchanged = 0;
12688         if (dquote)
12689             /* Both squote and dquote present. Use squote,
12690                and escape them */
12691             osize += squote;
12692         else
12693             quote = '"';
12694     }
12695     osize += 2;   /* quotes */
12696 
12697     repr = PyUnicode_New(osize, max);
12698     if (repr == NULL)
12699         return NULL;
12700     okind = PyUnicode_KIND(repr);
12701     odata = PyUnicode_DATA(repr);
12702 
12703     PyUnicode_WRITE(okind, odata, 0, quote);
12704     PyUnicode_WRITE(okind, odata, osize-1, quote);
12705     if (unchanged) {
12706         _PyUnicode_FastCopyCharacters(repr, 1,
12707                                       unicode, 0,
12708                                       isize);
12709     }
12710     else {
12711         for (i = 0, o = 1; i < isize; i++) {
12712             Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12713 
12714             /* Escape quotes and backslashes */
12715             if ((ch == quote) || (ch == '\\')) {
12716                 PyUnicode_WRITE(okind, odata, o++, '\\');
12717                 PyUnicode_WRITE(okind, odata, o++, ch);
12718                 continue;
12719             }
12720 
12721             /* Map special whitespace to '\t', \n', '\r' */
12722             if (ch == '\t') {
12723                 PyUnicode_WRITE(okind, odata, o++, '\\');
12724                 PyUnicode_WRITE(okind, odata, o++, 't');
12725             }
12726             else if (ch == '\n') {
12727                 PyUnicode_WRITE(okind, odata, o++, '\\');
12728                 PyUnicode_WRITE(okind, odata, o++, 'n');
12729             }
12730             else if (ch == '\r') {
12731                 PyUnicode_WRITE(okind, odata, o++, '\\');
12732                 PyUnicode_WRITE(okind, odata, o++, 'r');
12733             }
12734 
12735             /* Map non-printable US ASCII to '\xhh' */
12736             else if (ch < ' ' || ch == 0x7F) {
12737                 PyUnicode_WRITE(okind, odata, o++, '\\');
12738                 PyUnicode_WRITE(okind, odata, o++, 'x');
12739                 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12740                 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12741             }
12742 
12743             /* Copy ASCII characters as-is */
12744             else if (ch < 0x7F) {
12745                 PyUnicode_WRITE(okind, odata, o++, ch);
12746             }
12747 
12748             /* Non-ASCII characters */
12749             else {
12750                 /* Map Unicode whitespace and control characters
12751                    (categories Z* and C* except ASCII space)
12752                 */
12753                 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12754                     PyUnicode_WRITE(okind, odata, o++, '\\');
12755                     /* Map 8-bit characters to '\xhh' */
12756                     if (ch <= 0xff) {
12757                         PyUnicode_WRITE(okind, odata, o++, 'x');
12758                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12759                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12760                     }
12761                     /* Map 16-bit characters to '\uxxxx' */
12762                     else if (ch <= 0xffff) {
12763                         PyUnicode_WRITE(okind, odata, o++, 'u');
12764                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12765                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12766                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12767                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12768                     }
12769                     /* Map 21-bit characters to '\U00xxxxxx' */
12770                     else {
12771                         PyUnicode_WRITE(okind, odata, o++, 'U');
12772                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12773                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12774                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12775                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12776                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12777                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12778                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12779                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12780                     }
12781                 }
12782                 /* Copy characters as-is */
12783                 else {
12784                     PyUnicode_WRITE(okind, odata, o++, ch);
12785                 }
12786             }
12787         }
12788     }
12789     /* Closing quote already added at the beginning */
12790     assert(_PyUnicode_CheckConsistency(repr, 1));
12791     return repr;
12792 }
12793 
12794 PyDoc_STRVAR(rfind__doc__,
12795              "S.rfind(sub[, start[, end]]) -> int\n\
12796 \n\
12797 Return the highest index in S where substring sub is found,\n\
12798 such that sub is contained within S[start:end].  Optional\n\
12799 arguments start and end are interpreted as in slice notation.\n\
12800 \n\
12801 Return -1 on failure.");
12802 
12803 static PyObject *
unicode_rfind(PyObject * self,PyObject * args)12804 unicode_rfind(PyObject *self, PyObject *args)
12805 {
12806     /* initialize variables to prevent gcc warning */
12807     PyObject *substring = NULL;
12808     Py_ssize_t start = 0;
12809     Py_ssize_t end = 0;
12810     Py_ssize_t result;
12811 
12812     if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
12813         return NULL;
12814 
12815     if (PyUnicode_READY(self) == -1)
12816         return NULL;
12817 
12818     result = any_find_slice(self, substring, start, end, -1);
12819 
12820     if (result == -2)
12821         return NULL;
12822 
12823     return PyLong_FromSsize_t(result);
12824 }
12825 
12826 PyDoc_STRVAR(rindex__doc__,
12827              "S.rindex(sub[, start[, end]]) -> int\n\
12828 \n\
12829 Return the highest index in S where substring sub is found,\n\
12830 such that sub is contained within S[start:end].  Optional\n\
12831 arguments start and end are interpreted as in slice notation.\n\
12832 \n\
12833 Raises ValueError when the substring is not found.");
12834 
12835 static PyObject *
unicode_rindex(PyObject * self,PyObject * args)12836 unicode_rindex(PyObject *self, PyObject *args)
12837 {
12838     /* initialize variables to prevent gcc warning */
12839     PyObject *substring = NULL;
12840     Py_ssize_t start = 0;
12841     Py_ssize_t end = 0;
12842     Py_ssize_t result;
12843 
12844     if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
12845         return NULL;
12846 
12847     if (PyUnicode_READY(self) == -1)
12848         return NULL;
12849 
12850     result = any_find_slice(self, substring, start, end, -1);
12851 
12852     if (result == -2)
12853         return NULL;
12854 
12855     if (result < 0) {
12856         PyErr_SetString(PyExc_ValueError, "substring not found");
12857         return NULL;
12858     }
12859 
12860     return PyLong_FromSsize_t(result);
12861 }
12862 
12863 /*[clinic input]
12864 str.rjust as unicode_rjust
12865 
12866     width: Py_ssize_t
12867     fillchar: Py_UCS4 = ' '
12868     /
12869 
12870 Return a right-justified string of length width.
12871 
12872 Padding is done using the specified fill character (default is a space).
12873 [clinic start generated code]*/
12874 
12875 static PyObject *
unicode_rjust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12876 unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12877 /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12878 {
12879     if (PyUnicode_READY(self) == -1)
12880         return NULL;
12881 
12882     if (PyUnicode_GET_LENGTH(self) >= width)
12883         return unicode_result_unchanged(self);
12884 
12885     return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12886 }
12887 
12888 PyObject *
PyUnicode_Split(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)12889 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12890 {
12891     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12892         return NULL;
12893 
12894     return split(s, sep, maxsplit);
12895 }
12896 
12897 /*[clinic input]
12898 str.split as unicode_split
12899 
12900     sep: object = None
12901         The delimiter according which to split the string.
12902         None (the default value) means split according to any whitespace,
12903         and discard empty strings from the result.
12904     maxsplit: Py_ssize_t = -1
12905         Maximum number of splits to do.
12906         -1 (the default value) means no limit.
12907 
12908 Return a list of the words in the string, using sep as the delimiter string.
12909 [clinic start generated code]*/
12910 
12911 static PyObject *
unicode_split_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)12912 unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12913 /*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
12914 {
12915     if (sep == Py_None)
12916         return split(self, NULL, maxsplit);
12917     if (PyUnicode_Check(sep))
12918         return split(self, sep, maxsplit);
12919 
12920     PyErr_Format(PyExc_TypeError,
12921                  "must be str or None, not %.100s",
12922                  Py_TYPE(sep)->tp_name);
12923     return NULL;
12924 }
12925 
12926 PyObject *
PyUnicode_Partition(PyObject * str_obj,PyObject * sep_obj)12927 PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
12928 {
12929     PyObject* out;
12930     int kind1, kind2;
12931     void *buf1, *buf2;
12932     Py_ssize_t len1, len2;
12933 
12934     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12935         return NULL;
12936 
12937     kind1 = PyUnicode_KIND(str_obj);
12938     kind2 = PyUnicode_KIND(sep_obj);
12939     len1 = PyUnicode_GET_LENGTH(str_obj);
12940     len2 = PyUnicode_GET_LENGTH(sep_obj);
12941     if (kind1 < kind2 || len1 < len2) {
12942         _Py_INCREF_UNICODE_EMPTY();
12943         if (!unicode_empty)
12944             out = NULL;
12945         else {
12946             out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12947             Py_DECREF(unicode_empty);
12948         }
12949         return out;
12950     }
12951     buf1 = PyUnicode_DATA(str_obj);
12952     buf2 = PyUnicode_DATA(sep_obj);
12953     if (kind2 != kind1) {
12954         buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12955         if (!buf2)
12956             return NULL;
12957     }
12958 
12959     switch (kind1) {
12960     case PyUnicode_1BYTE_KIND:
12961         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12962             out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12963         else
12964             out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12965         break;
12966     case PyUnicode_2BYTE_KIND:
12967         out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12968         break;
12969     case PyUnicode_4BYTE_KIND:
12970         out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12971         break;
12972     default:
12973         Py_UNREACHABLE();
12974     }
12975 
12976     if (kind2 != kind1)
12977         PyMem_Free(buf2);
12978 
12979     return out;
12980 }
12981 
12982 
12983 PyObject *
PyUnicode_RPartition(PyObject * str_obj,PyObject * sep_obj)12984 PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
12985 {
12986     PyObject* out;
12987     int kind1, kind2;
12988     void *buf1, *buf2;
12989     Py_ssize_t len1, len2;
12990 
12991     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12992         return NULL;
12993 
12994     kind1 = PyUnicode_KIND(str_obj);
12995     kind2 = PyUnicode_KIND(sep_obj);
12996     len1 = PyUnicode_GET_LENGTH(str_obj);
12997     len2 = PyUnicode_GET_LENGTH(sep_obj);
12998     if (kind1 < kind2 || len1 < len2) {
12999         _Py_INCREF_UNICODE_EMPTY();
13000         if (!unicode_empty)
13001             out = NULL;
13002         else {
13003             out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13004             Py_DECREF(unicode_empty);
13005         }
13006         return out;
13007     }
13008     buf1 = PyUnicode_DATA(str_obj);
13009     buf2 = PyUnicode_DATA(sep_obj);
13010     if (kind2 != kind1) {
13011         buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13012         if (!buf2)
13013             return NULL;
13014     }
13015 
13016     switch (kind1) {
13017     case PyUnicode_1BYTE_KIND:
13018         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13019             out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13020         else
13021             out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13022         break;
13023     case PyUnicode_2BYTE_KIND:
13024         out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13025         break;
13026     case PyUnicode_4BYTE_KIND:
13027         out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13028         break;
13029     default:
13030         Py_UNREACHABLE();
13031     }
13032 
13033     if (kind2 != kind1)
13034         PyMem_Free(buf2);
13035 
13036     return out;
13037 }
13038 
13039 /*[clinic input]
13040 str.partition as unicode_partition
13041 
13042     sep: object
13043     /
13044 
13045 Partition the string into three parts using the given separator.
13046 
13047 This will search for the separator in the string.  If the separator is found,
13048 returns a 3-tuple containing the part before the separator, the separator
13049 itself, and the part after it.
13050 
13051 If the separator is not found, returns a 3-tuple containing the original string
13052 and two empty strings.
13053 [clinic start generated code]*/
13054 
13055 static PyObject *
unicode_partition(PyObject * self,PyObject * sep)13056 unicode_partition(PyObject *self, PyObject *sep)
13057 /*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
13058 {
13059     return PyUnicode_Partition(self, sep);
13060 }
13061 
13062 /*[clinic input]
13063 str.rpartition as unicode_rpartition = str.partition
13064 
13065 Partition the string into three parts using the given separator.
13066 
13067 This will search for the separator in the string, starting at the end. If
13068 the separator is found, returns a 3-tuple containing the part before the
13069 separator, the separator itself, and the part after it.
13070 
13071 If the separator is not found, returns a 3-tuple containing two empty strings
13072 and the original string.
13073 [clinic start generated code]*/
13074 
13075 static PyObject *
unicode_rpartition(PyObject * self,PyObject * sep)13076 unicode_rpartition(PyObject *self, PyObject *sep)
13077 /*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
13078 {
13079     return PyUnicode_RPartition(self, sep);
13080 }
13081 
13082 PyObject *
PyUnicode_RSplit(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13083 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13084 {
13085     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13086         return NULL;
13087 
13088     return rsplit(s, sep, maxsplit);
13089 }
13090 
13091 /*[clinic input]
13092 str.rsplit as unicode_rsplit = str.split
13093 
13094 Return a list of the words in the string, using sep as the delimiter string.
13095 
13096 Splits are done starting at the end of the string and working to the front.
13097 [clinic start generated code]*/
13098 
13099 static PyObject *
unicode_rsplit_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13100 unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13101 /*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
13102 {
13103     if (sep == Py_None)
13104         return rsplit(self, NULL, maxsplit);
13105     if (PyUnicode_Check(sep))
13106         return rsplit(self, sep, maxsplit);
13107 
13108     PyErr_Format(PyExc_TypeError,
13109                  "must be str or None, not %.100s",
13110                  Py_TYPE(sep)->tp_name);
13111     return NULL;
13112 }
13113 
13114 /*[clinic input]
13115 str.splitlines as unicode_splitlines
13116 
13117     keepends: bool(accept={int}) = False
13118 
13119 Return a list of the lines in the string, breaking at line boundaries.
13120 
13121 Line breaks are not included in the resulting list unless keepends is given and
13122 true.
13123 [clinic start generated code]*/
13124 
13125 static PyObject *
unicode_splitlines_impl(PyObject * self,int keepends)13126 unicode_splitlines_impl(PyObject *self, int keepends)
13127 /*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
13128 {
13129     return PyUnicode_Splitlines(self, keepends);
13130 }
13131 
13132 static
unicode_str(PyObject * self)13133 PyObject *unicode_str(PyObject *self)
13134 {
13135     return unicode_result_unchanged(self);
13136 }
13137 
13138 /*[clinic input]
13139 str.swapcase as unicode_swapcase
13140 
13141 Convert uppercase characters to lowercase and lowercase characters to uppercase.
13142 [clinic start generated code]*/
13143 
13144 static PyObject *
unicode_swapcase_impl(PyObject * self)13145 unicode_swapcase_impl(PyObject *self)
13146 /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
13147 {
13148     if (PyUnicode_READY(self) == -1)
13149         return NULL;
13150     return case_operation(self, do_swapcase);
13151 }
13152 
13153 /*[clinic input]
13154 
13155 @staticmethod
13156 str.maketrans as unicode_maketrans
13157 
13158   x: object
13159 
13160   y: unicode=NULL
13161 
13162   z: unicode=NULL
13163 
13164   /
13165 
13166 Return a translation table usable for str.translate().
13167 
13168 If there is only one argument, it must be a dictionary mapping Unicode
13169 ordinals (integers) or characters to Unicode ordinals, strings or None.
13170 Character keys will be then converted to ordinals.
13171 If there are two arguments, they must be strings of equal length, and
13172 in the resulting dictionary, each character in x will be mapped to the
13173 character at the same position in y. If there is a third argument, it
13174 must be a string, whose characters will be mapped to None in the result.
13175 [clinic start generated code]*/
13176 
13177 static PyObject *
unicode_maketrans_impl(PyObject * x,PyObject * y,PyObject * z)13178 unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
13179 /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
13180 {
13181     PyObject *new = NULL, *key, *value;
13182     Py_ssize_t i = 0;
13183     int res;
13184 
13185     new = PyDict_New();
13186     if (!new)
13187         return NULL;
13188     if (y != NULL) {
13189         int x_kind, y_kind, z_kind;
13190         void *x_data, *y_data, *z_data;
13191 
13192         /* x must be a string too, of equal length */
13193         if (!PyUnicode_Check(x)) {
13194             PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13195                             "be a string if there is a second argument");
13196             goto err;
13197         }
13198         if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13199             PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13200                             "arguments must have equal length");
13201             goto err;
13202         }
13203         /* create entries for translating chars in x to those in y */
13204         x_kind = PyUnicode_KIND(x);
13205         y_kind = PyUnicode_KIND(y);
13206         x_data = PyUnicode_DATA(x);
13207         y_data = PyUnicode_DATA(y);
13208         for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13209             key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13210             if (!key)
13211                 goto err;
13212             value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13213             if (!value) {
13214                 Py_DECREF(key);
13215                 goto err;
13216             }
13217             res = PyDict_SetItem(new, key, value);
13218             Py_DECREF(key);
13219             Py_DECREF(value);
13220             if (res < 0)
13221                 goto err;
13222         }
13223         /* create entries for deleting chars in z */
13224         if (z != NULL) {
13225             z_kind = PyUnicode_KIND(z);
13226             z_data = PyUnicode_DATA(z);
13227             for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13228                 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13229                 if (!key)
13230                     goto err;
13231                 res = PyDict_SetItem(new, key, Py_None);
13232                 Py_DECREF(key);
13233                 if (res < 0)
13234                     goto err;
13235             }
13236         }
13237     } else {
13238         int kind;
13239         void *data;
13240 
13241         /* x must be a dict */
13242         if (!PyDict_CheckExact(x)) {
13243             PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13244                             "to maketrans it must be a dict");
13245             goto err;
13246         }
13247         /* copy entries into the new dict, converting string keys to int keys */
13248         while (PyDict_Next(x, &i, &key, &value)) {
13249             if (PyUnicode_Check(key)) {
13250                 /* convert string keys to integer keys */
13251                 PyObject *newkey;
13252                 if (PyUnicode_GET_LENGTH(key) != 1) {
13253                     PyErr_SetString(PyExc_ValueError, "string keys in translate "
13254                                     "table must be of length 1");
13255                     goto err;
13256                 }
13257                 kind = PyUnicode_KIND(key);
13258                 data = PyUnicode_DATA(key);
13259                 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13260                 if (!newkey)
13261                     goto err;
13262                 res = PyDict_SetItem(new, newkey, value);
13263                 Py_DECREF(newkey);
13264                 if (res < 0)
13265                     goto err;
13266             } else if (PyLong_Check(key)) {
13267                 /* just keep integer keys */
13268                 if (PyDict_SetItem(new, key, value) < 0)
13269                     goto err;
13270             } else {
13271                 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13272                                 "be strings or integers");
13273                 goto err;
13274             }
13275         }
13276     }
13277     return new;
13278   err:
13279     Py_DECREF(new);
13280     return NULL;
13281 }
13282 
13283 /*[clinic input]
13284 str.translate as unicode_translate
13285 
13286     table: object
13287         Translation table, which must be a mapping of Unicode ordinals to
13288         Unicode ordinals, strings, or None.
13289     /
13290 
13291 Replace each character in the string using the given translation table.
13292 
13293 The table must implement lookup/indexing via __getitem__, for instance a
13294 dictionary or list.  If this operation raises LookupError, the character is
13295 left untouched.  Characters mapped to None are deleted.
13296 [clinic start generated code]*/
13297 
13298 static PyObject *
unicode_translate(PyObject * self,PyObject * table)13299 unicode_translate(PyObject *self, PyObject *table)
13300 /*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
13301 {
13302     return _PyUnicode_TranslateCharmap(self, table, "ignore");
13303 }
13304 
13305 /*[clinic input]
13306 str.upper as unicode_upper
13307 
13308 Return a copy of the string converted to uppercase.
13309 [clinic start generated code]*/
13310 
13311 static PyObject *
unicode_upper_impl(PyObject * self)13312 unicode_upper_impl(PyObject *self)
13313 /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
13314 {
13315     if (PyUnicode_READY(self) == -1)
13316         return NULL;
13317     if (PyUnicode_IS_ASCII(self))
13318         return ascii_upper_or_lower(self, 0);
13319     return case_operation(self, do_upper);
13320 }
13321 
13322 /*[clinic input]
13323 str.zfill as unicode_zfill
13324 
13325     width: Py_ssize_t
13326     /
13327 
13328 Pad a numeric string with zeros on the left, to fill a field of the given width.
13329 
13330 The string is never truncated.
13331 [clinic start generated code]*/
13332 
13333 static PyObject *
unicode_zfill_impl(PyObject * self,Py_ssize_t width)13334 unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13335 /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
13336 {
13337     Py_ssize_t fill;
13338     PyObject *u;
13339     int kind;
13340     void *data;
13341     Py_UCS4 chr;
13342 
13343     if (PyUnicode_READY(self) == -1)
13344         return NULL;
13345 
13346     if (PyUnicode_GET_LENGTH(self) >= width)
13347         return unicode_result_unchanged(self);
13348 
13349     fill = width - PyUnicode_GET_LENGTH(self);
13350 
13351     u = pad(self, fill, 0, '0');
13352 
13353     if (u == NULL)
13354         return NULL;
13355 
13356     kind = PyUnicode_KIND(u);
13357     data = PyUnicode_DATA(u);
13358     chr = PyUnicode_READ(kind, data, fill);
13359 
13360     if (chr == '+' || chr == '-') {
13361         /* move sign to beginning of string */
13362         PyUnicode_WRITE(kind, data, 0, chr);
13363         PyUnicode_WRITE(kind, data, fill, '0');
13364     }
13365 
13366     assert(_PyUnicode_CheckConsistency(u, 1));
13367     return u;
13368 }
13369 
/* Compiled out.  This wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII() pairs with the equally
   disabled "_decimal2ascii" entry in unicode_methods, which is marked
   there as a debugging-only method. */
#if 0
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
13377 
/* Docstring attached to the "startswith" entry of unicode_methods. */
PyDoc_STRVAR(startswith__doc__,
             "S.startswith(prefix[, start[, end]]) -> bool\n\
\n\
Return True if S starts with the specified prefix, False otherwise.\n\
With optional start, test S beginning at that position.\n\
With optional end, stop comparing S at that position.\n\
prefix can also be a tuple of strings to try.");
13385 
13386 static PyObject *
unicode_startswith(PyObject * self,PyObject * args)13387 unicode_startswith(PyObject *self,
13388                    PyObject *args)
13389 {
13390     PyObject *subobj;
13391     PyObject *substring;
13392     Py_ssize_t start = 0;
13393     Py_ssize_t end = PY_SSIZE_T_MAX;
13394     int result;
13395 
13396     if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13397         return NULL;
13398     if (PyTuple_Check(subobj)) {
13399         Py_ssize_t i;
13400         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13401             substring = PyTuple_GET_ITEM(subobj, i);
13402             if (!PyUnicode_Check(substring)) {
13403                 PyErr_Format(PyExc_TypeError,
13404                              "tuple for startswith must only contain str, "
13405                              "not %.100s",
13406                              Py_TYPE(substring)->tp_name);
13407                 return NULL;
13408             }
13409             result = tailmatch(self, substring, start, end, -1);
13410             if (result == -1)
13411                 return NULL;
13412             if (result) {
13413                 Py_RETURN_TRUE;
13414             }
13415         }
13416         /* nothing matched */
13417         Py_RETURN_FALSE;
13418     }
13419     if (!PyUnicode_Check(subobj)) {
13420         PyErr_Format(PyExc_TypeError,
13421                      "startswith first arg must be str or "
13422                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13423         return NULL;
13424     }
13425     result = tailmatch(self, subobj, start, end, -1);
13426     if (result == -1)
13427         return NULL;
13428     return PyBool_FromLong(result);
13429 }
13430 
13431 
/* Docstring attached to the "endswith" entry of unicode_methods. */
PyDoc_STRVAR(endswith__doc__,
             "S.endswith(suffix[, start[, end]]) -> bool\n\
\n\
Return True if S ends with the specified suffix, False otherwise.\n\
With optional start, test S beginning at that position.\n\
With optional end, stop comparing S at that position.\n\
suffix can also be a tuple of strings to try.");
13439 
13440 static PyObject *
unicode_endswith(PyObject * self,PyObject * args)13441 unicode_endswith(PyObject *self,
13442                  PyObject *args)
13443 {
13444     PyObject *subobj;
13445     PyObject *substring;
13446     Py_ssize_t start = 0;
13447     Py_ssize_t end = PY_SSIZE_T_MAX;
13448     int result;
13449 
13450     if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13451         return NULL;
13452     if (PyTuple_Check(subobj)) {
13453         Py_ssize_t i;
13454         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13455             substring = PyTuple_GET_ITEM(subobj, i);
13456             if (!PyUnicode_Check(substring)) {
13457                 PyErr_Format(PyExc_TypeError,
13458                              "tuple for endswith must only contain str, "
13459                              "not %.100s",
13460                              Py_TYPE(substring)->tp_name);
13461                 return NULL;
13462             }
13463             result = tailmatch(self, substring, start, end, +1);
13464             if (result == -1)
13465                 return NULL;
13466             if (result) {
13467                 Py_RETURN_TRUE;
13468             }
13469         }
13470         Py_RETURN_FALSE;
13471     }
13472     if (!PyUnicode_Check(subobj)) {
13473         PyErr_Format(PyExc_TypeError,
13474                      "endswith first arg must be str or "
13475                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13476         return NULL;
13477     }
13478     result = tailmatch(self, subobj, start, end, +1);
13479     if (result == -1)
13480         return NULL;
13481     return PyBool_FromLong(result);
13482 }
13483 
13484 static inline void
_PyUnicodeWriter_Update(_PyUnicodeWriter * writer)13485 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13486 {
13487     writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13488     writer->data = PyUnicode_DATA(writer->buffer);
13489 
13490     if (!writer->readonly) {
13491         writer->kind = PyUnicode_KIND(writer->buffer);
13492         writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13493     }
13494     else {
13495         /* use a value smaller than PyUnicode_1BYTE_KIND() so
13496            _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13497         writer->kind = PyUnicode_WCHAR_KIND;
13498         assert(writer->kind <= PyUnicode_1BYTE_KIND);
13499 
13500         /* Copy-on-write mode: set buffer size to 0 so
13501          * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13502          * next write. */
13503         writer->size = 0;
13504     }
13505 }
13506 
13507 void
_PyUnicodeWriter_Init(_PyUnicodeWriter * writer)13508 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13509 {
13510     memset(writer, 0, sizeof(*writer));
13511 
13512     /* ASCII is the bare minimum */
13513     writer->min_char = 127;
13514 
13515     /* use a value smaller than PyUnicode_1BYTE_KIND() so
13516        _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13517     writer->kind = PyUnicode_WCHAR_KIND;
13518     assert(writer->kind <= PyUnicode_1BYTE_KIND);
13519 }
13520 
13521 int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter * writer,Py_ssize_t length,Py_UCS4 maxchar)13522 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13523                                  Py_ssize_t length, Py_UCS4 maxchar)
13524 {
13525     Py_ssize_t newlen;
13526     PyObject *newbuffer;
13527 
13528     assert(maxchar <= MAX_UNICODE);
13529 
13530     /* ensure that the _PyUnicodeWriter_Prepare macro was used */
13531     assert((maxchar > writer->maxchar && length >= 0)
13532            || length > 0);
13533 
13534     if (length > PY_SSIZE_T_MAX - writer->pos) {
13535         PyErr_NoMemory();
13536         return -1;
13537     }
13538     newlen = writer->pos + length;
13539 
13540     maxchar = Py_MAX(maxchar, writer->min_char);
13541 
13542     if (writer->buffer == NULL) {
13543         assert(!writer->readonly);
13544         if (writer->overallocate
13545             && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13546             /* overallocate to limit the number of realloc() */
13547             newlen += newlen / OVERALLOCATE_FACTOR;
13548         }
13549         if (newlen < writer->min_length)
13550             newlen = writer->min_length;
13551 
13552         writer->buffer = PyUnicode_New(newlen, maxchar);
13553         if (writer->buffer == NULL)
13554             return -1;
13555     }
13556     else if (newlen > writer->size) {
13557         if (writer->overallocate
13558             && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13559             /* overallocate to limit the number of realloc() */
13560             newlen += newlen / OVERALLOCATE_FACTOR;
13561         }
13562         if (newlen < writer->min_length)
13563             newlen = writer->min_length;
13564 
13565         if (maxchar > writer->maxchar || writer->readonly) {
13566             /* resize + widen */
13567             maxchar = Py_MAX(maxchar, writer->maxchar);
13568             newbuffer = PyUnicode_New(newlen, maxchar);
13569             if (newbuffer == NULL)
13570                 return -1;
13571             _PyUnicode_FastCopyCharacters(newbuffer, 0,
13572                                           writer->buffer, 0, writer->pos);
13573             Py_DECREF(writer->buffer);
13574             writer->readonly = 0;
13575         }
13576         else {
13577             newbuffer = resize_compact(writer->buffer, newlen);
13578             if (newbuffer == NULL)
13579                 return -1;
13580         }
13581         writer->buffer = newbuffer;
13582     }
13583     else if (maxchar > writer->maxchar) {
13584         assert(!writer->readonly);
13585         newbuffer = PyUnicode_New(writer->size, maxchar);
13586         if (newbuffer == NULL)
13587             return -1;
13588         _PyUnicode_FastCopyCharacters(newbuffer, 0,
13589                                       writer->buffer, 0, writer->pos);
13590         Py_SETREF(writer->buffer, newbuffer);
13591     }
13592     _PyUnicodeWriter_Update(writer);
13593     return 0;
13594 
13595 #undef OVERALLOCATE_FACTOR
13596 }
13597 
13598 int
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter * writer,enum PyUnicode_Kind kind)13599 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13600                                      enum PyUnicode_Kind kind)
13601 {
13602     Py_UCS4 maxchar;
13603 
13604     /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13605     assert(writer->kind < kind);
13606 
13607     switch (kind)
13608     {
13609     case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13610     case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13611     case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13612     default:
13613         Py_UNREACHABLE();
13614     }
13615 
13616     return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13617 }
13618 
13619 static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter * writer,Py_UCS4 ch)13620 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13621 {
13622     assert(ch <= MAX_UNICODE);
13623     if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13624         return -1;
13625     PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13626     writer->pos++;
13627     return 0;
13628 }
13629 
13630 int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter * writer,Py_UCS4 ch)13631 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13632 {
13633     return _PyUnicodeWriter_WriteCharInline(writer, ch);
13634 }
13635 
13636 int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter * writer,PyObject * str)13637 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13638 {
13639     Py_UCS4 maxchar;
13640     Py_ssize_t len;
13641 
13642     if (PyUnicode_READY(str) == -1)
13643         return -1;
13644     len = PyUnicode_GET_LENGTH(str);
13645     if (len == 0)
13646         return 0;
13647     maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13648     if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13649         if (writer->buffer == NULL && !writer->overallocate) {
13650             assert(_PyUnicode_CheckConsistency(str, 1));
13651             writer->readonly = 1;
13652             Py_INCREF(str);
13653             writer->buffer = str;
13654             _PyUnicodeWriter_Update(writer);
13655             writer->pos += len;
13656             return 0;
13657         }
13658         if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13659             return -1;
13660     }
13661     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13662                                   str, 0, len);
13663     writer->pos += len;
13664     return 0;
13665 }
13666 
13667 int
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t start,Py_ssize_t end)13668 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13669                                 Py_ssize_t start, Py_ssize_t end)
13670 {
13671     Py_UCS4 maxchar;
13672     Py_ssize_t len;
13673 
13674     if (PyUnicode_READY(str) == -1)
13675         return -1;
13676 
13677     assert(0 <= start);
13678     assert(end <= PyUnicode_GET_LENGTH(str));
13679     assert(start <= end);
13680 
13681     if (end == 0)
13682         return 0;
13683 
13684     if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13685         return _PyUnicodeWriter_WriteStr(writer, str);
13686 
13687     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13688         maxchar = _PyUnicode_FindMaxChar(str, start, end);
13689     else
13690         maxchar = writer->maxchar;
13691     len = end - start;
13692 
13693     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13694         return -1;
13695 
13696     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13697                                   str, start, len);
13698     writer->pos += len;
13699     return 0;
13700 }
13701 
13702 int
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter * writer,const char * ascii,Py_ssize_t len)13703 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13704                                   const char *ascii, Py_ssize_t len)
13705 {
13706     if (len == -1)
13707         len = strlen(ascii);
13708 
13709     assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13710 
13711     if (writer->buffer == NULL && !writer->overallocate) {
13712         PyObject *str;
13713 
13714         str = _PyUnicode_FromASCII(ascii, len);
13715         if (str == NULL)
13716             return -1;
13717 
13718         writer->readonly = 1;
13719         writer->buffer = str;
13720         _PyUnicodeWriter_Update(writer);
13721         writer->pos += len;
13722         return 0;
13723     }
13724 
13725     if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13726         return -1;
13727 
13728     switch (writer->kind)
13729     {
13730     case PyUnicode_1BYTE_KIND:
13731     {
13732         const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13733         Py_UCS1 *data = writer->data;
13734 
13735         memcpy(data + writer->pos, str, len);
13736         break;
13737     }
13738     case PyUnicode_2BYTE_KIND:
13739     {
13740         _PyUnicode_CONVERT_BYTES(
13741             Py_UCS1, Py_UCS2,
13742             ascii, ascii + len,
13743             (Py_UCS2 *)writer->data + writer->pos);
13744         break;
13745     }
13746     case PyUnicode_4BYTE_KIND:
13747     {
13748         _PyUnicode_CONVERT_BYTES(
13749             Py_UCS1, Py_UCS4,
13750             ascii, ascii + len,
13751             (Py_UCS4 *)writer->data + writer->pos);
13752         break;
13753     }
13754     default:
13755         Py_UNREACHABLE();
13756     }
13757 
13758     writer->pos += len;
13759     return 0;
13760 }
13761 
13762 int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter * writer,const char * str,Py_ssize_t len)13763 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13764                                    const char *str, Py_ssize_t len)
13765 {
13766     Py_UCS4 maxchar;
13767 
13768     maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13769     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13770         return -1;
13771     unicode_write_cstr(writer->buffer, writer->pos, str, len);
13772     writer->pos += len;
13773     return 0;
13774 }
13775 
13776 PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter * writer)13777 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
13778 {
13779     PyObject *str;
13780 
13781     if (writer->pos == 0) {
13782         Py_CLEAR(writer->buffer);
13783         _Py_RETURN_UNICODE_EMPTY();
13784     }
13785 
13786     str = writer->buffer;
13787     writer->buffer = NULL;
13788 
13789     if (writer->readonly) {
13790         assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13791         return str;
13792     }
13793 
13794     if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13795         PyObject *str2;
13796         str2 = resize_compact(str, writer->pos);
13797         if (str2 == NULL) {
13798             Py_DECREF(str);
13799             return NULL;
13800         }
13801         str = str2;
13802     }
13803 
13804     assert(_PyUnicode_CheckConsistency(str, 1));
13805     return unicode_result_ready(str);
13806 }
13807 
13808 void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter * writer)13809 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
13810 {
13811     Py_CLEAR(writer->buffer);
13812 }
13813 
13814 #include "stringlib/unicode_format.h"
13815 
/* Docstrings attached to the "format" and "format_map" entries of
   unicode_methods. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
13827 
13828 /*[clinic input]
13829 str.__format__ as unicode___format__
13830 
13831     format_spec: unicode
13832     /
13833 
13834 Return a formatted version of the string as described by format_spec.
13835 [clinic start generated code]*/
13836 
13837 static PyObject *
unicode___format___impl(PyObject * self,PyObject * format_spec)13838 unicode___format___impl(PyObject *self, PyObject *format_spec)
13839 /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
13840 {
13841     _PyUnicodeWriter writer;
13842     int ret;
13843 
13844     if (PyUnicode_READY(self) == -1)
13845         return NULL;
13846     _PyUnicodeWriter_Init(&writer);
13847     ret = _PyUnicode_FormatAdvancedWriter(&writer,
13848                                           self, format_spec, 0,
13849                                           PyUnicode_GET_LENGTH(format_spec));
13850     if (ret == -1) {
13851         _PyUnicodeWriter_Dealloc(&writer);
13852         return NULL;
13853     }
13854     return _PyUnicodeWriter_Finish(&writer);
13855 }
13856 
13857 /*[clinic input]
13858 str.__sizeof__ as unicode_sizeof
13859 
13860 Return the size of the string in memory, in bytes.
13861 [clinic start generated code]*/
13862 
13863 static PyObject *
unicode_sizeof_impl(PyObject * self)13864 unicode_sizeof_impl(PyObject *self)
13865 /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
13866 {
13867     Py_ssize_t size;
13868 
13869     /* If it's a compact object, account for base structure +
13870        character data. */
13871     if (PyUnicode_IS_COMPACT_ASCII(self))
13872         size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13873     else if (PyUnicode_IS_COMPACT(self))
13874         size = sizeof(PyCompactUnicodeObject) +
13875             (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
13876     else {
13877         /* If it is a two-block object, account for base object, and
13878            for character block if present. */
13879         size = sizeof(PyUnicodeObject);
13880         if (_PyUnicode_DATA_ANY(self))
13881             size += (PyUnicode_GET_LENGTH(self) + 1) *
13882                 PyUnicode_KIND(self);
13883     }
13884     /* If the wstr pointer is present, account for it unless it is shared
13885        with the data pointer. Check if the data is not shared. */
13886     if (_PyUnicode_HAS_WSTR_MEMORY(self))
13887         size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13888     if (_PyUnicode_HAS_UTF8_MEMORY(self))
13889         size += PyUnicode_UTF8_LENGTH(self) + 1;
13890 
13891     return PyLong_FromSsize_t(size);
13892 }
13893 
13894 static PyObject *
unicode_getnewargs(PyObject * v)13895 unicode_getnewargs(PyObject *v)
13896 {
13897     PyObject *copy = _PyUnicode_Copy(v);
13898     if (!copy)
13899         return NULL;
13900     return Py_BuildValue("(N)", copy);
13901 }
13902 
/* Method table of the str type.
 *
 * The UNICODE_*_METHODDEF entries are macros defined elsewhere
 * (presumably by the Argument Clinic generated header -- confirm); the
 * explicit {name, function, flags, doc} rows are the methods implemented
 * in this file by hand.  The table ends with the {NULL, NULL} sentinel.
 */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* format takes keyword arguments; format_map takes one object */
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* no docstring is supplied for __getnewargs__ */
    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
13959 
13960 static PyObject *
unicode_mod(PyObject * v,PyObject * w)13961 unicode_mod(PyObject *v, PyObject *w)
13962 {
13963     if (!PyUnicode_Check(v))
13964         Py_RETURN_NOTIMPLEMENTED;
13965     return PyUnicode_Format(v, w);
13966 }
13967 
/* Number protocol slots.  Only nb_remainder is filled in (with
   unicode_mod); the initializer is positional, so the leading zeros are
   placeholders that keep unicode_mod in the fourth slot, and all slots
   after it are left zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
13974 
/* Sequence protocol slots.  The initializer is positional, so the zero
   entries are placeholders that keep the later functions in the right
   slots; the casts adapt the static helpers to the slot function types. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
13985 
13986 static PyObject*
unicode_subscript(PyObject * self,PyObject * item)13987 unicode_subscript(PyObject* self, PyObject* item)
13988 {
13989     if (PyUnicode_READY(self) == -1)
13990         return NULL;
13991 
13992     if (PyIndex_Check(item)) {
13993         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13994         if (i == -1 && PyErr_Occurred())
13995             return NULL;
13996         if (i < 0)
13997             i += PyUnicode_GET_LENGTH(self);
13998         return unicode_getitem(self, i);
13999     } else if (PySlice_Check(item)) {
14000         Py_ssize_t start, stop, step, slicelength, i;
14001         size_t cur;
14002         PyObject *result;
14003         void *src_data, *dest_data;
14004         int src_kind, dest_kind;
14005         Py_UCS4 ch, max_char, kind_limit;
14006 
14007         if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
14008             return NULL;
14009         }
14010         slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14011                                             &start, &stop, step);
14012 
14013         if (slicelength <= 0) {
14014             _Py_RETURN_UNICODE_EMPTY();
14015         } else if (start == 0 && step == 1 &&
14016                    slicelength == PyUnicode_GET_LENGTH(self)) {
14017             return unicode_result_unchanged(self);
14018         } else if (step == 1) {
14019             return PyUnicode_Substring(self,
14020                                        start, start + slicelength);
14021         }
14022         /* General case */
14023         src_kind = PyUnicode_KIND(self);
14024         src_data = PyUnicode_DATA(self);
14025         if (!PyUnicode_IS_ASCII(self)) {
14026             kind_limit = kind_maxchar_limit(src_kind);
14027             max_char = 0;
14028             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14029                 ch = PyUnicode_READ(src_kind, src_data, cur);
14030                 if (ch > max_char) {
14031                     max_char = ch;
14032                     if (max_char >= kind_limit)
14033                         break;
14034                 }
14035             }
14036         }
14037         else
14038             max_char = 127;
14039         result = PyUnicode_New(slicelength, max_char);
14040         if (result == NULL)
14041             return NULL;
14042         dest_kind = PyUnicode_KIND(result);
14043         dest_data = PyUnicode_DATA(result);
14044 
14045         for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14046             Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14047             PyUnicode_WRITE(dest_kind, dest_data, i, ch);
14048         }
14049         assert(_PyUnicode_CheckConsistency(result, 1));
14050         return result;
14051     } else {
14052         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14053         return NULL;
14054     }
14055 }
14056 
14057 static PyMappingMethods unicode_as_mapping = {
14058     (lenfunc)unicode_length,        /* mp_length */
14059     (binaryfunc)unicode_subscript,  /* mp_subscript */
14060     (objobjargproc)0,           /* mp_ass_subscript */
14061 };
14062 
14063 
14064 /* Helpers for PyUnicode_Format() */
14065 
14066 struct unicode_formatter_t {
14067     PyObject *args;
14068     int args_owned;
14069     Py_ssize_t arglen, argidx;
14070     PyObject *dict;
14071 
14072     enum PyUnicode_Kind fmtkind;
14073     Py_ssize_t fmtcnt, fmtpos;
14074     void *fmtdata;
14075     PyObject *fmtstr;
14076 
14077     _PyUnicodeWriter writer;
14078 };
14079 
/* Parsed state of a single "%..." conversion specifier.
   unicode_format_arg() resets flags/width/prec/sign before each
   specifier is parsed. */
struct unicode_format_arg_t {
    /* Character currently being examined; once parsing has finished this
       is the conversion character (e.g. 's', 'd', 'x'). */
    Py_UCS4 ch;
    /* Bitmask of F_LJUST, F_SIGN, F_BLANK, F_ALT and F_ZERO. */
    int flags;
    /* Minimum field width, or -1 if none was given. */
    Py_ssize_t width;
    /* Precision, or -1 if none was given. */
    int prec;
    /* Set to 1 by the integer and float conversions; the output step only
       applies sign handling and '0' filling when this is non-zero. */
    int sign;
};
14087 
14088 static PyObject *
unicode_format_getnextarg(struct unicode_formatter_t * ctx)14089 unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14090 {
14091     Py_ssize_t argidx = ctx->argidx;
14092 
14093     if (argidx < ctx->arglen) {
14094         ctx->argidx++;
14095         if (ctx->arglen < 0)
14096             return ctx->args;
14097         else
14098             return PyTuple_GetItem(ctx->args, argidx);
14099     }
14100     PyErr_SetString(PyExc_TypeError,
14101                     "not enough arguments for format string");
14102     return NULL;
14103 }
14104 
14105 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
14106 
14107 /* Format a float into the writer if the writer is not NULL, or into *p_output
14108    otherwise.
14109 
14110    Return 0 on success, raise an exception and return -1 on error. */
14111 static int
formatfloat(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14112 formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14113             PyObject **p_output,
14114             _PyUnicodeWriter *writer)
14115 {
14116     char *p;
14117     double x;
14118     Py_ssize_t len;
14119     int prec;
14120     int dtoa_flags;
14121 
14122     x = PyFloat_AsDouble(v);
14123     if (x == -1.0 && PyErr_Occurred())
14124         return -1;
14125 
14126     prec = arg->prec;
14127     if (prec < 0)
14128         prec = 6;
14129 
14130     if (arg->flags & F_ALT)
14131         dtoa_flags = Py_DTSF_ALT;
14132     else
14133         dtoa_flags = 0;
14134     p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14135     if (p == NULL)
14136         return -1;
14137     len = strlen(p);
14138     if (writer) {
14139         if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14140             PyMem_Free(p);
14141             return -1;
14142         }
14143     }
14144     else
14145         *p_output = _PyUnicode_FromASCII(p, len);
14146     PyMem_Free(p);
14147     return 0;
14148 }
14149 
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the alternate form, for Python ints.
 * Return value:  a new PyUnicodeObject*, or NULL if error.
 *     The output string is of the form
 *         "-"? ("0o" | "0x" | "0X")? digit+
 *     The base marker is present only for o, x and X conversions when
 *         alt is non-zero.  The case of hex digits will be correct.
 *     There will be at least prec digits, zero-filled on the left if
 *         necessary to get that many.
 * val          object to be converted; must be an int (asserted)
 * alt          non-zero to keep the base marker (alternate form)
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  o, x and X conversions can produce a '-' sign.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* Keep "numnondigits + prec" (int arithmetic below) from overflowing:
       numnondigits is at most 3 (sign plus a two-character base marker). */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    switch (type) {
    default:
        Py_UNREACHABLE();
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        /* Account for the two-character base marker in the result. */
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        /* Account for the two-character base marker in the result. */
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    /* The bookkeeping below uses int, so reject longer strings. */
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless the alternate form was requested */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        /* Skip two characters; for a negative number the sign is rewritten
           over what is now the first character (part of the old marker). */
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to meet the minimum number of digits. */
    if (prec > numdigits) {
        /* A bytes object serves as a temporary buffer of the final size;
           it replaces `result` and is converted to str further down. */
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        /* Copy sign/base marker, then the zero padding, then the digits. */
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_DECREF(result);
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* If the text now lives in the temporary bytes buffer, or no longer
       starts at the beginning of the str's data, build a fresh str from
       it; otherwise only the length may need adjusting. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14295 
14296 /* Format an integer or a float as an integer.
14297  * Return 1 if the number has been formatted into the writer,
14298  *        0 if the number has been formatted into *p_output
14299  *       -1 and raise an exception on error */
14300 static int
mainformatlong(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14301 mainformatlong(PyObject *v,
14302                struct unicode_format_arg_t *arg,
14303                PyObject **p_output,
14304                _PyUnicodeWriter *writer)
14305 {
14306     PyObject *iobj, *res;
14307     char type = (char)arg->ch;
14308 
14309     if (!PyNumber_Check(v))
14310         goto wrongtype;
14311 
14312     /* make sure number is a type of integer for o, x, and X */
14313     if (!PyLong_Check(v)) {
14314         if (type == 'o' || type == 'x' || type == 'X') {
14315             iobj = PyNumber_Index(v);
14316             if (iobj == NULL) {
14317                 if (PyErr_ExceptionMatches(PyExc_TypeError))
14318                     goto wrongtype;
14319                 return -1;
14320             }
14321         }
14322         else {
14323             iobj = PyNumber_Long(v);
14324             if (iobj == NULL ) {
14325                 if (PyErr_ExceptionMatches(PyExc_TypeError))
14326                     goto wrongtype;
14327                 return -1;
14328             }
14329         }
14330         assert(PyLong_Check(iobj));
14331     }
14332     else {
14333         iobj = v;
14334         Py_INCREF(iobj);
14335     }
14336 
14337     if (PyLong_CheckExact(v)
14338         && arg->width == -1 && arg->prec == -1
14339         && !(arg->flags & (F_SIGN | F_BLANK))
14340         && type != 'X')
14341     {
14342         /* Fast path */
14343         int alternate = arg->flags & F_ALT;
14344         int base;
14345 
14346         switch(type)
14347         {
14348             default:
14349                 Py_UNREACHABLE();
14350             case 'd':
14351             case 'i':
14352             case 'u':
14353                 base = 10;
14354                 break;
14355             case 'o':
14356                 base = 8;
14357                 break;
14358             case 'x':
14359             case 'X':
14360                 base = 16;
14361                 break;
14362         }
14363 
14364         if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14365             Py_DECREF(iobj);
14366             return -1;
14367         }
14368         Py_DECREF(iobj);
14369         return 1;
14370     }
14371 
14372     res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14373     Py_DECREF(iobj);
14374     if (res == NULL)
14375         return -1;
14376     *p_output = res;
14377     return 0;
14378 
14379 wrongtype:
14380     switch(type)
14381     {
14382         case 'o':
14383         case 'x':
14384         case 'X':
14385             PyErr_Format(PyExc_TypeError,
14386                     "%%%c format: an integer is required, "
14387                     "not %.200s",
14388                     type, Py_TYPE(v)->tp_name);
14389             break;
14390         default:
14391             PyErr_Format(PyExc_TypeError,
14392                     "%%%c format: a number is required, "
14393                     "not %.200s",
14394                     type, Py_TYPE(v)->tp_name);
14395             break;
14396     }
14397     return -1;
14398 }
14399 
14400 static Py_UCS4
formatchar(PyObject * v)14401 formatchar(PyObject *v)
14402 {
14403     /* presume that the buffer is at least 3 characters long */
14404     if (PyUnicode_Check(v)) {
14405         if (PyUnicode_GET_LENGTH(v) == 1) {
14406             return PyUnicode_READ_CHAR(v, 0);
14407         }
14408         goto onError;
14409     }
14410     else {
14411         PyObject *iobj;
14412         long x;
14413         /* make sure number is a type of integer */
14414         if (!PyLong_Check(v)) {
14415             iobj = PyNumber_Index(v);
14416             if (iobj == NULL) {
14417                 goto onError;
14418             }
14419             x = PyLong_AsLong(iobj);
14420             Py_DECREF(iobj);
14421         }
14422         else {
14423             x = PyLong_AsLong(v);
14424         }
14425         if (x == -1 && PyErr_Occurred())
14426             goto onError;
14427 
14428         if (x < 0 || x > MAX_UNICODE) {
14429             PyErr_SetString(PyExc_OverflowError,
14430                             "%c arg not in range(0x110000)");
14431             return (Py_UCS4) -1;
14432         }
14433 
14434         return (Py_UCS4) x;
14435     }
14436 
14437   onError:
14438     PyErr_SetString(PyExc_TypeError,
14439                     "%c requires int or char");
14440     return (Py_UCS4) -1;
14441 }
14442 
14443 /* Parse options of an argument: flags, width, precision.
14444    Handle also "%(name)" syntax.
14445 
14446    Return 0 if the argument has been formatted into arg->str.
14447    Return 1 if the argument has been written into ctx->writer,
14448    Raise an exception and return -1 on error. */
14449 static int
unicode_format_arg_parse(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg)14450 unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14451                          struct unicode_format_arg_t *arg)
14452 {
14453 #define FORMAT_READ(ctx) \
14454         PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14455 
14456     PyObject *v;
14457 
14458     if (arg->ch == '(') {
14459         /* Get argument value from a dictionary. Example: "%(name)s". */
14460         Py_ssize_t keystart;
14461         Py_ssize_t keylen;
14462         PyObject *key;
14463         int pcount = 1;
14464 
14465         if (ctx->dict == NULL) {
14466             PyErr_SetString(PyExc_TypeError,
14467                             "format requires a mapping");
14468             return -1;
14469         }
14470         ++ctx->fmtpos;
14471         --ctx->fmtcnt;
14472         keystart = ctx->fmtpos;
14473         /* Skip over balanced parentheses */
14474         while (pcount > 0 && --ctx->fmtcnt >= 0) {
14475             arg->ch = FORMAT_READ(ctx);
14476             if (arg->ch == ')')
14477                 --pcount;
14478             else if (arg->ch == '(')
14479                 ++pcount;
14480             ctx->fmtpos++;
14481         }
14482         keylen = ctx->fmtpos - keystart - 1;
14483         if (ctx->fmtcnt < 0 || pcount > 0) {
14484             PyErr_SetString(PyExc_ValueError,
14485                             "incomplete format key");
14486             return -1;
14487         }
14488         key = PyUnicode_Substring(ctx->fmtstr,
14489                                   keystart, keystart + keylen);
14490         if (key == NULL)
14491             return -1;
14492         if (ctx->args_owned) {
14493             ctx->args_owned = 0;
14494             Py_DECREF(ctx->args);
14495         }
14496         ctx->args = PyObject_GetItem(ctx->dict, key);
14497         Py_DECREF(key);
14498         if (ctx->args == NULL)
14499             return -1;
14500         ctx->args_owned = 1;
14501         ctx->arglen = -1;
14502         ctx->argidx = -2;
14503     }
14504 
14505     /* Parse flags. Example: "%+i" => flags=F_SIGN. */
14506     while (--ctx->fmtcnt >= 0) {
14507         arg->ch = FORMAT_READ(ctx);
14508         ctx->fmtpos++;
14509         switch (arg->ch) {
14510         case '-': arg->flags |= F_LJUST; continue;
14511         case '+': arg->flags |= F_SIGN; continue;
14512         case ' ': arg->flags |= F_BLANK; continue;
14513         case '#': arg->flags |= F_ALT; continue;
14514         case '0': arg->flags |= F_ZERO; continue;
14515         }
14516         break;
14517     }
14518 
14519     /* Parse width. Example: "%10s" => width=10 */
14520     if (arg->ch == '*') {
14521         v = unicode_format_getnextarg(ctx);
14522         if (v == NULL)
14523             return -1;
14524         if (!PyLong_Check(v)) {
14525             PyErr_SetString(PyExc_TypeError,
14526                             "* wants int");
14527             return -1;
14528         }
14529         arg->width = PyLong_AsSsize_t(v);
14530         if (arg->width == -1 && PyErr_Occurred())
14531             return -1;
14532         if (arg->width < 0) {
14533             arg->flags |= F_LJUST;
14534             arg->width = -arg->width;
14535         }
14536         if (--ctx->fmtcnt >= 0) {
14537             arg->ch = FORMAT_READ(ctx);
14538             ctx->fmtpos++;
14539         }
14540     }
14541     else if (arg->ch >= '0' && arg->ch <= '9') {
14542         arg->width = arg->ch - '0';
14543         while (--ctx->fmtcnt >= 0) {
14544             arg->ch = FORMAT_READ(ctx);
14545             ctx->fmtpos++;
14546             if (arg->ch < '0' || arg->ch > '9')
14547                 break;
14548             /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14549                mixing signed and unsigned comparison. Since arg->ch is between
14550                '0' and '9', casting to int is safe. */
14551             if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14552                 PyErr_SetString(PyExc_ValueError,
14553                                 "width too big");
14554                 return -1;
14555             }
14556             arg->width = arg->width*10 + (arg->ch - '0');
14557         }
14558     }
14559 
14560     /* Parse precision. Example: "%.3f" => prec=3 */
14561     if (arg->ch == '.') {
14562         arg->prec = 0;
14563         if (--ctx->fmtcnt >= 0) {
14564             arg->ch = FORMAT_READ(ctx);
14565             ctx->fmtpos++;
14566         }
14567         if (arg->ch == '*') {
14568             v = unicode_format_getnextarg(ctx);
14569             if (v == NULL)
14570                 return -1;
14571             if (!PyLong_Check(v)) {
14572                 PyErr_SetString(PyExc_TypeError,
14573                                 "* wants int");
14574                 return -1;
14575             }
14576             arg->prec = _PyLong_AsInt(v);
14577             if (arg->prec == -1 && PyErr_Occurred())
14578                 return -1;
14579             if (arg->prec < 0)
14580                 arg->prec = 0;
14581             if (--ctx->fmtcnt >= 0) {
14582                 arg->ch = FORMAT_READ(ctx);
14583                 ctx->fmtpos++;
14584             }
14585         }
14586         else if (arg->ch >= '0' && arg->ch <= '9') {
14587             arg->prec = arg->ch - '0';
14588             while (--ctx->fmtcnt >= 0) {
14589                 arg->ch = FORMAT_READ(ctx);
14590                 ctx->fmtpos++;
14591                 if (arg->ch < '0' || arg->ch > '9')
14592                     break;
14593                 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14594                     PyErr_SetString(PyExc_ValueError,
14595                                     "precision too big");
14596                     return -1;
14597                 }
14598                 arg->prec = arg->prec*10 + (arg->ch - '0');
14599             }
14600         }
14601     }
14602 
14603     /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14604     if (ctx->fmtcnt >= 0) {
14605         if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14606             if (--ctx->fmtcnt >= 0) {
14607                 arg->ch = FORMAT_READ(ctx);
14608                 ctx->fmtpos++;
14609             }
14610         }
14611     }
14612     if (ctx->fmtcnt < 0) {
14613         PyErr_SetString(PyExc_ValueError,
14614                         "incomplete format");
14615         return -1;
14616     }
14617     return 0;
14618 
14619 #undef FORMAT_READ
14620 }
14621 
14622 /* Format one argument. Supported conversion specifiers:
14623 
14624    - "s", "r", "a": any type
14625    - "i", "d", "u": int or float
14626    - "o", "x", "X": int
14627    - "e", "E", "f", "F", "g", "G": float
14628    - "c": int or str (1 character)
14629 
14630    When possible, the output is written directly into the Unicode writer
14631    (ctx->writer). A string is created when padding is required.
14632 
14633    Return 0 if the argument has been formatted into *p_str,
14634           1 if the argument has been written into ctx->writer,
14635          -1 on error. */
14636 static int
unicode_format_arg_format(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject ** p_str)14637 unicode_format_arg_format(struct unicode_formatter_t *ctx,
14638                           struct unicode_format_arg_t *arg,
14639                           PyObject **p_str)
14640 {
14641     PyObject *v;
14642     _PyUnicodeWriter *writer = &ctx->writer;
14643 
14644     if (ctx->fmtcnt == 0)
14645         ctx->writer.overallocate = 0;
14646 
14647     v = unicode_format_getnextarg(ctx);
14648     if (v == NULL)
14649         return -1;
14650 
14651 
14652     switch (arg->ch) {
14653     case 's':
14654     case 'r':
14655     case 'a':
14656         if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14657             /* Fast path */
14658             if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14659                 return -1;
14660             return 1;
14661         }
14662 
14663         if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14664             *p_str = v;
14665             Py_INCREF(*p_str);
14666         }
14667         else {
14668             if (arg->ch == 's')
14669                 *p_str = PyObject_Str(v);
14670             else if (arg->ch == 'r')
14671                 *p_str = PyObject_Repr(v);
14672             else
14673                 *p_str = PyObject_ASCII(v);
14674         }
14675         break;
14676 
14677     case 'i':
14678     case 'd':
14679     case 'u':
14680     case 'o':
14681     case 'x':
14682     case 'X':
14683     {
14684         int ret = mainformatlong(v, arg, p_str, writer);
14685         if (ret != 0)
14686             return ret;
14687         arg->sign = 1;
14688         break;
14689     }
14690 
14691     case 'e':
14692     case 'E':
14693     case 'f':
14694     case 'F':
14695     case 'g':
14696     case 'G':
14697         if (arg->width == -1 && arg->prec == -1
14698             && !(arg->flags & (F_SIGN | F_BLANK)))
14699         {
14700             /* Fast path */
14701             if (formatfloat(v, arg, NULL, writer) == -1)
14702                 return -1;
14703             return 1;
14704         }
14705 
14706         arg->sign = 1;
14707         if (formatfloat(v, arg, p_str, NULL) == -1)
14708             return -1;
14709         break;
14710 
14711     case 'c':
14712     {
14713         Py_UCS4 ch = formatchar(v);
14714         if (ch == (Py_UCS4) -1)
14715             return -1;
14716         if (arg->width == -1 && arg->prec == -1) {
14717             /* Fast path */
14718             if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14719                 return -1;
14720             return 1;
14721         }
14722         *p_str = PyUnicode_FromOrdinal(ch);
14723         break;
14724     }
14725 
14726     default:
14727         PyErr_Format(PyExc_ValueError,
14728                      "unsupported format character '%c' (0x%x) "
14729                      "at index %zd",
14730                      (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14731                      (int)arg->ch,
14732                      ctx->fmtpos - 1);
14733         return -1;
14734     }
14735     if (*p_str == NULL)
14736         return -1;
14737     assert (PyUnicode_Check(*p_str));
14738     return 0;
14739 }
14740 
/* Write the already-formatted string `str` for one argument into
   ctx->writer, applying the field width, precision truncation (for "s",
   "r" and "a"), sign handling and padding described by `arg`.

   arg->width and arg->sign are used as scratch state and may be modified.

   Return 0 on success, -1 on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Pad with '0' only when arg->sign is set and the "0" flag was given;
       otherwise pad with spaces. */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path: no padding, truncation or sign insertion is needed,
           so the string is appended unchanged. */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* The string carries its own sign: remember it and skip it,
               so that len counts only the characters after the sign. */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            /* No sign character has to be written at all. */
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    maxchar = writer->maxchar;
    /* The fill character only matters for maxchar when left padding will
       actually be written. */
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    buflen = arg->width;
    /* When the characters alone already fill the width, the sign needs
       one extra slot. */
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        /* With '0' fill the sign goes before the padding; with space fill
           it is written after the padding (see below). */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        /* The sign uses one column of the field. */
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        /* With '0' fill the prefix goes before the padding and is consumed
           from str here; with space fill it is emitted after the padding
           (see below). */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* In both cases the two prefix characters no longer count towards
           the width or the remaining length. */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* Nothing is left for right padding. */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right if needed; right padding always uses spaces, never the
       fill character. */
    if (arg->width > len) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
14895 
14896 /* Helper of PyUnicode_Format(): format one arg.
14897    Return 0 on success, raise an exception and return -1 on error. */
14898 static int
unicode_format_arg(struct unicode_formatter_t * ctx)14899 unicode_format_arg(struct unicode_formatter_t *ctx)
14900 {
14901     struct unicode_format_arg_t arg;
14902     PyObject *str;
14903     int ret;
14904 
14905     arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14906     if (arg.ch == '%') {
14907         ctx->fmtpos++;
14908         ctx->fmtcnt--;
14909         if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14910             return -1;
14911         return 0;
14912     }
14913     arg.flags = 0;
14914     arg.width = -1;
14915     arg.prec = -1;
14916     arg.sign = 0;
14917     str = NULL;
14918 
14919     ret = unicode_format_arg_parse(ctx, &arg);
14920     if (ret == -1)
14921         return -1;
14922 
14923     ret = unicode_format_arg_format(ctx, &arg, &str);
14924     if (ret == -1)
14925         return -1;
14926 
14927     if (ret != 1) {
14928         ret = unicode_format_arg_output(ctx, &arg, str);
14929         Py_DECREF(str);
14930         if (ret == -1)
14931             return -1;
14932     }
14933 
14934     if (ctx->dict && (ctx->argidx < ctx->arglen)) {
14935         PyErr_SetString(PyExc_TypeError,
14936                         "not all arguments converted during string formatting");
14937         return -1;
14938     }
14939     return 0;
14940 }
14941 
14942 PyObject *
PyUnicode_Format(PyObject * format,PyObject * args)14943 PyUnicode_Format(PyObject *format, PyObject *args)
14944 {
14945     struct unicode_formatter_t ctx;
14946 
14947     if (format == NULL || args == NULL) {
14948         PyErr_BadInternalCall();
14949         return NULL;
14950     }
14951 
14952     if (ensure_unicode(format) < 0)
14953         return NULL;
14954 
14955     ctx.fmtstr = format;
14956     ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14957     ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14958     ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14959     ctx.fmtpos = 0;
14960 
14961     _PyUnicodeWriter_Init(&ctx.writer);
14962     ctx.writer.min_length = ctx.fmtcnt + 100;
14963     ctx.writer.overallocate = 1;
14964 
14965     if (PyTuple_Check(args)) {
14966         ctx.arglen = PyTuple_Size(args);
14967         ctx.argidx = 0;
14968     }
14969     else {
14970         ctx.arglen = -1;
14971         ctx.argidx = -2;
14972     }
14973     ctx.args_owned = 0;
14974     if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
14975         ctx.dict = args;
14976     else
14977         ctx.dict = NULL;
14978     ctx.args = args;
14979 
14980     while (--ctx.fmtcnt >= 0) {
14981         if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14982             Py_ssize_t nonfmtpos;
14983 
14984             nonfmtpos = ctx.fmtpos++;
14985             while (ctx.fmtcnt >= 0 &&
14986                    PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14987                 ctx.fmtpos++;
14988                 ctx.fmtcnt--;
14989             }
14990             if (ctx.fmtcnt < 0) {
14991                 ctx.fmtpos--;
14992                 ctx.writer.overallocate = 0;
14993             }
14994 
14995             if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14996                                                 nonfmtpos, ctx.fmtpos) < 0)
14997                 goto onError;
14998         }
14999         else {
15000             ctx.fmtpos++;
15001             if (unicode_format_arg(&ctx) == -1)
15002                 goto onError;
15003         }
15004     }
15005 
15006     if (ctx.argidx < ctx.arglen && !ctx.dict) {
15007         PyErr_SetString(PyExc_TypeError,
15008                         "not all arguments converted during string formatting");
15009         goto onError;
15010     }
15011 
15012     if (ctx.args_owned) {
15013         Py_DECREF(ctx.args);
15014     }
15015     return _PyUnicodeWriter_Finish(&ctx.writer);
15016 
15017   onError:
15018     _PyUnicodeWriter_Dealloc(&ctx.writer);
15019     if (ctx.args_owned) {
15020         Py_DECREF(ctx.args);
15021     }
15022     return NULL;
15023 }
15024 
15025 static PyObject *
15026 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15027 
15028 static PyObject *
unicode_new(PyTypeObject * type,PyObject * args,PyObject * kwds)15029 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15030 {
15031     PyObject *x = NULL;
15032     static char *kwlist[] = {"object", "encoding", "errors", 0};
15033     char *encoding = NULL;
15034     char *errors = NULL;
15035 
15036     if (type != &PyUnicode_Type)
15037         return unicode_subtype_new(type, args, kwds);
15038     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
15039                                      kwlist, &x, &encoding, &errors))
15040         return NULL;
15041     if (x == NULL)
15042         _Py_RETURN_UNICODE_EMPTY();
15043     if (encoding == NULL && errors == NULL)
15044         return PyObject_Str(x);
15045     else
15046         return PyUnicode_FromEncodedObject(x, encoding, errors);
15047 }
15048 
15049 static PyObject *
unicode_subtype_new(PyTypeObject * type,PyObject * args,PyObject * kwds)15050 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15051 {
15052     PyObject *unicode, *self;
15053     Py_ssize_t length, char_size;
15054     int share_wstr, share_utf8;
15055     unsigned int kind;
15056     void *data;
15057 
15058     assert(PyType_IsSubtype(type, &PyUnicode_Type));
15059 
15060     unicode = unicode_new(&PyUnicode_Type, args, kwds);
15061     if (unicode == NULL)
15062         return NULL;
15063     assert(_PyUnicode_CHECK(unicode));
15064     if (PyUnicode_READY(unicode) == -1) {
15065         Py_DECREF(unicode);
15066         return NULL;
15067     }
15068 
15069     self = type->tp_alloc(type, 0);
15070     if (self == NULL) {
15071         Py_DECREF(unicode);
15072         return NULL;
15073     }
15074     kind = PyUnicode_KIND(unicode);
15075     length = PyUnicode_GET_LENGTH(unicode);
15076 
15077     _PyUnicode_LENGTH(self) = length;
15078 #ifdef Py_DEBUG
15079     _PyUnicode_HASH(self) = -1;
15080 #else
15081     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15082 #endif
15083     _PyUnicode_STATE(self).interned = 0;
15084     _PyUnicode_STATE(self).kind = kind;
15085     _PyUnicode_STATE(self).compact = 0;
15086     _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
15087     _PyUnicode_STATE(self).ready = 1;
15088     _PyUnicode_WSTR(self) = NULL;
15089     _PyUnicode_UTF8_LENGTH(self) = 0;
15090     _PyUnicode_UTF8(self) = NULL;
15091     _PyUnicode_WSTR_LENGTH(self) = 0;
15092     _PyUnicode_DATA_ANY(self) = NULL;
15093 
15094     share_utf8 = 0;
15095     share_wstr = 0;
15096     if (kind == PyUnicode_1BYTE_KIND) {
15097         char_size = 1;
15098         if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15099             share_utf8 = 1;
15100     }
15101     else if (kind == PyUnicode_2BYTE_KIND) {
15102         char_size = 2;
15103         if (sizeof(wchar_t) == 2)
15104             share_wstr = 1;
15105     }
15106     else {
15107         assert(kind == PyUnicode_4BYTE_KIND);
15108         char_size = 4;
15109         if (sizeof(wchar_t) == 4)
15110             share_wstr = 1;
15111     }
15112 
15113     /* Ensure we won't overflow the length. */
15114     if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15115         PyErr_NoMemory();
15116         goto onError;
15117     }
15118     data = PyObject_MALLOC((length + 1) * char_size);
15119     if (data == NULL) {
15120         PyErr_NoMemory();
15121         goto onError;
15122     }
15123 
15124     _PyUnicode_DATA_ANY(self) = data;
15125     if (share_utf8) {
15126         _PyUnicode_UTF8_LENGTH(self) = length;
15127         _PyUnicode_UTF8(self) = data;
15128     }
15129     if (share_wstr) {
15130         _PyUnicode_WSTR_LENGTH(self) = length;
15131         _PyUnicode_WSTR(self) = (wchar_t *)data;
15132     }
15133 
15134     memcpy(data, PyUnicode_DATA(unicode),
15135               kind * (length + 1));
15136     assert(_PyUnicode_CheckConsistency(self, 1));
15137 #ifdef Py_DEBUG
15138     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15139 #endif
15140     Py_DECREF(unicode);
15141     return self;
15142 
15143 onError:
15144     Py_DECREF(unicode);
15145     Py_DECREF(self);
15146     return NULL;
15147 }
15148 
/* Docstring of the str type; installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
15160 
/* Defined further down; needed here for the tp_iter slot. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object for str.  Instances are created through unicode_new();
   Py_TPFLAGS_BASETYPE allows subclassing. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash */
    0,                  /* tp_call */
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
15206 
15207 /* Initialize the Unicode implementation */
15208 
_PyUnicode_Init(void)15209 int _PyUnicode_Init(void)
15210 {
15211     /* XXX - move this array to unicodectype.c ? */
15212     Py_UCS2 linebreak[] = {
15213         0x000A, /* LINE FEED */
15214         0x000D, /* CARRIAGE RETURN */
15215         0x001C, /* FILE SEPARATOR */
15216         0x001D, /* GROUP SEPARATOR */
15217         0x001E, /* RECORD SEPARATOR */
15218         0x0085, /* NEXT LINE */
15219         0x2028, /* LINE SEPARATOR */
15220         0x2029, /* PARAGRAPH SEPARATOR */
15221     };
15222 
15223     /* Init the implementation */
15224     _Py_INCREF_UNICODE_EMPTY();
15225     if (!unicode_empty)
15226         Py_FatalError("Can't create empty string");
15227     Py_DECREF(unicode_empty);
15228 
15229     if (PyType_Ready(&PyUnicode_Type) < 0)
15230         Py_FatalError("Can't initialize 'unicode'");
15231 
15232     /* initialize the linebreak bloom filter */
15233     bloom_linebreak = make_bloom_mask(
15234         PyUnicode_2BYTE_KIND, linebreak,
15235         Py_ARRAY_LENGTH(linebreak));
15236 
15237     if (PyType_Ready(&EncodingMapType) < 0)
15238          Py_FatalError("Can't initialize encoding map type");
15239 
15240     if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15241         Py_FatalError("Can't initialize field name iterator type");
15242 
15243     if (PyType_Ready(&PyFormatterIter_Type) < 0)
15244         Py_FatalError("Can't initialize formatter iter type");
15245 
15246     return 0;
15247 }
15248 
15249 /* Finalize the Unicode implementation */
15250 
/* Free-list clearing entry point.  This implementation releases nothing
   and always reports that zero objects were freed. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
15256 
15257 void
_PyUnicode_Fini(void)15258 _PyUnicode_Fini(void)
15259 {
15260     int i;
15261 
15262     Py_CLEAR(unicode_empty);
15263 
15264     for (i = 0; i < 256; i++)
15265         Py_CLEAR(unicode_latin1[i]);
15266     _PyUnicode_ClearStaticStrings();
15267     (void)PyUnicode_ClearFreeList();
15268 }
15269 
15270 void
PyUnicode_InternInPlace(PyObject ** p)15271 PyUnicode_InternInPlace(PyObject **p)
15272 {
15273     PyObject *s = *p;
15274     PyObject *t;
15275 #ifdef Py_DEBUG
15276     assert(s != NULL);
15277     assert(_PyUnicode_CHECK(s));
15278 #else
15279     if (s == NULL || !PyUnicode_Check(s))
15280         return;
15281 #endif
15282     /* If it's a subclass, we don't really know what putting
15283        it in the interned dict might do. */
15284     if (!PyUnicode_CheckExact(s))
15285         return;
15286     if (PyUnicode_CHECK_INTERNED(s))
15287         return;
15288     if (interned == NULL) {
15289         interned = PyDict_New();
15290         if (interned == NULL) {
15291             PyErr_Clear(); /* Don't leave an exception */
15292             return;
15293         }
15294     }
15295     Py_ALLOW_RECURSION
15296     t = PyDict_SetDefault(interned, s, s);
15297     Py_END_ALLOW_RECURSION
15298     if (t == NULL) {
15299         PyErr_Clear();
15300         return;
15301     }
15302     if (t != s) {
15303         Py_INCREF(t);
15304         Py_SETREF(*p, t);
15305         return;
15306     }
15307     /* The two references in interned are not counted by refcnt.
15308        The deallocator will take care of this */
15309     Py_REFCNT(s) -= 2;
15310     _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
15311 }
15312 
15313 void
PyUnicode_InternImmortal(PyObject ** p)15314 PyUnicode_InternImmortal(PyObject **p)
15315 {
15316     PyUnicode_InternInPlace(p);
15317     if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15318         _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15319         Py_INCREF(*p);
15320     }
15321 }
15322 
15323 PyObject *
PyUnicode_InternFromString(const char * cp)15324 PyUnicode_InternFromString(const char *cp)
15325 {
15326     PyObject *s = PyUnicode_FromString(cp);
15327     if (s == NULL)
15328         return NULL;
15329     PyUnicode_InternInPlace(&s);
15330     return s;
15331 }
15332 
15333 void
_Py_ReleaseInternedUnicodeStrings(void)15334 _Py_ReleaseInternedUnicodeStrings(void)
15335 {
15336     PyObject *keys;
15337     PyObject *s;
15338     Py_ssize_t i, n;
15339     Py_ssize_t immortal_size = 0, mortal_size = 0;
15340 
15341     if (interned == NULL || !PyDict_Check(interned))
15342         return;
15343     keys = PyDict_Keys(interned);
15344     if (keys == NULL || !PyList_Check(keys)) {
15345         PyErr_Clear();
15346         return;
15347     }
15348 
15349     /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15350        detector, interned unicode strings are not forcibly deallocated;
15351        rather, we give them their stolen references back, and then clear
15352        and DECREF the interned dict. */
15353 
15354     n = PyList_GET_SIZE(keys);
15355     fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
15356             n);
15357     for (i = 0; i < n; i++) {
15358         s = PyList_GET_ITEM(keys, i);
15359         if (PyUnicode_READY(s) == -1) {
15360             Py_UNREACHABLE();
15361         }
15362         switch (PyUnicode_CHECK_INTERNED(s)) {
15363         case SSTATE_NOT_INTERNED:
15364             /* XXX Shouldn't happen */
15365             break;
15366         case SSTATE_INTERNED_IMMORTAL:
15367             Py_REFCNT(s) += 1;
15368             immortal_size += PyUnicode_GET_LENGTH(s);
15369             break;
15370         case SSTATE_INTERNED_MORTAL:
15371             Py_REFCNT(s) += 2;
15372             mortal_size += PyUnicode_GET_LENGTH(s);
15373             break;
15374         default:
15375             Py_FatalError("Inconsistent interned string state.");
15376         }
15377         _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
15378     }
15379     fprintf(stderr, "total size of all interned strings: "
15380             "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15381             "mortal/immortal\n", mortal_size, immortal_size);
15382     Py_DECREF(keys);
15383     PyDict_Clear(interned);
15384     Py_CLEAR(interned);
15385 }
15386 
15387 
15388 /********************* Unicode Iterator **************************/
15389 
/* State of a str iterator (type PyUnicodeIter_Type). */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index; /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15395 
15396 static void
unicodeiter_dealloc(unicodeiterobject * it)15397 unicodeiter_dealloc(unicodeiterobject *it)
15398 {
15399     _PyObject_GC_UNTRACK(it);
15400     Py_XDECREF(it->it_seq);
15401     PyObject_GC_Del(it);
15402 }
15403 
15404 static int
unicodeiter_traverse(unicodeiterobject * it,visitproc visit,void * arg)15405 unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15406 {
15407     Py_VISIT(it->it_seq);
15408     return 0;
15409 }
15410 
15411 static PyObject *
unicodeiter_next(unicodeiterobject * it)15412 unicodeiter_next(unicodeiterobject *it)
15413 {
15414     PyObject *seq, *item;
15415 
15416     assert(it != NULL);
15417     seq = it->it_seq;
15418     if (seq == NULL)
15419         return NULL;
15420     assert(_PyUnicode_CHECK(seq));
15421 
15422     if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15423         int kind = PyUnicode_KIND(seq);
15424         void *data = PyUnicode_DATA(seq);
15425         Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15426         item = PyUnicode_FromOrdinal(chr);
15427         if (item != NULL)
15428             ++it->it_index;
15429         return item;
15430     }
15431 
15432     it->it_seq = NULL;
15433     Py_DECREF(seq);
15434     return NULL;
15435 }
15436 
15437 static PyObject *
unicodeiter_len(unicodeiterobject * it)15438 unicodeiter_len(unicodeiterobject *it)
15439 {
15440     Py_ssize_t len = 0;
15441     if (it->it_seq)
15442         len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15443     return PyLong_FromSsize_t(len);
15444 }
15445 
15446 PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15447 
15448 static PyObject *
unicodeiter_reduce(unicodeiterobject * it)15449 unicodeiter_reduce(unicodeiterobject *it)
15450 {
15451     if (it->it_seq != NULL) {
15452         return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
15453                              it->it_seq, it->it_index);
15454     } else {
15455         PyObject *u = (PyObject *)_PyUnicode_New(0);
15456         if (u == NULL)
15457             return NULL;
15458         return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
15459     }
15460 }
15461 
15462 PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15463 
15464 static PyObject *
unicodeiter_setstate(unicodeiterobject * it,PyObject * state)15465 unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15466 {
15467     Py_ssize_t index = PyLong_AsSsize_t(state);
15468     if (index == -1 && PyErr_Occurred())
15469         return NULL;
15470     if (it->it_seq != NULL) {
15471         if (index < 0)
15472             index = 0;
15473         else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15474             index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15475         it->it_index = index;
15476     }
15477     Py_RETURN_NONE;
15478 }
15479 
15480 PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15481 
/* Method table of the str iterator: length hint plus the two pickling
   hooks. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,      NULL}       /* sentinel */
};
15491 
/* Type object of the iterator returned by unicode_iter().  Instances take
   part in garbage collection (Py_TPFLAGS_HAVE_GC with a tp_traverse
   slot). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
15524 
15525 static PyObject *
unicode_iter(PyObject * seq)15526 unicode_iter(PyObject *seq)
15527 {
15528     unicodeiterobject *it;
15529 
15530     if (!PyUnicode_Check(seq)) {
15531         PyErr_BadInternalCall();
15532         return NULL;
15533     }
15534     if (PyUnicode_READY(seq) == -1)
15535         return NULL;
15536     it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15537     if (it == NULL)
15538         return NULL;
15539     it->it_index = 0;
15540     Py_INCREF(seq);
15541     it->it_seq = seq;
15542     _PyObject_GC_TRACK(it);
15543     return (PyObject *)it;
15544 }
15545 
15546 
15547 size_t
Py_UNICODE_strlen(const Py_UNICODE * u)15548 Py_UNICODE_strlen(const Py_UNICODE *u)
15549 {
15550     return wcslen(u);
15551 }
15552 
15553 Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE * s1,const Py_UNICODE * s2)15554 Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15555 {
15556     Py_UNICODE *u = s1;
15557     while ((*u++ = *s2++));
15558     return s1;
15559 }
15560 
15561 Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE * s1,const Py_UNICODE * s2,size_t n)15562 Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15563 {
15564     Py_UNICODE *u = s1;
15565     while ((*u++ = *s2++))
15566         if (n-- == 0)
15567             break;
15568     return s1;
15569 }
15570 
15571 Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE * s1,const Py_UNICODE * s2)15572 Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15573 {
15574     Py_UNICODE *u1 = s1;
15575     u1 += wcslen(u1);
15576     while ((*u1++ = *s2++));
15577     return s1;
15578 }
15579 
15580 int
Py_UNICODE_strcmp(const Py_UNICODE * s1,const Py_UNICODE * s2)15581 Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15582 {
15583     while (*s1 && *s2 && *s1 == *s2)
15584         s1++, s2++;
15585     if (*s1 && *s2)
15586         return (*s1 < *s2) ? -1 : +1;
15587     if (*s1)
15588         return 1;
15589     if (*s2)
15590         return -1;
15591     return 0;
15592 }
15593 
15594 int
Py_UNICODE_strncmp(const Py_UNICODE * s1,const Py_UNICODE * s2,size_t n)15595 Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15596 {
15597     Py_UNICODE u1, u2;
15598     for (; n != 0; n--) {
15599         u1 = *s1;
15600         u2 = *s2;
15601         if (u1 != u2)
15602             return (u1 < u2) ? -1 : +1;
15603         if (u1 == '\0')
15604             return 0;
15605         s1++;
15606         s2++;
15607     }
15608     return 0;
15609 }
15610 
15611 Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE * s,Py_UNICODE c)15612 Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15613 {
15614     const Py_UNICODE *p;
15615     for (p = s; *p; p++)
15616         if (*p == c)
15617             return (Py_UNICODE*)p;
15618     return NULL;
15619 }
15620 
15621 Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE * s,Py_UNICODE c)15622 Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15623 {
15624     const Py_UNICODE *p;
15625     p = s + wcslen(s);
15626     while (p != s) {
15627         p--;
15628         if (*p == c)
15629             return (Py_UNICODE*)p;
15630     }
15631     return NULL;
15632 }
15633 
15634 Py_UNICODE*
PyUnicode_AsUnicodeCopy(PyObject * unicode)15635 PyUnicode_AsUnicodeCopy(PyObject *unicode)
15636 {
15637     Py_UNICODE *u, *copy;
15638     Py_ssize_t len, size;
15639 
15640     if (!PyUnicode_Check(unicode)) {
15641         PyErr_BadArgument();
15642         return NULL;
15643     }
15644     u = PyUnicode_AsUnicodeAndSize(unicode, &len);
15645     if (u == NULL)
15646         return NULL;
15647     /* Ensure we won't overflow the size. */
15648     if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
15649         PyErr_NoMemory();
15650         return NULL;
15651     }
15652     size = len + 1; /* copy the null character */
15653     size *= sizeof(Py_UNICODE);
15654     copy = PyMem_Malloc(size);
15655     if (copy == NULL) {
15656         PyErr_NoMemory();
15657         return NULL;
15658     }
15659     memcpy(copy, u, size);
15660     return copy;
15661 }
15662 
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */

/* Functions exported by _string; each takes exactly one argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
15673 
/* Module definition for _string; passed to PyModule_Create() by
   PyInit__string(). */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* m_name */
    PyDoc_STR("string helper module"),  /* m_doc */
    0,                                  /* m_size */
    _string_methods,                    /* m_methods */
    NULL,                               /* m_slots */
    NULL,                               /* m_traverse */
    NULL,                               /* m_clear */
    NULL                                /* m_free */
};
15685 
15686 PyMODINIT_FUNC
PyInit__string(void)15687 PyInit__string(void)
15688 {
15689     return PyModule_Create(&_string_module);
15690 }
15691 
15692 
15693 #ifdef __cplusplus
15694 }
15695 #endif
15696