1 /*
2 
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com>.
5 
6 Major speed upgrades to the method implementations at the Reykjavik
7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8 
9 Copyright (c) Corporation for National Research Initiatives.
10 
11 --------------------------------------------------------------------
12 The original string type implementation is:
13 
14   Copyright (c) 1999 by Secret Labs AB
15   Copyright (c) 1999 by Fredrik Lundh
16 
17 By obtaining, using, and/or copying this software and/or its
18 associated documentation, you agree that you have read, understood,
19 and will comply with the following terms and conditions:
20 
21 Permission to use, copy, modify, and distribute this software and its
22 associated documentation for any purpose and without fee is hereby
23 granted, provided that the above copyright notice appears in all
24 copies, and that both that copyright notice and this permission notice
25 appear in supporting documentation, and that the name of Secret Labs
26 AB or the author not be used in advertising or publicity pertaining to
27 distribution of the software without specific, written prior
28 permission.
29 
30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37 --------------------------------------------------------------------
38 
39 */
40 
41 #define PY_SSIZE_T_CLEAN
42 #include "Python.h"
43 #include "pycore_initconfig.h"
44 #include "pycore_fileutils.h"
45 #include "pycore_object.h"
46 #include "pycore_pylifecycle.h"
47 #include "pycore_pystate.h"
48 #include "ucnhash.h"
49 #include "bytes_methods.h"
50 #include "stringlib/eq.h"
51 
52 #ifdef MS_WINDOWS
53 #include <windows.h>
54 #endif
55 
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
58 /* #define INTERNED_STATS 1 */
59 
60 
61 /*[clinic input]
62 class str "PyObject *" "&PyUnicode_Type"
63 [clinic start generated code]*/
64 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
65 
66 /*[python input]
67 class Py_UCS4_converter(CConverter):
68     type = 'Py_UCS4'
69     converter = 'convert_uc'
70 
71     def converter_init(self):
72         if self.default is not unspecified:
73             self.c_default = ascii(self.default)
74             if len(self.c_default) > 4 or self.c_default[0] != "'":
75                 self.c_default = hex(ord(self.default))
76 
77 [python start generated code]*/
78 /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
79 
80 /* --- Globals ------------------------------------------------------------
81 
82 NOTE: In the interpreter's initialization phase, some globals are currently
83       initialized dynamically as needed. In the process Unicode objects may
84       be created before the Unicode type is ready.
85 
86 */
87 
88 
89 #ifdef __cplusplus
90 extern "C" {
91 #endif
92 
/* Largest valid code point as of Unicode 6.0: 0x10ffff (1,114,111).
   fileutils.c carries the same value; the two must stay equal. */
#define MAX_UNICODE 0x10ffff

/* Check applied by the helper macros of this file: release builds only
   test the object's type, debug builds run _PyUnicode_CheckConsistency()
   with content checking switched off. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
102 
/* Raw access to the utf8 field (object viewed as PyCompactUnicodeObject);
   no checks are performed. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 data of a ready string.  For a compact ASCII object this is the
   memory directly following the PyASCIIObject header, otherwise it is the
   utf8 field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) \
         ? ((char*)((PyASCIIObject*)(op) + 1)) \
         : _PyUnicode_UTF8(op))

/* Raw access to the utf8_length field; no checks are performed. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Length of the UTF-8 data of a ready string.  For a compact ASCII object
   this is the string length itself, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) \
         ? ((PyASCIIObject*)(op))->length \
         : _PyUnicode_UTF8_LENGTH(op))

/* Unchecked accessors for individual fields of the object headers. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Kind and length accessors that first assert _PyUnicode_CHECK(op). */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* Unchecked access to the data.any pointer (object viewed as
   PyUnicodeObject). */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)

/* File-local replacement for PyUnicode_READY (the previous definition is
   removed first): evaluates to 0 when the string is already ready and to
   the result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
144 
/* True if the utf8 pointer of the object is the same pointer as its
   character data, i.e. no separate UTF-8 buffer is involved.  Must not be
   used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the object is the same pointer as its
   character data.  Every use of the argument goes through `op`, so the
   macro works whatever the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
152 
/* Non-zero if the object has a UTF-8 memory block of its own: it is not
   compact ASCII, its utf8 pointer is set, and that pointer differs from
   the character data (so it is not shared). */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (!PyUnicode_IS_COMPACT_ASCII(op) \
     && _PyUnicode_UTF8(op) \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* Non-zero if the object has a wstr memory block of its own: its wstr
   pointer is set and either the string is not ready or the pointer
   differs from the character data (so it is not shared). */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (_PyUnicode_WSTR(op) \
     && (!PyUnicode_IS_READY(op) \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
166 
/* Generic helper macro converting a run of characters from one integer
   type to another.

   from_type, to_type: valid type names of the source and target characters
   begin, end:         bounds of the source range, used as "from_type *"
   to:                 start of the output buffer, used as "to_type *"

   Each source character is cast to to_type and stored; the bulk of the
   range is handled four characters per iteration, the remaining 0-3
   characters one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)     \
    do {                                                                 \
        to_type *_to = (to_type *)(to);                                  \
        const from_type *_iter = (const from_type *)(begin);             \
        const from_type *_end = (const from_type *)(end);                \
        Py_ssize_t n = (_end) - (_iter);                                 \
        const from_type *_unrolled_end =                                 \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);                           \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) {          \
            _to[0] = (to_type) _iter[0];                                 \
            _to[1] = (to_type) _iter[1];                                 \
            _to[2] = (to_type) _iter[2];                                 \
            _to[3] = (to_type) _iter[3];                                 \
        }                                                                \
        for (; _iter < (_end); ++_iter, ++_to) {                         \
            *_to = (to_type) *_iter;                                     \
        }                                                                \
    } while (0)
190 
#ifndef MS_WINDOWS
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
198 
199 /* This dictionary holds all interned unicode strings.  Note that references
200    to strings in this dictionary are *not* counted in the string's ob_refcnt.
201    When the interned string reaches a refcnt of 0 the string deallocation
202    function will delete the reference from this dictionary.
203 
204    Another way to look at this is that to say that the actual reference
205    count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
206 */
207 static PyObject *interned = NULL;
208 
209 /* The empty Unicode object is shared to improve performance. */
210 static PyObject *unicode_empty = NULL;
211 
212 #define _Py_INCREF_UNICODE_EMPTY()                      \
213     do {                                                \
214         if (unicode_empty != NULL)                      \
215             Py_INCREF(unicode_empty);                   \
216         else {                                          \
217             unicode_empty = PyUnicode_New(0, 0);        \
218             if (unicode_empty != NULL) {                \
219                 Py_INCREF(unicode_empty);               \
220                 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
221             }                                           \
222         }                                               \
223     } while (0)
224 
225 #define _Py_RETURN_UNICODE_EMPTY()                      \
226     do {                                                \
227         _Py_INCREF_UNICODE_EMPTY();                     \
228         return unicode_empty;                           \
229     } while (0)
230 
231 static inline void
unicode_fill(enum PyUnicode_Kind kind,void * data,Py_UCS4 value,Py_ssize_t start,Py_ssize_t length)232 unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
233              Py_ssize_t start, Py_ssize_t length)
234 {
235     assert(0 <= start);
236     assert(kind != PyUnicode_WCHAR_KIND);
237     switch (kind) {
238     case PyUnicode_1BYTE_KIND: {
239         assert(value <= 0xff);
240         Py_UCS1 ch = (unsigned char)value;
241         Py_UCS1 *to = (Py_UCS1 *)data + start;
242         memset(to, ch, length);
243         break;
244     }
245     case PyUnicode_2BYTE_KIND: {
246         assert(value <= 0xffff);
247         Py_UCS2 ch = (Py_UCS2)value;
248         Py_UCS2 *to = (Py_UCS2 *)data + start;
249         const Py_UCS2 *end = to + length;
250         for (; to < end; ++to) *to = ch;
251         break;
252     }
253     case PyUnicode_4BYTE_KIND: {
254         assert(value <= MAX_UNICODE);
255         Py_UCS4 ch = value;
256         Py_UCS4 * to = (Py_UCS4 *)data + start;
257         const Py_UCS4 *end = to + length;
258         for (; to < end; ++to) *to = ch;
259         break;
260     }
261     default: Py_UNREACHABLE();
262     }
263 }
264 
265 
266 /* Forward declaration */
267 static inline int
268 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
269 static PyObject *
270 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
271                     const char *errors);
272 static PyObject *
273 unicode_decode_utf8(const char *s, Py_ssize_t size,
274                     _Py_error_handler error_handler, const char *errors,
275                     Py_ssize_t *consumed);
276 
277 /* List of static strings. */
278 static _Py_Identifier *static_strings = NULL;
279 
280 /* Single character Unicode strings in the Latin-1 range are being
281    shared as well. */
282 static PyObject *unicode_latin1[256] = {NULL};
283 
/* Fast detection of the most frequent whitespace characters: entry c of
   this 128-element table is 1 when ASCII code c is one of the characters
   listed below and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1      /* SPACE */
};
314 
315 /* forward */
316 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
317 static PyObject* get_latin1_char(unsigned char ch);
318 static int unicode_modifiable(PyObject *unicode);
319 
320 
321 static PyObject *
322 _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
323 static PyObject *
324 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
325 static PyObject *
326 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
327 
328 static PyObject *
329 unicode_encode_call_errorhandler(const char *errors,
330        PyObject **errorHandler,const char *encoding, const char *reason,
331        PyObject *unicode, PyObject **exceptionObject,
332        Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
333 
334 static void
335 raise_encode_exception(PyObject **exceptionObject,
336                        const char *encoding,
337                        PyObject *unicode,
338                        Py_ssize_t startpos, Py_ssize_t endpos,
339                        const char *reason);
340 
/* Same kind of lookup table for line breaks: entry c of this 128-element
   table is 1 when ASCII code c is one of the characters listed below and
   0 otherwise. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1      /* RECORD SEPARATOR */
};
368 
369 static int convert_uc(PyObject *obj, void *addr);
370 
371 #include "clinic/unicodeobject.c.h"
372 
373 _Py_error_handler
_Py_GetErrorHandler(const char * errors)374 _Py_GetErrorHandler(const char *errors)
375 {
376     if (errors == NULL || strcmp(errors, "strict") == 0) {
377         return _Py_ERROR_STRICT;
378     }
379     if (strcmp(errors, "surrogateescape") == 0) {
380         return _Py_ERROR_SURROGATEESCAPE;
381     }
382     if (strcmp(errors, "replace") == 0) {
383         return _Py_ERROR_REPLACE;
384     }
385     if (strcmp(errors, "ignore") == 0) {
386         return _Py_ERROR_IGNORE;
387     }
388     if (strcmp(errors, "backslashreplace") == 0) {
389         return _Py_ERROR_BACKSLASHREPLACE;
390     }
391     if (strcmp(errors, "surrogatepass") == 0) {
392         return _Py_ERROR_SURROGATEPASS;
393     }
394     if (strcmp(errors, "xmlcharrefreplace") == 0) {
395         return _Py_ERROR_XMLCHARREFREPLACE;
396     }
397     return _Py_ERROR_OTHER;
398 }
399 
400 
401 static _Py_error_handler
get_error_handler_wide(const wchar_t * errors)402 get_error_handler_wide(const wchar_t *errors)
403 {
404     if (errors == NULL || wcscmp(errors, L"strict") == 0) {
405         return _Py_ERROR_STRICT;
406     }
407     if (wcscmp(errors, L"surrogateescape") == 0) {
408         return _Py_ERROR_SURROGATEESCAPE;
409     }
410     if (wcscmp(errors, L"replace") == 0) {
411         return _Py_ERROR_REPLACE;
412     }
413     if (wcscmp(errors, L"ignore") == 0) {
414         return _Py_ERROR_IGNORE;
415     }
416     if (wcscmp(errors, L"backslashreplace") == 0) {
417         return _Py_ERROR_BACKSLASHREPLACE;
418     }
419     if (wcscmp(errors, L"surrogatepass") == 0) {
420         return _Py_ERROR_SURROGATEPASS;
421     }
422     if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
423         return _Py_ERROR_XMLCHARREFREPLACE;
424     }
425     return _Py_ERROR_OTHER;
426 }
427 
428 
429 /* The max unicode value is always 0x10FFFF while using the PEP-393 API.
430    This function is kept for backward compatibility with the old API. */
431 Py_UNICODE
PyUnicode_GetMax(void)432 PyUnicode_GetMax(void)
433 {
434 #ifdef Py_UNICODE_WIDE
435     return 0x10FFFF;
436 #else
437     /* This is actually an illegal character, so it should
438        not be passed to unichr. */
439     return 0xFFFF;
440 #endif
441 }
442 
443 int
_PyUnicode_CheckConsistency(PyObject * op,int check_content)444 _PyUnicode_CheckConsistency(PyObject *op, int check_content)
445 {
446 #define CHECK(expr) \
447     do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
448 
449     PyASCIIObject *ascii;
450     unsigned int kind;
451 
452     assert(op != NULL);
453     CHECK(PyUnicode_Check(op));
454 
455     ascii = (PyASCIIObject *)op;
456     kind = ascii->state.kind;
457 
458     if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
459         CHECK(kind == PyUnicode_1BYTE_KIND);
460         CHECK(ascii->state.ready == 1);
461     }
462     else {
463         PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
464         void *data;
465 
466         if (ascii->state.compact == 1) {
467             data = compact + 1;
468             CHECK(kind == PyUnicode_1BYTE_KIND
469                                  || kind == PyUnicode_2BYTE_KIND
470                                  || kind == PyUnicode_4BYTE_KIND);
471             CHECK(ascii->state.ascii == 0);
472             CHECK(ascii->state.ready == 1);
473             CHECK(compact->utf8 != data);
474         }
475         else {
476             PyUnicodeObject *unicode = (PyUnicodeObject *)op;
477 
478             data = unicode->data.any;
479             if (kind == PyUnicode_WCHAR_KIND) {
480                 CHECK(ascii->length == 0);
481                 CHECK(ascii->hash == -1);
482                 CHECK(ascii->state.compact == 0);
483                 CHECK(ascii->state.ascii == 0);
484                 CHECK(ascii->state.ready == 0);
485                 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
486                 CHECK(ascii->wstr != NULL);
487                 CHECK(data == NULL);
488                 CHECK(compact->utf8 == NULL);
489             }
490             else {
491                 CHECK(kind == PyUnicode_1BYTE_KIND
492                                      || kind == PyUnicode_2BYTE_KIND
493                                      || kind == PyUnicode_4BYTE_KIND);
494                 CHECK(ascii->state.compact == 0);
495                 CHECK(ascii->state.ready == 1);
496                 CHECK(data != NULL);
497                 if (ascii->state.ascii) {
498                     CHECK(compact->utf8 == data);
499                     CHECK(compact->utf8_length == ascii->length);
500                 }
501                 else
502                     CHECK(compact->utf8 != data);
503             }
504         }
505         if (kind != PyUnicode_WCHAR_KIND) {
506             if (
507 #if SIZEOF_WCHAR_T == 2
508                 kind == PyUnicode_2BYTE_KIND
509 #else
510                 kind == PyUnicode_4BYTE_KIND
511 #endif
512                )
513             {
514                 CHECK(ascii->wstr == data);
515                 CHECK(compact->wstr_length == ascii->length);
516             } else
517                 CHECK(ascii->wstr != data);
518         }
519 
520         if (compact->utf8 == NULL)
521             CHECK(compact->utf8_length == 0);
522         if (ascii->wstr == NULL)
523             CHECK(compact->wstr_length == 0);
524     }
525 
526     /* check that the best kind is used: O(n) operation */
527     if (check_content && kind != PyUnicode_WCHAR_KIND) {
528         Py_ssize_t i;
529         Py_UCS4 maxchar = 0;
530         void *data;
531         Py_UCS4 ch;
532 
533         data = PyUnicode_DATA(ascii);
534         for (i=0; i < ascii->length; i++)
535         {
536             ch = PyUnicode_READ(kind, data, i);
537             if (ch > maxchar)
538                 maxchar = ch;
539         }
540         if (kind == PyUnicode_1BYTE_KIND) {
541             if (ascii->state.ascii == 0) {
542                 CHECK(maxchar >= 128);
543                 CHECK(maxchar <= 255);
544             }
545             else
546                 CHECK(maxchar < 128);
547         }
548         else if (kind == PyUnicode_2BYTE_KIND) {
549             CHECK(maxchar >= 0x100);
550             CHECK(maxchar <= 0xFFFF);
551         }
552         else {
553             CHECK(maxchar >= 0x10000);
554             CHECK(maxchar <= MAX_UNICODE);
555         }
556         CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
557     }
558     return 1;
559 
560 #undef CHECK
561 }
562 
563 
564 static PyObject*
unicode_result_wchar(PyObject * unicode)565 unicode_result_wchar(PyObject *unicode)
566 {
567 #ifndef Py_DEBUG
568     Py_ssize_t len;
569 
570     len = _PyUnicode_WSTR_LENGTH(unicode);
571     if (len == 0) {
572         Py_DECREF(unicode);
573         _Py_RETURN_UNICODE_EMPTY();
574     }
575 
576     if (len == 1) {
577         wchar_t ch = _PyUnicode_WSTR(unicode)[0];
578         if ((Py_UCS4)ch < 256) {
579             PyObject *latin1_char = get_latin1_char((unsigned char)ch);
580             Py_DECREF(unicode);
581             return latin1_char;
582         }
583     }
584 
585     if (_PyUnicode_Ready(unicode) < 0) {
586         Py_DECREF(unicode);
587         return NULL;
588     }
589 #else
590     assert(Py_REFCNT(unicode) == 1);
591 
592     /* don't make the result ready in debug mode to ensure that the caller
593        makes the string ready before using it */
594     assert(_PyUnicode_CheckConsistency(unicode, 1));
595 #endif
596     return unicode;
597 }
598 
599 static PyObject*
unicode_result_ready(PyObject * unicode)600 unicode_result_ready(PyObject *unicode)
601 {
602     Py_ssize_t length;
603 
604     length = PyUnicode_GET_LENGTH(unicode);
605     if (length == 0) {
606         if (unicode != unicode_empty) {
607             Py_DECREF(unicode);
608             _Py_RETURN_UNICODE_EMPTY();
609         }
610         return unicode_empty;
611     }
612 
613     if (length == 1) {
614         void *data = PyUnicode_DATA(unicode);
615         int kind = PyUnicode_KIND(unicode);
616         Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
617         if (ch < 256) {
618             PyObject *latin1_char = unicode_latin1[ch];
619             if (latin1_char != NULL) {
620                 if (unicode != latin1_char) {
621                     Py_INCREF(latin1_char);
622                     Py_DECREF(unicode);
623                 }
624                 return latin1_char;
625             }
626             else {
627                 assert(_PyUnicode_CheckConsistency(unicode, 1));
628                 Py_INCREF(unicode);
629                 unicode_latin1[ch] = unicode;
630                 return unicode;
631             }
632         }
633     }
634 
635     assert(_PyUnicode_CheckConsistency(unicode, 1));
636     return unicode;
637 }
638 
639 static PyObject*
unicode_result(PyObject * unicode)640 unicode_result(PyObject *unicode)
641 {
642     assert(_PyUnicode_CHECK(unicode));
643     if (PyUnicode_IS_READY(unicode))
644         return unicode_result_ready(unicode);
645     else
646         return unicode_result_wchar(unicode);
647 }
648 
649 static PyObject*
unicode_result_unchanged(PyObject * unicode)650 unicode_result_unchanged(PyObject *unicode)
651 {
652     if (PyUnicode_CheckExact(unicode)) {
653         if (PyUnicode_READY(unicode) == -1)
654             return NULL;
655         Py_INCREF(unicode);
656         return unicode;
657     }
658     else
659         /* Subtype -- return genuine unicode string with the same value. */
660         return _PyUnicode_Copy(unicode);
661 }
662 
663 /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
664    ASCII, Latin1, UTF-8, etc. */
665 static char*
backslashreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)666 backslashreplace(_PyBytesWriter *writer, char *str,
667                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
668 {
669     Py_ssize_t size, i;
670     Py_UCS4 ch;
671     enum PyUnicode_Kind kind;
672     void *data;
673 
674     assert(PyUnicode_IS_READY(unicode));
675     kind = PyUnicode_KIND(unicode);
676     data = PyUnicode_DATA(unicode);
677 
678     size = 0;
679     /* determine replacement size */
680     for (i = collstart; i < collend; ++i) {
681         Py_ssize_t incr;
682 
683         ch = PyUnicode_READ(kind, data, i);
684         if (ch < 0x100)
685             incr = 2+2;
686         else if (ch < 0x10000)
687             incr = 2+4;
688         else {
689             assert(ch <= MAX_UNICODE);
690             incr = 2+8;
691         }
692         if (size > PY_SSIZE_T_MAX - incr) {
693             PyErr_SetString(PyExc_OverflowError,
694                             "encoded result is too long for a Python string");
695             return NULL;
696         }
697         size += incr;
698     }
699 
700     str = _PyBytesWriter_Prepare(writer, str, size);
701     if (str == NULL)
702         return NULL;
703 
704     /* generate replacement */
705     for (i = collstart; i < collend; ++i) {
706         ch = PyUnicode_READ(kind, data, i);
707         *str++ = '\\';
708         if (ch >= 0x00010000) {
709             *str++ = 'U';
710             *str++ = Py_hexdigits[(ch>>28)&0xf];
711             *str++ = Py_hexdigits[(ch>>24)&0xf];
712             *str++ = Py_hexdigits[(ch>>20)&0xf];
713             *str++ = Py_hexdigits[(ch>>16)&0xf];
714             *str++ = Py_hexdigits[(ch>>12)&0xf];
715             *str++ = Py_hexdigits[(ch>>8)&0xf];
716         }
717         else if (ch >= 0x100) {
718             *str++ = 'u';
719             *str++ = Py_hexdigits[(ch>>12)&0xf];
720             *str++ = Py_hexdigits[(ch>>8)&0xf];
721         }
722         else
723             *str++ = 'x';
724         *str++ = Py_hexdigits[(ch>>4)&0xf];
725         *str++ = Py_hexdigits[ch&0xf];
726     }
727     return str;
728 }
729 
730 /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
731    ASCII, Latin1, UTF-8, etc. */
732 static char*
xmlcharrefreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)733 xmlcharrefreplace(_PyBytesWriter *writer, char *str,
734                   PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
735 {
736     Py_ssize_t size, i;
737     Py_UCS4 ch;
738     enum PyUnicode_Kind kind;
739     void *data;
740 
741     assert(PyUnicode_IS_READY(unicode));
742     kind = PyUnicode_KIND(unicode);
743     data = PyUnicode_DATA(unicode);
744 
745     size = 0;
746     /* determine replacement size */
747     for (i = collstart; i < collend; ++i) {
748         Py_ssize_t incr;
749 
750         ch = PyUnicode_READ(kind, data, i);
751         if (ch < 10)
752             incr = 2+1+1;
753         else if (ch < 100)
754             incr = 2+2+1;
755         else if (ch < 1000)
756             incr = 2+3+1;
757         else if (ch < 10000)
758             incr = 2+4+1;
759         else if (ch < 100000)
760             incr = 2+5+1;
761         else if (ch < 1000000)
762             incr = 2+6+1;
763         else {
764             assert(ch <= MAX_UNICODE);
765             incr = 2+7+1;
766         }
767         if (size > PY_SSIZE_T_MAX - incr) {
768             PyErr_SetString(PyExc_OverflowError,
769                             "encoded result is too long for a Python string");
770             return NULL;
771         }
772         size += incr;
773     }
774 
775     str = _PyBytesWriter_Prepare(writer, str, size);
776     if (str == NULL)
777         return NULL;
778 
779     /* generate replacement */
780     for (i = collstart; i < collend; ++i) {
781         str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
782     }
783     return str;
784 }
785 
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low bits of
   each unicode character (its value modulo BLOOM_WIDTH) as the bit
   index. */

/* the linebreak mask is set up by Unicode_Init below */

/* Number of bits used in a mask: the largest of 128, 64 and 32 that fits
   into a long. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

/* Starts with every bit set, so BLOOM() accepts every character until a
   real mask is stored here. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit belonging to character `ch` is set in `mask`.
   Both arguments are parenthesized, so arbitrary expressions (for example
   `a | b`) can be passed as the mask. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Non-zero if `ch` is a line break: characters below 128 are looked up in
   ascii_linebreak[], all others are first screened with the bloom mask
   and then confirmed with Py_UNICODE_ISLINEBREAK().  Note that `ch` is
   evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
813 
814 static inline BLOOM_MASK
make_bloom_mask(int kind,void * ptr,Py_ssize_t len)815 make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
816 {
817 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
818     do {                                               \
819         TYPE *data = (TYPE *)PTR;                      \
820         TYPE *end = data + LEN;                        \
821         Py_UCS4 ch;                                    \
822         for (; data != end; data++) {                  \
823             ch = *data;                                \
824             MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
825         }                                              \
826         break;                                         \
827     } while (0)
828 
829     /* calculate simple bloom-style bitmask for a given unicode string */
830 
831     BLOOM_MASK mask;
832 
833     mask = 0;
834     switch (kind) {
835     case PyUnicode_1BYTE_KIND:
836         BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
837         break;
838     case PyUnicode_2BYTE_KIND:
839         BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
840         break;
841     case PyUnicode_4BYTE_KIND:
842         BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
843         break;
844     default:
845         Py_UNREACHABLE();
846     }
847     return mask;
848 
849 #undef BLOOM_UPDATE
850 }
851 
852 static int
ensure_unicode(PyObject * obj)853 ensure_unicode(PyObject *obj)
854 {
855     if (!PyUnicode_Check(obj)) {
856         PyErr_Format(PyExc_TypeError,
857                      "must be str, not %.100s",
858                      Py_TYPE(obj)->tp_name);
859         return -1;
860     }
861     return PyUnicode_READY(obj);
862 }
863 
864 /* Compilation of templated routines */
865 
866 #include "stringlib/asciilib.h"
867 #include "stringlib/fastsearch.h"
868 #include "stringlib/partition.h"
869 #include "stringlib/split.h"
870 #include "stringlib/count.h"
871 #include "stringlib/find.h"
872 #include "stringlib/find_max_char.h"
873 #include "stringlib/undef.h"
874 
875 #include "stringlib/ucs1lib.h"
876 #include "stringlib/fastsearch.h"
877 #include "stringlib/partition.h"
878 #include "stringlib/split.h"
879 #include "stringlib/count.h"
880 #include "stringlib/find.h"
881 #include "stringlib/replace.h"
882 #include "stringlib/find_max_char.h"
883 #include "stringlib/undef.h"
884 
885 #include "stringlib/ucs2lib.h"
886 #include "stringlib/fastsearch.h"
887 #include "stringlib/partition.h"
888 #include "stringlib/split.h"
889 #include "stringlib/count.h"
890 #include "stringlib/find.h"
891 #include "stringlib/replace.h"
892 #include "stringlib/find_max_char.h"
893 #include "stringlib/undef.h"
894 
895 #include "stringlib/ucs4lib.h"
896 #include "stringlib/fastsearch.h"
897 #include "stringlib/partition.h"
898 #include "stringlib/split.h"
899 #include "stringlib/count.h"
900 #include "stringlib/find.h"
901 #include "stringlib/replace.h"
902 #include "stringlib/find_max_char.h"
903 #include "stringlib/undef.h"
904 
905 #include "stringlib/unicodedefs.h"
906 #include "stringlib/fastsearch.h"
907 #include "stringlib/count.h"
908 #include "stringlib/find.h"
909 #include "stringlib/undef.h"
910 
911 /* --- Unicode Object ----------------------------------------------------- */
912 
913 static inline Py_ssize_t
findchar(const void * s,int kind,Py_ssize_t size,Py_UCS4 ch,int direction)914 findchar(const void *s, int kind,
915          Py_ssize_t size, Py_UCS4 ch,
916          int direction)
917 {
918     switch (kind) {
919     case PyUnicode_1BYTE_KIND:
920         if ((Py_UCS1) ch != ch)
921             return -1;
922         if (direction > 0)
923             return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
924         else
925             return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
926     case PyUnicode_2BYTE_KIND:
927         if ((Py_UCS2) ch != ch)
928             return -1;
929         if (direction > 0)
930             return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
931         else
932             return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
933     case PyUnicode_4BYTE_KIND:
934         if (direction > 0)
935             return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
936         else
937             return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
938     default:
939         Py_UNREACHABLE();
940     }
941 }
942 
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters in [old_length, current length)
   with 0xff bytes so that code reading them before they are initialized is
   caught early.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   Nothing is written when the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* Offsets and counts are in bytes, hence the scaling by char_size. */
        memset(bytes + old_length * char_size,
               0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
961 
/* Resize a compact string to `length` characters by reallocating the object
   itself (header and character data live in one memory block).

   `unicode` must be modifiable, ready and compact (asserted).

   Returns the resized object, which may live at a different address than
   the one passed in.  Returns NULL with MemoryError set when the new size
   would overflow Py_ssize_t or when the reallocation fails.  Note that a
   separately allocated UTF-8 cache has already been released by the time
   the reallocation is attempted. */
static PyObject*
resize_compact(PyObject *unicode, Py_ssize_t length)
{
    Py_ssize_t char_size;
    Py_ssize_t struct_size;
    Py_ssize_t new_size;
    int share_wstr;
    PyObject *new_unicode;
#ifdef Py_DEBUG
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

    assert(unicode_modifiable(unicode));
    assert(PyUnicode_IS_READY(unicode));
    assert(PyUnicode_IS_COMPACT(unicode));

    /* The kind value doubles as the size in bytes of one character. */
    char_size = PyUnicode_KIND(unicode);
    if (PyUnicode_IS_ASCII(unicode))
        struct_size = sizeof(PyASCIIObject);
    else
        struct_size = sizeof(PyCompactUnicodeObject);
    /* Remember whether wstr aliases the character data: the data may move. */
    share_wstr = _PyUnicode_SHARE_WSTR(unicode);

    /* Guard struct_size + (length + 1) * char_size against overflow. */
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    /* "+ 1" reserves room for the terminating null character. */
    new_size = (struct_size + (length + 1) * char_size);

    /* Drop the separately allocated UTF-8 cache, if any. */
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
        PyObject_DEL(_PyUnicode_UTF8(unicode));
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
    }
    /* Unregister the object before its address may change; it is registered
       again with _Py_NewReference() on both the failure and success paths. */
    _Py_DEC_REFTOTAL;
    _Py_ForgetReference(unicode);

    new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
    if (new_unicode == NULL) {
        /* The original block is still valid: register it again. */
        _Py_NewReference(unicode);
        PyErr_NoMemory();
        return NULL;
    }
    unicode = new_unicode;
    _Py_NewReference(unicode);

    _PyUnicode_LENGTH(unicode) = length;
    if (share_wstr) {
        /* Re-point the aliased wstr at the (possibly moved) data. */
        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = length;
    }
    else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
        /* A separately allocated wstr copy is released and reset to NULL. */
        PyObject_DEL(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
#ifdef Py_DEBUG
    unicode_fill_invalid(unicode, old_length);
#endif
    /* Write the null terminator just past the last character. */
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
                    length, 0);
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return unicode;
}
1028 
/* Resize a non-compact string to `length` characters without changing the
   address of the object itself; only its separately allocated buffers are
   reallocated.

   `unicode` must not be compact and must have a reference count of exactly
   1 (both asserted).

   If the string is ready, its canonical data buffer is reallocated first,
   and wstr/utf8 pointers that alias that buffer are updated.  If the string
   is not ready, or it additionally owns a separate wstr buffer, the wstr
   buffer is reallocated as well.

   Returns 0 on success and -1 with MemoryError set on overflow or
   allocation failure. */
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    Py_ssize_t new_size;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        int share_wstr, share_utf8;
        void *data;
#ifdef Py_DEBUG
        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

        data = _PyUnicode_DATA_ANY(unicode);
        /* The kind value doubles as the size in bytes of one character. */
        char_size = PyUnicode_KIND(unicode);
        /* Record aliasing before the data buffer may move. */
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);

        /* Guard (length + 1) * char_size against overflow. */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        new_size = (length + 1) * char_size;

        /* Drop a separately allocated UTF-8 cache, if any. */
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_DEL(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        data = (PyObject *)PyObject_REALLOC(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        /* Re-point every alias of the data buffer at its new location. */
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* Write the null terminator just past the last character. */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
#ifdef Py_DEBUG
        unicode_fill_invalid(unicode, old_length);
#endif
        /* Done unless a separate, non-shared wstr buffer also exists. */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    new_size = sizeof(wchar_t) * (length + 1);
    wstr =  _PyUnicode_WSTR(unicode);
    /* NOTE(review): for a ready string, a failure here leaves the canonical
       buffer and length already updated by the branch above. */
    wstr = PyObject_REALLOC(wstr, new_size);
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
1107 
1108 static PyObject*
resize_copy(PyObject * unicode,Py_ssize_t length)1109 resize_copy(PyObject *unicode, Py_ssize_t length)
1110 {
1111     Py_ssize_t copy_length;
1112     if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1113         PyObject *copy;
1114 
1115         assert(PyUnicode_IS_READY(unicode));
1116 
1117         copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1118         if (copy == NULL)
1119             return NULL;
1120 
1121         copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1122         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1123         return copy;
1124     }
1125     else {
1126         PyObject *w;
1127 
1128         w = (PyObject*)_PyUnicode_New(length);
1129         if (w == NULL)
1130             return NULL;
1131         copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1132         copy_length = Py_MIN(copy_length, length);
1133         memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1134                   copy_length * sizeof(wchar_t));
1135         return w;
1136     }
1137 }
1138 
/* We allocate one more character than requested to make sure the string
   is U+0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
1147 
1148 static PyUnicodeObject *
_PyUnicode_New(Py_ssize_t length)1149 _PyUnicode_New(Py_ssize_t length)
1150 {
1151     PyUnicodeObject *unicode;
1152     size_t new_size;
1153 
1154     /* Optimization for empty strings */
1155     if (length == 0 && unicode_empty != NULL) {
1156         Py_INCREF(unicode_empty);
1157         return (PyUnicodeObject*)unicode_empty;
1158     }
1159 
1160     /* Ensure we won't overflow the size. */
1161     if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
1162         return (PyUnicodeObject *)PyErr_NoMemory();
1163     }
1164     if (length < 0) {
1165         PyErr_SetString(PyExc_SystemError,
1166                         "Negative size passed to _PyUnicode_New");
1167         return NULL;
1168     }
1169 
1170     unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1171     if (unicode == NULL)
1172         return NULL;
1173     new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
1174 
1175     _PyUnicode_WSTR_LENGTH(unicode) = length;
1176     _PyUnicode_HASH(unicode) = -1;
1177     _PyUnicode_STATE(unicode).interned = 0;
1178     _PyUnicode_STATE(unicode).kind = 0;
1179     _PyUnicode_STATE(unicode).compact = 0;
1180     _PyUnicode_STATE(unicode).ready = 0;
1181     _PyUnicode_STATE(unicode).ascii = 0;
1182     _PyUnicode_DATA_ANY(unicode) = NULL;
1183     _PyUnicode_LENGTH(unicode) = 0;
1184     _PyUnicode_UTF8(unicode) = NULL;
1185     _PyUnicode_UTF8_LENGTH(unicode) = 0;
1186 
1187     _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1188     if (!_PyUnicode_WSTR(unicode)) {
1189         Py_DECREF(unicode);
1190         PyErr_NoMemory();
1191         return NULL;
1192     }
1193 
1194     /* Initialize the first element to guard against cases where
1195      * the caller fails before initializing str -- unicode_resize()
1196      * reads str[0], and the Keep-Alive optimization can keep memory
1197      * allocated for str alive across a call to unicode_dealloc(unicode).
1198      * We don't want unicode_resize to read uninitialized memory in
1199      * that case.
1200      */
1201     _PyUnicode_WSTR(unicode)[0] = 0;
1202     _PyUnicode_WSTR(unicode)[length] = 0;
1203 
1204     assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
1205     return unicode;
1206 }
1207 
1208 static const char*
unicode_kind_name(PyObject * unicode)1209 unicode_kind_name(PyObject *unicode)
1210 {
1211     /* don't check consistency: unicode_kind_name() is called from
1212        _PyUnicode_Dump() */
1213     if (!PyUnicode_IS_COMPACT(unicode))
1214     {
1215         if (!PyUnicode_IS_READY(unicode))
1216             return "wstr";
1217         switch (PyUnicode_KIND(unicode))
1218         {
1219         case PyUnicode_1BYTE_KIND:
1220             if (PyUnicode_IS_ASCII(unicode))
1221                 return "legacy ascii";
1222             else
1223                 return "legacy latin1";
1224         case PyUnicode_2BYTE_KIND:
1225             return "legacy UCS2";
1226         case PyUnicode_4BYTE_KIND:
1227             return "legacy UCS4";
1228         default:
1229             return "<legacy invalid kind>";
1230         }
1231     }
1232     assert(PyUnicode_IS_READY(unicode));
1233     switch (PyUnicode_KIND(unicode)) {
1234     case PyUnicode_1BYTE_KIND:
1235         if (PyUnicode_IS_ASCII(unicode))
1236             return "ascii";
1237         else
1238             return "latin1";
1239     case PyUnicode_2BYTE_KIND:
1240         return "UCS2";
1241     case PyUnicode_4BYTE_KIND:
1242         return "UCS4";
1243     default:
1244         return "<invalid compact kind>";
1245     }
1246 }
1247 
1248 #ifdef Py_DEBUG
1249 /* Functions wrapping macros for use in debugger */
_PyUnicode_utf8(void * unicode_raw)1250 char *_PyUnicode_utf8(void *unicode_raw){
1251     PyObject *unicode = _PyObject_CAST(unicode_raw);
1252     return PyUnicode_UTF8(unicode);
1253 }
1254 
_PyUnicode_compact_data(void * unicode_raw)1255 void *_PyUnicode_compact_data(void *unicode_raw) {
1256     PyObject *unicode = _PyObject_CAST(unicode_raw);
1257     return _PyUnicode_COMPACT_DATA(unicode);
1258 }
_PyUnicode_data(void * unicode_raw)1259 void *_PyUnicode_data(void *unicode_raw) {
1260     PyObject *unicode = _PyObject_CAST(unicode_raw);
1261     printf("obj %p\n", (void*)unicode);
1262     printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1263     printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1264     printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1265     printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1266     printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1267     return PyUnicode_DATA(unicode);
1268 }
1269 
1270 void
_PyUnicode_Dump(PyObject * op)1271 _PyUnicode_Dump(PyObject *op)
1272 {
1273     PyASCIIObject *ascii = (PyASCIIObject *)op;
1274     PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1275     PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1276     void *data;
1277 
1278     if (ascii->state.compact)
1279     {
1280         if (ascii->state.ascii)
1281             data = (ascii + 1);
1282         else
1283             data = (compact + 1);
1284     }
1285     else
1286         data = unicode->data.any;
1287     printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1288            unicode_kind_name(op), ascii->length);
1289 
1290     if (ascii->wstr == data)
1291         printf("shared ");
1292     printf("wstr=%p", (void *)ascii->wstr);
1293 
1294     if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1295         printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
1296         if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1297             printf("shared ");
1298         printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1299                (void *)compact->utf8, compact->utf8_length);
1300     }
1301     printf(", data=%p\n", data);
1302 }
1303 #endif
1304 
/* Create a new compact string object able to hold `size` characters, none
   of which may exceed `maxchar`.

   `maxchar` selects the storage: below 128 gives an ASCII string, below 256
   a 1-byte string, below 65536 a 2-byte string, and anything else up to
   MAX_UNICODE a 4-byte string.  The character data is left for the caller
   to fill in; only the null terminator is written (debug builds
   additionally fill the data with an invalid pattern).

   Returns a new reference, or the shared empty string when `size` is 0 and
   that singleton exists.  On failure returns NULL with SystemError
   (negative size, or maxchar above MAX_UNICODE) or MemoryError (size
   overflow or allocation failure) set. */
PyObject *
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
{
    PyObject *obj;
    PyCompactUnicodeObject *unicode;
    void *data;
    enum PyUnicode_Kind kind;
    int is_sharing, is_ascii;
    Py_ssize_t char_size;
    Py_ssize_t struct_size;

    /* Optimization for empty strings */
    if (size == 0 && unicode_empty != NULL) {
        Py_INCREF(unicode_empty);
        return unicode_empty;
    }

    /* Choose kind, bytes per character and header size from maxchar.
       is_sharing is set when wchar_t has the same width as the chosen
       character type, so the wstr pointer can alias the data. */
    is_ascii = 0;
    is_sharing = 0;
    struct_size = sizeof(PyCompactUnicodeObject);
    if (maxchar < 128) {
        kind = PyUnicode_1BYTE_KIND;
        char_size = 1;
        is_ascii = 1;
        /* ASCII strings use the smaller PyASCIIObject header. */
        struct_size = sizeof(PyASCIIObject);
    }
    else if (maxchar < 256) {
        kind = PyUnicode_1BYTE_KIND;
        char_size = 1;
    }
    else if (maxchar < 65536) {
        kind = PyUnicode_2BYTE_KIND;
        char_size = 2;
        if (sizeof(wchar_t) == 2)
            is_sharing = 1;
    }
    else {
        if (maxchar > MAX_UNICODE) {
            PyErr_SetString(PyExc_SystemError,
                            "invalid maximum character passed to PyUnicode_New");
            return NULL;
        }
        kind = PyUnicode_4BYTE_KIND;
        char_size = 4;
        if (sizeof(wchar_t) == 4)
            is_sharing = 1;
    }

    /* Ensure we won't overflow the size. */
    if (size < 0) {
        PyErr_SetString(PyExc_SystemError,
                        "Negative size passed to PyUnicode_New");
        return NULL;
    }
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
        return PyErr_NoMemory();

    /* Duplicated allocation code from _PyObject_New() instead of a call to
     * PyObject_New() so we are able to allocate space for the object and
     * its data buffer.
     */
    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
    if (obj == NULL)
        return PyErr_NoMemory();
    obj = PyObject_INIT(obj, &PyUnicode_Type);
    if (obj == NULL)
        return NULL;

    unicode = (PyCompactUnicodeObject *)obj;
    /* The character data starts right after the header in use. */
    if (is_ascii)
        data = ((PyASCIIObject*)obj) + 1;
    else
        data = unicode + 1;
    _PyUnicode_LENGTH(unicode) = size;
    _PyUnicode_HASH(unicode) = -1;
    _PyUnicode_STATE(unicode).interned = 0;
    _PyUnicode_STATE(unicode).kind = kind;
    _PyUnicode_STATE(unicode).compact = 1;
    _PyUnicode_STATE(unicode).ready = 1;
    _PyUnicode_STATE(unicode).ascii = is_ascii;
    if (is_ascii) {
        /* Only wstr is reset here; the utf8 and wstr_length fields are not
           touched for ASCII strings (smaller header allocated above). */
        ((char*)data)[size] = 0;
        _PyUnicode_WSTR(unicode) = NULL;
    }
    else if (kind == PyUnicode_1BYTE_KIND) {
        ((char*)data)[size] = 0;
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
        unicode->utf8 = NULL;
        unicode->utf8_length = 0;
    }
    else {
        unicode->utf8 = NULL;
        unicode->utf8_length = 0;
        /* Null-terminate using the width of the chosen character type. */
        if (kind == PyUnicode_2BYTE_KIND)
            ((Py_UCS2*)data)[size] = 0;
        else /* kind == PyUnicode_4BYTE_KIND */
            ((Py_UCS4*)data)[size] = 0;
        if (is_sharing) {
            /* wchar_t matches the character width: wstr aliases the data. */
            _PyUnicode_WSTR_LENGTH(unicode) = size;
            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
        }
        else {
            _PyUnicode_WSTR_LENGTH(unicode) = 0;
            _PyUnicode_WSTR(unicode) = NULL;
        }
    }
#ifdef Py_DEBUG
    unicode_fill_invalid((PyObject*)unicode, 0);
#endif
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
    return obj;
}
1418 
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) into the UCS4 data of
   `unicode`.  A high surrogate immediately followed by a low surrogate is
   joined into a single code point; every other unit is copied as is.  The
   other wchar_t conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        Py_UCS4 code;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Well-formed pair: two input units yield one code point. */
            code = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            code = *src;
            src += 1;
        }
        *dst++ = code;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1458 
1459 static int
unicode_check_modifiable(PyObject * unicode)1460 unicode_check_modifiable(PyObject *unicode)
1461 {
1462     if (!unicode_modifiable(unicode)) {
1463         PyErr_SetString(PyExc_SystemError,
1464                         "Cannot modify a string currently used");
1465         return -1;
1466     }
1467     return 0;
1468 }
1469 
1470 static int
_copy_characters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many,int check_maxchar)1471 _copy_characters(PyObject *to, Py_ssize_t to_start,
1472                  PyObject *from, Py_ssize_t from_start,
1473                  Py_ssize_t how_many, int check_maxchar)
1474 {
1475     unsigned int from_kind, to_kind;
1476     void *from_data, *to_data;
1477 
1478     assert(0 <= how_many);
1479     assert(0 <= from_start);
1480     assert(0 <= to_start);
1481     assert(PyUnicode_Check(from));
1482     assert(PyUnicode_IS_READY(from));
1483     assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1484 
1485     assert(PyUnicode_Check(to));
1486     assert(PyUnicode_IS_READY(to));
1487     assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1488 
1489     if (how_many == 0)
1490         return 0;
1491 
1492     from_kind = PyUnicode_KIND(from);
1493     from_data = PyUnicode_DATA(from);
1494     to_kind = PyUnicode_KIND(to);
1495     to_data = PyUnicode_DATA(to);
1496 
1497 #ifdef Py_DEBUG
1498     if (!check_maxchar
1499         && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1500     {
1501         const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1502         Py_UCS4 ch;
1503         Py_ssize_t i;
1504         for (i=0; i < how_many; i++) {
1505             ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1506             assert(ch <= to_maxchar);
1507         }
1508     }
1509 #endif
1510 
1511     if (from_kind == to_kind) {
1512         if (check_maxchar
1513             && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1514         {
1515             /* Writing Latin-1 characters into an ASCII string requires to
1516                check that all written characters are pure ASCII */
1517             Py_UCS4 max_char;
1518             max_char = ucs1lib_find_max_char(from_data,
1519                                              (Py_UCS1*)from_data + how_many);
1520             if (max_char >= 128)
1521                 return -1;
1522         }
1523         memcpy((char*)to_data + to_kind * to_start,
1524                   (char*)from_data + from_kind * from_start,
1525                   to_kind * how_many);
1526     }
1527     else if (from_kind == PyUnicode_1BYTE_KIND
1528              && to_kind == PyUnicode_2BYTE_KIND)
1529     {
1530         _PyUnicode_CONVERT_BYTES(
1531             Py_UCS1, Py_UCS2,
1532             PyUnicode_1BYTE_DATA(from) + from_start,
1533             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1534             PyUnicode_2BYTE_DATA(to) + to_start
1535             );
1536     }
1537     else if (from_kind == PyUnicode_1BYTE_KIND
1538              && to_kind == PyUnicode_4BYTE_KIND)
1539     {
1540         _PyUnicode_CONVERT_BYTES(
1541             Py_UCS1, Py_UCS4,
1542             PyUnicode_1BYTE_DATA(from) + from_start,
1543             PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1544             PyUnicode_4BYTE_DATA(to) + to_start
1545             );
1546     }
1547     else if (from_kind == PyUnicode_2BYTE_KIND
1548              && to_kind == PyUnicode_4BYTE_KIND)
1549     {
1550         _PyUnicode_CONVERT_BYTES(
1551             Py_UCS2, Py_UCS4,
1552             PyUnicode_2BYTE_DATA(from) + from_start,
1553             PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1554             PyUnicode_4BYTE_DATA(to) + to_start
1555             );
1556     }
1557     else {
1558         assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1559 
1560         if (!check_maxchar) {
1561             if (from_kind == PyUnicode_2BYTE_KIND
1562                 && to_kind == PyUnicode_1BYTE_KIND)
1563             {
1564                 _PyUnicode_CONVERT_BYTES(
1565                     Py_UCS2, Py_UCS1,
1566                     PyUnicode_2BYTE_DATA(from) + from_start,
1567                     PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1568                     PyUnicode_1BYTE_DATA(to) + to_start
1569                     );
1570             }
1571             else if (from_kind == PyUnicode_4BYTE_KIND
1572                      && to_kind == PyUnicode_1BYTE_KIND)
1573             {
1574                 _PyUnicode_CONVERT_BYTES(
1575                     Py_UCS4, Py_UCS1,
1576                     PyUnicode_4BYTE_DATA(from) + from_start,
1577                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1578                     PyUnicode_1BYTE_DATA(to) + to_start
1579                     );
1580             }
1581             else if (from_kind == PyUnicode_4BYTE_KIND
1582                      && to_kind == PyUnicode_2BYTE_KIND)
1583             {
1584                 _PyUnicode_CONVERT_BYTES(
1585                     Py_UCS4, Py_UCS2,
1586                     PyUnicode_4BYTE_DATA(from) + from_start,
1587                     PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1588                     PyUnicode_2BYTE_DATA(to) + to_start
1589                     );
1590             }
1591             else {
1592                 Py_UNREACHABLE();
1593             }
1594         }
1595         else {
1596             const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1597             Py_UCS4 ch;
1598             Py_ssize_t i;
1599 
1600             for (i=0; i < how_many; i++) {
1601                 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1602                 if (ch > to_maxchar)
1603                     return -1;
1604                 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1605             }
1606         }
1607     }
1608     return 0;
1609 }
1610 
1611 void
_PyUnicode_FastCopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1612 _PyUnicode_FastCopyCharacters(
1613     PyObject *to, Py_ssize_t to_start,
1614     PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1615 {
1616     (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1617 }
1618 
1619 Py_ssize_t
PyUnicode_CopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1620 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1621                          PyObject *from, Py_ssize_t from_start,
1622                          Py_ssize_t how_many)
1623 {
1624     int err;
1625 
1626     if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1627         PyErr_BadInternalCall();
1628         return -1;
1629     }
1630 
1631     if (PyUnicode_READY(from) == -1)
1632         return -1;
1633     if (PyUnicode_READY(to) == -1)
1634         return -1;
1635 
1636     if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1637         PyErr_SetString(PyExc_IndexError, "string index out of range");
1638         return -1;
1639     }
1640     if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1641         PyErr_SetString(PyExc_IndexError, "string index out of range");
1642         return -1;
1643     }
1644     if (how_many < 0) {
1645         PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1646         return -1;
1647     }
1648     how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1649     if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1650         PyErr_Format(PyExc_SystemError,
1651                      "Cannot write %zi characters at %zi "
1652                      "in a string of %zi characters",
1653                      how_many, to_start, PyUnicode_GET_LENGTH(to));
1654         return -1;
1655     }
1656 
1657     if (how_many == 0)
1658         return 0;
1659 
1660     if (unicode_check_modifiable(to))
1661         return -1;
1662 
1663     err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1664     if (err) {
1665         PyErr_Format(PyExc_SystemError,
1666                      "Cannot copy %s characters "
1667                      "into a string of %s characters",
1668                      unicode_kind_name(from),
1669                      unicode_kind_name(to));
1670         return -1;
1671     }
1672     return how_many;
1673 }
1674 
1675 /* Find the maximum code point and count the number of surrogate pairs so a
1676    correct string length can be computed before converting a string to UCS4.
1677    This function counts single surrogates as a character and not as a pair.
1678 
1679    Return 0 on success, or -1 on error. */
1680 static int
find_maxchar_surrogates(const wchar_t * begin,const wchar_t * end,Py_UCS4 * maxchar,Py_ssize_t * num_surrogates)1681 find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1682                         Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1683 {
1684     const wchar_t *iter;
1685     Py_UCS4 ch;
1686 
1687     assert(num_surrogates != NULL && maxchar != NULL);
1688     *num_surrogates = 0;
1689     *maxchar = 0;
1690 
1691     for (iter = begin; iter < end; ) {
1692 #if SIZEOF_WCHAR_T == 2
1693         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1694             && (iter+1) < end
1695             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1696         {
1697             ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1698             ++(*num_surrogates);
1699             iter += 2;
1700         }
1701         else
1702 #endif
1703         {
1704             ch = *iter;
1705             iter++;
1706         }
1707         if (ch > *maxchar) {
1708             *maxchar = ch;
1709             if (*maxchar > MAX_UNICODE) {
1710                 PyErr_Format(PyExc_ValueError,
1711                              "character U+%x is not in range [U+0000; U+%x]",
1712                              ch, MAX_UNICODE);
1713                 return -1;
1714             }
1715         }
1716     }
1717     return 0;
1718 }
1719 
1720 int
_PyUnicode_Ready(PyObject * unicode)1721 _PyUnicode_Ready(PyObject *unicode)
1722 {
1723     wchar_t *end;
1724     Py_UCS4 maxchar = 0;
1725     Py_ssize_t num_surrogates;
1726 #if SIZEOF_WCHAR_T == 2
1727     Py_ssize_t length_wo_surrogates;
1728 #endif
1729 
1730     /* _PyUnicode_Ready() is only intended for old-style API usage where
1731        strings were created using _PyObject_New() and where no canonical
1732        representation (the str field) has been set yet aka strings
1733        which are not yet ready. */
1734     assert(_PyUnicode_CHECK(unicode));
1735     assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1736     assert(_PyUnicode_WSTR(unicode) != NULL);
1737     assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1738     assert(_PyUnicode_UTF8(unicode) == NULL);
1739     /* Actually, it should neither be interned nor be anything else: */
1740     assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1741 
1742     end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1743     if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1744                                 &maxchar, &num_surrogates) == -1)
1745         return -1;
1746 
1747     if (maxchar < 256) {
1748         _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1749         if (!_PyUnicode_DATA_ANY(unicode)) {
1750             PyErr_NoMemory();
1751             return -1;
1752         }
1753         _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1754                                 _PyUnicode_WSTR(unicode), end,
1755                                 PyUnicode_1BYTE_DATA(unicode));
1756         PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1757         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1758         _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1759         if (maxchar < 128) {
1760             _PyUnicode_STATE(unicode).ascii = 1;
1761             _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1762             _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1763         }
1764         else {
1765             _PyUnicode_STATE(unicode).ascii = 0;
1766             _PyUnicode_UTF8(unicode) = NULL;
1767             _PyUnicode_UTF8_LENGTH(unicode) = 0;
1768         }
1769         PyObject_FREE(_PyUnicode_WSTR(unicode));
1770         _PyUnicode_WSTR(unicode) = NULL;
1771         _PyUnicode_WSTR_LENGTH(unicode) = 0;
1772     }
1773     /* In this case we might have to convert down from 4-byte native
1774        wchar_t to 2-byte unicode. */
1775     else if (maxchar < 65536) {
1776         assert(num_surrogates == 0 &&
1777                "FindMaxCharAndNumSurrogatePairs() messed up");
1778 
1779 #if SIZEOF_WCHAR_T == 2
1780         /* We can share representations and are done. */
1781         _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1782         PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1783         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1784         _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1785         _PyUnicode_UTF8(unicode) = NULL;
1786         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1787 #else
1788         /* sizeof(wchar_t) == 4 */
1789         _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1790             2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1791         if (!_PyUnicode_DATA_ANY(unicode)) {
1792             PyErr_NoMemory();
1793             return -1;
1794         }
1795         _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1796                                 _PyUnicode_WSTR(unicode), end,
1797                                 PyUnicode_2BYTE_DATA(unicode));
1798         PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1799         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1800         _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1801         _PyUnicode_UTF8(unicode) = NULL;
1802         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1803         PyObject_FREE(_PyUnicode_WSTR(unicode));
1804         _PyUnicode_WSTR(unicode) = NULL;
1805         _PyUnicode_WSTR_LENGTH(unicode) = 0;
1806 #endif
1807     }
1808     /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1809     else {
1810 #if SIZEOF_WCHAR_T == 2
1811         /* in case the native representation is 2-bytes, we need to allocate a
1812            new normalized 4-byte version. */
1813         length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1814         if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1815             PyErr_NoMemory();
1816             return -1;
1817         }
1818         _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1819         if (!_PyUnicode_DATA_ANY(unicode)) {
1820             PyErr_NoMemory();
1821             return -1;
1822         }
1823         _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1824         _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1825         _PyUnicode_UTF8(unicode) = NULL;
1826         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1827         /* unicode_convert_wchar_to_ucs4() requires a ready string */
1828         _PyUnicode_STATE(unicode).ready = 1;
1829         unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1830         PyObject_FREE(_PyUnicode_WSTR(unicode));
1831         _PyUnicode_WSTR(unicode) = NULL;
1832         _PyUnicode_WSTR_LENGTH(unicode) = 0;
1833 #else
1834         assert(num_surrogates == 0);
1835 
1836         _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1837         _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1838         _PyUnicode_UTF8(unicode) = NULL;
1839         _PyUnicode_UTF8_LENGTH(unicode) = 0;
1840         _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1841 #endif
1842         PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1843     }
1844     _PyUnicode_STATE(unicode).ready = 1;
1845     assert(_PyUnicode_CheckConsistency(unicode, 1));
1846     return 0;
1847 }
1848 
1849 static void
unicode_dealloc(PyObject * unicode)1850 unicode_dealloc(PyObject *unicode)
1851 {
1852     switch (PyUnicode_CHECK_INTERNED(unicode)) {
1853     case SSTATE_NOT_INTERNED:
1854         break;
1855 
1856     case SSTATE_INTERNED_MORTAL:
1857         /* revive dead object temporarily for DelItem */
1858         Py_REFCNT(unicode) = 3;
1859         if (PyDict_DelItem(interned, unicode) != 0)
1860             Py_FatalError(
1861                 "deletion of interned string failed");
1862         break;
1863 
1864     case SSTATE_INTERNED_IMMORTAL:
1865         Py_FatalError("Immortal interned string died.");
1866         /* fall through */
1867 
1868     default:
1869         Py_FatalError("Inconsistent interned string state.");
1870     }
1871 
1872     if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1873         PyObject_DEL(_PyUnicode_WSTR(unicode));
1874     if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1875         PyObject_DEL(_PyUnicode_UTF8(unicode));
1876     if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1877         PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1878 
1879     Py_TYPE(unicode)->tp_free(unicode);
1880 }
1881 
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the shared empty string or
   one of the cached one-character strings of the unicode_latin1 table,
   0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty) {
        return 1;
    }
    /* Only a one-character string that is not in the wchar_t-only state
       can be an entry of the latin1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1) {
        return 0;
    }
    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1898 
1899 static int
unicode_modifiable(PyObject * unicode)1900 unicode_modifiable(PyObject *unicode)
1901 {
1902     assert(_PyUnicode_CHECK(unicode));
1903     if (Py_REFCNT(unicode) != 1)
1904         return 0;
1905     if (_PyUnicode_HASH(unicode) != -1)
1906         return 0;
1907     if (PyUnicode_CHECK_INTERNED(unicode))
1908         return 0;
1909     if (!PyUnicode_CheckExact(unicode))
1910         return 0;
1911 #ifdef Py_DEBUG
1912     /* singleton refcount is greater than 1 */
1913     assert(!unicode_is_singleton(unicode));
1914 #endif
1915     return 1;
1916 }
1917 
1918 static int
unicode_resize(PyObject ** p_unicode,Py_ssize_t length)1919 unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1920 {
1921     PyObject *unicode;
1922     Py_ssize_t old_length;
1923 
1924     assert(p_unicode != NULL);
1925     unicode = *p_unicode;
1926 
1927     assert(unicode != NULL);
1928     assert(PyUnicode_Check(unicode));
1929     assert(0 <= length);
1930 
1931     if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1932         old_length = PyUnicode_WSTR_LENGTH(unicode);
1933     else
1934         old_length = PyUnicode_GET_LENGTH(unicode);
1935     if (old_length == length)
1936         return 0;
1937 
1938     if (length == 0) {
1939         _Py_INCREF_UNICODE_EMPTY();
1940         if (!unicode_empty)
1941             return -1;
1942         Py_SETREF(*p_unicode, unicode_empty);
1943         return 0;
1944     }
1945 
1946     if (!unicode_modifiable(unicode)) {
1947         PyObject *copy = resize_copy(unicode, length);
1948         if (copy == NULL)
1949             return -1;
1950         Py_SETREF(*p_unicode, copy);
1951         return 0;
1952     }
1953 
1954     if (PyUnicode_IS_COMPACT(unicode)) {
1955         PyObject *new_unicode = resize_compact(unicode, length);
1956         if (new_unicode == NULL)
1957             return -1;
1958         *p_unicode = new_unicode;
1959         return 0;
1960     }
1961     return resize_inplace(unicode, length);
1962 }
1963 
1964 int
PyUnicode_Resize(PyObject ** p_unicode,Py_ssize_t length)1965 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1966 {
1967     PyObject *unicode;
1968     if (p_unicode == NULL) {
1969         PyErr_BadInternalCall();
1970         return -1;
1971     }
1972     unicode = *p_unicode;
1973     if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1974     {
1975         PyErr_BadInternalCall();
1976         return -1;
1977     }
1978     return unicode_resize(p_unicode, length);
1979 }
1980 
1981 /* Copy an ASCII or latin1 char* string into a Python Unicode string.
1982 
1983    WARNING: The function doesn't copy the terminating null character and
1984    doesn't check the maximum character (may write a latin1 character in an
1985    ASCII string). */
1986 static void
unicode_write_cstr(PyObject * unicode,Py_ssize_t index,const char * str,Py_ssize_t len)1987 unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1988                    const char *str, Py_ssize_t len)
1989 {
1990     enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1991     void *data = PyUnicode_DATA(unicode);
1992     const char *end = str + len;
1993 
1994     switch (kind) {
1995     case PyUnicode_1BYTE_KIND: {
1996         assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1997 #ifdef Py_DEBUG
1998         if (PyUnicode_IS_ASCII(unicode)) {
1999             Py_UCS4 maxchar = ucs1lib_find_max_char(
2000                 (const Py_UCS1*)str,
2001                 (const Py_UCS1*)str + len);
2002             assert(maxchar < 128);
2003         }
2004 #endif
2005         memcpy((char *) data + index, str, len);
2006         break;
2007     }
2008     case PyUnicode_2BYTE_KIND: {
2009         Py_UCS2 *start = (Py_UCS2 *)data + index;
2010         Py_UCS2 *ucs2 = start;
2011         assert(index <= PyUnicode_GET_LENGTH(unicode));
2012 
2013         for (; str < end; ++ucs2, ++str)
2014             *ucs2 = (Py_UCS2)*str;
2015 
2016         assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
2017         break;
2018     }
2019     default: {
2020         Py_UCS4 *start = (Py_UCS4 *)data + index;
2021         Py_UCS4 *ucs4 = start;
2022         assert(kind == PyUnicode_4BYTE_KIND);
2023         assert(index <= PyUnicode_GET_LENGTH(unicode));
2024 
2025         for (; str < end; ++ucs4, ++str)
2026             *ucs4 = (Py_UCS4)*str;
2027 
2028         assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
2029     }
2030     }
2031 }
2032 
2033 static PyObject*
get_latin1_char(unsigned char ch)2034 get_latin1_char(unsigned char ch)
2035 {
2036     PyObject *unicode = unicode_latin1[ch];
2037     if (!unicode) {
2038         unicode = PyUnicode_New(1, ch);
2039         if (!unicode)
2040             return NULL;
2041         PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2042         assert(_PyUnicode_CheckConsistency(unicode, 1));
2043         unicode_latin1[ch] = unicode;
2044     }
2045     Py_INCREF(unicode);
2046     return unicode;
2047 }
2048 
2049 static PyObject*
unicode_char(Py_UCS4 ch)2050 unicode_char(Py_UCS4 ch)
2051 {
2052     PyObject *unicode;
2053 
2054     assert(ch <= MAX_UNICODE);
2055 
2056     if (ch < 256)
2057         return get_latin1_char(ch);
2058 
2059     unicode = PyUnicode_New(1, ch);
2060     if (unicode == NULL)
2061         return NULL;
2062 
2063     assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2064     if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
2065         PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
2066     } else {
2067         assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2068         PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2069     }
2070     assert(_PyUnicode_CheckConsistency(unicode, 1));
2071     return unicode;
2072 }
2073 
2074 PyObject *
PyUnicode_FromUnicode(const Py_UNICODE * u,Py_ssize_t size)2075 PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
2076 {
2077     if (u == NULL)
2078         return (PyObject*)_PyUnicode_New(size);
2079 
2080     if (size < 0) {
2081         PyErr_BadInternalCall();
2082         return NULL;
2083     }
2084 
2085     return PyUnicode_FromWideChar(u, size);
2086 }
2087 
2088 PyObject *
PyUnicode_FromWideChar(const wchar_t * u,Py_ssize_t size)2089 PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2090 {
2091     PyObject *unicode;
2092     Py_UCS4 maxchar = 0;
2093     Py_ssize_t num_surrogates;
2094 
2095     if (u == NULL && size != 0) {
2096         PyErr_BadInternalCall();
2097         return NULL;
2098     }
2099 
2100     if (size == -1) {
2101         size = wcslen(u);
2102     }
2103 
2104     /* If the Unicode data is known at construction time, we can apply
2105        some optimizations which share commonly used objects. */
2106 
2107     /* Optimization for empty strings */
2108     if (size == 0)
2109         _Py_RETURN_UNICODE_EMPTY();
2110 
2111     /* Single character Unicode objects in the Latin-1 range are
2112        shared when using this constructor */
2113     if (size == 1 && (Py_UCS4)*u < 256)
2114         return get_latin1_char((unsigned char)*u);
2115 
2116     /* If not empty and not single character, copy the Unicode data
2117        into the new object */
2118     if (find_maxchar_surrogates(u, u + size,
2119                                 &maxchar, &num_surrogates) == -1)
2120         return NULL;
2121 
2122     unicode = PyUnicode_New(size - num_surrogates, maxchar);
2123     if (!unicode)
2124         return NULL;
2125 
2126     switch (PyUnicode_KIND(unicode)) {
2127     case PyUnicode_1BYTE_KIND:
2128         _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
2129                                 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2130         break;
2131     case PyUnicode_2BYTE_KIND:
2132 #if Py_UNICODE_SIZE == 2
2133         memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2134 #else
2135         _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
2136                                 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2137 #endif
2138         break;
2139     case PyUnicode_4BYTE_KIND:
2140 #if SIZEOF_WCHAR_T == 2
2141         /* This is the only case which has to process surrogates, thus
2142            a simple copy loop is not enough and we need a function. */
2143         unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2144 #else
2145         assert(num_surrogates == 0);
2146         memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2147 #endif
2148         break;
2149     default:
2150         Py_UNREACHABLE();
2151     }
2152 
2153     return unicode_result(unicode);
2154 }
2155 
2156 PyObject *
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)2157 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2158 {
2159     if (size < 0) {
2160         PyErr_SetString(PyExc_SystemError,
2161                         "Negative size passed to PyUnicode_FromStringAndSize");
2162         return NULL;
2163     }
2164     if (u != NULL)
2165         return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2166     else
2167         return (PyObject *)_PyUnicode_New(size);
2168 }
2169 
2170 PyObject *
PyUnicode_FromString(const char * u)2171 PyUnicode_FromString(const char *u)
2172 {
2173     size_t size = strlen(u);
2174     if (size > PY_SSIZE_T_MAX) {
2175         PyErr_SetString(PyExc_OverflowError, "input too long");
2176         return NULL;
2177     }
2178     return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2179 }
2180 
2181 PyObject *
_PyUnicode_FromId(_Py_Identifier * id)2182 _PyUnicode_FromId(_Py_Identifier *id)
2183 {
2184     if (!id->object) {
2185         id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2186                                                   strlen(id->string),
2187                                                   NULL, NULL);
2188         if (!id->object)
2189             return NULL;
2190         PyUnicode_InternInPlace(&id->object);
2191         assert(!id->next);
2192         id->next = static_strings;
2193         static_strings = id;
2194     }
2195     return id->object;
2196 }
2197 
2198 void
_PyUnicode_ClearStaticStrings()2199 _PyUnicode_ClearStaticStrings()
2200 {
2201     _Py_Identifier *tmp, *s = static_strings;
2202     while (s) {
2203         Py_CLEAR(s->object);
2204         tmp = s->next;
2205         s->next = NULL;
2206         s = tmp;
2207     }
2208     static_strings = NULL;
2209 }
2210 
2211 /* Internal function, doesn't check maximum character */
2212 
2213 PyObject*
_PyUnicode_FromASCII(const char * buffer,Py_ssize_t size)2214 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2215 {
2216     const unsigned char *s = (const unsigned char *)buffer;
2217     PyObject *unicode;
2218     if (size == 1) {
2219 #ifdef Py_DEBUG
2220         assert((unsigned char)s[0] < 128);
2221 #endif
2222         return get_latin1_char(s[0]);
2223     }
2224     unicode = PyUnicode_New(size, 127);
2225     if (!unicode)
2226         return NULL;
2227     memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2228     assert(_PyUnicode_CheckConsistency(unicode, 1));
2229     return unicode;
2230 }
2231 
2232 static Py_UCS4
kind_maxchar_limit(unsigned int kind)2233 kind_maxchar_limit(unsigned int kind)
2234 {
2235     switch (kind) {
2236     case PyUnicode_1BYTE_KIND:
2237         return 0x80;
2238     case PyUnicode_2BYTE_KIND:
2239         return 0x100;
2240     case PyUnicode_4BYTE_KIND:
2241         return 0x10000;
2242     default:
2243         Py_UNREACHABLE();
2244     }
2245 }
2246 
2247 static PyObject*
_PyUnicode_FromUCS1(const Py_UCS1 * u,Py_ssize_t size)2248 _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2249 {
2250     PyObject *res;
2251     unsigned char max_char;
2252 
2253     if (size == 0)
2254         _Py_RETURN_UNICODE_EMPTY();
2255     assert(size > 0);
2256     if (size == 1)
2257         return get_latin1_char(u[0]);
2258 
2259     max_char = ucs1lib_find_max_char(u, u + size);
2260     res = PyUnicode_New(size, max_char);
2261     if (!res)
2262         return NULL;
2263     memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2264     assert(_PyUnicode_CheckConsistency(res, 1));
2265     return res;
2266 }
2267 
2268 static PyObject*
_PyUnicode_FromUCS2(const Py_UCS2 * u,Py_ssize_t size)2269 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2270 {
2271     PyObject *res;
2272     Py_UCS2 max_char;
2273 
2274     if (size == 0)
2275         _Py_RETURN_UNICODE_EMPTY();
2276     assert(size > 0);
2277     if (size == 1)
2278         return unicode_char(u[0]);
2279 
2280     max_char = ucs2lib_find_max_char(u, u + size);
2281     res = PyUnicode_New(size, max_char);
2282     if (!res)
2283         return NULL;
2284     if (max_char >= 256)
2285         memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2286     else {
2287         _PyUnicode_CONVERT_BYTES(
2288             Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2289     }
2290     assert(_PyUnicode_CheckConsistency(res, 1));
2291     return res;
2292 }
2293 
2294 static PyObject*
_PyUnicode_FromUCS4(const Py_UCS4 * u,Py_ssize_t size)2295 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2296 {
2297     PyObject *res;
2298     Py_UCS4 max_char;
2299 
2300     if (size == 0)
2301         _Py_RETURN_UNICODE_EMPTY();
2302     assert(size > 0);
2303     if (size == 1)
2304         return unicode_char(u[0]);
2305 
2306     max_char = ucs4lib_find_max_char(u, u + size);
2307     res = PyUnicode_New(size, max_char);
2308     if (!res)
2309         return NULL;
2310     if (max_char < 256)
2311         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2312                                  PyUnicode_1BYTE_DATA(res));
2313     else if (max_char < 0x10000)
2314         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2315                                  PyUnicode_2BYTE_DATA(res));
2316     else
2317         memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2318     assert(_PyUnicode_CheckConsistency(res, 1));
2319     return res;
2320 }
2321 
2322 PyObject*
PyUnicode_FromKindAndData(int kind,const void * buffer,Py_ssize_t size)2323 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2324 {
2325     if (size < 0) {
2326         PyErr_SetString(PyExc_ValueError, "size must be positive");
2327         return NULL;
2328     }
2329     switch (kind) {
2330     case PyUnicode_1BYTE_KIND:
2331         return _PyUnicode_FromUCS1(buffer, size);
2332     case PyUnicode_2BYTE_KIND:
2333         return _PyUnicode_FromUCS2(buffer, size);
2334     case PyUnicode_4BYTE_KIND:
2335         return _PyUnicode_FromUCS4(buffer, size);
2336     default:
2337         PyErr_SetString(PyExc_SystemError, "invalid kind");
2338         return NULL;
2339     }
2340 }
2341 
2342 Py_UCS4
_PyUnicode_FindMaxChar(PyObject * unicode,Py_ssize_t start,Py_ssize_t end)2343 _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2344 {
2345     enum PyUnicode_Kind kind;
2346     void *startptr, *endptr;
2347 
2348     assert(PyUnicode_IS_READY(unicode));
2349     assert(0 <= start);
2350     assert(end <= PyUnicode_GET_LENGTH(unicode));
2351     assert(start <= end);
2352 
2353     if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2354         return PyUnicode_MAX_CHAR_VALUE(unicode);
2355 
2356     if (start == end)
2357         return 127;
2358 
2359     if (PyUnicode_IS_ASCII(unicode))
2360         return 127;
2361 
2362     kind = PyUnicode_KIND(unicode);
2363     startptr = PyUnicode_DATA(unicode);
2364     endptr = (char *)startptr + end * kind;
2365     startptr = (char *)startptr + start * kind;
2366     switch(kind) {
2367     case PyUnicode_1BYTE_KIND:
2368         return ucs1lib_find_max_char(startptr, endptr);
2369     case PyUnicode_2BYTE_KIND:
2370         return ucs2lib_find_max_char(startptr, endptr);
2371     case PyUnicode_4BYTE_KIND:
2372         return ucs4lib_find_max_char(startptr, endptr);
2373     default:
2374         Py_UNREACHABLE();
2375     }
2376 }
2377 
2378 /* Ensure that a string uses the most efficient storage, if it is not the
2379    case: create a new string with of the right kind. Write NULL into *p_unicode
2380    on error. */
2381 static void
unicode_adjust_maxchar(PyObject ** p_unicode)2382 unicode_adjust_maxchar(PyObject **p_unicode)
2383 {
2384     PyObject *unicode, *copy;
2385     Py_UCS4 max_char;
2386     Py_ssize_t len;
2387     unsigned int kind;
2388 
2389     assert(p_unicode != NULL);
2390     unicode = *p_unicode;
2391     assert(PyUnicode_IS_READY(unicode));
2392     if (PyUnicode_IS_ASCII(unicode))
2393         return;
2394 
2395     len = PyUnicode_GET_LENGTH(unicode);
2396     kind = PyUnicode_KIND(unicode);
2397     if (kind == PyUnicode_1BYTE_KIND) {
2398         const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2399         max_char = ucs1lib_find_max_char(u, u + len);
2400         if (max_char >= 128)
2401             return;
2402     }
2403     else if (kind == PyUnicode_2BYTE_KIND) {
2404         const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2405         max_char = ucs2lib_find_max_char(u, u + len);
2406         if (max_char >= 256)
2407             return;
2408     }
2409     else {
2410         const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2411         assert(kind == PyUnicode_4BYTE_KIND);
2412         max_char = ucs4lib_find_max_char(u, u + len);
2413         if (max_char >= 0x10000)
2414             return;
2415     }
2416     copy = PyUnicode_New(len, max_char);
2417     if (copy != NULL)
2418         _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2419     Py_DECREF(unicode);
2420     *p_unicode = copy;
2421 }
2422 
2423 PyObject*
_PyUnicode_Copy(PyObject * unicode)2424 _PyUnicode_Copy(PyObject *unicode)
2425 {
2426     Py_ssize_t length;
2427     PyObject *copy;
2428 
2429     if (!PyUnicode_Check(unicode)) {
2430         PyErr_BadInternalCall();
2431         return NULL;
2432     }
2433     if (PyUnicode_READY(unicode) == -1)
2434         return NULL;
2435 
2436     length = PyUnicode_GET_LENGTH(unicode);
2437     copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2438     if (!copy)
2439         return NULL;
2440     assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2441 
2442     memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2443               length * PyUnicode_KIND(unicode));
2444     assert(_PyUnicode_CheckConsistency(copy, 1));
2445     return copy;
2446 }
2447 
2448 
2449 /* Widen Unicode objects to larger buffers. Don't write terminating null
2450    character. Return NULL on error. */
2451 
2452 void*
_PyUnicode_AsKind(PyObject * s,unsigned int kind)2453 _PyUnicode_AsKind(PyObject *s, unsigned int kind)
2454 {
2455     Py_ssize_t len;
2456     void *result;
2457     unsigned int skind;
2458 
2459     if (PyUnicode_READY(s) == -1)
2460         return NULL;
2461 
2462     len = PyUnicode_GET_LENGTH(s);
2463     skind = PyUnicode_KIND(s);
2464     if (skind >= kind) {
2465         PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2466         return NULL;
2467     }
2468     switch (kind) {
2469     case PyUnicode_2BYTE_KIND:
2470         result = PyMem_New(Py_UCS2, len);
2471         if (!result)
2472             return PyErr_NoMemory();
2473         assert(skind == PyUnicode_1BYTE_KIND);
2474         _PyUnicode_CONVERT_BYTES(
2475             Py_UCS1, Py_UCS2,
2476             PyUnicode_1BYTE_DATA(s),
2477             PyUnicode_1BYTE_DATA(s) + len,
2478             result);
2479         return result;
2480     case PyUnicode_4BYTE_KIND:
2481         result = PyMem_New(Py_UCS4, len);
2482         if (!result)
2483             return PyErr_NoMemory();
2484         if (skind == PyUnicode_2BYTE_KIND) {
2485             _PyUnicode_CONVERT_BYTES(
2486                 Py_UCS2, Py_UCS4,
2487                 PyUnicode_2BYTE_DATA(s),
2488                 PyUnicode_2BYTE_DATA(s) + len,
2489                 result);
2490         }
2491         else {
2492             assert(skind == PyUnicode_1BYTE_KIND);
2493             _PyUnicode_CONVERT_BYTES(
2494                 Py_UCS1, Py_UCS4,
2495                 PyUnicode_1BYTE_DATA(s),
2496                 PyUnicode_1BYTE_DATA(s) + len,
2497                 result);
2498         }
2499         return result;
2500     default:
2501         break;
2502     }
2503     PyErr_SetString(PyExc_SystemError, "invalid kind");
2504     return NULL;
2505 }
2506 
2507 static Py_UCS4*
as_ucs4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2508 as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2509         int copy_null)
2510 {
2511     int kind;
2512     void *data;
2513     Py_ssize_t len, targetlen;
2514     if (PyUnicode_READY(string) == -1)
2515         return NULL;
2516     kind = PyUnicode_KIND(string);
2517     data = PyUnicode_DATA(string);
2518     len = PyUnicode_GET_LENGTH(string);
2519     targetlen = len;
2520     if (copy_null)
2521         targetlen++;
2522     if (!target) {
2523         target = PyMem_New(Py_UCS4, targetlen);
2524         if (!target) {
2525             PyErr_NoMemory();
2526             return NULL;
2527         }
2528     }
2529     else {
2530         if (targetsize < targetlen) {
2531             PyErr_Format(PyExc_SystemError,
2532                          "string is longer than the buffer");
2533             if (copy_null && 0 < targetsize)
2534                 target[0] = 0;
2535             return NULL;
2536         }
2537     }
2538     if (kind == PyUnicode_1BYTE_KIND) {
2539         Py_UCS1 *start = (Py_UCS1 *) data;
2540         _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2541     }
2542     else if (kind == PyUnicode_2BYTE_KIND) {
2543         Py_UCS2 *start = (Py_UCS2 *) data;
2544         _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2545     }
2546     else {
2547         assert(kind == PyUnicode_4BYTE_KIND);
2548         memcpy(target, data, len * sizeof(Py_UCS4));
2549     }
2550     if (copy_null)
2551         target[len] = 0;
2552     return target;
2553 }
2554 
2555 Py_UCS4*
PyUnicode_AsUCS4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2556 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2557                  int copy_null)
2558 {
2559     if (target == NULL || targetsize < 0) {
2560         PyErr_BadInternalCall();
2561         return NULL;
2562     }
2563     return as_ucs4(string, target, targetsize, copy_null);
2564 }
2565 
2566 Py_UCS4*
PyUnicode_AsUCS4Copy(PyObject * string)2567 PyUnicode_AsUCS4Copy(PyObject *string)
2568 {
2569     return as_ucs4(string, NULL, 0, 1);
2570 }
2571 
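/* Maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256), and
   (SIZEOF_LONG_LONG*53-1)/22 + 1 is ceil(SIZEOF_LONG_LONG*53/22) in integer
   arithmetic, hence the "2 +" in the macro below. */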
2575 #define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2576 
2577 static int
unicode_fromformat_write_str(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t width,Py_ssize_t precision)2578 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2579                              Py_ssize_t width, Py_ssize_t precision)
2580 {
2581     Py_ssize_t length, fill, arglen;
2582     Py_UCS4 maxchar;
2583 
2584     if (PyUnicode_READY(str) == -1)
2585         return -1;
2586 
2587     length = PyUnicode_GET_LENGTH(str);
2588     if ((precision == -1 || precision >= length)
2589         && width <= length)
2590         return _PyUnicodeWriter_WriteStr(writer, str);
2591 
2592     if (precision != -1)
2593         length = Py_MIN(precision, length);
2594 
2595     arglen = Py_MAX(length, width);
2596     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2597         maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2598     else
2599         maxchar = writer->maxchar;
2600 
2601     if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2602         return -1;
2603 
2604     if (width > length) {
2605         fill = width - length;
2606         if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2607             return -1;
2608         writer->pos += fill;
2609     }
2610 
2611     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2612                                   str, 0, length);
2613     writer->pos += length;
2614     return 0;
2615 }
2616 
2617 static int
unicode_fromformat_write_cstr(_PyUnicodeWriter * writer,const char * str,Py_ssize_t width,Py_ssize_t precision)2618 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2619                               Py_ssize_t width, Py_ssize_t precision)
2620 {
2621     /* UTF-8 */
2622     Py_ssize_t length;
2623     PyObject *unicode;
2624     int res;
2625 
2626     if (precision == -1) {
2627         length = strlen(str);
2628     }
2629     else {
2630         length = 0;
2631         while (length < precision && str[length]) {
2632             length++;
2633         }
2634     }
2635     unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2636     if (unicode == NULL)
2637         return -1;
2638 
2639     res = unicode_fromformat_write_str(writer, unicode, width, -1);
2640     Py_DECREF(unicode);
2641     return res;
2642 }
2643 
2644 static const char*
unicode_fromformat_arg(_PyUnicodeWriter * writer,const char * f,va_list * vargs)2645 unicode_fromformat_arg(_PyUnicodeWriter *writer,
2646                        const char *f, va_list *vargs)
2647 {
2648     const char *p;
2649     Py_ssize_t len;
2650     int zeropad;
2651     Py_ssize_t width;
2652     Py_ssize_t precision;
2653     int longflag;
2654     int longlongflag;
2655     int size_tflag;
2656     Py_ssize_t fill;
2657 
2658     p = f;
2659     f++;
2660     zeropad = 0;
2661     if (*f == '0') {
2662         zeropad = 1;
2663         f++;
2664     }
2665 
2666     /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2667     width = -1;
2668     if (Py_ISDIGIT((unsigned)*f)) {
2669         width = *f - '0';
2670         f++;
2671         while (Py_ISDIGIT((unsigned)*f)) {
2672             if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2673                 PyErr_SetString(PyExc_ValueError,
2674                                 "width too big");
2675                 return NULL;
2676             }
2677             width = (width * 10) + (*f - '0');
2678             f++;
2679         }
2680     }
2681     precision = -1;
2682     if (*f == '.') {
2683         f++;
2684         if (Py_ISDIGIT((unsigned)*f)) {
2685             precision = (*f - '0');
2686             f++;
2687             while (Py_ISDIGIT((unsigned)*f)) {
2688                 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2689                     PyErr_SetString(PyExc_ValueError,
2690                                     "precision too big");
2691                     return NULL;
2692                 }
2693                 precision = (precision * 10) + (*f - '0');
2694                 f++;
2695             }
2696         }
2697         if (*f == '%') {
2698             /* "%.3%s" => f points to "3" */
2699             f--;
2700         }
2701     }
2702     if (*f == '\0') {
2703         /* bogus format "%.123" => go backward, f points to "3" */
2704         f--;
2705     }
2706 
2707     /* Handle %ld, %lu, %lld and %llu. */
2708     longflag = 0;
2709     longlongflag = 0;
2710     size_tflag = 0;
2711     if (*f == 'l') {
2712         if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
2713             longflag = 1;
2714             ++f;
2715         }
2716         else if (f[1] == 'l' &&
2717                  (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
2718             longlongflag = 1;
2719             f += 2;
2720         }
2721     }
2722     /* handle the size_t flag. */
2723     else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
2724         size_tflag = 1;
2725         ++f;
2726     }
2727 
2728     if (f[1] == '\0')
2729         writer->overallocate = 0;
2730 
2731     switch (*f) {
2732     case 'c':
2733     {
2734         int ordinal = va_arg(*vargs, int);
2735         if (ordinal < 0 || ordinal > MAX_UNICODE) {
2736             PyErr_SetString(PyExc_OverflowError,
2737                             "character argument not in range(0x110000)");
2738             return NULL;
2739         }
2740         if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2741             return NULL;
2742         break;
2743     }
2744 
2745     case 'i':
2746     case 'd':
2747     case 'u':
2748     case 'x':
2749     {
2750         /* used by sprintf */
2751         char buffer[MAX_LONG_LONG_CHARS];
2752         Py_ssize_t arglen;
2753 
2754         if (*f == 'u') {
2755             if (longflag)
2756                 len = sprintf(buffer, "%lu",
2757                         va_arg(*vargs, unsigned long));
2758             else if (longlongflag)
2759                 len = sprintf(buffer, "%llu",
2760                         va_arg(*vargs, unsigned long long));
2761             else if (size_tflag)
2762                 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
2763                         va_arg(*vargs, size_t));
2764             else
2765                 len = sprintf(buffer, "%u",
2766                         va_arg(*vargs, unsigned int));
2767         }
2768         else if (*f == 'x') {
2769             len = sprintf(buffer, "%x", va_arg(*vargs, int));
2770         }
2771         else {
2772             if (longflag)
2773                 len = sprintf(buffer, "%li",
2774                         va_arg(*vargs, long));
2775             else if (longlongflag)
2776                 len = sprintf(buffer, "%lli",
2777                         va_arg(*vargs, long long));
2778             else if (size_tflag)
2779                 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
2780                         va_arg(*vargs, Py_ssize_t));
2781             else
2782                 len = sprintf(buffer, "%i",
2783                         va_arg(*vargs, int));
2784         }
2785         assert(len >= 0);
2786 
2787         if (precision < len)
2788             precision = len;
2789 
2790         arglen = Py_MAX(precision, width);
2791         if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2792             return NULL;
2793 
2794         if (width > precision) {
2795             Py_UCS4 fillchar;
2796             fill = width - precision;
2797             fillchar = zeropad?'0':' ';
2798             if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2799                 return NULL;
2800             writer->pos += fill;
2801         }
2802         if (precision > len) {
2803             fill = precision - len;
2804             if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2805                 return NULL;
2806             writer->pos += fill;
2807         }
2808 
2809         if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2810             return NULL;
2811         break;
2812     }
2813 
2814     case 'p':
2815     {
2816         char number[MAX_LONG_LONG_CHARS];
2817 
2818         len = sprintf(number, "%p", va_arg(*vargs, void*));
2819         assert(len >= 0);
2820 
2821         /* %p is ill-defined:  ensure leading 0x. */
2822         if (number[1] == 'X')
2823             number[1] = 'x';
2824         else if (number[1] != 'x') {
2825             memmove(number + 2, number,
2826                     strlen(number) + 1);
2827             number[0] = '0';
2828             number[1] = 'x';
2829             len += 2;
2830         }
2831 
2832         if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2833             return NULL;
2834         break;
2835     }
2836 
2837     case 's':
2838     {
2839         /* UTF-8 */
2840         const char *s = va_arg(*vargs, const char*);
2841         if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
2842             return NULL;
2843         break;
2844     }
2845 
2846     case 'U':
2847     {
2848         PyObject *obj = va_arg(*vargs, PyObject *);
2849         assert(obj && _PyUnicode_CHECK(obj));
2850 
2851         if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2852             return NULL;
2853         break;
2854     }
2855 
2856     case 'V':
2857     {
2858         PyObject *obj = va_arg(*vargs, PyObject *);
2859         const char *str = va_arg(*vargs, const char *);
2860         if (obj) {
2861             assert(_PyUnicode_CHECK(obj));
2862             if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2863                 return NULL;
2864         }
2865         else {
2866             assert(str != NULL);
2867             if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
2868                 return NULL;
2869         }
2870         break;
2871     }
2872 
2873     case 'S':
2874     {
2875         PyObject *obj = va_arg(*vargs, PyObject *);
2876         PyObject *str;
2877         assert(obj);
2878         str = PyObject_Str(obj);
2879         if (!str)
2880             return NULL;
2881         if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
2882             Py_DECREF(str);
2883             return NULL;
2884         }
2885         Py_DECREF(str);
2886         break;
2887     }
2888 
2889     case 'R':
2890     {
2891         PyObject *obj = va_arg(*vargs, PyObject *);
2892         PyObject *repr;
2893         assert(obj);
2894         repr = PyObject_Repr(obj);
2895         if (!repr)
2896             return NULL;
2897         if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
2898             Py_DECREF(repr);
2899             return NULL;
2900         }
2901         Py_DECREF(repr);
2902         break;
2903     }
2904 
2905     case 'A':
2906     {
2907         PyObject *obj = va_arg(*vargs, PyObject *);
2908         PyObject *ascii;
2909         assert(obj);
2910         ascii = PyObject_ASCII(obj);
2911         if (!ascii)
2912             return NULL;
2913         if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
2914             Py_DECREF(ascii);
2915             return NULL;
2916         }
2917         Py_DECREF(ascii);
2918         break;
2919     }
2920 
2921     case '%':
2922         if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2923             return NULL;
2924         break;
2925 
2926     default:
2927         /* if we stumble upon an unknown formatting code, copy the rest
2928            of the format string to the output string. (we cannot just
2929            skip the code, since there's no way to know what's in the
2930            argument list) */
2931         len = strlen(p);
2932         if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
2933             return NULL;
2934         f = p+len;
2935         return f;
2936     }
2937 
2938     f++;
2939     return f;
2940 }
2941 
2942 PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)2943 PyUnicode_FromFormatV(const char *format, va_list vargs)
2944 {
2945     va_list vargs2;
2946     const char *f;
2947     _PyUnicodeWriter writer;
2948 
2949     _PyUnicodeWriter_Init(&writer);
2950     writer.min_length = strlen(format) + 100;
2951     writer.overallocate = 1;
2952 
2953     // Copy varags to be able to pass a reference to a subfunction.
2954     va_copy(vargs2, vargs);
2955 
2956     for (f = format; *f; ) {
2957         if (*f == '%') {
2958             f = unicode_fromformat_arg(&writer, f, &vargs2);
2959             if (f == NULL)
2960                 goto fail;
2961         }
2962         else {
2963             const char *p;
2964             Py_ssize_t len;
2965 
2966             p = f;
2967             do
2968             {
2969                 if ((unsigned char)*p > 127) {
2970                     PyErr_Format(PyExc_ValueError,
2971                         "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2972                         "string, got a non-ASCII byte: 0x%02x",
2973                         (unsigned char)*p);
2974                     goto fail;
2975                 }
2976                 p++;
2977             }
2978             while (*p != '\0' && *p != '%');
2979             len = p - f;
2980 
2981             if (*p == '\0')
2982                 writer.overallocate = 0;
2983 
2984             if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
2985                 goto fail;
2986 
2987             f = p;
2988         }
2989     }
2990     va_end(vargs2);
2991     return _PyUnicodeWriter_Finish(&writer);
2992 
2993   fail:
2994     va_end(vargs2);
2995     _PyUnicodeWriter_Dealloc(&writer);
2996     return NULL;
2997 }
2998 
2999 PyObject *
PyUnicode_FromFormat(const char * format,...)3000 PyUnicode_FromFormat(const char *format, ...)
3001 {
3002     PyObject* ret;
3003     va_list vargs;
3004 
3005 #ifdef HAVE_STDARG_PROTOTYPES
3006     va_start(vargs, format);
3007 #else
3008     va_start(vargs);
3009 #endif
3010     ret = PyUnicode_FromFormatV(format, vargs);
3011     va_end(vargs);
3012     return ret;
3013 }
3014 
3015 static Py_ssize_t
unicode_get_widechar_size(PyObject * unicode)3016 unicode_get_widechar_size(PyObject *unicode)
3017 {
3018     Py_ssize_t res;
3019 
3020     assert(unicode != NULL);
3021     assert(_PyUnicode_CHECK(unicode));
3022 
3023     if (_PyUnicode_WSTR(unicode) != NULL) {
3024         return PyUnicode_WSTR_LENGTH(unicode);
3025     }
3026     assert(PyUnicode_IS_READY(unicode));
3027 
3028     res = _PyUnicode_LENGTH(unicode);
3029 #if SIZEOF_WCHAR_T == 2
3030     if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3031         const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3032         const Py_UCS4 *end = s + res;
3033         for (; s < end; ++s) {
3034             if (*s > 0xFFFF) {
3035                 ++res;
3036             }
3037         }
3038     }
3039 #endif
3040     return res;
3041 }
3042 
3043 static void
unicode_copy_as_widechar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3044 unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3045 {
3046     const wchar_t *wstr;
3047 
3048     assert(unicode != NULL);
3049     assert(_PyUnicode_CHECK(unicode));
3050 
3051     wstr = _PyUnicode_WSTR(unicode);
3052     if (wstr != NULL) {
3053         memcpy(w, wstr, size * sizeof(wchar_t));
3054         return;
3055     }
3056     assert(PyUnicode_IS_READY(unicode));
3057 
3058     if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3059         const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3060         for (; size--; ++s, ++w) {
3061             *w = *s;
3062         }
3063     }
3064     else {
3065 #if SIZEOF_WCHAR_T == 4
3066         assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3067         const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3068         for (; size--; ++s, ++w) {
3069             *w = *s;
3070         }
3071 #else
3072         assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3073         const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3074         for (; size--; ++s, ++w) {
3075             Py_UCS4 ch = *s;
3076             if (ch > 0xFFFF) {
3077                 assert(ch <= MAX_UNICODE);
3078                 /* encode surrogate pair in this case */
3079                 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3080                 if (!size--)
3081                     break;
3082                 *w = Py_UNICODE_LOW_SURROGATE(ch);
3083             }
3084             else {
3085                 *w = ch;
3086             }
3087         }
3088 #endif
3089     }
3090 }
3091 
3092 #ifdef HAVE_WCHAR_H
3093 
3094 /* Convert a Unicode object to a wide character string.
3095 
3096    - If w is NULL: return the number of wide characters (including the null
3097      character) required to convert the unicode object. Ignore size argument.
3098 
3099    - Otherwise: return the number of wide characters (excluding the null
3100      character) written into w. Write at most size wide characters (including
3101      the null character). */
3102 Py_ssize_t
PyUnicode_AsWideChar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3103 PyUnicode_AsWideChar(PyObject *unicode,
3104                      wchar_t *w,
3105                      Py_ssize_t size)
3106 {
3107     Py_ssize_t res;
3108 
3109     if (unicode == NULL) {
3110         PyErr_BadInternalCall();
3111         return -1;
3112     }
3113     if (!PyUnicode_Check(unicode)) {
3114         PyErr_BadArgument();
3115         return -1;
3116     }
3117 
3118     res = unicode_get_widechar_size(unicode);
3119     if (w == NULL) {
3120         return res + 1;
3121     }
3122 
3123     if (size > res) {
3124         size = res + 1;
3125     }
3126     else {
3127         res = size;
3128     }
3129     unicode_copy_as_widechar(unicode, w, size);
3130     return res;
3131 }
3132 
3133 wchar_t*
PyUnicode_AsWideCharString(PyObject * unicode,Py_ssize_t * size)3134 PyUnicode_AsWideCharString(PyObject *unicode,
3135                            Py_ssize_t *size)
3136 {
3137     wchar_t *buffer;
3138     Py_ssize_t buflen;
3139 
3140     if (unicode == NULL) {
3141         PyErr_BadInternalCall();
3142         return NULL;
3143     }
3144     if (!PyUnicode_Check(unicode)) {
3145         PyErr_BadArgument();
3146         return NULL;
3147     }
3148 
3149     buflen = unicode_get_widechar_size(unicode);
3150     buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
3151     if (buffer == NULL) {
3152         PyErr_NoMemory();
3153         return NULL;
3154     }
3155     unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3156     if (size != NULL) {
3157         *size = buflen;
3158     }
3159     else if (wcslen(buffer) != (size_t)buflen) {
3160         PyMem_FREE(buffer);
3161         PyErr_SetString(PyExc_ValueError,
3162                         "embedded null character");
3163         return NULL;
3164     }
3165     return buffer;
3166 }
3167 
3168 #endif /* HAVE_WCHAR_H */
3169 
3170 PyObject *
PyUnicode_FromOrdinal(int ordinal)3171 PyUnicode_FromOrdinal(int ordinal)
3172 {
3173     if (ordinal < 0 || ordinal > MAX_UNICODE) {
3174         PyErr_SetString(PyExc_ValueError,
3175                         "chr() arg not in range(0x110000)");
3176         return NULL;
3177     }
3178 
3179     return unicode_char((Py_UCS4)ordinal);
3180 }
3181 
3182 PyObject *
PyUnicode_FromObject(PyObject * obj)3183 PyUnicode_FromObject(PyObject *obj)
3184 {
3185     /* XXX Perhaps we should make this API an alias of
3186        PyObject_Str() instead ?! */
3187     if (PyUnicode_CheckExact(obj)) {
3188         if (PyUnicode_READY(obj) == -1)
3189             return NULL;
3190         Py_INCREF(obj);
3191         return obj;
3192     }
3193     if (PyUnicode_Check(obj)) {
3194         /* For a Unicode subtype that's not a Unicode object,
3195            return a true Unicode object with the same data. */
3196         return _PyUnicode_Copy(obj);
3197     }
3198     PyErr_Format(PyExc_TypeError,
3199                  "Can't convert '%.100s' object to str implicitly",
3200                  Py_TYPE(obj)->tp_name);
3201     return NULL;
3202 }
3203 
3204 PyObject *
PyUnicode_FromEncodedObject(PyObject * obj,const char * encoding,const char * errors)3205 PyUnicode_FromEncodedObject(PyObject *obj,
3206                             const char *encoding,
3207                             const char *errors)
3208 {
3209     Py_buffer buffer;
3210     PyObject *v;
3211 
3212     if (obj == NULL) {
3213         PyErr_BadInternalCall();
3214         return NULL;
3215     }
3216 
3217     /* Decoding bytes objects is the most common case and should be fast */
3218     if (PyBytes_Check(obj)) {
3219         if (PyBytes_GET_SIZE(obj) == 0)
3220             _Py_RETURN_UNICODE_EMPTY();
3221         v = PyUnicode_Decode(
3222                 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3223                 encoding, errors);
3224         return v;
3225     }
3226 
3227     if (PyUnicode_Check(obj)) {
3228         PyErr_SetString(PyExc_TypeError,
3229                         "decoding str is not supported");
3230         return NULL;
3231     }
3232 
3233     /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3234     if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3235         PyErr_Format(PyExc_TypeError,
3236                      "decoding to str: need a bytes-like object, %.80s found",
3237                      Py_TYPE(obj)->tp_name);
3238         return NULL;
3239     }
3240 
3241     if (buffer.len == 0) {
3242         PyBuffer_Release(&buffer);
3243         _Py_RETURN_UNICODE_EMPTY();
3244     }
3245 
3246     v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3247     PyBuffer_Release(&buffer);
3248     return v;
3249 }
3250 
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied (lowercased) to 'lower'; every
   run of other characters between two copied characters is collapsed into a
   single '_', and such runs at the start or at the end of the name are
   dropped.  The result is always null-terminated on success.

   'lower' must have room for 'lower_len' bytes.

   Return 1 on success, or 0 on error (the normalized name, including its
   terminating null byte, does not fit into lower_len bytes). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    if (lower_len == 0) {
        /* Not even the terminating null byte fits.  Without this check
           lower_len - 1 would wrap around and the bound below would point
           far outside the buffer. */
        return 0;
    }

    l = lower;
    l_end = &lower[lower_len - 1];   /* slot reserved for the null byte */
    punct = 0;                       /* pending separator not written yet */
    for (e = encoding; *e != 0; e++) {
        char c = *e;

        if (!(Py_ISALNUM(c) || c == '.')) {
            punct = 1;
            continue;
        }

        /* Emit the pending separator, except at the very start. */
        if (punct && l != lower) {
            if (l == l_end) {
                return 0;
            }
            *l++ = '_';
        }
        punct = 0;

        if (l == l_end) {
            return 0;
        }
        *l++ = Py_TOLOWER(c);
    }
    *l = '\0';
    return 1;
}
3299 
3300 PyObject *
PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)3301 PyUnicode_Decode(const char *s,
3302                  Py_ssize_t size,
3303                  const char *encoding,
3304                  const char *errors)
3305 {
3306     PyObject *buffer = NULL, *unicode;
3307     Py_buffer info;
3308     char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3309 
3310     if (encoding == NULL) {
3311         return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3312     }
3313 
3314     /* Shortcuts for common default encodings */
3315     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3316         char *lower = buflower;
3317 
3318         /* Fast paths */
3319         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3320             lower += 3;
3321             if (*lower == '_') {
3322                 /* Match "utf8" and "utf_8" */
3323                 lower++;
3324             }
3325 
3326             if (lower[0] == '8' && lower[1] == 0) {
3327                 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3328             }
3329             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3330                 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3331             }
3332             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3333                 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3334             }
3335         }
3336         else {
3337             if (strcmp(lower, "ascii") == 0
3338                 || strcmp(lower, "us_ascii") == 0) {
3339                 return PyUnicode_DecodeASCII(s, size, errors);
3340             }
3341     #ifdef MS_WINDOWS
3342             else if (strcmp(lower, "mbcs") == 0) {
3343                 return PyUnicode_DecodeMBCS(s, size, errors);
3344             }
3345     #endif
3346             else if (strcmp(lower, "latin1") == 0
3347                      || strcmp(lower, "latin_1") == 0
3348                      || strcmp(lower, "iso_8859_1") == 0
3349                      || strcmp(lower, "iso8859_1") == 0) {
3350                 return PyUnicode_DecodeLatin1(s, size, errors);
3351             }
3352         }
3353     }
3354 
3355     /* Decode via the codec registry */
3356     buffer = NULL;
3357     if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3358         goto onError;
3359     buffer = PyMemoryView_FromBuffer(&info);
3360     if (buffer == NULL)
3361         goto onError;
3362     unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3363     if (unicode == NULL)
3364         goto onError;
3365     if (!PyUnicode_Check(unicode)) {
3366         PyErr_Format(PyExc_TypeError,
3367                      "'%.400s' decoder returned '%.400s' instead of 'str'; "
3368                      "use codecs.decode() to decode to arbitrary types",
3369                      encoding,
3370                      Py_TYPE(unicode)->tp_name);
3371         Py_DECREF(unicode);
3372         goto onError;
3373     }
3374     Py_DECREF(buffer);
3375     return unicode_result(unicode);
3376 
3377   onError:
3378     Py_XDECREF(buffer);
3379     return NULL;
3380 }
3381 
3382 PyObject *
PyUnicode_AsDecodedObject(PyObject * unicode,const char * encoding,const char * errors)3383 PyUnicode_AsDecodedObject(PyObject *unicode,
3384                           const char *encoding,
3385                           const char *errors)
3386 {
3387     if (!PyUnicode_Check(unicode)) {
3388         PyErr_BadArgument();
3389         return NULL;
3390     }
3391 
3392     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3393                      "PyUnicode_AsDecodedObject() is deprecated; "
3394                      "use PyCodec_Decode() to decode from str", 1) < 0)
3395         return NULL;
3396 
3397     if (encoding == NULL)
3398         encoding = PyUnicode_GetDefaultEncoding();
3399 
3400     /* Decode via the codec registry */
3401     return PyCodec_Decode(unicode, encoding, errors);
3402 }
3403 
3404 PyObject *
PyUnicode_AsDecodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3405 PyUnicode_AsDecodedUnicode(PyObject *unicode,
3406                            const char *encoding,
3407                            const char *errors)
3408 {
3409     PyObject *v;
3410 
3411     if (!PyUnicode_Check(unicode)) {
3412         PyErr_BadArgument();
3413         goto onError;
3414     }
3415 
3416     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3417                      "PyUnicode_AsDecodedUnicode() is deprecated; "
3418                      "use PyCodec_Decode() to decode from str to str", 1) < 0)
3419         return NULL;
3420 
3421     if (encoding == NULL)
3422         encoding = PyUnicode_GetDefaultEncoding();
3423 
3424     /* Decode via the codec registry */
3425     v = PyCodec_Decode(unicode, encoding, errors);
3426     if (v == NULL)
3427         goto onError;
3428     if (!PyUnicode_Check(v)) {
3429         PyErr_Format(PyExc_TypeError,
3430                      "'%.400s' decoder returned '%.400s' instead of 'str'; "
3431                      "use codecs.decode() to decode to arbitrary types",
3432                      encoding,
3433                      Py_TYPE(unicode)->tp_name);
3434         Py_DECREF(v);
3435         goto onError;
3436     }
3437     return unicode_result(v);
3438 
3439   onError:
3440     return NULL;
3441 }
3442 
3443 PyObject *
PyUnicode_Encode(const Py_UNICODE * s,Py_ssize_t size,const char * encoding,const char * errors)3444 PyUnicode_Encode(const Py_UNICODE *s,
3445                  Py_ssize_t size,
3446                  const char *encoding,
3447                  const char *errors)
3448 {
3449     PyObject *v, *unicode;
3450 
3451     unicode = PyUnicode_FromWideChar(s, size);
3452     if (unicode == NULL)
3453         return NULL;
3454     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3455     Py_DECREF(unicode);
3456     return v;
3457 }
3458 
3459 PyObject *
PyUnicode_AsEncodedObject(PyObject * unicode,const char * encoding,const char * errors)3460 PyUnicode_AsEncodedObject(PyObject *unicode,
3461                           const char *encoding,
3462                           const char *errors)
3463 {
3464     PyObject *v;
3465 
3466     if (!PyUnicode_Check(unicode)) {
3467         PyErr_BadArgument();
3468         goto onError;
3469     }
3470 
3471     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3472                      "PyUnicode_AsEncodedObject() is deprecated; "
3473                      "use PyUnicode_AsEncodedString() to encode from str to bytes "
3474                      "or PyCodec_Encode() for generic encoding", 1) < 0)
3475         return NULL;
3476 
3477     if (encoding == NULL)
3478         encoding = PyUnicode_GetDefaultEncoding();
3479 
3480     /* Encode via the codec registry */
3481     v = PyCodec_Encode(unicode, encoding, errors);
3482     if (v == NULL)
3483         goto onError;
3484     return v;
3485 
3486   onError:
3487     return NULL;
3488 }
3489 
3490 
3491 static PyObject *
unicode_encode_locale(PyObject * unicode,_Py_error_handler error_handler,int current_locale)3492 unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3493                       int current_locale)
3494 {
3495     Py_ssize_t wlen;
3496     wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3497     if (wstr == NULL) {
3498         return NULL;
3499     }
3500 
3501     if ((size_t)wlen != wcslen(wstr)) {
3502         PyErr_SetString(PyExc_ValueError, "embedded null character");
3503         PyMem_Free(wstr);
3504         return NULL;
3505     }
3506 
3507     char *str;
3508     size_t error_pos;
3509     const char *reason;
3510     int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3511                                  current_locale, error_handler);
3512     PyMem_Free(wstr);
3513 
3514     if (res != 0) {
3515         if (res == -2) {
3516             PyObject *exc;
3517             exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3518                     "locale", unicode,
3519                     (Py_ssize_t)error_pos,
3520                     (Py_ssize_t)(error_pos+1),
3521                     reason);
3522             if (exc != NULL) {
3523                 PyCodec_StrictErrors(exc);
3524                 Py_DECREF(exc);
3525             }
3526         }
3527         else if (res == -3) {
3528             PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3529         }
3530         else {
3531             PyErr_NoMemory();
3532         }
3533         return NULL;
3534     }
3535 
3536     PyObject *bytes = PyBytes_FromString(str);
3537     PyMem_RawFree(str);
3538     return bytes;
3539 }
3540 
3541 PyObject *
PyUnicode_EncodeLocale(PyObject * unicode,const char * errors)3542 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3543 {
3544     _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3545     return unicode_encode_locale(unicode, error_handler, 1);
3546 }
3547 
3548 PyObject *
PyUnicode_EncodeFSDefault(PyObject * unicode)3549 PyUnicode_EncodeFSDefault(PyObject *unicode)
3550 {
3551     PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
3552 #ifdef _Py_FORCE_UTF8_FS_ENCODING
3553     if (interp->fs_codec.encoding) {
3554         return unicode_encode_utf8(unicode,
3555                                    interp->fs_codec.error_handler,
3556                                    interp->fs_codec.errors);
3557     }
3558     else {
3559         const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3560         _Py_error_handler errors;
3561         errors = get_error_handler_wide(filesystem_errors);
3562         assert(errors != _Py_ERROR_UNKNOWN);
3563         return unicode_encode_utf8(unicode, errors, NULL);
3564     }
3565 #else
3566     /* Bootstrap check: if the filesystem codec is implemented in Python, we
3567        cannot use it to encode and decode filenames before it is loaded. Load
3568        the Python codec requires to encode at least its own filename. Use the C
3569        implementation of the locale codec until the codec registry is
3570        initialized and the Python codec is loaded. See initfsencoding(). */
3571     if (interp->fs_codec.encoding) {
3572         return PyUnicode_AsEncodedString(unicode,
3573                                          interp->fs_codec.encoding,
3574                                          interp->fs_codec.errors);
3575     }
3576     else {
3577         const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3578         _Py_error_handler errors;
3579         errors = get_error_handler_wide(filesystem_errors);
3580         assert(errors != _Py_ERROR_UNKNOWN);
3581         return unicode_encode_locale(unicode, errors, 0);
3582     }
3583 #endif
3584 }
3585 
3586 PyObject *
PyUnicode_AsEncodedString(PyObject * unicode,const char * encoding,const char * errors)3587 PyUnicode_AsEncodedString(PyObject *unicode,
3588                           const char *encoding,
3589                           const char *errors)
3590 {
3591     PyObject *v;
3592     char buflower[11];   /* strlen("iso_8859_1\0") == 11, longest shortcut */
3593 
3594     if (!PyUnicode_Check(unicode)) {
3595         PyErr_BadArgument();
3596         return NULL;
3597     }
3598 
3599     if (encoding == NULL) {
3600         return _PyUnicode_AsUTF8String(unicode, errors);
3601     }
3602 
3603     /* Shortcuts for common default encodings */
3604     if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3605         char *lower = buflower;
3606 
3607         /* Fast paths */
3608         if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3609             lower += 3;
3610             if (*lower == '_') {
3611                 /* Match "utf8" and "utf_8" */
3612                 lower++;
3613             }
3614 
3615             if (lower[0] == '8' && lower[1] == 0) {
3616                 return _PyUnicode_AsUTF8String(unicode, errors);
3617             }
3618             else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3619                 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3620             }
3621             else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3622                 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3623             }
3624         }
3625         else {
3626             if (strcmp(lower, "ascii") == 0
3627                 || strcmp(lower, "us_ascii") == 0) {
3628                 return _PyUnicode_AsASCIIString(unicode, errors);
3629             }
3630 #ifdef MS_WINDOWS
3631             else if (strcmp(lower, "mbcs") == 0) {
3632                 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3633             }
3634 #endif
3635             else if (strcmp(lower, "latin1") == 0 ||
3636                      strcmp(lower, "latin_1") == 0 ||
3637                      strcmp(lower, "iso_8859_1") == 0 ||
3638                      strcmp(lower, "iso8859_1") == 0) {
3639                 return _PyUnicode_AsLatin1String(unicode, errors);
3640             }
3641         }
3642     }
3643 
3644     /* Encode via the codec registry */
3645     v = _PyCodec_EncodeText(unicode, encoding, errors);
3646     if (v == NULL)
3647         return NULL;
3648 
3649     /* The normal path */
3650     if (PyBytes_Check(v))
3651         return v;
3652 
3653     /* If the codec returns a buffer, raise a warning and convert to bytes */
3654     if (PyByteArray_Check(v)) {
3655         int error;
3656         PyObject *b;
3657 
3658         error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3659             "encoder %s returned bytearray instead of bytes; "
3660             "use codecs.encode() to encode to arbitrary types",
3661             encoding);
3662         if (error) {
3663             Py_DECREF(v);
3664             return NULL;
3665         }
3666 
3667         b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3668                                       PyByteArray_GET_SIZE(v));
3669         Py_DECREF(v);
3670         return b;
3671     }
3672 
3673     PyErr_Format(PyExc_TypeError,
3674                  "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3675                  "use codecs.encode() to encode to arbitrary types",
3676                  encoding,
3677                  Py_TYPE(v)->tp_name);
3678     Py_DECREF(v);
3679     return NULL;
3680 }
3681 
3682 PyObject *
PyUnicode_AsEncodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3683 PyUnicode_AsEncodedUnicode(PyObject *unicode,
3684                            const char *encoding,
3685                            const char *errors)
3686 {
3687     PyObject *v;
3688 
3689     if (!PyUnicode_Check(unicode)) {
3690         PyErr_BadArgument();
3691         goto onError;
3692     }
3693 
3694     if (PyErr_WarnEx(PyExc_DeprecationWarning,
3695                      "PyUnicode_AsEncodedUnicode() is deprecated; "
3696                      "use PyCodec_Encode() to encode from str to str", 1) < 0)
3697         return NULL;
3698 
3699     if (encoding == NULL)
3700         encoding = PyUnicode_GetDefaultEncoding();
3701 
3702     /* Encode via the codec registry */
3703     v = PyCodec_Encode(unicode, encoding, errors);
3704     if (v == NULL)
3705         goto onError;
3706     if (!PyUnicode_Check(v)) {
3707         PyErr_Format(PyExc_TypeError,
3708                      "'%.400s' encoder returned '%.400s' instead of 'str'; "
3709                      "use codecs.encode() to encode to arbitrary types",
3710                      encoding,
3711                      Py_TYPE(v)->tp_name);
3712         Py_DECREF(v);
3713         goto onError;
3714     }
3715     return v;
3716 
3717   onError:
3718     return NULL;
3719 }
3720 
3721 static PyObject*
unicode_decode_locale(const char * str,Py_ssize_t len,_Py_error_handler errors,int current_locale)3722 unicode_decode_locale(const char *str, Py_ssize_t len,
3723                       _Py_error_handler errors, int current_locale)
3724 {
3725     if (str[len] != '\0' || (size_t)len != strlen(str))  {
3726         PyErr_SetString(PyExc_ValueError, "embedded null byte");
3727         return NULL;
3728     }
3729 
3730     wchar_t *wstr;
3731     size_t wlen;
3732     const char *reason;
3733     int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3734                                  current_locale, errors);
3735     if (res != 0) {
3736         if (res == -2) {
3737             PyObject *exc;
3738             exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3739                                         "locale", str, len,
3740                                         (Py_ssize_t)wlen,
3741                                         (Py_ssize_t)(wlen + 1),
3742                                         reason);
3743             if (exc != NULL) {
3744                 PyCodec_StrictErrors(exc);
3745                 Py_DECREF(exc);
3746             }
3747         }
3748         else if (res == -3) {
3749             PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3750         }
3751         else {
3752             PyErr_NoMemory();
3753         }
3754         return NULL;
3755     }
3756 
3757     PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3758     PyMem_RawFree(wstr);
3759     return unicode;
3760 }
3761 
3762 PyObject*
PyUnicode_DecodeLocaleAndSize(const char * str,Py_ssize_t len,const char * errors)3763 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3764                               const char *errors)
3765 {
3766     _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3767     return unicode_decode_locale(str, len, error_handler, 1);
3768 }
3769 
3770 PyObject*
PyUnicode_DecodeLocale(const char * str,const char * errors)3771 PyUnicode_DecodeLocale(const char *str, const char *errors)
3772 {
3773     Py_ssize_t size = (Py_ssize_t)strlen(str);
3774     _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3775     return unicode_decode_locale(str, size, error_handler, 1);
3776 }
3777 
3778 
3779 PyObject*
PyUnicode_DecodeFSDefault(const char * s)3780 PyUnicode_DecodeFSDefault(const char *s) {
3781     Py_ssize_t size = (Py_ssize_t)strlen(s);
3782     return PyUnicode_DecodeFSDefaultAndSize(s, size);
3783 }
3784 
3785 PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char * s,Py_ssize_t size)3786 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3787 {
3788     PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
3789 #ifdef _Py_FORCE_UTF8_FS_ENCODING
3790     if (interp->fs_codec.encoding) {
3791         return unicode_decode_utf8(s, size,
3792                                    interp->fs_codec.error_handler,
3793                                    interp->fs_codec.errors,
3794                                    NULL);
3795     }
3796     else {
3797         const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3798         _Py_error_handler errors;
3799         errors = get_error_handler_wide(filesystem_errors);
3800         assert(errors != _Py_ERROR_UNKNOWN);
3801         return unicode_decode_utf8(s, size, errors, NULL, NULL);
3802     }
3803 #else
3804     /* Bootstrap check: if the filesystem codec is implemented in Python, we
3805        cannot use it to encode and decode filenames before it is loaded. Load
3806        the Python codec requires to encode at least its own filename. Use the C
3807        implementation of the locale codec until the codec registry is
3808        initialized and the Python codec is loaded. See initfsencoding(). */
3809     if (interp->fs_codec.encoding) {
3810         return PyUnicode_Decode(s, size,
3811                                 interp->fs_codec.encoding,
3812                                 interp->fs_codec.errors);
3813     }
3814     else {
3815         const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3816         _Py_error_handler errors;
3817         errors = get_error_handler_wide(filesystem_errors);
3818         return unicode_decode_locale(s, size, errors, 0);
3819     }
3820 #endif
3821 }
3822 
3823 
3824 int
PyUnicode_FSConverter(PyObject * arg,void * addr)3825 PyUnicode_FSConverter(PyObject* arg, void* addr)
3826 {
3827     PyObject *path = NULL;
3828     PyObject *output = NULL;
3829     Py_ssize_t size;
3830     void *data;
3831     if (arg == NULL) {
3832         Py_DECREF(*(PyObject**)addr);
3833         *(PyObject**)addr = NULL;
3834         return 1;
3835     }
3836     path = PyOS_FSPath(arg);
3837     if (path == NULL) {
3838         return 0;
3839     }
3840     if (PyBytes_Check(path)) {
3841         output = path;
3842     }
3843     else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
3844         output = PyUnicode_EncodeFSDefault(path);
3845         Py_DECREF(path);
3846         if (!output) {
3847             return 0;
3848         }
3849         assert(PyBytes_Check(output));
3850     }
3851 
3852     size = PyBytes_GET_SIZE(output);
3853     data = PyBytes_AS_STRING(output);
3854     if ((size_t)size != strlen(data)) {
3855         PyErr_SetString(PyExc_ValueError, "embedded null byte");
3856         Py_DECREF(output);
3857         return 0;
3858     }
3859     *(PyObject**)addr = output;
3860     return Py_CLEANUP_SUPPORTED;
3861 }
3862 
3863 
3864 int
PyUnicode_FSDecoder(PyObject * arg,void * addr)3865 PyUnicode_FSDecoder(PyObject* arg, void* addr)
3866 {
3867     int is_buffer = 0;
3868     PyObject *path = NULL;
3869     PyObject *output = NULL;
3870     if (arg == NULL) {
3871         Py_DECREF(*(PyObject**)addr);
3872         *(PyObject**)addr = NULL;
3873         return 1;
3874     }
3875 
3876     is_buffer = PyObject_CheckBuffer(arg);
3877     if (!is_buffer) {
3878         path = PyOS_FSPath(arg);
3879         if (path == NULL) {
3880             return 0;
3881         }
3882     }
3883     else {
3884         path = arg;
3885         Py_INCREF(arg);
3886     }
3887 
3888     if (PyUnicode_Check(path)) {
3889         output = path;
3890     }
3891     else if (PyBytes_Check(path) || is_buffer) {
3892         PyObject *path_bytes = NULL;
3893 
3894         if (!PyBytes_Check(path) &&
3895             PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3896             "path should be string, bytes, or os.PathLike, not %.200s",
3897             Py_TYPE(arg)->tp_name)) {
3898                 Py_DECREF(path);
3899             return 0;
3900         }
3901         path_bytes = PyBytes_FromObject(path);
3902         Py_DECREF(path);
3903         if (!path_bytes) {
3904             return 0;
3905         }
3906         output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3907                                                   PyBytes_GET_SIZE(path_bytes));
3908         Py_DECREF(path_bytes);
3909         if (!output) {
3910             return 0;
3911         }
3912     }
3913     else {
3914         PyErr_Format(PyExc_TypeError,
3915                      "path should be string, bytes, or os.PathLike, not %.200s",
3916                      Py_TYPE(arg)->tp_name);
3917         Py_DECREF(path);
3918         return 0;
3919     }
3920     if (PyUnicode_READY(output) == -1) {
3921         Py_DECREF(output);
3922         return 0;
3923     }
3924     if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3925                  PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3926         PyErr_SetString(PyExc_ValueError, "embedded null character");
3927         Py_DECREF(output);
3928         return 0;
3929     }
3930     *(PyObject**)addr = output;
3931     return Py_CLEANUP_SUPPORTED;
3932 }
3933 
3934 
3935 const char *
PyUnicode_AsUTF8AndSize(PyObject * unicode,Py_ssize_t * psize)3936 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3937 {
3938     PyObject *bytes;
3939 
3940     if (!PyUnicode_Check(unicode)) {
3941         PyErr_BadArgument();
3942         return NULL;
3943     }
3944     if (PyUnicode_READY(unicode) == -1)
3945         return NULL;
3946 
3947     if (PyUnicode_UTF8(unicode) == NULL) {
3948         assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3949         bytes = _PyUnicode_AsUTF8String(unicode, NULL);
3950         if (bytes == NULL)
3951             return NULL;
3952         _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3953         if (_PyUnicode_UTF8(unicode) == NULL) {
3954             PyErr_NoMemory();
3955             Py_DECREF(bytes);
3956             return NULL;
3957         }
3958         _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3959         memcpy(_PyUnicode_UTF8(unicode),
3960                   PyBytes_AS_STRING(bytes),
3961                   _PyUnicode_UTF8_LENGTH(unicode) + 1);
3962         Py_DECREF(bytes);
3963     }
3964 
3965     if (psize)
3966         *psize = PyUnicode_UTF8_LENGTH(unicode);
3967     return PyUnicode_UTF8(unicode);
3968 }
3969 
3970 const char *
PyUnicode_AsUTF8(PyObject * unicode)3971 PyUnicode_AsUTF8(PyObject *unicode)
3972 {
3973     return PyUnicode_AsUTF8AndSize(unicode, NULL);
3974 }
3975 
3976 Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject * unicode,Py_ssize_t * size)3977 PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3978 {
3979     if (!PyUnicode_Check(unicode)) {
3980         PyErr_BadArgument();
3981         return NULL;
3982     }
3983     Py_UNICODE *w = _PyUnicode_WSTR(unicode);
3984     if (w == NULL) {
3985         /* Non-ASCII compact unicode object */
3986         assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
3987         assert(PyUnicode_IS_READY(unicode));
3988 
3989         Py_ssize_t wlen = unicode_get_widechar_size(unicode);
3990         if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3991             PyErr_NoMemory();
3992             return NULL;
3993         }
3994         w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
3995         if (w == NULL) {
3996             PyErr_NoMemory();
3997             return NULL;
3998         }
3999         unicode_copy_as_widechar(unicode, w, wlen + 1);
4000         _PyUnicode_WSTR(unicode) = w;
4001         if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4002             _PyUnicode_WSTR_LENGTH(unicode) = wlen;
4003         }
4004     }
4005     if (size != NULL)
4006         *size = PyUnicode_WSTR_LENGTH(unicode);
4007     return w;
4008 }
4009 
4010 Py_UNICODE *
PyUnicode_AsUnicode(PyObject * unicode)4011 PyUnicode_AsUnicode(PyObject *unicode)
4012 {
4013     return PyUnicode_AsUnicodeAndSize(unicode, NULL);
4014 }
4015 
4016 const Py_UNICODE *
_PyUnicode_AsUnicode(PyObject * unicode)4017 _PyUnicode_AsUnicode(PyObject *unicode)
4018 {
4019     Py_ssize_t size;
4020     const Py_UNICODE *wstr;
4021 
4022     wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4023     if (wstr && wcslen(wstr) != (size_t)size) {
4024         PyErr_SetString(PyExc_ValueError, "embedded null character");
4025         return NULL;
4026     }
4027     return wstr;
4028 }
4029 
4030 
4031 Py_ssize_t
PyUnicode_GetSize(PyObject * unicode)4032 PyUnicode_GetSize(PyObject *unicode)
4033 {
4034     if (!PyUnicode_Check(unicode)) {
4035         PyErr_BadArgument();
4036         goto onError;
4037     }
4038     if (_PyUnicode_WSTR(unicode) == NULL) {
4039         if (PyUnicode_AsUnicode(unicode) == NULL)
4040             goto onError;
4041     }
4042     return PyUnicode_WSTR_LENGTH(unicode);
4043 
4044   onError:
4045     return -1;
4046 }
4047 
4048 Py_ssize_t
PyUnicode_GetLength(PyObject * unicode)4049 PyUnicode_GetLength(PyObject *unicode)
4050 {
4051     if (!PyUnicode_Check(unicode)) {
4052         PyErr_BadArgument();
4053         return -1;
4054     }
4055     if (PyUnicode_READY(unicode) == -1)
4056         return -1;
4057     return PyUnicode_GET_LENGTH(unicode);
4058 }
4059 
4060 Py_UCS4
PyUnicode_ReadChar(PyObject * unicode,Py_ssize_t index)4061 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4062 {
4063     void *data;
4064     int kind;
4065 
4066     if (!PyUnicode_Check(unicode)) {
4067         PyErr_BadArgument();
4068         return (Py_UCS4)-1;
4069     }
4070     if (PyUnicode_READY(unicode) == -1) {
4071         return (Py_UCS4)-1;
4072     }
4073     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4074         PyErr_SetString(PyExc_IndexError, "string index out of range");
4075         return (Py_UCS4)-1;
4076     }
4077     data = PyUnicode_DATA(unicode);
4078     kind = PyUnicode_KIND(unicode);
4079     return PyUnicode_READ(kind, data, index);
4080 }
4081 
4082 int
PyUnicode_WriteChar(PyObject * unicode,Py_ssize_t index,Py_UCS4 ch)4083 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4084 {
4085     if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4086         PyErr_BadArgument();
4087         return -1;
4088     }
4089     assert(PyUnicode_IS_READY(unicode));
4090     if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4091         PyErr_SetString(PyExc_IndexError, "string index out of range");
4092         return -1;
4093     }
4094     if (unicode_check_modifiable(unicode))
4095         return -1;
4096     if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4097         PyErr_SetString(PyExc_ValueError, "character out of range");
4098         return -1;
4099     }
4100     PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4101                     index, ch);
4102     return 0;
4103 }
4104 
/* Return the name of the default encoding: always "utf-8".  The returned
   string has static storage and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4110 
4111 /* create or adjust a UnicodeDecodeError */
4112 static void
make_decode_exception(PyObject ** exceptionObject,const char * encoding,const char * input,Py_ssize_t length,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4113 make_decode_exception(PyObject **exceptionObject,
4114                       const char *encoding,
4115                       const char *input, Py_ssize_t length,
4116                       Py_ssize_t startpos, Py_ssize_t endpos,
4117                       const char *reason)
4118 {
4119     if (*exceptionObject == NULL) {
4120         *exceptionObject = PyUnicodeDecodeError_Create(
4121             encoding, input, length, startpos, endpos, reason);
4122     }
4123     else {
4124         if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4125             goto onError;
4126         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4127             goto onError;
4128         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4129             goto onError;
4130     }
4131     return;
4132 
4133 onError:
4134     Py_CLEAR(*exceptionObject);
4135 }
4136 
4137 #ifdef MS_WINDOWS
4138 static int
widechar_resize(wchar_t ** buf,Py_ssize_t * size,Py_ssize_t newsize)4139 widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4140 {
4141     if (newsize > *size) {
4142         wchar_t *newbuf = *buf;
4143         if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4144             PyErr_NoMemory();
4145             return -1;
4146         }
4147         *buf = newbuf;
4148     }
4149     *size = newsize;
4150     return 0;
4151 }
4152 
4153 /* error handling callback helper:
4154    build arguments, call the callback and check the arguments,
4155    if no exception occurred, copy the replacement to the output
4156    and adjust various state variables.
4157    return 0 on success, -1 on error
4158 */
4159 
4160 static int
unicode_decode_call_errorhandler_wchar(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const char ** input,const char ** inend,Py_ssize_t * startinpos,Py_ssize_t * endinpos,PyObject ** exceptionObject,const char ** inptr,wchar_t ** buf,Py_ssize_t * bufsize,Py_ssize_t * outpos)4161 unicode_decode_call_errorhandler_wchar(
4162     const char *errors, PyObject **errorHandler,
4163     const char *encoding, const char *reason,
4164     const char **input, const char **inend, Py_ssize_t *startinpos,
4165     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4166     wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4167 {
4168     static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4169 
4170     PyObject *restuple = NULL;
4171     PyObject *repunicode = NULL;
4172     Py_ssize_t outsize;
4173     Py_ssize_t insize;
4174     Py_ssize_t requiredsize;
4175     Py_ssize_t newpos;
4176     PyObject *inputobj = NULL;
4177     wchar_t *repwstr;
4178     Py_ssize_t repwlen;
4179 
4180     if (*errorHandler == NULL) {
4181         *errorHandler = PyCodec_LookupError(errors);
4182         if (*errorHandler == NULL)
4183             goto onError;
4184     }
4185 
4186     make_decode_exception(exceptionObject,
4187         encoding,
4188         *input, *inend - *input,
4189         *startinpos, *endinpos,
4190         reason);
4191     if (*exceptionObject == NULL)
4192         goto onError;
4193 
4194     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4195     if (restuple == NULL)
4196         goto onError;
4197     if (!PyTuple_Check(restuple)) {
4198         PyErr_SetString(PyExc_TypeError, &argparse[3]);
4199         goto onError;
4200     }
4201     if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4202         goto onError;
4203 
4204     /* Copy back the bytes variables, which might have been modified by the
4205        callback */
4206     inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4207     if (!inputobj)
4208         goto onError;
4209     *input = PyBytes_AS_STRING(inputobj);
4210     insize = PyBytes_GET_SIZE(inputobj);
4211     *inend = *input + insize;
4212     /* we can DECREF safely, as the exception has another reference,
4213        so the object won't go away. */
4214     Py_DECREF(inputobj);
4215 
4216     if (newpos<0)
4217         newpos = insize+newpos;
4218     if (newpos<0 || newpos>insize) {
4219         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4220         goto onError;
4221     }
4222 
4223     repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4224     if (repwstr == NULL)
4225         goto onError;
4226     /* need more space? (at least enough for what we
4227        have+the replacement+the rest of the string (starting
4228        at the new input position), so we won't have to check space
4229        when there are no errors in the rest of the string) */
4230     requiredsize = *outpos;
4231     if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4232         goto overflow;
4233     requiredsize += repwlen;
4234     if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4235         goto overflow;
4236     requiredsize += insize - newpos;
4237     outsize = *bufsize;
4238     if (requiredsize > outsize) {
4239         if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4240             requiredsize = 2*outsize;
4241         if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4242             goto onError;
4243         }
4244     }
4245     wcsncpy(*buf + *outpos, repwstr, repwlen);
4246     *outpos += repwlen;
4247     *endinpos = newpos;
4248     *inptr = *input + newpos;
4249 
4250     /* we made it! */
4251     Py_DECREF(restuple);
4252     return 0;
4253 
4254   overflow:
4255     PyErr_SetString(PyExc_OverflowError,
4256                     "decoded result is too long for a Python string");
4257 
4258   onError:
4259     Py_XDECREF(restuple);
4260     return -1;
4261 }
4262 #endif   /* MS_WINDOWS */
4263 
/* Error handling callback helper (_PyUnicodeWriter variant):
   build the exception, call the error handler and check its result; if no
   exception occurred, write the replacement string to `writer` and adjust
   the input state variables.

   *errorHandler is looked up from `errors` on first use and cached.
   *exceptionObject is created or updated by make_decode_exception().
   *input and *inend are reloaded from the exception object after the call;
   *endinpos and *inptr are moved to the position returned by the handler.

   Return 0 on success, -1 on error. */
static int
unicode_decode_call_errorhandler_writer(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
{
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t insize;
    Py_ssize_t newpos;
    Py_ssize_t replen;
    Py_ssize_t remain;
    PyObject *inputobj = NULL;
    int need_to_grow = 0;
    const char *new_inptr;

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[3] skips the "Un;" format prefix, leaving the message */
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    /* Number of input bytes after *endinpos in the input as it was before
       the call; must be computed before *input and *inend are replaced. */
    remain = *inend - *input - *endinpos;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position is relative to the end of the input */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    replen = PyUnicode_GET_LENGTH(repunicode);
    if (replen > 1) {
        /* A replacement longer than one character raises the minimum
           length of the output by the extra characters. */
        writer->min_length += replen - 1;
        need_to_grow = 1;
    }
    new_inptr = *input + newpos;
    if (*inend - new_inptr > remain) {
        /* We don't know the decoding algorithm here so we make the worst
           assumption that one byte decodes to one unicode character.
           If unfortunately one byte could decode to more unicode characters,
           the decoder may write out-of-bound then.  Is it possible for the
           algorithms using this function? */
        writer->min_length += *inend - new_inptr - remain;
        need_to_grow = 1;
    }
    if (need_to_grow) {
        /* Enable overallocation and prepare the writer for the updated
           minimum length. */
        writer->overallocate = 1;
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
            goto onError;
    }
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
        goto onError;

    /* Resume decoding at the position chosen by the error handler */
    *endinpos = newpos;
    *inptr = new_inptr;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  onError:
    Py_XDECREF(restuple);
    return -1;
}
4363 
4364 /* --- UTF-7 Codec -------------------------------------------------------- */
4365 
4366 /* See RFC2152 for details.  We encode conservatively and decode liberally. */
4367 
4368 /* Three simple macros defining base-64. */
4369 
4370 /* Is c a base-64 character? */
4371 
4372 #define IS_BASE64(c) \
4373     (((c) >= 'A' && (c) <= 'Z') ||     \
4374      ((c) >= 'a' && (c) <= 'z') ||     \
4375      ((c) >= '0' && (c) <= '9') ||     \
4376      (c) == '+' || (c) == '/')
4377 
4378 /* given that c is a base-64 character, what is its base-64 value? */
4379 
4380 #define FROM_BASE64(c)                                                  \
4381     (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
4382      ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
4383      ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
4384      (c) == '+' ? 62 : 63)
4385 
4386 /* What is the base-64 character of the bottom 6 bits of n? */
4387 
4388 #define TO_BASE64(n)  \
4389     ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
4390 
4391 /* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
4392  * decoded as itself.  We are permissive on decoding; the only ASCII
4393  * byte not decoding to itself is the + which begins a base64
4394  * string. */
4395 
4396 #define DECODE_DIRECT(c)                                \
4397     ((c) <= 127 && (c) != '+')
4398 
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by ASCII code (0-127),
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only lookup data, hence const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4432 
/* ENCODE_DIRECT: true when the UTF-7 encoder may emit character c as
 * itself.  Only code points 1..127 qualify; Set D characters always do,
 * Set O characters only when directO is true and whitespace only when
 * directWS is true.  RFC2152 leaves the treatment of Set O and
 * whitespace to the application, hence the two flags.
 * c is evaluated several times and is used as an index into
 * utf7_category[] once it is known to be in range. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     (utf7_category[(c)] == 0 ||                        \
      (utf7_category[(c)] == 1 && (directO)) ||         \
      (utf7_category[(c)] == 2 && (directWS))))
4444 
4445 PyObject *
PyUnicode_DecodeUTF7(const char * s,Py_ssize_t size,const char * errors)4446 PyUnicode_DecodeUTF7(const char *s,
4447                      Py_ssize_t size,
4448                      const char *errors)
4449 {
4450     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4451 }
4452 
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * Parameters:
 *   s, size   input bytes and their count.
 *   errors    error handler name, forwarded to
 *             unicode_decode_call_errorhandler_writer().
 *   consumed  if non-NULL, receives the number of input bytes actually
 *             decoded; input ending inside a shift sequence is then not
 *             an error, and *consumed is set to the offset of the '+'
 *             that opened the sequence.  If NULL, the input is treated
 *             as complete.
 *
 * Returns the decoded string object, or NULL if writing to the output or
 * the error handler failed.
 */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    /* Input offsets delimiting the span reported to the error handler.
       startinpos is also the offset of the '+' of the current shift
       sequence. */
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    /* Non-zero while inside a '+...' base-64 section. */
    int inShift = 0;
    /* Writer position at the point the current shift sequence began. */
    Py_ssize_t shiftOutStart;
    /* Bit accumulator for base-64 data and the number of valid bits in it. */
    unsigned int base64bits = 0;
    unsigned long base64buffer = 0;
    /* Pending high surrogate waiting for its low half (0 if none). */
    Py_UCS4 surrogate = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      /* Re-entry point used after the end-of-input error handler has
         left more input to process. */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* Unpaired high surrogate: emit it as is and
                               process outCh normally below. */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A pending high surrogate is emitted only when the
                   terminating byte decodes as itself; otherwise it is
                   discarded. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else if (s < e && !IS_BASE64(*s)) {
                s++;
                errmsg = "ill-formed sequence";
                goto utf7Error;
            }
            else { /* begin base64-encoded section */
                /* Also taken when '+' is the last input byte (s == e). */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            /* Byte >= 128 outside a shift sequence. */
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* Report input[startinpos:endinpos] to the error handler.  The
           handler receives pointers to starts, e and s, so it may adjust
           them. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* The handler may have repositioned s before the end of the
               input; if so, resume decoding. */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* Characters from the incomplete shift sequence have been
                   written and the writer holds non-ASCII data: build the
                   result from only the first shiftOutStart characters
                   instead of finishing the writer.
                   NOTE(review): presumably so the discarded characters do
                   not affect the kind of the result -- confirm. */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4656 
4657 
4658 PyObject *
_PyUnicode_EncodeUTF7(PyObject * str,int base64SetO,int base64WhiteSpace,const char * errors)4659 _PyUnicode_EncodeUTF7(PyObject *str,
4660                       int base64SetO,
4661                       int base64WhiteSpace,
4662                       const char *errors)
4663 {
4664     int kind;
4665     void *data;
4666     Py_ssize_t len;
4667     PyObject *v;
4668     int inShift = 0;
4669     Py_ssize_t i;
4670     unsigned int base64bits = 0;
4671     unsigned long base64buffer = 0;
4672     char * out;
4673     char * start;
4674 
4675     if (PyUnicode_READY(str) == -1)
4676         return NULL;
4677     kind = PyUnicode_KIND(str);
4678     data = PyUnicode_DATA(str);
4679     len = PyUnicode_GET_LENGTH(str);
4680 
4681     if (len == 0)
4682         return PyBytes_FromStringAndSize(NULL, 0);
4683 
4684     /* It might be possible to tighten this worst case */
4685     if (len > PY_SSIZE_T_MAX / 8)
4686         return PyErr_NoMemory();
4687     v = PyBytes_FromStringAndSize(NULL, len * 8);
4688     if (v == NULL)
4689         return NULL;
4690 
4691     start = out = PyBytes_AS_STRING(v);
4692     for (i = 0; i < len; ++i) {
4693         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4694 
4695         if (inShift) {
4696             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4697                 /* shifting out */
4698                 if (base64bits) { /* output remaining bits */
4699                     *out++ = TO_BASE64(base64buffer << (6-base64bits));
4700                     base64buffer = 0;
4701                     base64bits = 0;
4702                 }
4703                 inShift = 0;
4704                 /* Characters not in the BASE64 set implicitly unshift the sequence
4705                    so no '-' is required, except if the character is itself a '-' */
4706                 if (IS_BASE64(ch) || ch == '-') {
4707                     *out++ = '-';
4708                 }
4709                 *out++ = (char) ch;
4710             }
4711             else {
4712                 goto encode_char;
4713             }
4714         }
4715         else { /* not in a shift sequence */
4716             if (ch == '+') {
4717                 *out++ = '+';
4718                         *out++ = '-';
4719             }
4720             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4721                 *out++ = (char) ch;
4722             }
4723             else {
4724                 *out++ = '+';
4725                 inShift = 1;
4726                 goto encode_char;
4727             }
4728         }
4729         continue;
4730 encode_char:
4731         if (ch >= 0x10000) {
4732             assert(ch <= MAX_UNICODE);
4733 
4734             /* code first surrogate */
4735             base64bits += 16;
4736             base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4737             while (base64bits >= 6) {
4738                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4739                 base64bits -= 6;
4740             }
4741             /* prepare second surrogate */
4742             ch = Py_UNICODE_LOW_SURROGATE(ch);
4743         }
4744         base64bits += 16;
4745         base64buffer = (base64buffer << 16) | ch;
4746         while (base64bits >= 6) {
4747             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4748             base64bits -= 6;
4749         }
4750     }
4751     if (base64bits)
4752         *out++= TO_BASE64(base64buffer << (6-base64bits) );
4753     if (inShift)
4754         *out++ = '-';
4755     if (_PyBytes_Resize(&v, out - start) < 0)
4756         return NULL;
4757     return v;
4758 }
4759 PyObject *
PyUnicode_EncodeUTF7(const Py_UNICODE * s,Py_ssize_t size,int base64SetO,int base64WhiteSpace,const char * errors)4760 PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4761                      Py_ssize_t size,
4762                      int base64SetO,
4763                      int base64WhiteSpace,
4764                      const char *errors)
4765 {
4766     PyObject *result;
4767     PyObject *tmp = PyUnicode_FromWideChar(s, size);
4768     if (tmp == NULL)
4769         return NULL;
4770     result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4771                                    base64WhiteSpace, errors);
4772     Py_DECREF(tmp);
4773     return result;
4774 }
4775 
4776 #undef IS_BASE64
4777 #undef FROM_BASE64
4778 #undef TO_BASE64
4779 #undef DECODE_DIRECT
4780 #undef ENCODE_DIRECT
4781 
4782 /* --- UTF-8 Codec -------------------------------------------------------- */
4783 
4784 PyObject *
PyUnicode_DecodeUTF8(const char * s,Py_ssize_t size,const char * errors)4785 PyUnicode_DecodeUTF8(const char *s,
4786                      Py_ssize_t size,
4787                      const char *errors)
4788 {
4789     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4790 }
4791 
4792 #include "stringlib/asciilib.h"
4793 #include "stringlib/codecs.h"
4794 #include "stringlib/undef.h"
4795 
4796 #include "stringlib/ucs1lib.h"
4797 #include "stringlib/codecs.h"
4798 #include "stringlib/undef.h"
4799 
4800 #include "stringlib/ucs2lib.h"
4801 #include "stringlib/codecs.h"
4802 #include "stringlib/undef.h"
4803 
4804 #include "stringlib/ucs4lib.h"
4805 #include "stringlib/codecs.h"
4806 #include "stringlib/undef.h"
4807 
4808 /* Mask to quickly check whether a C 'long' contains a
4809    non-ASCII, UTF8-encoded char. */
4810 #if (SIZEOF_LONG == 8)
4811 # define ASCII_CHAR_MASK 0x8080808080808080UL
4812 #elif (SIZEOF_LONG == 4)
4813 # define ASCII_CHAR_MASK 0x80808080UL
4814 #else
4815 # error C 'long' size should be either 4 or 8!
4816 #endif
4817 
/* Copy the leading run of ASCII bytes (values < 0x80) from [start, end)
   into dest and return its length.  Scanning stops at the first byte with
   the high bit set or at end, whichever comes first.

   Where possible the input is examined one C 'long' at a time using
   ASCII_CHAR_MASK; the remaining bytes are handled individually. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;
    /* Limit for the long-at-a-time loops, derived from end with
       _Py_ALIGN_DOWN(end, SIZEOF_LONG). */
    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);

    /*
     * Issue #17237: m68k is a bit different from most architectures in
     * that objects do not use "natural alignment" - for example, int and
     * long are only aligned at 2-byte boundaries.  Therefore the assert()
     * won't work; also, tests have shown that skipping the "optimised
     * version" will even speed up m68k.
     */
#if !defined(__m68k__)
#if SIZEOF_LONG <= SIZEOF_VOID_P
    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
        /* Source and destination are both long-aligned: copy whole longs
           while they contain only ASCII, then finish byte by byte. */
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Help allocation */
        const char *_p = p;
        Py_UCS1 * q = dest;
        while (_p < aligned_end) {
            unsigned long value = *(const unsigned long *) _p;
            if (value & ASCII_CHAR_MASK)
                break;
            *((unsigned long *)q) = value;
            _p += SIZEOF_LONG;
            q += SIZEOF_LONG;
        }
        p = _p;
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
#endif
    /* Generic path: only scan here (a long at a time whenever p is
       aligned); the bytes found are copied in one go by the memcpy()
       below. */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
            /* Help allocation */
            const char *_p = p;
            while (_p < aligned_end) {
                unsigned long value = *(const unsigned long *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_LONG;
            }
            p = _p;
            /* Everything up to end was ASCII. */
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}
4881 
/* Core UTF-8 decoder.

   s, size        input bytes and their count.
   error_handler  pre-resolved error handler; when it is
                  _Py_ERROR_UNKNOWN it is looked up from `errors` with
                  _Py_GetErrorHandler() the first time an error occurs.
   errors         error handler name, also passed to the generic error
                  callback.
   consumed       if non-NULL, receives the number of input bytes decoded;
                  a truncated sequence at the end of the input is then left
                  undecoded instead of being reported as an error.

   Returns the decoded string object, or NULL on failure. */
static PyObject *
unicode_decode_utf8(const char *s, Py_ssize_t size,
                    _Py_error_handler error_handler, const char *errors,
                    Py_ssize_t *consumed)
{
    _PyUnicodeWriter writer;
    const char *starts = s;
    const char *end = s + size;

    /* Input offsets delimiting the undecodable span of the current error. */
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *errmsg = "";
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        if (consumed)
            *consumed = 1;
        return get_latin1_char((unsigned char)s[0]);
    }

    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    /* Copy the leading ASCII run straight into the writer's buffer. */
    writer.pos = ascii_decode(s, end, writer.data);
    s += writer.pos;
    while (s < end) {
        Py_UCS4 ch;
        int kind = writer.kind;

        /* Run the stringlib decoder specialised for the writer's current
           buffer kind.  It advances s and writer.pos and returns either a
           status code (0-4, handled by the switch below) or a character
           for the caller to write.
           NOTE(review): exact contract is defined in stringlib/codecs.h,
           which is not visible here -- confirm. */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (PyUnicode_IS_ASCII(writer.buffer))
                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
            else
                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else if (kind == PyUnicode_2BYTE_KIND) {
            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else {
            assert(kind == PyUnicode_4BYTE_KIND);
            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
        }

        switch (ch) {
        case 0:
            /* Either all input was decoded, or it ends with an incomplete
               sequence; the latter is not an error for a stateful caller. */
            if (s == end || consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = s - starts;
            endinpos = end - starts;
            break;
        case 1:
            errmsg = "invalid start byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            break;
        case 2:
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
            {
                /* Truncated surrogate code in range D800-DFFF */
                goto End;
            }
            /* fall through */
        case 3:
        case 4:
            /* The undecodable span covers ch - 1 bytes. */
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
            endinpos = startinpos + ch - 1;
            break;
        default:
            /* A decoded character handed back by the stringlib decoder;
               writing it through the writer lets the buffer be widened if
               necessary. */
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        /* Only reached on a decoding error. */
        if (error_handler == _Py_ERROR_UNKNOWN)
            error_handler = _Py_GetErrorHandler(errors);

        switch (error_handler) {
        case _Py_ERROR_IGNORE:
            /* Skip the offending bytes. */
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_REPLACE:
            /* Emit U+FFFD once for the whole offending span. */
            if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
                goto onError;
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_SURROGATEESCAPE:
        {
            Py_ssize_t i;

            /* Each offending byte b becomes the code point 0xdc00 + b,
               which requires at least a 2-byte buffer. */
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
                goto onError;
            for (i=startinpos; i<endinpos; i++) {
                ch = (Py_UCS4)(unsigned char)(starts[i]);
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
                                ch + 0xdc00);
                writer.pos++;
            }
            s += (endinpos - startinpos);
            break;
        }

        default:
            /* Every other handler goes through the generic callback,
               which receives pointers to starts, end and s and may
               therefore adjust them. */
            if (unicode_decode_call_errorhandler_writer(
                    errors, &error_handler_obj,
                    "utf-8", errmsg,
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
        }
    }

End:
    if (consumed)
        *consumed = s - starts;

    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

onError:
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
5020 
5021 
5022 PyObject *
PyUnicode_DecodeUTF8Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)5023 PyUnicode_DecodeUTF8Stateful(const char *s,
5024                              Py_ssize_t size,
5025                              const char *errors,
5026                              Py_ssize_t *consumed)
5027 {
5028     return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5029 }
5030 
5031 
/* UTF-8 decoder producing a wchar_t string.

   'errors' selects the error handling: _Py_ERROR_STRICT,
   _Py_ERROR_SURROGATEESCAPE (each undecodable byte b is decoded as the
   code unit 0xDC00 + b) or _Py_ERROR_SURROGATEPASS (well-formed three-byte
   sequences rejected by the strict decoder are decoded as is).

   On success, return 0, write a pointer to a newly allocated wide
   character string into *wstr (use PyMem_RawFree() to free the memory) and
   write the output length (in number of wchar_t units) into *wlen (if wlen
   is set).

   On memory allocation failure, return -1.

   On decoding error, return -2. If wlen is non-NULL, write the start of the
   illegal byte sequence into *wlen. If reason is not NULL, write the
   decoding error message into *reason.

   If 'errors' is any other error handler, return -3. */
int
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
                 const char **reason, _Py_error_handler errors)
{
    const char *orig_s = s;
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    int surrogateescape = 0;
    int surrogatepass = 0;
    switch (errors)
    {
    case _Py_ERROR_STRICT:
        break;
    case _Py_ERROR_SURROGATEESCAPE:
        surrogateescape = 1;
        break;
    case _Py_ERROR_SURROGATEPASS:
        surrogatepass = 1;
        break;
    default:
        return -3;
    }

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
        return -1;
    }

    /* One wchar_t per input byte plus the terminating null. */
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode) {
        return -1;
    }

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
        /* The stringlib decoder advances s and outpos itself and returns
           either a small status code or a character it did not store.
           NOTE(review): exact contract is defined in stringlib/codecs.h,
           which is not visible here -- confirm. */
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            Py_UNREACHABLE();
#else
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* write a surrogate pair */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* ch is a status code: 0 with s == e means that the whole
               input has been decoded. */
            if (!ch && s == e) {
                break;
            }

            if (surrogateescape) {
                /* Escape a single byte and resume decoding after it. */
                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
            }
            else {
                /* Is it a valid three-byte code? */
                if (surrogatepass
                    && (e - s) >= 3
                    && (s[0] & 0xf0) == 0xe0
                    && (s[1] & 0xc0) == 0x80
                    && (s[2] & 0xc0) == 0x80)
                {
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
                    s += 3;
                    unicode[outpos++] = ch;
                }
                else {
                    /* Decoding error: release the buffer and report the
                       reason and the position of the failure. */
                    PyMem_RawFree(unicode );
                    if (reason != NULL) {
                        switch (ch) {
                        case 0:
                            *reason = "unexpected end of data";
                            break;
                        case 1:
                            *reason = "invalid start byte";
                            break;
                        /* 2, 3, 4 */
                        default:
                            *reason = "invalid continuation byte";
                            break;
                        }
                    }
                    if (wlen != NULL) {
                        *wlen = s - orig_s;
                    }
                    return -2;
                }
            }
        }
    }
    unicode[outpos] = L'\0';
    if (wlen) {
        *wlen = outpos;
    }
    *wstr = unicode;
    return 0;
}
5151 
5152 
5153 wchar_t*
_Py_DecodeUTF8_surrogateescape(const char * arg,Py_ssize_t arglen,size_t * wlen)5154 _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5155                                size_t *wlen)
5156 {
5157     wchar_t *wstr;
5158     int res = _Py_DecodeUTF8Ex(arg, arglen,
5159                                &wstr, wlen,
5160                                NULL, _Py_ERROR_SURROGATEESCAPE);
5161     if (res != 0) {
5162         /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5163         assert(res != -3);
5164         if (wlen) {
5165             *wlen = (size_t)res;
5166         }
5167         return NULL;
5168     }
5169     return wstr;
5170 }
5171 
5172 
5173 /* UTF-8 encoder using the surrogateescape error handler .
5174 
5175    On success, return 0 and write the newly allocated character string (use
5176    PyMem_Free() to free the memory) into *str.
5177 
5178    On encoding failure, return -2 and write the position of the invalid
5179    surrogate character into *error_pos (if error_pos is set) and the decoding
5180    error message into *reason (if reason is set).
5181 
5182    On memory allocation failure, return -1. */
5183 int
_Py_EncodeUTF8Ex(const wchar_t * text,char ** str,size_t * error_pos,const char ** reason,int raw_malloc,_Py_error_handler errors)5184 _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5185                  const char **reason, int raw_malloc, _Py_error_handler errors)
5186 {
5187     const Py_ssize_t max_char_size = 4;
5188     Py_ssize_t len = wcslen(text);
5189 
5190     assert(len >= 0);
5191 
5192     int surrogateescape = 0;
5193     int surrogatepass = 0;
5194     switch (errors)
5195     {
5196     case _Py_ERROR_STRICT:
5197         break;
5198     case _Py_ERROR_SURROGATEESCAPE:
5199         surrogateescape = 1;
5200         break;
5201     case _Py_ERROR_SURROGATEPASS:
5202         surrogatepass = 1;
5203         break;
5204     default:
5205         return -3;
5206     }
5207 
5208     if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5209         return -1;
5210     }
5211     char *bytes;
5212     if (raw_malloc) {
5213         bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5214     }
5215     else {
5216         bytes = PyMem_Malloc((len + 1) * max_char_size);
5217     }
5218     if (bytes == NULL) {
5219         return -1;
5220     }
5221 
5222     char *p = bytes;
5223     Py_ssize_t i;
5224     for (i = 0; i < len; ) {
5225         Py_ssize_t ch_pos = i;
5226         Py_UCS4 ch = text[i];
5227         i++;
5228 #if Py_UNICODE_SIZE == 2
5229         if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5230             && i < len
5231             && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5232         {
5233             ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5234             i++;
5235         }
5236 #endif
5237 
5238         if (ch < 0x80) {
5239             /* Encode ASCII */
5240             *p++ = (char) ch;
5241 
5242         }
5243         else if (ch < 0x0800) {
5244             /* Encode Latin-1 */
5245             *p++ = (char)(0xc0 | (ch >> 6));
5246             *p++ = (char)(0x80 | (ch & 0x3f));
5247         }
5248         else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5249             /* surrogateescape error handler */
5250             if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5251                 if (error_pos != NULL) {
5252                     *error_pos = (size_t)ch_pos;
5253                 }
5254                 if (reason != NULL) {
5255                     *reason = "encoding error";
5256                 }
5257                 if (raw_malloc) {
5258                     PyMem_RawFree(bytes);
5259                 }
5260                 else {
5261                     PyMem_Free(bytes);
5262                 }
5263                 return -2;
5264             }
5265             *p++ = (char)(ch & 0xff);
5266         }
5267         else if (ch < 0x10000) {
5268             *p++ = (char)(0xe0 | (ch >> 12));
5269             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5270             *p++ = (char)(0x80 | (ch & 0x3f));
5271         }
5272         else {  /* ch >= 0x10000 */
5273             assert(ch <= MAX_UNICODE);
5274             /* Encode UCS4 Unicode ordinals */
5275             *p++ = (char)(0xf0 | (ch >> 18));
5276             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5277             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5278             *p++ = (char)(0x80 | (ch & 0x3f));
5279         }
5280     }
5281     *p++ = '\0';
5282 
5283     size_t final_size = (p - bytes);
5284     char *bytes2;
5285     if (raw_malloc) {
5286         bytes2 = PyMem_RawRealloc(bytes, final_size);
5287     }
5288     else {
5289         bytes2 = PyMem_Realloc(bytes, final_size);
5290     }
5291     if (bytes2 == NULL) {
5292         if (error_pos != NULL) {
5293             *error_pos = (size_t)-1;
5294         }
5295         if (raw_malloc) {
5296             PyMem_RawFree(bytes);
5297         }
5298         else {
5299             PyMem_Free(bytes);
5300         }
5301         return -1;
5302     }
5303     *str = bytes2;
5304     return 0;
5305 }
5306 
5307 
5308 /* Primary internal function which creates utf8 encoded bytes objects.
5309 
5310    Allocation strategy:  if the string is short, convert into a stack buffer
5311    and allocate exactly as much space needed at the end.  Else allocate the
5312    maximum possible needed (4 result bytes per Unicode character), and return
5313    the excess memory at the end.
5314 */
5315 static PyObject *
unicode_encode_utf8(PyObject * unicode,_Py_error_handler error_handler,const char * errors)5316 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5317                     const char *errors)
5318 {
5319     enum PyUnicode_Kind kind;
5320     void *data;
5321     Py_ssize_t size;
5322 
5323     if (!PyUnicode_Check(unicode)) {
5324         PyErr_BadArgument();
5325         return NULL;
5326     }
5327 
5328     if (PyUnicode_READY(unicode) == -1)
5329         return NULL;
5330 
5331     if (PyUnicode_UTF8(unicode))
5332         return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5333                                          PyUnicode_UTF8_LENGTH(unicode));
5334 
5335     kind = PyUnicode_KIND(unicode);
5336     data = PyUnicode_DATA(unicode);
5337     size = PyUnicode_GET_LENGTH(unicode);
5338 
5339     switch (kind) {
5340     default:
5341         Py_UNREACHABLE();
5342     case PyUnicode_1BYTE_KIND:
5343         /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5344         assert(!PyUnicode_IS_ASCII(unicode));
5345         return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
5346     case PyUnicode_2BYTE_KIND:
5347         return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
5348     case PyUnicode_4BYTE_KIND:
5349         return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
5350     }
5351 }
5352 
5353 PyObject *
_PyUnicode_AsUTF8String(PyObject * unicode,const char * errors)5354 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5355 {
5356     return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5357 }
5358 
5359 
5360 PyObject *
PyUnicode_EncodeUTF8(const Py_UNICODE * s,Py_ssize_t size,const char * errors)5361 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5362                      Py_ssize_t size,
5363                      const char *errors)
5364 {
5365     PyObject *v, *unicode;
5366 
5367     unicode = PyUnicode_FromWideChar(s, size);
5368     if (unicode == NULL)
5369         return NULL;
5370     v = _PyUnicode_AsUTF8String(unicode, errors);
5371     Py_DECREF(unicode);
5372     return v;
5373 }
5374 
5375 PyObject *
PyUnicode_AsUTF8String(PyObject * unicode)5376 PyUnicode_AsUTF8String(PyObject *unicode)
5377 {
5378     return _PyUnicode_AsUTF8String(unicode, NULL);
5379 }
5380 
5381 /* --- UTF-32 Codec ------------------------------------------------------- */
5382 
5383 PyObject *
PyUnicode_DecodeUTF32(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5384 PyUnicode_DecodeUTF32(const char *s,
5385                       Py_ssize_t size,
5386                       const char *errors,
5387                       int *byteorder)
5388 {
5389     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5390 }
5391 
5392 PyObject *
PyUnicode_DecodeUTF32Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)5393 PyUnicode_DecodeUTF32Stateful(const char *s,
5394                               Py_ssize_t size,
5395                               const char *errors,
5396                               int *byteorder,
5397                               Py_ssize_t *consumed)
5398 {
5399     const char *starts = s;
5400     Py_ssize_t startinpos;
5401     Py_ssize_t endinpos;
5402     _PyUnicodeWriter writer;
5403     const unsigned char *q, *e;
5404     int le, bo = 0;       /* assume native ordering by default */
5405     const char *encoding;
5406     const char *errmsg = "";
5407     PyObject *errorHandler = NULL;
5408     PyObject *exc = NULL;
5409 
5410     q = (const unsigned char *)s;
5411     e = q + size;
5412 
5413     if (byteorder)
5414         bo = *byteorder;
5415 
5416     /* Check for BOM marks (U+FEFF) in the input and adjust current
5417        byte order setting accordingly. In native mode, the leading BOM
5418        mark is skipped, in all other modes, it is copied to the output
5419        stream as-is (giving a ZWNBSP character). */
5420     if (bo == 0 && size >= 4) {
5421         Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5422         if (bom == 0x0000FEFF) {
5423             bo = -1;
5424             q += 4;
5425         }
5426         else if (bom == 0xFFFE0000) {
5427             bo = 1;
5428             q += 4;
5429         }
5430         if (byteorder)
5431             *byteorder = bo;
5432     }
5433 
5434     if (q == e) {
5435         if (consumed)
5436             *consumed = size;
5437         _Py_RETURN_UNICODE_EMPTY();
5438     }
5439 
5440 #ifdef WORDS_BIGENDIAN
5441     le = bo < 0;
5442 #else
5443     le = bo <= 0;
5444 #endif
5445     encoding = le ? "utf-32-le" : "utf-32-be";
5446 
5447     _PyUnicodeWriter_Init(&writer);
5448     writer.min_length = (e - q + 3) / 4;
5449     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5450         goto onError;
5451 
5452     while (1) {
5453         Py_UCS4 ch = 0;
5454         Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5455 
5456         if (e - q >= 4) {
5457             enum PyUnicode_Kind kind = writer.kind;
5458             void *data = writer.data;
5459             const unsigned char *last = e - 4;
5460             Py_ssize_t pos = writer.pos;
5461             if (le) {
5462                 do {
5463                     ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5464                     if (ch > maxch)
5465                         break;
5466                     if (kind != PyUnicode_1BYTE_KIND &&
5467                         Py_UNICODE_IS_SURROGATE(ch))
5468                         break;
5469                     PyUnicode_WRITE(kind, data, pos++, ch);
5470                     q += 4;
5471                 } while (q <= last);
5472             }
5473             else {
5474                 do {
5475                     ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5476                     if (ch > maxch)
5477                         break;
5478                     if (kind != PyUnicode_1BYTE_KIND &&
5479                         Py_UNICODE_IS_SURROGATE(ch))
5480                         break;
5481                     PyUnicode_WRITE(kind, data, pos++, ch);
5482                     q += 4;
5483                 } while (q <= last);
5484             }
5485             writer.pos = pos;
5486         }
5487 
5488         if (Py_UNICODE_IS_SURROGATE(ch)) {
5489             errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5490             startinpos = ((const char *)q) - starts;
5491             endinpos = startinpos + 4;
5492         }
5493         else if (ch <= maxch) {
5494             if (q == e || consumed)
5495                 break;
5496             /* remaining bytes at the end? (size should be divisible by 4) */
5497             errmsg = "truncated data";
5498             startinpos = ((const char *)q) - starts;
5499             endinpos = ((const char *)e) - starts;
5500         }
5501         else {
5502             if (ch < 0x110000) {
5503                 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5504                     goto onError;
5505                 q += 4;
5506                 continue;
5507             }
5508             errmsg = "code point not in range(0x110000)";
5509             startinpos = ((const char *)q) - starts;
5510             endinpos = startinpos + 4;
5511         }
5512 
5513         /* The remaining input chars are ignored if the callback
5514            chooses to skip the input */
5515         if (unicode_decode_call_errorhandler_writer(
5516                 errors, &errorHandler,
5517                 encoding, errmsg,
5518                 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5519                 &writer))
5520             goto onError;
5521     }
5522 
5523     if (consumed)
5524         *consumed = (const char *)q-starts;
5525 
5526     Py_XDECREF(errorHandler);
5527     Py_XDECREF(exc);
5528     return _PyUnicodeWriter_Finish(&writer);
5529 
5530   onError:
5531     _PyUnicodeWriter_Dealloc(&writer);
5532     Py_XDECREF(errorHandler);
5533     Py_XDECREF(exc);
5534     return NULL;
5535 }
5536 
5537 PyObject *
_PyUnicode_EncodeUTF32(PyObject * str,const char * errors,int byteorder)5538 _PyUnicode_EncodeUTF32(PyObject *str,
5539                        const char *errors,
5540                        int byteorder)
5541 {
5542     enum PyUnicode_Kind kind;
5543     const void *data;
5544     Py_ssize_t len;
5545     PyObject *v;
5546     uint32_t *out;
5547 #if PY_LITTLE_ENDIAN
5548     int native_ordering = byteorder <= 0;
5549 #else
5550     int native_ordering = byteorder >= 0;
5551 #endif
5552     const char *encoding;
5553     Py_ssize_t nsize, pos;
5554     PyObject *errorHandler = NULL;
5555     PyObject *exc = NULL;
5556     PyObject *rep = NULL;
5557 
5558     if (!PyUnicode_Check(str)) {
5559         PyErr_BadArgument();
5560         return NULL;
5561     }
5562     if (PyUnicode_READY(str) == -1)
5563         return NULL;
5564     kind = PyUnicode_KIND(str);
5565     data = PyUnicode_DATA(str);
5566     len = PyUnicode_GET_LENGTH(str);
5567 
5568     if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5569         return PyErr_NoMemory();
5570     nsize = len + (byteorder == 0);
5571     v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5572     if (v == NULL)
5573         return NULL;
5574 
5575     /* output buffer is 4-bytes aligned */
5576     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5577     out = (uint32_t *)PyBytes_AS_STRING(v);
5578     if (byteorder == 0)
5579         *out++ = 0xFEFF;
5580     if (len == 0)
5581         goto done;
5582 
5583     if (byteorder == -1)
5584         encoding = "utf-32-le";
5585     else if (byteorder == 1)
5586         encoding = "utf-32-be";
5587     else
5588         encoding = "utf-32";
5589 
5590     if (kind == PyUnicode_1BYTE_KIND) {
5591         ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5592         goto done;
5593     }
5594 
5595     pos = 0;
5596     while (pos < len) {
5597         Py_ssize_t repsize, moreunits;
5598 
5599         if (kind == PyUnicode_2BYTE_KIND) {
5600             pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5601                                         &out, native_ordering);
5602         }
5603         else {
5604             assert(kind == PyUnicode_4BYTE_KIND);
5605             pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5606                                         &out, native_ordering);
5607         }
5608         if (pos == len)
5609             break;
5610 
5611         rep = unicode_encode_call_errorhandler(
5612                 errors, &errorHandler,
5613                 encoding, "surrogates not allowed",
5614                 str, &exc, pos, pos + 1, &pos);
5615         if (!rep)
5616             goto error;
5617 
5618         if (PyBytes_Check(rep)) {
5619             repsize = PyBytes_GET_SIZE(rep);
5620             if (repsize & 3) {
5621                 raise_encode_exception(&exc, encoding,
5622                                        str, pos - 1, pos,
5623                                        "surrogates not allowed");
5624                 goto error;
5625             }
5626             moreunits = repsize / 4;
5627         }
5628         else {
5629             assert(PyUnicode_Check(rep));
5630             if (PyUnicode_READY(rep) < 0)
5631                 goto error;
5632             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5633             if (!PyUnicode_IS_ASCII(rep)) {
5634                 raise_encode_exception(&exc, encoding,
5635                                        str, pos - 1, pos,
5636                                        "surrogates not allowed");
5637                 goto error;
5638             }
5639         }
5640 
5641         /* four bytes are reserved for each surrogate */
5642         if (moreunits > 1) {
5643             Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
5644             if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
5645                 /* integer overflow */
5646                 PyErr_NoMemory();
5647                 goto error;
5648             }
5649             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
5650                 goto error;
5651             out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
5652         }
5653 
5654         if (PyBytes_Check(rep)) {
5655             memcpy(out, PyBytes_AS_STRING(rep), repsize);
5656             out += moreunits;
5657         } else /* rep is unicode */ {
5658             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5659             ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5660                                  &out, native_ordering);
5661         }
5662 
5663         Py_CLEAR(rep);
5664     }
5665 
5666     /* Cut back to size actually needed. This is necessary for, for example,
5667        encoding of a string containing isolated surrogates and the 'ignore'
5668        handler is used. */
5669     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5670     if (nsize != PyBytes_GET_SIZE(v))
5671       _PyBytes_Resize(&v, nsize);
5672     Py_XDECREF(errorHandler);
5673     Py_XDECREF(exc);
5674   done:
5675     return v;
5676   error:
5677     Py_XDECREF(rep);
5678     Py_XDECREF(errorHandler);
5679     Py_XDECREF(exc);
5680     Py_XDECREF(v);
5681     return NULL;
5682 }
5683 
5684 PyObject *
PyUnicode_EncodeUTF32(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)5685 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5686                       Py_ssize_t size,
5687                       const char *errors,
5688                       int byteorder)
5689 {
5690     PyObject *result;
5691     PyObject *tmp = PyUnicode_FromWideChar(s, size);
5692     if (tmp == NULL)
5693         return NULL;
5694     result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5695     Py_DECREF(tmp);
5696     return result;
5697 }
5698 
5699 PyObject *
PyUnicode_AsUTF32String(PyObject * unicode)5700 PyUnicode_AsUTF32String(PyObject *unicode)
5701 {
5702     return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5703 }
5704 
5705 /* --- UTF-16 Codec ------------------------------------------------------- */
5706 
5707 PyObject *
PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5708 PyUnicode_DecodeUTF16(const char *s,
5709                       Py_ssize_t size,
5710                       const char *errors,
5711                       int *byteorder)
5712 {
5713     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5714 }
5715 
5716 PyObject *
PyUnicode_DecodeUTF16Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)5717 PyUnicode_DecodeUTF16Stateful(const char *s,
5718                               Py_ssize_t size,
5719                               const char *errors,
5720                               int *byteorder,
5721                               Py_ssize_t *consumed)
5722 {
5723     const char *starts = s;
5724     Py_ssize_t startinpos;
5725     Py_ssize_t endinpos;
5726     _PyUnicodeWriter writer;
5727     const unsigned char *q, *e;
5728     int bo = 0;       /* assume native ordering by default */
5729     int native_ordering;
5730     const char *errmsg = "";
5731     PyObject *errorHandler = NULL;
5732     PyObject *exc = NULL;
5733     const char *encoding;
5734 
5735     q = (const unsigned char *)s;
5736     e = q + size;
5737 
5738     if (byteorder)
5739         bo = *byteorder;
5740 
5741     /* Check for BOM marks (U+FEFF) in the input and adjust current
5742        byte order setting accordingly. In native mode, the leading BOM
5743        mark is skipped, in all other modes, it is copied to the output
5744        stream as-is (giving a ZWNBSP character). */
5745     if (bo == 0 && size >= 2) {
5746         const Py_UCS4 bom = (q[1] << 8) | q[0];
5747         if (bom == 0xFEFF) {
5748             q += 2;
5749             bo = -1;
5750         }
5751         else if (bom == 0xFFFE) {
5752             q += 2;
5753             bo = 1;
5754         }
5755         if (byteorder)
5756             *byteorder = bo;
5757     }
5758 
5759     if (q == e) {
5760         if (consumed)
5761             *consumed = size;
5762         _Py_RETURN_UNICODE_EMPTY();
5763     }
5764 
5765 #if PY_LITTLE_ENDIAN
5766     native_ordering = bo <= 0;
5767     encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
5768 #else
5769     native_ordering = bo >= 0;
5770     encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
5771 #endif
5772 
5773     /* Note: size will always be longer than the resulting Unicode
5774        character count normally.  Error handler will take care of
5775        resizing when needed. */
5776     _PyUnicodeWriter_Init(&writer);
5777     writer.min_length = (e - q + 1) / 2;
5778     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5779         goto onError;
5780 
5781     while (1) {
5782         Py_UCS4 ch = 0;
5783         if (e - q >= 2) {
5784             int kind = writer.kind;
5785             if (kind == PyUnicode_1BYTE_KIND) {
5786                 if (PyUnicode_IS_ASCII(writer.buffer))
5787                     ch = asciilib_utf16_decode(&q, e,
5788                             (Py_UCS1*)writer.data, &writer.pos,
5789                             native_ordering);
5790                 else
5791                     ch = ucs1lib_utf16_decode(&q, e,
5792                             (Py_UCS1*)writer.data, &writer.pos,
5793                             native_ordering);
5794             } else if (kind == PyUnicode_2BYTE_KIND) {
5795                 ch = ucs2lib_utf16_decode(&q, e,
5796                         (Py_UCS2*)writer.data, &writer.pos,
5797                         native_ordering);
5798             } else {
5799                 assert(kind == PyUnicode_4BYTE_KIND);
5800                 ch = ucs4lib_utf16_decode(&q, e,
5801                         (Py_UCS4*)writer.data, &writer.pos,
5802                         native_ordering);
5803             }
5804         }
5805 
5806         switch (ch)
5807         {
5808         case 0:
5809             /* remaining byte at the end? (size should be even) */
5810             if (q == e || consumed)
5811                 goto End;
5812             errmsg = "truncated data";
5813             startinpos = ((const char *)q) - starts;
5814             endinpos = ((const char *)e) - starts;
5815             break;
5816             /* The remaining input chars are ignored if the callback
5817                chooses to skip the input */
5818         case 1:
5819             q -= 2;
5820             if (consumed)
5821                 goto End;
5822             errmsg = "unexpected end of data";
5823             startinpos = ((const char *)q) - starts;
5824             endinpos = ((const char *)e) - starts;
5825             break;
5826         case 2:
5827             errmsg = "illegal encoding";
5828             startinpos = ((const char *)q) - 2 - starts;
5829             endinpos = startinpos + 2;
5830             break;
5831         case 3:
5832             errmsg = "illegal UTF-16 surrogate";
5833             startinpos = ((const char *)q) - 4 - starts;
5834             endinpos = startinpos + 2;
5835             break;
5836         default:
5837             if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5838                 goto onError;
5839             continue;
5840         }
5841 
5842         if (unicode_decode_call_errorhandler_writer(
5843                 errors,
5844                 &errorHandler,
5845                 encoding, errmsg,
5846                 &starts,
5847                 (const char **)&e,
5848                 &startinpos,
5849                 &endinpos,
5850                 &exc,
5851                 (const char **)&q,
5852                 &writer))
5853             goto onError;
5854     }
5855 
5856 End:
5857     if (consumed)
5858         *consumed = (const char *)q-starts;
5859 
5860     Py_XDECREF(errorHandler);
5861     Py_XDECREF(exc);
5862     return _PyUnicodeWriter_Finish(&writer);
5863 
5864   onError:
5865     _PyUnicodeWriter_Dealloc(&writer);
5866     Py_XDECREF(errorHandler);
5867     Py_XDECREF(exc);
5868     return NULL;
5869 }
5870 
5871 PyObject *
_PyUnicode_EncodeUTF16(PyObject * str,const char * errors,int byteorder)5872 _PyUnicode_EncodeUTF16(PyObject *str,
5873                        const char *errors,
5874                        int byteorder)
5875 {
5876     enum PyUnicode_Kind kind;
5877     const void *data;
5878     Py_ssize_t len;
5879     PyObject *v;
5880     unsigned short *out;
5881     Py_ssize_t pairs;
5882 #if PY_BIG_ENDIAN
5883     int native_ordering = byteorder >= 0;
5884 #else
5885     int native_ordering = byteorder <= 0;
5886 #endif
5887     const char *encoding;
5888     Py_ssize_t nsize, pos;
5889     PyObject *errorHandler = NULL;
5890     PyObject *exc = NULL;
5891     PyObject *rep = NULL;
5892 
5893     if (!PyUnicode_Check(str)) {
5894         PyErr_BadArgument();
5895         return NULL;
5896     }
5897     if (PyUnicode_READY(str) == -1)
5898         return NULL;
5899     kind = PyUnicode_KIND(str);
5900     data = PyUnicode_DATA(str);
5901     len = PyUnicode_GET_LENGTH(str);
5902 
5903     pairs = 0;
5904     if (kind == PyUnicode_4BYTE_KIND) {
5905         const Py_UCS4 *in = (const Py_UCS4 *)data;
5906         const Py_UCS4 *end = in + len;
5907         while (in < end) {
5908             if (*in++ >= 0x10000) {
5909                 pairs++;
5910             }
5911         }
5912     }
5913     if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
5914         return PyErr_NoMemory();
5915     }
5916     nsize = len + pairs + (byteorder == 0);
5917     v = PyBytes_FromStringAndSize(NULL, nsize * 2);
5918     if (v == NULL) {
5919         return NULL;
5920     }
5921 
5922     /* output buffer is 2-bytes aligned */
5923     assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5924     out = (unsigned short *)PyBytes_AS_STRING(v);
5925     if (byteorder == 0) {
5926         *out++ = 0xFEFF;
5927     }
5928     if (len == 0) {
5929         goto done;
5930     }
5931 
5932     if (kind == PyUnicode_1BYTE_KIND) {
5933         ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5934         goto done;
5935     }
5936 
5937     if (byteorder < 0) {
5938         encoding = "utf-16-le";
5939     }
5940     else if (byteorder > 0) {
5941         encoding = "utf-16-be";
5942     }
5943     else {
5944         encoding = "utf-16";
5945     }
5946 
5947     pos = 0;
5948     while (pos < len) {
5949         Py_ssize_t repsize, moreunits;
5950 
5951         if (kind == PyUnicode_2BYTE_KIND) {
5952             pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5953                                         &out, native_ordering);
5954         }
5955         else {
5956             assert(kind == PyUnicode_4BYTE_KIND);
5957             pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5958                                         &out, native_ordering);
5959         }
5960         if (pos == len)
5961             break;
5962 
5963         rep = unicode_encode_call_errorhandler(
5964                 errors, &errorHandler,
5965                 encoding, "surrogates not allowed",
5966                 str, &exc, pos, pos + 1, &pos);
5967         if (!rep)
5968             goto error;
5969 
5970         if (PyBytes_Check(rep)) {
5971             repsize = PyBytes_GET_SIZE(rep);
5972             if (repsize & 1) {
5973                 raise_encode_exception(&exc, encoding,
5974                                        str, pos - 1, pos,
5975                                        "surrogates not allowed");
5976                 goto error;
5977             }
5978             moreunits = repsize / 2;
5979         }
5980         else {
5981             assert(PyUnicode_Check(rep));
5982             if (PyUnicode_READY(rep) < 0)
5983                 goto error;
5984             moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5985             if (!PyUnicode_IS_ASCII(rep)) {
5986                 raise_encode_exception(&exc, encoding,
5987                                        str, pos - 1, pos,
5988                                        "surrogates not allowed");
5989                 goto error;
5990             }
5991         }
5992 
5993         /* two bytes are reserved for each surrogate */
5994         if (moreunits > 1) {
5995             Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5996             if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
5997                 /* integer overflow */
5998                 PyErr_NoMemory();
5999                 goto error;
6000             }
6001             if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
6002                 goto error;
6003             out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6004         }
6005 
6006         if (PyBytes_Check(rep)) {
6007             memcpy(out, PyBytes_AS_STRING(rep), repsize);
6008             out += moreunits;
6009         } else /* rep is unicode */ {
6010             assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6011             ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6012                                  &out, native_ordering);
6013         }
6014 
6015         Py_CLEAR(rep);
6016     }
6017 
6018     /* Cut back to size actually needed. This is necessary for, for example,
6019     encoding of a string containing isolated surrogates and the 'ignore' handler
6020     is used. */
6021     nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6022     if (nsize != PyBytes_GET_SIZE(v))
6023       _PyBytes_Resize(&v, nsize);
6024     Py_XDECREF(errorHandler);
6025     Py_XDECREF(exc);
6026   done:
6027     return v;
6028   error:
6029     Py_XDECREF(rep);
6030     Py_XDECREF(errorHandler);
6031     Py_XDECREF(exc);
6032     Py_XDECREF(v);
6033     return NULL;
6034 #undef STORECHAR
6035 }
6036 
6037 PyObject *
PyUnicode_EncodeUTF16(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)6038 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6039                       Py_ssize_t size,
6040                       const char *errors,
6041                       int byteorder)
6042 {
6043     PyObject *result;
6044     PyObject *tmp = PyUnicode_FromWideChar(s, size);
6045     if (tmp == NULL)
6046         return NULL;
6047     result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6048     Py_DECREF(tmp);
6049     return result;
6050 }
6051 
6052 PyObject *
PyUnicode_AsUTF16String(PyObject * unicode)6053 PyUnicode_AsUTF16String(PyObject *unicode)
6054 {
6055     return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
6056 }
6057 
6058 /* --- Unicode Escape Codec ----------------------------------------------- */
6059 
/* File-local pointer to a _PyUnicode_Name_CAPI structure; starts out NULL.
   NOTE(review): presumably the character-name lookup API used for \N{...}
   escapes and filled in on first use -- confirm against the escape decoder. */
6060 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
6061 
6062 PyObject *
_PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors,const char ** first_invalid_escape)6063 _PyUnicode_DecodeUnicodeEscape(const char *s,
6064                                Py_ssize_t size,
6065                                const char *errors,
6066                                const char **first_invalid_escape)
6067 {
6068     const char *starts = s;
6069     _PyUnicodeWriter writer;
6070     const char *end;
6071     PyObject *errorHandler = NULL;
6072     PyObject *exc = NULL;
6073 
6074     // so we can remember if we've seen an invalid escape char or not
6075     *first_invalid_escape = NULL;
6076 
6077     if (size == 0) {
6078         _Py_RETURN_UNICODE_EMPTY();
6079     }
6080     /* Escaped strings will always be longer than the resulting
6081        Unicode string, so we start with size here and then reduce the
6082        length after conversion to the true value.
6083        (but if the error callback returns a long replacement string
6084        we'll have to allocate more space) */
6085     _PyUnicodeWriter_Init(&writer);
6086     writer.min_length = size;
6087     if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6088         goto onError;
6089     }
6090 
6091     end = s + size;
6092     while (s < end) {
6093         unsigned char c = (unsigned char) *s++;
6094         Py_UCS4 ch;
6095         int count;
6096         Py_ssize_t startinpos;
6097         Py_ssize_t endinpos;
6098         const char *message;
6099 
6100 #define WRITE_ASCII_CHAR(ch)                                                  \
6101             do {                                                              \
6102                 assert(ch <= 127);                                            \
6103                 assert(writer.pos < writer.size);                             \
6104                 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6105             } while(0)
6106 
6107 #define WRITE_CHAR(ch)                                                        \
6108             do {                                                              \
6109                 if (ch <= writer.maxchar) {                                   \
6110                     assert(writer.pos < writer.size);                         \
6111                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6112                 }                                                             \
6113                 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6114                     goto onError;                                             \
6115                 }                                                             \
6116             } while(0)
6117 
6118         /* Non-escape characters are interpreted as Unicode ordinals */
6119         if (c != '\\') {
6120             WRITE_CHAR(c);
6121             continue;
6122         }
6123 
6124         startinpos = s - starts - 1;
6125         /* \ - Escapes */
6126         if (s >= end) {
6127             message = "\\ at end of string";
6128             goto error;
6129         }
6130         c = (unsigned char) *s++;
6131 
6132         assert(writer.pos < writer.size);
6133         switch (c) {
6134 
6135             /* \x escapes */
6136         case '\n': continue;
6137         case '\\': WRITE_ASCII_CHAR('\\'); continue;
6138         case '\'': WRITE_ASCII_CHAR('\''); continue;
6139         case '\"': WRITE_ASCII_CHAR('\"'); continue;
6140         case 'b': WRITE_ASCII_CHAR('\b'); continue;
6141         /* FF */
6142         case 'f': WRITE_ASCII_CHAR('\014'); continue;
6143         case 't': WRITE_ASCII_CHAR('\t'); continue;
6144         case 'n': WRITE_ASCII_CHAR('\n'); continue;
6145         case 'r': WRITE_ASCII_CHAR('\r'); continue;
6146         /* VT */
6147         case 'v': WRITE_ASCII_CHAR('\013'); continue;
6148         /* BEL, not classic C */
6149         case 'a': WRITE_ASCII_CHAR('\007'); continue;
6150 
6151             /* \OOO (octal) escapes */
6152         case '0': case '1': case '2': case '3':
6153         case '4': case '5': case '6': case '7':
6154             ch = c - '0';
6155             if (s < end && '0' <= *s && *s <= '7') {
6156                 ch = (ch<<3) + *s++ - '0';
6157                 if (s < end && '0' <= *s && *s <= '7') {
6158                     ch = (ch<<3) + *s++ - '0';
6159                 }
6160             }
6161             WRITE_CHAR(ch);
6162             continue;
6163 
6164             /* hex escapes */
6165             /* \xXX */
6166         case 'x':
6167             count = 2;
6168             message = "truncated \\xXX escape";
6169             goto hexescape;
6170 
6171             /* \uXXXX */
6172         case 'u':
6173             count = 4;
6174             message = "truncated \\uXXXX escape";
6175             goto hexescape;
6176 
6177             /* \UXXXXXXXX */
6178         case 'U':
6179             count = 8;
6180             message = "truncated \\UXXXXXXXX escape";
6181         hexescape:
6182             for (ch = 0; count && s < end; ++s, --count) {
6183                 c = (unsigned char)*s;
6184                 ch <<= 4;
6185                 if (c >= '0' && c <= '9') {
6186                     ch += c - '0';
6187                 }
6188                 else if (c >= 'a' && c <= 'f') {
6189                     ch += c - ('a' - 10);
6190                 }
6191                 else if (c >= 'A' && c <= 'F') {
6192                     ch += c - ('A' - 10);
6193                 }
6194                 else {
6195                     break;
6196                 }
6197             }
6198             if (count) {
6199                 goto error;
6200             }
6201 
6202             /* when we get here, ch is a 32-bit unicode character */
6203             if (ch > MAX_UNICODE) {
6204                 message = "illegal Unicode character";
6205                 goto error;
6206             }
6207 
6208             WRITE_CHAR(ch);
6209             continue;
6210 
6211             /* \N{name} */
6212         case 'N':
6213             if (ucnhash_CAPI == NULL) {
6214                 /* load the unicode data module */
6215                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6216                                                 PyUnicodeData_CAPSULE_NAME, 1);
6217                 if (ucnhash_CAPI == NULL) {
6218                     PyErr_SetString(
6219                         PyExc_UnicodeError,
6220                         "\\N escapes not supported (can't load unicodedata module)"
6221                         );
6222                     goto onError;
6223                 }
6224             }
6225 
6226             message = "malformed \\N character escape";
6227             if (s < end && *s == '{') {
6228                 const char *start = ++s;
6229                 size_t namelen;
6230                 /* look for the closing brace */
6231                 while (s < end && *s != '}')
6232                     s++;
6233                 namelen = s - start;
6234                 if (namelen && s < end) {
6235                     /* found a name.  look it up in the unicode database */
6236                     s++;
6237                     ch = 0xffffffff; /* in case 'getcode' messes up */
6238                     if (namelen <= INT_MAX &&
6239                         ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6240                                               &ch, 0)) {
6241                         assert(ch <= MAX_UNICODE);
6242                         WRITE_CHAR(ch);
6243                         continue;
6244                     }
6245                     message = "unknown Unicode character name";
6246                 }
6247             }
6248             goto error;
6249 
6250         default:
6251             if (*first_invalid_escape == NULL) {
6252                 *first_invalid_escape = s-1; /* Back up one char, since we've
6253                                                 already incremented s. */
6254             }
6255             WRITE_ASCII_CHAR('\\');
6256             WRITE_CHAR(c);
6257             continue;
6258         }
6259 
6260       error:
6261         endinpos = s-starts;
6262         writer.min_length = end - s + writer.pos;
6263         if (unicode_decode_call_errorhandler_writer(
6264                 errors, &errorHandler,
6265                 "unicodeescape", message,
6266                 &starts, &end, &startinpos, &endinpos, &exc, &s,
6267                 &writer)) {
6268             goto onError;
6269         }
6270         assert(end - s <= writer.size - writer.pos);
6271 
6272 #undef WRITE_ASCII_CHAR
6273 #undef WRITE_CHAR
6274     }
6275 
6276     Py_XDECREF(errorHandler);
6277     Py_XDECREF(exc);
6278     return _PyUnicodeWriter_Finish(&writer);
6279 
6280   onError:
6281     _PyUnicodeWriter_Dealloc(&writer);
6282     Py_XDECREF(errorHandler);
6283     Py_XDECREF(exc);
6284     return NULL;
6285 }
6286 
6287 PyObject *
PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6288 PyUnicode_DecodeUnicodeEscape(const char *s,
6289                               Py_ssize_t size,
6290                               const char *errors)
6291 {
6292     const char *first_invalid_escape;
6293     PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6294                                                       &first_invalid_escape);
6295     if (result == NULL)
6296         return NULL;
6297     if (first_invalid_escape != NULL) {
6298         if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6299                              "invalid escape sequence '\\%c'",
6300                              (unsigned char)*first_invalid_escape) < 0) {
6301             Py_DECREF(result);
6302             return NULL;
6303         }
6304     }
6305     return result;
6306 }
6307 
6308 /* Return a Unicode-Escape string version of the Unicode object. */
6309 
6310 PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject * unicode)6311 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6312 {
6313     Py_ssize_t i, len;
6314     PyObject *repr;
6315     char *p;
6316     enum PyUnicode_Kind kind;
6317     void *data;
6318     Py_ssize_t expandsize;
6319 
6320     /* Initial allocation is based on the longest-possible character
6321        escape.
6322 
6323        For UCS1 strings it's '\xxx', 4 bytes per source character.
6324        For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6325        For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6326     */
6327 
6328     if (!PyUnicode_Check(unicode)) {
6329         PyErr_BadArgument();
6330         return NULL;
6331     }
6332     if (PyUnicode_READY(unicode) == -1) {
6333         return NULL;
6334     }
6335 
6336     len = PyUnicode_GET_LENGTH(unicode);
6337     if (len == 0) {
6338         return PyBytes_FromStringAndSize(NULL, 0);
6339     }
6340 
6341     kind = PyUnicode_KIND(unicode);
6342     data = PyUnicode_DATA(unicode);
6343     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6344        bytes, and 1 byte characters 4. */
6345     expandsize = kind * 2 + 2;
6346     if (len > PY_SSIZE_T_MAX / expandsize) {
6347         return PyErr_NoMemory();
6348     }
6349     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6350     if (repr == NULL) {
6351         return NULL;
6352     }
6353 
6354     p = PyBytes_AS_STRING(repr);
6355     for (i = 0; i < len; i++) {
6356         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6357 
6358         /* U+0000-U+00ff range */
6359         if (ch < 0x100) {
6360             if (ch >= ' ' && ch < 127) {
6361                 if (ch != '\\') {
6362                     /* Copy printable US ASCII as-is */
6363                     *p++ = (char) ch;
6364                 }
6365                 /* Escape backslashes */
6366                 else {
6367                     *p++ = '\\';
6368                     *p++ = '\\';
6369                 }
6370             }
6371 
6372             /* Map special whitespace to '\t', \n', '\r' */
6373             else if (ch == '\t') {
6374                 *p++ = '\\';
6375                 *p++ = 't';
6376             }
6377             else if (ch == '\n') {
6378                 *p++ = '\\';
6379                 *p++ = 'n';
6380             }
6381             else if (ch == '\r') {
6382                 *p++ = '\\';
6383                 *p++ = 'r';
6384             }
6385 
6386             /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6387             else {
6388                 *p++ = '\\';
6389                 *p++ = 'x';
6390                 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6391                 *p++ = Py_hexdigits[ch & 0x000F];
6392             }
6393         }
6394         /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6395         else if (ch < 0x10000) {
6396             *p++ = '\\';
6397             *p++ = 'u';
6398             *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6399             *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6400             *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6401             *p++ = Py_hexdigits[ch & 0x000F];
6402         }
6403         /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6404         else {
6405 
6406             /* Make sure that the first two digits are zero */
6407             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6408             *p++ = '\\';
6409             *p++ = 'U';
6410             *p++ = '0';
6411             *p++ = '0';
6412             *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6413             *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6414             *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6415             *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6416             *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6417             *p++ = Py_hexdigits[ch & 0x0000000F];
6418         }
6419     }
6420 
6421     assert(p - PyBytes_AS_STRING(repr) > 0);
6422     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6423         return NULL;
6424     }
6425     return repr;
6426 }
6427 
6428 PyObject *
PyUnicode_EncodeUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)6429 PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6430                               Py_ssize_t size)
6431 {
6432     PyObject *result;
6433     PyObject *tmp = PyUnicode_FromWideChar(s, size);
6434     if (tmp == NULL) {
6435         return NULL;
6436     }
6437 
6438     result = PyUnicode_AsUnicodeEscapeString(tmp);
6439     Py_DECREF(tmp);
6440     return result;
6441 }
6442 
6443 /* --- Raw Unicode Escape Codec ------------------------------------------- */
6444 
6445 PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6446 PyUnicode_DecodeRawUnicodeEscape(const char *s,
6447                                  Py_ssize_t size,
6448                                  const char *errors)
6449 {
6450     const char *starts = s;
6451     _PyUnicodeWriter writer;
6452     const char *end;
6453     PyObject *errorHandler = NULL;
6454     PyObject *exc = NULL;
6455 
6456     if (size == 0) {
6457         _Py_RETURN_UNICODE_EMPTY();
6458     }
6459 
6460     /* Escaped strings will always be longer than the resulting
6461        Unicode string, so we start with size here and then reduce the
6462        length after conversion to the true value. (But decoding error
6463        handler might have to resize the string) */
6464     _PyUnicodeWriter_Init(&writer);
6465      writer.min_length = size;
6466     if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6467         goto onError;
6468     }
6469 
6470     end = s + size;
6471     while (s < end) {
6472         unsigned char c = (unsigned char) *s++;
6473         Py_UCS4 ch;
6474         int count;
6475         Py_ssize_t startinpos;
6476         Py_ssize_t endinpos;
6477         const char *message;
6478 
6479 #define WRITE_CHAR(ch)                                                        \
6480             do {                                                              \
6481                 if (ch <= writer.maxchar) {                                   \
6482                     assert(writer.pos < writer.size);                         \
6483                     PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6484                 }                                                             \
6485                 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6486                     goto onError;                                             \
6487                 }                                                             \
6488             } while(0)
6489 
6490         /* Non-escape characters are interpreted as Unicode ordinals */
6491         if (c != '\\' || s >= end) {
6492             WRITE_CHAR(c);
6493             continue;
6494         }
6495 
6496         c = (unsigned char) *s++;
6497         if (c == 'u') {
6498             count = 4;
6499             message = "truncated \\uXXXX escape";
6500         }
6501         else if (c == 'U') {
6502             count = 8;
6503             message = "truncated \\UXXXXXXXX escape";
6504         }
6505         else {
6506             assert(writer.pos < writer.size);
6507             PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6508             WRITE_CHAR(c);
6509             continue;
6510         }
6511         startinpos = s - starts - 2;
6512 
6513         /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6514         for (ch = 0; count && s < end; ++s, --count) {
6515             c = (unsigned char)*s;
6516             ch <<= 4;
6517             if (c >= '0' && c <= '9') {
6518                 ch += c - '0';
6519             }
6520             else if (c >= 'a' && c <= 'f') {
6521                 ch += c - ('a' - 10);
6522             }
6523             else if (c >= 'A' && c <= 'F') {
6524                 ch += c - ('A' - 10);
6525             }
6526             else {
6527                 break;
6528             }
6529         }
6530         if (!count) {
6531             if (ch <= MAX_UNICODE) {
6532                 WRITE_CHAR(ch);
6533                 continue;
6534             }
6535             message = "\\Uxxxxxxxx out of range";
6536         }
6537 
6538         endinpos = s-starts;
6539         writer.min_length = end - s + writer.pos;
6540         if (unicode_decode_call_errorhandler_writer(
6541                 errors, &errorHandler,
6542                 "rawunicodeescape", message,
6543                 &starts, &end, &startinpos, &endinpos, &exc, &s,
6544                 &writer)) {
6545             goto onError;
6546         }
6547         assert(end - s <= writer.size - writer.pos);
6548 
6549 #undef WRITE_CHAR
6550     }
6551     Py_XDECREF(errorHandler);
6552     Py_XDECREF(exc);
6553     return _PyUnicodeWriter_Finish(&writer);
6554 
6555   onError:
6556     _PyUnicodeWriter_Dealloc(&writer);
6557     Py_XDECREF(errorHandler);
6558     Py_XDECREF(exc);
6559     return NULL;
6560 
6561 }
6562 
6563 
6564 PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject * unicode)6565 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6566 {
6567     PyObject *repr;
6568     char *p;
6569     Py_ssize_t expandsize, pos;
6570     int kind;
6571     void *data;
6572     Py_ssize_t len;
6573 
6574     if (!PyUnicode_Check(unicode)) {
6575         PyErr_BadArgument();
6576         return NULL;
6577     }
6578     if (PyUnicode_READY(unicode) == -1) {
6579         return NULL;
6580     }
6581     kind = PyUnicode_KIND(unicode);
6582     data = PyUnicode_DATA(unicode);
6583     len = PyUnicode_GET_LENGTH(unicode);
6584     if (kind == PyUnicode_1BYTE_KIND) {
6585         return PyBytes_FromStringAndSize(data, len);
6586     }
6587 
6588     /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6589        bytes, and 1 byte characters 4. */
6590     expandsize = kind * 2 + 2;
6591 
6592     if (len > PY_SSIZE_T_MAX / expandsize) {
6593         return PyErr_NoMemory();
6594     }
6595     repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6596     if (repr == NULL) {
6597         return NULL;
6598     }
6599     if (len == 0) {
6600         return repr;
6601     }
6602 
6603     p = PyBytes_AS_STRING(repr);
6604     for (pos = 0; pos < len; pos++) {
6605         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6606 
6607         /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6608         if (ch < 0x100) {
6609             *p++ = (char) ch;
6610         }
6611         /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6612         else if (ch < 0x10000) {
6613             *p++ = '\\';
6614             *p++ = 'u';
6615             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6616             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6617             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6618             *p++ = Py_hexdigits[ch & 15];
6619         }
6620         /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6621         else {
6622             assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6623             *p++ = '\\';
6624             *p++ = 'U';
6625             *p++ = '0';
6626             *p++ = '0';
6627             *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6628             *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6629             *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6630             *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6631             *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6632             *p++ = Py_hexdigits[ch & 15];
6633         }
6634     }
6635 
6636     assert(p > PyBytes_AS_STRING(repr));
6637     if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6638         return NULL;
6639     }
6640     return repr;
6641 }
6642 
6643 PyObject *
PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)6644 PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6645                                  Py_ssize_t size)
6646 {
6647     PyObject *result;
6648     PyObject *tmp = PyUnicode_FromWideChar(s, size);
6649     if (tmp == NULL)
6650         return NULL;
6651     result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6652     Py_DECREF(tmp);
6653     return result;
6654 }
6655 
6656 /* --- Latin-1 Codec ------------------------------------------------------ */
6657 
6658 PyObject *
PyUnicode_DecodeLatin1(const char * s,Py_ssize_t size,const char * errors)6659 PyUnicode_DecodeLatin1(const char *s,
6660                        Py_ssize_t size,
6661                        const char *errors)
6662 {
6663     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6664     return _PyUnicode_FromUCS1((const unsigned char*)s, size);
6665 }
6666 
6667 /* create or adjust a UnicodeEncodeError */
6668 static void
make_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6669 make_encode_exception(PyObject **exceptionObject,
6670                       const char *encoding,
6671                       PyObject *unicode,
6672                       Py_ssize_t startpos, Py_ssize_t endpos,
6673                       const char *reason)
6674 {
6675     if (*exceptionObject == NULL) {
6676         *exceptionObject = PyObject_CallFunction(
6677             PyExc_UnicodeEncodeError, "sOnns",
6678             encoding, unicode, startpos, endpos, reason);
6679     }
6680     else {
6681         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6682             goto onError;
6683         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6684             goto onError;
6685         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6686             goto onError;
6687         return;
6688       onError:
6689         Py_CLEAR(*exceptionObject);
6690     }
6691 }
6692 
6693 /* raises a UnicodeEncodeError */
6694 static void
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6695 raise_encode_exception(PyObject **exceptionObject,
6696                        const char *encoding,
6697                        PyObject *unicode,
6698                        Py_ssize_t startpos, Py_ssize_t endpos,
6699                        const char *reason)
6700 {
6701     make_encode_exception(exceptionObject,
6702                           encoding, unicode, startpos, endpos, reason);
6703     if (*exceptionObject != NULL)
6704         PyCodec_StrictErrors(*exceptionObject);
6705 }
6706 
6707 /* error handling callback helper:
6708    build arguments, call the callback and check the arguments,
6709    put the result into newpos and return the replacement string, which
6710    has to be freed by the caller */
6711 static PyObject *
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)6712 unicode_encode_call_errorhandler(const char *errors,
6713                                  PyObject **errorHandler,
6714                                  const char *encoding, const char *reason,
6715                                  PyObject *unicode, PyObject **exceptionObject,
6716                                  Py_ssize_t startpos, Py_ssize_t endpos,
6717                                  Py_ssize_t *newpos)
6718 {
6719     static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6720     Py_ssize_t len;
6721     PyObject *restuple;
6722     PyObject *resunicode;
6723 
6724     if (*errorHandler == NULL) {
6725         *errorHandler = PyCodec_LookupError(errors);
6726         if (*errorHandler == NULL)
6727             return NULL;
6728     }
6729 
6730     if (PyUnicode_READY(unicode) == -1)
6731         return NULL;
6732     len = PyUnicode_GET_LENGTH(unicode);
6733 
6734     make_encode_exception(exceptionObject,
6735                           encoding, unicode, startpos, endpos, reason);
6736     if (*exceptionObject == NULL)
6737         return NULL;
6738 
6739     restuple = PyObject_CallFunctionObjArgs(
6740         *errorHandler, *exceptionObject, NULL);
6741     if (restuple == NULL)
6742         return NULL;
6743     if (!PyTuple_Check(restuple)) {
6744         PyErr_SetString(PyExc_TypeError, &argparse[3]);
6745         Py_DECREF(restuple);
6746         return NULL;
6747     }
6748     if (!PyArg_ParseTuple(restuple, argparse,
6749                           &resunicode, newpos)) {
6750         Py_DECREF(restuple);
6751         return NULL;
6752     }
6753     if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6754         PyErr_SetString(PyExc_TypeError, &argparse[3]);
6755         Py_DECREF(restuple);
6756         return NULL;
6757     }
6758     if (*newpos<0)
6759         *newpos = len + *newpos;
6760     if (*newpos<0 || *newpos>len) {
6761         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6762         Py_DECREF(restuple);
6763         return NULL;
6764     }
6765     Py_INCREF(resunicode);
6766     Py_DECREF(restuple);
6767     return resunicode;
6768 }
6769 
6770 static PyObject *
unicode_encode_ucs1(PyObject * unicode,const char * errors,const Py_UCS4 limit)6771 unicode_encode_ucs1(PyObject *unicode,
6772                     const char *errors,
6773                     const Py_UCS4 limit)
6774 {
6775     /* input state */
6776     Py_ssize_t pos=0, size;
6777     int kind;
6778     void *data;
6779     /* pointer into the output */
6780     char *str;
6781     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6782     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6783     PyObject *error_handler_obj = NULL;
6784     PyObject *exc = NULL;
6785     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6786     PyObject *rep = NULL;
6787     /* output object */
6788     _PyBytesWriter writer;
6789 
6790     if (PyUnicode_READY(unicode) == -1)
6791         return NULL;
6792     size = PyUnicode_GET_LENGTH(unicode);
6793     kind = PyUnicode_KIND(unicode);
6794     data = PyUnicode_DATA(unicode);
6795     /* allocate enough for a simple encoding without
6796        replacements, if we need more, we'll resize */
6797     if (size == 0)
6798         return PyBytes_FromStringAndSize(NULL, 0);
6799 
6800     _PyBytesWriter_Init(&writer);
6801     str = _PyBytesWriter_Alloc(&writer, size);
6802     if (str == NULL)
6803         return NULL;
6804 
6805     while (pos < size) {
6806         Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6807 
6808         /* can we encode this? */
6809         if (ch < limit) {
6810             /* no overflow check, because we know that the space is enough */
6811             *str++ = (char)ch;
6812             ++pos;
6813         }
6814         else {
6815             Py_ssize_t newpos, i;
6816             /* startpos for collecting unencodable chars */
6817             Py_ssize_t collstart = pos;
6818             Py_ssize_t collend = collstart + 1;
6819             /* find all unecodable characters */
6820 
6821             while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
6822                 ++collend;
6823 
6824             /* Only overallocate the buffer if it's not the last write */
6825             writer.overallocate = (collend < size);
6826 
6827             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6828             if (error_handler == _Py_ERROR_UNKNOWN)
6829                 error_handler = _Py_GetErrorHandler(errors);
6830 
6831             switch (error_handler) {
6832             case _Py_ERROR_STRICT:
6833                 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6834                 goto onError;
6835 
6836             case _Py_ERROR_REPLACE:
6837                 memset(str, '?', collend - collstart);
6838                 str += (collend - collstart);
6839                 /* fall through */
6840             case _Py_ERROR_IGNORE:
6841                 pos = collend;
6842                 break;
6843 
6844             case _Py_ERROR_BACKSLASHREPLACE:
6845                 /* subtract preallocated bytes */
6846                 writer.min_size -= (collend - collstart);
6847                 str = backslashreplace(&writer, str,
6848                                        unicode, collstart, collend);
6849                 if (str == NULL)
6850                     goto onError;
6851                 pos = collend;
6852                 break;
6853 
6854             case _Py_ERROR_XMLCHARREFREPLACE:
6855                 /* subtract preallocated bytes */
6856                 writer.min_size -= (collend - collstart);
6857                 str = xmlcharrefreplace(&writer, str,
6858                                         unicode, collstart, collend);
6859                 if (str == NULL)
6860                     goto onError;
6861                 pos = collend;
6862                 break;
6863 
6864             case _Py_ERROR_SURROGATEESCAPE:
6865                 for (i = collstart; i < collend; ++i) {
6866                     ch = PyUnicode_READ(kind, data, i);
6867                     if (ch < 0xdc80 || 0xdcff < ch) {
6868                         /* Not a UTF-8b surrogate */
6869                         break;
6870                     }
6871                     *str++ = (char)(ch - 0xdc00);
6872                     ++pos;
6873                 }
6874                 if (i >= collend)
6875                     break;
6876                 collstart = pos;
6877                 assert(collstart != collend);
6878                 /* fall through */
6879 
6880             default:
6881                 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6882                                                        encoding, reason, unicode, &exc,
6883                                                        collstart, collend, &newpos);
6884                 if (rep == NULL)
6885                     goto onError;
6886 
6887                 /* subtract preallocated bytes */
6888                 writer.min_size -= newpos - collstart;
6889 
6890                 if (PyBytes_Check(rep)) {
6891                     /* Directly copy bytes result to output. */
6892                     str = _PyBytesWriter_WriteBytes(&writer, str,
6893                                                     PyBytes_AS_STRING(rep),
6894                                                     PyBytes_GET_SIZE(rep));
6895                 }
6896                 else {
6897                     assert(PyUnicode_Check(rep));
6898 
6899                     if (PyUnicode_READY(rep) < 0)
6900                         goto onError;
6901 
6902                     if (limit == 256 ?
6903                         PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6904                         !PyUnicode_IS_ASCII(rep))
6905                     {
6906                         /* Not all characters are smaller than limit */
6907                         raise_encode_exception(&exc, encoding, unicode,
6908                                                collstart, collend, reason);
6909                         goto onError;
6910                     }
6911                     assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6912                     str = _PyBytesWriter_WriteBytes(&writer, str,
6913                                                     PyUnicode_DATA(rep),
6914                                                     PyUnicode_GET_LENGTH(rep));
6915                 }
6916                 if (str == NULL)
6917                     goto onError;
6918 
6919                 pos = newpos;
6920                 Py_CLEAR(rep);
6921             }
6922 
6923             /* If overallocation was disabled, ensure that it was the last
6924                write. Otherwise, we missed an optimization */
6925             assert(writer.overallocate || pos == size);
6926         }
6927     }
6928 
6929     Py_XDECREF(error_handler_obj);
6930     Py_XDECREF(exc);
6931     return _PyBytesWriter_Finish(&writer, str);
6932 
6933   onError:
6934     Py_XDECREF(rep);
6935     _PyBytesWriter_Dealloc(&writer);
6936     Py_XDECREF(error_handler_obj);
6937     Py_XDECREF(exc);
6938     return NULL;
6939 }
6940 
6941 /* Deprecated */
6942 PyObject *
PyUnicode_EncodeLatin1(const Py_UNICODE * p,Py_ssize_t size,const char * errors)6943 PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6944                        Py_ssize_t size,
6945                        const char *errors)
6946 {
6947     PyObject *result;
6948     PyObject *unicode = PyUnicode_FromWideChar(p, size);
6949     if (unicode == NULL)
6950         return NULL;
6951     result = unicode_encode_ucs1(unicode, errors, 256);
6952     Py_DECREF(unicode);
6953     return result;
6954 }
6955 
6956 PyObject *
_PyUnicode_AsLatin1String(PyObject * unicode,const char * errors)6957 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6958 {
6959     if (!PyUnicode_Check(unicode)) {
6960         PyErr_BadArgument();
6961         return NULL;
6962     }
6963     if (PyUnicode_READY(unicode) == -1)
6964         return NULL;
6965     /* Fast path: if it is a one-byte string, construct
6966        bytes object directly. */
6967     if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6968         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6969                                          PyUnicode_GET_LENGTH(unicode));
6970     /* Non-Latin-1 characters present. Defer to above function to
6971        raise the exception. */
6972     return unicode_encode_ucs1(unicode, errors, 256);
6973 }
6974 
6975 PyObject*
PyUnicode_AsLatin1String(PyObject * unicode)6976 PyUnicode_AsLatin1String(PyObject *unicode)
6977 {
6978     return _PyUnicode_AsLatin1String(unicode, NULL);
6979 }
6980 
6981 /* --- 7-bit ASCII Codec -------------------------------------------------- */
6982 
/* Decode `size` bytes starting at `s` as 7-bit ASCII and return a new str
   object, or NULL with an exception set on failure.

   `errors` names the error handler used for bytes >= 128; it is resolved
   lazily, the first time such a byte is seen. */
PyObject *
PyUnicode_DecodeASCII(const char *s,
                      Py_ssize_t size,
                      const char *errors)
{
    const char *starts = s;
    _PyUnicodeWriter writer;
    int kind;
    void *data;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;

    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128)
        return get_latin1_char((unsigned char)s[0]);

    /* Reserve room for `size` characters with a maximum code point of 127. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
        return NULL;

    e = s + size;
    data = writer.data;
    /* Bulk pass: ascii_decode() presumably copies the leading run of ASCII
       bytes and returns how many it handled (verify against its
       definition); its result becomes the writer position. */
    outpos = ascii_decode(s, e, (Py_UCS1 *)data);
    writer.pos = outpos;
    if (writer.pos == size)
        return _PyUnicodeWriter_Finish(&writer);

    /* Slow path: continue byte by byte from where the bulk pass stopped. */
    s += writer.pos;
    kind = writer.kind;
    while (s < e) {
        unsigned char c = (unsigned char)*s;
        if (c < 128) {
            PyUnicode_WRITE(kind, data, writer.pos, c);
            writer.pos++;
            ++s;
            continue;
        }

        /* byte outside range 0x00..0x7f: call the error handler */

        /* Resolve the handler name only once. */
        if (error_handler == _Py_ERROR_UNKNOWN)
            error_handler = _Py_GetErrorHandler(errors);

        switch (error_handler)
        {
        case _Py_ERROR_REPLACE:
        case _Py_ERROR_SURROGATEESCAPE:
            /* Fast-path: the error handler only writes one character,
               but we may switch to UCS2 at the first write */
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
                goto onError;
            /* The writer buffer may have changed: reload the cached copies. */
            kind = writer.kind;
            data = writer.data;

            if (error_handler == _Py_ERROR_REPLACE)
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
            else
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
            writer.pos++;
            ++s;
            break;

        case _Py_ERROR_IGNORE:
            /* Drop the offending byte. */
            ++s;
            break;

        default:
            /* Any other handler: report a one-byte error range and let the
               generic machinery update `s` and the writer. */
            startinpos = s-starts;
            endinpos = startinpos + 1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &error_handler_obj,
                    "ascii", "ordinal not in range(128)",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* Reload the cached writer fields after the handler call. */
            kind = writer.kind;
            data = writer.data;
        }
    }
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    /* Release the partially built result and the handler state. */
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return NULL;
}
7081 
7082 /* Deprecated */
7083 PyObject *
PyUnicode_EncodeASCII(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7084 PyUnicode_EncodeASCII(const Py_UNICODE *p,
7085                       Py_ssize_t size,
7086                       const char *errors)
7087 {
7088     PyObject *result;
7089     PyObject *unicode = PyUnicode_FromWideChar(p, size);
7090     if (unicode == NULL)
7091         return NULL;
7092     result = unicode_encode_ucs1(unicode, errors, 128);
7093     Py_DECREF(unicode);
7094     return result;
7095 }
7096 
7097 PyObject *
_PyUnicode_AsASCIIString(PyObject * unicode,const char * errors)7098 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7099 {
7100     if (!PyUnicode_Check(unicode)) {
7101         PyErr_BadArgument();
7102         return NULL;
7103     }
7104     if (PyUnicode_READY(unicode) == -1)
7105         return NULL;
7106     /* Fast path: if it is an ASCII-only string, construct bytes object
7107        directly. Else defer to above function to raise the exception. */
7108     if (PyUnicode_IS_ASCII(unicode))
7109         return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7110                                          PyUnicode_GET_LENGTH(unicode));
7111     return unicode_encode_ucs1(unicode, errors, 128);
7112 }
7113 
7114 PyObject *
PyUnicode_AsASCIIString(PyObject * unicode)7115 PyUnicode_AsASCIIString(PyObject *unicode)
7116 {
7117     return _PyUnicode_AsASCIIString(unicode, NULL);
7118 }
7119 
7120 #ifdef MS_WINDOWS
7121 
7122 /* --- MBCS codecs for Windows -------------------------------------------- */
7123 
/* When int is narrower than size_t, the code-page helpers below (which
   take int lengths) process large inputs in several chunks; NEED_RETRY
   enables those chunked paths. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows. */
#define DECODING_CHUNK_SIZE (INT_MAX/4)

/* Fallback definition for headers that do not provide this flag. */
#ifndef WC_ERR_INVALID_CHARS
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7137 
7138 static const char*
code_page_name(UINT code_page,PyObject ** obj)7139 code_page_name(UINT code_page, PyObject **obj)
7140 {
7141     *obj = NULL;
7142     if (code_page == CP_ACP)
7143         return "mbcs";
7144     if (code_page == CP_UTF7)
7145         return "CP_UTF7";
7146     if (code_page == CP_UTF8)
7147         return "CP_UTF8";
7148 
7149     *obj = PyBytes_FromFormat("cp%u", code_page);
7150     if (*obj == NULL)
7151         return NULL;
7152     return PyBytes_AS_STRING(*obj);
7153 }
7154 
7155 static DWORD
decode_code_page_flags(UINT code_page)7156 decode_code_page_flags(UINT code_page)
7157 {
7158     if (code_page == CP_UTF7) {
7159         /* The CP_UTF7 decoder only supports flags=0 */
7160         return 0;
7161     }
7162     else
7163         return MB_ERR_INVALID_CHARS;
7164 }
7165 
7166 /*
7167  * Decode a byte string from a Windows code page into unicode object in strict
7168  * mode.
7169  *
7170  * Returns consumed size if succeed, returns -2 on decode error, or raise an
7171  * OSError and returns -1 on other error.
7172  */
7173 static int
decode_code_page_strict(UINT code_page,wchar_t ** buf,Py_ssize_t * bufsize,const char * in,int insize)7174 decode_code_page_strict(UINT code_page,
7175                         wchar_t **buf,
7176                         Py_ssize_t *bufsize,
7177                         const char *in,
7178                         int insize)
7179 {
7180     DWORD flags = MB_ERR_INVALID_CHARS;
7181     wchar_t *out;
7182     DWORD outsize;
7183 
7184     /* First get the size of the result */
7185     assert(insize > 0);
7186     while ((outsize = MultiByteToWideChar(code_page, flags,
7187                                           in, insize, NULL, 0)) <= 0)
7188     {
7189         if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7190             goto error;
7191         }
7192         /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7193         flags = 0;
7194     }
7195 
7196     /* Extend a wchar_t* buffer */
7197     Py_ssize_t n = *bufsize;   /* Get the current length */
7198     if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7199         return -1;
7200     }
7201     out = *buf + n;
7202 
7203     /* Do the conversion */
7204     outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7205     if (outsize <= 0)
7206         goto error;
7207     return insize;
7208 
7209 error:
7210     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7211         return -2;
7212     PyErr_SetFromWindowsErr(0);
7213     return -1;
7214 }
7215 
/*
 * Decode a byte string from a code page into the wchar_t buffer *buf,
 * one character at a time, using an error handler for undecodable input.
 *
 * code_page     Windows code page number.
 * buf, bufsize  Output buffer and its current length; the buffer is
 *               extended here and *bufsize is set to the final length.
 * in, size      Input bytes (size > 0).
 * errors        Error handler name; NULL is treated like "strict".
 * final         When zero, an undecodable tail at the very end of the
 *               input is left unconsumed instead of being reported.
 *
 * Returns the number of input bytes consumed on success, or raises an
 * OSError or UnicodeDecodeError exception and returns -1 on error.
 */
static int
decode_code_page_errors(UINT code_page,
                        wchar_t **buf,
                        Py_ssize_t *bufsize,
                        const char *in, const int size,
                        const char *errors, int final)
{
    const char *startin = in;
    const char *endin = in + size;
    DWORD flags = MB_ERR_INVALID_CHARS;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "No mapping for the Unicode character exists "
                         "in the target code page.";
    /* each step cannot decode more than 1 character, but a character can be
       represented as a surrogate pair */
    wchar_t buffer[2], *out;
    int insize;
    Py_ssize_t outsize;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    DWORD err;
    int ret = -1;

    assert(size > 0);

    /* encoding_obj (if any) keeps the returned name alive until cleanup. */
    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
           UnicodeDecodeError. */
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_CLEAR(exc);
        }
        goto error;
    }

    /* Extend a wchar_t* buffer */
    Py_ssize_t n = *bufsize;   /* Get the current length */
    /* Overflow check for n + size * 2 before reserving two wchar_t per
       input byte. */
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
        PyErr_NoMemory();
        goto error;
    }
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
        goto error;
    }
    out = *buf + n;

    /* Decode the byte string character per character */
    while (in < endin)
    {
        /* Decode a character */
        /* Try a 1-byte sequence first and widen it by one byte per failed
           attempt. */
        insize = 1;
        do
        {
            outsize = MultiByteToWideChar(code_page, flags,
                                          in, insize,
                                          buffer, Py_ARRAY_LENGTH(buffer));
            if (outsize > 0)
                break;
            err = GetLastError();
            if (err == ERROR_INVALID_FLAGS && flags) {
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
                flags = 0;
                /* `continue` re-tests the loop condition and retries with
                   the same insize. */
                continue;
            }
            if (err != ERROR_NO_UNICODE_TRANSLATION
                && err != ERROR_INSUFFICIENT_BUFFER)
            {
                PyErr_SetFromWindowsErr(0);
                goto error;
            }
            insize++;
        }
        /* 4=maximum length of a UTF-8 sequence */
        while (insize <= 4 && (in + insize) <= endin);

        if (outsize <= 0) {
            Py_ssize_t startinpos, endinpos, outpos;

            /* last character in partial decode? */
            if (in + insize >= endin && !final)
                break;

            /* Report a one-byte error range; the handler updates `in` and
               may reallocate *buf, so `out` is recomputed afterwards. */
            startinpos = in - startin;
            endinpos = startinpos + 1;
            outpos = out - *buf;
            if (unicode_decode_call_errorhandler_wchar(
                    errors, &errorHandler,
                    encoding, reason,
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
                    buf, bufsize, &outpos))
            {
                goto error;
            }
            out = *buf + outpos;
        }
        else {
            /* Success: consume the bytes and append the decoded unit(s). */
            in += insize;
            memcpy(out, buffer, outsize * sizeof(wchar_t));
            out += outsize;
        }
    }

    /* Shrink the buffer */
    assert(out - *buf <= *bufsize);
    *bufsize = out - *buf;
    /* (in - startin) <= size and size is an int */
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);

error:
    /* Shared exit for success and failure; ret is still -1 on failure. */
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7345 
/* Decode `size` bytes at `s` from the Windows code page `code_page` and
   return a new str object, or NULL with an exception set.

   errors    Error handler name passed to decode_code_page_errors().
   consumed  If not NULL, receives the number of input bytes consumed and
             the last chunk is decoded as non-final; if NULL, the last
             chunk is decoded as final. */
static PyObject *
decode_code_page_stateful(int code_page,
                          const char *s, Py_ssize_t size,
                          const char *errors, Py_ssize_t *consumed)
{
    wchar_t *buf = NULL;
    Py_ssize_t bufsize = 0;
    int chunk_size, final, converted, done;

    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }
    if (size < 0) {
        PyErr_BadInternalCall();
        return NULL;
    }

    if (consumed)
        *consumed = 0;

    do
    {
        /* With NEED_RETRY, inputs larger than DECODING_CHUNK_SIZE are
           decoded in several non-final chunks. */
#ifdef NEED_RETRY
        if (size > DECODING_CHUNK_SIZE) {
            chunk_size = DECODING_CHUNK_SIZE;
            final = 0;
            done = 0;
        }
        else
#endif
        {
            chunk_size = (int)size;
            final = (consumed == NULL);
            done = 1;
        }

        /* Nothing left to decode: either finish with what was accumulated
           so far or return the empty string. */
        if (chunk_size == 0 && done) {
            if (buf != NULL)
                break;
            _Py_RETURN_UNICODE_EMPTY();
        }

        /* Try the strict decoder first; -2 means the chunk has undecodable
           data, so retry it through the error-handler path. */
        converted = decode_code_page_strict(code_page, &buf, &bufsize,
                                            s, chunk_size);
        if (converted == -2)
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
                                                s, chunk_size,
                                                errors, final);
        assert(converted != 0 || done);

        if (converted < 0) {
            PyMem_Free(buf);
            return NULL;
        }

        if (consumed)
            *consumed += converted;

        /* Advance by what was actually consumed, which may be less than
           the chunk size. */
        s += converted;
        size -= converted;
    } while (!done);

    /* Build the result from the accumulated wide characters. */
    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
    PyMem_Free(buf);
    return v;
}
7413 
7414 PyObject *
PyUnicode_DecodeCodePageStateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7415 PyUnicode_DecodeCodePageStateful(int code_page,
7416                                  const char *s,
7417                                  Py_ssize_t size,
7418                                  const char *errors,
7419                                  Py_ssize_t *consumed)
7420 {
7421     return decode_code_page_stateful(code_page, s, size, errors, consumed);
7422 }
7423 
7424 PyObject *
PyUnicode_DecodeMBCSStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7425 PyUnicode_DecodeMBCSStateful(const char *s,
7426                              Py_ssize_t size,
7427                              const char *errors,
7428                              Py_ssize_t *consumed)
7429 {
7430     return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7431 }
7432 
7433 PyObject *
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)7434 PyUnicode_DecodeMBCS(const char *s,
7435                      Py_ssize_t size,
7436                      const char *errors)
7437 {
7438     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7439 }
7440 
7441 static DWORD
encode_code_page_flags(UINT code_page,const char * errors)7442 encode_code_page_flags(UINT code_page, const char *errors)
7443 {
7444     if (code_page == CP_UTF8) {
7445         return WC_ERR_INVALID_CHARS;
7446     }
7447     else if (code_page == CP_UTF7) {
7448         /* CP_UTF7 only supports flags=0 */
7449         return 0;
7450     }
7451     else {
7452         if (errors != NULL && strcmp(errors, "replace") == 0)
7453             return 0;
7454         else
7455             return WC_NO_BEST_FIT_CHARS;
7456     }
7457 }
7458 
7459 /*
7460  * Encode a Unicode string to a Windows code page into a byte string in strict
7461  * mode.
7462  *
7463  * Returns consumed characters if succeed, returns -2 on encode error, or raise
7464  * an OSError and returns -1 on other error.
7465  */
7466 static int
encode_code_page_strict(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t offset,int len,const char * errors)7467 encode_code_page_strict(UINT code_page, PyObject **outbytes,
7468                         PyObject *unicode, Py_ssize_t offset, int len,
7469                         const char* errors)
7470 {
7471     BOOL usedDefaultChar = FALSE;
7472     BOOL *pusedDefaultChar = &usedDefaultChar;
7473     int outsize;
7474     wchar_t *p;
7475     Py_ssize_t size;
7476     const DWORD flags = encode_code_page_flags(code_page, NULL);
7477     char *out;
7478     /* Create a substring so that we can get the UTF-16 representation
7479        of just the slice under consideration. */
7480     PyObject *substring;
7481 
7482     assert(len > 0);
7483 
7484     if (code_page != CP_UTF8 && code_page != CP_UTF7)
7485         pusedDefaultChar = &usedDefaultChar;
7486     else
7487         pusedDefaultChar = NULL;
7488 
7489     substring = PyUnicode_Substring(unicode, offset, offset+len);
7490     if (substring == NULL)
7491         return -1;
7492     p = PyUnicode_AsUnicodeAndSize(substring, &size);
7493     if (p == NULL) {
7494         Py_DECREF(substring);
7495         return -1;
7496     }
7497     assert(size <= INT_MAX);
7498 
7499     /* First get the size of the result */
7500     outsize = WideCharToMultiByte(code_page, flags,
7501                                   p, (int)size,
7502                                   NULL, 0,
7503                                   NULL, pusedDefaultChar);
7504     if (outsize <= 0)
7505         goto error;
7506     /* If we used a default char, then we failed! */
7507     if (pusedDefaultChar && *pusedDefaultChar) {
7508         Py_DECREF(substring);
7509         return -2;
7510     }
7511 
7512     if (*outbytes == NULL) {
7513         /* Create string object */
7514         *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7515         if (*outbytes == NULL) {
7516             Py_DECREF(substring);
7517             return -1;
7518         }
7519         out = PyBytes_AS_STRING(*outbytes);
7520     }
7521     else {
7522         /* Extend string object */
7523         const Py_ssize_t n = PyBytes_Size(*outbytes);
7524         if (outsize > PY_SSIZE_T_MAX - n) {
7525             PyErr_NoMemory();
7526             Py_DECREF(substring);
7527             return -1;
7528         }
7529         if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7530             Py_DECREF(substring);
7531             return -1;
7532         }
7533         out = PyBytes_AS_STRING(*outbytes) + n;
7534     }
7535 
7536     /* Do the conversion */
7537     outsize = WideCharToMultiByte(code_page, flags,
7538                                   p, (int)size,
7539                                   out, outsize,
7540                                   NULL, pusedDefaultChar);
7541     Py_CLEAR(substring);
7542     if (outsize <= 0)
7543         goto error;
7544     if (pusedDefaultChar && *pusedDefaultChar)
7545         return -2;
7546     return 0;
7547 
7548 error:
7549     Py_XDECREF(substring);
7550     if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7551         return -2;
7552     PyErr_SetFromWindowsErr(0);
7553     return -1;
7554 }
7555 
7556 /*
7557  * Encode a Unicode string to a Windows code page into a byte string using an
7558  * error handler.
7559  *
7560  * Returns consumed characters if succeed, or raise an OSError and returns
7561  * -1 on other error.
7562  */
7563 static int
encode_code_page_errors(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t unicode_offset,Py_ssize_t insize,const char * errors)7564 encode_code_page_errors(UINT code_page, PyObject **outbytes,
7565                         PyObject *unicode, Py_ssize_t unicode_offset,
7566                         Py_ssize_t insize, const char* errors)
7567 {
7568     const DWORD flags = encode_code_page_flags(code_page, errors);
7569     Py_ssize_t pos = unicode_offset;
7570     Py_ssize_t endin = unicode_offset + insize;
7571     /* Ideally, we should get reason from FormatMessage. This is the Windows
7572        2000 English version of the message. */
7573     const char *reason = "invalid character";
7574     /* 4=maximum length of a UTF-8 sequence */
7575     char buffer[4];
7576     BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7577     Py_ssize_t outsize;
7578     char *out;
7579     PyObject *errorHandler = NULL;
7580     PyObject *exc = NULL;
7581     PyObject *encoding_obj = NULL;
7582     const char *encoding;
7583     Py_ssize_t newpos, newoutsize;
7584     PyObject *rep;
7585     int ret = -1;
7586 
7587     assert(insize > 0);
7588 
7589     encoding = code_page_name(code_page, &encoding_obj);
7590     if (encoding == NULL)
7591         return -1;
7592 
7593     if (errors == NULL || strcmp(errors, "strict") == 0) {
7594         /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7595            then we raise a UnicodeEncodeError. */
7596         make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7597         if (exc != NULL) {
7598             PyCodec_StrictErrors(exc);
7599             Py_DECREF(exc);
7600         }
7601         Py_XDECREF(encoding_obj);
7602         return -1;
7603     }
7604 
7605     if (code_page != CP_UTF8 && code_page != CP_UTF7)
7606         pusedDefaultChar = &usedDefaultChar;
7607     else
7608         pusedDefaultChar = NULL;
7609 
7610     if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7611         PyErr_NoMemory();
7612         goto error;
7613     }
7614     outsize = insize * Py_ARRAY_LENGTH(buffer);
7615 
7616     if (*outbytes == NULL) {
7617         /* Create string object */
7618         *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7619         if (*outbytes == NULL)
7620             goto error;
7621         out = PyBytes_AS_STRING(*outbytes);
7622     }
7623     else {
7624         /* Extend string object */
7625         Py_ssize_t n = PyBytes_Size(*outbytes);
7626         if (n > PY_SSIZE_T_MAX - outsize) {
7627             PyErr_NoMemory();
7628             goto error;
7629         }
7630         if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7631             goto error;
7632         out = PyBytes_AS_STRING(*outbytes) + n;
7633     }
7634 
7635     /* Encode the string character per character */
7636     while (pos < endin)
7637     {
7638         Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7639         wchar_t chars[2];
7640         int charsize;
7641         if (ch < 0x10000) {
7642             chars[0] = (wchar_t)ch;
7643             charsize = 1;
7644         }
7645         else {
7646             chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7647             chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
7648             charsize = 2;
7649         }
7650 
7651         outsize = WideCharToMultiByte(code_page, flags,
7652                                       chars, charsize,
7653                                       buffer, Py_ARRAY_LENGTH(buffer),
7654                                       NULL, pusedDefaultChar);
7655         if (outsize > 0) {
7656             if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7657             {
7658                 pos++;
7659                 memcpy(out, buffer, outsize);
7660                 out += outsize;
7661                 continue;
7662             }
7663         }
7664         else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7665             PyErr_SetFromWindowsErr(0);
7666             goto error;
7667         }
7668 
7669         rep = unicode_encode_call_errorhandler(
7670                   errors, &errorHandler, encoding, reason,
7671                   unicode, &exc,
7672                   pos, pos + 1, &newpos);
7673         if (rep == NULL)
7674             goto error;
7675         pos = newpos;
7676 
7677         if (PyBytes_Check(rep)) {
7678             outsize = PyBytes_GET_SIZE(rep);
7679             if (outsize != 1) {
7680                 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7681                 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7682                 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7683                     Py_DECREF(rep);
7684                     goto error;
7685                 }
7686                 out = PyBytes_AS_STRING(*outbytes) + offset;
7687             }
7688             memcpy(out, PyBytes_AS_STRING(rep), outsize);
7689             out += outsize;
7690         }
7691         else {
7692             Py_ssize_t i;
7693             enum PyUnicode_Kind kind;
7694             void *data;
7695 
7696             if (PyUnicode_READY(rep) == -1) {
7697                 Py_DECREF(rep);
7698                 goto error;
7699             }
7700 
7701             outsize = PyUnicode_GET_LENGTH(rep);
7702             if (outsize != 1) {
7703                 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7704                 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7705                 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7706                     Py_DECREF(rep);
7707                     goto error;
7708                 }
7709                 out = PyBytes_AS_STRING(*outbytes) + offset;
7710             }
7711             kind = PyUnicode_KIND(rep);
7712             data = PyUnicode_DATA(rep);
7713             for (i=0; i < outsize; i++) {
7714                 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7715                 if (ch > 127) {
7716                     raise_encode_exception(&exc,
7717                         encoding, unicode,
7718                         pos, pos + 1,
7719                         "unable to encode error handler result to ASCII");
7720                     Py_DECREF(rep);
7721                     goto error;
7722                 }
7723                 *out = (unsigned char)ch;
7724                 out++;
7725             }
7726         }
7727         Py_DECREF(rep);
7728     }
7729     /* write a NUL byte */
7730     *out = 0;
7731     outsize = out - PyBytes_AS_STRING(*outbytes);
7732     assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7733     if (_PyBytes_Resize(outbytes, outsize) < 0)
7734         goto error;
7735     ret = 0;
7736 
7737 error:
7738     Py_XDECREF(encoding_obj);
7739     Py_XDECREF(errorHandler);
7740     Py_XDECREF(exc);
7741     return ret;
7742 }
7743 
7744 static PyObject *
encode_code_page(int code_page,PyObject * unicode,const char * errors)7745 encode_code_page(int code_page,
7746                  PyObject *unicode,
7747                  const char *errors)
7748 {
7749     Py_ssize_t len;
7750     PyObject *outbytes = NULL;
7751     Py_ssize_t offset;
7752     int chunk_len, ret, done;
7753 
7754     if (!PyUnicode_Check(unicode)) {
7755         PyErr_BadArgument();
7756         return NULL;
7757     }
7758 
7759     if (PyUnicode_READY(unicode) == -1)
7760         return NULL;
7761     len = PyUnicode_GET_LENGTH(unicode);
7762 
7763     if (code_page < 0) {
7764         PyErr_SetString(PyExc_ValueError, "invalid code page number");
7765         return NULL;
7766     }
7767 
7768     if (len == 0)
7769         return PyBytes_FromStringAndSize(NULL, 0);
7770 
7771     offset = 0;
7772     do
7773     {
7774 #ifdef NEED_RETRY
7775         if (len > DECODING_CHUNK_SIZE) {
7776             chunk_len = DECODING_CHUNK_SIZE;
7777             done = 0;
7778         }
7779         else
7780 #endif
7781         {
7782             chunk_len = (int)len;
7783             done = 1;
7784         }
7785 
7786         ret = encode_code_page_strict(code_page, &outbytes,
7787                                       unicode, offset, chunk_len,
7788                                       errors);
7789         if (ret == -2)
7790             ret = encode_code_page_errors(code_page, &outbytes,
7791                                           unicode, offset,
7792                                           chunk_len, errors);
7793         if (ret < 0) {
7794             Py_XDECREF(outbytes);
7795             return NULL;
7796         }
7797 
7798         offset += chunk_len;
7799         len -= chunk_len;
7800     } while (!done);
7801 
7802     return outbytes;
7803 }
7804 
7805 PyObject *
PyUnicode_EncodeMBCS(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7806 PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7807                      Py_ssize_t size,
7808                      const char *errors)
7809 {
7810     PyObject *unicode, *res;
7811     unicode = PyUnicode_FromWideChar(p, size);
7812     if (unicode == NULL)
7813         return NULL;
7814     res = encode_code_page(CP_ACP, unicode, errors);
7815     Py_DECREF(unicode);
7816     return res;
7817 }
7818 
7819 PyObject *
PyUnicode_EncodeCodePage(int code_page,PyObject * unicode,const char * errors)7820 PyUnicode_EncodeCodePage(int code_page,
7821                          PyObject *unicode,
7822                          const char *errors)
7823 {
7824     return encode_code_page(code_page, unicode, errors);
7825 }
7826 
7827 PyObject *
PyUnicode_AsMBCSString(PyObject * unicode)7828 PyUnicode_AsMBCSString(PyObject *unicode)
7829 {
7830     return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7831 }
7832 
7833 #undef NEED_RETRY
7834 
7835 #endif /* MS_WINDOWS */
7836 
7837 /* --- Character Mapping Codec -------------------------------------------- */
7838 
/* Decode `size` bytes at `s` into `writer`, using the str object `mapping`
   as a lookup table: input byte b is translated to mapping[b].

   A byte whose value is not below the length of the mapping, or which maps
   to 0xFFFE, is treated as "undefined" and handed to the `errors` handler
   through unicode_decode_call_errorhandler_writer().

   Return 0 on success, -1 on failure. */
static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    Py_ssize_t maplen;
    enum PyUnicode_Kind mapkind;
    void *mapdata;
    Py_UCS4 x;
    unsigned char ch;

    if (PyUnicode_READY(mapping) == -1)
        return -1;

    maplen = PyUnicode_GET_LENGTH(mapping);
    mapdata = PyUnicode_DATA(mapping);
    mapkind = PyUnicode_KIND(mapping);

    /* e points one past the last input byte */
    e = s + size;

    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
         * is disabled in encoding aliases, latin1 is preferred because
         * its implementation is faster. */
        /* A 1-byte mapping cannot hold 0xFFFE and maplen >= 256 covers every
           byte value, so no input byte can be undefined on this path and the
           error handler is never needed. */
        Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
        Py_UCS4 maxchar = writer->maxchar;

        assert (writer->kind == PyUnicode_1BYTE_KIND);
        while (s < e) {
            ch = *s;
            x = mapdata_ucs1[ch];
            if (x > maxchar) {
                /* The character is above the writer's current maxchar:
                   prepare the writer for characters up to 0xff, then
                   re-read the cached maxchar and data pointer from it. */
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
                    goto onError;
                maxchar = writer->maxchar;
                outdata = (Py_UCS1 *)writer->data;
            }
            outdata[writer->pos] = x;
            writer->pos++;
            ++s;
        }
        return 0;
    }

    while (s < e) {
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* Tight inner loops for a 2-byte mapping that covers all byte
               values.  They write straight into the writer's buffer and
               jump to Error (the generic handling below) for any character
               they cannot store directly.  The writer kind is re-read on
               every pass of the outer loop. */
            enum PyUnicode_Kind outkind = writer->kind;
            Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
            if (outkind == PyUnicode_1BYTE_KIND) {
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
                Py_UCS4 maxchar = writer->maxchar;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* Covers both 0xFFFE and any character above the
                       writer's maxchar; the code at Error sorts them out. */
                    if (x > maxchar)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                /* all input consumed: leave the outer loop */
                break;
            }
            else if (outkind == PyUnicode_2BYTE_KIND) {
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    if (x == 0xFFFE)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                /* all input consumed: leave the outer loop */
                break;
            }
        }
        /* Generic path: one byte per iteration, any mapping kind/length. */
        ch = *s;

        if (ch < maplen)
            x = PyUnicode_READ(mapkind, mapdata, ch);
        else
            x = 0xfffe; /* invalid value */
Error:
        /* Also reached by goto from the inner loops above, with x and s
           describing the character that could not be stored directly. */
        if (x == 0xfffe)
        {
            /* undefined mapping */
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }
            /* s is passed by address to the handler call above and is not
               advanced here before restarting the loop */
            continue;
        }

        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
            goto onError;
        ++s;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
7957 
7958 static int
charmap_decode_mapping(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)7959 charmap_decode_mapping(const char *s,
7960                        Py_ssize_t size,
7961                        PyObject *mapping,
7962                        const char *errors,
7963                        _PyUnicodeWriter *writer)
7964 {
7965     const char *starts = s;
7966     const char *e;
7967     Py_ssize_t startinpos, endinpos;
7968     PyObject *errorHandler = NULL, *exc = NULL;
7969     unsigned char ch;
7970     PyObject *key, *item = NULL;
7971 
7972     e = s + size;
7973 
7974     while (s < e) {
7975         ch = *s;
7976 
7977         /* Get mapping (char ordinal -> integer, Unicode char or None) */
7978         key = PyLong_FromLong((long)ch);
7979         if (key == NULL)
7980             goto onError;
7981 
7982         item = PyObject_GetItem(mapping, key);
7983         Py_DECREF(key);
7984         if (item == NULL) {
7985             if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7986                 /* No mapping found means: mapping is undefined. */
7987                 PyErr_Clear();
7988                 goto Undefined;
7989             } else
7990                 goto onError;
7991         }
7992 
7993         /* Apply mapping */
7994         if (item == Py_None)
7995             goto Undefined;
7996         if (PyLong_Check(item)) {
7997             long value = PyLong_AS_LONG(item);
7998             if (value == 0xFFFE)
7999                 goto Undefined;
8000             if (value < 0 || value > MAX_UNICODE) {
8001                 PyErr_Format(PyExc_TypeError,
8002                              "character mapping must be in range(0x%x)",
8003                              (unsigned long)MAX_UNICODE + 1);
8004                 goto onError;
8005             }
8006 
8007             if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8008                 goto onError;
8009         }
8010         else if (PyUnicode_Check(item)) {
8011             if (PyUnicode_READY(item) == -1)
8012                 goto onError;
8013             if (PyUnicode_GET_LENGTH(item) == 1) {
8014                 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8015                 if (value == 0xFFFE)
8016                     goto Undefined;
8017                 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8018                     goto onError;
8019             }
8020             else {
8021                 writer->overallocate = 1;
8022                 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8023                     goto onError;
8024             }
8025         }
8026         else {
8027             /* wrong return value */
8028             PyErr_SetString(PyExc_TypeError,
8029                             "character mapping must return integer, None or str");
8030             goto onError;
8031         }
8032         Py_CLEAR(item);
8033         ++s;
8034         continue;
8035 
8036 Undefined:
8037         /* undefined mapping */
8038         Py_CLEAR(item);
8039         startinpos = s-starts;
8040         endinpos = startinpos+1;
8041         if (unicode_decode_call_errorhandler_writer(
8042                 errors, &errorHandler,
8043                 "charmap", "character maps to <undefined>",
8044                 &starts, &e, &startinpos, &endinpos, &exc, &s,
8045                 writer)) {
8046             goto onError;
8047         }
8048     }
8049     Py_XDECREF(errorHandler);
8050     Py_XDECREF(exc);
8051     return 0;
8052 
8053 onError:
8054     Py_XDECREF(item);
8055     Py_XDECREF(errorHandler);
8056     Py_XDECREF(exc);
8057     return -1;
8058 }
8059 
8060 PyObject *
PyUnicode_DecodeCharmap(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors)8061 PyUnicode_DecodeCharmap(const char *s,
8062                         Py_ssize_t size,
8063                         PyObject *mapping,
8064                         const char *errors)
8065 {
8066     _PyUnicodeWriter writer;
8067 
8068     /* Default to Latin-1 */
8069     if (mapping == NULL)
8070         return PyUnicode_DecodeLatin1(s, size, errors);
8071 
8072     if (size == 0)
8073         _Py_RETURN_UNICODE_EMPTY();
8074     _PyUnicodeWriter_Init(&writer);
8075     writer.min_length = size;
8076     if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8077         goto onError;
8078 
8079     if (PyUnicode_CheckExact(mapping)) {
8080         if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8081             goto onError;
8082     }
8083     else {
8084         if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8085             goto onError;
8086     }
8087     return _PyUnicodeWriter_Finish(&writer);
8088 
8089   onError:
8090     _PyUnicodeWriter_Dealloc(&writer);
8091     return NULL;
8092 }
8093 
8094 /* Charmap encoding: the lookup table */
8095 
/* Three-level lookup table (trie) mapping a BMP code point to a byte.
   The code point is split into bits 15..11 (level 1), bits 10..7 (level 2)
   and bits 6..0 (level 3). */
struct encoding_map {
    PyObject_HEAD
    /* level 1: index of the 16-entry level-2 block, 0xFF if there is none */
    unsigned char level1[32];
    /* number of level-2 blocks (16 bytes each) and of level-3 blocks
       (128 bytes each) stored in level23 */
    int count2, count3;
    /* All level-2 blocks followed by all level-3 blocks.  Declared with one
       element; PyUnicode_BuildEncodingMap() allocates the object with
       16*count2 + 128*count3 - 1 extra bytes to hold the real data. */
    unsigned char level23[1];
};
8102 
8103 static PyObject*
encoding_map_size(PyObject * obj,PyObject * args)8104 encoding_map_size(PyObject *obj, PyObject* args)
8105 {
8106     struct encoding_map *map = (struct encoding_map*)obj;
8107     return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
8108                            128*map->count3);
8109 }
8110 
/* Method table of the EncodingMap type: a single no-argument method,
   size(), implemented by encoding_map_size().  The zeroed entry terminates
   the table. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};
8116 
/* Type object for the trie objects created by PyUnicode_BuildEncodingMap().
   Only the name, the basic size, the default flags and the method table
   are filled in; every other slot is left as 0. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    0,                      /*tp_dealloc*/
    0,                      /*tp_vectorcall_offset*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_as_async*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
8160 
/* Build an encoding table from a decoding table.

   `string` is a non-empty str whose character at index i is the code point
   that byte i decodes to; at most the first 256 characters are used.

   The result is either
     - a dict {code point: byte index}, when character 0 is not U+0000,
       when a later character is U+0000 or lies outside the BMP, or when
       the trie would need too many blocks; or
     - an EncodingMap object holding a three-level trie.

   Return NULL with an exception set if `string` is not a non-empty str or
   if an allocation fails. */
PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{
    PyObject *result;
    struct encoding_map *mresult;
    int i;
    int need_dict = 0;
    /* Scratch tables for the counting pass; 0xFF marks an unused slot.
       level2 is indexed by ch >> 7, i.e. 512 possible slots for the BMP. */
    unsigned char level1[32];
    unsigned char level2[512];
    unsigned char *mlevel1, *mlevel2, *mlevel3;
    int count2 = 0, count3 = 0;
    int kind;
    void *data;
    Py_ssize_t length;
    Py_UCS4 ch;

    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
        PyErr_BadArgument();
        return NULL;
    }
    kind = PyUnicode_KIND(string);
    data = PyUnicode_DATA(string);
    length = PyUnicode_GET_LENGTH(string);
    /* only byte values 0..255 can be encoded */
    length = Py_MIN(length, 256);
    memset(level1, 0xFF, sizeof level1);
    memset(level2, 0xFF, sizeof level2);

    /* If there isn't a one-to-one mapping of NULL to \0,
       or if there are non-BMP characters, we need to use
       a mapping dictionary. */
    if (PyUnicode_READ(kind, data, 0) != 0)
        need_dict = 1;
    /* First pass: count how many level-2 and level-3 blocks are needed. */
    for (i = 1; i < length; i++) {
        int l1, l2;
        ch = PyUnicode_READ(kind, data, i);
        if (ch == 0 || ch > 0xFFFF) {
            need_dict = 1;
            break;
        }
        if (ch == 0xFFFE)
            /* unmapped character */
            continue;
        l1 = ch >> 11;
        l2 = ch >> 7;
        if (level1[l1] == 0xFF)
            level1[l1] = count2++;
        if (level2[l2] == 0xFF)
            level2[l2] = count3++;
    }

    /* 0xFF is the "unused" marker, so block indices must stay below it. */
    if (count2 >= 0xFF || count3 >= 0xFF)
        need_dict = 1;

    if (need_dict) {
        /* Dictionary fallback: map every character to its byte index. */
        PyObject *result = PyDict_New();
        PyObject *key, *value;
        if (!result)
            return NULL;
        for (i = 0; i < length; i++) {
            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
            value = PyLong_FromLong(i);
            if (!key || !value)
                goto failed1;
            if (PyDict_SetItem(result, key, value) == -1)
                goto failed1;
            Py_DECREF(key);
            Py_DECREF(value);
        }
        return result;
      failed1:
        Py_XDECREF(key);
        Py_XDECREF(value);
        Py_DECREF(result);
        return NULL;
    }

    /* Create a three-level trie */
    /* The struct already contains one byte of level23, hence the -1. */
    result = PyObject_MALLOC(sizeof(struct encoding_map) +
                             16*count2 + 128*count3 - 1);
    if (!result)
        return PyErr_NoMemory();
    PyObject_Init(result, &EncodingMapType);
    mresult = (struct encoding_map*)result;
    mresult->count2 = count2;
    mresult->count3 = count3;
    mlevel1 = mresult->level1;
    mlevel2 = mresult->level23;
    mlevel3 = mresult->level23 + 16*count2;
    memcpy(mlevel1, level1, 32);
    /* level 2 starts out unused (0xFF); in level 3 a 0 byte means
       "no mapping" */
    memset(mlevel2, 0xFF, 16*count2);
    memset(mlevel3, 0, 128*count3);
    /* Second pass: hand out level-3 blocks again, in the same order as in
       the first pass, and store the byte value of each character. */
    count3 = 0;
    for (i = 1; i < length; i++) {
        int o1, o2, o3, i2, i3;
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        if (ch == 0xFFFE)
            /* unmapped character */
            continue;
        o1 = ch>>11;
        o2 = (ch>>7) & 0xF;
        i2 = 16*mlevel1[o1] + o2;
        if (mlevel2[i2] == 0xFF)
            mlevel2[i2] = count3++;
        o3 = ch & 0x7F;
        i3 = 128*mlevel2[i2] + o3;
        mlevel3[i3] = i;
    }
    return result;
}
8270 
8271 static int
encoding_map_lookup(Py_UCS4 c,PyObject * mapping)8272 encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8273 {
8274     struct encoding_map *map = (struct encoding_map*)mapping;
8275     int l1 = c>>11;
8276     int l2 = (c>>7) & 0xF;
8277     int l3 = c & 0x7F;
8278     int i;
8279 
8280     if (c > 0xFFFF)
8281         return -1;
8282     if (c == 0)
8283         return 0;
8284     /* level 1*/
8285     i = map->level1[l1];
8286     if (i == 0xFF) {
8287         return -1;
8288     }
8289     /* level 2*/
8290     i = map->level23[16*i+l2];
8291     if (i == 0xFF) {
8292         return -1;
8293     }
8294     /* level 3 */
8295     i = map->level23[16*map->count2 + 128*i + l3];
8296     if (i == 0) {
8297         return -1;
8298     }
8299     return i;
8300 }
8301 
8302 /* Lookup the character ch in the mapping. If the character
8303    can't be found, Py_None is returned (or NULL, if another
8304    error occurred). */
8305 static PyObject *
charmapencode_lookup(Py_UCS4 c,PyObject * mapping)8306 charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8307 {
8308     PyObject *w = PyLong_FromLong((long)c);
8309     PyObject *x;
8310 
8311     if (w == NULL)
8312         return NULL;
8313     x = PyObject_GetItem(mapping, w);
8314     Py_DECREF(w);
8315     if (x == NULL) {
8316         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8317             /* No mapping found means: mapping is undefined. */
8318             PyErr_Clear();
8319             Py_RETURN_NONE;
8320         } else
8321             return NULL;
8322     }
8323     else if (x == Py_None)
8324         return x;
8325     else if (PyLong_Check(x)) {
8326         long value = PyLong_AS_LONG(x);
8327         if (value < 0 || value > 255) {
8328             PyErr_SetString(PyExc_TypeError,
8329                             "character mapping must be in range(256)");
8330             Py_DECREF(x);
8331             return NULL;
8332         }
8333         return x;
8334     }
8335     else if (PyBytes_Check(x))
8336         return x;
8337     else {
8338         /* wrong return value */
8339         PyErr_Format(PyExc_TypeError,
8340                      "character mapping must return integer, bytes or None, not %.400s",
8341                      x->ob_type->tp_name);
8342         Py_DECREF(x);
8343         return NULL;
8344     }
8345 }
8346 
8347 static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)8348 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8349 {
8350     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8351     /* exponentially overallocate to minimize reallocations */
8352     if (requiredsize < 2*outsize)
8353         requiredsize = 2*outsize;
8354     if (_PyBytes_Resize(outobj, requiredsize))
8355         return -1;
8356     return 0;
8357 }
8358 
/* Outcome of charmapencode_output():
     enc_SUCCESS   - the character was encoded and written to the output
     enc_FAILED    - the mapping is undefined for the character
     enc_EXCEPTION - the lookup or an output resize failed */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
} charmapencode_result;
8362 /* lookup the character, put the result in the output string and adjust
8363    various state variables. Resize the output bytes object if not enough
8364    space is available. Return a new reference to the object that
8365    was put in the output buffer, or Py_None, if the mapping was undefined
8366    (in which case no character was written) or NULL, if a
8367    reallocation error occurred. The caller must decref the result */
8368 static charmapencode_result
charmapencode_output(Py_UCS4 c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)8369 charmapencode_output(Py_UCS4 c, PyObject *mapping,
8370                      PyObject **outobj, Py_ssize_t *outpos)
8371 {
8372     PyObject *rep;
8373     char *outstart;
8374     Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8375 
8376     if (Py_TYPE(mapping) == &EncodingMapType) {
8377         int res = encoding_map_lookup(c, mapping);
8378         Py_ssize_t requiredsize = *outpos+1;
8379         if (res == -1)
8380             return enc_FAILED;
8381         if (outsize<requiredsize)
8382             if (charmapencode_resize(outobj, outpos, requiredsize))
8383                 return enc_EXCEPTION;
8384         outstart = PyBytes_AS_STRING(*outobj);
8385         outstart[(*outpos)++] = (char)res;
8386         return enc_SUCCESS;
8387     }
8388 
8389     rep = charmapencode_lookup(c, mapping);
8390     if (rep==NULL)
8391         return enc_EXCEPTION;
8392     else if (rep==Py_None) {
8393         Py_DECREF(rep);
8394         return enc_FAILED;
8395     } else {
8396         if (PyLong_Check(rep)) {
8397             Py_ssize_t requiredsize = *outpos+1;
8398             if (outsize<requiredsize)
8399                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8400                     Py_DECREF(rep);
8401                     return enc_EXCEPTION;
8402                 }
8403             outstart = PyBytes_AS_STRING(*outobj);
8404             outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8405         }
8406         else {
8407             const char *repchars = PyBytes_AS_STRING(rep);
8408             Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8409             Py_ssize_t requiredsize = *outpos+repsize;
8410             if (outsize<requiredsize)
8411                 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8412                     Py_DECREF(rep);
8413                     return enc_EXCEPTION;
8414                 }
8415             outstart = PyBytes_AS_STRING(*outobj);
8416             memcpy(outstart + *outpos, repchars, repsize);
8417             *outpos += repsize;
8418         }
8419     }
8420     Py_DECREF(rep);
8421     return enc_SUCCESS;
8422 }
8423 
/* handle an error in PyUnicode_EncodeCharmap
   Return 0 on success, -1 on error

   Called when the character at *inpos in `unicode` cannot be encoded with
   `mapping`.  The run of consecutive unencodable characters starting there
   is collected and then treated according to the error handler named by
   `errors`:
     strict            - raise the encode exception
     replace           - output '?' (encoded through `mapping`) for each
                         character of the run
     ignore            - output nothing
     xmlcharrefreplace - output "&#N;" (encoded through `mapping`) for each
                         character of the run
     anything else     - call the registered handler and output its
                         replacement (bytes are copied verbatim, str is
                         encoded through `mapping`)
   On success *inpos is moved past the handled input and the output has
   been appended to *res at *respos (both updated).  *error_handler caches
   the resolved handler kind across calls. */
static int
charmap_encoding_error(
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
    PyObject **exceptionObject,
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
    PyObject **res, Py_ssize_t *respos)
{
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    Py_ssize_t size, repsize;
    Py_ssize_t newpos;
    enum PyUnicode_Kind kind;
    void *data;
    Py_ssize_t index;
    /* startpos for collecting unencodable chars */
    Py_ssize_t collstartpos = *inpos;
    Py_ssize_t collendpos = *inpos+1;
    Py_ssize_t collpos;
    const char *encoding = "charmap";
    const char *reason = "character maps to <undefined>";
    charmapencode_result x;
    Py_UCS4 ch;
    int val;

    if (PyUnicode_READY(unicode) == -1)
        return -1;
    size = PyUnicode_GET_LENGTH(unicode);
    /* find all unencodable characters */
    while (collendpos < size) {
        PyObject *rep;
        if (Py_TYPE(mapping) == &EncodingMapType) {
            /* trie lookup: no Python objects involved */
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
            val = encoding_map_lookup(ch, mapping);
            if (val != -1)
                break;
            ++collendpos;
            continue;
        }

        ch = PyUnicode_READ_CHAR(unicode, collendpos);
        rep = charmapencode_lookup(ch, mapping);
        if (rep==NULL)
            return -1;
        else if (rep!=Py_None) {
            /* first encodable character: the run ends here */
            Py_DECREF(rep);
            break;
        }
        Py_DECREF(rep);
        ++collendpos;
    }
    /* cache callback name lookup
     * (if not done yet, i.e. it's the first error) */
    if (*error_handler == _Py_ERROR_UNKNOWN)
        *error_handler = _Py_GetErrorHandler(errors);

    switch (*error_handler) {
    case _Py_ERROR_STRICT:
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
        return -1;

    case _Py_ERROR_REPLACE:
        /* one '?' per unencodable character; fails if '?' itself has no
           mapping */
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
            x = charmapencode_output('?', mapping, res, respos);
            if (x==enc_EXCEPTION) {
                return -1;
            }
            else if (x==enc_FAILED) {
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        /* fall through */
    case _Py_ERROR_IGNORE:
        /* skip the whole run */
        *inpos = collendpos;
        break;

    case _Py_ERROR_XMLCHARREFREPLACE:
        /* generate the "&#N;" replacement for each character of the run */
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
            char buffer[2+29+1+1];
            char *cp;
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
            /* the reference itself must be encodable with the mapping */
            for (cp = buffer; *cp; ++cp) {
                x = charmapencode_output(*cp, mapping, res, respos);
                if (x==enc_EXCEPTION)
                    return -1;
                else if (x==enc_FAILED) {
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                    return -1;
                }
            }
        }
        *inpos = collendpos;
        break;

    default:
        /* any other handler: call it; newpos receives the position at
           which encoding resumes */
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
                                                      encoding, reason, unicode, exceptionObject,
                                                      collstartpos, collendpos, &newpos);
        if (repunicode == NULL)
            return -1;
        if (PyBytes_Check(repunicode)) {
            /* Directly copy bytes result to output. */
            Py_ssize_t outsize = PyBytes_Size(*res);
            Py_ssize_t requiredsize;
            repsize = PyBytes_Size(repunicode);
            requiredsize = *respos + repsize;
            if (requiredsize > outsize)
                /* Make room for all additional bytes. */
                if (charmapencode_resize(res, respos, requiredsize)) {
                    Py_DECREF(repunicode);
                    return -1;
                }
            memcpy(PyBytes_AsString(*res) + *respos,
                   PyBytes_AsString(repunicode),  repsize);
            *respos += repsize;
            *inpos = newpos;
            Py_DECREF(repunicode);
            break;
        }
        /* generate replacement  */
        if (PyUnicode_READY(repunicode) == -1) {
            Py_DECREF(repunicode);
            return -1;
        }
        repsize = PyUnicode_GET_LENGTH(repunicode);
        data = PyUnicode_DATA(repunicode);
        kind = PyUnicode_KIND(repunicode);
        /* the str replacement is itself encoded through the mapping */
        for (index = 0; index < repsize; index++) {
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
            x = charmapencode_output(repch, mapping, res, respos);
            if (x==enc_EXCEPTION) {
                Py_DECREF(repunicode);
                return -1;
            }
            else if (x==enc_FAILED) {
                Py_DECREF(repunicode);
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        *inpos = newpos;
        Py_DECREF(repunicode);
    }
    return 0;
}
8571 
8572 PyObject *
_PyUnicode_EncodeCharmap(PyObject * unicode,PyObject * mapping,const char * errors)8573 _PyUnicode_EncodeCharmap(PyObject *unicode,
8574                          PyObject *mapping,
8575                          const char *errors)
8576 {
8577     /* output object */
8578     PyObject *res = NULL;
8579     /* current input position */
8580     Py_ssize_t inpos = 0;
8581     Py_ssize_t size;
8582     /* current output position */
8583     Py_ssize_t respos = 0;
8584     PyObject *error_handler_obj = NULL;
8585     PyObject *exc = NULL;
8586     _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8587     void *data;
8588     int kind;
8589 
8590     if (PyUnicode_READY(unicode) == -1)
8591         return NULL;
8592     size = PyUnicode_GET_LENGTH(unicode);
8593     data = PyUnicode_DATA(unicode);
8594     kind = PyUnicode_KIND(unicode);
8595 
8596     /* Default to Latin-1 */
8597     if (mapping == NULL)
8598         return unicode_encode_ucs1(unicode, errors, 256);
8599 
8600     /* allocate enough for a simple encoding without
8601        replacements, if we need more, we'll resize */
8602     res = PyBytes_FromStringAndSize(NULL, size);
8603     if (res == NULL)
8604         goto onError;
8605     if (size == 0)
8606         return res;
8607 
8608     while (inpos<size) {
8609         Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8610         /* try to encode it */
8611         charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8612         if (x==enc_EXCEPTION) /* error */
8613             goto onError;
8614         if (x==enc_FAILED) { /* unencodable character */
8615             if (charmap_encoding_error(unicode, &inpos, mapping,
8616                                        &exc,
8617                                        &error_handler, &error_handler_obj, errors,
8618                                        &res, &respos)) {
8619                 goto onError;
8620             }
8621         }
8622         else
8623             /* done with this character => adjust input position */
8624             ++inpos;
8625     }
8626 
8627     /* Resize if we allocated to much */
8628     if (respos<PyBytes_GET_SIZE(res))
8629         if (_PyBytes_Resize(&res, respos) < 0)
8630             goto onError;
8631 
8632     Py_XDECREF(exc);
8633     Py_XDECREF(error_handler_obj);
8634     return res;
8635 
8636   onError:
8637     Py_XDECREF(res);
8638     Py_XDECREF(exc);
8639     Py_XDECREF(error_handler_obj);
8640     return NULL;
8641 }
8642 
8643 /* Deprecated */
8644 PyObject *
PyUnicode_EncodeCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)8645 PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8646                         Py_ssize_t size,
8647                         PyObject *mapping,
8648                         const char *errors)
8649 {
8650     PyObject *result;
8651     PyObject *unicode = PyUnicode_FromWideChar(p, size);
8652     if (unicode == NULL)
8653         return NULL;
8654     result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8655     Py_DECREF(unicode);
8656     return result;
8657 }
8658 
8659 PyObject *
PyUnicode_AsCharmapString(PyObject * unicode,PyObject * mapping)8660 PyUnicode_AsCharmapString(PyObject *unicode,
8661                           PyObject *mapping)
8662 {
8663     if (!PyUnicode_Check(unicode) || mapping == NULL) {
8664         PyErr_BadArgument();
8665         return NULL;
8666     }
8667     return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8668 }
8669 
8670 /* create or adjust a UnicodeTranslateError */
8671 static void
make_translate_exception(PyObject ** exceptionObject,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)8672 make_translate_exception(PyObject **exceptionObject,
8673                          PyObject *unicode,
8674                          Py_ssize_t startpos, Py_ssize_t endpos,
8675                          const char *reason)
8676 {
8677     if (*exceptionObject == NULL) {
8678         *exceptionObject = _PyUnicodeTranslateError_Create(
8679             unicode, startpos, endpos, reason);
8680     }
8681     else {
8682         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8683             goto onError;
8684         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8685             goto onError;
8686         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8687             goto onError;
8688         return;
8689       onError:
8690         Py_CLEAR(*exceptionObject);
8691     }
8692 }
8693 
8694 /* error handling callback helper:
8695    build arguments, call the callback and check the arguments,
8696    put the result into newpos and return the replacement string, which
8697    has to be freed by the caller */
8698 static PyObject *
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)8699 unicode_translate_call_errorhandler(const char *errors,
8700                                     PyObject **errorHandler,
8701                                     const char *reason,
8702                                     PyObject *unicode, PyObject **exceptionObject,
8703                                     Py_ssize_t startpos, Py_ssize_t endpos,
8704                                     Py_ssize_t *newpos)
8705 {
8706     static const char *argparse = "Un;translating error handler must return (str, int) tuple";
8707 
8708     Py_ssize_t i_newpos;
8709     PyObject *restuple;
8710     PyObject *resunicode;
8711 
8712     if (*errorHandler == NULL) {
8713         *errorHandler = PyCodec_LookupError(errors);
8714         if (*errorHandler == NULL)
8715             return NULL;
8716     }
8717 
8718     make_translate_exception(exceptionObject,
8719                              unicode, startpos, endpos, reason);
8720     if (*exceptionObject == NULL)
8721         return NULL;
8722 
8723     restuple = PyObject_CallFunctionObjArgs(
8724         *errorHandler, *exceptionObject, NULL);
8725     if (restuple == NULL)
8726         return NULL;
8727     if (!PyTuple_Check(restuple)) {
8728         PyErr_SetString(PyExc_TypeError, &argparse[3]);
8729         Py_DECREF(restuple);
8730         return NULL;
8731     }
8732     if (!PyArg_ParseTuple(restuple, argparse,
8733                           &resunicode, &i_newpos)) {
8734         Py_DECREF(restuple);
8735         return NULL;
8736     }
8737     if (i_newpos<0)
8738         *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8739     else
8740         *newpos = i_newpos;
8741     if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8742         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8743         Py_DECREF(restuple);
8744         return NULL;
8745     }
8746     Py_INCREF(resunicode);
8747     Py_DECREF(restuple);
8748     return resunicode;
8749 }
8750 
8751 /* Lookup the character ch in the mapping and put the result in result,
8752    which must be decrefed by the caller.
8753    Return 0 on success, -1 on error */
8754 static int
charmaptranslate_lookup(Py_UCS4 c,PyObject * mapping,PyObject ** result)8755 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8756 {
8757     PyObject *w = PyLong_FromLong((long)c);
8758     PyObject *x;
8759 
8760     if (w == NULL)
8761         return -1;
8762     x = PyObject_GetItem(mapping, w);
8763     Py_DECREF(w);
8764     if (x == NULL) {
8765         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8766             /* No mapping found means: use 1:1 mapping. */
8767             PyErr_Clear();
8768             *result = NULL;
8769             return 0;
8770         } else
8771             return -1;
8772     }
8773     else if (x == Py_None) {
8774         *result = x;
8775         return 0;
8776     }
8777     else if (PyLong_Check(x)) {
8778         long value = PyLong_AS_LONG(x);
8779         if (value < 0 || value > MAX_UNICODE) {
8780             PyErr_Format(PyExc_ValueError,
8781                          "character mapping must be in range(0x%x)",
8782                          MAX_UNICODE+1);
8783             Py_DECREF(x);
8784             return -1;
8785         }
8786         *result = x;
8787         return 0;
8788     }
8789     else if (PyUnicode_Check(x)) {
8790         *result = x;
8791         return 0;
8792     }
8793     else {
8794         /* wrong return value */
8795         PyErr_SetString(PyExc_TypeError,
8796                         "character mapping must return integer, None or str");
8797         Py_DECREF(x);
8798         return -1;
8799     }
8800 }
8801 
8802 /* lookup the character, write the result into the writer.
8803    Return 1 if the result was written into the writer, return 0 if the mapping
8804    was undefined, raise an exception return -1 on error. */
8805 static int
charmaptranslate_output(Py_UCS4 ch,PyObject * mapping,_PyUnicodeWriter * writer)8806 charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8807                         _PyUnicodeWriter *writer)
8808 {
8809     PyObject *item;
8810 
8811     if (charmaptranslate_lookup(ch, mapping, &item))
8812         return -1;
8813 
8814     if (item == NULL) {
8815         /* not found => default to 1:1 mapping */
8816         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8817             return -1;
8818         }
8819         return 1;
8820     }
8821 
8822     if (item == Py_None) {
8823         Py_DECREF(item);
8824         return 0;
8825     }
8826 
8827     if (PyLong_Check(item)) {
8828         long ch = (Py_UCS4)PyLong_AS_LONG(item);
8829         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8830            used it */
8831         if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8832             Py_DECREF(item);
8833             return -1;
8834         }
8835         Py_DECREF(item);
8836         return 1;
8837     }
8838 
8839     if (!PyUnicode_Check(item)) {
8840         Py_DECREF(item);
8841         return -1;
8842     }
8843 
8844     if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8845         Py_DECREF(item);
8846         return -1;
8847     }
8848 
8849     Py_DECREF(item);
8850     return 1;
8851 }
8852 
8853 static int
unicode_fast_translate_lookup(PyObject * mapping,Py_UCS1 ch,Py_UCS1 * translate)8854 unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8855                               Py_UCS1 *translate)
8856 {
8857     PyObject *item = NULL;
8858     int ret = 0;
8859 
8860     if (charmaptranslate_lookup(ch, mapping, &item)) {
8861         return -1;
8862     }
8863 
8864     if (item == Py_None) {
8865         /* deletion */
8866         translate[ch] = 0xfe;
8867     }
8868     else if (item == NULL) {
8869         /* not found => default to 1:1 mapping */
8870         translate[ch] = ch;
8871         return 1;
8872     }
8873     else if (PyLong_Check(item)) {
8874         long replace = PyLong_AS_LONG(item);
8875         /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8876            used it */
8877         if (127 < replace) {
8878             /* invalid character or character outside ASCII:
8879                skip the fast translate */
8880             goto exit;
8881         }
8882         translate[ch] = (Py_UCS1)replace;
8883     }
8884     else if (PyUnicode_Check(item)) {
8885         Py_UCS4 replace;
8886 
8887         if (PyUnicode_READY(item) == -1) {
8888             Py_DECREF(item);
8889             return -1;
8890         }
8891         if (PyUnicode_GET_LENGTH(item) != 1)
8892             goto exit;
8893 
8894         replace = PyUnicode_READ_CHAR(item, 0);
8895         if (replace > 127)
8896             goto exit;
8897         translate[ch] = (Py_UCS1)replace;
8898     }
8899     else {
8900         /* not None, NULL, long or unicode */
8901         goto exit;
8902     }
8903     ret = 1;
8904 
8905   exit:
8906     Py_DECREF(item);
8907     return ret;
8908 }
8909 
8910 /* Fast path for ascii => ascii translation. Return 1 if the whole string
8911    was translated into writer, return 0 if the input string was partially
8912    translated into writer, raise an exception and return -1 on error. */
8913 static int
unicode_fast_translate(PyObject * input,PyObject * mapping,_PyUnicodeWriter * writer,int ignore,Py_ssize_t * input_pos)8914 unicode_fast_translate(PyObject *input, PyObject *mapping,
8915                        _PyUnicodeWriter *writer, int ignore,
8916                        Py_ssize_t *input_pos)
8917 {
8918     Py_UCS1 ascii_table[128], ch, ch2;
8919     Py_ssize_t len;
8920     Py_UCS1 *in, *end, *out;
8921     int res = 0;
8922 
8923     len = PyUnicode_GET_LENGTH(input);
8924 
8925     memset(ascii_table, 0xff, 128);
8926 
8927     in = PyUnicode_1BYTE_DATA(input);
8928     end = in + len;
8929 
8930     assert(PyUnicode_IS_ASCII(writer->buffer));
8931     assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8932     out = PyUnicode_1BYTE_DATA(writer->buffer);
8933 
8934     for (; in < end; in++) {
8935         ch = *in;
8936         ch2 = ascii_table[ch];
8937         if (ch2 == 0xff) {
8938             int translate = unicode_fast_translate_lookup(mapping, ch,
8939                                                           ascii_table);
8940             if (translate < 0)
8941                 return -1;
8942             if (translate == 0)
8943                 goto exit;
8944             ch2 = ascii_table[ch];
8945         }
8946         if (ch2 == 0xfe) {
8947             if (ignore)
8948                 continue;
8949             goto exit;
8950         }
8951         assert(ch2 < 128);
8952         *out = ch2;
8953         out++;
8954     }
8955     res = 1;
8956 
8957 exit:
8958     writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8959     *input_pos = in - PyUnicode_1BYTE_DATA(input);
8960     return res;
8961 }
8962 
8963 static PyObject *
_PyUnicode_TranslateCharmap(PyObject * input,PyObject * mapping,const char * errors)8964 _PyUnicode_TranslateCharmap(PyObject *input,
8965                             PyObject *mapping,
8966                             const char *errors)
8967 {
8968     /* input object */
8969     char *data;
8970     Py_ssize_t size, i;
8971     int kind;
8972     /* output buffer */
8973     _PyUnicodeWriter writer;
8974     /* error handler */
8975     const char *reason = "character maps to <undefined>";
8976     PyObject *errorHandler = NULL;
8977     PyObject *exc = NULL;
8978     int ignore;
8979     int res;
8980 
8981     if (mapping == NULL) {
8982         PyErr_BadArgument();
8983         return NULL;
8984     }
8985 
8986     if (PyUnicode_READY(input) == -1)
8987         return NULL;
8988     data = (char*)PyUnicode_DATA(input);
8989     kind = PyUnicode_KIND(input);
8990     size = PyUnicode_GET_LENGTH(input);
8991 
8992     if (size == 0)
8993         return PyUnicode_FromObject(input);
8994 
8995     /* allocate enough for a simple 1:1 translation without
8996        replacements, if we need more, we'll resize */
8997     _PyUnicodeWriter_Init(&writer);
8998     if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
8999         goto onError;
9000 
9001     ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9002 
9003     if (PyUnicode_READY(input) == -1)
9004         return NULL;
9005     if (PyUnicode_IS_ASCII(input)) {
9006         res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9007         if (res < 0) {
9008             _PyUnicodeWriter_Dealloc(&writer);
9009             return NULL;
9010         }
9011         if (res == 1)
9012             return _PyUnicodeWriter_Finish(&writer);
9013     }
9014     else {
9015         i = 0;
9016     }
9017 
9018     while (i<size) {
9019         /* try to encode it */
9020         int translate;
9021         PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9022         Py_ssize_t newpos;
9023         /* startpos for collecting untranslatable chars */
9024         Py_ssize_t collstart;
9025         Py_ssize_t collend;
9026         Py_UCS4 ch;
9027 
9028         ch = PyUnicode_READ(kind, data, i);
9029         translate = charmaptranslate_output(ch, mapping, &writer);
9030         if (translate < 0)
9031             goto onError;
9032 
9033         if (translate != 0) {
9034             /* it worked => adjust input pointer */
9035             ++i;
9036             continue;
9037         }
9038 
9039         /* untranslatable character */
9040         collstart = i;
9041         collend = i+1;
9042 
9043         /* find all untranslatable characters */
9044         while (collend < size) {
9045             PyObject *x;
9046             ch = PyUnicode_READ(kind, data, collend);
9047             if (charmaptranslate_lookup(ch, mapping, &x))
9048                 goto onError;
9049             Py_XDECREF(x);
9050             if (x != Py_None)
9051                 break;
9052             ++collend;
9053         }
9054 
9055         if (ignore) {
9056             i = collend;
9057         }
9058         else {
9059             repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9060                                                              reason, input, &exc,
9061                                                              collstart, collend, &newpos);
9062             if (repunicode == NULL)
9063                 goto onError;
9064             if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9065                 Py_DECREF(repunicode);
9066                 goto onError;
9067             }
9068             Py_DECREF(repunicode);
9069             i = newpos;
9070         }
9071     }
9072     Py_XDECREF(exc);
9073     Py_XDECREF(errorHandler);
9074     return _PyUnicodeWriter_Finish(&writer);
9075 
9076   onError:
9077     _PyUnicodeWriter_Dealloc(&writer);
9078     Py_XDECREF(exc);
9079     Py_XDECREF(errorHandler);
9080     return NULL;
9081 }
9082 
9083 /* Deprecated. Use PyUnicode_Translate instead. */
9084 PyObject *
PyUnicode_TranslateCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)9085 PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9086                            Py_ssize_t size,
9087                            PyObject *mapping,
9088                            const char *errors)
9089 {
9090     PyObject *result;
9091     PyObject *unicode = PyUnicode_FromWideChar(p, size);
9092     if (!unicode)
9093         return NULL;
9094     result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9095     Py_DECREF(unicode);
9096     return result;
9097 }
9098 
9099 PyObject *
PyUnicode_Translate(PyObject * str,PyObject * mapping,const char * errors)9100 PyUnicode_Translate(PyObject *str,
9101                     PyObject *mapping,
9102                     const char *errors)
9103 {
9104     if (ensure_unicode(str) < 0)
9105         return NULL;
9106     return _PyUnicode_TranslateCharmap(str, mapping, errors);
9107 }
9108 
9109 PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject * unicode)9110 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9111 {
9112     if (!PyUnicode_Check(unicode)) {
9113         PyErr_BadInternalCall();
9114         return NULL;
9115     }
9116     if (PyUnicode_READY(unicode) == -1)
9117         return NULL;
9118     if (PyUnicode_IS_ASCII(unicode)) {
9119         /* If the string is already ASCII, just return the same string */
9120         Py_INCREF(unicode);
9121         return unicode;
9122     }
9123 
9124     Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9125     PyObject *result = PyUnicode_New(len, 127);
9126     if (result == NULL) {
9127         return NULL;
9128     }
9129 
9130     Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9131     int kind = PyUnicode_KIND(unicode);
9132     const void *data = PyUnicode_DATA(unicode);
9133     Py_ssize_t i;
9134     for (i = 0; i < len; ++i) {
9135         Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9136         if (ch < 127) {
9137             out[i] = ch;
9138         }
9139         else if (Py_UNICODE_ISSPACE(ch)) {
9140             out[i] = ' ';
9141         }
9142         else {
9143             int decimal = Py_UNICODE_TODECIMAL(ch);
9144             if (decimal < 0) {
9145                 out[i] = '?';
9146                 out[i+1] = '\0';
9147                 _PyUnicode_LENGTH(result) = i + 1;
9148                 break;
9149             }
9150             out[i] = '0' + decimal;
9151         }
9152     }
9153 
9154     assert(_PyUnicode_CheckConsistency(result, 1));
9155     return result;
9156 }
9157 
9158 PyObject *
PyUnicode_TransformDecimalToASCII(Py_UNICODE * s,Py_ssize_t length)9159 PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9160                                   Py_ssize_t length)
9161 {
9162     PyObject *decimal;
9163     Py_ssize_t i;
9164     Py_UCS4 maxchar;
9165     enum PyUnicode_Kind kind;
9166     void *data;
9167 
9168     maxchar = 127;
9169     for (i = 0; i < length; i++) {
9170         Py_UCS4 ch = s[i];
9171         if (ch > 127) {
9172             int decimal = Py_UNICODE_TODECIMAL(ch);
9173             if (decimal >= 0)
9174                 ch = '0' + decimal;
9175             maxchar = Py_MAX(maxchar, ch);
9176         }
9177     }
9178 
9179     /* Copy to a new string */
9180     decimal = PyUnicode_New(length, maxchar);
9181     if (decimal == NULL)
9182         return decimal;
9183     kind = PyUnicode_KIND(decimal);
9184     data = PyUnicode_DATA(decimal);
9185     /* Iterate over code points */
9186     for (i = 0; i < length; i++) {
9187         Py_UCS4 ch = s[i];
9188         if (ch > 127) {
9189             int decimal = Py_UNICODE_TODECIMAL(ch);
9190             if (decimal >= 0)
9191                 ch = '0' + decimal;
9192         }
9193         PyUnicode_WRITE(kind, data, i, ch);
9194     }
9195     return unicode_result(decimal);
9196 }
9197 /* --- Decimal Encoder ---------------------------------------------------- */
9198 
9199 int
PyUnicode_EncodeDecimal(Py_UNICODE * s,Py_ssize_t length,char * output,const char * errors)9200 PyUnicode_EncodeDecimal(Py_UNICODE *s,
9201                         Py_ssize_t length,
9202                         char *output,
9203                         const char *errors)
9204 {
9205     PyObject *unicode;
9206     Py_ssize_t i;
9207     enum PyUnicode_Kind kind;
9208     void *data;
9209 
9210     if (output == NULL) {
9211         PyErr_BadArgument();
9212         return -1;
9213     }
9214 
9215     unicode = PyUnicode_FromWideChar(s, length);
9216     if (unicode == NULL)
9217         return -1;
9218 
9219     kind = PyUnicode_KIND(unicode);
9220     data = PyUnicode_DATA(unicode);
9221 
9222     for (i=0; i < length; ) {
9223         PyObject *exc;
9224         Py_UCS4 ch;
9225         int decimal;
9226         Py_ssize_t startpos;
9227 
9228         ch = PyUnicode_READ(kind, data, i);
9229 
9230         if (Py_UNICODE_ISSPACE(ch)) {
9231             *output++ = ' ';
9232             i++;
9233             continue;
9234         }
9235         decimal = Py_UNICODE_TODECIMAL(ch);
9236         if (decimal >= 0) {
9237             *output++ = '0' + decimal;
9238             i++;
9239             continue;
9240         }
9241         if (0 < ch && ch < 256) {
9242             *output++ = (char)ch;
9243             i++;
9244             continue;
9245         }
9246 
9247         startpos = i;
9248         exc = NULL;
9249         raise_encode_exception(&exc, "decimal", unicode,
9250                                startpos, startpos+1,
9251                                "invalid decimal Unicode string");
9252         Py_XDECREF(exc);
9253         Py_DECREF(unicode);
9254         return -1;
9255     }
9256     /* 0-terminate the output string */
9257     *output++ = '\0';
9258     Py_DECREF(unicode);
9259     return 0;
9260 }
9261 
9262 /* --- Helpers ------------------------------------------------------------ */
9263 
/* Helper macro to fix up start/end slice values against a length `len`.

   - `end` greater than `len` is clamped to `len`; a negative `end` has
     `len` added to it and is clamped to 0 if still negative.
   - a negative `start` has `len` added to it and is clamped to 0 if still
     negative; a `start` greater than `len` is left unchanged.

   The macro assigns to `start` and `end`, so both must be modifiable
   lvalues.  It expands to two consecutive `if` statements that are not
   wrapped in do { } while (0), so it must be used as a statement of its
   own and not as the unbraced body of an if/else. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
9278 
9279 static Py_ssize_t
any_find_slice(PyObject * s1,PyObject * s2,Py_ssize_t start,Py_ssize_t end,int direction)9280 any_find_slice(PyObject* s1, PyObject* s2,
9281                Py_ssize_t start,
9282                Py_ssize_t end,
9283                int direction)
9284 {
9285     int kind1, kind2;
9286     void *buf1, *buf2;
9287     Py_ssize_t len1, len2, result;
9288 
9289     kind1 = PyUnicode_KIND(s1);
9290     kind2 = PyUnicode_KIND(s2);
9291     if (kind1 < kind2)
9292         return -1;
9293 
9294     len1 = PyUnicode_GET_LENGTH(s1);
9295     len2 = PyUnicode_GET_LENGTH(s2);
9296     ADJUST_INDICES(start, end, len1);
9297     if (end - start < len2)
9298         return -1;
9299 
9300     buf1 = PyUnicode_DATA(s1);
9301     buf2 = PyUnicode_DATA(s2);
9302     if (len2 == 1) {
9303         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9304         result = findchar((const char *)buf1 + kind1*start,
9305                           kind1, end - start, ch, direction);
9306         if (result == -1)
9307             return -1;
9308         else
9309             return start + result;
9310     }
9311 
9312     if (kind2 != kind1) {
9313         buf2 = _PyUnicode_AsKind(s2, kind1);
9314         if (!buf2)
9315             return -2;
9316     }
9317 
9318     if (direction > 0) {
9319         switch (kind1) {
9320         case PyUnicode_1BYTE_KIND:
9321             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9322                 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9323             else
9324                 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9325             break;
9326         case PyUnicode_2BYTE_KIND:
9327             result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9328             break;
9329         case PyUnicode_4BYTE_KIND:
9330             result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9331             break;
9332         default:
9333             Py_UNREACHABLE();
9334         }
9335     }
9336     else {
9337         switch (kind1) {
9338         case PyUnicode_1BYTE_KIND:
9339             if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9340                 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9341             else
9342                 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9343             break;
9344         case PyUnicode_2BYTE_KIND:
9345             result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9346             break;
9347         case PyUnicode_4BYTE_KIND:
9348             result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9349             break;
9350         default:
9351             Py_UNREACHABLE();
9352         }
9353     }
9354 
9355     if (kind2 != kind1)
9356         PyMem_Free(buf2);
9357 
9358     return result;
9359 }
9360 
9361 /* _PyUnicode_InsertThousandsGrouping() helper functions */
9362 #include "stringlib/localeutil.h"
9363 
/**
 * InsertThousandsGrouping:
 * @writer: Unicode writer, or NULL to select counting mode (see below).
 * @n_buffer: In filling mode, offset added to @writer->pos to obtain the
 *            initial buffer position handed to
 *            InsertThousandsGrouping_fill(); in counting mode it is used
 *            as that initial position directly.
 * @digits: String the digits are read from. Must be non-NULL in filling
 *          mode and NULL in counting mode (asserted).
 * @d_pos: Start of digits string.
 * @n_digits: The number of digits in the string, in which we want
 *            to put the grouping chars.
 * @min_width: The minimum width of the digits in the output string.
 *             Output will be zero-padded on the left to fill.
 *             Negative values are treated as 0.
 * @grouping: see definition in localeconv().
 * @thousands_sep: see definition in localeconv().
 * @maxchar: Must be non-NULL in counting mode, where it is initialised to
 *           127 before being passed on to InsertThousandsGrouping_fill(),
 *           and NULL in filling mode (asserted).
 *
 * There are 2 modes: counting and filling. If @writer is NULL,
 *  we are in counting mode, else filling mode.
 * If counting, the required buffer size is returned.
 * If filling, we know the buffer will be large enough, so we don't
 *  need to pass in the buffer size.
 * Inserts thousand grouping characters (as defined by grouping and
 *  thousands_sep) into @writer.
 *
 * Return value: -1 on error, number of characters otherwise.
 **/
Py_ssize_t
_PyUnicode_InsertThousandsGrouping(
    _PyUnicodeWriter *writer,
    Py_ssize_t n_buffer,
    PyObject *digits,
    Py_ssize_t d_pos,
    Py_ssize_t n_digits,
    Py_ssize_t min_width,
    const char *grouping,
    PyObject *thousands_sep,
    Py_UCS4 *maxchar)
{
    min_width = Py_MAX(0, min_width);
    if (writer) {
        assert(digits != NULL);
        assert(maxchar == NULL);
    }
    else {
        assert(digits == NULL);
        assert(maxchar != NULL);
    }
    assert(0 <= d_pos);
    assert(0 <= n_digits);
    assert(grouping != NULL);

    if (digits != NULL) {
        if (PyUnicode_READY(digits) == -1) {
            return -1;
        }
    }
    if (PyUnicode_READY(thousands_sep) == -1) {
        return -1;
    }

    Py_ssize_t count = 0;
    Py_ssize_t n_zeros;
    int loop_broken = 0;
    int use_separator = 0; /* First time through, don't append the
                              separator. They only go between
                              groups. */
    Py_ssize_t buffer_pos;
    Py_ssize_t digits_pos;
    Py_ssize_t len;
    Py_ssize_t n_chars;
    Py_ssize_t remaining = n_digits; /* Number of chars remaining to
                                        be looked at */
    /* A generator that returns all of the grouping widths, until it
       returns 0. */
    GroupGenerator groupgen;
    GroupGenerator_init(&groupgen, grouping);
    const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);

    /* if digits are not grouped, thousands separator
       should be an empty string */
    assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));

    /* Both positions start at the end of their respective ranges. */
    digits_pos = d_pos + n_digits;
    if (writer) {
        buffer_pos = writer->pos + n_buffer;
        assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
        assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
    }
    else {
        buffer_pos = n_buffer;
    }

    if (!writer) {
        *maxchar = 127;
    }

    while ((len = GroupGenerator_next(&groupgen)) > 0) {
        /* Clip the group width to what is still needed, but use at
           least one character. */
        len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
        n_zeros = Py_MAX(0, len - remaining);
        n_chars = Py_MAX(0, Py_MIN(remaining, len));

        /* Use n_zero zero's and n_chars chars */

        /* Account for this group (and its separator) in the total. */
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;

        /* Copy into the writer. */
        InsertThousandsGrouping_fill(writer, &buffer_pos,
                                     digits, &digits_pos,
                                     n_chars, n_zeros,
                                     use_separator ? thousands_sep : NULL,
                                     thousands_sep_len, maxchar);

        /* Use a separator next time. */
        use_separator = 1;

        remaining -= n_chars;
        min_width -= len;

        if (remaining <= 0 && min_width <= 0) {
            loop_broken = 1;
            break;
        }
        /* The separator in front of the next group also counts towards
           the minimum width. */
        min_width -= thousands_sep_len;
    }
    if (!loop_broken) {
        /* We left the loop without using a break statement: the generator
           returned 0, so everything that is left goes into one final
           group. */

        len = Py_MAX(Py_MAX(remaining, min_width), 1);
        n_zeros = Py_MAX(0, len - remaining);
        n_chars = Py_MAX(0, Py_MIN(remaining, len));

        /* Use n_zero zero's and n_chars chars */
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;

        /* Copy into the writer. */
        InsertThousandsGrouping_fill(writer, &buffer_pos,
                                     digits, &digits_pos,
                                     n_chars, n_zeros,
                                     use_separator ? thousands_sep : NULL,
                                     thousands_sep_len, maxchar);
    }
    return count;
}
9505 
9506 
9507 Py_ssize_t
PyUnicode_Count(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)9508 PyUnicode_Count(PyObject *str,
9509                 PyObject *substr,
9510                 Py_ssize_t start,
9511                 Py_ssize_t end)
9512 {
9513     Py_ssize_t result;
9514     int kind1, kind2;
9515     void *buf1 = NULL, *buf2 = NULL;
9516     Py_ssize_t len1, len2;
9517 
9518     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9519         return -1;
9520 
9521     kind1 = PyUnicode_KIND(str);
9522     kind2 = PyUnicode_KIND(substr);
9523     if (kind1 < kind2)
9524         return 0;
9525 
9526     len1 = PyUnicode_GET_LENGTH(str);
9527     len2 = PyUnicode_GET_LENGTH(substr);
9528     ADJUST_INDICES(start, end, len1);
9529     if (end - start < len2)
9530         return 0;
9531 
9532     buf1 = PyUnicode_DATA(str);
9533     buf2 = PyUnicode_DATA(substr);
9534     if (kind2 != kind1) {
9535         buf2 = _PyUnicode_AsKind(substr, kind1);
9536         if (!buf2)
9537             goto onError;
9538     }
9539 
9540     switch (kind1) {
9541     case PyUnicode_1BYTE_KIND:
9542         if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
9543             result = asciilib_count(
9544                 ((Py_UCS1*)buf1) + start, end - start,
9545                 buf2, len2, PY_SSIZE_T_MAX
9546                 );
9547         else
9548             result = ucs1lib_count(
9549                 ((Py_UCS1*)buf1) + start, end - start,
9550                 buf2, len2, PY_SSIZE_T_MAX
9551                 );
9552         break;
9553     case PyUnicode_2BYTE_KIND:
9554         result = ucs2lib_count(
9555             ((Py_UCS2*)buf1) + start, end - start,
9556             buf2, len2, PY_SSIZE_T_MAX
9557             );
9558         break;
9559     case PyUnicode_4BYTE_KIND:
9560         result = ucs4lib_count(
9561             ((Py_UCS4*)buf1) + start, end - start,
9562             buf2, len2, PY_SSIZE_T_MAX
9563             );
9564         break;
9565     default:
9566         Py_UNREACHABLE();
9567     }
9568 
9569     if (kind2 != kind1)
9570         PyMem_Free(buf2);
9571 
9572     return result;
9573   onError:
9574     if (kind2 != kind1 && buf2)
9575         PyMem_Free(buf2);
9576     return -1;
9577 }
9578 
9579 Py_ssize_t
PyUnicode_Find(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9580 PyUnicode_Find(PyObject *str,
9581                PyObject *substr,
9582                Py_ssize_t start,
9583                Py_ssize_t end,
9584                int direction)
9585 {
9586     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9587         return -2;
9588 
9589     return any_find_slice(str, substr, start, end, direction);
9590 }
9591 
9592 Py_ssize_t
PyUnicode_FindChar(PyObject * str,Py_UCS4 ch,Py_ssize_t start,Py_ssize_t end,int direction)9593 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9594                    Py_ssize_t start, Py_ssize_t end,
9595                    int direction)
9596 {
9597     int kind;
9598     Py_ssize_t len, result;
9599     if (PyUnicode_READY(str) == -1)
9600         return -2;
9601     len = PyUnicode_GET_LENGTH(str);
9602     ADJUST_INDICES(start, end, len);
9603     if (end - start < 1)
9604         return -1;
9605     kind = PyUnicode_KIND(str);
9606     result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9607                       kind, end-start, ch, direction);
9608     if (result == -1)
9609         return -1;
9610     else
9611         return start + result;
9612 }
9613 
9614 static int
tailmatch(PyObject * self,PyObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)9615 tailmatch(PyObject *self,
9616           PyObject *substring,
9617           Py_ssize_t start,
9618           Py_ssize_t end,
9619           int direction)
9620 {
9621     int kind_self;
9622     int kind_sub;
9623     void *data_self;
9624     void *data_sub;
9625     Py_ssize_t offset;
9626     Py_ssize_t i;
9627     Py_ssize_t end_sub;
9628 
9629     if (PyUnicode_READY(self) == -1 ||
9630         PyUnicode_READY(substring) == -1)
9631         return -1;
9632 
9633     ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9634     end -= PyUnicode_GET_LENGTH(substring);
9635     if (end < start)
9636         return 0;
9637 
9638     if (PyUnicode_GET_LENGTH(substring) == 0)
9639         return 1;
9640 
9641     kind_self = PyUnicode_KIND(self);
9642     data_self = PyUnicode_DATA(self);
9643     kind_sub = PyUnicode_KIND(substring);
9644     data_sub = PyUnicode_DATA(substring);
9645     end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9646 
9647     if (direction > 0)
9648         offset = end;
9649     else
9650         offset = start;
9651 
9652     if (PyUnicode_READ(kind_self, data_self, offset) ==
9653         PyUnicode_READ(kind_sub, data_sub, 0) &&
9654         PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9655         PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9656         /* If both are of the same kind, memcmp is sufficient */
9657         if (kind_self == kind_sub) {
9658             return ! memcmp((char *)data_self +
9659                                 (offset * PyUnicode_KIND(substring)),
9660                             data_sub,
9661                             PyUnicode_GET_LENGTH(substring) *
9662                                 PyUnicode_KIND(substring));
9663         }
9664         /* otherwise we have to compare each character by first accessing it */
9665         else {
9666             /* We do not need to compare 0 and len(substring)-1 because
9667                the if statement above ensured already that they are equal
9668                when we end up here. */
9669             for (i = 1; i < end_sub; ++i) {
9670                 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9671                     PyUnicode_READ(kind_sub, data_sub, i))
9672                     return 0;
9673             }
9674             return 1;
9675         }
9676     }
9677 
9678     return 0;
9679 }
9680 
9681 Py_ssize_t
PyUnicode_Tailmatch(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9682 PyUnicode_Tailmatch(PyObject *str,
9683                     PyObject *substr,
9684                     Py_ssize_t start,
9685                     Py_ssize_t end,
9686                     int direction)
9687 {
9688     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9689         return -1;
9690 
9691     return tailmatch(str, substr, start, end, direction);
9692 }
9693 
9694 static PyObject *
ascii_upper_or_lower(PyObject * self,int lower)9695 ascii_upper_or_lower(PyObject *self, int lower)
9696 {
9697     Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9698     char *resdata, *data = PyUnicode_DATA(self);
9699     PyObject *res;
9700 
9701     res = PyUnicode_New(len, 127);
9702     if (res == NULL)
9703         return NULL;
9704     resdata = PyUnicode_DATA(res);
9705     if (lower)
9706         _Py_bytes_lower(resdata, data, len);
9707     else
9708         _Py_bytes_upper(resdata, data, len);
9709     return res;
9710 }
9711 
9712 static Py_UCS4
handle_capital_sigma(int kind,void * data,Py_ssize_t length,Py_ssize_t i)9713 handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
9714 {
9715     Py_ssize_t j;
9716     int final_sigma;
9717     Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9718     /* U+03A3 is in the Final_Sigma context when, it is found like this:
9719 
9720      \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9721 
9722     where ! is a negation and \p{xxx} is a character with property xxx.
9723     */
9724     for (j = i - 1; j >= 0; j--) {
9725         c = PyUnicode_READ(kind, data, j);
9726         if (!_PyUnicode_IsCaseIgnorable(c))
9727             break;
9728     }
9729     final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9730     if (final_sigma) {
9731         for (j = i + 1; j < length; j++) {
9732             c = PyUnicode_READ(kind, data, j);
9733             if (!_PyUnicode_IsCaseIgnorable(c))
9734                 break;
9735         }
9736         final_sigma = j == length || !_PyUnicode_IsCased(c);
9737     }
9738     return (final_sigma) ? 0x3C2 : 0x3C3;
9739 }
9740 
9741 static int
lower_ucs4(int kind,void * data,Py_ssize_t length,Py_ssize_t i,Py_UCS4 c,Py_UCS4 * mapped)9742 lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9743            Py_UCS4 c, Py_UCS4 *mapped)
9744 {
9745     /* Obscure special case. */
9746     if (c == 0x3A3) {
9747         mapped[0] = handle_capital_sigma(kind, data, length, i);
9748         return 1;
9749     }
9750     return _PyUnicode_ToLowerFull(c, mapped);
9751 }
9752 
9753 static Py_ssize_t
do_capitalize(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9754 do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9755 {
9756     Py_ssize_t i, k = 0;
9757     int n_res, j;
9758     Py_UCS4 c, mapped[3];
9759 
9760     c = PyUnicode_READ(kind, data, 0);
9761     n_res = _PyUnicode_ToTitleFull(c, mapped);
9762     for (j = 0; j < n_res; j++) {
9763         *maxchar = Py_MAX(*maxchar, mapped[j]);
9764         res[k++] = mapped[j];
9765     }
9766     for (i = 1; i < length; i++) {
9767         c = PyUnicode_READ(kind, data, i);
9768         n_res = lower_ucs4(kind, data, length, i, c, mapped);
9769         for (j = 0; j < n_res; j++) {
9770             *maxchar = Py_MAX(*maxchar, mapped[j]);
9771             res[k++] = mapped[j];
9772         }
9773     }
9774     return k;
9775 }
9776 
9777 static Py_ssize_t
do_swapcase(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9778 do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9779     Py_ssize_t i, k = 0;
9780 
9781     for (i = 0; i < length; i++) {
9782         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9783         int n_res, j;
9784         if (Py_UNICODE_ISUPPER(c)) {
9785             n_res = lower_ucs4(kind, data, length, i, c, mapped);
9786         }
9787         else if (Py_UNICODE_ISLOWER(c)) {
9788             n_res = _PyUnicode_ToUpperFull(c, mapped);
9789         }
9790         else {
9791             n_res = 1;
9792             mapped[0] = c;
9793         }
9794         for (j = 0; j < n_res; j++) {
9795             *maxchar = Py_MAX(*maxchar, mapped[j]);
9796             res[k++] = mapped[j];
9797         }
9798     }
9799     return k;
9800 }
9801 
9802 static Py_ssize_t
do_upper_or_lower(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar,int lower)9803 do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9804                   Py_UCS4 *maxchar, int lower)
9805 {
9806     Py_ssize_t i, k = 0;
9807 
9808     for (i = 0; i < length; i++) {
9809         Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9810         int n_res, j;
9811         if (lower)
9812             n_res = lower_ucs4(kind, data, length, i, c, mapped);
9813         else
9814             n_res = _PyUnicode_ToUpperFull(c, mapped);
9815         for (j = 0; j < n_res; j++) {
9816             *maxchar = Py_MAX(*maxchar, mapped[j]);
9817             res[k++] = mapped[j];
9818         }
9819     }
9820     return k;
9821 }
9822 
9823 static Py_ssize_t
do_upper(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9824 do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9825 {
9826     return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9827 }
9828 
9829 static Py_ssize_t
do_lower(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9830 do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9831 {
9832     return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9833 }
9834 
9835 static Py_ssize_t
do_casefold(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9836 do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9837 {
9838     Py_ssize_t i, k = 0;
9839 
9840     for (i = 0; i < length; i++) {
9841         Py_UCS4 c = PyUnicode_READ(kind, data, i);
9842         Py_UCS4 mapped[3];
9843         int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9844         for (j = 0; j < n_res; j++) {
9845             *maxchar = Py_MAX(*maxchar, mapped[j]);
9846             res[k++] = mapped[j];
9847         }
9848     }
9849     return k;
9850 }
9851 
9852 static Py_ssize_t
do_title(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9853 do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9854 {
9855     Py_ssize_t i, k = 0;
9856     int previous_is_cased;
9857 
9858     previous_is_cased = 0;
9859     for (i = 0; i < length; i++) {
9860         const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9861         Py_UCS4 mapped[3];
9862         int n_res, j;
9863 
9864         if (previous_is_cased)
9865             n_res = lower_ucs4(kind, data, length, i, c, mapped);
9866         else
9867             n_res = _PyUnicode_ToTitleFull(c, mapped);
9868 
9869         for (j = 0; j < n_res; j++) {
9870             *maxchar = Py_MAX(*maxchar, mapped[j]);
9871             res[k++] = mapped[j];
9872         }
9873 
9874         previous_is_cased = _PyUnicode_IsCased(c);
9875     }
9876     return k;
9877 }
9878 
9879 static PyObject *
case_operation(PyObject * self,Py_ssize_t (* perform)(int,void *,Py_ssize_t,Py_UCS4 *,Py_UCS4 *))9880 case_operation(PyObject *self,
9881                Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9882 {
9883     PyObject *res = NULL;
9884     Py_ssize_t length, newlength = 0;
9885     int kind, outkind;
9886     void *data, *outdata;
9887     Py_UCS4 maxchar = 0, *tmp, *tmpend;
9888 
9889     assert(PyUnicode_IS_READY(self));
9890 
9891     kind = PyUnicode_KIND(self);
9892     data = PyUnicode_DATA(self);
9893     length = PyUnicode_GET_LENGTH(self);
9894     if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9895         PyErr_SetString(PyExc_OverflowError, "string is too long");
9896         return NULL;
9897     }
9898     tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9899     if (tmp == NULL)
9900         return PyErr_NoMemory();
9901     newlength = perform(kind, data, length, tmp, &maxchar);
9902     res = PyUnicode_New(newlength, maxchar);
9903     if (res == NULL)
9904         goto leave;
9905     tmpend = tmp + newlength;
9906     outdata = PyUnicode_DATA(res);
9907     outkind = PyUnicode_KIND(res);
9908     switch (outkind) {
9909     case PyUnicode_1BYTE_KIND:
9910         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9911         break;
9912     case PyUnicode_2BYTE_KIND:
9913         _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9914         break;
9915     case PyUnicode_4BYTE_KIND:
9916         memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9917         break;
9918     default:
9919         Py_UNREACHABLE();
9920     }
9921   leave:
9922     PyMem_FREE(tmp);
9923     return res;
9924 }
9925 
9926 PyObject *
PyUnicode_Join(PyObject * separator,PyObject * seq)9927 PyUnicode_Join(PyObject *separator, PyObject *seq)
9928 {
9929     PyObject *res;
9930     PyObject *fseq;
9931     Py_ssize_t seqlen;
9932     PyObject **items;
9933 
9934     fseq = PySequence_Fast(seq, "can only join an iterable");
9935     if (fseq == NULL) {
9936         return NULL;
9937     }
9938 
9939     /* NOTE: the following code can't call back into Python code,
9940      * so we are sure that fseq won't be mutated.
9941      */
9942 
9943     items = PySequence_Fast_ITEMS(fseq);
9944     seqlen = PySequence_Fast_GET_SIZE(fseq);
9945     res = _PyUnicode_JoinArray(separator, items, seqlen);
9946     Py_DECREF(fseq);
9947     return res;
9948 }
9949 
/* Join the `seqlen` objects in `items` into one new string, inserting
   `separator` between consecutive items.

   - An empty array yields the empty string.
   - A single item that is an exact str is returned itself (with a new
     reference); no separator is looked at in that case.
   - If `separator` is NULL a single space is used instead.

   Returns a new reference, or NULL with an exception set: TypeError if
   the separator or any item is not a str, OverflowError if the total
   length does not fit in Py_ssize_t, or whatever error the readiness
   checks or the allocation of the result raised. */
PyObject *
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
{
    PyObject *res = NULL; /* the result */
    PyObject *sep = NULL;
    Py_ssize_t seplen;
    PyObject *item;
    Py_ssize_t sz, i, res_offset;
    Py_UCS4 maxchar;
    Py_UCS4 item_maxchar;
    int use_memcpy;
    unsigned char *res_data = NULL, *sep_data = NULL;
    PyObject *last_obj;
    unsigned int kind = 0;

    /* If empty sequence, return u"". */
    if (seqlen == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* If singleton sequence with an exact Unicode, return that. */
    last_obj = NULL;
    if (seqlen == 1) {
        if (PyUnicode_CheckExact(items[0])) {
            res = items[0];
            Py_INCREF(res);
            return res;
        }
        /* A single non-exact item: no separator will ever be emitted,
           so it contributes neither length nor maximum character. */
        seplen = 0;
        maxchar = 0;
    }
    else {
        /* Set up sep and seplen */
        if (separator == NULL) {
            /* fall back to a blank space separator */
            sep = PyUnicode_FromOrdinal(' ');
            if (!sep)
                goto onError;
            seplen = 1;
            maxchar = 32;
        }
        else {
            if (!PyUnicode_Check(separator)) {
                PyErr_Format(PyExc_TypeError,
                             "separator: expected str instance,"
                             " %.80s found",
                             Py_TYPE(separator)->tp_name);
                goto onError;
            }
            if (PyUnicode_READY(separator))
                goto onError;
            sep = separator;
            seplen = PyUnicode_GET_LENGTH(separator);
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
            /* inc refcount to keep this code path symmetric with the
               above case of a blank separator */
            Py_INCREF(sep);
        }
        /* The separator takes part in the "same kind" check below. */
        last_obj = sep;
    }

    /* There are at least two things to join, or else we have a subclass
     * of str in the sequence.
     * Do a pre-pass to figure out the total amount of space we'll
     * need (sz), and see whether all argument are strings.
     */
    sz = 0;
#ifdef Py_DEBUG
    use_memcpy = 0;
#else
    use_memcpy = 1;
#endif
    for (i = 0; i < seqlen; i++) {
        size_t add_sz;
        item = items[i];
        if (!PyUnicode_Check(item)) {
            PyErr_Format(PyExc_TypeError,
                         "sequence item %zd: expected str instance,"
                         " %.80s found",
                         i, Py_TYPE(item)->tp_name);
            goto onError;
        }
        if (PyUnicode_READY(item) == -1)
            goto onError;
        add_sz = PyUnicode_GET_LENGTH(item);
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
        maxchar = Py_MAX(maxchar, item_maxchar);
        /* Every item but the first is preceded by a separator. */
        if (i != 0) {
            add_sz += seplen;
        }
        /* Check before adding so that sz itself never overflows. */
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
            PyErr_SetString(PyExc_OverflowError,
                            "join() result is too long for a Python string");
            goto onError;
        }
        sz += add_sz;
        /* The raw memcpy path is only kept while every object seen so
           far (separator included) has the same kind as its
           predecessor. */
        if (use_memcpy && last_obj != NULL) {
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
                use_memcpy = 0;
        }
        last_obj = item;
    }

    res = PyUnicode_New(sz, maxchar);
    if (res == NULL)
        goto onError;

    /* Catenate everything. */
#ifdef Py_DEBUG
    use_memcpy = 0;
#else
    if (use_memcpy) {
        res_data = PyUnicode_1BYTE_DATA(res);
        kind = PyUnicode_KIND(res);
        if (seplen != 0)
            sep_data = PyUnicode_1BYTE_DATA(sep);
    }
#endif
    if (use_memcpy) {
        /* All pieces share one kind: copy raw bytes, advancing the
           write pointer by kind * length each time. */
        for (i = 0; i < seqlen; ++i) {
            Py_ssize_t itemlen;
            item = items[i];

            /* Copy item, and maybe the separator. */
            if (i && seplen != 0) {
                memcpy(res_data,
                          sep_data,
                          kind * seplen);
                res_data += kind * seplen;
            }

            itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                memcpy(res_data,
                          PyUnicode_DATA(item),
                          kind * itemlen);
                res_data += kind * itemlen;
            }
        }
        assert(res_data == PyUnicode_1BYTE_DATA(res)
                           + kind * PyUnicode_GET_LENGTH(res));
    }
    else {
        /* Mixed kinds (or a debug build): copy character-wise, tracking
           the write position as a character offset. */
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
            Py_ssize_t itemlen;
            item = items[i];

            /* Copy item, and maybe the separator. */
            if (i && seplen != 0) {
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
                res_offset += seplen;
            }

            itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
                res_offset += itemlen;
            }
        }
        assert(res_offset == PyUnicode_GET_LENGTH(res));
    }

    /* sep is NULL in the single-item case, hence XDECREF. */
    Py_XDECREF(sep);
    assert(_PyUnicode_CheckConsistency(res, 1));
    return res;

  onError:
    /* Drop whatever was acquired so far; both may still be NULL. */
    Py_XDECREF(sep);
    Py_XDECREF(res);
    return NULL;
}
10121 
10122 void
_PyUnicode_FastFill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10123 _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10124                     Py_UCS4 fill_char)
10125 {
10126     const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10127     void *data = PyUnicode_DATA(unicode);
10128     assert(PyUnicode_IS_READY(unicode));
10129     assert(unicode_modifiable(unicode));
10130     assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10131     assert(start >= 0);
10132     assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10133     unicode_fill(kind, data, fill_char, start, length);
10134 }
10135 
10136 Py_ssize_t
PyUnicode_Fill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10137 PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10138                Py_UCS4 fill_char)
10139 {
10140     Py_ssize_t maxlen;
10141 
10142     if (!PyUnicode_Check(unicode)) {
10143         PyErr_BadInternalCall();
10144         return -1;
10145     }
10146     if (PyUnicode_READY(unicode) == -1)
10147         return -1;
10148     if (unicode_check_modifiable(unicode))
10149         return -1;
10150 
10151     if (start < 0) {
10152         PyErr_SetString(PyExc_IndexError, "string index out of range");
10153         return -1;
10154     }
10155     if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10156         PyErr_SetString(PyExc_ValueError,
10157                          "fill character is bigger than "
10158                          "the string maximum character");
10159         return -1;
10160     }
10161 
10162     maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10163     length = Py_MIN(maxlen, length);
10164     if (length <= 0)
10165         return 0;
10166 
10167     _PyUnicode_FastFill(unicode, start, length, fill_char);
10168     return length;
10169 }
10170 
10171 static PyObject *
pad(PyObject * self,Py_ssize_t left,Py_ssize_t right,Py_UCS4 fill)10172 pad(PyObject *self,
10173     Py_ssize_t left,
10174     Py_ssize_t right,
10175     Py_UCS4 fill)
10176 {
10177     PyObject *u;
10178     Py_UCS4 maxchar;
10179     int kind;
10180     void *data;
10181 
10182     if (left < 0)
10183         left = 0;
10184     if (right < 0)
10185         right = 0;
10186 
10187     if (left == 0 && right == 0)
10188         return unicode_result_unchanged(self);
10189 
10190     if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10191         right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10192         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10193         return NULL;
10194     }
10195     maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10196     maxchar = Py_MAX(maxchar, fill);
10197     u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10198     if (!u)
10199         return NULL;
10200 
10201     kind = PyUnicode_KIND(u);
10202     data = PyUnicode_DATA(u);
10203     if (left)
10204         unicode_fill(kind, data, fill, 0, left);
10205     if (right)
10206         unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10207     _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10208     assert(_PyUnicode_CheckConsistency(u, 1));
10209     return u;
10210 }
10211 
10212 PyObject *
PyUnicode_Splitlines(PyObject * string,int keepends)10213 PyUnicode_Splitlines(PyObject *string, int keepends)
10214 {
10215     PyObject *list;
10216 
10217     if (ensure_unicode(string) < 0)
10218         return NULL;
10219 
10220     switch (PyUnicode_KIND(string)) {
10221     case PyUnicode_1BYTE_KIND:
10222         if (PyUnicode_IS_ASCII(string))
10223             list = asciilib_splitlines(
10224                 string, PyUnicode_1BYTE_DATA(string),
10225                 PyUnicode_GET_LENGTH(string), keepends);
10226         else
10227             list = ucs1lib_splitlines(
10228                 string, PyUnicode_1BYTE_DATA(string),
10229                 PyUnicode_GET_LENGTH(string), keepends);
10230         break;
10231     case PyUnicode_2BYTE_KIND:
10232         list = ucs2lib_splitlines(
10233             string, PyUnicode_2BYTE_DATA(string),
10234             PyUnicode_GET_LENGTH(string), keepends);
10235         break;
10236     case PyUnicode_4BYTE_KIND:
10237         list = ucs4lib_splitlines(
10238             string, PyUnicode_4BYTE_DATA(string),
10239             PyUnicode_GET_LENGTH(string), keepends);
10240         break;
10241     default:
10242         Py_UNREACHABLE();
10243     }
10244     return list;
10245 }
10246 
10247 static PyObject *
split(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10248 split(PyObject *self,
10249       PyObject *substring,
10250       Py_ssize_t maxcount)
10251 {
10252     int kind1, kind2;
10253     void *buf1, *buf2;
10254     Py_ssize_t len1, len2;
10255     PyObject* out;
10256 
10257     if (maxcount < 0)
10258         maxcount = PY_SSIZE_T_MAX;
10259 
10260     if (PyUnicode_READY(self) == -1)
10261         return NULL;
10262 
10263     if (substring == NULL)
10264         switch (PyUnicode_KIND(self)) {
10265         case PyUnicode_1BYTE_KIND:
10266             if (PyUnicode_IS_ASCII(self))
10267                 return asciilib_split_whitespace(
10268                     self,  PyUnicode_1BYTE_DATA(self),
10269                     PyUnicode_GET_LENGTH(self), maxcount
10270                     );
10271             else
10272                 return ucs1lib_split_whitespace(
10273                     self,  PyUnicode_1BYTE_DATA(self),
10274                     PyUnicode_GET_LENGTH(self), maxcount
10275                     );
10276         case PyUnicode_2BYTE_KIND:
10277             return ucs2lib_split_whitespace(
10278                 self,  PyUnicode_2BYTE_DATA(self),
10279                 PyUnicode_GET_LENGTH(self), maxcount
10280                 );
10281         case PyUnicode_4BYTE_KIND:
10282             return ucs4lib_split_whitespace(
10283                 self,  PyUnicode_4BYTE_DATA(self),
10284                 PyUnicode_GET_LENGTH(self), maxcount
10285                 );
10286         default:
10287             Py_UNREACHABLE();
10288         }
10289 
10290     if (PyUnicode_READY(substring) == -1)
10291         return NULL;
10292 
10293     kind1 = PyUnicode_KIND(self);
10294     kind2 = PyUnicode_KIND(substring);
10295     len1 = PyUnicode_GET_LENGTH(self);
10296     len2 = PyUnicode_GET_LENGTH(substring);
10297     if (kind1 < kind2 || len1 < len2) {
10298         out = PyList_New(1);
10299         if (out == NULL)
10300             return NULL;
10301         Py_INCREF(self);
10302         PyList_SET_ITEM(out, 0, self);
10303         return out;
10304     }
10305     buf1 = PyUnicode_DATA(self);
10306     buf2 = PyUnicode_DATA(substring);
10307     if (kind2 != kind1) {
10308         buf2 = _PyUnicode_AsKind(substring, kind1);
10309         if (!buf2)
10310             return NULL;
10311     }
10312 
10313     switch (kind1) {
10314     case PyUnicode_1BYTE_KIND:
10315         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10316             out = asciilib_split(
10317                 self,  buf1, len1, buf2, len2, maxcount);
10318         else
10319             out = ucs1lib_split(
10320                 self,  buf1, len1, buf2, len2, maxcount);
10321         break;
10322     case PyUnicode_2BYTE_KIND:
10323         out = ucs2lib_split(
10324             self,  buf1, len1, buf2, len2, maxcount);
10325         break;
10326     case PyUnicode_4BYTE_KIND:
10327         out = ucs4lib_split(
10328             self,  buf1, len1, buf2, len2, maxcount);
10329         break;
10330     default:
10331         out = NULL;
10332     }
10333     if (kind2 != kind1)
10334         PyMem_Free(buf2);
10335     return out;
10336 }
10337 
10338 static PyObject *
rsplit(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10339 rsplit(PyObject *self,
10340        PyObject *substring,
10341        Py_ssize_t maxcount)
10342 {
10343     int kind1, kind2;
10344     void *buf1, *buf2;
10345     Py_ssize_t len1, len2;
10346     PyObject* out;
10347 
10348     if (maxcount < 0)
10349         maxcount = PY_SSIZE_T_MAX;
10350 
10351     if (PyUnicode_READY(self) == -1)
10352         return NULL;
10353 
10354     if (substring == NULL)
10355         switch (PyUnicode_KIND(self)) {
10356         case PyUnicode_1BYTE_KIND:
10357             if (PyUnicode_IS_ASCII(self))
10358                 return asciilib_rsplit_whitespace(
10359                     self,  PyUnicode_1BYTE_DATA(self),
10360                     PyUnicode_GET_LENGTH(self), maxcount
10361                     );
10362             else
10363                 return ucs1lib_rsplit_whitespace(
10364                     self,  PyUnicode_1BYTE_DATA(self),
10365                     PyUnicode_GET_LENGTH(self), maxcount
10366                     );
10367         case PyUnicode_2BYTE_KIND:
10368             return ucs2lib_rsplit_whitespace(
10369                 self,  PyUnicode_2BYTE_DATA(self),
10370                 PyUnicode_GET_LENGTH(self), maxcount
10371                 );
10372         case PyUnicode_4BYTE_KIND:
10373             return ucs4lib_rsplit_whitespace(
10374                 self,  PyUnicode_4BYTE_DATA(self),
10375                 PyUnicode_GET_LENGTH(self), maxcount
10376                 );
10377         default:
10378             Py_UNREACHABLE();
10379         }
10380 
10381     if (PyUnicode_READY(substring) == -1)
10382         return NULL;
10383 
10384     kind1 = PyUnicode_KIND(self);
10385     kind2 = PyUnicode_KIND(substring);
10386     len1 = PyUnicode_GET_LENGTH(self);
10387     len2 = PyUnicode_GET_LENGTH(substring);
10388     if (kind1 < kind2 || len1 < len2) {
10389         out = PyList_New(1);
10390         if (out == NULL)
10391             return NULL;
10392         Py_INCREF(self);
10393         PyList_SET_ITEM(out, 0, self);
10394         return out;
10395     }
10396     buf1 = PyUnicode_DATA(self);
10397     buf2 = PyUnicode_DATA(substring);
10398     if (kind2 != kind1) {
10399         buf2 = _PyUnicode_AsKind(substring, kind1);
10400         if (!buf2)
10401             return NULL;
10402     }
10403 
10404     switch (kind1) {
10405     case PyUnicode_1BYTE_KIND:
10406         if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10407             out = asciilib_rsplit(
10408                 self,  buf1, len1, buf2, len2, maxcount);
10409         else
10410             out = ucs1lib_rsplit(
10411                 self,  buf1, len1, buf2, len2, maxcount);
10412         break;
10413     case PyUnicode_2BYTE_KIND:
10414         out = ucs2lib_rsplit(
10415             self,  buf1, len1, buf2, len2, maxcount);
10416         break;
10417     case PyUnicode_4BYTE_KIND:
10418         out = ucs4lib_rsplit(
10419             self,  buf1, len1, buf2, len2, maxcount);
10420         break;
10421     default:
10422         out = NULL;
10423     }
10424     if (kind2 != kind1)
10425         PyMem_Free(buf2);
10426     return out;
10427 }
10428 
10429 static Py_ssize_t
anylib_find(int kind,PyObject * str1,void * buf1,Py_ssize_t len1,PyObject * str2,void * buf2,Py_ssize_t len2,Py_ssize_t offset)10430 anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10431             PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10432 {
10433     switch (kind) {
10434     case PyUnicode_1BYTE_KIND:
10435         if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10436             return asciilib_find(buf1, len1, buf2, len2, offset);
10437         else
10438             return ucs1lib_find(buf1, len1, buf2, len2, offset);
10439     case PyUnicode_2BYTE_KIND:
10440         return ucs2lib_find(buf1, len1, buf2, len2, offset);
10441     case PyUnicode_4BYTE_KIND:
10442         return ucs4lib_find(buf1, len1, buf2, len2, offset);
10443     }
10444     Py_UNREACHABLE();
10445 }
10446 
10447 static Py_ssize_t
anylib_count(int kind,PyObject * sstr,void * sbuf,Py_ssize_t slen,PyObject * str1,void * buf1,Py_ssize_t len1,Py_ssize_t maxcount)10448 anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10449              PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10450 {
10451     switch (kind) {
10452     case PyUnicode_1BYTE_KIND:
10453         if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10454             return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10455         else
10456             return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10457     case PyUnicode_2BYTE_KIND:
10458         return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10459     case PyUnicode_4BYTE_KIND:
10460         return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10461     }
10462     Py_UNREACHABLE();
10463 }
10464 
10465 static void
replace_1char_inplace(PyObject * u,Py_ssize_t pos,Py_UCS4 u1,Py_UCS4 u2,Py_ssize_t maxcount)10466 replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10467                       Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10468 {
10469     int kind = PyUnicode_KIND(u);
10470     void *data = PyUnicode_DATA(u);
10471     Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10472     if (kind == PyUnicode_1BYTE_KIND) {
10473         ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10474                                       (Py_UCS1 *)data + len,
10475                                       u1, u2, maxcount);
10476     }
10477     else if (kind == PyUnicode_2BYTE_KIND) {
10478         ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10479                                       (Py_UCS2 *)data + len,
10480                                       u1, u2, maxcount);
10481     }
10482     else {
10483         assert(kind == PyUnicode_4BYTE_KIND);
10484         ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10485                                       (Py_UCS4 *)data + len,
10486                                       u1, u2, maxcount);
10487     }
10488 }
10489 
10490 static PyObject *
replace(PyObject * self,PyObject * str1,PyObject * str2,Py_ssize_t maxcount)10491 replace(PyObject *self, PyObject *str1,
10492         PyObject *str2, Py_ssize_t maxcount)
10493 {
10494     PyObject *u;
10495     char *sbuf = PyUnicode_DATA(self);
10496     char *buf1 = PyUnicode_DATA(str1);
10497     char *buf2 = PyUnicode_DATA(str2);
10498     int srelease = 0, release1 = 0, release2 = 0;
10499     int skind = PyUnicode_KIND(self);
10500     int kind1 = PyUnicode_KIND(str1);
10501     int kind2 = PyUnicode_KIND(str2);
10502     Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10503     Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10504     Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10505     int mayshrink;
10506     Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10507 
10508     if (maxcount < 0)
10509         maxcount = PY_SSIZE_T_MAX;
10510     else if (maxcount == 0 || slen == 0)
10511         goto nothing;
10512 
10513     if (str1 == str2)
10514         goto nothing;
10515 
10516     maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10517     maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10518     if (maxchar < maxchar_str1)
10519         /* substring too wide to be present */
10520         goto nothing;
10521     maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10522     /* Replacing str1 with str2 may cause a maxchar reduction in the
10523        result string. */
10524     mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10525     maxchar = Py_MAX(maxchar, maxchar_str2);
10526 
10527     if (len1 == len2) {
10528         /* same length */
10529         if (len1 == 0)
10530             goto nothing;
10531         if (len1 == 1) {
10532             /* replace characters */
10533             Py_UCS4 u1, u2;
10534             Py_ssize_t pos;
10535 
10536             u1 = PyUnicode_READ(kind1, buf1, 0);
10537             pos = findchar(sbuf, skind, slen, u1, 1);
10538             if (pos < 0)
10539                 goto nothing;
10540             u2 = PyUnicode_READ(kind2, buf2, 0);
10541             u = PyUnicode_New(slen, maxchar);
10542             if (!u)
10543                 goto error;
10544 
10545             _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10546             replace_1char_inplace(u, pos, u1, u2, maxcount);
10547         }
10548         else {
10549             int rkind = skind;
10550             char *res;
10551             Py_ssize_t i;
10552 
10553             if (kind1 < rkind) {
10554                 /* widen substring */
10555                 buf1 = _PyUnicode_AsKind(str1, rkind);
10556                 if (!buf1) goto error;
10557                 release1 = 1;
10558             }
10559             i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10560             if (i < 0)
10561                 goto nothing;
10562             if (rkind > kind2) {
10563                 /* widen replacement */
10564                 buf2 = _PyUnicode_AsKind(str2, rkind);
10565                 if (!buf2) goto error;
10566                 release2 = 1;
10567             }
10568             else if (rkind < kind2) {
10569                 /* widen self and buf1 */
10570                 rkind = kind2;
10571                 if (release1) PyMem_Free(buf1);
10572                 release1 = 0;
10573                 sbuf = _PyUnicode_AsKind(self, rkind);
10574                 if (!sbuf) goto error;
10575                 srelease = 1;
10576                 buf1 = _PyUnicode_AsKind(str1, rkind);
10577                 if (!buf1) goto error;
10578                 release1 = 1;
10579             }
10580             u = PyUnicode_New(slen, maxchar);
10581             if (!u)
10582                 goto error;
10583             assert(PyUnicode_KIND(u) == rkind);
10584             res = PyUnicode_DATA(u);
10585 
10586             memcpy(res, sbuf, rkind * slen);
10587             /* change everything in-place, starting with this one */
10588             memcpy(res + rkind * i,
10589                    buf2,
10590                    rkind * len2);
10591             i += len1;
10592 
10593             while ( --maxcount > 0) {
10594                 i = anylib_find(rkind, self,
10595                                 sbuf+rkind*i, slen-i,
10596                                 str1, buf1, len1, i);
10597                 if (i == -1)
10598                     break;
10599                 memcpy(res + rkind * i,
10600                        buf2,
10601                        rkind * len2);
10602                 i += len1;
10603             }
10604         }
10605     }
10606     else {
10607         Py_ssize_t n, i, j, ires;
10608         Py_ssize_t new_size;
10609         int rkind = skind;
10610         char *res;
10611 
10612         if (kind1 < rkind) {
10613             /* widen substring */
10614             buf1 = _PyUnicode_AsKind(str1, rkind);
10615             if (!buf1) goto error;
10616             release1 = 1;
10617         }
10618         n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10619         if (n == 0)
10620             goto nothing;
10621         if (kind2 < rkind) {
10622             /* widen replacement */
10623             buf2 = _PyUnicode_AsKind(str2, rkind);
10624             if (!buf2) goto error;
10625             release2 = 1;
10626         }
10627         else if (kind2 > rkind) {
10628             /* widen self and buf1 */
10629             rkind = kind2;
10630             sbuf = _PyUnicode_AsKind(self, rkind);
10631             if (!sbuf) goto error;
10632             srelease = 1;
10633             if (release1) PyMem_Free(buf1);
10634             release1 = 0;
10635             buf1 = _PyUnicode_AsKind(str1, rkind);
10636             if (!buf1) goto error;
10637             release1 = 1;
10638         }
10639         /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10640            PyUnicode_GET_LENGTH(str1))); */
10641         if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10642                 PyErr_SetString(PyExc_OverflowError,
10643                                 "replace string is too long");
10644                 goto error;
10645         }
10646         new_size = slen + n * (len2 - len1);
10647         if (new_size == 0) {
10648             _Py_INCREF_UNICODE_EMPTY();
10649             if (!unicode_empty)
10650                 goto error;
10651             u = unicode_empty;
10652             goto done;
10653         }
10654         if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10655             PyErr_SetString(PyExc_OverflowError,
10656                             "replace string is too long");
10657             goto error;
10658         }
10659         u = PyUnicode_New(new_size, maxchar);
10660         if (!u)
10661             goto error;
10662         assert(PyUnicode_KIND(u) == rkind);
10663         res = PyUnicode_DATA(u);
10664         ires = i = 0;
10665         if (len1 > 0) {
10666             while (n-- > 0) {
10667                 /* look for next match */
10668                 j = anylib_find(rkind, self,
10669                                 sbuf + rkind * i, slen-i,
10670                                 str1, buf1, len1, i);
10671                 if (j == -1)
10672                     break;
10673                 else if (j > i) {
10674                     /* copy unchanged part [i:j] */
10675                     memcpy(res + rkind * ires,
10676                            sbuf + rkind * i,
10677                            rkind * (j-i));
10678                     ires += j - i;
10679                 }
10680                 /* copy substitution string */
10681                 if (len2 > 0) {
10682                     memcpy(res + rkind * ires,
10683                            buf2,
10684                            rkind * len2);
10685                     ires += len2;
10686                 }
10687                 i = j + len1;
10688             }
10689             if (i < slen)
10690                 /* copy tail [i:] */
10691                 memcpy(res + rkind * ires,
10692                        sbuf + rkind * i,
10693                        rkind * (slen-i));
10694         }
10695         else {
10696             /* interleave */
10697             while (n > 0) {
10698                 memcpy(res + rkind * ires,
10699                        buf2,
10700                        rkind * len2);
10701                 ires += len2;
10702                 if (--n <= 0)
10703                     break;
10704                 memcpy(res + rkind * ires,
10705                        sbuf + rkind * i,
10706                        rkind);
10707                 ires++;
10708                 i++;
10709             }
10710             memcpy(res + rkind * ires,
10711                    sbuf + rkind * i,
10712                    rkind * (slen-i));
10713         }
10714     }
10715 
10716     if (mayshrink) {
10717         unicode_adjust_maxchar(&u);
10718         if (u == NULL)
10719             goto error;
10720     }
10721 
10722   done:
10723     if (srelease)
10724         PyMem_FREE(sbuf);
10725     if (release1)
10726         PyMem_FREE(buf1);
10727     if (release2)
10728         PyMem_FREE(buf2);
10729     assert(_PyUnicode_CheckConsistency(u, 1));
10730     return u;
10731 
10732   nothing:
10733     /* nothing to replace; return original string (when possible) */
10734     if (srelease)
10735         PyMem_FREE(sbuf);
10736     if (release1)
10737         PyMem_FREE(buf1);
10738     if (release2)
10739         PyMem_FREE(buf2);
10740     return unicode_result_unchanged(self);
10741 
10742   error:
10743     if (srelease && sbuf)
10744         PyMem_FREE(sbuf);
10745     if (release1 && buf1)
10746         PyMem_FREE(buf1);
10747     if (release2 && buf2)
10748         PyMem_FREE(buf2);
10749     return NULL;
10750 }
10751 
10752 /* --- Unicode Object Methods --------------------------------------------- */
10753 
10754 /*[clinic input]
10755 str.title as unicode_title
10756 
10757 Return a version of the string where each word is titlecased.
10758 
10759 More specifically, words start with uppercased characters and all remaining
10760 cased characters have lower case.
10761 [clinic start generated code]*/
10762 
10763 static PyObject *
unicode_title_impl(PyObject * self)10764 unicode_title_impl(PyObject *self)
10765 /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
10766 {
10767     if (PyUnicode_READY(self) == -1)
10768         return NULL;
10769     return case_operation(self, do_title);
10770 }
10771 
10772 /*[clinic input]
10773 str.capitalize as unicode_capitalize
10774 
10775 Return a capitalized version of the string.
10776 
10777 More specifically, make the first character have upper case and the rest lower
10778 case.
10779 [clinic start generated code]*/
10780 
10781 static PyObject *
unicode_capitalize_impl(PyObject * self)10782 unicode_capitalize_impl(PyObject *self)
10783 /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
10784 {
10785     if (PyUnicode_READY(self) == -1)
10786         return NULL;
10787     if (PyUnicode_GET_LENGTH(self) == 0)
10788         return unicode_result_unchanged(self);
10789     return case_operation(self, do_capitalize);
10790 }
10791 
10792 /*[clinic input]
10793 str.casefold as unicode_casefold
10794 
10795 Return a version of the string suitable for caseless comparisons.
10796 [clinic start generated code]*/
10797 
10798 static PyObject *
unicode_casefold_impl(PyObject * self)10799 unicode_casefold_impl(PyObject *self)
10800 /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
10801 {
10802     if (PyUnicode_READY(self) == -1)
10803         return NULL;
10804     if (PyUnicode_IS_ASCII(self))
10805         return ascii_upper_or_lower(self, 1);
10806     return case_operation(self, do_casefold);
10807 }
10808 
10809 
10810 /* Argument converter. Accepts a single Unicode character. */
10811 
10812 static int
convert_uc(PyObject * obj,void * addr)10813 convert_uc(PyObject *obj, void *addr)
10814 {
10815     Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10816 
10817     if (!PyUnicode_Check(obj)) {
10818         PyErr_Format(PyExc_TypeError,
10819                      "The fill character must be a unicode character, "
10820                      "not %.100s", Py_TYPE(obj)->tp_name);
10821         return 0;
10822     }
10823     if (PyUnicode_READY(obj) < 0)
10824         return 0;
10825     if (PyUnicode_GET_LENGTH(obj) != 1) {
10826         PyErr_SetString(PyExc_TypeError,
10827                         "The fill character must be exactly one character long");
10828         return 0;
10829     }
10830     *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10831     return 1;
10832 }
10833 
10834 /*[clinic input]
10835 str.center as unicode_center
10836 
10837     width: Py_ssize_t
10838     fillchar: Py_UCS4 = ' '
10839     /
10840 
10841 Return a centered string of length width.
10842 
10843 Padding is done using the specified fill character (default is a space).
10844 [clinic start generated code]*/
10845 
10846 static PyObject *
unicode_center_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)10847 unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10848 /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10849 {
10850     Py_ssize_t marg, left;
10851 
10852     if (PyUnicode_READY(self) == -1)
10853         return NULL;
10854 
10855     if (PyUnicode_GET_LENGTH(self) >= width)
10856         return unicode_result_unchanged(self);
10857 
10858     marg = width - PyUnicode_GET_LENGTH(self);
10859     left = marg / 2 + (marg & width & 1);
10860 
10861     return pad(self, left, marg - left, fillchar);
10862 }
10863 
10864 /* This function assumes that str1 and str2 are readied by the caller. */
10865 
10866 static int
unicode_compare(PyObject * str1,PyObject * str2)10867 unicode_compare(PyObject *str1, PyObject *str2)
10868 {
10869 #define COMPARE(TYPE1, TYPE2) \
10870     do { \
10871         TYPE1* p1 = (TYPE1 *)data1; \
10872         TYPE2* p2 = (TYPE2 *)data2; \
10873         TYPE1* end = p1 + len; \
10874         Py_UCS4 c1, c2; \
10875         for (; p1 != end; p1++, p2++) { \
10876             c1 = *p1; \
10877             c2 = *p2; \
10878             if (c1 != c2) \
10879                 return (c1 < c2) ? -1 : 1; \
10880         } \
10881     } \
10882     while (0)
10883 
10884     int kind1, kind2;
10885     void *data1, *data2;
10886     Py_ssize_t len1, len2, len;
10887 
10888     kind1 = PyUnicode_KIND(str1);
10889     kind2 = PyUnicode_KIND(str2);
10890     data1 = PyUnicode_DATA(str1);
10891     data2 = PyUnicode_DATA(str2);
10892     len1 = PyUnicode_GET_LENGTH(str1);
10893     len2 = PyUnicode_GET_LENGTH(str2);
10894     len = Py_MIN(len1, len2);
10895 
10896     switch(kind1) {
10897     case PyUnicode_1BYTE_KIND:
10898     {
10899         switch(kind2) {
10900         case PyUnicode_1BYTE_KIND:
10901         {
10902             int cmp = memcmp(data1, data2, len);
10903             /* normalize result of memcmp() into the range [-1; 1] */
10904             if (cmp < 0)
10905                 return -1;
10906             if (cmp > 0)
10907                 return 1;
10908             break;
10909         }
10910         case PyUnicode_2BYTE_KIND:
10911             COMPARE(Py_UCS1, Py_UCS2);
10912             break;
10913         case PyUnicode_4BYTE_KIND:
10914             COMPARE(Py_UCS1, Py_UCS4);
10915             break;
10916         default:
10917             Py_UNREACHABLE();
10918         }
10919         break;
10920     }
10921     case PyUnicode_2BYTE_KIND:
10922     {
10923         switch(kind2) {
10924         case PyUnicode_1BYTE_KIND:
10925             COMPARE(Py_UCS2, Py_UCS1);
10926             break;
10927         case PyUnicode_2BYTE_KIND:
10928         {
10929             COMPARE(Py_UCS2, Py_UCS2);
10930             break;
10931         }
10932         case PyUnicode_4BYTE_KIND:
10933             COMPARE(Py_UCS2, Py_UCS4);
10934             break;
10935         default:
10936             Py_UNREACHABLE();
10937         }
10938         break;
10939     }
10940     case PyUnicode_4BYTE_KIND:
10941     {
10942         switch(kind2) {
10943         case PyUnicode_1BYTE_KIND:
10944             COMPARE(Py_UCS4, Py_UCS1);
10945             break;
10946         case PyUnicode_2BYTE_KIND:
10947             COMPARE(Py_UCS4, Py_UCS2);
10948             break;
10949         case PyUnicode_4BYTE_KIND:
10950         {
10951 #if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10952             int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10953             /* normalize result of wmemcmp() into the range [-1; 1] */
10954             if (cmp < 0)
10955                 return -1;
10956             if (cmp > 0)
10957                 return 1;
10958 #else
10959             COMPARE(Py_UCS4, Py_UCS4);
10960 #endif
10961             break;
10962         }
10963         default:
10964             Py_UNREACHABLE();
10965         }
10966         break;
10967     }
10968     default:
10969         Py_UNREACHABLE();
10970     }
10971 
10972     if (len1 == len2)
10973         return 0;
10974     if (len1 < len2)
10975         return -1;
10976     else
10977         return 1;
10978 
10979 #undef COMPARE
10980 }
10981 
10982 static int
unicode_compare_eq(PyObject * str1,PyObject * str2)10983 unicode_compare_eq(PyObject *str1, PyObject *str2)
10984 {
10985     int kind;
10986     void *data1, *data2;
10987     Py_ssize_t len;
10988     int cmp;
10989 
10990     len = PyUnicode_GET_LENGTH(str1);
10991     if (PyUnicode_GET_LENGTH(str2) != len)
10992         return 0;
10993     kind = PyUnicode_KIND(str1);
10994     if (PyUnicode_KIND(str2) != kind)
10995         return 0;
10996     data1 = PyUnicode_DATA(str1);
10997     data2 = PyUnicode_DATA(str2);
10998 
10999     cmp = memcmp(data1, data2, len * kind);
11000     return (cmp == 0);
11001 }
11002 
11003 
11004 int
PyUnicode_Compare(PyObject * left,PyObject * right)11005 PyUnicode_Compare(PyObject *left, PyObject *right)
11006 {
11007     if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11008         if (PyUnicode_READY(left) == -1 ||
11009             PyUnicode_READY(right) == -1)
11010             return -1;
11011 
11012         /* a string is equal to itself */
11013         if (left == right)
11014             return 0;
11015 
11016         return unicode_compare(left, right);
11017     }
11018     PyErr_Format(PyExc_TypeError,
11019                  "Can't compare %.100s and %.100s",
11020                  left->ob_type->tp_name,
11021                  right->ob_type->tp_name);
11022     return -1;
11023 }
11024 
11025 int
PyUnicode_CompareWithASCIIString(PyObject * uni,const char * str)11026 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11027 {
11028     Py_ssize_t i;
11029     int kind;
11030     Py_UCS4 chr;
11031     const unsigned char *ustr = (const unsigned char *)str;
11032 
11033     assert(_PyUnicode_CHECK(uni));
11034     if (!PyUnicode_IS_READY(uni)) {
11035         const wchar_t *ws = _PyUnicode_WSTR(uni);
11036         /* Compare Unicode string and source character set string */
11037         for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11038             if (chr != ustr[i])
11039                 return (chr < ustr[i]) ? -1 : 1;
11040         }
11041         /* This check keeps Python strings that end in '\0' from comparing equal
11042          to C strings identical up to that point. */
11043         if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11044             return 1; /* uni is longer */
11045         if (ustr[i])
11046             return -1; /* str is longer */
11047         return 0;
11048     }
11049     kind = PyUnicode_KIND(uni);
11050     if (kind == PyUnicode_1BYTE_KIND) {
11051         const void *data = PyUnicode_1BYTE_DATA(uni);
11052         size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11053         size_t len, len2 = strlen(str);
11054         int cmp;
11055 
11056         len = Py_MIN(len1, len2);
11057         cmp = memcmp(data, str, len);
11058         if (cmp != 0) {
11059             if (cmp < 0)
11060                 return -1;
11061             else
11062                 return 1;
11063         }
11064         if (len1 > len2)
11065             return 1; /* uni is longer */
11066         if (len1 < len2)
11067             return -1; /* str is longer */
11068         return 0;
11069     }
11070     else {
11071         void *data = PyUnicode_DATA(uni);
11072         /* Compare Unicode string and source character set string */
11073         for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11074             if (chr != (unsigned char)str[i])
11075                 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11076         /* This check keeps Python strings that end in '\0' from comparing equal
11077          to C strings identical up to that point. */
11078         if (PyUnicode_GET_LENGTH(uni) != i || chr)
11079             return 1; /* uni is longer */
11080         if (str[i])
11081             return -1; /* str is longer */
11082         return 0;
11083     }
11084 }
11085 
11086 static int
non_ready_unicode_equal_to_ascii_string(PyObject * unicode,const char * str)11087 non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11088 {
11089     size_t i, len;
11090     const wchar_t *p;
11091     len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11092     if (strlen(str) != len)
11093         return 0;
11094     p = _PyUnicode_WSTR(unicode);
11095     assert(p);
11096     for (i = 0; i < len; i++) {
11097         unsigned char c = (unsigned char)str[i];
11098         if (c >= 128 || p[i] != (wchar_t)c)
11099             return 0;
11100     }
11101     return 1;
11102 }
11103 
11104 int
_PyUnicode_EqualToASCIIString(PyObject * unicode,const char * str)11105 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11106 {
11107     size_t len;
11108     assert(_PyUnicode_CHECK(unicode));
11109     assert(str);
11110 #ifndef NDEBUG
11111     for (const char *p = str; *p; p++) {
11112         assert((unsigned char)*p < 128);
11113     }
11114 #endif
11115     if (PyUnicode_READY(unicode) == -1) {
11116         /* Memory error or bad data */
11117         PyErr_Clear();
11118         return non_ready_unicode_equal_to_ascii_string(unicode, str);
11119     }
11120     if (!PyUnicode_IS_ASCII(unicode))
11121         return 0;
11122     len = (size_t)PyUnicode_GET_LENGTH(unicode);
11123     return strlen(str) == len &&
11124            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11125 }
11126 
11127 int
_PyUnicode_EqualToASCIIId(PyObject * left,_Py_Identifier * right)11128 _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11129 {
11130     PyObject *right_uni;
11131     Py_hash_t hash;
11132 
11133     assert(_PyUnicode_CHECK(left));
11134     assert(right->string);
11135 #ifndef NDEBUG
11136     for (const char *p = right->string; *p; p++) {
11137         assert((unsigned char)*p < 128);
11138     }
11139 #endif
11140 
11141     if (PyUnicode_READY(left) == -1) {
11142         /* memory error or bad data */
11143         PyErr_Clear();
11144         return non_ready_unicode_equal_to_ascii_string(left, right->string);
11145     }
11146 
11147     if (!PyUnicode_IS_ASCII(left))
11148         return 0;
11149 
11150     right_uni = _PyUnicode_FromId(right);       /* borrowed */
11151     if (right_uni == NULL) {
11152         /* memory error or bad data */
11153         PyErr_Clear();
11154         return _PyUnicode_EqualToASCIIString(left, right->string);
11155     }
11156 
11157     if (left == right_uni)
11158         return 1;
11159 
11160     if (PyUnicode_CHECK_INTERNED(left))
11161         return 0;
11162 
11163     assert(_PyUnicode_HASH(right_uni) != -1);
11164     hash = _PyUnicode_HASH(left);
11165     if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11166         return 0;
11167 
11168     return unicode_compare_eq(left, right_uni);
11169 }
11170 
11171 PyObject *
PyUnicode_RichCompare(PyObject * left,PyObject * right,int op)11172 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11173 {
11174     int result;
11175 
11176     if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11177         Py_RETURN_NOTIMPLEMENTED;
11178 
11179     if (PyUnicode_READY(left) == -1 ||
11180         PyUnicode_READY(right) == -1)
11181         return NULL;
11182 
11183     if (left == right) {
11184         switch (op) {
11185         case Py_EQ:
11186         case Py_LE:
11187         case Py_GE:
11188             /* a string is equal to itself */
11189             Py_RETURN_TRUE;
11190         case Py_NE:
11191         case Py_LT:
11192         case Py_GT:
11193             Py_RETURN_FALSE;
11194         default:
11195             PyErr_BadArgument();
11196             return NULL;
11197         }
11198     }
11199     else if (op == Py_EQ || op == Py_NE) {
11200         result = unicode_compare_eq(left, right);
11201         result ^= (op == Py_NE);
11202         return PyBool_FromLong(result);
11203     }
11204     else {
11205         result = unicode_compare(left, right);
11206         Py_RETURN_RICHCOMPARE(result, 0, op);
11207     }
11208 }
11209 
11210 int
_PyUnicode_EQ(PyObject * aa,PyObject * bb)11211 _PyUnicode_EQ(PyObject *aa, PyObject *bb)
11212 {
11213     return unicode_eq(aa, bb);
11214 }
11215 
11216 int
PyUnicode_Contains(PyObject * str,PyObject * substr)11217 PyUnicode_Contains(PyObject *str, PyObject *substr)
11218 {
11219     int kind1, kind2;
11220     void *buf1, *buf2;
11221     Py_ssize_t len1, len2;
11222     int result;
11223 
11224     if (!PyUnicode_Check(substr)) {
11225         PyErr_Format(PyExc_TypeError,
11226                      "'in <string>' requires string as left operand, not %.100s",
11227                      Py_TYPE(substr)->tp_name);
11228         return -1;
11229     }
11230     if (PyUnicode_READY(substr) == -1)
11231         return -1;
11232     if (ensure_unicode(str) < 0)
11233         return -1;
11234 
11235     kind1 = PyUnicode_KIND(str);
11236     kind2 = PyUnicode_KIND(substr);
11237     if (kind1 < kind2)
11238         return 0;
11239     len1 = PyUnicode_GET_LENGTH(str);
11240     len2 = PyUnicode_GET_LENGTH(substr);
11241     if (len1 < len2)
11242         return 0;
11243     buf1 = PyUnicode_DATA(str);
11244     buf2 = PyUnicode_DATA(substr);
11245     if (len2 == 1) {
11246         Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11247         result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11248         return result;
11249     }
11250     if (kind2 != kind1) {
11251         buf2 = _PyUnicode_AsKind(substr, kind1);
11252         if (!buf2)
11253             return -1;
11254     }
11255 
11256     switch (kind1) {
11257     case PyUnicode_1BYTE_KIND:
11258         result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11259         break;
11260     case PyUnicode_2BYTE_KIND:
11261         result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11262         break;
11263     case PyUnicode_4BYTE_KIND:
11264         result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11265         break;
11266     default:
11267         Py_UNREACHABLE();
11268     }
11269 
11270     if (kind2 != kind1)
11271         PyMem_Free(buf2);
11272 
11273     return result;
11274 }
11275 
11276 /* Concat to string or Unicode object giving a new Unicode object. */
11277 
11278 PyObject *
PyUnicode_Concat(PyObject * left,PyObject * right)11279 PyUnicode_Concat(PyObject *left, PyObject *right)
11280 {
11281     PyObject *result;
11282     Py_UCS4 maxchar, maxchar2;
11283     Py_ssize_t left_len, right_len, new_len;
11284 
11285     if (ensure_unicode(left) < 0)
11286         return NULL;
11287 
11288     if (!PyUnicode_Check(right)) {
11289         PyErr_Format(PyExc_TypeError,
11290                      "can only concatenate str (not \"%.200s\") to str",
11291                      right->ob_type->tp_name);
11292         return NULL;
11293     }
11294     if (PyUnicode_READY(right) < 0)
11295         return NULL;
11296 
11297     /* Shortcuts */
11298     if (left == unicode_empty)
11299         return PyUnicode_FromObject(right);
11300     if (right == unicode_empty)
11301         return PyUnicode_FromObject(left);
11302 
11303     left_len = PyUnicode_GET_LENGTH(left);
11304     right_len = PyUnicode_GET_LENGTH(right);
11305     if (left_len > PY_SSIZE_T_MAX - right_len) {
11306         PyErr_SetString(PyExc_OverflowError,
11307                         "strings are too large to concat");
11308         return NULL;
11309     }
11310     new_len = left_len + right_len;
11311 
11312     maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11313     maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11314     maxchar = Py_MAX(maxchar, maxchar2);
11315 
11316     /* Concat the two Unicode strings */
11317     result = PyUnicode_New(new_len, maxchar);
11318     if (result == NULL)
11319         return NULL;
11320     _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11321     _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11322     assert(_PyUnicode_CheckConsistency(result, 1));
11323     return result;
11324 }
11325 
11326 void
PyUnicode_Append(PyObject ** p_left,PyObject * right)11327 PyUnicode_Append(PyObject **p_left, PyObject *right)
11328 {
11329     PyObject *left, *res;
11330     Py_UCS4 maxchar, maxchar2;
11331     Py_ssize_t left_len, right_len, new_len;
11332 
11333     if (p_left == NULL) {
11334         if (!PyErr_Occurred())
11335             PyErr_BadInternalCall();
11336         return;
11337     }
11338     left = *p_left;
11339     if (right == NULL || left == NULL
11340         || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11341         if (!PyErr_Occurred())
11342             PyErr_BadInternalCall();
11343         goto error;
11344     }
11345 
11346     if (PyUnicode_READY(left) == -1)
11347         goto error;
11348     if (PyUnicode_READY(right) == -1)
11349         goto error;
11350 
11351     /* Shortcuts */
11352     if (left == unicode_empty) {
11353         Py_DECREF(left);
11354         Py_INCREF(right);
11355         *p_left = right;
11356         return;
11357     }
11358     if (right == unicode_empty)
11359         return;
11360 
11361     left_len = PyUnicode_GET_LENGTH(left);
11362     right_len = PyUnicode_GET_LENGTH(right);
11363     if (left_len > PY_SSIZE_T_MAX - right_len) {
11364         PyErr_SetString(PyExc_OverflowError,
11365                         "strings are too large to concat");
11366         goto error;
11367     }
11368     new_len = left_len + right_len;
11369 
11370     if (unicode_modifiable(left)
11371         && PyUnicode_CheckExact(right)
11372         && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11373         /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11374            to change the structure size, but characters are stored just after
11375            the structure, and so it requires to move all characters which is
11376            not so different than duplicating the string. */
11377         && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11378     {
11379         /* append inplace */
11380         if (unicode_resize(p_left, new_len) != 0)
11381             goto error;
11382 
11383         /* copy 'right' into the newly allocated area of 'left' */
11384         _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11385     }
11386     else {
11387         maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11388         maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11389         maxchar = Py_MAX(maxchar, maxchar2);
11390 
11391         /* Concat the two Unicode strings */
11392         res = PyUnicode_New(new_len, maxchar);
11393         if (res == NULL)
11394             goto error;
11395         _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11396         _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11397         Py_DECREF(left);
11398         *p_left = res;
11399     }
11400     assert(_PyUnicode_CheckConsistency(*p_left, 1));
11401     return;
11402 
11403 error:
11404     Py_CLEAR(*p_left);
11405 }
11406 
11407 void
PyUnicode_AppendAndDel(PyObject ** pleft,PyObject * right)11408 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11409 {
11410     PyUnicode_Append(pleft, right);
11411     Py_XDECREF(right);
11412 }
11413 
11414 /*
11415 Wraps stringlib_parse_args_finds() and additionally ensures that the
11416 first argument is a unicode object.
11417 */
11418 
11419 static inline int
parse_args_finds_unicode(const char * function_name,PyObject * args,PyObject ** substring,Py_ssize_t * start,Py_ssize_t * end)11420 parse_args_finds_unicode(const char * function_name, PyObject *args,
11421                          PyObject **substring,
11422                          Py_ssize_t *start, Py_ssize_t *end)
11423 {
11424     if(stringlib_parse_args_finds(function_name, args, substring,
11425                                   start, end)) {
11426         if (ensure_unicode(*substring) < 0)
11427             return 0;
11428         return 1;
11429     }
11430     return 0;
11431 }
11432 
11433 PyDoc_STRVAR(count__doc__,
11434              "S.count(sub[, start[, end]]) -> int\n\
11435 \n\
11436 Return the number of non-overlapping occurrences of substring sub in\n\
11437 string S[start:end].  Optional arguments start and end are\n\
11438 interpreted as in slice notation.");
11439 
11440 static PyObject *
unicode_count(PyObject * self,PyObject * args)11441 unicode_count(PyObject *self, PyObject *args)
11442 {
11443     PyObject *substring = NULL;   /* initialize to fix a compiler warning */
11444     Py_ssize_t start = 0;
11445     Py_ssize_t end = PY_SSIZE_T_MAX;
11446     PyObject *result;
11447     int kind1, kind2;
11448     void *buf1, *buf2;
11449     Py_ssize_t len1, len2, iresult;
11450 
11451     if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
11452         return NULL;
11453 
11454     kind1 = PyUnicode_KIND(self);
11455     kind2 = PyUnicode_KIND(substring);
11456     if (kind1 < kind2)
11457         return PyLong_FromLong(0);
11458 
11459     len1 = PyUnicode_GET_LENGTH(self);
11460     len2 = PyUnicode_GET_LENGTH(substring);
11461     ADJUST_INDICES(start, end, len1);
11462     if (end - start < len2)
11463         return PyLong_FromLong(0);
11464 
11465     buf1 = PyUnicode_DATA(self);
11466     buf2 = PyUnicode_DATA(substring);
11467     if (kind2 != kind1) {
11468         buf2 = _PyUnicode_AsKind(substring, kind1);
11469         if (!buf2)
11470             return NULL;
11471     }
11472     switch (kind1) {
11473     case PyUnicode_1BYTE_KIND:
11474         iresult = ucs1lib_count(
11475             ((Py_UCS1*)buf1) + start, end - start,
11476             buf2, len2, PY_SSIZE_T_MAX
11477             );
11478         break;
11479     case PyUnicode_2BYTE_KIND:
11480         iresult = ucs2lib_count(
11481             ((Py_UCS2*)buf1) + start, end - start,
11482             buf2, len2, PY_SSIZE_T_MAX
11483             );
11484         break;
11485     case PyUnicode_4BYTE_KIND:
11486         iresult = ucs4lib_count(
11487             ((Py_UCS4*)buf1) + start, end - start,
11488             buf2, len2, PY_SSIZE_T_MAX
11489             );
11490         break;
11491     default:
11492         Py_UNREACHABLE();
11493     }
11494 
11495     result = PyLong_FromSsize_t(iresult);
11496 
11497     if (kind2 != kind1)
11498         PyMem_Free(buf2);
11499 
11500     return result;
11501 }
11502 
11503 /*[clinic input]
11504 str.encode as unicode_encode
11505 
11506     encoding: str(c_default="NULL") = 'utf-8'
11507         The encoding in which to encode the string.
11508     errors: str(c_default="NULL") = 'strict'
11509         The error handling scheme to use for encoding errors.
11510         The default is 'strict' meaning that encoding errors raise a
11511         UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11512         'xmlcharrefreplace' as well as any other name registered with
11513         codecs.register_error that can handle UnicodeEncodeErrors.
11514 
11515 Encode the string using the codec registered for encoding.
11516 [clinic start generated code]*/
11517 
11518 static PyObject *
unicode_encode_impl(PyObject * self,const char * encoding,const char * errors)11519 unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
11520 /*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
11521 {
11522     return PyUnicode_AsEncodedString(self, encoding, errors);
11523 }
11524 
11525 /*[clinic input]
11526 str.expandtabs as unicode_expandtabs
11527 
11528     tabsize: int = 8
11529 
11530 Return a copy where all tab characters are expanded using spaces.
11531 
11532 If tabsize is not given, a tab size of 8 characters is assumed.
11533 [clinic start generated code]*/
11534 
11535 static PyObject *
unicode_expandtabs_impl(PyObject * self,int tabsize)11536 unicode_expandtabs_impl(PyObject *self, int tabsize)
11537 /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
11538 {
11539     Py_ssize_t i, j, line_pos, src_len, incr;
11540     Py_UCS4 ch;
11541     PyObject *u;
11542     void *src_data, *dest_data;
11543     int kind;
11544     int found;
11545 
11546     if (PyUnicode_READY(self) == -1)
11547         return NULL;
11548 
11549     /* First pass: determine size of output string */
11550     src_len = PyUnicode_GET_LENGTH(self);
11551     i = j = line_pos = 0;
11552     kind = PyUnicode_KIND(self);
11553     src_data = PyUnicode_DATA(self);
11554     found = 0;
11555     for (; i < src_len; i++) {
11556         ch = PyUnicode_READ(kind, src_data, i);
11557         if (ch == '\t') {
11558             found = 1;
11559             if (tabsize > 0) {
11560                 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11561                 if (j > PY_SSIZE_T_MAX - incr)
11562                     goto overflow;
11563                 line_pos += incr;
11564                 j += incr;
11565             }
11566         }
11567         else {
11568             if (j > PY_SSIZE_T_MAX - 1)
11569                 goto overflow;
11570             line_pos++;
11571             j++;
11572             if (ch == '\n' || ch == '\r')
11573                 line_pos = 0;
11574         }
11575     }
11576     if (!found)
11577         return unicode_result_unchanged(self);
11578 
11579     /* Second pass: create output string and fill it */
11580     u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11581     if (!u)
11582         return NULL;
11583     dest_data = PyUnicode_DATA(u);
11584 
11585     i = j = line_pos = 0;
11586 
11587     for (; i < src_len; i++) {
11588         ch = PyUnicode_READ(kind, src_data, i);
11589         if (ch == '\t') {
11590             if (tabsize > 0) {
11591                 incr = tabsize - (line_pos % tabsize);
11592                 line_pos += incr;
11593                 unicode_fill(kind, dest_data, ' ', j, incr);
11594                 j += incr;
11595             }
11596         }
11597         else {
11598             line_pos++;
11599             PyUnicode_WRITE(kind, dest_data, j, ch);
11600             j++;
11601             if (ch == '\n' || ch == '\r')
11602                 line_pos = 0;
11603         }
11604     }
11605     assert (j == PyUnicode_GET_LENGTH(u));
11606     return unicode_result(u);
11607 
11608   overflow:
11609     PyErr_SetString(PyExc_OverflowError, "new string is too long");
11610     return NULL;
11611 }
11612 
11613 PyDoc_STRVAR(find__doc__,
11614              "S.find(sub[, start[, end]]) -> int\n\
11615 \n\
11616 Return the lowest index in S where substring sub is found,\n\
11617 such that sub is contained within S[start:end].  Optional\n\
11618 arguments start and end are interpreted as in slice notation.\n\
11619 \n\
11620 Return -1 on failure.");
11621 
11622 static PyObject *
unicode_find(PyObject * self,PyObject * args)11623 unicode_find(PyObject *self, PyObject *args)
11624 {
11625     /* initialize variables to prevent gcc warning */
11626     PyObject *substring = NULL;
11627     Py_ssize_t start = 0;
11628     Py_ssize_t end = 0;
11629     Py_ssize_t result;
11630 
11631     if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
11632         return NULL;
11633 
11634     if (PyUnicode_READY(self) == -1)
11635         return NULL;
11636 
11637     result = any_find_slice(self, substring, start, end, 1);
11638 
11639     if (result == -2)
11640         return NULL;
11641 
11642     return PyLong_FromSsize_t(result);
11643 }
11644 
11645 static PyObject *
unicode_getitem(PyObject * self,Py_ssize_t index)11646 unicode_getitem(PyObject *self, Py_ssize_t index)
11647 {
11648     void *data;
11649     enum PyUnicode_Kind kind;
11650     Py_UCS4 ch;
11651 
11652     if (!PyUnicode_Check(self)) {
11653         PyErr_BadArgument();
11654         return NULL;
11655     }
11656     if (PyUnicode_READY(self) == -1) {
11657         return NULL;
11658     }
11659     if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11660         PyErr_SetString(PyExc_IndexError, "string index out of range");
11661         return NULL;
11662     }
11663     kind = PyUnicode_KIND(self);
11664     data = PyUnicode_DATA(self);
11665     ch = PyUnicode_READ(kind, data, index);
11666     return unicode_char(ch);
11667 }
11668 
11669 /* Believe it or not, this produces the same value for ASCII strings
11670    as bytes_hash(). */
11671 static Py_hash_t
unicode_hash(PyObject * self)11672 unicode_hash(PyObject *self)
11673 {
11674     Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11675 
11676 #ifdef Py_DEBUG
11677     assert(_Py_HashSecret_Initialized);
11678 #endif
11679     if (_PyUnicode_HASH(self) != -1)
11680         return _PyUnicode_HASH(self);
11681     if (PyUnicode_READY(self) == -1)
11682         return -1;
11683 
11684     x = _Py_HashBytes(PyUnicode_DATA(self),
11685                       PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11686     _PyUnicode_HASH(self) = x;
11687     return x;
11688 }
11689 
11690 PyDoc_STRVAR(index__doc__,
11691              "S.index(sub[, start[, end]]) -> int\n\
11692 \n\
11693 Return the lowest index in S where substring sub is found,\n\
11694 such that sub is contained within S[start:end].  Optional\n\
11695 arguments start and end are interpreted as in slice notation.\n\
11696 \n\
11697 Raises ValueError when the substring is not found.");
11698 
11699 static PyObject *
unicode_index(PyObject * self,PyObject * args)11700 unicode_index(PyObject *self, PyObject *args)
11701 {
11702     /* initialize variables to prevent gcc warning */
11703     Py_ssize_t result;
11704     PyObject *substring = NULL;
11705     Py_ssize_t start = 0;
11706     Py_ssize_t end = 0;
11707 
11708     if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
11709         return NULL;
11710 
11711     if (PyUnicode_READY(self) == -1)
11712         return NULL;
11713 
11714     result = any_find_slice(self, substring, start, end, 1);
11715 
11716     if (result == -2)
11717         return NULL;
11718 
11719     if (result < 0) {
11720         PyErr_SetString(PyExc_ValueError, "substring not found");
11721         return NULL;
11722     }
11723 
11724     return PyLong_FromSsize_t(result);
11725 }
11726 
11727 /*[clinic input]
11728 str.isascii as unicode_isascii
11729 
11730 Return True if all characters in the string are ASCII, False otherwise.
11731 
11732 ASCII characters have code points in the range U+0000-U+007F.
11733 Empty string is ASCII too.
11734 [clinic start generated code]*/
11735 
11736 static PyObject *
unicode_isascii_impl(PyObject * self)11737 unicode_isascii_impl(PyObject *self)
11738 /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11739 {
11740     if (PyUnicode_READY(self) == -1) {
11741         return NULL;
11742     }
11743     return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11744 }
11745 
11746 /*[clinic input]
11747 str.islower as unicode_islower
11748 
11749 Return True if the string is a lowercase string, False otherwise.
11750 
11751 A string is lowercase if all cased characters in the string are lowercase and
11752 there is at least one cased character in the string.
11753 [clinic start generated code]*/
11754 
11755 static PyObject *
unicode_islower_impl(PyObject * self)11756 unicode_islower_impl(PyObject *self)
11757 /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
11758 {
11759     Py_ssize_t i, length;
11760     int kind;
11761     void *data;
11762     int cased;
11763 
11764     if (PyUnicode_READY(self) == -1)
11765         return NULL;
11766     length = PyUnicode_GET_LENGTH(self);
11767     kind = PyUnicode_KIND(self);
11768     data = PyUnicode_DATA(self);
11769 
11770     /* Shortcut for single character strings */
11771     if (length == 1)
11772         return PyBool_FromLong(
11773             Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11774 
11775     /* Special case for empty strings */
11776     if (length == 0)
11777         Py_RETURN_FALSE;
11778 
11779     cased = 0;
11780     for (i = 0; i < length; i++) {
11781         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11782 
11783         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11784             Py_RETURN_FALSE;
11785         else if (!cased && Py_UNICODE_ISLOWER(ch))
11786             cased = 1;
11787     }
11788     return PyBool_FromLong(cased);
11789 }
11790 
11791 /*[clinic input]
11792 str.isupper as unicode_isupper
11793 
11794 Return True if the string is an uppercase string, False otherwise.
11795 
11796 A string is uppercase if all cased characters in the string are uppercase and
11797 there is at least one cased character in the string.
11798 [clinic start generated code]*/
11799 
11800 static PyObject *
unicode_isupper_impl(PyObject * self)11801 unicode_isupper_impl(PyObject *self)
11802 /*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
11803 {
11804     Py_ssize_t i, length;
11805     int kind;
11806     void *data;
11807     int cased;
11808 
11809     if (PyUnicode_READY(self) == -1)
11810         return NULL;
11811     length = PyUnicode_GET_LENGTH(self);
11812     kind = PyUnicode_KIND(self);
11813     data = PyUnicode_DATA(self);
11814 
11815     /* Shortcut for single character strings */
11816     if (length == 1)
11817         return PyBool_FromLong(
11818             Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11819 
11820     /* Special case for empty strings */
11821     if (length == 0)
11822         Py_RETURN_FALSE;
11823 
11824     cased = 0;
11825     for (i = 0; i < length; i++) {
11826         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11827 
11828         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11829             Py_RETURN_FALSE;
11830         else if (!cased && Py_UNICODE_ISUPPER(ch))
11831             cased = 1;
11832     }
11833     return PyBool_FromLong(cased);
11834 }
11835 
11836 /*[clinic input]
11837 str.istitle as unicode_istitle
11838 
11839 Return True if the string is a title-cased string, False otherwise.
11840 
11841 In a title-cased string, upper- and title-case characters may only
11842 follow uncased characters and lowercase characters only cased ones.
11843 [clinic start generated code]*/
11844 
11845 static PyObject *
unicode_istitle_impl(PyObject * self)11846 unicode_istitle_impl(PyObject *self)
11847 /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
11848 {
11849     Py_ssize_t i, length;
11850     int kind;
11851     void *data;
11852     int cased, previous_is_cased;
11853 
11854     if (PyUnicode_READY(self) == -1)
11855         return NULL;
11856     length = PyUnicode_GET_LENGTH(self);
11857     kind = PyUnicode_KIND(self);
11858     data = PyUnicode_DATA(self);
11859 
11860     /* Shortcut for single character strings */
11861     if (length == 1) {
11862         Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11863         return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11864                                (Py_UNICODE_ISUPPER(ch) != 0));
11865     }
11866 
11867     /* Special case for empty strings */
11868     if (length == 0)
11869         Py_RETURN_FALSE;
11870 
11871     cased = 0;
11872     previous_is_cased = 0;
11873     for (i = 0; i < length; i++) {
11874         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11875 
11876         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11877             if (previous_is_cased)
11878                 Py_RETURN_FALSE;
11879             previous_is_cased = 1;
11880             cased = 1;
11881         }
11882         else if (Py_UNICODE_ISLOWER(ch)) {
11883             if (!previous_is_cased)
11884                 Py_RETURN_FALSE;
11885             previous_is_cased = 1;
11886             cased = 1;
11887         }
11888         else
11889             previous_is_cased = 0;
11890     }
11891     return PyBool_FromLong(cased);
11892 }
11893 
11894 /*[clinic input]
11895 str.isspace as unicode_isspace
11896 
11897 Return True if the string is a whitespace string, False otherwise.
11898 
11899 A string is whitespace if all characters in the string are whitespace and there
11900 is at least one character in the string.
11901 [clinic start generated code]*/
11902 
11903 static PyObject *
unicode_isspace_impl(PyObject * self)11904 unicode_isspace_impl(PyObject *self)
11905 /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
11906 {
11907     Py_ssize_t i, length;
11908     int kind;
11909     void *data;
11910 
11911     if (PyUnicode_READY(self) == -1)
11912         return NULL;
11913     length = PyUnicode_GET_LENGTH(self);
11914     kind = PyUnicode_KIND(self);
11915     data = PyUnicode_DATA(self);
11916 
11917     /* Shortcut for single character strings */
11918     if (length == 1)
11919         return PyBool_FromLong(
11920             Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11921 
11922     /* Special case for empty strings */
11923     if (length == 0)
11924         Py_RETURN_FALSE;
11925 
11926     for (i = 0; i < length; i++) {
11927         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11928         if (!Py_UNICODE_ISSPACE(ch))
11929             Py_RETURN_FALSE;
11930     }
11931     Py_RETURN_TRUE;
11932 }
11933 
11934 /*[clinic input]
11935 str.isalpha as unicode_isalpha
11936 
11937 Return True if the string is an alphabetic string, False otherwise.
11938 
11939 A string is alphabetic if all characters in the string are alphabetic and there
11940 is at least one character in the string.
11941 [clinic start generated code]*/
11942 
11943 static PyObject *
unicode_isalpha_impl(PyObject * self)11944 unicode_isalpha_impl(PyObject *self)
11945 /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
11946 {
11947     Py_ssize_t i, length;
11948     int kind;
11949     void *data;
11950 
11951     if (PyUnicode_READY(self) == -1)
11952         return NULL;
11953     length = PyUnicode_GET_LENGTH(self);
11954     kind = PyUnicode_KIND(self);
11955     data = PyUnicode_DATA(self);
11956 
11957     /* Shortcut for single character strings */
11958     if (length == 1)
11959         return PyBool_FromLong(
11960             Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11961 
11962     /* Special case for empty strings */
11963     if (length == 0)
11964         Py_RETURN_FALSE;
11965 
11966     for (i = 0; i < length; i++) {
11967         if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11968             Py_RETURN_FALSE;
11969     }
11970     Py_RETURN_TRUE;
11971 }
11972 
11973 /*[clinic input]
11974 str.isalnum as unicode_isalnum
11975 
11976 Return True if the string is an alpha-numeric string, False otherwise.
11977 
11978 A string is alpha-numeric if all characters in the string are alpha-numeric and
11979 there is at least one character in the string.
11980 [clinic start generated code]*/
11981 
11982 static PyObject *
unicode_isalnum_impl(PyObject * self)11983 unicode_isalnum_impl(PyObject *self)
11984 /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
11985 {
11986     int kind;
11987     void *data;
11988     Py_ssize_t len, i;
11989 
11990     if (PyUnicode_READY(self) == -1)
11991         return NULL;
11992 
11993     kind = PyUnicode_KIND(self);
11994     data = PyUnicode_DATA(self);
11995     len = PyUnicode_GET_LENGTH(self);
11996 
11997     /* Shortcut for single character strings */
11998     if (len == 1) {
11999         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12000         return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12001     }
12002 
12003     /* Special case for empty strings */
12004     if (len == 0)
12005         Py_RETURN_FALSE;
12006 
12007     for (i = 0; i < len; i++) {
12008         const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12009         if (!Py_UNICODE_ISALNUM(ch))
12010             Py_RETURN_FALSE;
12011     }
12012     Py_RETURN_TRUE;
12013 }
12014 
12015 /*[clinic input]
12016 str.isdecimal as unicode_isdecimal
12017 
12018 Return True if the string is a decimal string, False otherwise.
12019 
12020 A string is a decimal string if all characters in the string are decimal and
12021 there is at least one character in the string.
12022 [clinic start generated code]*/
12023 
12024 static PyObject *
unicode_isdecimal_impl(PyObject * self)12025 unicode_isdecimal_impl(PyObject *self)
12026 /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
12027 {
12028     Py_ssize_t i, length;
12029     int kind;
12030     void *data;
12031 
12032     if (PyUnicode_READY(self) == -1)
12033         return NULL;
12034     length = PyUnicode_GET_LENGTH(self);
12035     kind = PyUnicode_KIND(self);
12036     data = PyUnicode_DATA(self);
12037 
12038     /* Shortcut for single character strings */
12039     if (length == 1)
12040         return PyBool_FromLong(
12041             Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12042 
12043     /* Special case for empty strings */
12044     if (length == 0)
12045         Py_RETURN_FALSE;
12046 
12047     for (i = 0; i < length; i++) {
12048         if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12049             Py_RETURN_FALSE;
12050     }
12051     Py_RETURN_TRUE;
12052 }
12053 
12054 /*[clinic input]
12055 str.isdigit as unicode_isdigit
12056 
12057 Return True if the string is a digit string, False otherwise.
12058 
12059 A string is a digit string if all characters in the string are digits and there
12060 is at least one character in the string.
12061 [clinic start generated code]*/
12062 
12063 static PyObject *
unicode_isdigit_impl(PyObject * self)12064 unicode_isdigit_impl(PyObject *self)
12065 /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
12066 {
12067     Py_ssize_t i, length;
12068     int kind;
12069     void *data;
12070 
12071     if (PyUnicode_READY(self) == -1)
12072         return NULL;
12073     length = PyUnicode_GET_LENGTH(self);
12074     kind = PyUnicode_KIND(self);
12075     data = PyUnicode_DATA(self);
12076 
12077     /* Shortcut for single character strings */
12078     if (length == 1) {
12079         const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12080         return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12081     }
12082 
12083     /* Special case for empty strings */
12084     if (length == 0)
12085         Py_RETURN_FALSE;
12086 
12087     for (i = 0; i < length; i++) {
12088         if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12089             Py_RETURN_FALSE;
12090     }
12091     Py_RETURN_TRUE;
12092 }
12093 
12094 /*[clinic input]
12095 str.isnumeric as unicode_isnumeric
12096 
12097 Return True if the string is a numeric string, False otherwise.
12098 
12099 A string is numeric if all characters in the string are numeric and there is at
12100 least one character in the string.
12101 [clinic start generated code]*/
12102 
12103 static PyObject *
unicode_isnumeric_impl(PyObject * self)12104 unicode_isnumeric_impl(PyObject *self)
12105 /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
12106 {
12107     Py_ssize_t i, length;
12108     int kind;
12109     void *data;
12110 
12111     if (PyUnicode_READY(self) == -1)
12112         return NULL;
12113     length = PyUnicode_GET_LENGTH(self);
12114     kind = PyUnicode_KIND(self);
12115     data = PyUnicode_DATA(self);
12116 
12117     /* Shortcut for single character strings */
12118     if (length == 1)
12119         return PyBool_FromLong(
12120             Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12121 
12122     /* Special case for empty strings */
12123     if (length == 0)
12124         Py_RETURN_FALSE;
12125 
12126     for (i = 0; i < length; i++) {
12127         if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12128             Py_RETURN_FALSE;
12129     }
12130     Py_RETURN_TRUE;
12131 }
12132 
12133 int
PyUnicode_IsIdentifier(PyObject * self)12134 PyUnicode_IsIdentifier(PyObject *self)
12135 {
12136     int kind;
12137     void *data;
12138     Py_ssize_t i;
12139     Py_UCS4 first;
12140 
12141     if (PyUnicode_READY(self) == -1) {
12142         Py_FatalError("identifier not ready");
12143         return 0;
12144     }
12145 
12146     /* Special case for empty strings */
12147     if (PyUnicode_GET_LENGTH(self) == 0)
12148         return 0;
12149     kind = PyUnicode_KIND(self);
12150     data = PyUnicode_DATA(self);
12151 
12152     /* PEP 3131 says that the first character must be in
12153        XID_Start and subsequent characters in XID_Continue,
12154        and for the ASCII range, the 2.x rules apply (i.e
12155        start with letters and underscore, continue with
12156        letters, digits, underscore). However, given the current
12157        definition of XID_Start and XID_Continue, it is sufficient
12158        to check just for these, except that _ must be allowed
12159        as starting an identifier.  */
12160     first = PyUnicode_READ(kind, data, 0);
12161     if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
12162         return 0;
12163 
12164     for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
12165         if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
12166             return 0;
12167     return 1;
12168 }
12169 
12170 /*[clinic input]
12171 str.isidentifier as unicode_isidentifier
12172 
12173 Return True if the string is a valid Python identifier, False otherwise.
12174 
12175 Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12176 such as "def" or "class".
12177 [clinic start generated code]*/
12178 
12179 static PyObject *
unicode_isidentifier_impl(PyObject * self)12180 unicode_isidentifier_impl(PyObject *self)
12181 /*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
12182 {
12183     return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12184 }
12185 
12186 /*[clinic input]
12187 str.isprintable as unicode_isprintable
12188 
12189 Return True if the string is printable, False otherwise.
12190 
12191 A string is printable if all of its characters are considered printable in
12192 repr() or if it is empty.
12193 [clinic start generated code]*/
12194 
12195 static PyObject *
unicode_isprintable_impl(PyObject * self)12196 unicode_isprintable_impl(PyObject *self)
12197 /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
12198 {
12199     Py_ssize_t i, length;
12200     int kind;
12201     void *data;
12202 
12203     if (PyUnicode_READY(self) == -1)
12204         return NULL;
12205     length = PyUnicode_GET_LENGTH(self);
12206     kind = PyUnicode_KIND(self);
12207     data = PyUnicode_DATA(self);
12208 
12209     /* Shortcut for single character strings */
12210     if (length == 1)
12211         return PyBool_FromLong(
12212             Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12213 
12214     for (i = 0; i < length; i++) {
12215         if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12216             Py_RETURN_FALSE;
12217         }
12218     }
12219     Py_RETURN_TRUE;
12220 }
12221 
12222 /*[clinic input]
12223 str.join as unicode_join
12224 
12225     iterable: object
12226     /
12227 
12228 Concatenate any number of strings.
12229 
12230 The string whose method is called is inserted in between each given string.
12231 The result is returned as a new string.
12232 
12233 Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12234 [clinic start generated code]*/
12235 
12236 static PyObject *
unicode_join(PyObject * self,PyObject * iterable)12237 unicode_join(PyObject *self, PyObject *iterable)
12238 /*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
12239 {
12240     return PyUnicode_Join(self, iterable);
12241 }
12242 
12243 static Py_ssize_t
unicode_length(PyObject * self)12244 unicode_length(PyObject *self)
12245 {
12246     if (PyUnicode_READY(self) == -1)
12247         return -1;
12248     return PyUnicode_GET_LENGTH(self);
12249 }
12250 
12251 /*[clinic input]
12252 str.ljust as unicode_ljust
12253 
12254     width: Py_ssize_t
12255     fillchar: Py_UCS4 = ' '
12256     /
12257 
12258 Return a left-justified string of length width.
12259 
12260 Padding is done using the specified fill character (default is a space).
12261 [clinic start generated code]*/
12262 
12263 static PyObject *
unicode_ljust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12264 unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12265 /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12266 {
12267     if (PyUnicode_READY(self) == -1)
12268         return NULL;
12269 
12270     if (PyUnicode_GET_LENGTH(self) >= width)
12271         return unicode_result_unchanged(self);
12272 
12273     return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12274 }
12275 
12276 /*[clinic input]
12277 str.lower as unicode_lower
12278 
12279 Return a copy of the string converted to lowercase.
12280 [clinic start generated code]*/
12281 
12282 static PyObject *
unicode_lower_impl(PyObject * self)12283 unicode_lower_impl(PyObject *self)
12284 /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12285 {
12286     if (PyUnicode_READY(self) == -1)
12287         return NULL;
12288     if (PyUnicode_IS_ASCII(self))
12289         return ascii_upper_or_lower(self, 1);
12290     return case_operation(self, do_lower);
12291 }
12292 
/* Strip modes passed as `striptype` to the strip helpers below:
   strip the left end only, the right end only, or both ends. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Method name corresponding to strip mode i (used in error messages). */
#define STRIPNAME(i) (stripfuncnames[i])
12301 
12302 /* externally visible for str.strip(unicode) */
12303 PyObject *
_PyUnicode_XStrip(PyObject * self,int striptype,PyObject * sepobj)12304 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12305 {
12306     void *data;
12307     int kind;
12308     Py_ssize_t i, j, len;
12309     BLOOM_MASK sepmask;
12310     Py_ssize_t seplen;
12311 
12312     if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12313         return NULL;
12314 
12315     kind = PyUnicode_KIND(self);
12316     data = PyUnicode_DATA(self);
12317     len = PyUnicode_GET_LENGTH(self);
12318     seplen = PyUnicode_GET_LENGTH(sepobj);
12319     sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12320                               PyUnicode_DATA(sepobj),
12321                               seplen);
12322 
12323     i = 0;
12324     if (striptype != RIGHTSTRIP) {
12325         while (i < len) {
12326             Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12327             if (!BLOOM(sepmask, ch))
12328                 break;
12329             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12330                 break;
12331             i++;
12332         }
12333     }
12334 
12335     j = len;
12336     if (striptype != LEFTSTRIP) {
12337         j--;
12338         while (j >= i) {
12339             Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12340             if (!BLOOM(sepmask, ch))
12341                 break;
12342             if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12343                 break;
12344             j--;
12345         }
12346 
12347         j++;
12348     }
12349 
12350     return PyUnicode_Substring(self, i, j);
12351 }
12352 
12353 PyObject*
PyUnicode_Substring(PyObject * self,Py_ssize_t start,Py_ssize_t end)12354 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12355 {
12356     unsigned char *data;
12357     int kind;
12358     Py_ssize_t length;
12359 
12360     if (PyUnicode_READY(self) == -1)
12361         return NULL;
12362 
12363     length = PyUnicode_GET_LENGTH(self);
12364     end = Py_MIN(end, length);
12365 
12366     if (start == 0 && end == length)
12367         return unicode_result_unchanged(self);
12368 
12369     if (start < 0 || end < 0) {
12370         PyErr_SetString(PyExc_IndexError, "string index out of range");
12371         return NULL;
12372     }
12373     if (start >= length || end < start)
12374         _Py_RETURN_UNICODE_EMPTY();
12375 
12376     length = end - start;
12377     if (PyUnicode_IS_ASCII(self)) {
12378         data = PyUnicode_1BYTE_DATA(self);
12379         return _PyUnicode_FromASCII((char*)(data + start), length);
12380     }
12381     else {
12382         kind = PyUnicode_KIND(self);
12383         data = PyUnicode_1BYTE_DATA(self);
12384         return PyUnicode_FromKindAndData(kind,
12385                                          data + kind * start,
12386                                          length);
12387     }
12388 }
12389 
12390 static PyObject *
do_strip(PyObject * self,int striptype)12391 do_strip(PyObject *self, int striptype)
12392 {
12393     Py_ssize_t len, i, j;
12394 
12395     if (PyUnicode_READY(self) == -1)
12396         return NULL;
12397 
12398     len = PyUnicode_GET_LENGTH(self);
12399 
12400     if (PyUnicode_IS_ASCII(self)) {
12401         Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12402 
12403         i = 0;
12404         if (striptype != RIGHTSTRIP) {
12405             while (i < len) {
12406                 Py_UCS1 ch = data[i];
12407                 if (!_Py_ascii_whitespace[ch])
12408                     break;
12409                 i++;
12410             }
12411         }
12412 
12413         j = len;
12414         if (striptype != LEFTSTRIP) {
12415             j--;
12416             while (j >= i) {
12417                 Py_UCS1 ch = data[j];
12418                 if (!_Py_ascii_whitespace[ch])
12419                     break;
12420                 j--;
12421             }
12422             j++;
12423         }
12424     }
12425     else {
12426         int kind = PyUnicode_KIND(self);
12427         void *data = PyUnicode_DATA(self);
12428 
12429         i = 0;
12430         if (striptype != RIGHTSTRIP) {
12431             while (i < len) {
12432                 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12433                 if (!Py_UNICODE_ISSPACE(ch))
12434                     break;
12435                 i++;
12436             }
12437         }
12438 
12439         j = len;
12440         if (striptype != LEFTSTRIP) {
12441             j--;
12442             while (j >= i) {
12443                 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12444                 if (!Py_UNICODE_ISSPACE(ch))
12445                     break;
12446                 j--;
12447             }
12448             j++;
12449         }
12450     }
12451 
12452     return PyUnicode_Substring(self, i, j);
12453 }
12454 
12455 
12456 static PyObject *
do_argstrip(PyObject * self,int striptype,PyObject * sep)12457 do_argstrip(PyObject *self, int striptype, PyObject *sep)
12458 {
12459     if (sep != Py_None) {
12460         if (PyUnicode_Check(sep))
12461             return _PyUnicode_XStrip(self, striptype, sep);
12462         else {
12463             PyErr_Format(PyExc_TypeError,
12464                          "%s arg must be None or str",
12465                          STRIPNAME(striptype));
12466             return NULL;
12467         }
12468     }
12469 
12470     return do_strip(self, striptype);
12471 }
12472 
12473 
12474 /*[clinic input]
12475 str.strip as unicode_strip
12476 
12477     chars: object = None
12478     /
12479 
12480 Return a copy of the string with leading and trailing whitespace removed.
12481 
12482 If chars is given and not None, remove characters in chars instead.
12483 [clinic start generated code]*/
12484 
12485 static PyObject *
unicode_strip_impl(PyObject * self,PyObject * chars)12486 unicode_strip_impl(PyObject *self, PyObject *chars)
12487 /*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
12488 {
12489     return do_argstrip(self, BOTHSTRIP, chars);
12490 }
12491 
12492 
12493 /*[clinic input]
12494 str.lstrip as unicode_lstrip
12495 
12496     chars: object = None
12497     /
12498 
12499 Return a copy of the string with leading whitespace removed.
12500 
12501 If chars is given and not None, remove characters in chars instead.
12502 [clinic start generated code]*/
12503 
12504 static PyObject *
unicode_lstrip_impl(PyObject * self,PyObject * chars)12505 unicode_lstrip_impl(PyObject *self, PyObject *chars)
12506 /*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
12507 {
12508     return do_argstrip(self, LEFTSTRIP, chars);
12509 }
12510 
12511 
12512 /*[clinic input]
12513 str.rstrip as unicode_rstrip
12514 
12515     chars: object = None
12516     /
12517 
12518 Return a copy of the string with trailing whitespace removed.
12519 
12520 If chars is given and not None, remove characters in chars instead.
12521 [clinic start generated code]*/
12522 
12523 static PyObject *
unicode_rstrip_impl(PyObject * self,PyObject * chars)12524 unicode_rstrip_impl(PyObject *self, PyObject *chars)
12525 /*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
12526 {
12527     return do_argstrip(self, RIGHTSTRIP, chars);
12528 }
12529 
12530 
12531 static PyObject*
unicode_repeat(PyObject * str,Py_ssize_t len)12532 unicode_repeat(PyObject *str, Py_ssize_t len)
12533 {
12534     PyObject *u;
12535     Py_ssize_t nchars, n;
12536 
12537     if (len < 1)
12538         _Py_RETURN_UNICODE_EMPTY();
12539 
12540     /* no repeat, return original string */
12541     if (len == 1)
12542         return unicode_result_unchanged(str);
12543 
12544     if (PyUnicode_READY(str) == -1)
12545         return NULL;
12546 
12547     if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12548         PyErr_SetString(PyExc_OverflowError,
12549                         "repeated string is too long");
12550         return NULL;
12551     }
12552     nchars = len * PyUnicode_GET_LENGTH(str);
12553 
12554     u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12555     if (!u)
12556         return NULL;
12557     assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12558 
12559     if (PyUnicode_GET_LENGTH(str) == 1) {
12560         const int kind = PyUnicode_KIND(str);
12561         const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12562         if (kind == PyUnicode_1BYTE_KIND) {
12563             void *to = PyUnicode_DATA(u);
12564             memset(to, (unsigned char)fill_char, len);
12565         }
12566         else if (kind == PyUnicode_2BYTE_KIND) {
12567             Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12568             for (n = 0; n < len; ++n)
12569                 ucs2[n] = fill_char;
12570         } else {
12571             Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12572             assert(kind == PyUnicode_4BYTE_KIND);
12573             for (n = 0; n < len; ++n)
12574                 ucs4[n] = fill_char;
12575         }
12576     }
12577     else {
12578         /* number of characters copied this far */
12579         Py_ssize_t done = PyUnicode_GET_LENGTH(str);
12580         const Py_ssize_t char_size = PyUnicode_KIND(str);
12581         char *to = (char *) PyUnicode_DATA(u);
12582         memcpy(to, PyUnicode_DATA(str),
12583                   PyUnicode_GET_LENGTH(str) * char_size);
12584         while (done < nchars) {
12585             n = (done <= nchars-done) ? done : nchars-done;
12586             memcpy(to + (done * char_size), to, n * char_size);
12587             done += n;
12588         }
12589     }
12590 
12591     assert(_PyUnicode_CheckConsistency(u, 1));
12592     return u;
12593 }
12594 
12595 PyObject *
PyUnicode_Replace(PyObject * str,PyObject * substr,PyObject * replstr,Py_ssize_t maxcount)12596 PyUnicode_Replace(PyObject *str,
12597                   PyObject *substr,
12598                   PyObject *replstr,
12599                   Py_ssize_t maxcount)
12600 {
12601     if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12602             ensure_unicode(replstr) < 0)
12603         return NULL;
12604     return replace(str, substr, replstr, maxcount);
12605 }
12606 
12607 /*[clinic input]
12608 str.replace as unicode_replace
12609 
12610     old: unicode
12611     new: unicode
12612     count: Py_ssize_t = -1
12613         Maximum number of occurrences to replace.
12614         -1 (the default value) means replace all occurrences.
12615     /
12616 
12617 Return a copy with all occurrences of substring old replaced by new.
12618 
12619 If the optional argument count is given, only the first count occurrences are
12620 replaced.
12621 [clinic start generated code]*/
12622 
12623 static PyObject *
unicode_replace_impl(PyObject * self,PyObject * old,PyObject * new,Py_ssize_t count)12624 unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12625                      Py_ssize_t count)
12626 /*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
12627 {
12628     if (PyUnicode_READY(self) == -1)
12629         return NULL;
12630     return replace(self, old, new, count);
12631 }
12632 
12633 static PyObject *
unicode_repr(PyObject * unicode)12634 unicode_repr(PyObject *unicode)
12635 {
12636     PyObject *repr;
12637     Py_ssize_t isize;
12638     Py_ssize_t osize, squote, dquote, i, o;
12639     Py_UCS4 max, quote;
12640     int ikind, okind, unchanged;
12641     void *idata, *odata;
12642 
12643     if (PyUnicode_READY(unicode) == -1)
12644         return NULL;
12645 
12646     isize = PyUnicode_GET_LENGTH(unicode);
12647     idata = PyUnicode_DATA(unicode);
12648 
12649     /* Compute length of output, quote characters, and
12650        maximum character */
12651     osize = 0;
12652     max = 127;
12653     squote = dquote = 0;
12654     ikind = PyUnicode_KIND(unicode);
12655     for (i = 0; i < isize; i++) {
12656         Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12657         Py_ssize_t incr = 1;
12658         switch (ch) {
12659         case '\'': squote++; break;
12660         case '"':  dquote++; break;
12661         case '\\': case '\t': case '\r': case '\n':
12662             incr = 2;
12663             break;
12664         default:
12665             /* Fast-path ASCII */
12666             if (ch < ' ' || ch == 0x7f)
12667                 incr = 4; /* \xHH */
12668             else if (ch < 0x7f)
12669                 ;
12670             else if (Py_UNICODE_ISPRINTABLE(ch))
12671                 max = ch > max ? ch : max;
12672             else if (ch < 0x100)
12673                 incr = 4; /* \xHH */
12674             else if (ch < 0x10000)
12675                 incr = 6; /* \uHHHH */
12676             else
12677                 incr = 10; /* \uHHHHHHHH */
12678         }
12679         if (osize > PY_SSIZE_T_MAX - incr) {
12680             PyErr_SetString(PyExc_OverflowError,
12681                             "string is too long to generate repr");
12682             return NULL;
12683         }
12684         osize += incr;
12685     }
12686 
12687     quote = '\'';
12688     unchanged = (osize == isize);
12689     if (squote) {
12690         unchanged = 0;
12691         if (dquote)
12692             /* Both squote and dquote present. Use squote,
12693                and escape them */
12694             osize += squote;
12695         else
12696             quote = '"';
12697     }
12698     osize += 2;   /* quotes */
12699 
12700     repr = PyUnicode_New(osize, max);
12701     if (repr == NULL)
12702         return NULL;
12703     okind = PyUnicode_KIND(repr);
12704     odata = PyUnicode_DATA(repr);
12705 
12706     PyUnicode_WRITE(okind, odata, 0, quote);
12707     PyUnicode_WRITE(okind, odata, osize-1, quote);
12708     if (unchanged) {
12709         _PyUnicode_FastCopyCharacters(repr, 1,
12710                                       unicode, 0,
12711                                       isize);
12712     }
12713     else {
12714         for (i = 0, o = 1; i < isize; i++) {
12715             Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12716 
12717             /* Escape quotes and backslashes */
12718             if ((ch == quote) || (ch == '\\')) {
12719                 PyUnicode_WRITE(okind, odata, o++, '\\');
12720                 PyUnicode_WRITE(okind, odata, o++, ch);
12721                 continue;
12722             }
12723 
12724             /* Map special whitespace to '\t', \n', '\r' */
12725             if (ch == '\t') {
12726                 PyUnicode_WRITE(okind, odata, o++, '\\');
12727                 PyUnicode_WRITE(okind, odata, o++, 't');
12728             }
12729             else if (ch == '\n') {
12730                 PyUnicode_WRITE(okind, odata, o++, '\\');
12731                 PyUnicode_WRITE(okind, odata, o++, 'n');
12732             }
12733             else if (ch == '\r') {
12734                 PyUnicode_WRITE(okind, odata, o++, '\\');
12735                 PyUnicode_WRITE(okind, odata, o++, 'r');
12736             }
12737 
12738             /* Map non-printable US ASCII to '\xhh' */
12739             else if (ch < ' ' || ch == 0x7F) {
12740                 PyUnicode_WRITE(okind, odata, o++, '\\');
12741                 PyUnicode_WRITE(okind, odata, o++, 'x');
12742                 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12743                 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12744             }
12745 
12746             /* Copy ASCII characters as-is */
12747             else if (ch < 0x7F) {
12748                 PyUnicode_WRITE(okind, odata, o++, ch);
12749             }
12750 
12751             /* Non-ASCII characters */
12752             else {
12753                 /* Map Unicode whitespace and control characters
12754                    (categories Z* and C* except ASCII space)
12755                 */
12756                 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12757                     PyUnicode_WRITE(okind, odata, o++, '\\');
12758                     /* Map 8-bit characters to '\xhh' */
12759                     if (ch <= 0xff) {
12760                         PyUnicode_WRITE(okind, odata, o++, 'x');
12761                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12762                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12763                     }
12764                     /* Map 16-bit characters to '\uxxxx' */
12765                     else if (ch <= 0xffff) {
12766                         PyUnicode_WRITE(okind, odata, o++, 'u');
12767                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12768                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12769                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12770                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12771                     }
12772                     /* Map 21-bit characters to '\U00xxxxxx' */
12773                     else {
12774                         PyUnicode_WRITE(okind, odata, o++, 'U');
12775                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12776                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12777                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12778                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12779                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12780                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12781                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12782                         PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12783                     }
12784                 }
12785                 /* Copy characters as-is */
12786                 else {
12787                     PyUnicode_WRITE(okind, odata, o++, ch);
12788                 }
12789             }
12790         }
12791     }
12792     /* Closing quote already added at the beginning */
12793     assert(_PyUnicode_CheckConsistency(repr, 1));
12794     return repr;
12795 }
12796 
12797 PyDoc_STRVAR(rfind__doc__,
12798              "S.rfind(sub[, start[, end]]) -> int\n\
12799 \n\
12800 Return the highest index in S where substring sub is found,\n\
12801 such that sub is contained within S[start:end].  Optional\n\
12802 arguments start and end are interpreted as in slice notation.\n\
12803 \n\
12804 Return -1 on failure.");
12805 
12806 static PyObject *
unicode_rfind(PyObject * self,PyObject * args)12807 unicode_rfind(PyObject *self, PyObject *args)
12808 {
12809     /* initialize variables to prevent gcc warning */
12810     PyObject *substring = NULL;
12811     Py_ssize_t start = 0;
12812     Py_ssize_t end = 0;
12813     Py_ssize_t result;
12814 
12815     if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
12816         return NULL;
12817 
12818     if (PyUnicode_READY(self) == -1)
12819         return NULL;
12820 
12821     result = any_find_slice(self, substring, start, end, -1);
12822 
12823     if (result == -2)
12824         return NULL;
12825 
12826     return PyLong_FromSsize_t(result);
12827 }
12828 
12829 PyDoc_STRVAR(rindex__doc__,
12830              "S.rindex(sub[, start[, end]]) -> int\n\
12831 \n\
12832 Return the highest index in S where substring sub is found,\n\
12833 such that sub is contained within S[start:end].  Optional\n\
12834 arguments start and end are interpreted as in slice notation.\n\
12835 \n\
12836 Raises ValueError when the substring is not found.");
12837 
12838 static PyObject *
unicode_rindex(PyObject * self,PyObject * args)12839 unicode_rindex(PyObject *self, PyObject *args)
12840 {
12841     /* initialize variables to prevent gcc warning */
12842     PyObject *substring = NULL;
12843     Py_ssize_t start = 0;
12844     Py_ssize_t end = 0;
12845     Py_ssize_t result;
12846 
12847     if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
12848         return NULL;
12849 
12850     if (PyUnicode_READY(self) == -1)
12851         return NULL;
12852 
12853     result = any_find_slice(self, substring, start, end, -1);
12854 
12855     if (result == -2)
12856         return NULL;
12857 
12858     if (result < 0) {
12859         PyErr_SetString(PyExc_ValueError, "substring not found");
12860         return NULL;
12861     }
12862 
12863     return PyLong_FromSsize_t(result);
12864 }
12865 
12866 /*[clinic input]
12867 str.rjust as unicode_rjust
12868 
12869     width: Py_ssize_t
12870     fillchar: Py_UCS4 = ' '
12871     /
12872 
12873 Return a right-justified string of length width.
12874 
12875 Padding is done using the specified fill character (default is a space).
12876 [clinic start generated code]*/
12877 
12878 static PyObject *
unicode_rjust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12879 unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12880 /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12881 {
12882     if (PyUnicode_READY(self) == -1)
12883         return NULL;
12884 
12885     if (PyUnicode_GET_LENGTH(self) >= width)
12886         return unicode_result_unchanged(self);
12887 
12888     return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12889 }
12890 
12891 PyObject *
PyUnicode_Split(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)12892 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12893 {
12894     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12895         return NULL;
12896 
12897     return split(s, sep, maxsplit);
12898 }
12899 
12900 /*[clinic input]
12901 str.split as unicode_split
12902 
12903     sep: object = None
12904         The delimiter according which to split the string.
12905         None (the default value) means split according to any whitespace,
12906         and discard empty strings from the result.
12907     maxsplit: Py_ssize_t = -1
12908         Maximum number of splits to do.
12909         -1 (the default value) means no limit.
12910 
12911 Return a list of the words in the string, using sep as the delimiter string.
12912 [clinic start generated code]*/
12913 
12914 static PyObject *
unicode_split_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)12915 unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12916 /*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
12917 {
12918     if (sep == Py_None)
12919         return split(self, NULL, maxsplit);
12920     if (PyUnicode_Check(sep))
12921         return split(self, sep, maxsplit);
12922 
12923     PyErr_Format(PyExc_TypeError,
12924                  "must be str or None, not %.100s",
12925                  Py_TYPE(sep)->tp_name);
12926     return NULL;
12927 }
12928 
12929 PyObject *
PyUnicode_Partition(PyObject * str_obj,PyObject * sep_obj)12930 PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
12931 {
12932     PyObject* out;
12933     int kind1, kind2;
12934     void *buf1, *buf2;
12935     Py_ssize_t len1, len2;
12936 
12937     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12938         return NULL;
12939 
12940     kind1 = PyUnicode_KIND(str_obj);
12941     kind2 = PyUnicode_KIND(sep_obj);
12942     len1 = PyUnicode_GET_LENGTH(str_obj);
12943     len2 = PyUnicode_GET_LENGTH(sep_obj);
12944     if (kind1 < kind2 || len1 < len2) {
12945         _Py_INCREF_UNICODE_EMPTY();
12946         if (!unicode_empty)
12947             out = NULL;
12948         else {
12949             out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12950             Py_DECREF(unicode_empty);
12951         }
12952         return out;
12953     }
12954     buf1 = PyUnicode_DATA(str_obj);
12955     buf2 = PyUnicode_DATA(sep_obj);
12956     if (kind2 != kind1) {
12957         buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12958         if (!buf2)
12959             return NULL;
12960     }
12961 
12962     switch (kind1) {
12963     case PyUnicode_1BYTE_KIND:
12964         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12965             out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12966         else
12967             out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12968         break;
12969     case PyUnicode_2BYTE_KIND:
12970         out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12971         break;
12972     case PyUnicode_4BYTE_KIND:
12973         out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12974         break;
12975     default:
12976         Py_UNREACHABLE();
12977     }
12978 
12979     if (kind2 != kind1)
12980         PyMem_Free(buf2);
12981 
12982     return out;
12983 }
12984 
12985 
12986 PyObject *
PyUnicode_RPartition(PyObject * str_obj,PyObject * sep_obj)12987 PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
12988 {
12989     PyObject* out;
12990     int kind1, kind2;
12991     void *buf1, *buf2;
12992     Py_ssize_t len1, len2;
12993 
12994     if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12995         return NULL;
12996 
12997     kind1 = PyUnicode_KIND(str_obj);
12998     kind2 = PyUnicode_KIND(sep_obj);
12999     len1 = PyUnicode_GET_LENGTH(str_obj);
13000     len2 = PyUnicode_GET_LENGTH(sep_obj);
13001     if (kind1 < kind2 || len1 < len2) {
13002         _Py_INCREF_UNICODE_EMPTY();
13003         if (!unicode_empty)
13004             out = NULL;
13005         else {
13006             out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13007             Py_DECREF(unicode_empty);
13008         }
13009         return out;
13010     }
13011     buf1 = PyUnicode_DATA(str_obj);
13012     buf2 = PyUnicode_DATA(sep_obj);
13013     if (kind2 != kind1) {
13014         buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13015         if (!buf2)
13016             return NULL;
13017     }
13018 
13019     switch (kind1) {
13020     case PyUnicode_1BYTE_KIND:
13021         if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13022             out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13023         else
13024             out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13025         break;
13026     case PyUnicode_2BYTE_KIND:
13027         out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13028         break;
13029     case PyUnicode_4BYTE_KIND:
13030         out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13031         break;
13032     default:
13033         Py_UNREACHABLE();
13034     }
13035 
13036     if (kind2 != kind1)
13037         PyMem_Free(buf2);
13038 
13039     return out;
13040 }
13041 
13042 /*[clinic input]
13043 str.partition as unicode_partition
13044 
13045     sep: object
13046     /
13047 
13048 Partition the string into three parts using the given separator.
13049 
13050 This will search for the separator in the string.  If the separator is found,
13051 returns a 3-tuple containing the part before the separator, the separator
13052 itself, and the part after it.
13053 
13054 If the separator is not found, returns a 3-tuple containing the original string
13055 and two empty strings.
13056 [clinic start generated code]*/
13057 
13058 static PyObject *
unicode_partition(PyObject * self,PyObject * sep)13059 unicode_partition(PyObject *self, PyObject *sep)
13060 /*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
13061 {
13062     return PyUnicode_Partition(self, sep);
13063 }
13064 
13065 /*[clinic input]
13066 str.rpartition as unicode_rpartition = str.partition
13067 
13068 Partition the string into three parts using the given separator.
13069 
13070 This will search for the separator in the string, starting at the end. If
13071 the separator is found, returns a 3-tuple containing the part before the
13072 separator, the separator itself, and the part after it.
13073 
13074 If the separator is not found, returns a 3-tuple containing two empty strings
13075 and the original string.
13076 [clinic start generated code]*/
13077 
13078 static PyObject *
unicode_rpartition(PyObject * self,PyObject * sep)13079 unicode_rpartition(PyObject *self, PyObject *sep)
13080 /*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
13081 {
13082     return PyUnicode_RPartition(self, sep);
13083 }
13084 
13085 PyObject *
PyUnicode_RSplit(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13086 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13087 {
13088     if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13089         return NULL;
13090 
13091     return rsplit(s, sep, maxsplit);
13092 }
13093 
13094 /*[clinic input]
13095 str.rsplit as unicode_rsplit = str.split
13096 
13097 Return a list of the words in the string, using sep as the delimiter string.
13098 
13099 Splits are done starting at the end of the string and working to the front.
13100 [clinic start generated code]*/
13101 
13102 static PyObject *
unicode_rsplit_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13103 unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13104 /*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
13105 {
13106     if (sep == Py_None)
13107         return rsplit(self, NULL, maxsplit);
13108     if (PyUnicode_Check(sep))
13109         return rsplit(self, sep, maxsplit);
13110 
13111     PyErr_Format(PyExc_TypeError,
13112                  "must be str or None, not %.100s",
13113                  Py_TYPE(sep)->tp_name);
13114     return NULL;
13115 }
13116 
13117 /*[clinic input]
13118 str.splitlines as unicode_splitlines
13119 
13120     keepends: bool(accept={int}) = False
13121 
13122 Return a list of the lines in the string, breaking at line boundaries.
13123 
13124 Line breaks are not included in the resulting list unless keepends is given and
13125 true.
13126 [clinic start generated code]*/
13127 
13128 static PyObject *
unicode_splitlines_impl(PyObject * self,int keepends)13129 unicode_splitlines_impl(PyObject *self, int keepends)
13130 /*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
13131 {
13132     return PyUnicode_Splitlines(self, keepends);
13133 }
13134 
13135 static
unicode_str(PyObject * self)13136 PyObject *unicode_str(PyObject *self)
13137 {
13138     return unicode_result_unchanged(self);
13139 }
13140 
13141 /*[clinic input]
13142 str.swapcase as unicode_swapcase
13143 
13144 Convert uppercase characters to lowercase and lowercase characters to uppercase.
13145 [clinic start generated code]*/
13146 
13147 static PyObject *
unicode_swapcase_impl(PyObject * self)13148 unicode_swapcase_impl(PyObject *self)
13149 /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
13150 {
13151     if (PyUnicode_READY(self) == -1)
13152         return NULL;
13153     return case_operation(self, do_swapcase);
13154 }
13155 
13156 /*[clinic input]
13157 
13158 @staticmethod
13159 str.maketrans as unicode_maketrans
13160 
13161   x: object
13162 
13163   y: unicode=NULL
13164 
13165   z: unicode=NULL
13166 
13167   /
13168 
13169 Return a translation table usable for str.translate().
13170 
13171 If there is only one argument, it must be a dictionary mapping Unicode
13172 ordinals (integers) or characters to Unicode ordinals, strings or None.
13173 Character keys will be then converted to ordinals.
13174 If there are two arguments, they must be strings of equal length, and
13175 in the resulting dictionary, each character in x will be mapped to the
13176 character at the same position in y. If there is a third argument, it
13177 must be a string, whose characters will be mapped to None in the result.
13178 [clinic start generated code]*/
13179 
13180 static PyObject *
unicode_maketrans_impl(PyObject * x,PyObject * y,PyObject * z)13181 unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
13182 /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
13183 {
13184     PyObject *new = NULL, *key, *value;
13185     Py_ssize_t i = 0;
13186     int res;
13187 
13188     new = PyDict_New();
13189     if (!new)
13190         return NULL;
13191     if (y != NULL) {
13192         int x_kind, y_kind, z_kind;
13193         void *x_data, *y_data, *z_data;
13194 
13195         /* x must be a string too, of equal length */
13196         if (!PyUnicode_Check(x)) {
13197             PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13198                             "be a string if there is a second argument");
13199             goto err;
13200         }
13201         if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13202             PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13203                             "arguments must have equal length");
13204             goto err;
13205         }
13206         /* create entries for translating chars in x to those in y */
13207         x_kind = PyUnicode_KIND(x);
13208         y_kind = PyUnicode_KIND(y);
13209         x_data = PyUnicode_DATA(x);
13210         y_data = PyUnicode_DATA(y);
13211         for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13212             key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13213             if (!key)
13214                 goto err;
13215             value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13216             if (!value) {
13217                 Py_DECREF(key);
13218                 goto err;
13219             }
13220             res = PyDict_SetItem(new, key, value);
13221             Py_DECREF(key);
13222             Py_DECREF(value);
13223             if (res < 0)
13224                 goto err;
13225         }
13226         /* create entries for deleting chars in z */
13227         if (z != NULL) {
13228             z_kind = PyUnicode_KIND(z);
13229             z_data = PyUnicode_DATA(z);
13230             for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13231                 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13232                 if (!key)
13233                     goto err;
13234                 res = PyDict_SetItem(new, key, Py_None);
13235                 Py_DECREF(key);
13236                 if (res < 0)
13237                     goto err;
13238             }
13239         }
13240     } else {
13241         int kind;
13242         void *data;
13243 
13244         /* x must be a dict */
13245         if (!PyDict_CheckExact(x)) {
13246             PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13247                             "to maketrans it must be a dict");
13248             goto err;
13249         }
13250         /* copy entries into the new dict, converting string keys to int keys */
13251         while (PyDict_Next(x, &i, &key, &value)) {
13252             if (PyUnicode_Check(key)) {
13253                 /* convert string keys to integer keys */
13254                 PyObject *newkey;
13255                 if (PyUnicode_GET_LENGTH(key) != 1) {
13256                     PyErr_SetString(PyExc_ValueError, "string keys in translate "
13257                                     "table must be of length 1");
13258                     goto err;
13259                 }
13260                 kind = PyUnicode_KIND(key);
13261                 data = PyUnicode_DATA(key);
13262                 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13263                 if (!newkey)
13264                     goto err;
13265                 res = PyDict_SetItem(new, newkey, value);
13266                 Py_DECREF(newkey);
13267                 if (res < 0)
13268                     goto err;
13269             } else if (PyLong_Check(key)) {
13270                 /* just keep integer keys */
13271                 if (PyDict_SetItem(new, key, value) < 0)
13272                     goto err;
13273             } else {
13274                 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13275                                 "be strings or integers");
13276                 goto err;
13277             }
13278         }
13279     }
13280     return new;
13281   err:
13282     Py_DECREF(new);
13283     return NULL;
13284 }
13285 
13286 /*[clinic input]
13287 str.translate as unicode_translate
13288 
13289     table: object
13290         Translation table, which must be a mapping of Unicode ordinals to
13291         Unicode ordinals, strings, or None.
13292     /
13293 
13294 Replace each character in the string using the given translation table.
13295 
13296 The table must implement lookup/indexing via __getitem__, for instance a
13297 dictionary or list.  If this operation raises LookupError, the character is
13298 left untouched.  Characters mapped to None are deleted.
13299 [clinic start generated code]*/
13300 
13301 static PyObject *
unicode_translate(PyObject * self,PyObject * table)13302 unicode_translate(PyObject *self, PyObject *table)
13303 /*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
13304 {
13305     return _PyUnicode_TranslateCharmap(self, table, "ignore");
13306 }
13307 
13308 /*[clinic input]
13309 str.upper as unicode_upper
13310 
13311 Return a copy of the string converted to uppercase.
13312 [clinic start generated code]*/
13313 
13314 static PyObject *
unicode_upper_impl(PyObject * self)13315 unicode_upper_impl(PyObject *self)
13316 /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
13317 {
13318     if (PyUnicode_READY(self) == -1)
13319         return NULL;
13320     if (PyUnicode_IS_ASCII(self))
13321         return ascii_upper_or_lower(self, 0);
13322     return case_operation(self, do_upper);
13323 }
13324 
13325 /*[clinic input]
13326 str.zfill as unicode_zfill
13327 
13328     width: Py_ssize_t
13329     /
13330 
13331 Pad a numeric string with zeros on the left, to fill a field of the given width.
13332 
13333 The string is never truncated.
13334 [clinic start generated code]*/
13335 
13336 static PyObject *
unicode_zfill_impl(PyObject * self,Py_ssize_t width)13337 unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13338 /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
13339 {
13340     Py_ssize_t fill;
13341     PyObject *u;
13342     int kind;
13343     void *data;
13344     Py_UCS4 chr;
13345 
13346     if (PyUnicode_READY(self) == -1)
13347         return NULL;
13348 
13349     if (PyUnicode_GET_LENGTH(self) >= width)
13350         return unicode_result_unchanged(self);
13351 
13352     fill = width - PyUnicode_GET_LENGTH(self);
13353 
13354     u = pad(self, fill, 0, '0');
13355 
13356     if (u == NULL)
13357         return NULL;
13358 
13359     kind = PyUnicode_KIND(u);
13360     data = PyUnicode_DATA(u);
13361     chr = PyUnicode_READ(kind, data, fill);
13362 
13363     if (chr == '+' || chr == '-') {
13364         /* move sign to beginning of string */
13365         PyUnicode_WRITE(kind, data, 0, chr);
13366         PyUnicode_WRITE(kind, data, fill, '0');
13367     }
13368 
13369     assert(_PyUnicode_CheckConsistency(u, 1));
13370     return u;
13371 }
13372 
/* Compiled out by this #if 0.  When enabled, forwards `self` to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result.
   The matching method-table entry ("_decimal2ascii") is likewise under
   #if 0 and is described there as debugging-only. */
#if 0
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
13380 
13381 PyDoc_STRVAR(startswith__doc__,
13382              "S.startswith(prefix[, start[, end]]) -> bool\n\
13383 \n\
13384 Return True if S starts with the specified prefix, False otherwise.\n\
13385 With optional start, test S beginning at that position.\n\
13386 With optional end, stop comparing S at that position.\n\
13387 prefix can also be a tuple of strings to try.");
13388 
13389 static PyObject *
unicode_startswith(PyObject * self,PyObject * args)13390 unicode_startswith(PyObject *self,
13391                    PyObject *args)
13392 {
13393     PyObject *subobj;
13394     PyObject *substring;
13395     Py_ssize_t start = 0;
13396     Py_ssize_t end = PY_SSIZE_T_MAX;
13397     int result;
13398 
13399     if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13400         return NULL;
13401     if (PyTuple_Check(subobj)) {
13402         Py_ssize_t i;
13403         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13404             substring = PyTuple_GET_ITEM(subobj, i);
13405             if (!PyUnicode_Check(substring)) {
13406                 PyErr_Format(PyExc_TypeError,
13407                              "tuple for startswith must only contain str, "
13408                              "not %.100s",
13409                              Py_TYPE(substring)->tp_name);
13410                 return NULL;
13411             }
13412             result = tailmatch(self, substring, start, end, -1);
13413             if (result == -1)
13414                 return NULL;
13415             if (result) {
13416                 Py_RETURN_TRUE;
13417             }
13418         }
13419         /* nothing matched */
13420         Py_RETURN_FALSE;
13421     }
13422     if (!PyUnicode_Check(subobj)) {
13423         PyErr_Format(PyExc_TypeError,
13424                      "startswith first arg must be str or "
13425                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13426         return NULL;
13427     }
13428     result = tailmatch(self, subobj, start, end, -1);
13429     if (result == -1)
13430         return NULL;
13431     return PyBool_FromLong(result);
13432 }
13433 
13434 
13435 PyDoc_STRVAR(endswith__doc__,
13436              "S.endswith(suffix[, start[, end]]) -> bool\n\
13437 \n\
13438 Return True if S ends with the specified suffix, False otherwise.\n\
13439 With optional start, test S beginning at that position.\n\
13440 With optional end, stop comparing S at that position.\n\
13441 suffix can also be a tuple of strings to try.");
13442 
13443 static PyObject *
unicode_endswith(PyObject * self,PyObject * args)13444 unicode_endswith(PyObject *self,
13445                  PyObject *args)
13446 {
13447     PyObject *subobj;
13448     PyObject *substring;
13449     Py_ssize_t start = 0;
13450     Py_ssize_t end = PY_SSIZE_T_MAX;
13451     int result;
13452 
13453     if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13454         return NULL;
13455     if (PyTuple_Check(subobj)) {
13456         Py_ssize_t i;
13457         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13458             substring = PyTuple_GET_ITEM(subobj, i);
13459             if (!PyUnicode_Check(substring)) {
13460                 PyErr_Format(PyExc_TypeError,
13461                              "tuple for endswith must only contain str, "
13462                              "not %.100s",
13463                              Py_TYPE(substring)->tp_name);
13464                 return NULL;
13465             }
13466             result = tailmatch(self, substring, start, end, +1);
13467             if (result == -1)
13468                 return NULL;
13469             if (result) {
13470                 Py_RETURN_TRUE;
13471             }
13472         }
13473         Py_RETURN_FALSE;
13474     }
13475     if (!PyUnicode_Check(subobj)) {
13476         PyErr_Format(PyExc_TypeError,
13477                      "endswith first arg must be str or "
13478                      "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13479         return NULL;
13480     }
13481     result = tailmatch(self, subobj, start, end, +1);
13482     if (result == -1)
13483         return NULL;
13484     return PyBool_FromLong(result);
13485 }
13486 
13487 static inline void
_PyUnicodeWriter_Update(_PyUnicodeWriter * writer)13488 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13489 {
13490     writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13491     writer->data = PyUnicode_DATA(writer->buffer);
13492 
13493     if (!writer->readonly) {
13494         writer->kind = PyUnicode_KIND(writer->buffer);
13495         writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13496     }
13497     else {
13498         /* use a value smaller than PyUnicode_1BYTE_KIND() so
13499            _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13500         writer->kind = PyUnicode_WCHAR_KIND;
13501         assert(writer->kind <= PyUnicode_1BYTE_KIND);
13502 
13503         /* Copy-on-write mode: set buffer size to 0 so
13504          * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13505          * next write. */
13506         writer->size = 0;
13507     }
13508 }
13509 
13510 void
_PyUnicodeWriter_Init(_PyUnicodeWriter * writer)13511 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13512 {
13513     memset(writer, 0, sizeof(*writer));
13514 
13515     /* ASCII is the bare minimum */
13516     writer->min_char = 127;
13517 
13518     /* use a value smaller than PyUnicode_1BYTE_KIND() so
13519        _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13520     writer->kind = PyUnicode_WCHAR_KIND;
13521     assert(writer->kind <= PyUnicode_1BYTE_KIND);
13522 }
13523 
13524 int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter * writer,Py_ssize_t length,Py_UCS4 maxchar)13525 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13526                                  Py_ssize_t length, Py_UCS4 maxchar)
13527 {
13528     Py_ssize_t newlen;
13529     PyObject *newbuffer;
13530 
13531     assert(maxchar <= MAX_UNICODE);
13532 
13533     /* ensure that the _PyUnicodeWriter_Prepare macro was used */
13534     assert((maxchar > writer->maxchar && length >= 0)
13535            || length > 0);
13536 
13537     if (length > PY_SSIZE_T_MAX - writer->pos) {
13538         PyErr_NoMemory();
13539         return -1;
13540     }
13541     newlen = writer->pos + length;
13542 
13543     maxchar = Py_MAX(maxchar, writer->min_char);
13544 
13545     if (writer->buffer == NULL) {
13546         assert(!writer->readonly);
13547         if (writer->overallocate
13548             && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13549             /* overallocate to limit the number of realloc() */
13550             newlen += newlen / OVERALLOCATE_FACTOR;
13551         }
13552         if (newlen < writer->min_length)
13553             newlen = writer->min_length;
13554 
13555         writer->buffer = PyUnicode_New(newlen, maxchar);
13556         if (writer->buffer == NULL)
13557             return -1;
13558     }
13559     else if (newlen > writer->size) {
13560         if (writer->overallocate
13561             && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13562             /* overallocate to limit the number of realloc() */
13563             newlen += newlen / OVERALLOCATE_FACTOR;
13564         }
13565         if (newlen < writer->min_length)
13566             newlen = writer->min_length;
13567 
13568         if (maxchar > writer->maxchar || writer->readonly) {
13569             /* resize + widen */
13570             maxchar = Py_MAX(maxchar, writer->maxchar);
13571             newbuffer = PyUnicode_New(newlen, maxchar);
13572             if (newbuffer == NULL)
13573                 return -1;
13574             _PyUnicode_FastCopyCharacters(newbuffer, 0,
13575                                           writer->buffer, 0, writer->pos);
13576             Py_DECREF(writer->buffer);
13577             writer->readonly = 0;
13578         }
13579         else {
13580             newbuffer = resize_compact(writer->buffer, newlen);
13581             if (newbuffer == NULL)
13582                 return -1;
13583         }
13584         writer->buffer = newbuffer;
13585     }
13586     else if (maxchar > writer->maxchar) {
13587         assert(!writer->readonly);
13588         newbuffer = PyUnicode_New(writer->size, maxchar);
13589         if (newbuffer == NULL)
13590             return -1;
13591         _PyUnicode_FastCopyCharacters(newbuffer, 0,
13592                                       writer->buffer, 0, writer->pos);
13593         Py_SETREF(writer->buffer, newbuffer);
13594     }
13595     _PyUnicodeWriter_Update(writer);
13596     return 0;
13597 
13598 #undef OVERALLOCATE_FACTOR
13599 }
13600 
13601 int
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter * writer,enum PyUnicode_Kind kind)13602 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13603                                      enum PyUnicode_Kind kind)
13604 {
13605     Py_UCS4 maxchar;
13606 
13607     /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13608     assert(writer->kind < kind);
13609 
13610     switch (kind)
13611     {
13612     case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13613     case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13614     case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
13615     default:
13616         Py_UNREACHABLE();
13617     }
13618 
13619     return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13620 }
13621 
13622 static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter * writer,Py_UCS4 ch)13623 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13624 {
13625     assert(ch <= MAX_UNICODE);
13626     if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13627         return -1;
13628     PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13629     writer->pos++;
13630     return 0;
13631 }
13632 
13633 int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter * writer,Py_UCS4 ch)13634 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13635 {
13636     return _PyUnicodeWriter_WriteCharInline(writer, ch);
13637 }
13638 
13639 int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter * writer,PyObject * str)13640 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13641 {
13642     Py_UCS4 maxchar;
13643     Py_ssize_t len;
13644 
13645     if (PyUnicode_READY(str) == -1)
13646         return -1;
13647     len = PyUnicode_GET_LENGTH(str);
13648     if (len == 0)
13649         return 0;
13650     maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13651     if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13652         if (writer->buffer == NULL && !writer->overallocate) {
13653             assert(_PyUnicode_CheckConsistency(str, 1));
13654             writer->readonly = 1;
13655             Py_INCREF(str);
13656             writer->buffer = str;
13657             _PyUnicodeWriter_Update(writer);
13658             writer->pos += len;
13659             return 0;
13660         }
13661         if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13662             return -1;
13663     }
13664     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13665                                   str, 0, len);
13666     writer->pos += len;
13667     return 0;
13668 }
13669 
13670 int
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t start,Py_ssize_t end)13671 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13672                                 Py_ssize_t start, Py_ssize_t end)
13673 {
13674     Py_UCS4 maxchar;
13675     Py_ssize_t len;
13676 
13677     if (PyUnicode_READY(str) == -1)
13678         return -1;
13679 
13680     assert(0 <= start);
13681     assert(end <= PyUnicode_GET_LENGTH(str));
13682     assert(start <= end);
13683 
13684     if (end == 0)
13685         return 0;
13686 
13687     if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13688         return _PyUnicodeWriter_WriteStr(writer, str);
13689 
13690     if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13691         maxchar = _PyUnicode_FindMaxChar(str, start, end);
13692     else
13693         maxchar = writer->maxchar;
13694     len = end - start;
13695 
13696     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13697         return -1;
13698 
13699     _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13700                                   str, start, len);
13701     writer->pos += len;
13702     return 0;
13703 }
13704 
13705 int
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter * writer,const char * ascii,Py_ssize_t len)13706 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13707                                   const char *ascii, Py_ssize_t len)
13708 {
13709     if (len == -1)
13710         len = strlen(ascii);
13711 
13712     assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
13713 
13714     if (writer->buffer == NULL && !writer->overallocate) {
13715         PyObject *str;
13716 
13717         str = _PyUnicode_FromASCII(ascii, len);
13718         if (str == NULL)
13719             return -1;
13720 
13721         writer->readonly = 1;
13722         writer->buffer = str;
13723         _PyUnicodeWriter_Update(writer);
13724         writer->pos += len;
13725         return 0;
13726     }
13727 
13728     if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13729         return -1;
13730 
13731     switch (writer->kind)
13732     {
13733     case PyUnicode_1BYTE_KIND:
13734     {
13735         const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13736         Py_UCS1 *data = writer->data;
13737 
13738         memcpy(data + writer->pos, str, len);
13739         break;
13740     }
13741     case PyUnicode_2BYTE_KIND:
13742     {
13743         _PyUnicode_CONVERT_BYTES(
13744             Py_UCS1, Py_UCS2,
13745             ascii, ascii + len,
13746             (Py_UCS2 *)writer->data + writer->pos);
13747         break;
13748     }
13749     case PyUnicode_4BYTE_KIND:
13750     {
13751         _PyUnicode_CONVERT_BYTES(
13752             Py_UCS1, Py_UCS4,
13753             ascii, ascii + len,
13754             (Py_UCS4 *)writer->data + writer->pos);
13755         break;
13756     }
13757     default:
13758         Py_UNREACHABLE();
13759     }
13760 
13761     writer->pos += len;
13762     return 0;
13763 }
13764 
13765 int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter * writer,const char * str,Py_ssize_t len)13766 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13767                                    const char *str, Py_ssize_t len)
13768 {
13769     Py_UCS4 maxchar;
13770 
13771     maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
13772     if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13773         return -1;
13774     unicode_write_cstr(writer->buffer, writer->pos, str, len);
13775     writer->pos += len;
13776     return 0;
13777 }
13778 
13779 PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter * writer)13780 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
13781 {
13782     PyObject *str;
13783 
13784     if (writer->pos == 0) {
13785         Py_CLEAR(writer->buffer);
13786         _Py_RETURN_UNICODE_EMPTY();
13787     }
13788 
13789     str = writer->buffer;
13790     writer->buffer = NULL;
13791 
13792     if (writer->readonly) {
13793         assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13794         return str;
13795     }
13796 
13797     if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13798         PyObject *str2;
13799         str2 = resize_compact(str, writer->pos);
13800         if (str2 == NULL) {
13801             Py_DECREF(str);
13802             return NULL;
13803         }
13804         str = str2;
13805     }
13806 
13807     assert(_PyUnicode_CheckConsistency(str, 1));
13808     return unicode_result_ready(str);
13809 }
13810 
13811 void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter * writer)13812 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
13813 {
13814     Py_CLEAR(writer->buffer);
13815 }
13816 
13817 #include "stringlib/unicode_format.h"
13818 
/* Docstrings for the "format" and "format_map" entries of the method
   table, where they are paired with do_string_format and
   do_string_format_map.
   NOTE(review): those two functions are presumably provided by the
   stringlib/unicode_format.h include -- confirm. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
13830 
13831 /*[clinic input]
13832 str.__format__ as unicode___format__
13833 
13834     format_spec: unicode
13835     /
13836 
13837 Return a formatted version of the string as described by format_spec.
13838 [clinic start generated code]*/
13839 
13840 static PyObject *
unicode___format___impl(PyObject * self,PyObject * format_spec)13841 unicode___format___impl(PyObject *self, PyObject *format_spec)
13842 /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
13843 {
13844     _PyUnicodeWriter writer;
13845     int ret;
13846 
13847     if (PyUnicode_READY(self) == -1)
13848         return NULL;
13849     _PyUnicodeWriter_Init(&writer);
13850     ret = _PyUnicode_FormatAdvancedWriter(&writer,
13851                                           self, format_spec, 0,
13852                                           PyUnicode_GET_LENGTH(format_spec));
13853     if (ret == -1) {
13854         _PyUnicodeWriter_Dealloc(&writer);
13855         return NULL;
13856     }
13857     return _PyUnicodeWriter_Finish(&writer);
13858 }
13859 
13860 /*[clinic input]
13861 str.__sizeof__ as unicode_sizeof
13862 
13863 Return the size of the string in memory, in bytes.
13864 [clinic start generated code]*/
13865 
13866 static PyObject *
unicode_sizeof_impl(PyObject * self)13867 unicode_sizeof_impl(PyObject *self)
13868 /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
13869 {
13870     Py_ssize_t size;
13871 
13872     /* If it's a compact object, account for base structure +
13873        character data. */
13874     if (PyUnicode_IS_COMPACT_ASCII(self))
13875         size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13876     else if (PyUnicode_IS_COMPACT(self))
13877         size = sizeof(PyCompactUnicodeObject) +
13878             (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
13879     else {
13880         /* If it is a two-block object, account for base object, and
13881            for character block if present. */
13882         size = sizeof(PyUnicodeObject);
13883         if (_PyUnicode_DATA_ANY(self))
13884             size += (PyUnicode_GET_LENGTH(self) + 1) *
13885                 PyUnicode_KIND(self);
13886     }
13887     /* If the wstr pointer is present, account for it unless it is shared
13888        with the data pointer. Check if the data is not shared. */
13889     if (_PyUnicode_HAS_WSTR_MEMORY(self))
13890         size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13891     if (_PyUnicode_HAS_UTF8_MEMORY(self))
13892         size += PyUnicode_UTF8_LENGTH(self) + 1;
13893 
13894     return PyLong_FromSsize_t(size);
13895 }
13896 
13897 static PyObject *
unicode_getnewargs(PyObject * v,PyObject * Py_UNUSED (ignored))13898 unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
13899 {
13900     PyObject *copy = _PyUnicode_Copy(v);
13901     if (!copy)
13902         return NULL;
13903     return Py_BuildValue("(N)", copy);
13904 }
13905 
/* Method table for the string type.
 *
 * Entries come in two forms: UNICODE_*_METHODDEF macros (presumably
 * supplied by the Argument Clinic generated header for the
 * `str.<name> as unicode_<name>` declarations in this file -- confirm), and
 * hand-written {name, function, flags, doc} rows for the methods that
 * parse their own argument tuple.  The table ends with a {NULL, NULL}
 * sentinel.
 */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* the only entry taking keywords; the intermediate void(*)(void) cast
       is applied before the conversion to PyCFunction */
    {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* no docstring is given for this entry */
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
13962 
13963 static PyObject *
unicode_mod(PyObject * v,PyObject * w)13964 unicode_mod(PyObject *v, PyObject *w)
13965 {
13966     if (!PyUnicode_Check(v))
13967         Py_RETURN_NOTIMPLEMENTED;
13968     return PyUnicode_Format(v, w);
13969 }
13970 
13971 static PyNumberMethods unicode_as_number = {
13972     0,              /*nb_add*/
13973     0,              /*nb_subtract*/
13974     0,              /*nb_multiply*/
13975     unicode_mod,            /*nb_remainder*/
13976 };
13977 
13978 static PySequenceMethods unicode_as_sequence = {
13979     (lenfunc) unicode_length,       /* sq_length */
13980     PyUnicode_Concat,           /* sq_concat */
13981     (ssizeargfunc) unicode_repeat,  /* sq_repeat */
13982     (ssizeargfunc) unicode_getitem,     /* sq_item */
13983     0,                  /* sq_slice */
13984     0,                  /* sq_ass_item */
13985     0,                  /* sq_ass_slice */
13986     PyUnicode_Contains,         /* sq_contains */
13987 };
13988 
13989 static PyObject*
unicode_subscript(PyObject * self,PyObject * item)13990 unicode_subscript(PyObject* self, PyObject* item)
13991 {
13992     if (PyUnicode_READY(self) == -1)
13993         return NULL;
13994 
13995     if (PyIndex_Check(item)) {
13996         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13997         if (i == -1 && PyErr_Occurred())
13998             return NULL;
13999         if (i < 0)
14000             i += PyUnicode_GET_LENGTH(self);
14001         return unicode_getitem(self, i);
14002     } else if (PySlice_Check(item)) {
14003         Py_ssize_t start, stop, step, slicelength, i;
14004         size_t cur;
14005         PyObject *result;
14006         void *src_data, *dest_data;
14007         int src_kind, dest_kind;
14008         Py_UCS4 ch, max_char, kind_limit;
14009 
14010         if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
14011             return NULL;
14012         }
14013         slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14014                                             &start, &stop, step);
14015 
14016         if (slicelength <= 0) {
14017             _Py_RETURN_UNICODE_EMPTY();
14018         } else if (start == 0 && step == 1 &&
14019                    slicelength == PyUnicode_GET_LENGTH(self)) {
14020             return unicode_result_unchanged(self);
14021         } else if (step == 1) {
14022             return PyUnicode_Substring(self,
14023                                        start, start + slicelength);
14024         }
14025         /* General case */
14026         src_kind = PyUnicode_KIND(self);
14027         src_data = PyUnicode_DATA(self);
14028         if (!PyUnicode_IS_ASCII(self)) {
14029             kind_limit = kind_maxchar_limit(src_kind);
14030             max_char = 0;
14031             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14032                 ch = PyUnicode_READ(src_kind, src_data, cur);
14033                 if (ch > max_char) {
14034                     max_char = ch;
14035                     if (max_char >= kind_limit)
14036                         break;
14037                 }
14038             }
14039         }
14040         else
14041             max_char = 127;
14042         result = PyUnicode_New(slicelength, max_char);
14043         if (result == NULL)
14044             return NULL;
14045         dest_kind = PyUnicode_KIND(result);
14046         dest_data = PyUnicode_DATA(result);
14047 
14048         for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14049             Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14050             PyUnicode_WRITE(dest_kind, dest_data, i, ch);
14051         }
14052         assert(_PyUnicode_CheckConsistency(result, 1));
14053         return result;
14054     } else {
14055         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14056         return NULL;
14057     }
14058 }
14059 
14060 static PyMappingMethods unicode_as_mapping = {
14061     (lenfunc)unicode_length,        /* mp_length */
14062     (binaryfunc)unicode_subscript,  /* mp_subscript */
14063     (objobjargproc)0,           /* mp_ass_subscript */
14064 };
14065 
14066 
14067 /* Helpers for PyUnicode_Format() */
14068 
14069 struct unicode_formatter_t {
14070     PyObject *args;
14071     int args_owned;
14072     Py_ssize_t arglen, argidx;
14073     PyObject *dict;
14074 
14075     enum PyUnicode_Kind fmtkind;
14076     Py_ssize_t fmtcnt, fmtpos;
14077     void *fmtdata;
14078     PyObject *fmtstr;
14079 
14080     _PyUnicodeWriter writer;
14081 };
14082 
14083 struct unicode_format_arg_t {
14084     Py_UCS4 ch;
14085     int flags;
14086     Py_ssize_t width;
14087     int prec;
14088     int sign;
14089 };
14090 
14091 static PyObject *
unicode_format_getnextarg(struct unicode_formatter_t * ctx)14092 unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14093 {
14094     Py_ssize_t argidx = ctx->argidx;
14095 
14096     if (argidx < ctx->arglen) {
14097         ctx->argidx++;
14098         if (ctx->arglen < 0)
14099             return ctx->args;
14100         else
14101             return PyTuple_GetItem(ctx->args, argidx);
14102     }
14103     PyErr_SetString(PyExc_TypeError,
14104                     "not enough arguments for format string");
14105     return NULL;
14106 }
14107 
14108 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
14109 
14110 /* Format a float into the writer if the writer is not NULL, or into *p_output
14111    otherwise.
14112 
14113    Return 0 on success, raise an exception and return -1 on error. */
14114 static int
formatfloat(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14115 formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14116             PyObject **p_output,
14117             _PyUnicodeWriter *writer)
14118 {
14119     char *p;
14120     double x;
14121     Py_ssize_t len;
14122     int prec;
14123     int dtoa_flags;
14124 
14125     x = PyFloat_AsDouble(v);
14126     if (x == -1.0 && PyErr_Occurred())
14127         return -1;
14128 
14129     prec = arg->prec;
14130     if (prec < 0)
14131         prec = 6;
14132 
14133     if (arg->flags & F_ALT)
14134         dtoa_flags = Py_DTSF_ALT;
14135     else
14136         dtoa_flags = 0;
14137     p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14138     if (p == NULL)
14139         return -1;
14140     len = strlen(p);
14141     if (writer) {
14142         if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14143             PyMem_Free(p);
14144             return -1;
14145         }
14146     }
14147     else
14148         *p_output = _PyUnicode_FromASCII(p, len);
14149     PyMem_Free(p);
14150     return 0;
14151 }
14152 
14153 /* formatlong() emulates the format codes d, u, o, x and X, and
14154  * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
14155  * Python's regular ints.
14156  * Return value:  a new PyUnicodeObject*, or NULL if error.
14157  *     The output string is of the form
14158  *         "-"? ("0x" | "0X")? digit+
14159  *     "0x"/"0X" are present only for x and X conversions, with F_ALT
14160  *         set in flags.  The case of hex digits will be correct,
14161  *     There will be at least prec digits, zero-filled on the left if
14162  *         necessary to get that many.
14163  * val          object to be converted
14164  * flags        bitmask of format flags; only F_ALT is looked at
14165  * prec         minimum number of digits; 0-fill on left if needed
14166  * type         a character in [duoxX]; u acts the same as d
14167  *
14168  * CAUTION:  o, x and X conversions on regular ints can never
14169  * produce a '-' sign, but can for Python's unbounded ints.
14170  */
14171 PyObject *
_PyUnicode_FormatLong(PyObject * val,int alt,int prec,int type)14172 _PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
14173 {
14174     PyObject *result = NULL;
14175     char *buf;
14176     Py_ssize_t i;
14177     int sign;           /* 1 if '-', else 0 */
14178     int len;            /* number of characters */
14179     Py_ssize_t llen;
14180     int numdigits;      /* len == numnondigits + numdigits */
14181     int numnondigits = 0;
14182 
14183     /* Avoid exceeding SSIZE_T_MAX */
14184     if (prec > INT_MAX-3) {
14185         PyErr_SetString(PyExc_OverflowError,
14186                         "precision too large");
14187         return NULL;
14188     }
14189 
14190     assert(PyLong_Check(val));
14191 
14192     switch (type) {
14193     default:
14194         Py_UNREACHABLE();
14195     case 'd':
14196     case 'i':
14197     case 'u':
14198         /* int and int subclasses should print numerically when a numeric */
14199         /* format code is used (see issue18780) */
14200         result = PyNumber_ToBase(val, 10);
14201         break;
14202     case 'o':
14203         numnondigits = 2;
14204         result = PyNumber_ToBase(val, 8);
14205         break;
14206     case 'x':
14207     case 'X':
14208         numnondigits = 2;
14209         result = PyNumber_ToBase(val, 16);
14210         break;
14211     }
14212     if (!result)
14213         return NULL;
14214 
14215     assert(unicode_modifiable(result));
14216     assert(PyUnicode_IS_READY(result));
14217     assert(PyUnicode_IS_ASCII(result));
14218 
14219     /* To modify the string in-place, there can only be one reference. */
14220     if (Py_REFCNT(result) != 1) {
14221         Py_DECREF(result);
14222         PyErr_BadInternalCall();
14223         return NULL;
14224     }
14225     buf = PyUnicode_DATA(result);
14226     llen = PyUnicode_GET_LENGTH(result);
14227     if (llen > INT_MAX) {
14228         Py_DECREF(result);
14229         PyErr_SetString(PyExc_ValueError,
14230                         "string too large in _PyUnicode_FormatLong");
14231         return NULL;
14232     }
14233     len = (int)llen;
14234     sign = buf[0] == '-';
14235     numnondigits += sign;
14236     numdigits = len - numnondigits;
14237     assert(numdigits > 0);
14238 
14239     /* Get rid of base marker unless F_ALT */
14240     if (((alt) == 0 &&
14241         (type == 'o' || type == 'x' || type == 'X'))) {
14242         assert(buf[sign] == '0');
14243         assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14244                buf[sign+1] == 'o');
14245         numnondigits -= 2;
14246         buf += 2;
14247         len -= 2;
14248         if (sign)
14249             buf[0] = '-';
14250         assert(len == numnondigits + numdigits);
14251         assert(numdigits > 0);
14252     }
14253 
14254     /* Fill with leading zeroes to meet minimum width. */
14255     if (prec > numdigits) {
14256         PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14257                                 numnondigits + prec);
14258         char *b1;
14259         if (!r1) {
14260             Py_DECREF(result);
14261             return NULL;
14262         }
14263         b1 = PyBytes_AS_STRING(r1);
14264         for (i = 0; i < numnondigits; ++i)
14265             *b1++ = *buf++;
14266         for (i = 0; i < prec - numdigits; i++)
14267             *b1++ = '0';
14268         for (i = 0; i < numdigits; i++)
14269             *b1++ = *buf++;
14270         *b1 = '\0';
14271         Py_DECREF(result);
14272         result = r1;
14273         buf = PyBytes_AS_STRING(result);
14274         len = numnondigits + prec;
14275     }
14276 
14277     /* Fix up case for hex conversions. */
14278     if (type == 'X') {
14279         /* Need to convert all lower case letters to upper case.
14280            and need to convert 0x to 0X (and -0x to -0X). */
14281         for (i = 0; i < len; i++)
14282             if (buf[i] >= 'a' && buf[i] <= 'x')
14283                 buf[i] -= 'a'-'A';
14284     }
14285     if (!PyUnicode_Check(result)
14286         || buf != PyUnicode_DATA(result)) {
14287         PyObject *unicode;
14288         unicode = _PyUnicode_FromASCII(buf, len);
14289         Py_DECREF(result);
14290         result = unicode;
14291     }
14292     else if (len != PyUnicode_GET_LENGTH(result)) {
14293         if (PyUnicode_Resize(&result, len) < 0)
14294             Py_CLEAR(result);
14295     }
14296     return result;
14297 }
14298 
14299 /* Format an integer or a float as an integer.
14300  * Return 1 if the number has been formatted into the writer,
14301  *        0 if the number has been formatted into *p_output
14302  *       -1 and raise an exception on error */
14303 static int
mainformatlong(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14304 mainformatlong(PyObject *v,
14305                struct unicode_format_arg_t *arg,
14306                PyObject **p_output,
14307                _PyUnicodeWriter *writer)
14308 {
14309     PyObject *iobj, *res;
14310     char type = (char)arg->ch;
14311 
14312     if (!PyNumber_Check(v))
14313         goto wrongtype;
14314 
14315     /* make sure number is a type of integer for o, x, and X */
14316     if (!PyLong_Check(v)) {
14317         if (type == 'o' || type == 'x' || type == 'X') {
14318             iobj = PyNumber_Index(v);
14319             if (iobj == NULL) {
14320                 if (PyErr_ExceptionMatches(PyExc_TypeError))
14321                     goto wrongtype;
14322                 return -1;
14323             }
14324         }
14325         else {
14326             iobj = PyNumber_Long(v);
14327             if (iobj == NULL ) {
14328                 if (PyErr_ExceptionMatches(PyExc_TypeError))
14329                     goto wrongtype;
14330                 return -1;
14331             }
14332         }
14333         assert(PyLong_Check(iobj));
14334     }
14335     else {
14336         iobj = v;
14337         Py_INCREF(iobj);
14338     }
14339 
14340     if (PyLong_CheckExact(v)
14341         && arg->width == -1 && arg->prec == -1
14342         && !(arg->flags & (F_SIGN | F_BLANK))
14343         && type != 'X')
14344     {
14345         /* Fast path */
14346         int alternate = arg->flags & F_ALT;
14347         int base;
14348 
14349         switch(type)
14350         {
14351             default:
14352                 Py_UNREACHABLE();
14353             case 'd':
14354             case 'i':
14355             case 'u':
14356                 base = 10;
14357                 break;
14358             case 'o':
14359                 base = 8;
14360                 break;
14361             case 'x':
14362             case 'X':
14363                 base = 16;
14364                 break;
14365         }
14366 
14367         if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14368             Py_DECREF(iobj);
14369             return -1;
14370         }
14371         Py_DECREF(iobj);
14372         return 1;
14373     }
14374 
14375     res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14376     Py_DECREF(iobj);
14377     if (res == NULL)
14378         return -1;
14379     *p_output = res;
14380     return 0;
14381 
14382 wrongtype:
14383     switch(type)
14384     {
14385         case 'o':
14386         case 'x':
14387         case 'X':
14388             PyErr_Format(PyExc_TypeError,
14389                     "%%%c format: an integer is required, "
14390                     "not %.200s",
14391                     type, Py_TYPE(v)->tp_name);
14392             break;
14393         default:
14394             PyErr_Format(PyExc_TypeError,
14395                     "%%%c format: a number is required, "
14396                     "not %.200s",
14397                     type, Py_TYPE(v)->tp_name);
14398             break;
14399     }
14400     return -1;
14401 }
14402 
14403 static Py_UCS4
formatchar(PyObject * v)14404 formatchar(PyObject *v)
14405 {
14406     /* presume that the buffer is at least 3 characters long */
14407     if (PyUnicode_Check(v)) {
14408         if (PyUnicode_GET_LENGTH(v) == 1) {
14409             return PyUnicode_READ_CHAR(v, 0);
14410         }
14411         goto onError;
14412     }
14413     else {
14414         PyObject *iobj;
14415         long x;
14416         /* make sure number is a type of integer */
14417         if (!PyLong_Check(v)) {
14418             iobj = PyNumber_Index(v);
14419             if (iobj == NULL) {
14420                 goto onError;
14421             }
14422             x = PyLong_AsLong(iobj);
14423             Py_DECREF(iobj);
14424         }
14425         else {
14426             x = PyLong_AsLong(v);
14427         }
14428         if (x == -1 && PyErr_Occurred())
14429             goto onError;
14430 
14431         if (x < 0 || x > MAX_UNICODE) {
14432             PyErr_SetString(PyExc_OverflowError,
14433                             "%c arg not in range(0x110000)");
14434             return (Py_UCS4) -1;
14435         }
14436 
14437         return (Py_UCS4) x;
14438     }
14439 
14440   onError:
14441     PyErr_SetString(PyExc_TypeError,
14442                     "%c requires int or char");
14443     return (Py_UCS4) -1;
14444 }
14445 
14446 /* Parse options of an argument: flags, width, precision.
14447    Handle also "%(name)" syntax.
14448 
14449    Return 0 if the argument has been formatted into arg->str.
14450    Return 1 if the argument has been written into ctx->writer,
14451    Raise an exception and return -1 on error. */
14452 static int
unicode_format_arg_parse(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg)14453 unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14454                          struct unicode_format_arg_t *arg)
14455 {
14456 #define FORMAT_READ(ctx) \
14457         PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14458 
14459     PyObject *v;
14460 
14461     if (arg->ch == '(') {
14462         /* Get argument value from a dictionary. Example: "%(name)s". */
14463         Py_ssize_t keystart;
14464         Py_ssize_t keylen;
14465         PyObject *key;
14466         int pcount = 1;
14467 
14468         if (ctx->dict == NULL) {
14469             PyErr_SetString(PyExc_TypeError,
14470                             "format requires a mapping");
14471             return -1;
14472         }
14473         ++ctx->fmtpos;
14474         --ctx->fmtcnt;
14475         keystart = ctx->fmtpos;
14476         /* Skip over balanced parentheses */
14477         while (pcount > 0 && --ctx->fmtcnt >= 0) {
14478             arg->ch = FORMAT_READ(ctx);
14479             if (arg->ch == ')')
14480                 --pcount;
14481             else if (arg->ch == '(')
14482                 ++pcount;
14483             ctx->fmtpos++;
14484         }
14485         keylen = ctx->fmtpos - keystart - 1;
14486         if (ctx->fmtcnt < 0 || pcount > 0) {
14487             PyErr_SetString(PyExc_ValueError,
14488                             "incomplete format key");
14489             return -1;
14490         }
14491         key = PyUnicode_Substring(ctx->fmtstr,
14492                                   keystart, keystart + keylen);
14493         if (key == NULL)
14494             return -1;
14495         if (ctx->args_owned) {
14496             ctx->args_owned = 0;
14497             Py_DECREF(ctx->args);
14498         }
14499         ctx->args = PyObject_GetItem(ctx->dict, key);
14500         Py_DECREF(key);
14501         if (ctx->args == NULL)
14502             return -1;
14503         ctx->args_owned = 1;
14504         ctx->arglen = -1;
14505         ctx->argidx = -2;
14506     }
14507 
14508     /* Parse flags. Example: "%+i" => flags=F_SIGN. */
14509     while (--ctx->fmtcnt >= 0) {
14510         arg->ch = FORMAT_READ(ctx);
14511         ctx->fmtpos++;
14512         switch (arg->ch) {
14513         case '-': arg->flags |= F_LJUST; continue;
14514         case '+': arg->flags |= F_SIGN; continue;
14515         case ' ': arg->flags |= F_BLANK; continue;
14516         case '#': arg->flags |= F_ALT; continue;
14517         case '0': arg->flags |= F_ZERO; continue;
14518         }
14519         break;
14520     }
14521 
14522     /* Parse width. Example: "%10s" => width=10 */
14523     if (arg->ch == '*') {
14524         v = unicode_format_getnextarg(ctx);
14525         if (v == NULL)
14526             return -1;
14527         if (!PyLong_Check(v)) {
14528             PyErr_SetString(PyExc_TypeError,
14529                             "* wants int");
14530             return -1;
14531         }
14532         arg->width = PyLong_AsSsize_t(v);
14533         if (arg->width == -1 && PyErr_Occurred())
14534             return -1;
14535         if (arg->width < 0) {
14536             arg->flags |= F_LJUST;
14537             arg->width = -arg->width;
14538         }
14539         if (--ctx->fmtcnt >= 0) {
14540             arg->ch = FORMAT_READ(ctx);
14541             ctx->fmtpos++;
14542         }
14543     }
14544     else if (arg->ch >= '0' && arg->ch <= '9') {
14545         arg->width = arg->ch - '0';
14546         while (--ctx->fmtcnt >= 0) {
14547             arg->ch = FORMAT_READ(ctx);
14548             ctx->fmtpos++;
14549             if (arg->ch < '0' || arg->ch > '9')
14550                 break;
14551             /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14552                mixing signed and unsigned comparison. Since arg->ch is between
14553                '0' and '9', casting to int is safe. */
14554             if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14555                 PyErr_SetString(PyExc_ValueError,
14556                                 "width too big");
14557                 return -1;
14558             }
14559             arg->width = arg->width*10 + (arg->ch - '0');
14560         }
14561     }
14562 
14563     /* Parse precision. Example: "%.3f" => prec=3 */
14564     if (arg->ch == '.') {
14565         arg->prec = 0;
14566         if (--ctx->fmtcnt >= 0) {
14567             arg->ch = FORMAT_READ(ctx);
14568             ctx->fmtpos++;
14569         }
14570         if (arg->ch == '*') {
14571             v = unicode_format_getnextarg(ctx);
14572             if (v == NULL)
14573                 return -1;
14574             if (!PyLong_Check(v)) {
14575                 PyErr_SetString(PyExc_TypeError,
14576                                 "* wants int");
14577                 return -1;
14578             }
14579             arg->prec = _PyLong_AsInt(v);
14580             if (arg->prec == -1 && PyErr_Occurred())
14581                 return -1;
14582             if (arg->prec < 0)
14583                 arg->prec = 0;
14584             if (--ctx->fmtcnt >= 0) {
14585                 arg->ch = FORMAT_READ(ctx);
14586                 ctx->fmtpos++;
14587             }
14588         }
14589         else if (arg->ch >= '0' && arg->ch <= '9') {
14590             arg->prec = arg->ch - '0';
14591             while (--ctx->fmtcnt >= 0) {
14592                 arg->ch = FORMAT_READ(ctx);
14593                 ctx->fmtpos++;
14594                 if (arg->ch < '0' || arg->ch > '9')
14595                     break;
14596                 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14597                     PyErr_SetString(PyExc_ValueError,
14598                                     "precision too big");
14599                     return -1;
14600                 }
14601                 arg->prec = arg->prec*10 + (arg->ch - '0');
14602             }
14603         }
14604     }
14605 
14606     /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14607     if (ctx->fmtcnt >= 0) {
14608         if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14609             if (--ctx->fmtcnt >= 0) {
14610                 arg->ch = FORMAT_READ(ctx);
14611                 ctx->fmtpos++;
14612             }
14613         }
14614     }
14615     if (ctx->fmtcnt < 0) {
14616         PyErr_SetString(PyExc_ValueError,
14617                         "incomplete format");
14618         return -1;
14619     }
14620     return 0;
14621 
14622 #undef FORMAT_READ
14623 }
14624 
14625 /* Format one argument. Supported conversion specifiers:
14626 
14627    - "s", "r", "a": any type
14628    - "i", "d", "u": int or float
14629    - "o", "x", "X": int
14630    - "e", "E", "f", "F", "g", "G": float
14631    - "c": int or str (1 character)
14632 
14633    When possible, the output is written directly into the Unicode writer
14634    (ctx->writer). A string is created when padding is required.
14635 
14636    Return 0 if the argument has been formatted into *p_str,
14637           1 if the argument has been written into ctx->writer,
14638          -1 on error. */
14639 static int
unicode_format_arg_format(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject ** p_str)14640 unicode_format_arg_format(struct unicode_formatter_t *ctx,
14641                           struct unicode_format_arg_t *arg,
14642                           PyObject **p_str)
14643 {
14644     PyObject *v;
14645     _PyUnicodeWriter *writer = &ctx->writer;
14646 
14647     if (ctx->fmtcnt == 0)
14648         ctx->writer.overallocate = 0;
14649 
14650     v = unicode_format_getnextarg(ctx);
14651     if (v == NULL)
14652         return -1;
14653 
14654 
14655     switch (arg->ch) {
14656     case 's':
14657     case 'r':
14658     case 'a':
14659         if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14660             /* Fast path */
14661             if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14662                 return -1;
14663             return 1;
14664         }
14665 
14666         if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14667             *p_str = v;
14668             Py_INCREF(*p_str);
14669         }
14670         else {
14671             if (arg->ch == 's')
14672                 *p_str = PyObject_Str(v);
14673             else if (arg->ch == 'r')
14674                 *p_str = PyObject_Repr(v);
14675             else
14676                 *p_str = PyObject_ASCII(v);
14677         }
14678         break;
14679 
14680     case 'i':
14681     case 'd':
14682     case 'u':
14683     case 'o':
14684     case 'x':
14685     case 'X':
14686     {
14687         int ret = mainformatlong(v, arg, p_str, writer);
14688         if (ret != 0)
14689             return ret;
14690         arg->sign = 1;
14691         break;
14692     }
14693 
14694     case 'e':
14695     case 'E':
14696     case 'f':
14697     case 'F':
14698     case 'g':
14699     case 'G':
14700         if (arg->width == -1 && arg->prec == -1
14701             && !(arg->flags & (F_SIGN | F_BLANK)))
14702         {
14703             /* Fast path */
14704             if (formatfloat(v, arg, NULL, writer) == -1)
14705                 return -1;
14706             return 1;
14707         }
14708 
14709         arg->sign = 1;
14710         if (formatfloat(v, arg, p_str, NULL) == -1)
14711             return -1;
14712         break;
14713 
14714     case 'c':
14715     {
14716         Py_UCS4 ch = formatchar(v);
14717         if (ch == (Py_UCS4) -1)
14718             return -1;
14719         if (arg->width == -1 && arg->prec == -1) {
14720             /* Fast path */
14721             if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14722                 return -1;
14723             return 1;
14724         }
14725         *p_str = PyUnicode_FromOrdinal(ch);
14726         break;
14727     }
14728 
14729     default:
14730         PyErr_Format(PyExc_ValueError,
14731                      "unsupported format character '%c' (0x%x) "
14732                      "at index %zd",
14733                      (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14734                      (int)arg->ch,
14735                      ctx->fmtpos - 1);
14736         return -1;
14737     }
14738     if (*p_str == NULL)
14739         return -1;
14740     assert (PyUnicode_Check(*p_str));
14741     return 0;
14742 }
14743 
14744 static int
unicode_format_arg_output(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject * str)14745 unicode_format_arg_output(struct unicode_formatter_t *ctx,
14746                           struct unicode_format_arg_t *arg,
14747                           PyObject *str)
14748 {
14749     Py_ssize_t len;
14750     enum PyUnicode_Kind kind;
14751     void *pbuf;
14752     Py_ssize_t pindex;
14753     Py_UCS4 signchar;
14754     Py_ssize_t buflen;
14755     Py_UCS4 maxchar;
14756     Py_ssize_t sublen;
14757     _PyUnicodeWriter *writer = &ctx->writer;
14758     Py_UCS4 fill;
14759 
14760     fill = ' ';
14761     if (arg->sign && arg->flags & F_ZERO)
14762         fill = '0';
14763 
14764     if (PyUnicode_READY(str) == -1)
14765         return -1;
14766 
14767     len = PyUnicode_GET_LENGTH(str);
14768     if ((arg->width == -1 || arg->width <= len)
14769         && (arg->prec == -1 || arg->prec >= len)
14770         && !(arg->flags & (F_SIGN | F_BLANK)))
14771     {
14772         /* Fast path */
14773         if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14774             return -1;
14775         return 0;
14776     }
14777 
14778     /* Truncate the string for "s", "r" and "a" formats
14779        if the precision is set */
14780     if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14781         if (arg->prec >= 0 && len > arg->prec)
14782             len = arg->prec;
14783     }
14784 
14785     /* Adjust sign and width */
14786     kind = PyUnicode_KIND(str);
14787     pbuf = PyUnicode_DATA(str);
14788     pindex = 0;
14789     signchar = '\0';
14790     if (arg->sign) {
14791         Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14792         if (ch == '-' || ch == '+') {
14793             signchar = ch;
14794             len--;
14795             pindex++;
14796         }
14797         else if (arg->flags & F_SIGN)
14798             signchar = '+';
14799         else if (arg->flags & F_BLANK)
14800             signchar = ' ';
14801         else
14802             arg->sign = 0;
14803     }
14804     if (arg->width < len)
14805         arg->width = len;
14806 
14807     /* Prepare the writer */
14808     maxchar = writer->maxchar;
14809     if (!(arg->flags & F_LJUST)) {
14810         if (arg->sign) {
14811             if ((arg->width-1) > len)
14812                 maxchar = Py_MAX(maxchar, fill);
14813         }
14814         else {
14815             if (arg->width > len)
14816                 maxchar = Py_MAX(maxchar, fill);
14817         }
14818     }
14819     if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14820         Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
14821         maxchar = Py_MAX(maxchar, strmaxchar);
14822     }
14823 
14824     buflen = arg->width;
14825     if (arg->sign && len == arg->width)
14826         buflen++;
14827     if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
14828         return -1;
14829 
14830     /* Write the sign if needed */
14831     if (arg->sign) {
14832         if (fill != ' ') {
14833             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14834             writer->pos += 1;
14835         }
14836         if (arg->width > len)
14837             arg->width--;
14838     }
14839 
14840     /* Write the numeric prefix for "x", "X" and "o" formats
14841        if the alternate form is used.
14842        For example, write "0x" for the "%#x" format. */
14843     if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14844         assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14845         assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14846         if (fill != ' ') {
14847             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14848             PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14849             writer->pos += 2;
14850             pindex += 2;
14851         }
14852         arg->width -= 2;
14853         if (arg->width < 0)
14854             arg->width = 0;
14855         len -= 2;
14856     }
14857 
14858     /* Pad left with the fill character if needed */
14859     if (arg->width > len && !(arg->flags & F_LJUST)) {
14860         sublen = arg->width - len;
14861         unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
14862         writer->pos += sublen;
14863         arg->width = len;
14864     }
14865 
14866     /* If padding with spaces: write sign if needed and/or numeric prefix if
14867        the alternate form is used */
14868     if (fill == ' ') {
14869         if (arg->sign) {
14870             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14871             writer->pos += 1;
14872         }
14873         if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14874             assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14875             assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14876             PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14877             PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14878             writer->pos += 2;
14879             pindex += 2;
14880         }
14881     }
14882 
14883     /* Write characters */
14884     if (len) {
14885         _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14886                                       str, pindex, len);
14887         writer->pos += len;
14888     }
14889 
14890     /* Pad right with the fill character if needed */
14891     if (arg->width > len) {
14892         sublen = arg->width - len;
14893         unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
14894         writer->pos += sublen;
14895     }
14896     return 0;
14897 }
14898 
14899 /* Helper of PyUnicode_Format(): format one arg.
14900    Return 0 on success, raise an exception and return -1 on error. */
14901 static int
unicode_format_arg(struct unicode_formatter_t * ctx)14902 unicode_format_arg(struct unicode_formatter_t *ctx)
14903 {
14904     struct unicode_format_arg_t arg;
14905     PyObject *str;
14906     int ret;
14907 
14908     arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14909     if (arg.ch == '%') {
14910         ctx->fmtpos++;
14911         ctx->fmtcnt--;
14912         if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14913             return -1;
14914         return 0;
14915     }
14916     arg.flags = 0;
14917     arg.width = -1;
14918     arg.prec = -1;
14919     arg.sign = 0;
14920     str = NULL;
14921 
14922     ret = unicode_format_arg_parse(ctx, &arg);
14923     if (ret == -1)
14924         return -1;
14925 
14926     ret = unicode_format_arg_format(ctx, &arg, &str);
14927     if (ret == -1)
14928         return -1;
14929 
14930     if (ret != 1) {
14931         ret = unicode_format_arg_output(ctx, &arg, str);
14932         Py_DECREF(str);
14933         if (ret == -1)
14934             return -1;
14935     }
14936 
14937     if (ctx->dict && (ctx->argidx < ctx->arglen)) {
14938         PyErr_SetString(PyExc_TypeError,
14939                         "not all arguments converted during string formatting");
14940         return -1;
14941     }
14942     return 0;
14943 }
14944 
14945 PyObject *
PyUnicode_Format(PyObject * format,PyObject * args)14946 PyUnicode_Format(PyObject *format, PyObject *args)
14947 {
14948     struct unicode_formatter_t ctx;
14949 
14950     if (format == NULL || args == NULL) {
14951         PyErr_BadInternalCall();
14952         return NULL;
14953     }
14954 
14955     if (ensure_unicode(format) < 0)
14956         return NULL;
14957 
14958     ctx.fmtstr = format;
14959     ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14960     ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14961     ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14962     ctx.fmtpos = 0;
14963 
14964     _PyUnicodeWriter_Init(&ctx.writer);
14965     ctx.writer.min_length = ctx.fmtcnt + 100;
14966     ctx.writer.overallocate = 1;
14967 
14968     if (PyTuple_Check(args)) {
14969         ctx.arglen = PyTuple_Size(args);
14970         ctx.argidx = 0;
14971     }
14972     else {
14973         ctx.arglen = -1;
14974         ctx.argidx = -2;
14975     }
14976     ctx.args_owned = 0;
14977     if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
14978         ctx.dict = args;
14979     else
14980         ctx.dict = NULL;
14981     ctx.args = args;
14982 
14983     while (--ctx.fmtcnt >= 0) {
14984         if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14985             Py_ssize_t nonfmtpos;
14986 
14987             nonfmtpos = ctx.fmtpos++;
14988             while (ctx.fmtcnt >= 0 &&
14989                    PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14990                 ctx.fmtpos++;
14991                 ctx.fmtcnt--;
14992             }
14993             if (ctx.fmtcnt < 0) {
14994                 ctx.fmtpos--;
14995                 ctx.writer.overallocate = 0;
14996             }
14997 
14998             if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14999                                                 nonfmtpos, ctx.fmtpos) < 0)
15000                 goto onError;
15001         }
15002         else {
15003             ctx.fmtpos++;
15004             if (unicode_format_arg(&ctx) == -1)
15005                 goto onError;
15006         }
15007     }
15008 
15009     if (ctx.argidx < ctx.arglen && !ctx.dict) {
15010         PyErr_SetString(PyExc_TypeError,
15011                         "not all arguments converted during string formatting");
15012         goto onError;
15013     }
15014 
15015     if (ctx.args_owned) {
15016         Py_DECREF(ctx.args);
15017     }
15018     return _PyUnicodeWriter_Finish(&ctx.writer);
15019 
15020   onError:
15021     _PyUnicodeWriter_Dealloc(&ctx.writer);
15022     if (ctx.args_owned) {
15023         Py_DECREF(ctx.args);
15024     }
15025     return NULL;
15026 }
15027 
15028 static PyObject *
15029 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15030 
15031 static PyObject *
unicode_new(PyTypeObject * type,PyObject * args,PyObject * kwds)15032 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15033 {
15034     PyObject *x = NULL;
15035     static char *kwlist[] = {"object", "encoding", "errors", 0};
15036     char *encoding = NULL;
15037     char *errors = NULL;
15038 
15039     if (type != &PyUnicode_Type)
15040         return unicode_subtype_new(type, args, kwds);
15041     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
15042                                      kwlist, &x, &encoding, &errors))
15043         return NULL;
15044     if (x == NULL)
15045         _Py_RETURN_UNICODE_EMPTY();
15046     if (encoding == NULL && errors == NULL)
15047         return PyObject_Str(x);
15048     else
15049         return PyUnicode_FromEncodedObject(x, encoding, errors);
15050 }
15051 
15052 static PyObject *
unicode_subtype_new(PyTypeObject * type,PyObject * args,PyObject * kwds)15053 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15054 {
15055     PyObject *unicode, *self;
15056     Py_ssize_t length, char_size;
15057     int share_wstr, share_utf8;
15058     unsigned int kind;
15059     void *data;
15060 
15061     assert(PyType_IsSubtype(type, &PyUnicode_Type));
15062 
15063     unicode = unicode_new(&PyUnicode_Type, args, kwds);
15064     if (unicode == NULL)
15065         return NULL;
15066     assert(_PyUnicode_CHECK(unicode));
15067     if (PyUnicode_READY(unicode) == -1) {
15068         Py_DECREF(unicode);
15069         return NULL;
15070     }
15071 
15072     self = type->tp_alloc(type, 0);
15073     if (self == NULL) {
15074         Py_DECREF(unicode);
15075         return NULL;
15076     }
15077     kind = PyUnicode_KIND(unicode);
15078     length = PyUnicode_GET_LENGTH(unicode);
15079 
15080     _PyUnicode_LENGTH(self) = length;
15081 #ifdef Py_DEBUG
15082     _PyUnicode_HASH(self) = -1;
15083 #else
15084     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15085 #endif
15086     _PyUnicode_STATE(self).interned = 0;
15087     _PyUnicode_STATE(self).kind = kind;
15088     _PyUnicode_STATE(self).compact = 0;
15089     _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
15090     _PyUnicode_STATE(self).ready = 1;
15091     _PyUnicode_WSTR(self) = NULL;
15092     _PyUnicode_UTF8_LENGTH(self) = 0;
15093     _PyUnicode_UTF8(self) = NULL;
15094     _PyUnicode_WSTR_LENGTH(self) = 0;
15095     _PyUnicode_DATA_ANY(self) = NULL;
15096 
15097     share_utf8 = 0;
15098     share_wstr = 0;
15099     if (kind == PyUnicode_1BYTE_KIND) {
15100         char_size = 1;
15101         if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15102             share_utf8 = 1;
15103     }
15104     else if (kind == PyUnicode_2BYTE_KIND) {
15105         char_size = 2;
15106         if (sizeof(wchar_t) == 2)
15107             share_wstr = 1;
15108     }
15109     else {
15110         assert(kind == PyUnicode_4BYTE_KIND);
15111         char_size = 4;
15112         if (sizeof(wchar_t) == 4)
15113             share_wstr = 1;
15114     }
15115 
15116     /* Ensure we won't overflow the length. */
15117     if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15118         PyErr_NoMemory();
15119         goto onError;
15120     }
15121     data = PyObject_MALLOC((length + 1) * char_size);
15122     if (data == NULL) {
15123         PyErr_NoMemory();
15124         goto onError;
15125     }
15126 
15127     _PyUnicode_DATA_ANY(self) = data;
15128     if (share_utf8) {
15129         _PyUnicode_UTF8_LENGTH(self) = length;
15130         _PyUnicode_UTF8(self) = data;
15131     }
15132     if (share_wstr) {
15133         _PyUnicode_WSTR_LENGTH(self) = length;
15134         _PyUnicode_WSTR(self) = (wchar_t *)data;
15135     }
15136 
15137     memcpy(data, PyUnicode_DATA(unicode),
15138               kind * (length + 1));
15139     assert(_PyUnicode_CheckConsistency(self, 1));
15140 #ifdef Py_DEBUG
15141     _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15142 #endif
15143     Py_DECREF(unicode);
15144     return self;
15145 
15146 onError:
15147     Py_DECREF(unicode);
15148     Py_DECREF(self);
15149     return NULL;
15150 }
15151 
/* Docstring of the str type; installed in the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
15163 
/* Defined with the string iterator further down; needed for tp_iter. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of the built-in str type.  The initializer is positional:
   every entry must stay in the slot order noted in its comment. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_vectorcall_offset */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_as_async */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash */
    0,                            /* tp_call */
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,   /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
15209 
15210 /* Initialize the Unicode implementation */
15211 
15212 PyStatus
_PyUnicode_Init(void)15213 _PyUnicode_Init(void)
15214 {
15215     /* XXX - move this array to unicodectype.c ? */
15216     Py_UCS2 linebreak[] = {
15217         0x000A, /* LINE FEED */
15218         0x000D, /* CARRIAGE RETURN */
15219         0x001C, /* FILE SEPARATOR */
15220         0x001D, /* GROUP SEPARATOR */
15221         0x001E, /* RECORD SEPARATOR */
15222         0x0085, /* NEXT LINE */
15223         0x2028, /* LINE SEPARATOR */
15224         0x2029, /* PARAGRAPH SEPARATOR */
15225     };
15226 
15227     /* Init the implementation */
15228     _Py_INCREF_UNICODE_EMPTY();
15229     if (!unicode_empty) {
15230         return _PyStatus_ERR("Can't create empty string");
15231     }
15232     Py_DECREF(unicode_empty);
15233 
15234     if (PyType_Ready(&PyUnicode_Type) < 0) {
15235         return _PyStatus_ERR("Can't initialize unicode type");
15236     }
15237 
15238     /* initialize the linebreak bloom filter */
15239     bloom_linebreak = make_bloom_mask(
15240         PyUnicode_2BYTE_KIND, linebreak,
15241         Py_ARRAY_LENGTH(linebreak));
15242 
15243     if (PyType_Ready(&EncodingMapType) < 0) {
15244          return _PyStatus_ERR("Can't initialize encoding map type");
15245     }
15246     if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15247         return _PyStatus_ERR("Can't initialize field name iterator type");
15248     }
15249     if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15250         return _PyStatus_ERR("Can't initialize formatter iter type");
15251     }
15252     return _PyStatus_OK();
15253 }
15254 
/* Free-list clearing entry point.  This implementation releases nothing
   and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
15262 
15263 
15264 void
PyUnicode_InternInPlace(PyObject ** p)15265 PyUnicode_InternInPlace(PyObject **p)
15266 {
15267     PyObject *s = *p;
15268     PyObject *t;
15269 #ifdef Py_DEBUG
15270     assert(s != NULL);
15271     assert(_PyUnicode_CHECK(s));
15272 #else
15273     if (s == NULL || !PyUnicode_Check(s))
15274         return;
15275 #endif
15276     /* If it's a subclass, we don't really know what putting
15277        it in the interned dict might do. */
15278     if (!PyUnicode_CheckExact(s))
15279         return;
15280     if (PyUnicode_CHECK_INTERNED(s))
15281         return;
15282     if (interned == NULL) {
15283         interned = PyDict_New();
15284         if (interned == NULL) {
15285             PyErr_Clear(); /* Don't leave an exception */
15286             return;
15287         }
15288     }
15289     t = PyDict_SetDefault(interned, s, s);
15290     if (t == NULL) {
15291         PyErr_Clear();
15292         return;
15293     }
15294     if (t != s) {
15295         Py_INCREF(t);
15296         Py_SETREF(*p, t);
15297         return;
15298     }
15299     /* The two references in interned are not counted by refcnt.
15300        The deallocator will take care of this */
15301     Py_REFCNT(s) -= 2;
15302     _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
15303 }
15304 
15305 void
PyUnicode_InternImmortal(PyObject ** p)15306 PyUnicode_InternImmortal(PyObject **p)
15307 {
15308     PyUnicode_InternInPlace(p);
15309     if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15310         _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15311         Py_INCREF(*p);
15312     }
15313 }
15314 
15315 PyObject *
PyUnicode_InternFromString(const char * cp)15316 PyUnicode_InternFromString(const char *cp)
15317 {
15318     PyObject *s = PyUnicode_FromString(cp);
15319     if (s == NULL)
15320         return NULL;
15321     PyUnicode_InternInPlace(&s);
15322     return s;
15323 }
15324 
15325 
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Undo interning for the benefit of leak detectors: every key of the
   interned dict gets back the references that interning did not count,
   is marked as not interned, and then the dict itself is cleared and
   released. */
static void
unicode_release_interned(void)
{
    PyObject *keys;
    Py_ssize_t idx, count;
    Py_ssize_t immortal_size = 0, mortal_size = 0;

    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    count = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            count);
#endif
    for (idx = 0; idx < count; idx++) {
        PyObject *str = PyList_GET_ITEM(keys, idx);

        if (PyUnicode_READY(str) == -1) {
            Py_UNREACHABLE();
        }
        switch (PyUnicode_CHECK_INTERNED(str)) {
        case SSTATE_NOT_INTERNED:
            /* XXX Shouldn't happen */
            break;
        case SSTATE_INTERNED_IMMORTAL:
            /* One reference is given back for an immortal string. */
            Py_REFCNT(str) += 1;
            immortal_size += PyUnicode_GET_LENGTH(str);
            break;
        case SSTATE_INTERNED_MORTAL:
            /* Both dict references are given back for a mortal string. */
            Py_REFCNT(str) += 2;
            mortal_size += PyUnicode_GET_LENGTH(str);
            break;
        default:
            Py_FatalError("Inconsistent interned string state.");
        }
        _PyUnicode_STATE(str).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
15385 
15386 
/********************* Unicode Iterator **************************/

/* Iterator over the characters of a str object (type "str_iterator"). */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index; /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15394 
/* tp_dealloc of the string iterator: stop GC tracking, release the
   reference to the iterated string (if still held) and free the
   iterator object. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    /* it_seq is NULL once the iterator is exhausted, hence XDECREF. */
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
15402 
/* tp_traverse of the string iterator: report the iterated string, the
   only object reference the iterator holds, to the garbage collector. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
15409 
15410 static PyObject *
unicodeiter_next(unicodeiterobject * it)15411 unicodeiter_next(unicodeiterobject *it)
15412 {
15413     PyObject *seq, *item;
15414 
15415     assert(it != NULL);
15416     seq = it->it_seq;
15417     if (seq == NULL)
15418         return NULL;
15419     assert(_PyUnicode_CHECK(seq));
15420 
15421     if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15422         int kind = PyUnicode_KIND(seq);
15423         void *data = PyUnicode_DATA(seq);
15424         Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15425         item = PyUnicode_FromOrdinal(chr);
15426         if (item != NULL)
15427             ++it->it_index;
15428         return item;
15429     }
15430 
15431     it->it_seq = NULL;
15432     Py_DECREF(seq);
15433     return NULL;
15434 }
15435 
15436 static PyObject *
unicodeiter_len(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15437 unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15438 {
15439     Py_ssize_t len = 0;
15440     if (it->it_seq)
15441         len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15442     return PyLong_FromSsize_t(len);
15443 }
15444 
15445 PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15446 
15447 static PyObject *
unicodeiter_reduce(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15448 unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15449 {
15450     _Py_IDENTIFIER(iter);
15451     if (it->it_seq != NULL) {
15452         return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
15453                              it->it_seq, it->it_index);
15454     } else {
15455         PyObject *u = (PyObject *)_PyUnicode_New(0);
15456         if (u == NULL)
15457             return NULL;
15458         return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
15459     }
15460 }
15461 
15462 PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15463 
15464 static PyObject *
unicodeiter_setstate(unicodeiterobject * it,PyObject * state)15465 unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15466 {
15467     Py_ssize_t index = PyLong_AsSsize_t(state);
15468     if (index == -1 && PyErr_Occurred())
15469         return NULL;
15470     if (it->it_seq != NULL) {
15471         if (index < 0)
15472             index = 0;
15473         else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15474             index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15475         it->it_index = index;
15476     }
15477     Py_RETURN_NONE;
15478 }
15479 
15480 PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15481 
/* Method table of the string iterator: length hint plus the two
   methods that support pickling. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,      NULL}       /* sentinel */
};
15491 
/* Type object of the string iterator ("str_iterator").  The initializer
   is positional: every entry must stay in the slot order noted in its
   comment. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_vectorcall_offset */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_as_async */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
15524 
15525 static PyObject *
unicode_iter(PyObject * seq)15526 unicode_iter(PyObject *seq)
15527 {
15528     unicodeiterobject *it;
15529 
15530     if (!PyUnicode_Check(seq)) {
15531         PyErr_BadInternalCall();
15532         return NULL;
15533     }
15534     if (PyUnicode_READY(seq) == -1)
15535         return NULL;
15536     it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15537     if (it == NULL)
15538         return NULL;
15539     it->it_index = 0;
15540     Py_INCREF(seq);
15541     it->it_seq = seq;
15542     _PyObject_GC_TRACK(it);
15543     return (PyObject *)it;
15544 }
15545 
15546 
15547 size_t
Py_UNICODE_strlen(const Py_UNICODE * u)15548 Py_UNICODE_strlen(const Py_UNICODE *u)
15549 {
15550     return wcslen(u);
15551 }
15552 
15553 Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE * s1,const Py_UNICODE * s2)15554 Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15555 {
15556     Py_UNICODE *u = s1;
15557     while ((*u++ = *s2++));
15558     return s1;
15559 }
15560 
15561 Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE * s1,const Py_UNICODE * s2,size_t n)15562 Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15563 {
15564     Py_UNICODE *u = s1;
15565     while ((*u++ = *s2++))
15566         if (n-- == 0)
15567             break;
15568     return s1;
15569 }
15570 
15571 Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE * s1,const Py_UNICODE * s2)15572 Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15573 {
15574     Py_UNICODE *u1 = s1;
15575     u1 += wcslen(u1);
15576     while ((*u1++ = *s2++));
15577     return s1;
15578 }
15579 
15580 int
Py_UNICODE_strcmp(const Py_UNICODE * s1,const Py_UNICODE * s2)15581 Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15582 {
15583     while (*s1 && *s2 && *s1 == *s2)
15584         s1++, s2++;
15585     if (*s1 && *s2)
15586         return (*s1 < *s2) ? -1 : +1;
15587     if (*s1)
15588         return 1;
15589     if (*s2)
15590         return -1;
15591     return 0;
15592 }
15593 
15594 int
Py_UNICODE_strncmp(const Py_UNICODE * s1,const Py_UNICODE * s2,size_t n)15595 Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15596 {
15597     Py_UNICODE u1, u2;
15598     for (; n != 0; n--) {
15599         u1 = *s1;
15600         u2 = *s2;
15601         if (u1 != u2)
15602             return (u1 < u2) ? -1 : +1;
15603         if (u1 == '\0')
15604             return 0;
15605         s1++;
15606         s2++;
15607     }
15608     return 0;
15609 }
15610 
15611 Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE * s,Py_UNICODE c)15612 Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15613 {
15614     const Py_UNICODE *p;
15615     for (p = s; *p; p++)
15616         if (*p == c)
15617             return (Py_UNICODE*)p;
15618     return NULL;
15619 }
15620 
15621 Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE * s,Py_UNICODE c)15622 Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15623 {
15624     const Py_UNICODE *p;
15625     p = s + wcslen(s);
15626     while (p != s) {
15627         p--;
15628         if (*p == c)
15629             return (Py_UNICODE*)p;
15630     }
15631     return NULL;
15632 }
15633 
15634 Py_UNICODE*
PyUnicode_AsUnicodeCopy(PyObject * unicode)15635 PyUnicode_AsUnicodeCopy(PyObject *unicode)
15636 {
15637     Py_UNICODE *u, *copy;
15638     Py_ssize_t len, size;
15639 
15640     if (!PyUnicode_Check(unicode)) {
15641         PyErr_BadArgument();
15642         return NULL;
15643     }
15644     u = PyUnicode_AsUnicodeAndSize(unicode, &len);
15645     if (u == NULL)
15646         return NULL;
15647     /* Ensure we won't overflow the size. */
15648     if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
15649         PyErr_NoMemory();
15650         return NULL;
15651     }
15652     size = len + 1; /* copy the null character */
15653     size *= sizeof(Py_UNICODE);
15654     copy = PyMem_Malloc(size);
15655     if (copy == NULL) {
15656         PyErr_NoMemory();
15657         return NULL;
15658     }
15659     memcpy(copy, u, size);
15660     return copy;
15661 }
15662 
15663 
15664 static int
encode_wstr_utf8(wchar_t * wstr,char ** str,const char * name)15665 encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
15666 {
15667     int res;
15668     res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15669     if (res == -2) {
15670         PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15671         return -1;
15672     }
15673     if (res < 0) {
15674         PyErr_NoMemory();
15675         return -1;
15676     }
15677     return 0;
15678 }
15679 
15680 
15681 static int
config_get_codec_name(wchar_t ** config_encoding)15682 config_get_codec_name(wchar_t **config_encoding)
15683 {
15684     char *encoding;
15685     if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15686         return -1;
15687     }
15688 
15689     PyObject *name_obj = NULL;
15690     PyObject *codec = _PyCodec_Lookup(encoding);
15691     PyMem_RawFree(encoding);
15692 
15693     if (!codec)
15694         goto error;
15695 
15696     name_obj = PyObject_GetAttrString(codec, "name");
15697     Py_CLEAR(codec);
15698     if (!name_obj) {
15699         goto error;
15700     }
15701 
15702     wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15703     Py_DECREF(name_obj);
15704     if (wname == NULL) {
15705         goto error;
15706     }
15707 
15708     wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15709     if (raw_wname == NULL) {
15710         PyMem_Free(wname);
15711         PyErr_NoMemory();
15712         goto error;
15713     }
15714 
15715     PyMem_RawFree(*config_encoding);
15716     *config_encoding = raw_wname;
15717 
15718     PyMem_Free(wname);
15719     return 0;
15720 
15721 error:
15722     Py_XDECREF(codec);
15723     Py_XDECREF(name_obj);
15724     return -1;
15725 }
15726 
15727 
15728 static PyStatus
init_stdio_encoding(PyThreadState * tstate)15729 init_stdio_encoding(PyThreadState *tstate)
15730 {
15731     /* Update the stdio encoding to the normalized Python codec name. */
15732     PyConfig *config = &tstate->interp->config;
15733     if (config_get_codec_name(&config->stdio_encoding) < 0) {
15734         return _PyStatus_ERR("failed to get the Python codec name "
15735                              "of the stdio encoding");
15736     }
15737     return _PyStatus_OK();
15738 }
15739 
15740 
15741 static int
init_fs_codec(PyInterpreterState * interp)15742 init_fs_codec(PyInterpreterState *interp)
15743 {
15744     PyConfig *config = &interp->config;
15745 
15746     _Py_error_handler error_handler;
15747     error_handler = get_error_handler_wide(config->filesystem_errors);
15748     if (error_handler == _Py_ERROR_UNKNOWN) {
15749         PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15750         return -1;
15751     }
15752 
15753     char *encoding, *errors;
15754     if (encode_wstr_utf8(config->filesystem_encoding,
15755                          &encoding,
15756                          "filesystem_encoding") < 0) {
15757         return -1;
15758     }
15759 
15760     if (encode_wstr_utf8(config->filesystem_errors,
15761                          &errors,
15762                          "filesystem_errors") < 0) {
15763         PyMem_RawFree(encoding);
15764         return -1;
15765     }
15766 
15767     PyMem_RawFree(interp->fs_codec.encoding);
15768     interp->fs_codec.encoding = encoding;
15769     PyMem_RawFree(interp->fs_codec.errors);
15770     interp->fs_codec.errors = errors;
15771     interp->fs_codec.error_handler = error_handler;
15772 
15773     /* At this point, PyUnicode_EncodeFSDefault() and
15774        PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15775        the C implementation of the filesystem encoding. */
15776 
15777     /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15778        global configuration variables. */
15779     if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
15780                                   interp->fs_codec.errors) < 0) {
15781         PyErr_NoMemory();
15782         return -1;
15783     }
15784     return 0;
15785 }
15786 
15787 
15788 static PyStatus
init_fs_encoding(PyThreadState * tstate)15789 init_fs_encoding(PyThreadState *tstate)
15790 {
15791     PyInterpreterState *interp = tstate->interp;
15792 
15793     /* Update the filesystem encoding to the normalized Python codec name.
15794        For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15795        (Python codec name). */
15796     PyConfig *config = &interp->config;
15797     if (config_get_codec_name(&config->filesystem_encoding) < 0) {
15798         _Py_DumpPathConfig(tstate);
15799         return _PyStatus_ERR("failed to get the Python codec "
15800                              "of the filesystem encoding");
15801     }
15802 
15803     if (init_fs_codec(interp) < 0) {
15804         return _PyStatus_ERR("cannot initialize filesystem codec");
15805     }
15806     return _PyStatus_OK();
15807 }
15808 
15809 
15810 PyStatus
_PyUnicode_InitEncodings(PyThreadState * tstate)15811 _PyUnicode_InitEncodings(PyThreadState *tstate)
15812 {
15813     PyStatus status = init_fs_encoding(tstate);
15814     if (_PyStatus_EXCEPTION(status)) {
15815         return status;
15816     }
15817 
15818     return init_stdio_encoding(tstate);
15819 }
15820 
15821 
15822 #ifdef MS_WINDOWS
15823 int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)15824 _PyUnicode_EnableLegacyWindowsFSEncoding(void)
15825 {
15826     PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
15827     PyConfig *config = &interp->config;
15828 
15829     /* Set the filesystem encoding to mbcs/replace (PEP 529) */
15830     wchar_t *encoding = _PyMem_RawWcsdup(L"mbcs");
15831     wchar_t *errors = _PyMem_RawWcsdup(L"replace");
15832     if (encoding == NULL || errors == NULL) {
15833         PyMem_RawFree(encoding);
15834         PyMem_RawFree(errors);
15835         PyErr_NoMemory();
15836         return -1;
15837     }
15838 
15839     PyMem_RawFree(config->filesystem_encoding);
15840     config->filesystem_encoding = encoding;
15841     PyMem_RawFree(config->filesystem_errors);
15842     config->filesystem_errors = errors;
15843 
15844     return init_fs_codec(interp);
15845 }
15846 #endif
15847 
15848 
15849 void
_PyUnicode_Fini(void)15850 _PyUnicode_Fini(void)
15851 {
15852 #if defined(WITH_VALGRIND) || defined(__INSURE__)
15853     /* Insure++ is a memory analysis tool that aids in discovering
15854      * memory leaks and other memory problems.  On Python exit, the
15855      * interned string dictionaries are flagged as being in use at exit
15856      * (which it is).  Under normal circumstances, this is fine because
15857      * the memory will be automatically reclaimed by the system.  Under
15858      * memory debugging, it's a huge source of useless noise, so we
15859      * trade off slower shutdown for less distraction in the memory
15860      * reports.  -baw
15861      */
15862     unicode_release_interned();
15863 #endif /* __INSURE__ */
15864 
15865     Py_CLEAR(unicode_empty);
15866 
15867     for (Py_ssize_t i = 0; i < 256; i++) {
15868         Py_CLEAR(unicode_latin1[i]);
15869     }
15870     _PyUnicode_ClearStaticStrings();
15871     (void)PyUnicode_ClearFreeList();
15872 
15873     PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
15874     PyMem_RawFree(interp->fs_codec.encoding);
15875     interp->fs_codec.encoding = NULL;
15876     PyMem_RawFree(interp->fs_codec.errors);
15877     interp->fs_codec.errors = NULL;
15878 }
15879 
15880 
15881 /* A _string module, to export formatter_parser and formatter_field_name_split
15882    to the string.Formatter class implemented in Python. */
15883 
15884 static PyMethodDef _string_methods[] = {
15885     {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15886      METH_O, PyDoc_STR("split the argument as a field name")},
15887     {"formatter_parser", (PyCFunction) formatter_parser,
15888      METH_O, PyDoc_STR("parse the argument as a format string")},
15889     {NULL, NULL}
15890 };
15891 
15892 static struct PyModuleDef _string_module = {
15893     PyModuleDef_HEAD_INIT,
15894     "_string",
15895     PyDoc_STR("string helper module"),
15896     0,
15897     _string_methods,
15898     NULL,
15899     NULL,
15900     NULL,
15901     NULL
15902 };
15903 
15904 PyMODINIT_FUNC
PyInit__string(void)15905 PyInit__string(void)
15906 {
15907     return PyModule_Create(&_string_module);
15908 }
15909 
15910 
15911 #ifdef __cplusplus
15912 }
15913 #endif
15914