1 /*
2
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6 Major speed upgrades to the method implementations at the Reykjavik
7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9 Copyright (c) Corporation for National Research Initiatives.
10
11 --------------------------------------------------------------------
12 The original string type implementation is:
13
14 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
16
17 By obtaining, using, and/or copying this software and/or its
18 associated documentation, you agree that you have read, understood,
19 and will comply with the following terms and conditions:
20
21 Permission to use, copy, modify, and distribute this software and its
22 associated documentation for any purpose and without fee is hereby
23 granted, provided that the above copyright notice appears in all
24 copies, and that both that copyright notice and this permission notice
25 appear in supporting documentation, and that the name of Secret Labs
26 AB or the author not be used in advertising or publicity pertaining to
27 distribution of the software without specific, written prior
28 permission.
29
30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37 --------------------------------------------------------------------
38
39 */
40
41 #define PY_SSIZE_T_CLEAN
42 #include "Python.h"
43 #include "pycore_initconfig.h"
44 #include "pycore_fileutils.h"
45 #include "pycore_object.h"
46 #include "pycore_pylifecycle.h"
47 #include "pycore_pystate.h"
48 #include "ucnhash.h"
49 #include "bytes_methods.h"
50 #include "stringlib/eq.h"
51
52 #ifdef MS_WINDOWS
53 #include <windows.h>
54 #endif
55
56 /* Uncomment to display statistics on interned strings at exit when
57 using Valgrind or Insecure++. */
58 /* #define INTERNED_STATS 1 */
59
60
61 /*[clinic input]
62 class str "PyObject *" "&PyUnicode_Type"
63 [clinic start generated code]*/
64 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
65
66 /*[python input]
67 class Py_UCS4_converter(CConverter):
68 type = 'Py_UCS4'
69 converter = 'convert_uc'
70
71 def converter_init(self):
72 if self.default is not unspecified:
73 self.c_default = ascii(self.default)
74 if len(self.c_default) > 4 or self.c_default[0] != "'":
75 self.c_default = hex(ord(self.default))
76
77 [python start generated code]*/
78 /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
79
80 /* --- Globals ------------------------------------------------------------
81
82 NOTE: In the interpreter's initialization phase, some globals are currently
83 initialized dynamically as needed. In the process Unicode objects may
84 be created before the Unicode type is ready.
85
86 */
87
88
89 #ifdef __cplusplus
90 extern "C" {
91 #endif
92
// Maximum code point of Unicode 6.0: 0x10ffff (1,114,111).
// The value must be the same in fileutils.c.
#define MAX_UNICODE 0x10ffff

/* Sanity check used by the assert()s of the macros below.  Debug builds
   (Py_DEBUG) run _PyUnicode_CheckConsistency() with check_content == 0;
   other builds only perform the PyUnicode_Check() type test. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
102
/* Raw access to the utf8 field (no checks; usable as an lvalue). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 buffer of a ready string: for a compact ASCII string this is the
   memory located right after the PyASCIIObject header, otherwise the utf8
   field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw access to the utf8_length field (no checks; usable as an lvalue). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Length of the UTF-8 buffer of a ready string: for a compact ASCII string
   this is the string length itself, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw access to the wstr field (no checks; usable as an lvalue). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
/* Raw access to the wstr_length field (no checks; usable as an lvalue).
   The field belongs to PyCompactUnicodeObject, not PyASCIIObject. */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* Raw access to the length field (no checks; usable as an lvalue). */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
/* Raw access to the state bit-field structure (no checks). */
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
/* Raw access to the hash field (no checks; usable as an lvalue). */
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Kind stored in the state field, after asserting _PyUnicode_CHECK(op).
   Unlike the UTF-8 accessors above it does not assert that the string is
   ready. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* Length field, after asserting _PyUnicode_CHECK(op); does not assert that
   the string is ready. */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw access to the data.any pointer of a PyUnicodeObject (no checks;
   usable as an lvalue). */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
137
/* File-local replacement for the PyUnicode_READY definition that is in
   effect before this point: evaluates to 0 when the string is already
   ready, otherwise to the result of _PyUnicode_Ready(op).  The argument is
   checked with _PyUnicode_CHECK() first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                        \
    (assert(_PyUnicode_CHECK(op)),                 \
     (PyUnicode_IS_READY(op) ?                     \
      0 :                                          \
      _PyUnicode_Ready(op)))
144
/* True if the utf8 pointer of the string aliases its character data.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of the string aliases its character data.
   Both sides of the comparison use the macro argument, so the macro works
   whatever the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
152
/* True if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, its utf8
   pointer is non-NULL and that pointer differs from the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object has an allocated wstr memory block
   (not shared with other data): wstr is non-NULL and either the string is
   not ready yet or wstr differs from the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
166
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   begin, end and to are each evaluated exactly once.  The main loop copies
   four characters per iteration; the trailing loop handles the remaining
   zero to three characters.  Every character is converted with a plain
   cast to to_type.  The expansion declares block-local variables named
   _to, _iter, _end, n and _unrolled_end. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin);\
        const from_type *_end = (const from_type *)(end);\
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
190
/* Platform-specific over-allocation tuning.  NOTE(review): the value is
   presumably used as a divisor of the requested size (2 -> +50%,
   4 -> +25%); confirm at the use sites, which are outside this section. */
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% works best */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% works best */
#  define OVERALLOCATE_FACTOR 4
#endif
198
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)

   The pointer starts out as NULL.
*/
static PyObject *interned = NULL;

/* The empty Unicode object is shared to improve performance.  The pointer
   starts out as NULL and is filled in on first use by
   _Py_INCREF_UNICODE_EMPTY(). */
static PyObject *unicode_empty = NULL;
211
/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) on first use.  When it is created here, one reference
   stays with the unicode_empty global and the extra Py_INCREF() provides
   the reference handed to the caller.  If the creation fails, unicode_empty
   is left NULL and no reference is taken, so callers must check
   unicode_empty afterwards. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function; the returned value is NULL if the empty string could not be
   created. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
230
231 static inline void
unicode_fill(enum PyUnicode_Kind kind,void * data,Py_UCS4 value,Py_ssize_t start,Py_ssize_t length)232 unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
233 Py_ssize_t start, Py_ssize_t length)
234 {
235 assert(0 <= start);
236 assert(kind != PyUnicode_WCHAR_KIND);
237 switch (kind) {
238 case PyUnicode_1BYTE_KIND: {
239 assert(value <= 0xff);
240 Py_UCS1 ch = (unsigned char)value;
241 Py_UCS1 *to = (Py_UCS1 *)data + start;
242 memset(to, ch, length);
243 break;
244 }
245 case PyUnicode_2BYTE_KIND: {
246 assert(value <= 0xffff);
247 Py_UCS2 ch = (Py_UCS2)value;
248 Py_UCS2 *to = (Py_UCS2 *)data + start;
249 const Py_UCS2 *end = to + length;
250 for (; to < end; ++to) *to = ch;
251 break;
252 }
253 case PyUnicode_4BYTE_KIND: {
254 assert(value <= MAX_UNICODE);
255 Py_UCS4 ch = value;
256 Py_UCS4 * to = (Py_UCS4 *)data + start;
257 const Py_UCS4 *end = to + length;
258 for (; to < end; ++to) *to = ch;
259 break;
260 }
261 default: Py_UNREACHABLE();
262 }
263 }
264
265
266 /* Forward declaration */
267 static inline int
268 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
269 static PyObject *
270 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
271 const char *errors);
272 static PyObject *
273 unicode_decode_utf8(const char *s, Py_ssize_t size,
274 _Py_error_handler error_handler, const char *errors,
275 Py_ssize_t *consumed);
276
/* List of static strings.  Starts out empty (NULL).
   NOTE(review): presumably the head of a chain of _Py_Identifier entries;
   the code maintaining it is outside this section. */
static _Py_Identifier *static_strings = NULL;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by code point (0..255); all
   entries start out as NULL and unicode_result_ready() stores a string in
   a slot the first time it sees that character. */
static PyObject *unicode_latin1[256] = {NULL};
283
/* Fast detection of the most frequent whitespace characters.
   Lookup table with one entry per ASCII code point (0..127): an entry is 1
   for the characters named below and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1      /* SPACE */
};
314
315 /* forward */
316 static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
317 static PyObject* get_latin1_char(unsigned char ch);
318 static int unicode_modifiable(PyObject *unicode);
319
320
321 static PyObject *
322 _PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
323 static PyObject *
324 _PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
325 static PyObject *
326 _PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
327
328 static PyObject *
329 unicode_encode_call_errorhandler(const char *errors,
330 PyObject **errorHandler,const char *encoding, const char *reason,
331 PyObject *unicode, PyObject **exceptionObject,
332 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
333
334 static void
335 raise_encode_exception(PyObject **exceptionObject,
336 const char *encoding,
337 PyObject *unicode,
338 Py_ssize_t startpos, Py_ssize_t endpos,
339 const char *reason);
340
/* Same for linebreaks: lookup table with one entry per ASCII code point
   (0..127); an entry is 1 for the characters named below and 0 for
   everything else. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1      /* RECORD SEPARATOR */
};
368
369 static int convert_uc(PyObject *obj, void *addr);
370
371 #include "clinic/unicodeobject.c.h"
372
373 _Py_error_handler
_Py_GetErrorHandler(const char * errors)374 _Py_GetErrorHandler(const char *errors)
375 {
376 if (errors == NULL || strcmp(errors, "strict") == 0) {
377 return _Py_ERROR_STRICT;
378 }
379 if (strcmp(errors, "surrogateescape") == 0) {
380 return _Py_ERROR_SURROGATEESCAPE;
381 }
382 if (strcmp(errors, "replace") == 0) {
383 return _Py_ERROR_REPLACE;
384 }
385 if (strcmp(errors, "ignore") == 0) {
386 return _Py_ERROR_IGNORE;
387 }
388 if (strcmp(errors, "backslashreplace") == 0) {
389 return _Py_ERROR_BACKSLASHREPLACE;
390 }
391 if (strcmp(errors, "surrogatepass") == 0) {
392 return _Py_ERROR_SURROGATEPASS;
393 }
394 if (strcmp(errors, "xmlcharrefreplace") == 0) {
395 return _Py_ERROR_XMLCHARREFREPLACE;
396 }
397 return _Py_ERROR_OTHER;
398 }
399
400
401 static _Py_error_handler
get_error_handler_wide(const wchar_t * errors)402 get_error_handler_wide(const wchar_t *errors)
403 {
404 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
405 return _Py_ERROR_STRICT;
406 }
407 if (wcscmp(errors, L"surrogateescape") == 0) {
408 return _Py_ERROR_SURROGATEESCAPE;
409 }
410 if (wcscmp(errors, L"replace") == 0) {
411 return _Py_ERROR_REPLACE;
412 }
413 if (wcscmp(errors, L"ignore") == 0) {
414 return _Py_ERROR_IGNORE;
415 }
416 if (wcscmp(errors, L"backslashreplace") == 0) {
417 return _Py_ERROR_BACKSLASHREPLACE;
418 }
419 if (wcscmp(errors, L"surrogatepass") == 0) {
420 return _Py_ERROR_SURROGATEPASS;
421 }
422 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
423 return _Py_ERROR_XMLCHARREFREPLACE;
424 }
425 return _Py_ERROR_OTHER;
426 }
427
428
429 /* The max unicode value is always 0x10FFFF while using the PEP-393 API.
430 This function is kept for backward compatibility with the old API. */
431 Py_UNICODE
PyUnicode_GetMax(void)432 PyUnicode_GetMax(void)
433 {
434 #ifdef Py_UNICODE_WIDE
435 return 0x10FFFF;
436 #else
437 /* This is actually an illegal character, so it should
438 not be passed to unichr. */
439 return 0xFFFF;
440 #endif
441 }
442
/* Verify the internal invariants of a Unicode object.

   op:            object to inspect; must not be NULL (asserted).
   check_content: when non-zero, additionally scan all characters to verify
                  that the narrowest possible kind is used and that the
                  data is followed by a null character (O(n)).

   Always returns 1 when it returns, so the call can be wrapped in assert().
   A violated invariant is reported through _PyObject_ASSERT_FAILED_MSG()
   with the text of the failing expression (presumably fatal; defined
   outside this file). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
/* Report the stringified expression when it evaluates to false. */
#define CHECK(expr) \
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)

    PyASCIIObject *ascii;
    unsigned int kind;

    assert(op != NULL);
    CHECK(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: always 1 byte per character and ready */
        CHECK(kind == PyUnicode_1BYTE_KIND);
        CHECK(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: the character data directly follows the
               PyCompactUnicodeObject header */
            data = compact + 1;
            CHECK(kind == PyUnicode_1BYTE_KIND
                  || kind == PyUnicode_2BYTE_KIND
                  || kind == PyUnicode_4BYTE_KIND);
            CHECK(ascii->state.ascii == 0);
            CHECK(ascii->state.ready == 1);
            CHECK(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr representation exists */
                CHECK(ascii->length == 0);
                CHECK(ascii->hash == -1);
                CHECK(ascii->state.compact == 0);
                CHECK(ascii->state.ascii == 0);
                CHECK(ascii->state.ready == 0);
                CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
                CHECK(ascii->wstr != NULL);
                CHECK(data == NULL);
                CHECK(compact->utf8 == NULL);
            }
            else {
                /* ready, non-compact string with a separate data block */
                CHECK(kind == PyUnicode_1BYTE_KIND
                      || kind == PyUnicode_2BYTE_KIND
                      || kind == PyUnicode_4BYTE_KIND);
                CHECK(ascii->state.compact == 0);
                CHECK(ascii->state.ready == 1);
                CHECK(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    CHECK(compact->utf8 == data);
                    CHECK(compact->utf8_length == ascii->length);
                }
                else
                    CHECK(compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind has the same
               width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                CHECK(ascii->wstr == data);
                CHECK(compact->wstr_length == ascii->length);
            } else
                CHECK(ascii->wstr != data);
        }

        /* a missing cache must have a recorded length of zero */
        if (compact->utf8 == NULL)
            CHECK(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            CHECK(compact->wstr_length == 0);
    }

    /* check that the best kind is used: O(n) operation */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        /* find the largest code point stored in the string */
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                CHECK(maxchar >= 128);
                CHECK(maxchar <= 255);
            }
            else
                CHECK(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            CHECK(maxchar >= 0x100);
            CHECK(maxchar <= 0xFFFF);
        }
        else {
            CHECK(maxchar >= 0x10000);
            CHECK(maxchar <= MAX_UNICODE);
        }
        /* the character after the last one must be a null character */
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;

#undef CHECK
}
562
563
564 static PyObject*
unicode_result_wchar(PyObject * unicode)565 unicode_result_wchar(PyObject *unicode)
566 {
567 #ifndef Py_DEBUG
568 Py_ssize_t len;
569
570 len = _PyUnicode_WSTR_LENGTH(unicode);
571 if (len == 0) {
572 Py_DECREF(unicode);
573 _Py_RETURN_UNICODE_EMPTY();
574 }
575
576 if (len == 1) {
577 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
578 if ((Py_UCS4)ch < 256) {
579 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
580 Py_DECREF(unicode);
581 return latin1_char;
582 }
583 }
584
585 if (_PyUnicode_Ready(unicode) < 0) {
586 Py_DECREF(unicode);
587 return NULL;
588 }
589 #else
590 assert(Py_REFCNT(unicode) == 1);
591
592 /* don't make the result ready in debug mode to ensure that the caller
593 makes the string ready before using it */
594 assert(_PyUnicode_CheckConsistency(unicode, 1));
595 #endif
596 return unicode;
597 }
598
599 static PyObject*
unicode_result_ready(PyObject * unicode)600 unicode_result_ready(PyObject *unicode)
601 {
602 Py_ssize_t length;
603
604 length = PyUnicode_GET_LENGTH(unicode);
605 if (length == 0) {
606 if (unicode != unicode_empty) {
607 Py_DECREF(unicode);
608 _Py_RETURN_UNICODE_EMPTY();
609 }
610 return unicode_empty;
611 }
612
613 if (length == 1) {
614 void *data = PyUnicode_DATA(unicode);
615 int kind = PyUnicode_KIND(unicode);
616 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
617 if (ch < 256) {
618 PyObject *latin1_char = unicode_latin1[ch];
619 if (latin1_char != NULL) {
620 if (unicode != latin1_char) {
621 Py_INCREF(latin1_char);
622 Py_DECREF(unicode);
623 }
624 return latin1_char;
625 }
626 else {
627 assert(_PyUnicode_CheckConsistency(unicode, 1));
628 Py_INCREF(unicode);
629 unicode_latin1[ch] = unicode;
630 return unicode;
631 }
632 }
633 }
634
635 assert(_PyUnicode_CheckConsistency(unicode, 1));
636 return unicode;
637 }
638
639 static PyObject*
unicode_result(PyObject * unicode)640 unicode_result(PyObject *unicode)
641 {
642 assert(_PyUnicode_CHECK(unicode));
643 if (PyUnicode_IS_READY(unicode))
644 return unicode_result_ready(unicode);
645 else
646 return unicode_result_wchar(unicode);
647 }
648
649 static PyObject*
unicode_result_unchanged(PyObject * unicode)650 unicode_result_unchanged(PyObject *unicode)
651 {
652 if (PyUnicode_CheckExact(unicode)) {
653 if (PyUnicode_READY(unicode) == -1)
654 return NULL;
655 Py_INCREF(unicode);
656 return unicode;
657 }
658 else
659 /* Subtype -- return genuine unicode string with the same value. */
660 return _PyUnicode_Copy(unicode);
661 }
662
663 /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
664 ASCII, Latin1, UTF-8, etc. */
665 static char*
backslashreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)666 backslashreplace(_PyBytesWriter *writer, char *str,
667 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
668 {
669 Py_ssize_t size, i;
670 Py_UCS4 ch;
671 enum PyUnicode_Kind kind;
672 void *data;
673
674 assert(PyUnicode_IS_READY(unicode));
675 kind = PyUnicode_KIND(unicode);
676 data = PyUnicode_DATA(unicode);
677
678 size = 0;
679 /* determine replacement size */
680 for (i = collstart; i < collend; ++i) {
681 Py_ssize_t incr;
682
683 ch = PyUnicode_READ(kind, data, i);
684 if (ch < 0x100)
685 incr = 2+2;
686 else if (ch < 0x10000)
687 incr = 2+4;
688 else {
689 assert(ch <= MAX_UNICODE);
690 incr = 2+8;
691 }
692 if (size > PY_SSIZE_T_MAX - incr) {
693 PyErr_SetString(PyExc_OverflowError,
694 "encoded result is too long for a Python string");
695 return NULL;
696 }
697 size += incr;
698 }
699
700 str = _PyBytesWriter_Prepare(writer, str, size);
701 if (str == NULL)
702 return NULL;
703
704 /* generate replacement */
705 for (i = collstart; i < collend; ++i) {
706 ch = PyUnicode_READ(kind, data, i);
707 *str++ = '\\';
708 if (ch >= 0x00010000) {
709 *str++ = 'U';
710 *str++ = Py_hexdigits[(ch>>28)&0xf];
711 *str++ = Py_hexdigits[(ch>>24)&0xf];
712 *str++ = Py_hexdigits[(ch>>20)&0xf];
713 *str++ = Py_hexdigits[(ch>>16)&0xf];
714 *str++ = Py_hexdigits[(ch>>12)&0xf];
715 *str++ = Py_hexdigits[(ch>>8)&0xf];
716 }
717 else if (ch >= 0x100) {
718 *str++ = 'u';
719 *str++ = Py_hexdigits[(ch>>12)&0xf];
720 *str++ = Py_hexdigits[(ch>>8)&0xf];
721 }
722 else
723 *str++ = 'x';
724 *str++ = Py_hexdigits[(ch>>4)&0xf];
725 *str++ = Py_hexdigits[ch&0xf];
726 }
727 return str;
728 }
729
730 /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
731 ASCII, Latin1, UTF-8, etc. */
732 static char*
xmlcharrefreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)733 xmlcharrefreplace(_PyBytesWriter *writer, char *str,
734 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
735 {
736 Py_ssize_t size, i;
737 Py_UCS4 ch;
738 enum PyUnicode_Kind kind;
739 void *data;
740
741 assert(PyUnicode_IS_READY(unicode));
742 kind = PyUnicode_KIND(unicode);
743 data = PyUnicode_DATA(unicode);
744
745 size = 0;
746 /* determine replacement size */
747 for (i = collstart; i < collend; ++i) {
748 Py_ssize_t incr;
749
750 ch = PyUnicode_READ(kind, data, i);
751 if (ch < 10)
752 incr = 2+1+1;
753 else if (ch < 100)
754 incr = 2+2+1;
755 else if (ch < 1000)
756 incr = 2+3+1;
757 else if (ch < 10000)
758 incr = 2+4+1;
759 else if (ch < 100000)
760 incr = 2+5+1;
761 else if (ch < 1000000)
762 incr = 2+6+1;
763 else {
764 assert(ch <= MAX_UNICODE);
765 incr = 2+7+1;
766 }
767 if (size > PY_SSIZE_T_MAX - incr) {
768 PyErr_SetString(PyExc_OverflowError,
769 "encoded result is too long for a Python string");
770 return NULL;
771 }
772 size += incr;
773 }
774
775 str = _PyBytesWriter_Prepare(writer, str, size);
776 if (str == NULL)
777 return NULL;
778
779 /* generate replacement */
780 for (i = collstart; i < collend; ++i) {
781 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
782 }
783 return str;
784 }
785
786 /* --- Bloom Filters ----------------------------------------------------- */
787
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the lowest
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit
   index. */
791
792 /* the linebreak mask is set up by Unicode_Init below */
793
/* BLOOM_WIDTH is the number of mask bits in use: the largest of 128, 64 and
   32 that does not exceed LONG_BIT.  Compilation fails if LONG_BIT is
   smaller than 32. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif
803
/* Integer type holding a bloom mask. */
#define BLOOM_MASK unsigned long

/* Mask used by BLOOM_LINEBREAK().  It starts with all bits set, so every
   character passes the filter until a real mask is stored here. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by the low bits of `ch` is set in `mask`.
   A zero result means `ch` was not among the characters the mask was built
   from; a non-zero result means it may have been.  `mask` is parenthesized
   so that an expression such as `a | b` can be passed safely. */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Linebreak test: a table lookup for characters below 128, otherwise the
   bloom filter followed by Py_UNICODE_ISLINEBREAK().  `ch` is evaluated
   more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
813
814 static inline BLOOM_MASK
make_bloom_mask(int kind,void * ptr,Py_ssize_t len)815 make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
816 {
817 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
818 do { \
819 TYPE *data = (TYPE *)PTR; \
820 TYPE *end = data + LEN; \
821 Py_UCS4 ch; \
822 for (; data != end; data++) { \
823 ch = *data; \
824 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
825 } \
826 break; \
827 } while (0)
828
829 /* calculate simple bloom-style bitmask for a given unicode string */
830
831 BLOOM_MASK mask;
832
833 mask = 0;
834 switch (kind) {
835 case PyUnicode_1BYTE_KIND:
836 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
837 break;
838 case PyUnicode_2BYTE_KIND:
839 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
840 break;
841 case PyUnicode_4BYTE_KIND:
842 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
843 break;
844 default:
845 Py_UNREACHABLE();
846 }
847 return mask;
848
849 #undef BLOOM_UPDATE
850 }
851
852 static int
ensure_unicode(PyObject * obj)853 ensure_unicode(PyObject *obj)
854 {
855 if (!PyUnicode_Check(obj)) {
856 PyErr_Format(PyExc_TypeError,
857 "must be str, not %.100s",
858 Py_TYPE(obj)->tp_name);
859 return -1;
860 }
861 return PyUnicode_READY(obj);
862 }
863
864 /* Compilation of templated routines */
865
866 #include "stringlib/asciilib.h"
867 #include "stringlib/fastsearch.h"
868 #include "stringlib/partition.h"
869 #include "stringlib/split.h"
870 #include "stringlib/count.h"
871 #include "stringlib/find.h"
872 #include "stringlib/find_max_char.h"
873 #include "stringlib/undef.h"
874
875 #include "stringlib/ucs1lib.h"
876 #include "stringlib/fastsearch.h"
877 #include "stringlib/partition.h"
878 #include "stringlib/split.h"
879 #include "stringlib/count.h"
880 #include "stringlib/find.h"
881 #include "stringlib/replace.h"
882 #include "stringlib/find_max_char.h"
883 #include "stringlib/undef.h"
884
885 #include "stringlib/ucs2lib.h"
886 #include "stringlib/fastsearch.h"
887 #include "stringlib/partition.h"
888 #include "stringlib/split.h"
889 #include "stringlib/count.h"
890 #include "stringlib/find.h"
891 #include "stringlib/replace.h"
892 #include "stringlib/find_max_char.h"
893 #include "stringlib/undef.h"
894
895 #include "stringlib/ucs4lib.h"
896 #include "stringlib/fastsearch.h"
897 #include "stringlib/partition.h"
898 #include "stringlib/split.h"
899 #include "stringlib/count.h"
900 #include "stringlib/find.h"
901 #include "stringlib/replace.h"
902 #include "stringlib/find_max_char.h"
903 #include "stringlib/undef.h"
904
905 #include "stringlib/unicodedefs.h"
906 #include "stringlib/fastsearch.h"
907 #include "stringlib/count.h"
908 #include "stringlib/find.h"
909 #include "stringlib/undef.h"
910
911 /* --- Unicode Object ----------------------------------------------------- */
912
913 static inline Py_ssize_t
findchar(const void * s,int kind,Py_ssize_t size,Py_UCS4 ch,int direction)914 findchar(const void *s, int kind,
915 Py_ssize_t size, Py_UCS4 ch,
916 int direction)
917 {
918 switch (kind) {
919 case PyUnicode_1BYTE_KIND:
920 if ((Py_UCS1) ch != ch)
921 return -1;
922 if (direction > 0)
923 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
924 else
925 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
926 case PyUnicode_2BYTE_KIND:
927 if ((Py_UCS2) ch != ch)
928 return -1;
929 if (direction > 0)
930 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
931 else
932 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
933 case PyUnicode_4BYTE_KIND:
934 if (direction > 0)
935 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
936 else
937 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
938 default:
939 Py_UNREACHABLE();
940 }
941 }
942
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters from index old_length up to the current length are
   overwritten, every byte of them with 0xff; nothing happens if the string
   did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int char_size = PyUnicode_KIND(unicode);   /* bytes per character */
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
961
/* Resize a compact string by reallocating the whole object.

   unicode: a modifiable, ready, compact string (all three asserted)
   length:  new length in characters

   Returns the resized object, which is the value returned by
   PyObject_REALLOC() and may therefore differ from `unicode`.  Returns
   NULL with MemoryError set if the new size would overflow or if the
   reallocation fails. */
static PyObject*
resize_compact(PyObject *unicode, Py_ssize_t length)
{
    Py_ssize_t char_size;
    Py_ssize_t struct_size;
    Py_ssize_t new_size;
    int share_wstr;
    PyObject *new_unicode;
#ifdef Py_DEBUG
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

    assert(unicode_modifiable(unicode));
    assert(PyUnicode_IS_READY(unicode));
    assert(PyUnicode_IS_COMPACT(unicode));

    char_size = PyUnicode_KIND(unicode);
    if (PyUnicode_IS_ASCII(unicode))
        struct_size = sizeof(PyASCIIObject);
    else
        struct_size = sizeof(PyCompactUnicodeObject);
    /* recorded before the reallocation; used below to re-point wstr */
    share_wstr = _PyUnicode_SHARE_WSTR(unicode);

    /* make sure struct_size + (length + 1) * char_size cannot overflow */
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    new_size = (struct_size + (length + 1) * char_size);

    /* drop a separately allocated UTF-8 buffer */
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
        PyObject_DEL(_PyUnicode_UTF8(unicode));
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
    }
    /* NOTE(review): presumably unregisters the object from the reference
       debugging bookkeeping because the reallocation may change its
       address; it is registered again with _Py_NewReference() on both the
       success and the failure path. */
    _Py_DEC_REFTOTAL;
    _Py_ForgetReference(unicode);

    new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
    if (new_unicode == NULL) {
        _Py_NewReference(unicode);
        PyErr_NoMemory();
        return NULL;
    }
    unicode = new_unicode;
    _Py_NewReference(unicode);

    _PyUnicode_LENGTH(unicode) = length;
    if (share_wstr) {
        /* wstr aliased the character data: point it at the new data */
        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = length;
    }
    else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
        /* a separately allocated wstr buffer is dropped */
        PyObject_DEL(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
#ifdef Py_DEBUG
    unicode_fill_invalid(unicode, old_length);
#endif
    /* write the terminating null character */
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
                    length, 0);
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return unicode;
}
1028
/* Resize a non-compact string in place by reallocating its buffers.

   unicode: a non-compact string with a reference count of 1 (both asserted)
   length:  new length in characters

   For a ready string the character data block is reallocated and the wstr
   and utf8 pointers are updated when they alias it.  A separately
   allocated wstr buffer (always the case for a string that is not ready)
   is then reallocated as well.

   Returns 0 on success, -1 with MemoryError set if a size would overflow
   or a reallocation fails. */
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    Py_ssize_t new_size;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        int share_wstr, share_utf8;
        void *data;
#ifdef Py_DEBUG
        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

        data = _PyUnicode_DATA_ANY(unicode);
        char_size = PyUnicode_KIND(unicode);
        /* recorded before the reallocation; used below to re-point the
           aliases */
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);

        /* make sure (length + 1) * char_size cannot overflow */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        new_size = (length + 1) * char_size;

        /* drop a separately allocated UTF-8 buffer */
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_DEL(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        data = (PyObject *)PyObject_REALLOC(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* write the terminating null character */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
#ifdef Py_DEBUG
        unicode_fill_invalid(unicode, old_length);
#endif
        /* done unless a separate wstr buffer has to be resized as well */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    new_size = sizeof(wchar_t) * (length + 1);
    wstr =  _PyUnicode_WSTR(unicode);
    /* the result goes into the local first, so the object keeps its old
       pointer if the reallocation fails */
    wstr = PyObject_REALLOC(wstr, new_size);
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
1107
1108 static PyObject*
resize_copy(PyObject * unicode,Py_ssize_t length)1109 resize_copy(PyObject *unicode, Py_ssize_t length)
1110 {
1111 Py_ssize_t copy_length;
1112 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1113 PyObject *copy;
1114
1115 assert(PyUnicode_IS_READY(unicode));
1116
1117 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1118 if (copy == NULL)
1119 return NULL;
1120
1121 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1122 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1123 return copy;
1124 }
1125 else {
1126 PyObject *w;
1127
1128 w = (PyObject*)_PyUnicode_New(length);
1129 if (w == NULL)
1130 return NULL;
1131 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1132 copy_length = Py_MIN(copy_length, length);
1133 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1134 copy_length * sizeof(wchar_t));
1135 return w;
1136 }
1137 }
1138
/* We allocate one more character (not just one byte) than requested to
   make sure the string is U+0000 terminated; some code (e.g.
   new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
1147
1148 static PyUnicodeObject *
_PyUnicode_New(Py_ssize_t length)1149 _PyUnicode_New(Py_ssize_t length)
1150 {
1151 PyUnicodeObject *unicode;
1152 size_t new_size;
1153
1154 /* Optimization for empty strings */
1155 if (length == 0 && unicode_empty != NULL) {
1156 Py_INCREF(unicode_empty);
1157 return (PyUnicodeObject*)unicode_empty;
1158 }
1159
1160 /* Ensure we won't overflow the size. */
1161 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
1162 return (PyUnicodeObject *)PyErr_NoMemory();
1163 }
1164 if (length < 0) {
1165 PyErr_SetString(PyExc_SystemError,
1166 "Negative size passed to _PyUnicode_New");
1167 return NULL;
1168 }
1169
1170 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1171 if (unicode == NULL)
1172 return NULL;
1173 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
1174
1175 _PyUnicode_WSTR_LENGTH(unicode) = length;
1176 _PyUnicode_HASH(unicode) = -1;
1177 _PyUnicode_STATE(unicode).interned = 0;
1178 _PyUnicode_STATE(unicode).kind = 0;
1179 _PyUnicode_STATE(unicode).compact = 0;
1180 _PyUnicode_STATE(unicode).ready = 0;
1181 _PyUnicode_STATE(unicode).ascii = 0;
1182 _PyUnicode_DATA_ANY(unicode) = NULL;
1183 _PyUnicode_LENGTH(unicode) = 0;
1184 _PyUnicode_UTF8(unicode) = NULL;
1185 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1186
1187 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1188 if (!_PyUnicode_WSTR(unicode)) {
1189 Py_DECREF(unicode);
1190 PyErr_NoMemory();
1191 return NULL;
1192 }
1193
1194 /* Initialize the first element to guard against cases where
1195 * the caller fails before initializing str -- unicode_resize()
1196 * reads str[0], and the Keep-Alive optimization can keep memory
1197 * allocated for str alive across a call to unicode_dealloc(unicode).
1198 * We don't want unicode_resize to read uninitialized memory in
1199 * that case.
1200 */
1201 _PyUnicode_WSTR(unicode)[0] = 0;
1202 _PyUnicode_WSTR(unicode)[length] = 0;
1203
1204 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
1205 return unicode;
1206 }
1207
1208 static const char*
unicode_kind_name(PyObject * unicode)1209 unicode_kind_name(PyObject *unicode)
1210 {
1211 /* don't check consistency: unicode_kind_name() is called from
1212 _PyUnicode_Dump() */
1213 if (!PyUnicode_IS_COMPACT(unicode))
1214 {
1215 if (!PyUnicode_IS_READY(unicode))
1216 return "wstr";
1217 switch (PyUnicode_KIND(unicode))
1218 {
1219 case PyUnicode_1BYTE_KIND:
1220 if (PyUnicode_IS_ASCII(unicode))
1221 return "legacy ascii";
1222 else
1223 return "legacy latin1";
1224 case PyUnicode_2BYTE_KIND:
1225 return "legacy UCS2";
1226 case PyUnicode_4BYTE_KIND:
1227 return "legacy UCS4";
1228 default:
1229 return "<legacy invalid kind>";
1230 }
1231 }
1232 assert(PyUnicode_IS_READY(unicode));
1233 switch (PyUnicode_KIND(unicode)) {
1234 case PyUnicode_1BYTE_KIND:
1235 if (PyUnicode_IS_ASCII(unicode))
1236 return "ascii";
1237 else
1238 return "latin1";
1239 case PyUnicode_2BYTE_KIND:
1240 return "UCS2";
1241 case PyUnicode_4BYTE_KIND:
1242 return "UCS4";
1243 default:
1244 return "<invalid compact kind>";
1245 }
1246 }
1247
1248 #ifdef Py_DEBUG
1249 /* Functions wrapping macros for use in debugger */
_PyUnicode_utf8(void * unicode_raw)1250 char *_PyUnicode_utf8(void *unicode_raw){
1251 PyObject *unicode = _PyObject_CAST(unicode_raw);
1252 return PyUnicode_UTF8(unicode);
1253 }
1254
_PyUnicode_compact_data(void * unicode_raw)1255 void *_PyUnicode_compact_data(void *unicode_raw) {
1256 PyObject *unicode = _PyObject_CAST(unicode_raw);
1257 return _PyUnicode_COMPACT_DATA(unicode);
1258 }
_PyUnicode_data(void * unicode_raw)1259 void *_PyUnicode_data(void *unicode_raw) {
1260 PyObject *unicode = _PyObject_CAST(unicode_raw);
1261 printf("obj %p\n", (void*)unicode);
1262 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1263 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1264 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1265 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1266 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1267 return PyUnicode_DATA(unicode);
1268 }
1269
1270 void
_PyUnicode_Dump(PyObject * op)1271 _PyUnicode_Dump(PyObject *op)
1272 {
1273 PyASCIIObject *ascii = (PyASCIIObject *)op;
1274 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1275 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1276 void *data;
1277
1278 if (ascii->state.compact)
1279 {
1280 if (ascii->state.ascii)
1281 data = (ascii + 1);
1282 else
1283 data = (compact + 1);
1284 }
1285 else
1286 data = unicode->data.any;
1287 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1288 unicode_kind_name(op), ascii->length);
1289
1290 if (ascii->wstr == data)
1291 printf("shared ");
1292 printf("wstr=%p", (void *)ascii->wstr);
1293
1294 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1295 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
1296 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1297 printf("shared ");
1298 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1299 (void *)compact->utf8, compact->utf8_length);
1300 }
1301 printf(", data=%p\n", data);
1302 }
1303 #endif
1304
1305 PyObject *
PyUnicode_New(Py_ssize_t size,Py_UCS4 maxchar)1306 PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1307 {
1308 PyObject *obj;
1309 PyCompactUnicodeObject *unicode;
1310 void *data;
1311 enum PyUnicode_Kind kind;
1312 int is_sharing, is_ascii;
1313 Py_ssize_t char_size;
1314 Py_ssize_t struct_size;
1315
1316 /* Optimization for empty strings */
1317 if (size == 0 && unicode_empty != NULL) {
1318 Py_INCREF(unicode_empty);
1319 return unicode_empty;
1320 }
1321
1322 is_ascii = 0;
1323 is_sharing = 0;
1324 struct_size = sizeof(PyCompactUnicodeObject);
1325 if (maxchar < 128) {
1326 kind = PyUnicode_1BYTE_KIND;
1327 char_size = 1;
1328 is_ascii = 1;
1329 struct_size = sizeof(PyASCIIObject);
1330 }
1331 else if (maxchar < 256) {
1332 kind = PyUnicode_1BYTE_KIND;
1333 char_size = 1;
1334 }
1335 else if (maxchar < 65536) {
1336 kind = PyUnicode_2BYTE_KIND;
1337 char_size = 2;
1338 if (sizeof(wchar_t) == 2)
1339 is_sharing = 1;
1340 }
1341 else {
1342 if (maxchar > MAX_UNICODE) {
1343 PyErr_SetString(PyExc_SystemError,
1344 "invalid maximum character passed to PyUnicode_New");
1345 return NULL;
1346 }
1347 kind = PyUnicode_4BYTE_KIND;
1348 char_size = 4;
1349 if (sizeof(wchar_t) == 4)
1350 is_sharing = 1;
1351 }
1352
1353 /* Ensure we won't overflow the size. */
1354 if (size < 0) {
1355 PyErr_SetString(PyExc_SystemError,
1356 "Negative size passed to PyUnicode_New");
1357 return NULL;
1358 }
1359 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1360 return PyErr_NoMemory();
1361
1362 /* Duplicated allocation code from _PyObject_New() instead of a call to
1363 * PyObject_New() so we are able to allocate space for the object and
1364 * it's data buffer.
1365 */
1366 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1367 if (obj == NULL)
1368 return PyErr_NoMemory();
1369 obj = PyObject_INIT(obj, &PyUnicode_Type);
1370 if (obj == NULL)
1371 return NULL;
1372
1373 unicode = (PyCompactUnicodeObject *)obj;
1374 if (is_ascii)
1375 data = ((PyASCIIObject*)obj) + 1;
1376 else
1377 data = unicode + 1;
1378 _PyUnicode_LENGTH(unicode) = size;
1379 _PyUnicode_HASH(unicode) = -1;
1380 _PyUnicode_STATE(unicode).interned = 0;
1381 _PyUnicode_STATE(unicode).kind = kind;
1382 _PyUnicode_STATE(unicode).compact = 1;
1383 _PyUnicode_STATE(unicode).ready = 1;
1384 _PyUnicode_STATE(unicode).ascii = is_ascii;
1385 if (is_ascii) {
1386 ((char*)data)[size] = 0;
1387 _PyUnicode_WSTR(unicode) = NULL;
1388 }
1389 else if (kind == PyUnicode_1BYTE_KIND) {
1390 ((char*)data)[size] = 0;
1391 _PyUnicode_WSTR(unicode) = NULL;
1392 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1393 unicode->utf8 = NULL;
1394 unicode->utf8_length = 0;
1395 }
1396 else {
1397 unicode->utf8 = NULL;
1398 unicode->utf8_length = 0;
1399 if (kind == PyUnicode_2BYTE_KIND)
1400 ((Py_UCS2*)data)[size] = 0;
1401 else /* kind == PyUnicode_4BYTE_KIND */
1402 ((Py_UCS4*)data)[size] = 0;
1403 if (is_sharing) {
1404 _PyUnicode_WSTR_LENGTH(unicode) = size;
1405 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1406 }
1407 else {
1408 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1409 _PyUnicode_WSTR(unicode) = NULL;
1410 }
1411 }
1412 #ifdef Py_DEBUG
1413 unicode_fill_invalid((PyObject*)unicode, 0);
1414 #endif
1415 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1416 return obj;
1417 }
1418
1419 #if SIZEOF_WCHAR_T == 2
1420 /* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1421 will decode surrogate pairs, the other conversions are implemented as macros
1422 for efficiency.
1423
1424 This function assumes that unicode can hold one more code point than wstr
1425 characters for a terminating null character. */
1426 static void
unicode_convert_wchar_to_ucs4(const wchar_t * begin,const wchar_t * end,PyObject * unicode)1427 unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1428 PyObject *unicode)
1429 {
1430 const wchar_t *iter;
1431 Py_UCS4 *ucs4_out;
1432
1433 assert(unicode != NULL);
1434 assert(_PyUnicode_CHECK(unicode));
1435 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1436 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1437
1438 for (iter = begin; iter < end; ) {
1439 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1440 _PyUnicode_GET_LENGTH(unicode)));
1441 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1442 && (iter+1) < end
1443 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1444 {
1445 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1446 iter += 2;
1447 }
1448 else {
1449 *ucs4_out++ = *iter;
1450 iter++;
1451 }
1452 }
1453 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1454 _PyUnicode_GET_LENGTH(unicode)));
1455
1456 }
1457 #endif
1458
1459 static int
unicode_check_modifiable(PyObject * unicode)1460 unicode_check_modifiable(PyObject *unicode)
1461 {
1462 if (!unicode_modifiable(unicode)) {
1463 PyErr_SetString(PyExc_SystemError,
1464 "Cannot modify a string currently used");
1465 return -1;
1466 }
1467 return 0;
1468 }
1469
1470 static int
_copy_characters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many,int check_maxchar)1471 _copy_characters(PyObject *to, Py_ssize_t to_start,
1472 PyObject *from, Py_ssize_t from_start,
1473 Py_ssize_t how_many, int check_maxchar)
1474 {
1475 unsigned int from_kind, to_kind;
1476 void *from_data, *to_data;
1477
1478 assert(0 <= how_many);
1479 assert(0 <= from_start);
1480 assert(0 <= to_start);
1481 assert(PyUnicode_Check(from));
1482 assert(PyUnicode_IS_READY(from));
1483 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1484
1485 assert(PyUnicode_Check(to));
1486 assert(PyUnicode_IS_READY(to));
1487 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1488
1489 if (how_many == 0)
1490 return 0;
1491
1492 from_kind = PyUnicode_KIND(from);
1493 from_data = PyUnicode_DATA(from);
1494 to_kind = PyUnicode_KIND(to);
1495 to_data = PyUnicode_DATA(to);
1496
1497 #ifdef Py_DEBUG
1498 if (!check_maxchar
1499 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1500 {
1501 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1502 Py_UCS4 ch;
1503 Py_ssize_t i;
1504 for (i=0; i < how_many; i++) {
1505 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1506 assert(ch <= to_maxchar);
1507 }
1508 }
1509 #endif
1510
1511 if (from_kind == to_kind) {
1512 if (check_maxchar
1513 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1514 {
1515 /* Writing Latin-1 characters into an ASCII string requires to
1516 check that all written characters are pure ASCII */
1517 Py_UCS4 max_char;
1518 max_char = ucs1lib_find_max_char(from_data,
1519 (Py_UCS1*)from_data + how_many);
1520 if (max_char >= 128)
1521 return -1;
1522 }
1523 memcpy((char*)to_data + to_kind * to_start,
1524 (char*)from_data + from_kind * from_start,
1525 to_kind * how_many);
1526 }
1527 else if (from_kind == PyUnicode_1BYTE_KIND
1528 && to_kind == PyUnicode_2BYTE_KIND)
1529 {
1530 _PyUnicode_CONVERT_BYTES(
1531 Py_UCS1, Py_UCS2,
1532 PyUnicode_1BYTE_DATA(from) + from_start,
1533 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1534 PyUnicode_2BYTE_DATA(to) + to_start
1535 );
1536 }
1537 else if (from_kind == PyUnicode_1BYTE_KIND
1538 && to_kind == PyUnicode_4BYTE_KIND)
1539 {
1540 _PyUnicode_CONVERT_BYTES(
1541 Py_UCS1, Py_UCS4,
1542 PyUnicode_1BYTE_DATA(from) + from_start,
1543 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1544 PyUnicode_4BYTE_DATA(to) + to_start
1545 );
1546 }
1547 else if (from_kind == PyUnicode_2BYTE_KIND
1548 && to_kind == PyUnicode_4BYTE_KIND)
1549 {
1550 _PyUnicode_CONVERT_BYTES(
1551 Py_UCS2, Py_UCS4,
1552 PyUnicode_2BYTE_DATA(from) + from_start,
1553 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1554 PyUnicode_4BYTE_DATA(to) + to_start
1555 );
1556 }
1557 else {
1558 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1559
1560 if (!check_maxchar) {
1561 if (from_kind == PyUnicode_2BYTE_KIND
1562 && to_kind == PyUnicode_1BYTE_KIND)
1563 {
1564 _PyUnicode_CONVERT_BYTES(
1565 Py_UCS2, Py_UCS1,
1566 PyUnicode_2BYTE_DATA(from) + from_start,
1567 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1568 PyUnicode_1BYTE_DATA(to) + to_start
1569 );
1570 }
1571 else if (from_kind == PyUnicode_4BYTE_KIND
1572 && to_kind == PyUnicode_1BYTE_KIND)
1573 {
1574 _PyUnicode_CONVERT_BYTES(
1575 Py_UCS4, Py_UCS1,
1576 PyUnicode_4BYTE_DATA(from) + from_start,
1577 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1578 PyUnicode_1BYTE_DATA(to) + to_start
1579 );
1580 }
1581 else if (from_kind == PyUnicode_4BYTE_KIND
1582 && to_kind == PyUnicode_2BYTE_KIND)
1583 {
1584 _PyUnicode_CONVERT_BYTES(
1585 Py_UCS4, Py_UCS2,
1586 PyUnicode_4BYTE_DATA(from) + from_start,
1587 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1588 PyUnicode_2BYTE_DATA(to) + to_start
1589 );
1590 }
1591 else {
1592 Py_UNREACHABLE();
1593 }
1594 }
1595 else {
1596 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1597 Py_UCS4 ch;
1598 Py_ssize_t i;
1599
1600 for (i=0; i < how_many; i++) {
1601 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1602 if (ch > to_maxchar)
1603 return -1;
1604 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1605 }
1606 }
1607 }
1608 return 0;
1609 }
1610
1611 void
_PyUnicode_FastCopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1612 _PyUnicode_FastCopyCharacters(
1613 PyObject *to, Py_ssize_t to_start,
1614 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1615 {
1616 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1617 }
1618
1619 Py_ssize_t
PyUnicode_CopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1620 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1621 PyObject *from, Py_ssize_t from_start,
1622 Py_ssize_t how_many)
1623 {
1624 int err;
1625
1626 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1627 PyErr_BadInternalCall();
1628 return -1;
1629 }
1630
1631 if (PyUnicode_READY(from) == -1)
1632 return -1;
1633 if (PyUnicode_READY(to) == -1)
1634 return -1;
1635
1636 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1637 PyErr_SetString(PyExc_IndexError, "string index out of range");
1638 return -1;
1639 }
1640 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1641 PyErr_SetString(PyExc_IndexError, "string index out of range");
1642 return -1;
1643 }
1644 if (how_many < 0) {
1645 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1646 return -1;
1647 }
1648 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1649 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1650 PyErr_Format(PyExc_SystemError,
1651 "Cannot write %zi characters at %zi "
1652 "in a string of %zi characters",
1653 how_many, to_start, PyUnicode_GET_LENGTH(to));
1654 return -1;
1655 }
1656
1657 if (how_many == 0)
1658 return 0;
1659
1660 if (unicode_check_modifiable(to))
1661 return -1;
1662
1663 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1664 if (err) {
1665 PyErr_Format(PyExc_SystemError,
1666 "Cannot copy %s characters "
1667 "into a string of %s characters",
1668 unicode_kind_name(from),
1669 unicode_kind_name(to));
1670 return -1;
1671 }
1672 return how_many;
1673 }
1674
1675 /* Find the maximum code point and count the number of surrogate pairs so a
1676 correct string length can be computed before converting a string to UCS4.
1677 This function counts single surrogates as a character and not as a pair.
1678
1679 Return 0 on success, or -1 on error. */
1680 static int
find_maxchar_surrogates(const wchar_t * begin,const wchar_t * end,Py_UCS4 * maxchar,Py_ssize_t * num_surrogates)1681 find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1682 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1683 {
1684 const wchar_t *iter;
1685 Py_UCS4 ch;
1686
1687 assert(num_surrogates != NULL && maxchar != NULL);
1688 *num_surrogates = 0;
1689 *maxchar = 0;
1690
1691 for (iter = begin; iter < end; ) {
1692 #if SIZEOF_WCHAR_T == 2
1693 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1694 && (iter+1) < end
1695 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1696 {
1697 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1698 ++(*num_surrogates);
1699 iter += 2;
1700 }
1701 else
1702 #endif
1703 {
1704 ch = *iter;
1705 iter++;
1706 }
1707 if (ch > *maxchar) {
1708 *maxchar = ch;
1709 if (*maxchar > MAX_UNICODE) {
1710 PyErr_Format(PyExc_ValueError,
1711 "character U+%x is not in range [U+0000; U+%x]",
1712 ch, MAX_UNICODE);
1713 return -1;
1714 }
1715 }
1716 }
1717 return 0;
1718 }
1719
1720 int
_PyUnicode_Ready(PyObject * unicode)1721 _PyUnicode_Ready(PyObject *unicode)
1722 {
1723 wchar_t *end;
1724 Py_UCS4 maxchar = 0;
1725 Py_ssize_t num_surrogates;
1726 #if SIZEOF_WCHAR_T == 2
1727 Py_ssize_t length_wo_surrogates;
1728 #endif
1729
1730 /* _PyUnicode_Ready() is only intended for old-style API usage where
1731 strings were created using _PyObject_New() and where no canonical
1732 representation (the str field) has been set yet aka strings
1733 which are not yet ready. */
1734 assert(_PyUnicode_CHECK(unicode));
1735 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1736 assert(_PyUnicode_WSTR(unicode) != NULL);
1737 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1738 assert(_PyUnicode_UTF8(unicode) == NULL);
1739 /* Actually, it should neither be interned nor be anything else: */
1740 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1741
1742 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1743 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1744 &maxchar, &num_surrogates) == -1)
1745 return -1;
1746
1747 if (maxchar < 256) {
1748 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1749 if (!_PyUnicode_DATA_ANY(unicode)) {
1750 PyErr_NoMemory();
1751 return -1;
1752 }
1753 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1754 _PyUnicode_WSTR(unicode), end,
1755 PyUnicode_1BYTE_DATA(unicode));
1756 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1757 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1758 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1759 if (maxchar < 128) {
1760 _PyUnicode_STATE(unicode).ascii = 1;
1761 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1762 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1763 }
1764 else {
1765 _PyUnicode_STATE(unicode).ascii = 0;
1766 _PyUnicode_UTF8(unicode) = NULL;
1767 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1768 }
1769 PyObject_FREE(_PyUnicode_WSTR(unicode));
1770 _PyUnicode_WSTR(unicode) = NULL;
1771 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1772 }
1773 /* In this case we might have to convert down from 4-byte native
1774 wchar_t to 2-byte unicode. */
1775 else if (maxchar < 65536) {
1776 assert(num_surrogates == 0 &&
1777 "FindMaxCharAndNumSurrogatePairs() messed up");
1778
1779 #if SIZEOF_WCHAR_T == 2
1780 /* We can share representations and are done. */
1781 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1782 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1783 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1784 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1785 _PyUnicode_UTF8(unicode) = NULL;
1786 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1787 #else
1788 /* sizeof(wchar_t) == 4 */
1789 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1790 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1791 if (!_PyUnicode_DATA_ANY(unicode)) {
1792 PyErr_NoMemory();
1793 return -1;
1794 }
1795 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1796 _PyUnicode_WSTR(unicode), end,
1797 PyUnicode_2BYTE_DATA(unicode));
1798 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1799 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1800 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1801 _PyUnicode_UTF8(unicode) = NULL;
1802 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1803 PyObject_FREE(_PyUnicode_WSTR(unicode));
1804 _PyUnicode_WSTR(unicode) = NULL;
1805 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1806 #endif
1807 }
1808 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1809 else {
1810 #if SIZEOF_WCHAR_T == 2
1811 /* in case the native representation is 2-bytes, we need to allocate a
1812 new normalized 4-byte version. */
1813 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1814 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1815 PyErr_NoMemory();
1816 return -1;
1817 }
1818 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1819 if (!_PyUnicode_DATA_ANY(unicode)) {
1820 PyErr_NoMemory();
1821 return -1;
1822 }
1823 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1824 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1825 _PyUnicode_UTF8(unicode) = NULL;
1826 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1827 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1828 _PyUnicode_STATE(unicode).ready = 1;
1829 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1830 PyObject_FREE(_PyUnicode_WSTR(unicode));
1831 _PyUnicode_WSTR(unicode) = NULL;
1832 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1833 #else
1834 assert(num_surrogates == 0);
1835
1836 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1837 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1838 _PyUnicode_UTF8(unicode) = NULL;
1839 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1840 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1841 #endif
1842 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1843 }
1844 _PyUnicode_STATE(unicode).ready = 1;
1845 assert(_PyUnicode_CheckConsistency(unicode, 1));
1846 return 0;
1847 }
1848
1849 static void
unicode_dealloc(PyObject * unicode)1850 unicode_dealloc(PyObject *unicode)
1851 {
1852 switch (PyUnicode_CHECK_INTERNED(unicode)) {
1853 case SSTATE_NOT_INTERNED:
1854 break;
1855
1856 case SSTATE_INTERNED_MORTAL:
1857 /* revive dead object temporarily for DelItem */
1858 Py_REFCNT(unicode) = 3;
1859 if (PyDict_DelItem(interned, unicode) != 0)
1860 Py_FatalError(
1861 "deletion of interned string failed");
1862 break;
1863
1864 case SSTATE_INTERNED_IMMORTAL:
1865 Py_FatalError("Immortal interned string died.");
1866 /* fall through */
1867
1868 default:
1869 Py_FatalError("Inconsistent interned string state.");
1870 }
1871
1872 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1873 PyObject_DEL(_PyUnicode_WSTR(unicode));
1874 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1875 PyObject_DEL(_PyUnicode_UTF8(unicode));
1876 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1877 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1878
1879 Py_TYPE(unicode)->tp_free(unicode);
1880 }
1881
1882 #ifdef Py_DEBUG
1883 static int
unicode_is_singleton(PyObject * unicode)1884 unicode_is_singleton(PyObject *unicode)
1885 {
1886 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1887 if (unicode == unicode_empty)
1888 return 1;
1889 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1890 {
1891 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1892 if (ch < 256 && unicode_latin1[ch] == unicode)
1893 return 1;
1894 }
1895 return 0;
1896 }
1897 #endif
1898
1899 static int
unicode_modifiable(PyObject * unicode)1900 unicode_modifiable(PyObject *unicode)
1901 {
1902 assert(_PyUnicode_CHECK(unicode));
1903 if (Py_REFCNT(unicode) != 1)
1904 return 0;
1905 if (_PyUnicode_HASH(unicode) != -1)
1906 return 0;
1907 if (PyUnicode_CHECK_INTERNED(unicode))
1908 return 0;
1909 if (!PyUnicode_CheckExact(unicode))
1910 return 0;
1911 #ifdef Py_DEBUG
1912 /* singleton refcount is greater than 1 */
1913 assert(!unicode_is_singleton(unicode));
1914 #endif
1915 return 1;
1916 }
1917
1918 static int
unicode_resize(PyObject ** p_unicode,Py_ssize_t length)1919 unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1920 {
1921 PyObject *unicode;
1922 Py_ssize_t old_length;
1923
1924 assert(p_unicode != NULL);
1925 unicode = *p_unicode;
1926
1927 assert(unicode != NULL);
1928 assert(PyUnicode_Check(unicode));
1929 assert(0 <= length);
1930
1931 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1932 old_length = PyUnicode_WSTR_LENGTH(unicode);
1933 else
1934 old_length = PyUnicode_GET_LENGTH(unicode);
1935 if (old_length == length)
1936 return 0;
1937
1938 if (length == 0) {
1939 _Py_INCREF_UNICODE_EMPTY();
1940 if (!unicode_empty)
1941 return -1;
1942 Py_SETREF(*p_unicode, unicode_empty);
1943 return 0;
1944 }
1945
1946 if (!unicode_modifiable(unicode)) {
1947 PyObject *copy = resize_copy(unicode, length);
1948 if (copy == NULL)
1949 return -1;
1950 Py_SETREF(*p_unicode, copy);
1951 return 0;
1952 }
1953
1954 if (PyUnicode_IS_COMPACT(unicode)) {
1955 PyObject *new_unicode = resize_compact(unicode, length);
1956 if (new_unicode == NULL)
1957 return -1;
1958 *p_unicode = new_unicode;
1959 return 0;
1960 }
1961 return resize_inplace(unicode, length);
1962 }
1963
1964 int
PyUnicode_Resize(PyObject ** p_unicode,Py_ssize_t length)1965 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1966 {
1967 PyObject *unicode;
1968 if (p_unicode == NULL) {
1969 PyErr_BadInternalCall();
1970 return -1;
1971 }
1972 unicode = *p_unicode;
1973 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1974 {
1975 PyErr_BadInternalCall();
1976 return -1;
1977 }
1978 return unicode_resize(p_unicode, length);
1979 }
1980
1981 /* Copy an ASCII or latin1 char* string into a Python Unicode string.
1982
1983 WARNING: The function doesn't copy the terminating null character and
1984 doesn't check the maximum character (may write a latin1 character in an
1985 ASCII string). */
1986 static void
unicode_write_cstr(PyObject * unicode,Py_ssize_t index,const char * str,Py_ssize_t len)1987 unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1988 const char *str, Py_ssize_t len)
1989 {
1990 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1991 void *data = PyUnicode_DATA(unicode);
1992 const char *end = str + len;
1993
1994 switch (kind) {
1995 case PyUnicode_1BYTE_KIND: {
1996 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1997 #ifdef Py_DEBUG
1998 if (PyUnicode_IS_ASCII(unicode)) {
1999 Py_UCS4 maxchar = ucs1lib_find_max_char(
2000 (const Py_UCS1*)str,
2001 (const Py_UCS1*)str + len);
2002 assert(maxchar < 128);
2003 }
2004 #endif
2005 memcpy((char *) data + index, str, len);
2006 break;
2007 }
2008 case PyUnicode_2BYTE_KIND: {
2009 Py_UCS2 *start = (Py_UCS2 *)data + index;
2010 Py_UCS2 *ucs2 = start;
2011 assert(index <= PyUnicode_GET_LENGTH(unicode));
2012
2013 for (; str < end; ++ucs2, ++str)
2014 *ucs2 = (Py_UCS2)*str;
2015
2016 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
2017 break;
2018 }
2019 default: {
2020 Py_UCS4 *start = (Py_UCS4 *)data + index;
2021 Py_UCS4 *ucs4 = start;
2022 assert(kind == PyUnicode_4BYTE_KIND);
2023 assert(index <= PyUnicode_GET_LENGTH(unicode));
2024
2025 for (; str < end; ++ucs4, ++str)
2026 *ucs4 = (Py_UCS4)*str;
2027
2028 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
2029 }
2030 }
2031 }
2032
2033 static PyObject*
get_latin1_char(unsigned char ch)2034 get_latin1_char(unsigned char ch)
2035 {
2036 PyObject *unicode = unicode_latin1[ch];
2037 if (!unicode) {
2038 unicode = PyUnicode_New(1, ch);
2039 if (!unicode)
2040 return NULL;
2041 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2042 assert(_PyUnicode_CheckConsistency(unicode, 1));
2043 unicode_latin1[ch] = unicode;
2044 }
2045 Py_INCREF(unicode);
2046 return unicode;
2047 }
2048
2049 static PyObject*
unicode_char(Py_UCS4 ch)2050 unicode_char(Py_UCS4 ch)
2051 {
2052 PyObject *unicode;
2053
2054 assert(ch <= MAX_UNICODE);
2055
2056 if (ch < 256)
2057 return get_latin1_char(ch);
2058
2059 unicode = PyUnicode_New(1, ch);
2060 if (unicode == NULL)
2061 return NULL;
2062
2063 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2064 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
2065 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
2066 } else {
2067 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2068 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2069 }
2070 assert(_PyUnicode_CheckConsistency(unicode, 1));
2071 return unicode;
2072 }
2073
2074 PyObject *
PyUnicode_FromUnicode(const Py_UNICODE * u,Py_ssize_t size)2075 PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
2076 {
2077 if (u == NULL)
2078 return (PyObject*)_PyUnicode_New(size);
2079
2080 if (size < 0) {
2081 PyErr_BadInternalCall();
2082 return NULL;
2083 }
2084
2085 return PyUnicode_FromWideChar(u, size);
2086 }
2087
2088 PyObject *
PyUnicode_FromWideChar(const wchar_t * u,Py_ssize_t size)2089 PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2090 {
2091 PyObject *unicode;
2092 Py_UCS4 maxchar = 0;
2093 Py_ssize_t num_surrogates;
2094
2095 if (u == NULL && size != 0) {
2096 PyErr_BadInternalCall();
2097 return NULL;
2098 }
2099
2100 if (size == -1) {
2101 size = wcslen(u);
2102 }
2103
2104 /* If the Unicode data is known at construction time, we can apply
2105 some optimizations which share commonly used objects. */
2106
2107 /* Optimization for empty strings */
2108 if (size == 0)
2109 _Py_RETURN_UNICODE_EMPTY();
2110
2111 /* Single character Unicode objects in the Latin-1 range are
2112 shared when using this constructor */
2113 if (size == 1 && (Py_UCS4)*u < 256)
2114 return get_latin1_char((unsigned char)*u);
2115
2116 /* If not empty and not single character, copy the Unicode data
2117 into the new object */
2118 if (find_maxchar_surrogates(u, u + size,
2119 &maxchar, &num_surrogates) == -1)
2120 return NULL;
2121
2122 unicode = PyUnicode_New(size - num_surrogates, maxchar);
2123 if (!unicode)
2124 return NULL;
2125
2126 switch (PyUnicode_KIND(unicode)) {
2127 case PyUnicode_1BYTE_KIND:
2128 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
2129 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2130 break;
2131 case PyUnicode_2BYTE_KIND:
2132 #if Py_UNICODE_SIZE == 2
2133 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2134 #else
2135 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
2136 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2137 #endif
2138 break;
2139 case PyUnicode_4BYTE_KIND:
2140 #if SIZEOF_WCHAR_T == 2
2141 /* This is the only case which has to process surrogates, thus
2142 a simple copy loop is not enough and we need a function. */
2143 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2144 #else
2145 assert(num_surrogates == 0);
2146 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2147 #endif
2148 break;
2149 default:
2150 Py_UNREACHABLE();
2151 }
2152
2153 return unicode_result(unicode);
2154 }
2155
2156 PyObject *
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)2157 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2158 {
2159 if (size < 0) {
2160 PyErr_SetString(PyExc_SystemError,
2161 "Negative size passed to PyUnicode_FromStringAndSize");
2162 return NULL;
2163 }
2164 if (u != NULL)
2165 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2166 else
2167 return (PyObject *)_PyUnicode_New(size);
2168 }
2169
2170 PyObject *
PyUnicode_FromString(const char * u)2171 PyUnicode_FromString(const char *u)
2172 {
2173 size_t size = strlen(u);
2174 if (size > PY_SSIZE_T_MAX) {
2175 PyErr_SetString(PyExc_OverflowError, "input too long");
2176 return NULL;
2177 }
2178 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2179 }
2180
2181 PyObject *
_PyUnicode_FromId(_Py_Identifier * id)2182 _PyUnicode_FromId(_Py_Identifier *id)
2183 {
2184 if (!id->object) {
2185 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2186 strlen(id->string),
2187 NULL, NULL);
2188 if (!id->object)
2189 return NULL;
2190 PyUnicode_InternInPlace(&id->object);
2191 assert(!id->next);
2192 id->next = static_strings;
2193 static_strings = id;
2194 }
2195 return id->object;
2196 }
2197
2198 void
_PyUnicode_ClearStaticStrings()2199 _PyUnicode_ClearStaticStrings()
2200 {
2201 _Py_Identifier *tmp, *s = static_strings;
2202 while (s) {
2203 Py_CLEAR(s->object);
2204 tmp = s->next;
2205 s->next = NULL;
2206 s = tmp;
2207 }
2208 static_strings = NULL;
2209 }
2210
2211 /* Internal function, doesn't check maximum character */
2212
2213 PyObject*
_PyUnicode_FromASCII(const char * buffer,Py_ssize_t size)2214 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2215 {
2216 const unsigned char *s = (const unsigned char *)buffer;
2217 PyObject *unicode;
2218 if (size == 1) {
2219 #ifdef Py_DEBUG
2220 assert((unsigned char)s[0] < 128);
2221 #endif
2222 return get_latin1_char(s[0]);
2223 }
2224 unicode = PyUnicode_New(size, 127);
2225 if (!unicode)
2226 return NULL;
2227 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2228 assert(_PyUnicode_CheckConsistency(unicode, 1));
2229 return unicode;
2230 }
2231
2232 static Py_UCS4
kind_maxchar_limit(unsigned int kind)2233 kind_maxchar_limit(unsigned int kind)
2234 {
2235 switch (kind) {
2236 case PyUnicode_1BYTE_KIND:
2237 return 0x80;
2238 case PyUnicode_2BYTE_KIND:
2239 return 0x100;
2240 case PyUnicode_4BYTE_KIND:
2241 return 0x10000;
2242 default:
2243 Py_UNREACHABLE();
2244 }
2245 }
2246
2247 static PyObject*
_PyUnicode_FromUCS1(const Py_UCS1 * u,Py_ssize_t size)2248 _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2249 {
2250 PyObject *res;
2251 unsigned char max_char;
2252
2253 if (size == 0)
2254 _Py_RETURN_UNICODE_EMPTY();
2255 assert(size > 0);
2256 if (size == 1)
2257 return get_latin1_char(u[0]);
2258
2259 max_char = ucs1lib_find_max_char(u, u + size);
2260 res = PyUnicode_New(size, max_char);
2261 if (!res)
2262 return NULL;
2263 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2264 assert(_PyUnicode_CheckConsistency(res, 1));
2265 return res;
2266 }
2267
2268 static PyObject*
_PyUnicode_FromUCS2(const Py_UCS2 * u,Py_ssize_t size)2269 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2270 {
2271 PyObject *res;
2272 Py_UCS2 max_char;
2273
2274 if (size == 0)
2275 _Py_RETURN_UNICODE_EMPTY();
2276 assert(size > 0);
2277 if (size == 1)
2278 return unicode_char(u[0]);
2279
2280 max_char = ucs2lib_find_max_char(u, u + size);
2281 res = PyUnicode_New(size, max_char);
2282 if (!res)
2283 return NULL;
2284 if (max_char >= 256)
2285 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2286 else {
2287 _PyUnicode_CONVERT_BYTES(
2288 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2289 }
2290 assert(_PyUnicode_CheckConsistency(res, 1));
2291 return res;
2292 }
2293
2294 static PyObject*
_PyUnicode_FromUCS4(const Py_UCS4 * u,Py_ssize_t size)2295 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2296 {
2297 PyObject *res;
2298 Py_UCS4 max_char;
2299
2300 if (size == 0)
2301 _Py_RETURN_UNICODE_EMPTY();
2302 assert(size > 0);
2303 if (size == 1)
2304 return unicode_char(u[0]);
2305
2306 max_char = ucs4lib_find_max_char(u, u + size);
2307 res = PyUnicode_New(size, max_char);
2308 if (!res)
2309 return NULL;
2310 if (max_char < 256)
2311 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2312 PyUnicode_1BYTE_DATA(res));
2313 else if (max_char < 0x10000)
2314 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2315 PyUnicode_2BYTE_DATA(res));
2316 else
2317 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2318 assert(_PyUnicode_CheckConsistency(res, 1));
2319 return res;
2320 }
2321
2322 PyObject*
PyUnicode_FromKindAndData(int kind,const void * buffer,Py_ssize_t size)2323 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2324 {
2325 if (size < 0) {
2326 PyErr_SetString(PyExc_ValueError, "size must be positive");
2327 return NULL;
2328 }
2329 switch (kind) {
2330 case PyUnicode_1BYTE_KIND:
2331 return _PyUnicode_FromUCS1(buffer, size);
2332 case PyUnicode_2BYTE_KIND:
2333 return _PyUnicode_FromUCS2(buffer, size);
2334 case PyUnicode_4BYTE_KIND:
2335 return _PyUnicode_FromUCS4(buffer, size);
2336 default:
2337 PyErr_SetString(PyExc_SystemError, "invalid kind");
2338 return NULL;
2339 }
2340 }
2341
2342 Py_UCS4
_PyUnicode_FindMaxChar(PyObject * unicode,Py_ssize_t start,Py_ssize_t end)2343 _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2344 {
2345 enum PyUnicode_Kind kind;
2346 void *startptr, *endptr;
2347
2348 assert(PyUnicode_IS_READY(unicode));
2349 assert(0 <= start);
2350 assert(end <= PyUnicode_GET_LENGTH(unicode));
2351 assert(start <= end);
2352
2353 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2354 return PyUnicode_MAX_CHAR_VALUE(unicode);
2355
2356 if (start == end)
2357 return 127;
2358
2359 if (PyUnicode_IS_ASCII(unicode))
2360 return 127;
2361
2362 kind = PyUnicode_KIND(unicode);
2363 startptr = PyUnicode_DATA(unicode);
2364 endptr = (char *)startptr + end * kind;
2365 startptr = (char *)startptr + start * kind;
2366 switch(kind) {
2367 case PyUnicode_1BYTE_KIND:
2368 return ucs1lib_find_max_char(startptr, endptr);
2369 case PyUnicode_2BYTE_KIND:
2370 return ucs2lib_find_max_char(startptr, endptr);
2371 case PyUnicode_4BYTE_KIND:
2372 return ucs4lib_find_max_char(startptr, endptr);
2373 default:
2374 Py_UNREACHABLE();
2375 }
2376 }
2377
2378 /* Ensure that a string uses the most efficient storage, if it is not the
2379 case: create a new string with of the right kind. Write NULL into *p_unicode
2380 on error. */
2381 static void
unicode_adjust_maxchar(PyObject ** p_unicode)2382 unicode_adjust_maxchar(PyObject **p_unicode)
2383 {
2384 PyObject *unicode, *copy;
2385 Py_UCS4 max_char;
2386 Py_ssize_t len;
2387 unsigned int kind;
2388
2389 assert(p_unicode != NULL);
2390 unicode = *p_unicode;
2391 assert(PyUnicode_IS_READY(unicode));
2392 if (PyUnicode_IS_ASCII(unicode))
2393 return;
2394
2395 len = PyUnicode_GET_LENGTH(unicode);
2396 kind = PyUnicode_KIND(unicode);
2397 if (kind == PyUnicode_1BYTE_KIND) {
2398 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2399 max_char = ucs1lib_find_max_char(u, u + len);
2400 if (max_char >= 128)
2401 return;
2402 }
2403 else if (kind == PyUnicode_2BYTE_KIND) {
2404 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2405 max_char = ucs2lib_find_max_char(u, u + len);
2406 if (max_char >= 256)
2407 return;
2408 }
2409 else {
2410 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2411 assert(kind == PyUnicode_4BYTE_KIND);
2412 max_char = ucs4lib_find_max_char(u, u + len);
2413 if (max_char >= 0x10000)
2414 return;
2415 }
2416 copy = PyUnicode_New(len, max_char);
2417 if (copy != NULL)
2418 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2419 Py_DECREF(unicode);
2420 *p_unicode = copy;
2421 }
2422
2423 PyObject*
_PyUnicode_Copy(PyObject * unicode)2424 _PyUnicode_Copy(PyObject *unicode)
2425 {
2426 Py_ssize_t length;
2427 PyObject *copy;
2428
2429 if (!PyUnicode_Check(unicode)) {
2430 PyErr_BadInternalCall();
2431 return NULL;
2432 }
2433 if (PyUnicode_READY(unicode) == -1)
2434 return NULL;
2435
2436 length = PyUnicode_GET_LENGTH(unicode);
2437 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2438 if (!copy)
2439 return NULL;
2440 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2441
2442 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2443 length * PyUnicode_KIND(unicode));
2444 assert(_PyUnicode_CheckConsistency(copy, 1));
2445 return copy;
2446 }
2447
2448
2449 /* Widen Unicode objects to larger buffers. Don't write terminating null
2450 character. Return NULL on error. */
2451
2452 void*
_PyUnicode_AsKind(PyObject * s,unsigned int kind)2453 _PyUnicode_AsKind(PyObject *s, unsigned int kind)
2454 {
2455 Py_ssize_t len;
2456 void *result;
2457 unsigned int skind;
2458
2459 if (PyUnicode_READY(s) == -1)
2460 return NULL;
2461
2462 len = PyUnicode_GET_LENGTH(s);
2463 skind = PyUnicode_KIND(s);
2464 if (skind >= kind) {
2465 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2466 return NULL;
2467 }
2468 switch (kind) {
2469 case PyUnicode_2BYTE_KIND:
2470 result = PyMem_New(Py_UCS2, len);
2471 if (!result)
2472 return PyErr_NoMemory();
2473 assert(skind == PyUnicode_1BYTE_KIND);
2474 _PyUnicode_CONVERT_BYTES(
2475 Py_UCS1, Py_UCS2,
2476 PyUnicode_1BYTE_DATA(s),
2477 PyUnicode_1BYTE_DATA(s) + len,
2478 result);
2479 return result;
2480 case PyUnicode_4BYTE_KIND:
2481 result = PyMem_New(Py_UCS4, len);
2482 if (!result)
2483 return PyErr_NoMemory();
2484 if (skind == PyUnicode_2BYTE_KIND) {
2485 _PyUnicode_CONVERT_BYTES(
2486 Py_UCS2, Py_UCS4,
2487 PyUnicode_2BYTE_DATA(s),
2488 PyUnicode_2BYTE_DATA(s) + len,
2489 result);
2490 }
2491 else {
2492 assert(skind == PyUnicode_1BYTE_KIND);
2493 _PyUnicode_CONVERT_BYTES(
2494 Py_UCS1, Py_UCS4,
2495 PyUnicode_1BYTE_DATA(s),
2496 PyUnicode_1BYTE_DATA(s) + len,
2497 result);
2498 }
2499 return result;
2500 default:
2501 break;
2502 }
2503 PyErr_SetString(PyExc_SystemError, "invalid kind");
2504 return NULL;
2505 }
2506
2507 static Py_UCS4*
as_ucs4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2508 as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2509 int copy_null)
2510 {
2511 int kind;
2512 void *data;
2513 Py_ssize_t len, targetlen;
2514 if (PyUnicode_READY(string) == -1)
2515 return NULL;
2516 kind = PyUnicode_KIND(string);
2517 data = PyUnicode_DATA(string);
2518 len = PyUnicode_GET_LENGTH(string);
2519 targetlen = len;
2520 if (copy_null)
2521 targetlen++;
2522 if (!target) {
2523 target = PyMem_New(Py_UCS4, targetlen);
2524 if (!target) {
2525 PyErr_NoMemory();
2526 return NULL;
2527 }
2528 }
2529 else {
2530 if (targetsize < targetlen) {
2531 PyErr_Format(PyExc_SystemError,
2532 "string is longer than the buffer");
2533 if (copy_null && 0 < targetsize)
2534 target[0] = 0;
2535 return NULL;
2536 }
2537 }
2538 if (kind == PyUnicode_1BYTE_KIND) {
2539 Py_UCS1 *start = (Py_UCS1 *) data;
2540 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2541 }
2542 else if (kind == PyUnicode_2BYTE_KIND) {
2543 Py_UCS2 *start = (Py_UCS2 *) data;
2544 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2545 }
2546 else {
2547 assert(kind == PyUnicode_4BYTE_KIND);
2548 memcpy(target, data, len * sizeof(Py_UCS4));
2549 }
2550 if (copy_null)
2551 target[len] = 0;
2552 return target;
2553 }
2554
2555 Py_UCS4*
PyUnicode_AsUCS4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2556 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2557 int copy_null)
2558 {
2559 if (target == NULL || targetsize < 0) {
2560 PyErr_BadInternalCall();
2561 return NULL;
2562 }
2563 return as_ucs4(string, target, targetsize, copy_null);
2564 }
2565
2566 Py_UCS4*
PyUnicode_AsUCS4Copy(PyObject * string)2567 PyUnicode_AsUCS4Copy(PyObject *string)
2568 {
2569 return as_ucs4(string, NULL, 0, 1);
2570 }
2571
/* Maximum number of characters required for the output of %lld or %p;
   used below to size the scratch buffers that sprintf() writes into.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256), and
   "(x*53-1) / 22 + 1" is the integer form of ceil(x*53/22). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2576
2577 static int
unicode_fromformat_write_str(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t width,Py_ssize_t precision)2578 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2579 Py_ssize_t width, Py_ssize_t precision)
2580 {
2581 Py_ssize_t length, fill, arglen;
2582 Py_UCS4 maxchar;
2583
2584 if (PyUnicode_READY(str) == -1)
2585 return -1;
2586
2587 length = PyUnicode_GET_LENGTH(str);
2588 if ((precision == -1 || precision >= length)
2589 && width <= length)
2590 return _PyUnicodeWriter_WriteStr(writer, str);
2591
2592 if (precision != -1)
2593 length = Py_MIN(precision, length);
2594
2595 arglen = Py_MAX(length, width);
2596 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2597 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2598 else
2599 maxchar = writer->maxchar;
2600
2601 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2602 return -1;
2603
2604 if (width > length) {
2605 fill = width - length;
2606 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2607 return -1;
2608 writer->pos += fill;
2609 }
2610
2611 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2612 str, 0, length);
2613 writer->pos += length;
2614 return 0;
2615 }
2616
2617 static int
unicode_fromformat_write_cstr(_PyUnicodeWriter * writer,const char * str,Py_ssize_t width,Py_ssize_t precision)2618 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2619 Py_ssize_t width, Py_ssize_t precision)
2620 {
2621 /* UTF-8 */
2622 Py_ssize_t length;
2623 PyObject *unicode;
2624 int res;
2625
2626 if (precision == -1) {
2627 length = strlen(str);
2628 }
2629 else {
2630 length = 0;
2631 while (length < precision && str[length]) {
2632 length++;
2633 }
2634 }
2635 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2636 if (unicode == NULL)
2637 return -1;
2638
2639 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2640 Py_DECREF(unicode);
2641 return res;
2642 }
2643
/* Format a single conversion specification for PyUnicode_FromFormatV().

   `f` points at the '%' that starts the specification.  The optional '0'
   flag, width, precision and l/ll/z length modifier are parsed, the
   matching argument(s) are fetched from `*vargs`, and the formatted text
   is appended to `writer`.

   Return a pointer to the first character after the specification, or NULL
   on error.  For an unknown conversion character the rest of the format
   string (from the '%' on) is copied verbatim and a pointer to its
   terminating null byte is returned. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;
    Py_ssize_t precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    /* remember where the specification starts (needed by the default case) */
    p = f;
    f++;
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* refuse the next digit if width*10 + digit would overflow */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* same overflow guard as for the width */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %li, %lld, %llu and %lli. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* the conversion character is the last character of the format string:
       nothing follows, so stop overallocating the output buffer */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        /* a single character given as an int code point */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        /* Format the bare number with the C library first; width and
           precision are applied by hand afterwards. */
        if (*f == 'u') {
            if (longflag)
                len = sprintf(buffer, "%lu",
                        va_arg(*vargs, unsigned long));
            else if (longlongflag)
                len = sprintf(buffer, "%llu",
                        va_arg(*vargs, unsigned long long));
            else if (size_tflag)
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
                        va_arg(*vargs, size_t));
            else
                len = sprintf(buffer, "%u",
                        va_arg(*vargs, unsigned int));
        }
        else if (*f == 'x') {
            /* the length flags are only ever set for d/u/i (see above),
               so %x always takes an int */
            len = sprintf(buffer, "%x", va_arg(*vargs, int));
        }
        else {
            if (longflag)
                len = sprintf(buffer, "%li",
                        va_arg(*vargs, long));
            else if (longlongflag)
                len = sprintf(buffer, "%lli",
                        va_arg(*vargs, long long));
            else if (size_tflag)
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
                        va_arg(*vargs, Py_ssize_t));
            else
                len = sprintf(buffer, "%i",
                        va_arg(*vargs, int));
        }
        assert(len >= 0);

        /* from here on, precision is the number of characters occupied by
           the number itself including its '0' padding */
        if (precision < len)
            precision = len;

        arglen = Py_MAX(precision, width);
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        /* pad on the left up to the width, with '0' or ' ' */
        if (width > precision) {
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        /* pad the number up to the precision with '0' */
        if (precision > len) {
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
            return NULL;
        break;
    }

    case 'p':
    {
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined:  ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* shift the digits (and the terminator) right by two to make
               room for the prefix */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        /* a string object, written as is */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* two arguments are always consumed: a string object and a C
           string that is used when the object is NULL */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* the result of PyObject_Str() on the argument */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* the result of PyObject_Repr() on the argument */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* the result of PyObject_ASCII() on the argument */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        /* a literal percent sign; no argument is consumed */
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
            return NULL;
        f = p+len;
        return f;
    }

    /* step over the conversion character */
    f++;
    return f;
}
2941
2942 PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)2943 PyUnicode_FromFormatV(const char *format, va_list vargs)
2944 {
2945 va_list vargs2;
2946 const char *f;
2947 _PyUnicodeWriter writer;
2948
2949 _PyUnicodeWriter_Init(&writer);
2950 writer.min_length = strlen(format) + 100;
2951 writer.overallocate = 1;
2952
2953 // Copy varags to be able to pass a reference to a subfunction.
2954 va_copy(vargs2, vargs);
2955
2956 for (f = format; *f; ) {
2957 if (*f == '%') {
2958 f = unicode_fromformat_arg(&writer, f, &vargs2);
2959 if (f == NULL)
2960 goto fail;
2961 }
2962 else {
2963 const char *p;
2964 Py_ssize_t len;
2965
2966 p = f;
2967 do
2968 {
2969 if ((unsigned char)*p > 127) {
2970 PyErr_Format(PyExc_ValueError,
2971 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2972 "string, got a non-ASCII byte: 0x%02x",
2973 (unsigned char)*p);
2974 goto fail;
2975 }
2976 p++;
2977 }
2978 while (*p != '\0' && *p != '%');
2979 len = p - f;
2980
2981 if (*p == '\0')
2982 writer.overallocate = 0;
2983
2984 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
2985 goto fail;
2986
2987 f = p;
2988 }
2989 }
2990 va_end(vargs2);
2991 return _PyUnicodeWriter_Finish(&writer);
2992
2993 fail:
2994 va_end(vargs2);
2995 _PyUnicodeWriter_Dealloc(&writer);
2996 return NULL;
2997 }
2998
2999 PyObject *
PyUnicode_FromFormat(const char * format,...)3000 PyUnicode_FromFormat(const char *format, ...)
3001 {
3002 PyObject* ret;
3003 va_list vargs;
3004
3005 #ifdef HAVE_STDARG_PROTOTYPES
3006 va_start(vargs, format);
3007 #else
3008 va_start(vargs);
3009 #endif
3010 ret = PyUnicode_FromFormatV(format, vargs);
3011 va_end(vargs);
3012 return ret;
3013 }
3014
3015 static Py_ssize_t
unicode_get_widechar_size(PyObject * unicode)3016 unicode_get_widechar_size(PyObject *unicode)
3017 {
3018 Py_ssize_t res;
3019
3020 assert(unicode != NULL);
3021 assert(_PyUnicode_CHECK(unicode));
3022
3023 if (_PyUnicode_WSTR(unicode) != NULL) {
3024 return PyUnicode_WSTR_LENGTH(unicode);
3025 }
3026 assert(PyUnicode_IS_READY(unicode));
3027
3028 res = _PyUnicode_LENGTH(unicode);
3029 #if SIZEOF_WCHAR_T == 2
3030 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3031 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3032 const Py_UCS4 *end = s + res;
3033 for (; s < end; ++s) {
3034 if (*s > 0xFFFF) {
3035 ++res;
3036 }
3037 }
3038 }
3039 #endif
3040 return res;
3041 }
3042
3043 static void
unicode_copy_as_widechar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3044 unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3045 {
3046 const wchar_t *wstr;
3047
3048 assert(unicode != NULL);
3049 assert(_PyUnicode_CHECK(unicode));
3050
3051 wstr = _PyUnicode_WSTR(unicode);
3052 if (wstr != NULL) {
3053 memcpy(w, wstr, size * sizeof(wchar_t));
3054 return;
3055 }
3056 assert(PyUnicode_IS_READY(unicode));
3057
3058 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3059 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3060 for (; size--; ++s, ++w) {
3061 *w = *s;
3062 }
3063 }
3064 else {
3065 #if SIZEOF_WCHAR_T == 4
3066 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3067 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3068 for (; size--; ++s, ++w) {
3069 *w = *s;
3070 }
3071 #else
3072 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3073 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3074 for (; size--; ++s, ++w) {
3075 Py_UCS4 ch = *s;
3076 if (ch > 0xFFFF) {
3077 assert(ch <= MAX_UNICODE);
3078 /* encode surrogate pair in this case */
3079 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3080 if (!size--)
3081 break;
3082 *w = Py_UNICODE_LOW_SURROGATE(ch);
3083 }
3084 else {
3085 *w = ch;
3086 }
3087 }
3088 #endif
3089 }
3090 }
3091
3092 #ifdef HAVE_WCHAR_H
3093
3094 /* Convert a Unicode object to a wide character string.
3095
3096 - If w is NULL: return the number of wide characters (including the null
3097 character) required to convert the unicode object. Ignore size argument.
3098
3099 - Otherwise: return the number of wide characters (excluding the null
3100 character) written into w. Write at most size wide characters (including
3101 the null character). */
3102 Py_ssize_t
PyUnicode_AsWideChar(PyObject * unicode,wchar_t * w,Py_ssize_t size)3103 PyUnicode_AsWideChar(PyObject *unicode,
3104 wchar_t *w,
3105 Py_ssize_t size)
3106 {
3107 Py_ssize_t res;
3108
3109 if (unicode == NULL) {
3110 PyErr_BadInternalCall();
3111 return -1;
3112 }
3113 if (!PyUnicode_Check(unicode)) {
3114 PyErr_BadArgument();
3115 return -1;
3116 }
3117
3118 res = unicode_get_widechar_size(unicode);
3119 if (w == NULL) {
3120 return res + 1;
3121 }
3122
3123 if (size > res) {
3124 size = res + 1;
3125 }
3126 else {
3127 res = size;
3128 }
3129 unicode_copy_as_widechar(unicode, w, size);
3130 return res;
3131 }
3132
3133 wchar_t*
PyUnicode_AsWideCharString(PyObject * unicode,Py_ssize_t * size)3134 PyUnicode_AsWideCharString(PyObject *unicode,
3135 Py_ssize_t *size)
3136 {
3137 wchar_t *buffer;
3138 Py_ssize_t buflen;
3139
3140 if (unicode == NULL) {
3141 PyErr_BadInternalCall();
3142 return NULL;
3143 }
3144 if (!PyUnicode_Check(unicode)) {
3145 PyErr_BadArgument();
3146 return NULL;
3147 }
3148
3149 buflen = unicode_get_widechar_size(unicode);
3150 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
3151 if (buffer == NULL) {
3152 PyErr_NoMemory();
3153 return NULL;
3154 }
3155 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3156 if (size != NULL) {
3157 *size = buflen;
3158 }
3159 else if (wcslen(buffer) != (size_t)buflen) {
3160 PyMem_FREE(buffer);
3161 PyErr_SetString(PyExc_ValueError,
3162 "embedded null character");
3163 return NULL;
3164 }
3165 return buffer;
3166 }
3167
3168 #endif /* HAVE_WCHAR_H */
3169
3170 PyObject *
PyUnicode_FromOrdinal(int ordinal)3171 PyUnicode_FromOrdinal(int ordinal)
3172 {
3173 if (ordinal < 0 || ordinal > MAX_UNICODE) {
3174 PyErr_SetString(PyExc_ValueError,
3175 "chr() arg not in range(0x110000)");
3176 return NULL;
3177 }
3178
3179 return unicode_char((Py_UCS4)ordinal);
3180 }
3181
3182 PyObject *
PyUnicode_FromObject(PyObject * obj)3183 PyUnicode_FromObject(PyObject *obj)
3184 {
3185 /* XXX Perhaps we should make this API an alias of
3186 PyObject_Str() instead ?! */
3187 if (PyUnicode_CheckExact(obj)) {
3188 if (PyUnicode_READY(obj) == -1)
3189 return NULL;
3190 Py_INCREF(obj);
3191 return obj;
3192 }
3193 if (PyUnicode_Check(obj)) {
3194 /* For a Unicode subtype that's not a Unicode object,
3195 return a true Unicode object with the same data. */
3196 return _PyUnicode_Copy(obj);
3197 }
3198 PyErr_Format(PyExc_TypeError,
3199 "Can't convert '%.100s' object to str implicitly",
3200 Py_TYPE(obj)->tp_name);
3201 return NULL;
3202 }
3203
3204 PyObject *
PyUnicode_FromEncodedObject(PyObject * obj,const char * encoding,const char * errors)3205 PyUnicode_FromEncodedObject(PyObject *obj,
3206 const char *encoding,
3207 const char *errors)
3208 {
3209 Py_buffer buffer;
3210 PyObject *v;
3211
3212 if (obj == NULL) {
3213 PyErr_BadInternalCall();
3214 return NULL;
3215 }
3216
3217 /* Decoding bytes objects is the most common case and should be fast */
3218 if (PyBytes_Check(obj)) {
3219 if (PyBytes_GET_SIZE(obj) == 0)
3220 _Py_RETURN_UNICODE_EMPTY();
3221 v = PyUnicode_Decode(
3222 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3223 encoding, errors);
3224 return v;
3225 }
3226
3227 if (PyUnicode_Check(obj)) {
3228 PyErr_SetString(PyExc_TypeError,
3229 "decoding str is not supported");
3230 return NULL;
3231 }
3232
3233 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3234 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3235 PyErr_Format(PyExc_TypeError,
3236 "decoding to str: need a bytes-like object, %.80s found",
3237 Py_TYPE(obj)->tp_name);
3238 return NULL;
3239 }
3240
3241 if (buffer.len == 0) {
3242 PyBuffer_Release(&buffer);
3243 _Py_RETURN_UNICODE_EMPTY();
3244 }
3245
3246 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3247 PyBuffer_Release(&buffer);
3248 return v;
3249 }
3250
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied (lowercased) into `lower`;
   every run of other characters between two copied characters becomes a
   single '_', and such runs at the very start or end of the name are
   dropped.  The result is always null-terminated on success.

   `lower_len` is the capacity of `lower` in bytes, including the
   terminating null byte.

   Return 1 on success, or 0 on error (the normalized encoding is longer
   than lower_len-1, or lower_len is 0 so not even the terminator fits). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    if (lower_len == 0) {
        /* Without this check lower_len - 1 wraps around and l_end would
           point far outside the buffer, disabling every bounds check. */
        return 0;
    }

    e = encoding;
    l = lower;
    /* last usable slot: reserved for the terminating null byte */
    l_end = &lower[lower_len - 1];
    /* set while skipping a run of non-alphanumeric characters */
    punct = 0;
    while (1) {
        char c = *e;
        if (c == 0) {
            break;
        }

        if (Py_ISALNUM(c) || c == '.') {
            /* emit one '_' for the skipped run, unless the run was at the
               very beginning of the name */
            if (punct && l != lower) {
                if (l == l_end) {
                    return 0;
                }
                *l++ = '_';
            }
            punct = 0;

            if (l == l_end) {
                return 0;
            }
            *l++ = Py_TOLOWER(c);
        }
        else {
            punct = 1;
        }

        e++;
    }
    *l = '\0';
    return 1;
}
3299
3300 PyObject *
PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)3301 PyUnicode_Decode(const char *s,
3302 Py_ssize_t size,
3303 const char *encoding,
3304 const char *errors)
3305 {
3306 PyObject *buffer = NULL, *unicode;
3307 Py_buffer info;
3308 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3309
3310 if (encoding == NULL) {
3311 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3312 }
3313
3314 /* Shortcuts for common default encodings */
3315 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3316 char *lower = buflower;
3317
3318 /* Fast paths */
3319 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3320 lower += 3;
3321 if (*lower == '_') {
3322 /* Match "utf8" and "utf_8" */
3323 lower++;
3324 }
3325
3326 if (lower[0] == '8' && lower[1] == 0) {
3327 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3328 }
3329 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3330 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3331 }
3332 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3333 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3334 }
3335 }
3336 else {
3337 if (strcmp(lower, "ascii") == 0
3338 || strcmp(lower, "us_ascii") == 0) {
3339 return PyUnicode_DecodeASCII(s, size, errors);
3340 }
3341 #ifdef MS_WINDOWS
3342 else if (strcmp(lower, "mbcs") == 0) {
3343 return PyUnicode_DecodeMBCS(s, size, errors);
3344 }
3345 #endif
3346 else if (strcmp(lower, "latin1") == 0
3347 || strcmp(lower, "latin_1") == 0
3348 || strcmp(lower, "iso_8859_1") == 0
3349 || strcmp(lower, "iso8859_1") == 0) {
3350 return PyUnicode_DecodeLatin1(s, size, errors);
3351 }
3352 }
3353 }
3354
3355 /* Decode via the codec registry */
3356 buffer = NULL;
3357 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3358 goto onError;
3359 buffer = PyMemoryView_FromBuffer(&info);
3360 if (buffer == NULL)
3361 goto onError;
3362 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3363 if (unicode == NULL)
3364 goto onError;
3365 if (!PyUnicode_Check(unicode)) {
3366 PyErr_Format(PyExc_TypeError,
3367 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3368 "use codecs.decode() to decode to arbitrary types",
3369 encoding,
3370 Py_TYPE(unicode)->tp_name);
3371 Py_DECREF(unicode);
3372 goto onError;
3373 }
3374 Py_DECREF(buffer);
3375 return unicode_result(unicode);
3376
3377 onError:
3378 Py_XDECREF(buffer);
3379 return NULL;
3380 }
3381
3382 PyObject *
PyUnicode_AsDecodedObject(PyObject * unicode,const char * encoding,const char * errors)3383 PyUnicode_AsDecodedObject(PyObject *unicode,
3384 const char *encoding,
3385 const char *errors)
3386 {
3387 if (!PyUnicode_Check(unicode)) {
3388 PyErr_BadArgument();
3389 return NULL;
3390 }
3391
3392 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3393 "PyUnicode_AsDecodedObject() is deprecated; "
3394 "use PyCodec_Decode() to decode from str", 1) < 0)
3395 return NULL;
3396
3397 if (encoding == NULL)
3398 encoding = PyUnicode_GetDefaultEncoding();
3399
3400 /* Decode via the codec registry */
3401 return PyCodec_Decode(unicode, encoding, errors);
3402 }
3403
3404 PyObject *
PyUnicode_AsDecodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3405 PyUnicode_AsDecodedUnicode(PyObject *unicode,
3406 const char *encoding,
3407 const char *errors)
3408 {
3409 PyObject *v;
3410
3411 if (!PyUnicode_Check(unicode)) {
3412 PyErr_BadArgument();
3413 goto onError;
3414 }
3415
3416 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3417 "PyUnicode_AsDecodedUnicode() is deprecated; "
3418 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3419 return NULL;
3420
3421 if (encoding == NULL)
3422 encoding = PyUnicode_GetDefaultEncoding();
3423
3424 /* Decode via the codec registry */
3425 v = PyCodec_Decode(unicode, encoding, errors);
3426 if (v == NULL)
3427 goto onError;
3428 if (!PyUnicode_Check(v)) {
3429 PyErr_Format(PyExc_TypeError,
3430 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3431 "use codecs.decode() to decode to arbitrary types",
3432 encoding,
3433 Py_TYPE(unicode)->tp_name);
3434 Py_DECREF(v);
3435 goto onError;
3436 }
3437 return unicode_result(v);
3438
3439 onError:
3440 return NULL;
3441 }
3442
3443 PyObject *
PyUnicode_Encode(const Py_UNICODE * s,Py_ssize_t size,const char * encoding,const char * errors)3444 PyUnicode_Encode(const Py_UNICODE *s,
3445 Py_ssize_t size,
3446 const char *encoding,
3447 const char *errors)
3448 {
3449 PyObject *v, *unicode;
3450
3451 unicode = PyUnicode_FromWideChar(s, size);
3452 if (unicode == NULL)
3453 return NULL;
3454 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3455 Py_DECREF(unicode);
3456 return v;
3457 }
3458
3459 PyObject *
PyUnicode_AsEncodedObject(PyObject * unicode,const char * encoding,const char * errors)3460 PyUnicode_AsEncodedObject(PyObject *unicode,
3461 const char *encoding,
3462 const char *errors)
3463 {
3464 PyObject *v;
3465
3466 if (!PyUnicode_Check(unicode)) {
3467 PyErr_BadArgument();
3468 goto onError;
3469 }
3470
3471 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3472 "PyUnicode_AsEncodedObject() is deprecated; "
3473 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3474 "or PyCodec_Encode() for generic encoding", 1) < 0)
3475 return NULL;
3476
3477 if (encoding == NULL)
3478 encoding = PyUnicode_GetDefaultEncoding();
3479
3480 /* Encode via the codec registry */
3481 v = PyCodec_Encode(unicode, encoding, errors);
3482 if (v == NULL)
3483 goto onError;
3484 return v;
3485
3486 onError:
3487 return NULL;
3488 }
3489
3490
3491 static PyObject *
unicode_encode_locale(PyObject * unicode,_Py_error_handler error_handler,int current_locale)3492 unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3493 int current_locale)
3494 {
3495 Py_ssize_t wlen;
3496 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3497 if (wstr == NULL) {
3498 return NULL;
3499 }
3500
3501 if ((size_t)wlen != wcslen(wstr)) {
3502 PyErr_SetString(PyExc_ValueError, "embedded null character");
3503 PyMem_Free(wstr);
3504 return NULL;
3505 }
3506
3507 char *str;
3508 size_t error_pos;
3509 const char *reason;
3510 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3511 current_locale, error_handler);
3512 PyMem_Free(wstr);
3513
3514 if (res != 0) {
3515 if (res == -2) {
3516 PyObject *exc;
3517 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3518 "locale", unicode,
3519 (Py_ssize_t)error_pos,
3520 (Py_ssize_t)(error_pos+1),
3521 reason);
3522 if (exc != NULL) {
3523 PyCodec_StrictErrors(exc);
3524 Py_DECREF(exc);
3525 }
3526 }
3527 else if (res == -3) {
3528 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3529 }
3530 else {
3531 PyErr_NoMemory();
3532 }
3533 return NULL;
3534 }
3535
3536 PyObject *bytes = PyBytes_FromString(str);
3537 PyMem_RawFree(str);
3538 return bytes;
3539 }
3540
3541 PyObject *
PyUnicode_EncodeLocale(PyObject * unicode,const char * errors)3542 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3543 {
3544 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3545 return unicode_encode_locale(unicode, error_handler, 1);
3546 }
3547
3548 PyObject *
PyUnicode_EncodeFSDefault(PyObject * unicode)3549 PyUnicode_EncodeFSDefault(PyObject *unicode)
3550 {
3551 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
3552 #ifdef _Py_FORCE_UTF8_FS_ENCODING
3553 if (interp->fs_codec.encoding) {
3554 return unicode_encode_utf8(unicode,
3555 interp->fs_codec.error_handler,
3556 interp->fs_codec.errors);
3557 }
3558 else {
3559 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3560 _Py_error_handler errors;
3561 errors = get_error_handler_wide(filesystem_errors);
3562 assert(errors != _Py_ERROR_UNKNOWN);
3563 return unicode_encode_utf8(unicode, errors, NULL);
3564 }
3565 #else
3566 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3567 cannot use it to encode and decode filenames before it is loaded. Load
3568 the Python codec requires to encode at least its own filename. Use the C
3569 implementation of the locale codec until the codec registry is
3570 initialized and the Python codec is loaded. See initfsencoding(). */
3571 if (interp->fs_codec.encoding) {
3572 return PyUnicode_AsEncodedString(unicode,
3573 interp->fs_codec.encoding,
3574 interp->fs_codec.errors);
3575 }
3576 else {
3577 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3578 _Py_error_handler errors;
3579 errors = get_error_handler_wide(filesystem_errors);
3580 assert(errors != _Py_ERROR_UNKNOWN);
3581 return unicode_encode_locale(unicode, errors, 0);
3582 }
3583 #endif
3584 }
3585
3586 PyObject *
PyUnicode_AsEncodedString(PyObject * unicode,const char * encoding,const char * errors)3587 PyUnicode_AsEncodedString(PyObject *unicode,
3588 const char *encoding,
3589 const char *errors)
3590 {
3591 PyObject *v;
3592 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
3593
3594 if (!PyUnicode_Check(unicode)) {
3595 PyErr_BadArgument();
3596 return NULL;
3597 }
3598
3599 if (encoding == NULL) {
3600 return _PyUnicode_AsUTF8String(unicode, errors);
3601 }
3602
3603 /* Shortcuts for common default encodings */
3604 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3605 char *lower = buflower;
3606
3607 /* Fast paths */
3608 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3609 lower += 3;
3610 if (*lower == '_') {
3611 /* Match "utf8" and "utf_8" */
3612 lower++;
3613 }
3614
3615 if (lower[0] == '8' && lower[1] == 0) {
3616 return _PyUnicode_AsUTF8String(unicode, errors);
3617 }
3618 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3619 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3620 }
3621 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3622 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3623 }
3624 }
3625 else {
3626 if (strcmp(lower, "ascii") == 0
3627 || strcmp(lower, "us_ascii") == 0) {
3628 return _PyUnicode_AsASCIIString(unicode, errors);
3629 }
3630 #ifdef MS_WINDOWS
3631 else if (strcmp(lower, "mbcs") == 0) {
3632 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3633 }
3634 #endif
3635 else if (strcmp(lower, "latin1") == 0 ||
3636 strcmp(lower, "latin_1") == 0 ||
3637 strcmp(lower, "iso_8859_1") == 0 ||
3638 strcmp(lower, "iso8859_1") == 0) {
3639 return _PyUnicode_AsLatin1String(unicode, errors);
3640 }
3641 }
3642 }
3643
3644 /* Encode via the codec registry */
3645 v = _PyCodec_EncodeText(unicode, encoding, errors);
3646 if (v == NULL)
3647 return NULL;
3648
3649 /* The normal path */
3650 if (PyBytes_Check(v))
3651 return v;
3652
3653 /* If the codec returns a buffer, raise a warning and convert to bytes */
3654 if (PyByteArray_Check(v)) {
3655 int error;
3656 PyObject *b;
3657
3658 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3659 "encoder %s returned bytearray instead of bytes; "
3660 "use codecs.encode() to encode to arbitrary types",
3661 encoding);
3662 if (error) {
3663 Py_DECREF(v);
3664 return NULL;
3665 }
3666
3667 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3668 PyByteArray_GET_SIZE(v));
3669 Py_DECREF(v);
3670 return b;
3671 }
3672
3673 PyErr_Format(PyExc_TypeError,
3674 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3675 "use codecs.encode() to encode to arbitrary types",
3676 encoding,
3677 Py_TYPE(v)->tp_name);
3678 Py_DECREF(v);
3679 return NULL;
3680 }
3681
3682 PyObject *
PyUnicode_AsEncodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3683 PyUnicode_AsEncodedUnicode(PyObject *unicode,
3684 const char *encoding,
3685 const char *errors)
3686 {
3687 PyObject *v;
3688
3689 if (!PyUnicode_Check(unicode)) {
3690 PyErr_BadArgument();
3691 goto onError;
3692 }
3693
3694 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3695 "PyUnicode_AsEncodedUnicode() is deprecated; "
3696 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3697 return NULL;
3698
3699 if (encoding == NULL)
3700 encoding = PyUnicode_GetDefaultEncoding();
3701
3702 /* Encode via the codec registry */
3703 v = PyCodec_Encode(unicode, encoding, errors);
3704 if (v == NULL)
3705 goto onError;
3706 if (!PyUnicode_Check(v)) {
3707 PyErr_Format(PyExc_TypeError,
3708 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3709 "use codecs.encode() to encode to arbitrary types",
3710 encoding,
3711 Py_TYPE(v)->tp_name);
3712 Py_DECREF(v);
3713 goto onError;
3714 }
3715 return v;
3716
3717 onError:
3718 return NULL;
3719 }
3720
3721 static PyObject*
unicode_decode_locale(const char * str,Py_ssize_t len,_Py_error_handler errors,int current_locale)3722 unicode_decode_locale(const char *str, Py_ssize_t len,
3723 _Py_error_handler errors, int current_locale)
3724 {
3725 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3726 PyErr_SetString(PyExc_ValueError, "embedded null byte");
3727 return NULL;
3728 }
3729
3730 wchar_t *wstr;
3731 size_t wlen;
3732 const char *reason;
3733 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3734 current_locale, errors);
3735 if (res != 0) {
3736 if (res == -2) {
3737 PyObject *exc;
3738 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3739 "locale", str, len,
3740 (Py_ssize_t)wlen,
3741 (Py_ssize_t)(wlen + 1),
3742 reason);
3743 if (exc != NULL) {
3744 PyCodec_StrictErrors(exc);
3745 Py_DECREF(exc);
3746 }
3747 }
3748 else if (res == -3) {
3749 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3750 }
3751 else {
3752 PyErr_NoMemory();
3753 }
3754 return NULL;
3755 }
3756
3757 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3758 PyMem_RawFree(wstr);
3759 return unicode;
3760 }
3761
3762 PyObject*
PyUnicode_DecodeLocaleAndSize(const char * str,Py_ssize_t len,const char * errors)3763 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3764 const char *errors)
3765 {
3766 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3767 return unicode_decode_locale(str, len, error_handler, 1);
3768 }
3769
3770 PyObject*
PyUnicode_DecodeLocale(const char * str,const char * errors)3771 PyUnicode_DecodeLocale(const char *str, const char *errors)
3772 {
3773 Py_ssize_t size = (Py_ssize_t)strlen(str);
3774 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3775 return unicode_decode_locale(str, size, error_handler, 1);
3776 }
3777
3778
3779 PyObject*
PyUnicode_DecodeFSDefault(const char * s)3780 PyUnicode_DecodeFSDefault(const char *s) {
3781 Py_ssize_t size = (Py_ssize_t)strlen(s);
3782 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3783 }
3784
3785 PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char * s,Py_ssize_t size)3786 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3787 {
3788 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
3789 #ifdef _Py_FORCE_UTF8_FS_ENCODING
3790 if (interp->fs_codec.encoding) {
3791 return unicode_decode_utf8(s, size,
3792 interp->fs_codec.error_handler,
3793 interp->fs_codec.errors,
3794 NULL);
3795 }
3796 else {
3797 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3798 _Py_error_handler errors;
3799 errors = get_error_handler_wide(filesystem_errors);
3800 assert(errors != _Py_ERROR_UNKNOWN);
3801 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3802 }
3803 #else
3804 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3805 cannot use it to encode and decode filenames before it is loaded. Load
3806 the Python codec requires to encode at least its own filename. Use the C
3807 implementation of the locale codec until the codec registry is
3808 initialized and the Python codec is loaded. See initfsencoding(). */
3809 if (interp->fs_codec.encoding) {
3810 return PyUnicode_Decode(s, size,
3811 interp->fs_codec.encoding,
3812 interp->fs_codec.errors);
3813 }
3814 else {
3815 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3816 _Py_error_handler errors;
3817 errors = get_error_handler_wide(filesystem_errors);
3818 return unicode_decode_locale(s, size, errors, 0);
3819 }
3820 #endif
3821 }
3822
3823
3824 int
PyUnicode_FSConverter(PyObject * arg,void * addr)3825 PyUnicode_FSConverter(PyObject* arg, void* addr)
3826 {
3827 PyObject *path = NULL;
3828 PyObject *output = NULL;
3829 Py_ssize_t size;
3830 void *data;
3831 if (arg == NULL) {
3832 Py_DECREF(*(PyObject**)addr);
3833 *(PyObject**)addr = NULL;
3834 return 1;
3835 }
3836 path = PyOS_FSPath(arg);
3837 if (path == NULL) {
3838 return 0;
3839 }
3840 if (PyBytes_Check(path)) {
3841 output = path;
3842 }
3843 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3844 output = PyUnicode_EncodeFSDefault(path);
3845 Py_DECREF(path);
3846 if (!output) {
3847 return 0;
3848 }
3849 assert(PyBytes_Check(output));
3850 }
3851
3852 size = PyBytes_GET_SIZE(output);
3853 data = PyBytes_AS_STRING(output);
3854 if ((size_t)size != strlen(data)) {
3855 PyErr_SetString(PyExc_ValueError, "embedded null byte");
3856 Py_DECREF(output);
3857 return 0;
3858 }
3859 *(PyObject**)addr = output;
3860 return Py_CLEANUP_SUPPORTED;
3861 }
3862
3863
3864 int
PyUnicode_FSDecoder(PyObject * arg,void * addr)3865 PyUnicode_FSDecoder(PyObject* arg, void* addr)
3866 {
3867 int is_buffer = 0;
3868 PyObject *path = NULL;
3869 PyObject *output = NULL;
3870 if (arg == NULL) {
3871 Py_DECREF(*(PyObject**)addr);
3872 *(PyObject**)addr = NULL;
3873 return 1;
3874 }
3875
3876 is_buffer = PyObject_CheckBuffer(arg);
3877 if (!is_buffer) {
3878 path = PyOS_FSPath(arg);
3879 if (path == NULL) {
3880 return 0;
3881 }
3882 }
3883 else {
3884 path = arg;
3885 Py_INCREF(arg);
3886 }
3887
3888 if (PyUnicode_Check(path)) {
3889 output = path;
3890 }
3891 else if (PyBytes_Check(path) || is_buffer) {
3892 PyObject *path_bytes = NULL;
3893
3894 if (!PyBytes_Check(path) &&
3895 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3896 "path should be string, bytes, or os.PathLike, not %.200s",
3897 Py_TYPE(arg)->tp_name)) {
3898 Py_DECREF(path);
3899 return 0;
3900 }
3901 path_bytes = PyBytes_FromObject(path);
3902 Py_DECREF(path);
3903 if (!path_bytes) {
3904 return 0;
3905 }
3906 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3907 PyBytes_GET_SIZE(path_bytes));
3908 Py_DECREF(path_bytes);
3909 if (!output) {
3910 return 0;
3911 }
3912 }
3913 else {
3914 PyErr_Format(PyExc_TypeError,
3915 "path should be string, bytes, or os.PathLike, not %.200s",
3916 Py_TYPE(arg)->tp_name);
3917 Py_DECREF(path);
3918 return 0;
3919 }
3920 if (PyUnicode_READY(output) == -1) {
3921 Py_DECREF(output);
3922 return 0;
3923 }
3924 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3925 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3926 PyErr_SetString(PyExc_ValueError, "embedded null character");
3927 Py_DECREF(output);
3928 return 0;
3929 }
3930 *(PyObject**)addr = output;
3931 return Py_CLEANUP_SUPPORTED;
3932 }
3933
3934
3935 const char *
PyUnicode_AsUTF8AndSize(PyObject * unicode,Py_ssize_t * psize)3936 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3937 {
3938 PyObject *bytes;
3939
3940 if (!PyUnicode_Check(unicode)) {
3941 PyErr_BadArgument();
3942 return NULL;
3943 }
3944 if (PyUnicode_READY(unicode) == -1)
3945 return NULL;
3946
3947 if (PyUnicode_UTF8(unicode) == NULL) {
3948 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3949 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
3950 if (bytes == NULL)
3951 return NULL;
3952 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3953 if (_PyUnicode_UTF8(unicode) == NULL) {
3954 PyErr_NoMemory();
3955 Py_DECREF(bytes);
3956 return NULL;
3957 }
3958 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3959 memcpy(_PyUnicode_UTF8(unicode),
3960 PyBytes_AS_STRING(bytes),
3961 _PyUnicode_UTF8_LENGTH(unicode) + 1);
3962 Py_DECREF(bytes);
3963 }
3964
3965 if (psize)
3966 *psize = PyUnicode_UTF8_LENGTH(unicode);
3967 return PyUnicode_UTF8(unicode);
3968 }
3969
3970 const char *
PyUnicode_AsUTF8(PyObject * unicode)3971 PyUnicode_AsUTF8(PyObject *unicode)
3972 {
3973 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3974 }
3975
3976 Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject * unicode,Py_ssize_t * size)3977 PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3978 {
3979 if (!PyUnicode_Check(unicode)) {
3980 PyErr_BadArgument();
3981 return NULL;
3982 }
3983 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
3984 if (w == NULL) {
3985 /* Non-ASCII compact unicode object */
3986 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
3987 assert(PyUnicode_IS_READY(unicode));
3988
3989 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
3990 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3991 PyErr_NoMemory();
3992 return NULL;
3993 }
3994 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
3995 if (w == NULL) {
3996 PyErr_NoMemory();
3997 return NULL;
3998 }
3999 unicode_copy_as_widechar(unicode, w, wlen + 1);
4000 _PyUnicode_WSTR(unicode) = w;
4001 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4002 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
4003 }
4004 }
4005 if (size != NULL)
4006 *size = PyUnicode_WSTR_LENGTH(unicode);
4007 return w;
4008 }
4009
4010 Py_UNICODE *
PyUnicode_AsUnicode(PyObject * unicode)4011 PyUnicode_AsUnicode(PyObject *unicode)
4012 {
4013 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
4014 }
4015
4016 const Py_UNICODE *
_PyUnicode_AsUnicode(PyObject * unicode)4017 _PyUnicode_AsUnicode(PyObject *unicode)
4018 {
4019 Py_ssize_t size;
4020 const Py_UNICODE *wstr;
4021
4022 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4023 if (wstr && wcslen(wstr) != (size_t)size) {
4024 PyErr_SetString(PyExc_ValueError, "embedded null character");
4025 return NULL;
4026 }
4027 return wstr;
4028 }
4029
4030
4031 Py_ssize_t
PyUnicode_GetSize(PyObject * unicode)4032 PyUnicode_GetSize(PyObject *unicode)
4033 {
4034 if (!PyUnicode_Check(unicode)) {
4035 PyErr_BadArgument();
4036 goto onError;
4037 }
4038 if (_PyUnicode_WSTR(unicode) == NULL) {
4039 if (PyUnicode_AsUnicode(unicode) == NULL)
4040 goto onError;
4041 }
4042 return PyUnicode_WSTR_LENGTH(unicode);
4043
4044 onError:
4045 return -1;
4046 }
4047
4048 Py_ssize_t
PyUnicode_GetLength(PyObject * unicode)4049 PyUnicode_GetLength(PyObject *unicode)
4050 {
4051 if (!PyUnicode_Check(unicode)) {
4052 PyErr_BadArgument();
4053 return -1;
4054 }
4055 if (PyUnicode_READY(unicode) == -1)
4056 return -1;
4057 return PyUnicode_GET_LENGTH(unicode);
4058 }
4059
4060 Py_UCS4
PyUnicode_ReadChar(PyObject * unicode,Py_ssize_t index)4061 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4062 {
4063 void *data;
4064 int kind;
4065
4066 if (!PyUnicode_Check(unicode)) {
4067 PyErr_BadArgument();
4068 return (Py_UCS4)-1;
4069 }
4070 if (PyUnicode_READY(unicode) == -1) {
4071 return (Py_UCS4)-1;
4072 }
4073 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4074 PyErr_SetString(PyExc_IndexError, "string index out of range");
4075 return (Py_UCS4)-1;
4076 }
4077 data = PyUnicode_DATA(unicode);
4078 kind = PyUnicode_KIND(unicode);
4079 return PyUnicode_READ(kind, data, index);
4080 }
4081
4082 int
PyUnicode_WriteChar(PyObject * unicode,Py_ssize_t index,Py_UCS4 ch)4083 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4084 {
4085 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4086 PyErr_BadArgument();
4087 return -1;
4088 }
4089 assert(PyUnicode_IS_READY(unicode));
4090 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4091 PyErr_SetString(PyExc_IndexError, "string index out of range");
4092 return -1;
4093 }
4094 if (unicode_check_modifiable(unicode))
4095 return -1;
4096 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4097 PyErr_SetString(PyExc_ValueError, "character out of range");
4098 return -1;
4099 }
4100 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4101 index, ch);
4102 return 0;
4103 }
4104
/* Return the name of the default encoding, the constant string "utf-8".
   The returned pointer refers to static storage. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char *const default_encoding = "utf-8";
    return default_encoding;
}
4110
4111 /* create or adjust a UnicodeDecodeError */
4112 static void
make_decode_exception(PyObject ** exceptionObject,const char * encoding,const char * input,Py_ssize_t length,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4113 make_decode_exception(PyObject **exceptionObject,
4114 const char *encoding,
4115 const char *input, Py_ssize_t length,
4116 Py_ssize_t startpos, Py_ssize_t endpos,
4117 const char *reason)
4118 {
4119 if (*exceptionObject == NULL) {
4120 *exceptionObject = PyUnicodeDecodeError_Create(
4121 encoding, input, length, startpos, endpos, reason);
4122 }
4123 else {
4124 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4125 goto onError;
4126 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4127 goto onError;
4128 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4129 goto onError;
4130 }
4131 return;
4132
4133 onError:
4134 Py_CLEAR(*exceptionObject);
4135 }
4136
4137 #ifdef MS_WINDOWS
4138 static int
widechar_resize(wchar_t ** buf,Py_ssize_t * size,Py_ssize_t newsize)4139 widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4140 {
4141 if (newsize > *size) {
4142 wchar_t *newbuf = *buf;
4143 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4144 PyErr_NoMemory();
4145 return -1;
4146 }
4147 *buf = newbuf;
4148 }
4149 *size = newsize;
4150 return 0;
4151 }
4152
4153 /* error handling callback helper:
4154 build arguments, call the callback and check the arguments,
4155 if no exception occurred, copy the replacement to the output
4156 and adjust various state variables.
4157 return 0 on success, -1 on error
4158 */
4159
4160 static int
unicode_decode_call_errorhandler_wchar(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const char ** input,const char ** inend,Py_ssize_t * startinpos,Py_ssize_t * endinpos,PyObject ** exceptionObject,const char ** inptr,wchar_t ** buf,Py_ssize_t * bufsize,Py_ssize_t * outpos)4161 unicode_decode_call_errorhandler_wchar(
4162 const char *errors, PyObject **errorHandler,
4163 const char *encoding, const char *reason,
4164 const char **input, const char **inend, Py_ssize_t *startinpos,
4165 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4166 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4167 {
4168 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4169
4170 PyObject *restuple = NULL;
4171 PyObject *repunicode = NULL;
4172 Py_ssize_t outsize;
4173 Py_ssize_t insize;
4174 Py_ssize_t requiredsize;
4175 Py_ssize_t newpos;
4176 PyObject *inputobj = NULL;
4177 wchar_t *repwstr;
4178 Py_ssize_t repwlen;
4179
4180 if (*errorHandler == NULL) {
4181 *errorHandler = PyCodec_LookupError(errors);
4182 if (*errorHandler == NULL)
4183 goto onError;
4184 }
4185
4186 make_decode_exception(exceptionObject,
4187 encoding,
4188 *input, *inend - *input,
4189 *startinpos, *endinpos,
4190 reason);
4191 if (*exceptionObject == NULL)
4192 goto onError;
4193
4194 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4195 if (restuple == NULL)
4196 goto onError;
4197 if (!PyTuple_Check(restuple)) {
4198 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4199 goto onError;
4200 }
4201 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4202 goto onError;
4203
4204 /* Copy back the bytes variables, which might have been modified by the
4205 callback */
4206 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4207 if (!inputobj)
4208 goto onError;
4209 *input = PyBytes_AS_STRING(inputobj);
4210 insize = PyBytes_GET_SIZE(inputobj);
4211 *inend = *input + insize;
4212 /* we can DECREF safely, as the exception has another reference,
4213 so the object won't go away. */
4214 Py_DECREF(inputobj);
4215
4216 if (newpos<0)
4217 newpos = insize+newpos;
4218 if (newpos<0 || newpos>insize) {
4219 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4220 goto onError;
4221 }
4222
4223 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4224 if (repwstr == NULL)
4225 goto onError;
4226 /* need more space? (at least enough for what we
4227 have+the replacement+the rest of the string (starting
4228 at the new input position), so we won't have to check space
4229 when there are no errors in the rest of the string) */
4230 requiredsize = *outpos;
4231 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4232 goto overflow;
4233 requiredsize += repwlen;
4234 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4235 goto overflow;
4236 requiredsize += insize - newpos;
4237 outsize = *bufsize;
4238 if (requiredsize > outsize) {
4239 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4240 requiredsize = 2*outsize;
4241 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4242 goto onError;
4243 }
4244 }
4245 wcsncpy(*buf + *outpos, repwstr, repwlen);
4246 *outpos += repwlen;
4247 *endinpos = newpos;
4248 *inptr = *input + newpos;
4249
4250 /* we made it! */
4251 Py_DECREF(restuple);
4252 return 0;
4253
4254 overflow:
4255 PyErr_SetString(PyExc_OverflowError,
4256 "decoded result is too long for a Python string");
4257
4258 onError:
4259 Py_XDECREF(restuple);
4260 return -1;
4261 }
4262 #endif /* MS_WINDOWS */
4263
4264 static int
unicode_decode_call_errorhandler_writer(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,const char ** input,const char ** inend,Py_ssize_t * startinpos,Py_ssize_t * endinpos,PyObject ** exceptionObject,const char ** inptr,_PyUnicodeWriter * writer)4265 unicode_decode_call_errorhandler_writer(
4266 const char *errors, PyObject **errorHandler,
4267 const char *encoding, const char *reason,
4268 const char **input, const char **inend, Py_ssize_t *startinpos,
4269 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4270 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4271 {
4272 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4273
4274 PyObject *restuple = NULL;
4275 PyObject *repunicode = NULL;
4276 Py_ssize_t insize;
4277 Py_ssize_t newpos;
4278 Py_ssize_t replen;
4279 Py_ssize_t remain;
4280 PyObject *inputobj = NULL;
4281 int need_to_grow = 0;
4282 const char *new_inptr;
4283
4284 if (*errorHandler == NULL) {
4285 *errorHandler = PyCodec_LookupError(errors);
4286 if (*errorHandler == NULL)
4287 goto onError;
4288 }
4289
4290 make_decode_exception(exceptionObject,
4291 encoding,
4292 *input, *inend - *input,
4293 *startinpos, *endinpos,
4294 reason);
4295 if (*exceptionObject == NULL)
4296 goto onError;
4297
4298 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4299 if (restuple == NULL)
4300 goto onError;
4301 if (!PyTuple_Check(restuple)) {
4302 PyErr_SetString(PyExc_TypeError, &argparse[3]);
4303 goto onError;
4304 }
4305 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4306 goto onError;
4307
4308 /* Copy back the bytes variables, which might have been modified by the
4309 callback */
4310 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4311 if (!inputobj)
4312 goto onError;
4313 remain = *inend - *input - *endinpos;
4314 *input = PyBytes_AS_STRING(inputobj);
4315 insize = PyBytes_GET_SIZE(inputobj);
4316 *inend = *input + insize;
4317 /* we can DECREF safely, as the exception has another reference,
4318 so the object won't go away. */
4319 Py_DECREF(inputobj);
4320
4321 if (newpos<0)
4322 newpos = insize+newpos;
4323 if (newpos<0 || newpos>insize) {
4324 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4325 goto onError;
4326 }
4327
4328 replen = PyUnicode_GET_LENGTH(repunicode);
4329 if (replen > 1) {
4330 writer->min_length += replen - 1;
4331 need_to_grow = 1;
4332 }
4333 new_inptr = *input + newpos;
4334 if (*inend - new_inptr > remain) {
4335 /* We don't know the decoding algorithm here so we make the worst
4336 assumption that one byte decodes to one unicode character.
4337 If unfortunately one byte could decode to more unicode characters,
4338 the decoder may write out-of-bound then. Is it possible for the
4339 algorithms using this function? */
4340 writer->min_length += *inend - new_inptr - remain;
4341 need_to_grow = 1;
4342 }
4343 if (need_to_grow) {
4344 writer->overallocate = 1;
4345 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4346 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4347 goto onError;
4348 }
4349 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4350 goto onError;
4351
4352 *endinpos = newpos;
4353 *inptr = new_inptr;
4354
4355 /* we made it! */
4356 Py_DECREF(restuple);
4357 return 0;
4358
4359 onError:
4360 Py_XDECREF(restuple);
4361 return -1;
4362 }
4363
4364 /* --- UTF-7 Codec -------------------------------------------------------- */
4365
4366 /* See RFC2152 for details. We encode conservatively and decode liberally. */
4367
4368 /* Three simple macros defining base-64. */
4369
/* Non-zero when c belongs to the base-64 alphabet:
   A-Z, a-z, 0-9, '+' or '/'. */

#define IS_BASE64(c)                        \
    (('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('0' <= (c) && (c) <= '9') ||          \
     '+' == (c) || '/' == (c))
4377
/* Map a base-64 character c to its 6-bit value (0..63).  The result is only
   meaningful when IS_BASE64(c) holds: any character outside the three ranges
   and different from '+' is treated as '/'. */

#define FROM_BASE64(c)                                      \
    (((c) >= 'A' && (c) <= 'Z') ? ((c) - 'A') :             \
     (((c) >= 'a' && (c) <= 'z') ? ((c) - 'a' + 26) :       \
      (((c) >= '0' && (c) <= '9') ? ((c) - '0' + 52) :      \
       ((c) == '+' ? 62 : 63))))
4385
/* Base-64 character that encodes the low 6 bits of n (higher bits are
   masked off). */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])
4390
/* DECODE_DIRECT: non-zero when byte c, met outside a shift sequence in a
 * UTF-7 string, decodes as itself.  Decoding is permissive: every ASCII
 * byte qualifies except '+', which opens a base64 section. */

#define DECODE_DIRECT(c)                \
    (!((c) > 127 || (c) == '+'))
4398
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code and is read-only, hence const.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4432
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be a non-NUL ASCII code; directO / directWS are truth values
 * enabling direct encoding of "Set O" / whitespace characters.  Both flag
 * arguments are parenthesized so that compound expressions (e.g. "a || b")
 * are combined as a whole. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
4444
4445 PyObject *
PyUnicode_DecodeUTF7(const char * s,Py_ssize_t size,const char * errors)4446 PyUnicode_DecodeUTF7(const char *s,
4447 Py_ssize_t size,
4448 const char *errors)
4449 {
4450 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4451 }
4452
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * Parameters:
 *   s, size   the UTF-7 input bytes.
 *   errors    error handler name, passed to
 *             unicode_decode_call_errorhandler_writer() on malformed input.
 *   consumed  if NULL, an inconsistent shift sequence still open at the end
 *             of the input is reported as an error; otherwise the open
 *             sequence is left undecoded and *consumed receives the number
 *             of input bytes that were decoded.
 *
 * Returns the decoded string object, or NULL on error. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                /* non-zero while inside a '+...' section */
    Py_ssize_t shiftOutStart;       /* writer position when the section began */
    unsigned int base64bits = 0;    /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;
    Py_UCS4 surrogate = 0;          /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      restart:
        /* Also entered by the "goto restart" after the loop, when the error
           handler for an unterminated shift sequence moved s back into the
           input. */
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* the pending high surrogate is unpaired:
                               write it out on its own */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            /* remember where the shift sequence starts: used for error
               reporting and for backing off at the end of the input */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else if (s < e && !IS_BASE64(*s)) {
                s++;
                errmsg = "ill-formed sequence";
                goto utf7Error;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* Shared error exit of the loop body: s has already been advanced
           past the offending byte, so it marks the end of the error range. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
            (base64bits >= 6) ||
            (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* the handler may have set a resume position inside the input */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* report everything from the opening '+' on as not consumed */
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* Build the result from the characters written before the
                   shift sequence only.  NOTE(review): presumably done so the
                   result's character width is derived from the retained
                   characters alone -- confirm. */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4656
4657
/* Encode the string object `str` as UTF-7 and return a bytes object, or
   NULL on error.

   A non-zero base64SetO / base64WhiteSpace requests that "Set O" /
   whitespace characters be base64-encoded instead of being written as
   themselves (the flags are passed negated to ENCODE_DIRECT()).
   `errors` is not used by this function. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      int base64SetO,
                      int base64WhiteSpace,
                      const char *errors)
{
    int kind;
    void *data;
    Py_ssize_t len;
    PyObject *v;
    int inShift = 0;                /* non-zero while inside a '+...' section */
    Py_ssize_t i;
    unsigned int base64bits = 0;    /* bits in base64buffer not yet output */
    unsigned long base64buffer = 0;
    char * out;
    char * start;

    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    if (len == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    /* It might be possible to tighten this worst case */
    if (len > PY_SSIZE_T_MAX / 8)
        return PyErr_NoMemory();
    /* over-allocate 8 bytes per character; shrunk to the real size below */
    v = PyBytes_FromStringAndSize(NULL, len * 8);
    if (v == NULL)
        return NULL;

    start = out = PyBytes_AS_STRING(v);
    for (i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (inShift) {
            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                /* shifting out */
                if (base64bits) { /* output remaining bits */
                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
                    base64buffer = 0;
                    base64bits = 0;
                }
                inShift = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
            }
            else {
                goto encode_char;
            }
        }
        else { /* not in a shift sequence */
            if (ch == '+') {
                /* a literal '+' is written as "+-" */
                *out++ = '+';
                *out++ = '-';
            }
            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
                *out++ = (char) ch;
            }
            else {
                /* open a base64 section for this character */
                *out++ = '+';
                inShift = 1;
                goto encode_char;
            }
        }
        continue;
encode_char:
        /* Append ch to the base64 stream as one 16-bit unit, or as a
           surrogate pair (two units) when it is above 0xFFFF. */
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* code first surrogate */
            base64bits += 16;
            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (base64bits >= 6) {
                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
                base64bits -= 6;
            }
            /* prepare second surrogate */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        base64bits += 16;
        base64buffer = (base64buffer << 16) | ch;
        while (base64bits >= 6) {
            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
            base64bits -= 6;
        }
    }
    /* flush left-over bits, padded with zero bits to a full base64 digit */
    if (base64bits)
        *out++= TO_BASE64(base64buffer << (6-base64bits) );
    /* a section still open at the end of the string is closed explicitly */
    if (inShift)
        *out++ = '-';
    if (_PyBytes_Resize(&v, out - start) < 0)
        return NULL;
    return v;
}
4759 PyObject *
PyUnicode_EncodeUTF7(const Py_UNICODE * s,Py_ssize_t size,int base64SetO,int base64WhiteSpace,const char * errors)4760 PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4761 Py_ssize_t size,
4762 int base64SetO,
4763 int base64WhiteSpace,
4764 const char *errors)
4765 {
4766 PyObject *result;
4767 PyObject *tmp = PyUnicode_FromWideChar(s, size);
4768 if (tmp == NULL)
4769 return NULL;
4770 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4771 base64WhiteSpace, errors);
4772 Py_DECREF(tmp);
4773 return result;
4774 }
4775
4776 #undef IS_BASE64
4777 #undef FROM_BASE64
4778 #undef TO_BASE64
4779 #undef DECODE_DIRECT
4780 #undef ENCODE_DIRECT
4781
4782 /* --- UTF-8 Codec -------------------------------------------------------- */
4783
4784 PyObject *
PyUnicode_DecodeUTF8(const char * s,Py_ssize_t size,const char * errors)4785 PyUnicode_DecodeUTF8(const char *s,
4786 Py_ssize_t size,
4787 const char *errors)
4788 {
4789 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4790 }
4791
4792 #include "stringlib/asciilib.h"
4793 #include "stringlib/codecs.h"
4794 #include "stringlib/undef.h"
4795
4796 #include "stringlib/ucs1lib.h"
4797 #include "stringlib/codecs.h"
4798 #include "stringlib/undef.h"
4799
4800 #include "stringlib/ucs2lib.h"
4801 #include "stringlib/codecs.h"
4802 #include "stringlib/undef.h"
4803
4804 #include "stringlib/ucs4lib.h"
4805 #include "stringlib/codecs.h"
4806 #include "stringlib/undef.h"
4807
/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char: it has the top bit (0x80) of every byte
   set, so "value & ASCII_CHAR_MASK" is non-zero as soon as one byte of
   the long is >= 0x80. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif
4817
/* Copy the leading run of ASCII bytes (bytes without the 0x80 bit) of
   [start, end) into dest and return its length, i.e. the offset of the
   first non-ASCII byte, or end - start if there is none. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;
    /* last position up to which whole longs can be read */
    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);

    /*
     * Issue #17237: m68k is a bit different from most architectures in
     * that objects do not use "natural alignment" - for example, int and
     * long are only aligned at 2-byte boundaries.  Therefore the assert()
     * won't work; also, tests have shown that skipping the "optimised
     * version" will even speed up m68k.
     */
#if !defined(__m68k__)
#if SIZEOF_LONG <= SIZEOF_VOID_P
    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Help allocation */
        const char *_p = p;
        Py_UCS1 * q = dest;
        /* source and destination are both aligned: check and copy one
           long at a time */
        while (_p < aligned_end) {
            unsigned long value = *(const unsigned long *) _p;
            if (value & ASCII_CHAR_MASK)
                break;
            *((unsigned long *)q) = value;
            _p += SIZEOF_LONG;
            q += SIZEOF_LONG;
        }
        p = _p;
        /* finish byte by byte: the unaligned tail, or the long that
           contains the first non-ASCII byte */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
#endif
    /* General case: only locate the end of the ASCII run here (scanning by
       longs whenever p is aligned), then copy it with a single memcpy(). */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
            /* Help allocation */
            const char *_p = p;
            while (_p < aligned_end) {
                unsigned long value = *(const unsigned long *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_LONG;
            }
            p = _p;
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}
4881
/* Decode `size` bytes of UTF-8 starting at `s` into a string object.

   error_handler  pre-resolved error handler; _Py_ERROR_UNKNOWN means
                  "resolve it from `errors` on the first decoding error".
   errors         error handler name.
   consumed       if non-NULL, a truncated sequence at the end of the input
                  is not treated as an error and *consumed receives the
                  number of bytes decoded.

   Returns the decoded string, or NULL on error. */
static PyObject *
unicode_decode_utf8(const char *s, Py_ssize_t size,
                    _Py_error_handler error_handler, const char *errors,
                    Py_ssize_t *consumed)
{
    _PyUnicodeWriter writer;
    const char *starts = s;
    const char *end = s + size;

    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *errmsg = "";
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        if (consumed)
            *consumed = 1;
        return get_latin1_char((unsigned char)s[0]);
    }

    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    /* copy the leading ASCII run straight into the writer's buffer */
    writer.pos = ascii_decode(s, end, writer.data);
    s += writer.pos;
    while (s < end) {
        Py_UCS4 ch;
        int kind = writer.kind;

        /* run the decoder specialised for the writer's current buffer kind */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (PyUnicode_IS_ASCII(writer.buffer))
                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
            else
                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else if (kind == PyUnicode_2BYTE_KIND) {
            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else {
            assert(kind == PyUnicode_4BYTE_KIND);
            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
        }

        /* Values 0..4 are status codes; any other value is a character that
           still has to be written.  NOTE(review): presumably one the
           kind-specific decoder could not store itself -- see
           stringlib/codecs.h. */
        switch (ch) {
        case 0:
            /* end of input, or a truncated sequence that a stateful caller
               will complete later */
            if (s == end || consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = s - starts;
            endinpos = end - starts;
            break;
        case 1:
            errmsg = "invalid start byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            break;
        case 2:
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
            {
                /* Truncated surrogate code in range D800-DFFF */
                goto End;
            }
            /* fall through */
        case 3:
        case 4:
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
            /* the error range covers ch - 1 bytes */
            endinpos = startinpos + ch - 1;
            break;
        default:
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        if (error_handler == _Py_ERROR_UNKNOWN)
            error_handler = _Py_GetErrorHandler(errors);

        switch (error_handler) {
        case _Py_ERROR_IGNORE:
            /* skip the invalid bytes */
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_REPLACE:
            /* write one U+FFFD for the whole invalid range */
            if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
                goto onError;
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_SURROGATEESCAPE:
        {
            Py_ssize_t i;

            /* each invalid byte 0xNN is stored as code point 0xDC00 + 0xNN,
               which needs at least a 2-byte buffer */
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
                goto onError;
            for (i=startinpos; i<endinpos; i++) {
                ch = (Py_UCS4)(unsigned char)(starts[i]);
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
                                ch + 0xdc00);
                writer.pos++;
            }
            s += (endinpos - startinpos);
            break;
        }

        default:
            /* every other handler goes through the generic callback */
            if (unicode_decode_call_errorhandler_writer(
                    errors, &error_handler_obj,
                    "utf-8", errmsg,
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
        }
    }

End:
    if (consumed)
        *consumed = s - starts;

    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

onError:
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
5020
5021
5022 PyObject *
PyUnicode_DecodeUTF8Stateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)5023 PyUnicode_DecodeUTF8Stateful(const char *s,
5024 Py_ssize_t size,
5025 const char *errors,
5026 Py_ssize_t *consumed)
5027 {
5028 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5029 }
5030
5031
5032 /* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5033 non-zero, use strict error handler otherwise.
5034
5035 On success, write a pointer to a newly allocated wide character string into
5036 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5037 (in number of wchar_t units) into *wlen (if wlen is set).
5038
5039 On memory allocation failure, return -1.
5040
5041 On decoding error (if surrogateescape is zero), return -2. If wlen is
5042 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5043 is not NULL, write the decoding error message into *reason. */
5044 int
_Py_DecodeUTF8Ex(const char * s,Py_ssize_t size,wchar_t ** wstr,size_t * wlen,const char ** reason,_Py_error_handler errors)5045 _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5046 const char **reason, _Py_error_handler errors)
5047 {
5048 const char *orig_s = s;
5049 const char *e;
5050 wchar_t *unicode;
5051 Py_ssize_t outpos;
5052
5053 int surrogateescape = 0;
5054 int surrogatepass = 0;
5055 switch (errors)
5056 {
5057 case _Py_ERROR_STRICT:
5058 break;
5059 case _Py_ERROR_SURROGATEESCAPE:
5060 surrogateescape = 1;
5061 break;
5062 case _Py_ERROR_SURROGATEPASS:
5063 surrogatepass = 1;
5064 break;
5065 default:
5066 return -3;
5067 }
5068
5069 /* Note: size will always be longer than the resulting Unicode
5070 character count */
5071 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
5072 return -1;
5073 }
5074
5075 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5076 if (!unicode) {
5077 return -1;
5078 }
5079
5080 /* Unpack UTF-8 encoded data */
5081 e = s + size;
5082 outpos = 0;
5083 while (s < e) {
5084 Py_UCS4 ch;
5085 #if SIZEOF_WCHAR_T == 4
5086 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5087 #else
5088 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5089 #endif
5090 if (ch > 0xFF) {
5091 #if SIZEOF_WCHAR_T == 4
5092 Py_UNREACHABLE();
5093 #else
5094 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5095 /* write a surrogate pair */
5096 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5097 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5098 #endif
5099 }
5100 else {
5101 if (!ch && s == e) {
5102 break;
5103 }
5104
5105 if (surrogateescape) {
5106 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5107 }
5108 else {
5109 /* Is it a valid three-byte code? */
5110 if (surrogatepass
5111 && (e - s) >= 3
5112 && (s[0] & 0xf0) == 0xe0
5113 && (s[1] & 0xc0) == 0x80
5114 && (s[2] & 0xc0) == 0x80)
5115 {
5116 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5117 s += 3;
5118 unicode[outpos++] = ch;
5119 }
5120 else {
5121 PyMem_RawFree(unicode );
5122 if (reason != NULL) {
5123 switch (ch) {
5124 case 0:
5125 *reason = "unexpected end of data";
5126 break;
5127 case 1:
5128 *reason = "invalid start byte";
5129 break;
5130 /* 2, 3, 4 */
5131 default:
5132 *reason = "invalid continuation byte";
5133 break;
5134 }
5135 }
5136 if (wlen != NULL) {
5137 *wlen = s - orig_s;
5138 }
5139 return -2;
5140 }
5141 }
5142 }
5143 }
5144 unicode[outpos] = L'\0';
5145 if (wlen) {
5146 *wlen = outpos;
5147 }
5148 *wstr = unicode;
5149 return 0;
5150 }
5151
5152
5153 wchar_t*
_Py_DecodeUTF8_surrogateescape(const char * arg,Py_ssize_t arglen,size_t * wlen)5154 _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5155 size_t *wlen)
5156 {
5157 wchar_t *wstr;
5158 int res = _Py_DecodeUTF8Ex(arg, arglen,
5159 &wstr, wlen,
5160 NULL, _Py_ERROR_SURROGATEESCAPE);
5161 if (res != 0) {
5162 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5163 assert(res != -3);
5164 if (wlen) {
5165 *wlen = (size_t)res;
5166 }
5167 return NULL;
5168 }
5169 return wstr;
5170 }
5171
5172
5173 /* UTF-8 encoder using the surrogateescape error handler .
5174
5175 On success, return 0 and write the newly allocated character string (use
5176 PyMem_Free() to free the memory) into *str.
5177
5178 On encoding failure, return -2 and write the position of the invalid
5179 surrogate character into *error_pos (if error_pos is set) and the decoding
5180 error message into *reason (if reason is set).
5181
5182 On memory allocation failure, return -1. */
5183 int
_Py_EncodeUTF8Ex(const wchar_t * text,char ** str,size_t * error_pos,const char ** reason,int raw_malloc,_Py_error_handler errors)5184 _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5185 const char **reason, int raw_malloc, _Py_error_handler errors)
5186 {
5187 const Py_ssize_t max_char_size = 4;
5188 Py_ssize_t len = wcslen(text);
5189
5190 assert(len >= 0);
5191
5192 int surrogateescape = 0;
5193 int surrogatepass = 0;
5194 switch (errors)
5195 {
5196 case _Py_ERROR_STRICT:
5197 break;
5198 case _Py_ERROR_SURROGATEESCAPE:
5199 surrogateescape = 1;
5200 break;
5201 case _Py_ERROR_SURROGATEPASS:
5202 surrogatepass = 1;
5203 break;
5204 default:
5205 return -3;
5206 }
5207
5208 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5209 return -1;
5210 }
5211 char *bytes;
5212 if (raw_malloc) {
5213 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5214 }
5215 else {
5216 bytes = PyMem_Malloc((len + 1) * max_char_size);
5217 }
5218 if (bytes == NULL) {
5219 return -1;
5220 }
5221
5222 char *p = bytes;
5223 Py_ssize_t i;
5224 for (i = 0; i < len; ) {
5225 Py_ssize_t ch_pos = i;
5226 Py_UCS4 ch = text[i];
5227 i++;
5228 #if Py_UNICODE_SIZE == 2
5229 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5230 && i < len
5231 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5232 {
5233 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5234 i++;
5235 }
5236 #endif
5237
5238 if (ch < 0x80) {
5239 /* Encode ASCII */
5240 *p++ = (char) ch;
5241
5242 }
5243 else if (ch < 0x0800) {
5244 /* Encode Latin-1 */
5245 *p++ = (char)(0xc0 | (ch >> 6));
5246 *p++ = (char)(0x80 | (ch & 0x3f));
5247 }
5248 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5249 /* surrogateescape error handler */
5250 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5251 if (error_pos != NULL) {
5252 *error_pos = (size_t)ch_pos;
5253 }
5254 if (reason != NULL) {
5255 *reason = "encoding error";
5256 }
5257 if (raw_malloc) {
5258 PyMem_RawFree(bytes);
5259 }
5260 else {
5261 PyMem_Free(bytes);
5262 }
5263 return -2;
5264 }
5265 *p++ = (char)(ch & 0xff);
5266 }
5267 else if (ch < 0x10000) {
5268 *p++ = (char)(0xe0 | (ch >> 12));
5269 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5270 *p++ = (char)(0x80 | (ch & 0x3f));
5271 }
5272 else { /* ch >= 0x10000 */
5273 assert(ch <= MAX_UNICODE);
5274 /* Encode UCS4 Unicode ordinals */
5275 *p++ = (char)(0xf0 | (ch >> 18));
5276 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5277 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5278 *p++ = (char)(0x80 | (ch & 0x3f));
5279 }
5280 }
5281 *p++ = '\0';
5282
5283 size_t final_size = (p - bytes);
5284 char *bytes2;
5285 if (raw_malloc) {
5286 bytes2 = PyMem_RawRealloc(bytes, final_size);
5287 }
5288 else {
5289 bytes2 = PyMem_Realloc(bytes, final_size);
5290 }
5291 if (bytes2 == NULL) {
5292 if (error_pos != NULL) {
5293 *error_pos = (size_t)-1;
5294 }
5295 if (raw_malloc) {
5296 PyMem_RawFree(bytes);
5297 }
5298 else {
5299 PyMem_Free(bytes);
5300 }
5301 return -1;
5302 }
5303 *str = bytes2;
5304 return 0;
5305 }
5306
5307
5308 /* Primary internal function which creates utf8 encoded bytes objects.
5309
5310 Allocation strategy: if the string is short, convert into a stack buffer
5311 and allocate exactly as much space needed at the end. Else allocate the
5312 maximum possible needed (4 result bytes per Unicode character), and return
5313 the excess memory at the end.
5314 */
5315 static PyObject *
unicode_encode_utf8(PyObject * unicode,_Py_error_handler error_handler,const char * errors)5316 unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5317 const char *errors)
5318 {
5319 enum PyUnicode_Kind kind;
5320 void *data;
5321 Py_ssize_t size;
5322
5323 if (!PyUnicode_Check(unicode)) {
5324 PyErr_BadArgument();
5325 return NULL;
5326 }
5327
5328 if (PyUnicode_READY(unicode) == -1)
5329 return NULL;
5330
5331 if (PyUnicode_UTF8(unicode))
5332 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5333 PyUnicode_UTF8_LENGTH(unicode));
5334
5335 kind = PyUnicode_KIND(unicode);
5336 data = PyUnicode_DATA(unicode);
5337 size = PyUnicode_GET_LENGTH(unicode);
5338
5339 switch (kind) {
5340 default:
5341 Py_UNREACHABLE();
5342 case PyUnicode_1BYTE_KIND:
5343 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5344 assert(!PyUnicode_IS_ASCII(unicode));
5345 return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
5346 case PyUnicode_2BYTE_KIND:
5347 return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
5348 case PyUnicode_4BYTE_KIND:
5349 return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
5350 }
5351 }
5352
5353 PyObject *
_PyUnicode_AsUTF8String(PyObject * unicode,const char * errors)5354 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5355 {
5356 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5357 }
5358
5359
5360 PyObject *
PyUnicode_EncodeUTF8(const Py_UNICODE * s,Py_ssize_t size,const char * errors)5361 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5362 Py_ssize_t size,
5363 const char *errors)
5364 {
5365 PyObject *v, *unicode;
5366
5367 unicode = PyUnicode_FromWideChar(s, size);
5368 if (unicode == NULL)
5369 return NULL;
5370 v = _PyUnicode_AsUTF8String(unicode, errors);
5371 Py_DECREF(unicode);
5372 return v;
5373 }
5374
5375 PyObject *
PyUnicode_AsUTF8String(PyObject * unicode)5376 PyUnicode_AsUTF8String(PyObject *unicode)
5377 {
5378 return _PyUnicode_AsUTF8String(unicode, NULL);
5379 }
5380
5381 /* --- UTF-32 Codec ------------------------------------------------------- */
5382
5383 PyObject *
PyUnicode_DecodeUTF32(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5384 PyUnicode_DecodeUTF32(const char *s,
5385 Py_ssize_t size,
5386 const char *errors,
5387 int *byteorder)
5388 {
5389 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5390 }
5391
5392 PyObject *
PyUnicode_DecodeUTF32Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)5393 PyUnicode_DecodeUTF32Stateful(const char *s,
5394 Py_ssize_t size,
5395 const char *errors,
5396 int *byteorder,
5397 Py_ssize_t *consumed)
5398 {
5399 const char *starts = s;
5400 Py_ssize_t startinpos;
5401 Py_ssize_t endinpos;
5402 _PyUnicodeWriter writer;
5403 const unsigned char *q, *e;
5404 int le, bo = 0; /* assume native ordering by default */
5405 const char *encoding;
5406 const char *errmsg = "";
5407 PyObject *errorHandler = NULL;
5408 PyObject *exc = NULL;
5409
5410 q = (const unsigned char *)s;
5411 e = q + size;
5412
5413 if (byteorder)
5414 bo = *byteorder;
5415
5416 /* Check for BOM marks (U+FEFF) in the input and adjust current
5417 byte order setting accordingly. In native mode, the leading BOM
5418 mark is skipped, in all other modes, it is copied to the output
5419 stream as-is (giving a ZWNBSP character). */
5420 if (bo == 0 && size >= 4) {
5421 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5422 if (bom == 0x0000FEFF) {
5423 bo = -1;
5424 q += 4;
5425 }
5426 else if (bom == 0xFFFE0000) {
5427 bo = 1;
5428 q += 4;
5429 }
5430 if (byteorder)
5431 *byteorder = bo;
5432 }
5433
5434 if (q == e) {
5435 if (consumed)
5436 *consumed = size;
5437 _Py_RETURN_UNICODE_EMPTY();
5438 }
5439
5440 #ifdef WORDS_BIGENDIAN
5441 le = bo < 0;
5442 #else
5443 le = bo <= 0;
5444 #endif
5445 encoding = le ? "utf-32-le" : "utf-32-be";
5446
5447 _PyUnicodeWriter_Init(&writer);
5448 writer.min_length = (e - q + 3) / 4;
5449 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5450 goto onError;
5451
5452 while (1) {
5453 Py_UCS4 ch = 0;
5454 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5455
5456 if (e - q >= 4) {
5457 enum PyUnicode_Kind kind = writer.kind;
5458 void *data = writer.data;
5459 const unsigned char *last = e - 4;
5460 Py_ssize_t pos = writer.pos;
5461 if (le) {
5462 do {
5463 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5464 if (ch > maxch)
5465 break;
5466 if (kind != PyUnicode_1BYTE_KIND &&
5467 Py_UNICODE_IS_SURROGATE(ch))
5468 break;
5469 PyUnicode_WRITE(kind, data, pos++, ch);
5470 q += 4;
5471 } while (q <= last);
5472 }
5473 else {
5474 do {
5475 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5476 if (ch > maxch)
5477 break;
5478 if (kind != PyUnicode_1BYTE_KIND &&
5479 Py_UNICODE_IS_SURROGATE(ch))
5480 break;
5481 PyUnicode_WRITE(kind, data, pos++, ch);
5482 q += 4;
5483 } while (q <= last);
5484 }
5485 writer.pos = pos;
5486 }
5487
5488 if (Py_UNICODE_IS_SURROGATE(ch)) {
5489 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5490 startinpos = ((const char *)q) - starts;
5491 endinpos = startinpos + 4;
5492 }
5493 else if (ch <= maxch) {
5494 if (q == e || consumed)
5495 break;
5496 /* remaining bytes at the end? (size should be divisible by 4) */
5497 errmsg = "truncated data";
5498 startinpos = ((const char *)q) - starts;
5499 endinpos = ((const char *)e) - starts;
5500 }
5501 else {
5502 if (ch < 0x110000) {
5503 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5504 goto onError;
5505 q += 4;
5506 continue;
5507 }
5508 errmsg = "code point not in range(0x110000)";
5509 startinpos = ((const char *)q) - starts;
5510 endinpos = startinpos + 4;
5511 }
5512
5513 /* The remaining input chars are ignored if the callback
5514 chooses to skip the input */
5515 if (unicode_decode_call_errorhandler_writer(
5516 errors, &errorHandler,
5517 encoding, errmsg,
5518 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5519 &writer))
5520 goto onError;
5521 }
5522
5523 if (consumed)
5524 *consumed = (const char *)q-starts;
5525
5526 Py_XDECREF(errorHandler);
5527 Py_XDECREF(exc);
5528 return _PyUnicodeWriter_Finish(&writer);
5529
5530 onError:
5531 _PyUnicodeWriter_Dealloc(&writer);
5532 Py_XDECREF(errorHandler);
5533 Py_XDECREF(exc);
5534 return NULL;
5535 }
5536
5537 PyObject *
_PyUnicode_EncodeUTF32(PyObject * str,const char * errors,int byteorder)5538 _PyUnicode_EncodeUTF32(PyObject *str,
5539 const char *errors,
5540 int byteorder)
5541 {
5542 enum PyUnicode_Kind kind;
5543 const void *data;
5544 Py_ssize_t len;
5545 PyObject *v;
5546 uint32_t *out;
5547 #if PY_LITTLE_ENDIAN
5548 int native_ordering = byteorder <= 0;
5549 #else
5550 int native_ordering = byteorder >= 0;
5551 #endif
5552 const char *encoding;
5553 Py_ssize_t nsize, pos;
5554 PyObject *errorHandler = NULL;
5555 PyObject *exc = NULL;
5556 PyObject *rep = NULL;
5557
5558 if (!PyUnicode_Check(str)) {
5559 PyErr_BadArgument();
5560 return NULL;
5561 }
5562 if (PyUnicode_READY(str) == -1)
5563 return NULL;
5564 kind = PyUnicode_KIND(str);
5565 data = PyUnicode_DATA(str);
5566 len = PyUnicode_GET_LENGTH(str);
5567
5568 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5569 return PyErr_NoMemory();
5570 nsize = len + (byteorder == 0);
5571 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5572 if (v == NULL)
5573 return NULL;
5574
5575 /* output buffer is 4-bytes aligned */
5576 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5577 out = (uint32_t *)PyBytes_AS_STRING(v);
5578 if (byteorder == 0)
5579 *out++ = 0xFEFF;
5580 if (len == 0)
5581 goto done;
5582
5583 if (byteorder == -1)
5584 encoding = "utf-32-le";
5585 else if (byteorder == 1)
5586 encoding = "utf-32-be";
5587 else
5588 encoding = "utf-32";
5589
5590 if (kind == PyUnicode_1BYTE_KIND) {
5591 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5592 goto done;
5593 }
5594
5595 pos = 0;
5596 while (pos < len) {
5597 Py_ssize_t repsize, moreunits;
5598
5599 if (kind == PyUnicode_2BYTE_KIND) {
5600 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5601 &out, native_ordering);
5602 }
5603 else {
5604 assert(kind == PyUnicode_4BYTE_KIND);
5605 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5606 &out, native_ordering);
5607 }
5608 if (pos == len)
5609 break;
5610
5611 rep = unicode_encode_call_errorhandler(
5612 errors, &errorHandler,
5613 encoding, "surrogates not allowed",
5614 str, &exc, pos, pos + 1, &pos);
5615 if (!rep)
5616 goto error;
5617
5618 if (PyBytes_Check(rep)) {
5619 repsize = PyBytes_GET_SIZE(rep);
5620 if (repsize & 3) {
5621 raise_encode_exception(&exc, encoding,
5622 str, pos - 1, pos,
5623 "surrogates not allowed");
5624 goto error;
5625 }
5626 moreunits = repsize / 4;
5627 }
5628 else {
5629 assert(PyUnicode_Check(rep));
5630 if (PyUnicode_READY(rep) < 0)
5631 goto error;
5632 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5633 if (!PyUnicode_IS_ASCII(rep)) {
5634 raise_encode_exception(&exc, encoding,
5635 str, pos - 1, pos,
5636 "surrogates not allowed");
5637 goto error;
5638 }
5639 }
5640
5641 /* four bytes are reserved for each surrogate */
5642 if (moreunits > 1) {
5643 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
5644 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
5645 /* integer overflow */
5646 PyErr_NoMemory();
5647 goto error;
5648 }
5649 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
5650 goto error;
5651 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
5652 }
5653
5654 if (PyBytes_Check(rep)) {
5655 memcpy(out, PyBytes_AS_STRING(rep), repsize);
5656 out += moreunits;
5657 } else /* rep is unicode */ {
5658 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5659 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5660 &out, native_ordering);
5661 }
5662
5663 Py_CLEAR(rep);
5664 }
5665
5666 /* Cut back to size actually needed. This is necessary for, for example,
5667 encoding of a string containing isolated surrogates and the 'ignore'
5668 handler is used. */
5669 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5670 if (nsize != PyBytes_GET_SIZE(v))
5671 _PyBytes_Resize(&v, nsize);
5672 Py_XDECREF(errorHandler);
5673 Py_XDECREF(exc);
5674 done:
5675 return v;
5676 error:
5677 Py_XDECREF(rep);
5678 Py_XDECREF(errorHandler);
5679 Py_XDECREF(exc);
5680 Py_XDECREF(v);
5681 return NULL;
5682 }
5683
5684 PyObject *
PyUnicode_EncodeUTF32(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)5685 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5686 Py_ssize_t size,
5687 const char *errors,
5688 int byteorder)
5689 {
5690 PyObject *result;
5691 PyObject *tmp = PyUnicode_FromWideChar(s, size);
5692 if (tmp == NULL)
5693 return NULL;
5694 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5695 Py_DECREF(tmp);
5696 return result;
5697 }
5698
5699 PyObject *
PyUnicode_AsUTF32String(PyObject * unicode)5700 PyUnicode_AsUTF32String(PyObject *unicode)
5701 {
5702 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5703 }
5704
5705 /* --- UTF-16 Codec ------------------------------------------------------- */
5706
5707 PyObject *
PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5708 PyUnicode_DecodeUTF16(const char *s,
5709 Py_ssize_t size,
5710 const char *errors,
5711 int *byteorder)
5712 {
5713 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5714 }
5715
5716 PyObject *
PyUnicode_DecodeUTF16Stateful(const char * s,Py_ssize_t size,const char * errors,int * byteorder,Py_ssize_t * consumed)5717 PyUnicode_DecodeUTF16Stateful(const char *s,
5718 Py_ssize_t size,
5719 const char *errors,
5720 int *byteorder,
5721 Py_ssize_t *consumed)
5722 {
5723 const char *starts = s;
5724 Py_ssize_t startinpos;
5725 Py_ssize_t endinpos;
5726 _PyUnicodeWriter writer;
5727 const unsigned char *q, *e;
5728 int bo = 0; /* assume native ordering by default */
5729 int native_ordering;
5730 const char *errmsg = "";
5731 PyObject *errorHandler = NULL;
5732 PyObject *exc = NULL;
5733 const char *encoding;
5734
5735 q = (const unsigned char *)s;
5736 e = q + size;
5737
5738 if (byteorder)
5739 bo = *byteorder;
5740
5741 /* Check for BOM marks (U+FEFF) in the input and adjust current
5742 byte order setting accordingly. In native mode, the leading BOM
5743 mark is skipped, in all other modes, it is copied to the output
5744 stream as-is (giving a ZWNBSP character). */
5745 if (bo == 0 && size >= 2) {
5746 const Py_UCS4 bom = (q[1] << 8) | q[0];
5747 if (bom == 0xFEFF) {
5748 q += 2;
5749 bo = -1;
5750 }
5751 else if (bom == 0xFFFE) {
5752 q += 2;
5753 bo = 1;
5754 }
5755 if (byteorder)
5756 *byteorder = bo;
5757 }
5758
5759 if (q == e) {
5760 if (consumed)
5761 *consumed = size;
5762 _Py_RETURN_UNICODE_EMPTY();
5763 }
5764
5765 #if PY_LITTLE_ENDIAN
5766 native_ordering = bo <= 0;
5767 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
5768 #else
5769 native_ordering = bo >= 0;
5770 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
5771 #endif
5772
5773 /* Note: size will always be longer than the resulting Unicode
5774 character count normally. Error handler will take care of
5775 resizing when needed. */
5776 _PyUnicodeWriter_Init(&writer);
5777 writer.min_length = (e - q + 1) / 2;
5778 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5779 goto onError;
5780
5781 while (1) {
5782 Py_UCS4 ch = 0;
5783 if (e - q >= 2) {
5784 int kind = writer.kind;
5785 if (kind == PyUnicode_1BYTE_KIND) {
5786 if (PyUnicode_IS_ASCII(writer.buffer))
5787 ch = asciilib_utf16_decode(&q, e,
5788 (Py_UCS1*)writer.data, &writer.pos,
5789 native_ordering);
5790 else
5791 ch = ucs1lib_utf16_decode(&q, e,
5792 (Py_UCS1*)writer.data, &writer.pos,
5793 native_ordering);
5794 } else if (kind == PyUnicode_2BYTE_KIND) {
5795 ch = ucs2lib_utf16_decode(&q, e,
5796 (Py_UCS2*)writer.data, &writer.pos,
5797 native_ordering);
5798 } else {
5799 assert(kind == PyUnicode_4BYTE_KIND);
5800 ch = ucs4lib_utf16_decode(&q, e,
5801 (Py_UCS4*)writer.data, &writer.pos,
5802 native_ordering);
5803 }
5804 }
5805
5806 switch (ch)
5807 {
5808 case 0:
5809 /* remaining byte at the end? (size should be even) */
5810 if (q == e || consumed)
5811 goto End;
5812 errmsg = "truncated data";
5813 startinpos = ((const char *)q) - starts;
5814 endinpos = ((const char *)e) - starts;
5815 break;
5816 /* The remaining input chars are ignored if the callback
5817 chooses to skip the input */
5818 case 1:
5819 q -= 2;
5820 if (consumed)
5821 goto End;
5822 errmsg = "unexpected end of data";
5823 startinpos = ((const char *)q) - starts;
5824 endinpos = ((const char *)e) - starts;
5825 break;
5826 case 2:
5827 errmsg = "illegal encoding";
5828 startinpos = ((const char *)q) - 2 - starts;
5829 endinpos = startinpos + 2;
5830 break;
5831 case 3:
5832 errmsg = "illegal UTF-16 surrogate";
5833 startinpos = ((const char *)q) - 4 - starts;
5834 endinpos = startinpos + 2;
5835 break;
5836 default:
5837 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5838 goto onError;
5839 continue;
5840 }
5841
5842 if (unicode_decode_call_errorhandler_writer(
5843 errors,
5844 &errorHandler,
5845 encoding, errmsg,
5846 &starts,
5847 (const char **)&e,
5848 &startinpos,
5849 &endinpos,
5850 &exc,
5851 (const char **)&q,
5852 &writer))
5853 goto onError;
5854 }
5855
5856 End:
5857 if (consumed)
5858 *consumed = (const char *)q-starts;
5859
5860 Py_XDECREF(errorHandler);
5861 Py_XDECREF(exc);
5862 return _PyUnicodeWriter_Finish(&writer);
5863
5864 onError:
5865 _PyUnicodeWriter_Dealloc(&writer);
5866 Py_XDECREF(errorHandler);
5867 Py_XDECREF(exc);
5868 return NULL;
5869 }
5870
5871 PyObject *
_PyUnicode_EncodeUTF16(PyObject * str,const char * errors,int byteorder)5872 _PyUnicode_EncodeUTF16(PyObject *str,
5873 const char *errors,
5874 int byteorder)
5875 {
5876 enum PyUnicode_Kind kind;
5877 const void *data;
5878 Py_ssize_t len;
5879 PyObject *v;
5880 unsigned short *out;
5881 Py_ssize_t pairs;
5882 #if PY_BIG_ENDIAN
5883 int native_ordering = byteorder >= 0;
5884 #else
5885 int native_ordering = byteorder <= 0;
5886 #endif
5887 const char *encoding;
5888 Py_ssize_t nsize, pos;
5889 PyObject *errorHandler = NULL;
5890 PyObject *exc = NULL;
5891 PyObject *rep = NULL;
5892
5893 if (!PyUnicode_Check(str)) {
5894 PyErr_BadArgument();
5895 return NULL;
5896 }
5897 if (PyUnicode_READY(str) == -1)
5898 return NULL;
5899 kind = PyUnicode_KIND(str);
5900 data = PyUnicode_DATA(str);
5901 len = PyUnicode_GET_LENGTH(str);
5902
5903 pairs = 0;
5904 if (kind == PyUnicode_4BYTE_KIND) {
5905 const Py_UCS4 *in = (const Py_UCS4 *)data;
5906 const Py_UCS4 *end = in + len;
5907 while (in < end) {
5908 if (*in++ >= 0x10000) {
5909 pairs++;
5910 }
5911 }
5912 }
5913 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
5914 return PyErr_NoMemory();
5915 }
5916 nsize = len + pairs + (byteorder == 0);
5917 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
5918 if (v == NULL) {
5919 return NULL;
5920 }
5921
5922 /* output buffer is 2-bytes aligned */
5923 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5924 out = (unsigned short *)PyBytes_AS_STRING(v);
5925 if (byteorder == 0) {
5926 *out++ = 0xFEFF;
5927 }
5928 if (len == 0) {
5929 goto done;
5930 }
5931
5932 if (kind == PyUnicode_1BYTE_KIND) {
5933 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5934 goto done;
5935 }
5936
5937 if (byteorder < 0) {
5938 encoding = "utf-16-le";
5939 }
5940 else if (byteorder > 0) {
5941 encoding = "utf-16-be";
5942 }
5943 else {
5944 encoding = "utf-16";
5945 }
5946
5947 pos = 0;
5948 while (pos < len) {
5949 Py_ssize_t repsize, moreunits;
5950
5951 if (kind == PyUnicode_2BYTE_KIND) {
5952 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5953 &out, native_ordering);
5954 }
5955 else {
5956 assert(kind == PyUnicode_4BYTE_KIND);
5957 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5958 &out, native_ordering);
5959 }
5960 if (pos == len)
5961 break;
5962
5963 rep = unicode_encode_call_errorhandler(
5964 errors, &errorHandler,
5965 encoding, "surrogates not allowed",
5966 str, &exc, pos, pos + 1, &pos);
5967 if (!rep)
5968 goto error;
5969
5970 if (PyBytes_Check(rep)) {
5971 repsize = PyBytes_GET_SIZE(rep);
5972 if (repsize & 1) {
5973 raise_encode_exception(&exc, encoding,
5974 str, pos - 1, pos,
5975 "surrogates not allowed");
5976 goto error;
5977 }
5978 moreunits = repsize / 2;
5979 }
5980 else {
5981 assert(PyUnicode_Check(rep));
5982 if (PyUnicode_READY(rep) < 0)
5983 goto error;
5984 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5985 if (!PyUnicode_IS_ASCII(rep)) {
5986 raise_encode_exception(&exc, encoding,
5987 str, pos - 1, pos,
5988 "surrogates not allowed");
5989 goto error;
5990 }
5991 }
5992
5993 /* two bytes are reserved for each surrogate */
5994 if (moreunits > 1) {
5995 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5996 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
5997 /* integer overflow */
5998 PyErr_NoMemory();
5999 goto error;
6000 }
6001 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
6002 goto error;
6003 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6004 }
6005
6006 if (PyBytes_Check(rep)) {
6007 memcpy(out, PyBytes_AS_STRING(rep), repsize);
6008 out += moreunits;
6009 } else /* rep is unicode */ {
6010 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6011 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6012 &out, native_ordering);
6013 }
6014
6015 Py_CLEAR(rep);
6016 }
6017
6018 /* Cut back to size actually needed. This is necessary for, for example,
6019 encoding of a string containing isolated surrogates and the 'ignore' handler
6020 is used. */
6021 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6022 if (nsize != PyBytes_GET_SIZE(v))
6023 _PyBytes_Resize(&v, nsize);
6024 Py_XDECREF(errorHandler);
6025 Py_XDECREF(exc);
6026 done:
6027 return v;
6028 error:
6029 Py_XDECREF(rep);
6030 Py_XDECREF(errorHandler);
6031 Py_XDECREF(exc);
6032 Py_XDECREF(v);
6033 return NULL;
6034 #undef STORECHAR
6035 }
6036
6037 PyObject *
PyUnicode_EncodeUTF16(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)6038 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6039 Py_ssize_t size,
6040 const char *errors,
6041 int byteorder)
6042 {
6043 PyObject *result;
6044 PyObject *tmp = PyUnicode_FromWideChar(s, size);
6045 if (tmp == NULL)
6046 return NULL;
6047 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6048 Py_DECREF(tmp);
6049 return result;
6050 }
6051
6052 PyObject *
PyUnicode_AsUTF16String(PyObject * unicode)6053 PyUnicode_AsUTF16String(PyObject *unicode)
6054 {
6055 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
6056 }
6057
6058 /* --- Unicode Escape Codec ----------------------------------------------- */
6059
6060 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
6061
6062 PyObject *
_PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors,const char ** first_invalid_escape)6063 _PyUnicode_DecodeUnicodeEscape(const char *s,
6064 Py_ssize_t size,
6065 const char *errors,
6066 const char **first_invalid_escape)
6067 {
6068 const char *starts = s;
6069 _PyUnicodeWriter writer;
6070 const char *end;
6071 PyObject *errorHandler = NULL;
6072 PyObject *exc = NULL;
6073
6074 // so we can remember if we've seen an invalid escape char or not
6075 *first_invalid_escape = NULL;
6076
6077 if (size == 0) {
6078 _Py_RETURN_UNICODE_EMPTY();
6079 }
6080 /* Escaped strings will always be longer than the resulting
6081 Unicode string, so we start with size here and then reduce the
6082 length after conversion to the true value.
6083 (but if the error callback returns a long replacement string
6084 we'll have to allocate more space) */
6085 _PyUnicodeWriter_Init(&writer);
6086 writer.min_length = size;
6087 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6088 goto onError;
6089 }
6090
6091 end = s + size;
6092 while (s < end) {
6093 unsigned char c = (unsigned char) *s++;
6094 Py_UCS4 ch;
6095 int count;
6096 Py_ssize_t startinpos;
6097 Py_ssize_t endinpos;
6098 const char *message;
6099
6100 #define WRITE_ASCII_CHAR(ch) \
6101 do { \
6102 assert(ch <= 127); \
6103 assert(writer.pos < writer.size); \
6104 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6105 } while(0)
6106
6107 #define WRITE_CHAR(ch) \
6108 do { \
6109 if (ch <= writer.maxchar) { \
6110 assert(writer.pos < writer.size); \
6111 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6112 } \
6113 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6114 goto onError; \
6115 } \
6116 } while(0)
6117
6118 /* Non-escape characters are interpreted as Unicode ordinals */
6119 if (c != '\\') {
6120 WRITE_CHAR(c);
6121 continue;
6122 }
6123
6124 startinpos = s - starts - 1;
6125 /* \ - Escapes */
6126 if (s >= end) {
6127 message = "\\ at end of string";
6128 goto error;
6129 }
6130 c = (unsigned char) *s++;
6131
6132 assert(writer.pos < writer.size);
6133 switch (c) {
6134
6135 /* \x escapes */
6136 case '\n': continue;
6137 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6138 case '\'': WRITE_ASCII_CHAR('\''); continue;
6139 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6140 case 'b': WRITE_ASCII_CHAR('\b'); continue;
6141 /* FF */
6142 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6143 case 't': WRITE_ASCII_CHAR('\t'); continue;
6144 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6145 case 'r': WRITE_ASCII_CHAR('\r'); continue;
6146 /* VT */
6147 case 'v': WRITE_ASCII_CHAR('\013'); continue;
6148 /* BEL, not classic C */
6149 case 'a': WRITE_ASCII_CHAR('\007'); continue;
6150
6151 /* \OOO (octal) escapes */
6152 case '0': case '1': case '2': case '3':
6153 case '4': case '5': case '6': case '7':
6154 ch = c - '0';
6155 if (s < end && '0' <= *s && *s <= '7') {
6156 ch = (ch<<3) + *s++ - '0';
6157 if (s < end && '0' <= *s && *s <= '7') {
6158 ch = (ch<<3) + *s++ - '0';
6159 }
6160 }
6161 WRITE_CHAR(ch);
6162 continue;
6163
6164 /* hex escapes */
6165 /* \xXX */
6166 case 'x':
6167 count = 2;
6168 message = "truncated \\xXX escape";
6169 goto hexescape;
6170
6171 /* \uXXXX */
6172 case 'u':
6173 count = 4;
6174 message = "truncated \\uXXXX escape";
6175 goto hexescape;
6176
6177 /* \UXXXXXXXX */
6178 case 'U':
6179 count = 8;
6180 message = "truncated \\UXXXXXXXX escape";
6181 hexescape:
6182 for (ch = 0; count && s < end; ++s, --count) {
6183 c = (unsigned char)*s;
6184 ch <<= 4;
6185 if (c >= '0' && c <= '9') {
6186 ch += c - '0';
6187 }
6188 else if (c >= 'a' && c <= 'f') {
6189 ch += c - ('a' - 10);
6190 }
6191 else if (c >= 'A' && c <= 'F') {
6192 ch += c - ('A' - 10);
6193 }
6194 else {
6195 break;
6196 }
6197 }
6198 if (count) {
6199 goto error;
6200 }
6201
6202 /* when we get here, ch is a 32-bit unicode character */
6203 if (ch > MAX_UNICODE) {
6204 message = "illegal Unicode character";
6205 goto error;
6206 }
6207
6208 WRITE_CHAR(ch);
6209 continue;
6210
6211 /* \N{name} */
6212 case 'N':
6213 if (ucnhash_CAPI == NULL) {
6214 /* load the unicode data module */
6215 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6216 PyUnicodeData_CAPSULE_NAME, 1);
6217 if (ucnhash_CAPI == NULL) {
6218 PyErr_SetString(
6219 PyExc_UnicodeError,
6220 "\\N escapes not supported (can't load unicodedata module)"
6221 );
6222 goto onError;
6223 }
6224 }
6225
6226 message = "malformed \\N character escape";
6227 if (s < end && *s == '{') {
6228 const char *start = ++s;
6229 size_t namelen;
6230 /* look for the closing brace */
6231 while (s < end && *s != '}')
6232 s++;
6233 namelen = s - start;
6234 if (namelen && s < end) {
6235 /* found a name. look it up in the unicode database */
6236 s++;
6237 ch = 0xffffffff; /* in case 'getcode' messes up */
6238 if (namelen <= INT_MAX &&
6239 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6240 &ch, 0)) {
6241 assert(ch <= MAX_UNICODE);
6242 WRITE_CHAR(ch);
6243 continue;
6244 }
6245 message = "unknown Unicode character name";
6246 }
6247 }
6248 goto error;
6249
6250 default:
6251 if (*first_invalid_escape == NULL) {
6252 *first_invalid_escape = s-1; /* Back up one char, since we've
6253 already incremented s. */
6254 }
6255 WRITE_ASCII_CHAR('\\');
6256 WRITE_CHAR(c);
6257 continue;
6258 }
6259
6260 error:
6261 endinpos = s-starts;
6262 writer.min_length = end - s + writer.pos;
6263 if (unicode_decode_call_errorhandler_writer(
6264 errors, &errorHandler,
6265 "unicodeescape", message,
6266 &starts, &end, &startinpos, &endinpos, &exc, &s,
6267 &writer)) {
6268 goto onError;
6269 }
6270 assert(end - s <= writer.size - writer.pos);
6271
6272 #undef WRITE_ASCII_CHAR
6273 #undef WRITE_CHAR
6274 }
6275
6276 Py_XDECREF(errorHandler);
6277 Py_XDECREF(exc);
6278 return _PyUnicodeWriter_Finish(&writer);
6279
6280 onError:
6281 _PyUnicodeWriter_Dealloc(&writer);
6282 Py_XDECREF(errorHandler);
6283 Py_XDECREF(exc);
6284 return NULL;
6285 }
6286
6287 PyObject *
PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6288 PyUnicode_DecodeUnicodeEscape(const char *s,
6289 Py_ssize_t size,
6290 const char *errors)
6291 {
6292 const char *first_invalid_escape;
6293 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6294 &first_invalid_escape);
6295 if (result == NULL)
6296 return NULL;
6297 if (first_invalid_escape != NULL) {
6298 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6299 "invalid escape sequence '\\%c'",
6300 (unsigned char)*first_invalid_escape) < 0) {
6301 Py_DECREF(result);
6302 return NULL;
6303 }
6304 }
6305 return result;
6306 }
6307
6308 /* Return a Unicode-Escape string version of the Unicode object. */
6309
6310 PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject * unicode)6311 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6312 {
6313 Py_ssize_t i, len;
6314 PyObject *repr;
6315 char *p;
6316 enum PyUnicode_Kind kind;
6317 void *data;
6318 Py_ssize_t expandsize;
6319
6320 /* Initial allocation is based on the longest-possible character
6321 escape.
6322
6323 For UCS1 strings it's '\xxx', 4 bytes per source character.
6324 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6325 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6326 */
6327
6328 if (!PyUnicode_Check(unicode)) {
6329 PyErr_BadArgument();
6330 return NULL;
6331 }
6332 if (PyUnicode_READY(unicode) == -1) {
6333 return NULL;
6334 }
6335
6336 len = PyUnicode_GET_LENGTH(unicode);
6337 if (len == 0) {
6338 return PyBytes_FromStringAndSize(NULL, 0);
6339 }
6340
6341 kind = PyUnicode_KIND(unicode);
6342 data = PyUnicode_DATA(unicode);
6343 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6344 bytes, and 1 byte characters 4. */
6345 expandsize = kind * 2 + 2;
6346 if (len > PY_SSIZE_T_MAX / expandsize) {
6347 return PyErr_NoMemory();
6348 }
6349 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6350 if (repr == NULL) {
6351 return NULL;
6352 }
6353
6354 p = PyBytes_AS_STRING(repr);
6355 for (i = 0; i < len; i++) {
6356 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6357
6358 /* U+0000-U+00ff range */
6359 if (ch < 0x100) {
6360 if (ch >= ' ' && ch < 127) {
6361 if (ch != '\\') {
6362 /* Copy printable US ASCII as-is */
6363 *p++ = (char) ch;
6364 }
6365 /* Escape backslashes */
6366 else {
6367 *p++ = '\\';
6368 *p++ = '\\';
6369 }
6370 }
6371
6372 /* Map special whitespace to '\t', \n', '\r' */
6373 else if (ch == '\t') {
6374 *p++ = '\\';
6375 *p++ = 't';
6376 }
6377 else if (ch == '\n') {
6378 *p++ = '\\';
6379 *p++ = 'n';
6380 }
6381 else if (ch == '\r') {
6382 *p++ = '\\';
6383 *p++ = 'r';
6384 }
6385
6386 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6387 else {
6388 *p++ = '\\';
6389 *p++ = 'x';
6390 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6391 *p++ = Py_hexdigits[ch & 0x000F];
6392 }
6393 }
6394 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6395 else if (ch < 0x10000) {
6396 *p++ = '\\';
6397 *p++ = 'u';
6398 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6399 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6400 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6401 *p++ = Py_hexdigits[ch & 0x000F];
6402 }
6403 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6404 else {
6405
6406 /* Make sure that the first two digits are zero */
6407 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6408 *p++ = '\\';
6409 *p++ = 'U';
6410 *p++ = '0';
6411 *p++ = '0';
6412 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6413 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6414 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6415 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6416 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6417 *p++ = Py_hexdigits[ch & 0x0000000F];
6418 }
6419 }
6420
6421 assert(p - PyBytes_AS_STRING(repr) > 0);
6422 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6423 return NULL;
6424 }
6425 return repr;
6426 }
6427
6428 PyObject *
PyUnicode_EncodeUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)6429 PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6430 Py_ssize_t size)
6431 {
6432 PyObject *result;
6433 PyObject *tmp = PyUnicode_FromWideChar(s, size);
6434 if (tmp == NULL) {
6435 return NULL;
6436 }
6437
6438 result = PyUnicode_AsUnicodeEscapeString(tmp);
6439 Py_DECREF(tmp);
6440 return result;
6441 }
6442
6443 /* --- Raw Unicode Escape Codec ------------------------------------------- */
6444
6445 PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6446 PyUnicode_DecodeRawUnicodeEscape(const char *s,
6447 Py_ssize_t size,
6448 const char *errors)
6449 {
6450 const char *starts = s;
6451 _PyUnicodeWriter writer;
6452 const char *end;
6453 PyObject *errorHandler = NULL;
6454 PyObject *exc = NULL;
6455
6456 if (size == 0) {
6457 _Py_RETURN_UNICODE_EMPTY();
6458 }
6459
6460 /* Escaped strings will always be longer than the resulting
6461 Unicode string, so we start with size here and then reduce the
6462 length after conversion to the true value. (But decoding error
6463 handler might have to resize the string) */
6464 _PyUnicodeWriter_Init(&writer);
6465 writer.min_length = size;
6466 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6467 goto onError;
6468 }
6469
6470 end = s + size;
6471 while (s < end) {
6472 unsigned char c = (unsigned char) *s++;
6473 Py_UCS4 ch;
6474 int count;
6475 Py_ssize_t startinpos;
6476 Py_ssize_t endinpos;
6477 const char *message;
6478
6479 #define WRITE_CHAR(ch) \
6480 do { \
6481 if (ch <= writer.maxchar) { \
6482 assert(writer.pos < writer.size); \
6483 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6484 } \
6485 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6486 goto onError; \
6487 } \
6488 } while(0)
6489
6490 /* Non-escape characters are interpreted as Unicode ordinals */
6491 if (c != '\\' || s >= end) {
6492 WRITE_CHAR(c);
6493 continue;
6494 }
6495
6496 c = (unsigned char) *s++;
6497 if (c == 'u') {
6498 count = 4;
6499 message = "truncated \\uXXXX escape";
6500 }
6501 else if (c == 'U') {
6502 count = 8;
6503 message = "truncated \\UXXXXXXXX escape";
6504 }
6505 else {
6506 assert(writer.pos < writer.size);
6507 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6508 WRITE_CHAR(c);
6509 continue;
6510 }
6511 startinpos = s - starts - 2;
6512
6513 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6514 for (ch = 0; count && s < end; ++s, --count) {
6515 c = (unsigned char)*s;
6516 ch <<= 4;
6517 if (c >= '0' && c <= '9') {
6518 ch += c - '0';
6519 }
6520 else if (c >= 'a' && c <= 'f') {
6521 ch += c - ('a' - 10);
6522 }
6523 else if (c >= 'A' && c <= 'F') {
6524 ch += c - ('A' - 10);
6525 }
6526 else {
6527 break;
6528 }
6529 }
6530 if (!count) {
6531 if (ch <= MAX_UNICODE) {
6532 WRITE_CHAR(ch);
6533 continue;
6534 }
6535 message = "\\Uxxxxxxxx out of range";
6536 }
6537
6538 endinpos = s-starts;
6539 writer.min_length = end - s + writer.pos;
6540 if (unicode_decode_call_errorhandler_writer(
6541 errors, &errorHandler,
6542 "rawunicodeescape", message,
6543 &starts, &end, &startinpos, &endinpos, &exc, &s,
6544 &writer)) {
6545 goto onError;
6546 }
6547 assert(end - s <= writer.size - writer.pos);
6548
6549 #undef WRITE_CHAR
6550 }
6551 Py_XDECREF(errorHandler);
6552 Py_XDECREF(exc);
6553 return _PyUnicodeWriter_Finish(&writer);
6554
6555 onError:
6556 _PyUnicodeWriter_Dealloc(&writer);
6557 Py_XDECREF(errorHandler);
6558 Py_XDECREF(exc);
6559 return NULL;
6560
6561 }
6562
6563
6564 PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject * unicode)6565 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6566 {
6567 PyObject *repr;
6568 char *p;
6569 Py_ssize_t expandsize, pos;
6570 int kind;
6571 void *data;
6572 Py_ssize_t len;
6573
6574 if (!PyUnicode_Check(unicode)) {
6575 PyErr_BadArgument();
6576 return NULL;
6577 }
6578 if (PyUnicode_READY(unicode) == -1) {
6579 return NULL;
6580 }
6581 kind = PyUnicode_KIND(unicode);
6582 data = PyUnicode_DATA(unicode);
6583 len = PyUnicode_GET_LENGTH(unicode);
6584 if (kind == PyUnicode_1BYTE_KIND) {
6585 return PyBytes_FromStringAndSize(data, len);
6586 }
6587
6588 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6589 bytes, and 1 byte characters 4. */
6590 expandsize = kind * 2 + 2;
6591
6592 if (len > PY_SSIZE_T_MAX / expandsize) {
6593 return PyErr_NoMemory();
6594 }
6595 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6596 if (repr == NULL) {
6597 return NULL;
6598 }
6599 if (len == 0) {
6600 return repr;
6601 }
6602
6603 p = PyBytes_AS_STRING(repr);
6604 for (pos = 0; pos < len; pos++) {
6605 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6606
6607 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6608 if (ch < 0x100) {
6609 *p++ = (char) ch;
6610 }
6611 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6612 else if (ch < 0x10000) {
6613 *p++ = '\\';
6614 *p++ = 'u';
6615 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6616 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6617 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6618 *p++ = Py_hexdigits[ch & 15];
6619 }
6620 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6621 else {
6622 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6623 *p++ = '\\';
6624 *p++ = 'U';
6625 *p++ = '0';
6626 *p++ = '0';
6627 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6628 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6629 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6630 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6631 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6632 *p++ = Py_hexdigits[ch & 15];
6633 }
6634 }
6635
6636 assert(p > PyBytes_AS_STRING(repr));
6637 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6638 return NULL;
6639 }
6640 return repr;
6641 }
6642
6643 PyObject *
PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)6644 PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6645 Py_ssize_t size)
6646 {
6647 PyObject *result;
6648 PyObject *tmp = PyUnicode_FromWideChar(s, size);
6649 if (tmp == NULL)
6650 return NULL;
6651 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6652 Py_DECREF(tmp);
6653 return result;
6654 }
6655
6656 /* --- Latin-1 Codec ------------------------------------------------------ */
6657
6658 PyObject *
PyUnicode_DecodeLatin1(const char * s,Py_ssize_t size,const char * errors)6659 PyUnicode_DecodeLatin1(const char *s,
6660 Py_ssize_t size,
6661 const char *errors)
6662 {
6663 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6664 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
6665 }
6666
6667 /* create or adjust a UnicodeEncodeError */
6668 static void
make_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6669 make_encode_exception(PyObject **exceptionObject,
6670 const char *encoding,
6671 PyObject *unicode,
6672 Py_ssize_t startpos, Py_ssize_t endpos,
6673 const char *reason)
6674 {
6675 if (*exceptionObject == NULL) {
6676 *exceptionObject = PyObject_CallFunction(
6677 PyExc_UnicodeEncodeError, "sOnns",
6678 encoding, unicode, startpos, endpos, reason);
6679 }
6680 else {
6681 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6682 goto onError;
6683 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6684 goto onError;
6685 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6686 goto onError;
6687 return;
6688 onError:
6689 Py_CLEAR(*exceptionObject);
6690 }
6691 }
6692
6693 /* raises a UnicodeEncodeError */
6694 static void
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6695 raise_encode_exception(PyObject **exceptionObject,
6696 const char *encoding,
6697 PyObject *unicode,
6698 Py_ssize_t startpos, Py_ssize_t endpos,
6699 const char *reason)
6700 {
6701 make_encode_exception(exceptionObject,
6702 encoding, unicode, startpos, endpos, reason);
6703 if (*exceptionObject != NULL)
6704 PyCodec_StrictErrors(*exceptionObject);
6705 }
6706
6707 /* error handling callback helper:
6708 build arguments, call the callback and check the arguments,
6709 put the result into newpos and return the replacement string, which
6710 has to be freed by the caller */
6711 static PyObject *
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)6712 unicode_encode_call_errorhandler(const char *errors,
6713 PyObject **errorHandler,
6714 const char *encoding, const char *reason,
6715 PyObject *unicode, PyObject **exceptionObject,
6716 Py_ssize_t startpos, Py_ssize_t endpos,
6717 Py_ssize_t *newpos)
6718 {
6719 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6720 Py_ssize_t len;
6721 PyObject *restuple;
6722 PyObject *resunicode;
6723
6724 if (*errorHandler == NULL) {
6725 *errorHandler = PyCodec_LookupError(errors);
6726 if (*errorHandler == NULL)
6727 return NULL;
6728 }
6729
6730 if (PyUnicode_READY(unicode) == -1)
6731 return NULL;
6732 len = PyUnicode_GET_LENGTH(unicode);
6733
6734 make_encode_exception(exceptionObject,
6735 encoding, unicode, startpos, endpos, reason);
6736 if (*exceptionObject == NULL)
6737 return NULL;
6738
6739 restuple = PyObject_CallFunctionObjArgs(
6740 *errorHandler, *exceptionObject, NULL);
6741 if (restuple == NULL)
6742 return NULL;
6743 if (!PyTuple_Check(restuple)) {
6744 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6745 Py_DECREF(restuple);
6746 return NULL;
6747 }
6748 if (!PyArg_ParseTuple(restuple, argparse,
6749 &resunicode, newpos)) {
6750 Py_DECREF(restuple);
6751 return NULL;
6752 }
6753 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6754 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6755 Py_DECREF(restuple);
6756 return NULL;
6757 }
6758 if (*newpos<0)
6759 *newpos = len + *newpos;
6760 if (*newpos<0 || *newpos>len) {
6761 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6762 Py_DECREF(restuple);
6763 return NULL;
6764 }
6765 Py_INCREF(resunicode);
6766 Py_DECREF(restuple);
6767 return resunicode;
6768 }
6769
6770 static PyObject *
unicode_encode_ucs1(PyObject * unicode,const char * errors,const Py_UCS4 limit)6771 unicode_encode_ucs1(PyObject *unicode,
6772 const char *errors,
6773 const Py_UCS4 limit)
6774 {
6775 /* input state */
6776 Py_ssize_t pos=0, size;
6777 int kind;
6778 void *data;
6779 /* pointer into the output */
6780 char *str;
6781 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6782 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6783 PyObject *error_handler_obj = NULL;
6784 PyObject *exc = NULL;
6785 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6786 PyObject *rep = NULL;
6787 /* output object */
6788 _PyBytesWriter writer;
6789
6790 if (PyUnicode_READY(unicode) == -1)
6791 return NULL;
6792 size = PyUnicode_GET_LENGTH(unicode);
6793 kind = PyUnicode_KIND(unicode);
6794 data = PyUnicode_DATA(unicode);
6795 /* allocate enough for a simple encoding without
6796 replacements, if we need more, we'll resize */
6797 if (size == 0)
6798 return PyBytes_FromStringAndSize(NULL, 0);
6799
6800 _PyBytesWriter_Init(&writer);
6801 str = _PyBytesWriter_Alloc(&writer, size);
6802 if (str == NULL)
6803 return NULL;
6804
6805 while (pos < size) {
6806 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6807
6808 /* can we encode this? */
6809 if (ch < limit) {
6810 /* no overflow check, because we know that the space is enough */
6811 *str++ = (char)ch;
6812 ++pos;
6813 }
6814 else {
6815 Py_ssize_t newpos, i;
6816 /* startpos for collecting unencodable chars */
6817 Py_ssize_t collstart = pos;
6818 Py_ssize_t collend = collstart + 1;
6819 /* find all unecodable characters */
6820
6821 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
6822 ++collend;
6823
6824 /* Only overallocate the buffer if it's not the last write */
6825 writer.overallocate = (collend < size);
6826
6827 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6828 if (error_handler == _Py_ERROR_UNKNOWN)
6829 error_handler = _Py_GetErrorHandler(errors);
6830
6831 switch (error_handler) {
6832 case _Py_ERROR_STRICT:
6833 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6834 goto onError;
6835
6836 case _Py_ERROR_REPLACE:
6837 memset(str, '?', collend - collstart);
6838 str += (collend - collstart);
6839 /* fall through */
6840 case _Py_ERROR_IGNORE:
6841 pos = collend;
6842 break;
6843
6844 case _Py_ERROR_BACKSLASHREPLACE:
6845 /* subtract preallocated bytes */
6846 writer.min_size -= (collend - collstart);
6847 str = backslashreplace(&writer, str,
6848 unicode, collstart, collend);
6849 if (str == NULL)
6850 goto onError;
6851 pos = collend;
6852 break;
6853
6854 case _Py_ERROR_XMLCHARREFREPLACE:
6855 /* subtract preallocated bytes */
6856 writer.min_size -= (collend - collstart);
6857 str = xmlcharrefreplace(&writer, str,
6858 unicode, collstart, collend);
6859 if (str == NULL)
6860 goto onError;
6861 pos = collend;
6862 break;
6863
6864 case _Py_ERROR_SURROGATEESCAPE:
6865 for (i = collstart; i < collend; ++i) {
6866 ch = PyUnicode_READ(kind, data, i);
6867 if (ch < 0xdc80 || 0xdcff < ch) {
6868 /* Not a UTF-8b surrogate */
6869 break;
6870 }
6871 *str++ = (char)(ch - 0xdc00);
6872 ++pos;
6873 }
6874 if (i >= collend)
6875 break;
6876 collstart = pos;
6877 assert(collstart != collend);
6878 /* fall through */
6879
6880 default:
6881 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6882 encoding, reason, unicode, &exc,
6883 collstart, collend, &newpos);
6884 if (rep == NULL)
6885 goto onError;
6886
6887 /* subtract preallocated bytes */
6888 writer.min_size -= newpos - collstart;
6889
6890 if (PyBytes_Check(rep)) {
6891 /* Directly copy bytes result to output. */
6892 str = _PyBytesWriter_WriteBytes(&writer, str,
6893 PyBytes_AS_STRING(rep),
6894 PyBytes_GET_SIZE(rep));
6895 }
6896 else {
6897 assert(PyUnicode_Check(rep));
6898
6899 if (PyUnicode_READY(rep) < 0)
6900 goto onError;
6901
6902 if (limit == 256 ?
6903 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6904 !PyUnicode_IS_ASCII(rep))
6905 {
6906 /* Not all characters are smaller than limit */
6907 raise_encode_exception(&exc, encoding, unicode,
6908 collstart, collend, reason);
6909 goto onError;
6910 }
6911 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6912 str = _PyBytesWriter_WriteBytes(&writer, str,
6913 PyUnicode_DATA(rep),
6914 PyUnicode_GET_LENGTH(rep));
6915 }
6916 if (str == NULL)
6917 goto onError;
6918
6919 pos = newpos;
6920 Py_CLEAR(rep);
6921 }
6922
6923 /* If overallocation was disabled, ensure that it was the last
6924 write. Otherwise, we missed an optimization */
6925 assert(writer.overallocate || pos == size);
6926 }
6927 }
6928
6929 Py_XDECREF(error_handler_obj);
6930 Py_XDECREF(exc);
6931 return _PyBytesWriter_Finish(&writer, str);
6932
6933 onError:
6934 Py_XDECREF(rep);
6935 _PyBytesWriter_Dealloc(&writer);
6936 Py_XDECREF(error_handler_obj);
6937 Py_XDECREF(exc);
6938 return NULL;
6939 }
6940
6941 /* Deprecated */
6942 PyObject *
PyUnicode_EncodeLatin1(const Py_UNICODE * p,Py_ssize_t size,const char * errors)6943 PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6944 Py_ssize_t size,
6945 const char *errors)
6946 {
6947 PyObject *result;
6948 PyObject *unicode = PyUnicode_FromWideChar(p, size);
6949 if (unicode == NULL)
6950 return NULL;
6951 result = unicode_encode_ucs1(unicode, errors, 256);
6952 Py_DECREF(unicode);
6953 return result;
6954 }
6955
6956 PyObject *
_PyUnicode_AsLatin1String(PyObject * unicode,const char * errors)6957 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6958 {
6959 if (!PyUnicode_Check(unicode)) {
6960 PyErr_BadArgument();
6961 return NULL;
6962 }
6963 if (PyUnicode_READY(unicode) == -1)
6964 return NULL;
6965 /* Fast path: if it is a one-byte string, construct
6966 bytes object directly. */
6967 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6968 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6969 PyUnicode_GET_LENGTH(unicode));
6970 /* Non-Latin-1 characters present. Defer to above function to
6971 raise the exception. */
6972 return unicode_encode_ucs1(unicode, errors, 256);
6973 }
6974
6975 PyObject*
PyUnicode_AsLatin1String(PyObject * unicode)6976 PyUnicode_AsLatin1String(PyObject *unicode)
6977 {
6978 return _PyUnicode_AsLatin1String(unicode, NULL);
6979 }
6980
6981 /* --- 7-bit ASCII Codec -------------------------------------------------- */
6982
6983 PyObject *
PyUnicode_DecodeASCII(const char * s,Py_ssize_t size,const char * errors)6984 PyUnicode_DecodeASCII(const char *s,
6985 Py_ssize_t size,
6986 const char *errors)
6987 {
6988 const char *starts = s;
6989 _PyUnicodeWriter writer;
6990 int kind;
6991 void *data;
6992 Py_ssize_t startinpos;
6993 Py_ssize_t endinpos;
6994 Py_ssize_t outpos;
6995 const char *e;
6996 PyObject *error_handler_obj = NULL;
6997 PyObject *exc = NULL;
6998 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6999
7000 if (size == 0)
7001 _Py_RETURN_UNICODE_EMPTY();
7002
7003 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7004 if (size == 1 && (unsigned char)s[0] < 128)
7005 return get_latin1_char((unsigned char)s[0]);
7006
7007 _PyUnicodeWriter_Init(&writer);
7008 writer.min_length = size;
7009 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
7010 return NULL;
7011
7012 e = s + size;
7013 data = writer.data;
7014 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
7015 writer.pos = outpos;
7016 if (writer.pos == size)
7017 return _PyUnicodeWriter_Finish(&writer);
7018
7019 s += writer.pos;
7020 kind = writer.kind;
7021 while (s < e) {
7022 unsigned char c = (unsigned char)*s;
7023 if (c < 128) {
7024 PyUnicode_WRITE(kind, data, writer.pos, c);
7025 writer.pos++;
7026 ++s;
7027 continue;
7028 }
7029
7030 /* byte outsize range 0x00..0x7f: call the error handler */
7031
7032 if (error_handler == _Py_ERROR_UNKNOWN)
7033 error_handler = _Py_GetErrorHandler(errors);
7034
7035 switch (error_handler)
7036 {
7037 case _Py_ERROR_REPLACE:
7038 case _Py_ERROR_SURROGATEESCAPE:
7039 /* Fast-path: the error handler only writes one character,
7040 but we may switch to UCS2 at the first write */
7041 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7042 goto onError;
7043 kind = writer.kind;
7044 data = writer.data;
7045
7046 if (error_handler == _Py_ERROR_REPLACE)
7047 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7048 else
7049 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7050 writer.pos++;
7051 ++s;
7052 break;
7053
7054 case _Py_ERROR_IGNORE:
7055 ++s;
7056 break;
7057
7058 default:
7059 startinpos = s-starts;
7060 endinpos = startinpos + 1;
7061 if (unicode_decode_call_errorhandler_writer(
7062 errors, &error_handler_obj,
7063 "ascii", "ordinal not in range(128)",
7064 &starts, &e, &startinpos, &endinpos, &exc, &s,
7065 &writer))
7066 goto onError;
7067 kind = writer.kind;
7068 data = writer.data;
7069 }
7070 }
7071 Py_XDECREF(error_handler_obj);
7072 Py_XDECREF(exc);
7073 return _PyUnicodeWriter_Finish(&writer);
7074
7075 onError:
7076 _PyUnicodeWriter_Dealloc(&writer);
7077 Py_XDECREF(error_handler_obj);
7078 Py_XDECREF(exc);
7079 return NULL;
7080 }
7081
7082 /* Deprecated */
7083 PyObject *
PyUnicode_EncodeASCII(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7084 PyUnicode_EncodeASCII(const Py_UNICODE *p,
7085 Py_ssize_t size,
7086 const char *errors)
7087 {
7088 PyObject *result;
7089 PyObject *unicode = PyUnicode_FromWideChar(p, size);
7090 if (unicode == NULL)
7091 return NULL;
7092 result = unicode_encode_ucs1(unicode, errors, 128);
7093 Py_DECREF(unicode);
7094 return result;
7095 }
7096
7097 PyObject *
_PyUnicode_AsASCIIString(PyObject * unicode,const char * errors)7098 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7099 {
7100 if (!PyUnicode_Check(unicode)) {
7101 PyErr_BadArgument();
7102 return NULL;
7103 }
7104 if (PyUnicode_READY(unicode) == -1)
7105 return NULL;
7106 /* Fast path: if it is an ASCII-only string, construct bytes object
7107 directly. Else defer to above function to raise the exception. */
7108 if (PyUnicode_IS_ASCII(unicode))
7109 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7110 PyUnicode_GET_LENGTH(unicode));
7111 return unicode_encode_ucs1(unicode, errors, 128);
7112 }
7113
7114 PyObject *
PyUnicode_AsASCIIString(PyObject * unicode)7115 PyUnicode_AsASCIIString(PyObject *unicode)
7116 {
7117 return _PyUnicode_AsASCIIString(unicode, NULL);
7118 }
7119
7120 #ifdef MS_WINDOWS
7121
7122 /* --- MBCS codecs for Windows -------------------------------------------- */
7123
7124 #if SIZEOF_INT < SIZEOF_SIZE_T
7125 #define NEED_RETRY
7126 #endif
7127
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7132 #define DECODING_CHUNK_SIZE (INT_MAX/4)
7133
7134 #ifndef WC_ERR_INVALID_CHARS
7135 # define WC_ERR_INVALID_CHARS 0x0080
7136 #endif
7137
7138 static const char*
code_page_name(UINT code_page,PyObject ** obj)7139 code_page_name(UINT code_page, PyObject **obj)
7140 {
7141 *obj = NULL;
7142 if (code_page == CP_ACP)
7143 return "mbcs";
7144 if (code_page == CP_UTF7)
7145 return "CP_UTF7";
7146 if (code_page == CP_UTF8)
7147 return "CP_UTF8";
7148
7149 *obj = PyBytes_FromFormat("cp%u", code_page);
7150 if (*obj == NULL)
7151 return NULL;
7152 return PyBytes_AS_STRING(*obj);
7153 }
7154
7155 static DWORD
decode_code_page_flags(UINT code_page)7156 decode_code_page_flags(UINT code_page)
7157 {
7158 if (code_page == CP_UTF7) {
7159 /* The CP_UTF7 decoder only supports flags=0 */
7160 return 0;
7161 }
7162 else
7163 return MB_ERR_INVALID_CHARS;
7164 }
7165
7166 /*
7167 * Decode a byte string from a Windows code page into unicode object in strict
7168 * mode.
7169 *
7170 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7171 * OSError and returns -1 on other error.
7172 */
7173 static int
decode_code_page_strict(UINT code_page,wchar_t ** buf,Py_ssize_t * bufsize,const char * in,int insize)7174 decode_code_page_strict(UINT code_page,
7175 wchar_t **buf,
7176 Py_ssize_t *bufsize,
7177 const char *in,
7178 int insize)
7179 {
7180 DWORD flags = MB_ERR_INVALID_CHARS;
7181 wchar_t *out;
7182 DWORD outsize;
7183
7184 /* First get the size of the result */
7185 assert(insize > 0);
7186 while ((outsize = MultiByteToWideChar(code_page, flags,
7187 in, insize, NULL, 0)) <= 0)
7188 {
7189 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7190 goto error;
7191 }
7192 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7193 flags = 0;
7194 }
7195
7196 /* Extend a wchar_t* buffer */
7197 Py_ssize_t n = *bufsize; /* Get the current length */
7198 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7199 return -1;
7200 }
7201 out = *buf + n;
7202
7203 /* Do the conversion */
7204 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7205 if (outsize <= 0)
7206 goto error;
7207 return insize;
7208
7209 error:
7210 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7211 return -2;
7212 PyErr_SetFromWindowsErr(0);
7213 return -1;
7214 }
7215
7216 /*
7217 * Decode a byte string from a code page into unicode object with an error
7218 * handler.
7219 *
7220 * Returns consumed size if succeed, or raise an OSError or
7221 * UnicodeDecodeError exception and returns -1 on error.
7222 */
7223 static int
decode_code_page_errors(UINT code_page,wchar_t ** buf,Py_ssize_t * bufsize,const char * in,const int size,const char * errors,int final)7224 decode_code_page_errors(UINT code_page,
7225 wchar_t **buf,
7226 Py_ssize_t *bufsize,
7227 const char *in, const int size,
7228 const char *errors, int final)
7229 {
7230 const char *startin = in;
7231 const char *endin = in + size;
7232 DWORD flags = MB_ERR_INVALID_CHARS;
7233 /* Ideally, we should get reason from FormatMessage. This is the Windows
7234 2000 English version of the message. */
7235 const char *reason = "No mapping for the Unicode character exists "
7236 "in the target code page.";
7237 /* each step cannot decode more than 1 character, but a character can be
7238 represented as a surrogate pair */
7239 wchar_t buffer[2], *out;
7240 int insize;
7241 Py_ssize_t outsize;
7242 PyObject *errorHandler = NULL;
7243 PyObject *exc = NULL;
7244 PyObject *encoding_obj = NULL;
7245 const char *encoding;
7246 DWORD err;
7247 int ret = -1;
7248
7249 assert(size > 0);
7250
7251 encoding = code_page_name(code_page, &encoding_obj);
7252 if (encoding == NULL)
7253 return -1;
7254
7255 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7256 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7257 UnicodeDecodeError. */
7258 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7259 if (exc != NULL) {
7260 PyCodec_StrictErrors(exc);
7261 Py_CLEAR(exc);
7262 }
7263 goto error;
7264 }
7265
7266 /* Extend a wchar_t* buffer */
7267 Py_ssize_t n = *bufsize; /* Get the current length */
7268 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7269 PyErr_NoMemory();
7270 goto error;
7271 }
7272 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7273 goto error;
7274 }
7275 out = *buf + n;
7276
7277 /* Decode the byte string character per character */
7278 while (in < endin)
7279 {
7280 /* Decode a character */
7281 insize = 1;
7282 do
7283 {
7284 outsize = MultiByteToWideChar(code_page, flags,
7285 in, insize,
7286 buffer, Py_ARRAY_LENGTH(buffer));
7287 if (outsize > 0)
7288 break;
7289 err = GetLastError();
7290 if (err == ERROR_INVALID_FLAGS && flags) {
7291 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7292 flags = 0;
7293 continue;
7294 }
7295 if (err != ERROR_NO_UNICODE_TRANSLATION
7296 && err != ERROR_INSUFFICIENT_BUFFER)
7297 {
7298 PyErr_SetFromWindowsErr(0);
7299 goto error;
7300 }
7301 insize++;
7302 }
7303 /* 4=maximum length of a UTF-8 sequence */
7304 while (insize <= 4 && (in + insize) <= endin);
7305
7306 if (outsize <= 0) {
7307 Py_ssize_t startinpos, endinpos, outpos;
7308
7309 /* last character in partial decode? */
7310 if (in + insize >= endin && !final)
7311 break;
7312
7313 startinpos = in - startin;
7314 endinpos = startinpos + 1;
7315 outpos = out - *buf;
7316 if (unicode_decode_call_errorhandler_wchar(
7317 errors, &errorHandler,
7318 encoding, reason,
7319 &startin, &endin, &startinpos, &endinpos, &exc, &in,
7320 buf, bufsize, &outpos))
7321 {
7322 goto error;
7323 }
7324 out = *buf + outpos;
7325 }
7326 else {
7327 in += insize;
7328 memcpy(out, buffer, outsize * sizeof(wchar_t));
7329 out += outsize;
7330 }
7331 }
7332
7333 /* Shrink the buffer */
7334 assert(out - *buf <= *bufsize);
7335 *bufsize = out - *buf;
7336 /* (in - startin) <= size and size is an int */
7337 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7338
7339 error:
7340 Py_XDECREF(encoding_obj);
7341 Py_XDECREF(errorHandler);
7342 Py_XDECREF(exc);
7343 return ret;
7344 }
7345
7346 static PyObject *
decode_code_page_stateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7347 decode_code_page_stateful(int code_page,
7348 const char *s, Py_ssize_t size,
7349 const char *errors, Py_ssize_t *consumed)
7350 {
7351 wchar_t *buf = NULL;
7352 Py_ssize_t bufsize = 0;
7353 int chunk_size, final, converted, done;
7354
7355 if (code_page < 0) {
7356 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7357 return NULL;
7358 }
7359 if (size < 0) {
7360 PyErr_BadInternalCall();
7361 return NULL;
7362 }
7363
7364 if (consumed)
7365 *consumed = 0;
7366
7367 do
7368 {
7369 #ifdef NEED_RETRY
7370 if (size > DECODING_CHUNK_SIZE) {
7371 chunk_size = DECODING_CHUNK_SIZE;
7372 final = 0;
7373 done = 0;
7374 }
7375 else
7376 #endif
7377 {
7378 chunk_size = (int)size;
7379 final = (consumed == NULL);
7380 done = 1;
7381 }
7382
7383 if (chunk_size == 0 && done) {
7384 if (buf != NULL)
7385 break;
7386 _Py_RETURN_UNICODE_EMPTY();
7387 }
7388
7389 converted = decode_code_page_strict(code_page, &buf, &bufsize,
7390 s, chunk_size);
7391 if (converted == -2)
7392 converted = decode_code_page_errors(code_page, &buf, &bufsize,
7393 s, chunk_size,
7394 errors, final);
7395 assert(converted != 0 || done);
7396
7397 if (converted < 0) {
7398 PyMem_Free(buf);
7399 return NULL;
7400 }
7401
7402 if (consumed)
7403 *consumed += converted;
7404
7405 s += converted;
7406 size -= converted;
7407 } while (!done);
7408
7409 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7410 PyMem_Free(buf);
7411 return v;
7412 }
7413
7414 PyObject *
PyUnicode_DecodeCodePageStateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7415 PyUnicode_DecodeCodePageStateful(int code_page,
7416 const char *s,
7417 Py_ssize_t size,
7418 const char *errors,
7419 Py_ssize_t *consumed)
7420 {
7421 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7422 }
7423
7424 PyObject *
PyUnicode_DecodeMBCSStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7425 PyUnicode_DecodeMBCSStateful(const char *s,
7426 Py_ssize_t size,
7427 const char *errors,
7428 Py_ssize_t *consumed)
7429 {
7430 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7431 }
7432
7433 PyObject *
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)7434 PyUnicode_DecodeMBCS(const char *s,
7435 Py_ssize_t size,
7436 const char *errors)
7437 {
7438 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7439 }
7440
7441 static DWORD
encode_code_page_flags(UINT code_page,const char * errors)7442 encode_code_page_flags(UINT code_page, const char *errors)
7443 {
7444 if (code_page == CP_UTF8) {
7445 return WC_ERR_INVALID_CHARS;
7446 }
7447 else if (code_page == CP_UTF7) {
7448 /* CP_UTF7 only supports flags=0 */
7449 return 0;
7450 }
7451 else {
7452 if (errors != NULL && strcmp(errors, "replace") == 0)
7453 return 0;
7454 else
7455 return WC_NO_BEST_FIT_CHARS;
7456 }
7457 }
7458
7459 /*
7460 * Encode a Unicode string to a Windows code page into a byte string in strict
7461 * mode.
7462 *
7463 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7464 * an OSError and returns -1 on other error.
7465 */
7466 static int
encode_code_page_strict(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t offset,int len,const char * errors)7467 encode_code_page_strict(UINT code_page, PyObject **outbytes,
7468 PyObject *unicode, Py_ssize_t offset, int len,
7469 const char* errors)
7470 {
7471 BOOL usedDefaultChar = FALSE;
7472 BOOL *pusedDefaultChar = &usedDefaultChar;
7473 int outsize;
7474 wchar_t *p;
7475 Py_ssize_t size;
7476 const DWORD flags = encode_code_page_flags(code_page, NULL);
7477 char *out;
7478 /* Create a substring so that we can get the UTF-16 representation
7479 of just the slice under consideration. */
7480 PyObject *substring;
7481
7482 assert(len > 0);
7483
7484 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7485 pusedDefaultChar = &usedDefaultChar;
7486 else
7487 pusedDefaultChar = NULL;
7488
7489 substring = PyUnicode_Substring(unicode, offset, offset+len);
7490 if (substring == NULL)
7491 return -1;
7492 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7493 if (p == NULL) {
7494 Py_DECREF(substring);
7495 return -1;
7496 }
7497 assert(size <= INT_MAX);
7498
7499 /* First get the size of the result */
7500 outsize = WideCharToMultiByte(code_page, flags,
7501 p, (int)size,
7502 NULL, 0,
7503 NULL, pusedDefaultChar);
7504 if (outsize <= 0)
7505 goto error;
7506 /* If we used a default char, then we failed! */
7507 if (pusedDefaultChar && *pusedDefaultChar) {
7508 Py_DECREF(substring);
7509 return -2;
7510 }
7511
7512 if (*outbytes == NULL) {
7513 /* Create string object */
7514 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7515 if (*outbytes == NULL) {
7516 Py_DECREF(substring);
7517 return -1;
7518 }
7519 out = PyBytes_AS_STRING(*outbytes);
7520 }
7521 else {
7522 /* Extend string object */
7523 const Py_ssize_t n = PyBytes_Size(*outbytes);
7524 if (outsize > PY_SSIZE_T_MAX - n) {
7525 PyErr_NoMemory();
7526 Py_DECREF(substring);
7527 return -1;
7528 }
7529 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7530 Py_DECREF(substring);
7531 return -1;
7532 }
7533 out = PyBytes_AS_STRING(*outbytes) + n;
7534 }
7535
7536 /* Do the conversion */
7537 outsize = WideCharToMultiByte(code_page, flags,
7538 p, (int)size,
7539 out, outsize,
7540 NULL, pusedDefaultChar);
7541 Py_CLEAR(substring);
7542 if (outsize <= 0)
7543 goto error;
7544 if (pusedDefaultChar && *pusedDefaultChar)
7545 return -2;
7546 return 0;
7547
7548 error:
7549 Py_XDECREF(substring);
7550 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7551 return -2;
7552 PyErr_SetFromWindowsErr(0);
7553 return -1;
7554 }
7555
7556 /*
7557 * Encode a Unicode string to a Windows code page into a byte string using an
7558 * error handler.
7559 *
7560 * Returns consumed characters if succeed, or raise an OSError and returns
7561 * -1 on other error.
7562 */
7563 static int
encode_code_page_errors(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t unicode_offset,Py_ssize_t insize,const char * errors)7564 encode_code_page_errors(UINT code_page, PyObject **outbytes,
7565 PyObject *unicode, Py_ssize_t unicode_offset,
7566 Py_ssize_t insize, const char* errors)
7567 {
7568 const DWORD flags = encode_code_page_flags(code_page, errors);
7569 Py_ssize_t pos = unicode_offset;
7570 Py_ssize_t endin = unicode_offset + insize;
7571 /* Ideally, we should get reason from FormatMessage. This is the Windows
7572 2000 English version of the message. */
7573 const char *reason = "invalid character";
7574 /* 4=maximum length of a UTF-8 sequence */
7575 char buffer[4];
7576 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7577 Py_ssize_t outsize;
7578 char *out;
7579 PyObject *errorHandler = NULL;
7580 PyObject *exc = NULL;
7581 PyObject *encoding_obj = NULL;
7582 const char *encoding;
7583 Py_ssize_t newpos, newoutsize;
7584 PyObject *rep;
7585 int ret = -1;
7586
7587 assert(insize > 0);
7588
7589 encoding = code_page_name(code_page, &encoding_obj);
7590 if (encoding == NULL)
7591 return -1;
7592
7593 if (errors == NULL || strcmp(errors, "strict") == 0) {
7594 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7595 then we raise a UnicodeEncodeError. */
7596 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7597 if (exc != NULL) {
7598 PyCodec_StrictErrors(exc);
7599 Py_DECREF(exc);
7600 }
7601 Py_XDECREF(encoding_obj);
7602 return -1;
7603 }
7604
7605 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7606 pusedDefaultChar = &usedDefaultChar;
7607 else
7608 pusedDefaultChar = NULL;
7609
7610 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7611 PyErr_NoMemory();
7612 goto error;
7613 }
7614 outsize = insize * Py_ARRAY_LENGTH(buffer);
7615
7616 if (*outbytes == NULL) {
7617 /* Create string object */
7618 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7619 if (*outbytes == NULL)
7620 goto error;
7621 out = PyBytes_AS_STRING(*outbytes);
7622 }
7623 else {
7624 /* Extend string object */
7625 Py_ssize_t n = PyBytes_Size(*outbytes);
7626 if (n > PY_SSIZE_T_MAX - outsize) {
7627 PyErr_NoMemory();
7628 goto error;
7629 }
7630 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7631 goto error;
7632 out = PyBytes_AS_STRING(*outbytes) + n;
7633 }
7634
7635 /* Encode the string character per character */
7636 while (pos < endin)
7637 {
7638 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7639 wchar_t chars[2];
7640 int charsize;
7641 if (ch < 0x10000) {
7642 chars[0] = (wchar_t)ch;
7643 charsize = 1;
7644 }
7645 else {
7646 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7647 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
7648 charsize = 2;
7649 }
7650
7651 outsize = WideCharToMultiByte(code_page, flags,
7652 chars, charsize,
7653 buffer, Py_ARRAY_LENGTH(buffer),
7654 NULL, pusedDefaultChar);
7655 if (outsize > 0) {
7656 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7657 {
7658 pos++;
7659 memcpy(out, buffer, outsize);
7660 out += outsize;
7661 continue;
7662 }
7663 }
7664 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7665 PyErr_SetFromWindowsErr(0);
7666 goto error;
7667 }
7668
7669 rep = unicode_encode_call_errorhandler(
7670 errors, &errorHandler, encoding, reason,
7671 unicode, &exc,
7672 pos, pos + 1, &newpos);
7673 if (rep == NULL)
7674 goto error;
7675 pos = newpos;
7676
7677 if (PyBytes_Check(rep)) {
7678 outsize = PyBytes_GET_SIZE(rep);
7679 if (outsize != 1) {
7680 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7681 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7682 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7683 Py_DECREF(rep);
7684 goto error;
7685 }
7686 out = PyBytes_AS_STRING(*outbytes) + offset;
7687 }
7688 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7689 out += outsize;
7690 }
7691 else {
7692 Py_ssize_t i;
7693 enum PyUnicode_Kind kind;
7694 void *data;
7695
7696 if (PyUnicode_READY(rep) == -1) {
7697 Py_DECREF(rep);
7698 goto error;
7699 }
7700
7701 outsize = PyUnicode_GET_LENGTH(rep);
7702 if (outsize != 1) {
7703 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7704 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7705 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7706 Py_DECREF(rep);
7707 goto error;
7708 }
7709 out = PyBytes_AS_STRING(*outbytes) + offset;
7710 }
7711 kind = PyUnicode_KIND(rep);
7712 data = PyUnicode_DATA(rep);
7713 for (i=0; i < outsize; i++) {
7714 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7715 if (ch > 127) {
7716 raise_encode_exception(&exc,
7717 encoding, unicode,
7718 pos, pos + 1,
7719 "unable to encode error handler result to ASCII");
7720 Py_DECREF(rep);
7721 goto error;
7722 }
7723 *out = (unsigned char)ch;
7724 out++;
7725 }
7726 }
7727 Py_DECREF(rep);
7728 }
7729 /* write a NUL byte */
7730 *out = 0;
7731 outsize = out - PyBytes_AS_STRING(*outbytes);
7732 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7733 if (_PyBytes_Resize(outbytes, outsize) < 0)
7734 goto error;
7735 ret = 0;
7736
7737 error:
7738 Py_XDECREF(encoding_obj);
7739 Py_XDECREF(errorHandler);
7740 Py_XDECREF(exc);
7741 return ret;
7742 }
7743
7744 static PyObject *
encode_code_page(int code_page,PyObject * unicode,const char * errors)7745 encode_code_page(int code_page,
7746 PyObject *unicode,
7747 const char *errors)
7748 {
7749 Py_ssize_t len;
7750 PyObject *outbytes = NULL;
7751 Py_ssize_t offset;
7752 int chunk_len, ret, done;
7753
7754 if (!PyUnicode_Check(unicode)) {
7755 PyErr_BadArgument();
7756 return NULL;
7757 }
7758
7759 if (PyUnicode_READY(unicode) == -1)
7760 return NULL;
7761 len = PyUnicode_GET_LENGTH(unicode);
7762
7763 if (code_page < 0) {
7764 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7765 return NULL;
7766 }
7767
7768 if (len == 0)
7769 return PyBytes_FromStringAndSize(NULL, 0);
7770
7771 offset = 0;
7772 do
7773 {
7774 #ifdef NEED_RETRY
7775 if (len > DECODING_CHUNK_SIZE) {
7776 chunk_len = DECODING_CHUNK_SIZE;
7777 done = 0;
7778 }
7779 else
7780 #endif
7781 {
7782 chunk_len = (int)len;
7783 done = 1;
7784 }
7785
7786 ret = encode_code_page_strict(code_page, &outbytes,
7787 unicode, offset, chunk_len,
7788 errors);
7789 if (ret == -2)
7790 ret = encode_code_page_errors(code_page, &outbytes,
7791 unicode, offset,
7792 chunk_len, errors);
7793 if (ret < 0) {
7794 Py_XDECREF(outbytes);
7795 return NULL;
7796 }
7797
7798 offset += chunk_len;
7799 len -= chunk_len;
7800 } while (!done);
7801
7802 return outbytes;
7803 }
7804
7805 PyObject *
PyUnicode_EncodeMBCS(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7806 PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7807 Py_ssize_t size,
7808 const char *errors)
7809 {
7810 PyObject *unicode, *res;
7811 unicode = PyUnicode_FromWideChar(p, size);
7812 if (unicode == NULL)
7813 return NULL;
7814 res = encode_code_page(CP_ACP, unicode, errors);
7815 Py_DECREF(unicode);
7816 return res;
7817 }
7818
7819 PyObject *
PyUnicode_EncodeCodePage(int code_page,PyObject * unicode,const char * errors)7820 PyUnicode_EncodeCodePage(int code_page,
7821 PyObject *unicode,
7822 const char *errors)
7823 {
7824 return encode_code_page(code_page, unicode, errors);
7825 }
7826
7827 PyObject *
PyUnicode_AsMBCSString(PyObject * unicode)7828 PyUnicode_AsMBCSString(PyObject *unicode)
7829 {
7830 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7831 }
7832
7833 #undef NEED_RETRY
7834
7835 #endif /* MS_WINDOWS */
7836
7837 /* --- Character Mapping Codec -------------------------------------------- */
7838
7839 static int
charmap_decode_string(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)7840 charmap_decode_string(const char *s,
7841 Py_ssize_t size,
7842 PyObject *mapping,
7843 const char *errors,
7844 _PyUnicodeWriter *writer)
7845 {
7846 const char *starts = s;
7847 const char *e;
7848 Py_ssize_t startinpos, endinpos;
7849 PyObject *errorHandler = NULL, *exc = NULL;
7850 Py_ssize_t maplen;
7851 enum PyUnicode_Kind mapkind;
7852 void *mapdata;
7853 Py_UCS4 x;
7854 unsigned char ch;
7855
7856 if (PyUnicode_READY(mapping) == -1)
7857 return -1;
7858
7859 maplen = PyUnicode_GET_LENGTH(mapping);
7860 mapdata = PyUnicode_DATA(mapping);
7861 mapkind = PyUnicode_KIND(mapping);
7862
7863 e = s + size;
7864
7865 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7866 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7867 * is disabled in encoding aliases, latin1 is preferred because
7868 * its implementation is faster. */
7869 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7870 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7871 Py_UCS4 maxchar = writer->maxchar;
7872
7873 assert (writer->kind == PyUnicode_1BYTE_KIND);
7874 while (s < e) {
7875 ch = *s;
7876 x = mapdata_ucs1[ch];
7877 if (x > maxchar) {
7878 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7879 goto onError;
7880 maxchar = writer->maxchar;
7881 outdata = (Py_UCS1 *)writer->data;
7882 }
7883 outdata[writer->pos] = x;
7884 writer->pos++;
7885 ++s;
7886 }
7887 return 0;
7888 }
7889
7890 while (s < e) {
7891 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7892 enum PyUnicode_Kind outkind = writer->kind;
7893 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7894 if (outkind == PyUnicode_1BYTE_KIND) {
7895 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7896 Py_UCS4 maxchar = writer->maxchar;
7897 while (s < e) {
7898 ch = *s;
7899 x = mapdata_ucs2[ch];
7900 if (x > maxchar)
7901 goto Error;
7902 outdata[writer->pos] = x;
7903 writer->pos++;
7904 ++s;
7905 }
7906 break;
7907 }
7908 else if (outkind == PyUnicode_2BYTE_KIND) {
7909 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7910 while (s < e) {
7911 ch = *s;
7912 x = mapdata_ucs2[ch];
7913 if (x == 0xFFFE)
7914 goto Error;
7915 outdata[writer->pos] = x;
7916 writer->pos++;
7917 ++s;
7918 }
7919 break;
7920 }
7921 }
7922 ch = *s;
7923
7924 if (ch < maplen)
7925 x = PyUnicode_READ(mapkind, mapdata, ch);
7926 else
7927 x = 0xfffe; /* invalid value */
7928 Error:
7929 if (x == 0xfffe)
7930 {
7931 /* undefined mapping */
7932 startinpos = s-starts;
7933 endinpos = startinpos+1;
7934 if (unicode_decode_call_errorhandler_writer(
7935 errors, &errorHandler,
7936 "charmap", "character maps to <undefined>",
7937 &starts, &e, &startinpos, &endinpos, &exc, &s,
7938 writer)) {
7939 goto onError;
7940 }
7941 continue;
7942 }
7943
7944 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7945 goto onError;
7946 ++s;
7947 }
7948 Py_XDECREF(errorHandler);
7949 Py_XDECREF(exc);
7950 return 0;
7951
7952 onError:
7953 Py_XDECREF(errorHandler);
7954 Py_XDECREF(exc);
7955 return -1;
7956 }
7957
7958 static int
charmap_decode_mapping(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors,_PyUnicodeWriter * writer)7959 charmap_decode_mapping(const char *s,
7960 Py_ssize_t size,
7961 PyObject *mapping,
7962 const char *errors,
7963 _PyUnicodeWriter *writer)
7964 {
7965 const char *starts = s;
7966 const char *e;
7967 Py_ssize_t startinpos, endinpos;
7968 PyObject *errorHandler = NULL, *exc = NULL;
7969 unsigned char ch;
7970 PyObject *key, *item = NULL;
7971
7972 e = s + size;
7973
7974 while (s < e) {
7975 ch = *s;
7976
7977 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7978 key = PyLong_FromLong((long)ch);
7979 if (key == NULL)
7980 goto onError;
7981
7982 item = PyObject_GetItem(mapping, key);
7983 Py_DECREF(key);
7984 if (item == NULL) {
7985 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7986 /* No mapping found means: mapping is undefined. */
7987 PyErr_Clear();
7988 goto Undefined;
7989 } else
7990 goto onError;
7991 }
7992
7993 /* Apply mapping */
7994 if (item == Py_None)
7995 goto Undefined;
7996 if (PyLong_Check(item)) {
7997 long value = PyLong_AS_LONG(item);
7998 if (value == 0xFFFE)
7999 goto Undefined;
8000 if (value < 0 || value > MAX_UNICODE) {
8001 PyErr_Format(PyExc_TypeError,
8002 "character mapping must be in range(0x%x)",
8003 (unsigned long)MAX_UNICODE + 1);
8004 goto onError;
8005 }
8006
8007 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8008 goto onError;
8009 }
8010 else if (PyUnicode_Check(item)) {
8011 if (PyUnicode_READY(item) == -1)
8012 goto onError;
8013 if (PyUnicode_GET_LENGTH(item) == 1) {
8014 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8015 if (value == 0xFFFE)
8016 goto Undefined;
8017 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8018 goto onError;
8019 }
8020 else {
8021 writer->overallocate = 1;
8022 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8023 goto onError;
8024 }
8025 }
8026 else {
8027 /* wrong return value */
8028 PyErr_SetString(PyExc_TypeError,
8029 "character mapping must return integer, None or str");
8030 goto onError;
8031 }
8032 Py_CLEAR(item);
8033 ++s;
8034 continue;
8035
8036 Undefined:
8037 /* undefined mapping */
8038 Py_CLEAR(item);
8039 startinpos = s-starts;
8040 endinpos = startinpos+1;
8041 if (unicode_decode_call_errorhandler_writer(
8042 errors, &errorHandler,
8043 "charmap", "character maps to <undefined>",
8044 &starts, &e, &startinpos, &endinpos, &exc, &s,
8045 writer)) {
8046 goto onError;
8047 }
8048 }
8049 Py_XDECREF(errorHandler);
8050 Py_XDECREF(exc);
8051 return 0;
8052
8053 onError:
8054 Py_XDECREF(item);
8055 Py_XDECREF(errorHandler);
8056 Py_XDECREF(exc);
8057 return -1;
8058 }
8059
8060 PyObject *
PyUnicode_DecodeCharmap(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors)8061 PyUnicode_DecodeCharmap(const char *s,
8062 Py_ssize_t size,
8063 PyObject *mapping,
8064 const char *errors)
8065 {
8066 _PyUnicodeWriter writer;
8067
8068 /* Default to Latin-1 */
8069 if (mapping == NULL)
8070 return PyUnicode_DecodeLatin1(s, size, errors);
8071
8072 if (size == 0)
8073 _Py_RETURN_UNICODE_EMPTY();
8074 _PyUnicodeWriter_Init(&writer);
8075 writer.min_length = size;
8076 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8077 goto onError;
8078
8079 if (PyUnicode_CheckExact(mapping)) {
8080 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8081 goto onError;
8082 }
8083 else {
8084 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8085 goto onError;
8086 }
8087 return _PyUnicodeWriter_Finish(&writer);
8088
8089 onError:
8090 _PyUnicodeWriter_Dealloc(&writer);
8091 return NULL;
8092 }
8093
8094 /* Charmap encoding: the lookup table */
8095
8096 struct encoding_map {
8097 PyObject_HEAD
8098 unsigned char level1[32];
8099 int count2, count3;
8100 unsigned char level23[1];
8101 };
8102
8103 static PyObject*
encoding_map_size(PyObject * obj,PyObject * args)8104 encoding_map_size(PyObject *obj, PyObject* args)
8105 {
8106 struct encoding_map *map = (struct encoding_map*)obj;
8107 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
8108 128*map->count3);
8109 }
8110
8111 static PyMethodDef encoding_map_methods[] = {
8112 {"size", encoding_map_size, METH_NOARGS,
8113 PyDoc_STR("Return the size (in bytes) of this object") },
8114 { 0 }
8115 };
8116
8117 static PyTypeObject EncodingMapType = {
8118 PyVarObject_HEAD_INIT(NULL, 0)
8119 "EncodingMap", /*tp_name*/
8120 sizeof(struct encoding_map), /*tp_basicsize*/
8121 0, /*tp_itemsize*/
8122 /* methods */
8123 0, /*tp_dealloc*/
8124 0, /*tp_vectorcall_offset*/
8125 0, /*tp_getattr*/
8126 0, /*tp_setattr*/
8127 0, /*tp_as_async*/
8128 0, /*tp_repr*/
8129 0, /*tp_as_number*/
8130 0, /*tp_as_sequence*/
8131 0, /*tp_as_mapping*/
8132 0, /*tp_hash*/
8133 0, /*tp_call*/
8134 0, /*tp_str*/
8135 0, /*tp_getattro*/
8136 0, /*tp_setattro*/
8137 0, /*tp_as_buffer*/
8138 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8139 0, /*tp_doc*/
8140 0, /*tp_traverse*/
8141 0, /*tp_clear*/
8142 0, /*tp_richcompare*/
8143 0, /*tp_weaklistoffset*/
8144 0, /*tp_iter*/
8145 0, /*tp_iternext*/
8146 encoding_map_methods, /*tp_methods*/
8147 0, /*tp_members*/
8148 0, /*tp_getset*/
8149 0, /*tp_base*/
8150 0, /*tp_dict*/
8151 0, /*tp_descr_get*/
8152 0, /*tp_descr_set*/
8153 0, /*tp_dictoffset*/
8154 0, /*tp_init*/
8155 0, /*tp_alloc*/
8156 0, /*tp_new*/
8157 0, /*tp_free*/
8158 0, /*tp_is_gc*/
8159 };
8160
8161 PyObject*
PyUnicode_BuildEncodingMap(PyObject * string)8162 PyUnicode_BuildEncodingMap(PyObject* string)
8163 {
8164 PyObject *result;
8165 struct encoding_map *mresult;
8166 int i;
8167 int need_dict = 0;
8168 unsigned char level1[32];
8169 unsigned char level2[512];
8170 unsigned char *mlevel1, *mlevel2, *mlevel3;
8171 int count2 = 0, count3 = 0;
8172 int kind;
8173 void *data;
8174 Py_ssize_t length;
8175 Py_UCS4 ch;
8176
8177 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8178 PyErr_BadArgument();
8179 return NULL;
8180 }
8181 kind = PyUnicode_KIND(string);
8182 data = PyUnicode_DATA(string);
8183 length = PyUnicode_GET_LENGTH(string);
8184 length = Py_MIN(length, 256);
8185 memset(level1, 0xFF, sizeof level1);
8186 memset(level2, 0xFF, sizeof level2);
8187
8188 /* If there isn't a one-to-one mapping of NULL to \0,
8189 or if there are non-BMP characters, we need to use
8190 a mapping dictionary. */
8191 if (PyUnicode_READ(kind, data, 0) != 0)
8192 need_dict = 1;
8193 for (i = 1; i < length; i++) {
8194 int l1, l2;
8195 ch = PyUnicode_READ(kind, data, i);
8196 if (ch == 0 || ch > 0xFFFF) {
8197 need_dict = 1;
8198 break;
8199 }
8200 if (ch == 0xFFFE)
8201 /* unmapped character */
8202 continue;
8203 l1 = ch >> 11;
8204 l2 = ch >> 7;
8205 if (level1[l1] == 0xFF)
8206 level1[l1] = count2++;
8207 if (level2[l2] == 0xFF)
8208 level2[l2] = count3++;
8209 }
8210
8211 if (count2 >= 0xFF || count3 >= 0xFF)
8212 need_dict = 1;
8213
8214 if (need_dict) {
8215 PyObject *result = PyDict_New();
8216 PyObject *key, *value;
8217 if (!result)
8218 return NULL;
8219 for (i = 0; i < length; i++) {
8220 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
8221 value = PyLong_FromLong(i);
8222 if (!key || !value)
8223 goto failed1;
8224 if (PyDict_SetItem(result, key, value) == -1)
8225 goto failed1;
8226 Py_DECREF(key);
8227 Py_DECREF(value);
8228 }
8229 return result;
8230 failed1:
8231 Py_XDECREF(key);
8232 Py_XDECREF(value);
8233 Py_DECREF(result);
8234 return NULL;
8235 }
8236
8237 /* Create a three-level trie */
8238 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8239 16*count2 + 128*count3 - 1);
8240 if (!result)
8241 return PyErr_NoMemory();
8242 PyObject_Init(result, &EncodingMapType);
8243 mresult = (struct encoding_map*)result;
8244 mresult->count2 = count2;
8245 mresult->count3 = count3;
8246 mlevel1 = mresult->level1;
8247 mlevel2 = mresult->level23;
8248 mlevel3 = mresult->level23 + 16*count2;
8249 memcpy(mlevel1, level1, 32);
8250 memset(mlevel2, 0xFF, 16*count2);
8251 memset(mlevel3, 0, 128*count3);
8252 count3 = 0;
8253 for (i = 1; i < length; i++) {
8254 int o1, o2, o3, i2, i3;
8255 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8256 if (ch == 0xFFFE)
8257 /* unmapped character */
8258 continue;
8259 o1 = ch>>11;
8260 o2 = (ch>>7) & 0xF;
8261 i2 = 16*mlevel1[o1] + o2;
8262 if (mlevel2[i2] == 0xFF)
8263 mlevel2[i2] = count3++;
8264 o3 = ch & 0x7F;
8265 i3 = 128*mlevel2[i2] + o3;
8266 mlevel3[i3] = i;
8267 }
8268 return result;
8269 }
8270
8271 static int
encoding_map_lookup(Py_UCS4 c,PyObject * mapping)8272 encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8273 {
8274 struct encoding_map *map = (struct encoding_map*)mapping;
8275 int l1 = c>>11;
8276 int l2 = (c>>7) & 0xF;
8277 int l3 = c & 0x7F;
8278 int i;
8279
8280 if (c > 0xFFFF)
8281 return -1;
8282 if (c == 0)
8283 return 0;
8284 /* level 1*/
8285 i = map->level1[l1];
8286 if (i == 0xFF) {
8287 return -1;
8288 }
8289 /* level 2*/
8290 i = map->level23[16*i+l2];
8291 if (i == 0xFF) {
8292 return -1;
8293 }
8294 /* level 3 */
8295 i = map->level23[16*map->count2 + 128*i + l3];
8296 if (i == 0) {
8297 return -1;
8298 }
8299 return i;
8300 }
8301
8302 /* Lookup the character ch in the mapping. If the character
8303 can't be found, Py_None is returned (or NULL, if another
8304 error occurred). */
8305 static PyObject *
charmapencode_lookup(Py_UCS4 c,PyObject * mapping)8306 charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8307 {
8308 PyObject *w = PyLong_FromLong((long)c);
8309 PyObject *x;
8310
8311 if (w == NULL)
8312 return NULL;
8313 x = PyObject_GetItem(mapping, w);
8314 Py_DECREF(w);
8315 if (x == NULL) {
8316 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8317 /* No mapping found means: mapping is undefined. */
8318 PyErr_Clear();
8319 Py_RETURN_NONE;
8320 } else
8321 return NULL;
8322 }
8323 else if (x == Py_None)
8324 return x;
8325 else if (PyLong_Check(x)) {
8326 long value = PyLong_AS_LONG(x);
8327 if (value < 0 || value > 255) {
8328 PyErr_SetString(PyExc_TypeError,
8329 "character mapping must be in range(256)");
8330 Py_DECREF(x);
8331 return NULL;
8332 }
8333 return x;
8334 }
8335 else if (PyBytes_Check(x))
8336 return x;
8337 else {
8338 /* wrong return value */
8339 PyErr_Format(PyExc_TypeError,
8340 "character mapping must return integer, bytes or None, not %.400s",
8341 x->ob_type->tp_name);
8342 Py_DECREF(x);
8343 return NULL;
8344 }
8345 }
8346
8347 static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)8348 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8349 {
8350 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8351 /* exponentially overallocate to minimize reallocations */
8352 if (requiredsize < 2*outsize)
8353 requiredsize = 2*outsize;
8354 if (_PyBytes_Resize(outobj, requiredsize))
8355 return -1;
8356 return 0;
8357 }
8358
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
8362 /* lookup the character, put the result in the output string and adjust
8363 various state variables. Resize the output bytes object if not enough
8364 space is available. Return a new reference to the object that
8365 was put in the output buffer, or Py_None, if the mapping was undefined
8366 (in which case no character was written) or NULL, if a
8367 reallocation error occurred. The caller must decref the result */
8368 static charmapencode_result
charmapencode_output(Py_UCS4 c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)8369 charmapencode_output(Py_UCS4 c, PyObject *mapping,
8370 PyObject **outobj, Py_ssize_t *outpos)
8371 {
8372 PyObject *rep;
8373 char *outstart;
8374 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8375
8376 if (Py_TYPE(mapping) == &EncodingMapType) {
8377 int res = encoding_map_lookup(c, mapping);
8378 Py_ssize_t requiredsize = *outpos+1;
8379 if (res == -1)
8380 return enc_FAILED;
8381 if (outsize<requiredsize)
8382 if (charmapencode_resize(outobj, outpos, requiredsize))
8383 return enc_EXCEPTION;
8384 outstart = PyBytes_AS_STRING(*outobj);
8385 outstart[(*outpos)++] = (char)res;
8386 return enc_SUCCESS;
8387 }
8388
8389 rep = charmapencode_lookup(c, mapping);
8390 if (rep==NULL)
8391 return enc_EXCEPTION;
8392 else if (rep==Py_None) {
8393 Py_DECREF(rep);
8394 return enc_FAILED;
8395 } else {
8396 if (PyLong_Check(rep)) {
8397 Py_ssize_t requiredsize = *outpos+1;
8398 if (outsize<requiredsize)
8399 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8400 Py_DECREF(rep);
8401 return enc_EXCEPTION;
8402 }
8403 outstart = PyBytes_AS_STRING(*outobj);
8404 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8405 }
8406 else {
8407 const char *repchars = PyBytes_AS_STRING(rep);
8408 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8409 Py_ssize_t requiredsize = *outpos+repsize;
8410 if (outsize<requiredsize)
8411 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8412 Py_DECREF(rep);
8413 return enc_EXCEPTION;
8414 }
8415 outstart = PyBytes_AS_STRING(*outobj);
8416 memcpy(outstart + *outpos, repchars, repsize);
8417 *outpos += repsize;
8418 }
8419 }
8420 Py_DECREF(rep);
8421 return enc_SUCCESS;
8422 }
8423
8424 /* handle an error in PyUnicode_EncodeCharmap
8425 Return 0 on success, -1 on error */
8426 static int
charmap_encoding_error(PyObject * unicode,Py_ssize_t * inpos,PyObject * mapping,PyObject ** exceptionObject,_Py_error_handler * error_handler,PyObject ** error_handler_obj,const char * errors,PyObject ** res,Py_ssize_t * respos)8427 charmap_encoding_error(
8428 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8429 PyObject **exceptionObject,
8430 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8431 PyObject **res, Py_ssize_t *respos)
8432 {
8433 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8434 Py_ssize_t size, repsize;
8435 Py_ssize_t newpos;
8436 enum PyUnicode_Kind kind;
8437 void *data;
8438 Py_ssize_t index;
8439 /* startpos for collecting unencodable chars */
8440 Py_ssize_t collstartpos = *inpos;
8441 Py_ssize_t collendpos = *inpos+1;
8442 Py_ssize_t collpos;
8443 const char *encoding = "charmap";
8444 const char *reason = "character maps to <undefined>";
8445 charmapencode_result x;
8446 Py_UCS4 ch;
8447 int val;
8448
8449 if (PyUnicode_READY(unicode) == -1)
8450 return -1;
8451 size = PyUnicode_GET_LENGTH(unicode);
8452 /* find all unencodable characters */
8453 while (collendpos < size) {
8454 PyObject *rep;
8455 if (Py_TYPE(mapping) == &EncodingMapType) {
8456 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8457 val = encoding_map_lookup(ch, mapping);
8458 if (val != -1)
8459 break;
8460 ++collendpos;
8461 continue;
8462 }
8463
8464 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8465 rep = charmapencode_lookup(ch, mapping);
8466 if (rep==NULL)
8467 return -1;
8468 else if (rep!=Py_None) {
8469 Py_DECREF(rep);
8470 break;
8471 }
8472 Py_DECREF(rep);
8473 ++collendpos;
8474 }
8475 /* cache callback name lookup
8476 * (if not done yet, i.e. it's the first error) */
8477 if (*error_handler == _Py_ERROR_UNKNOWN)
8478 *error_handler = _Py_GetErrorHandler(errors);
8479
8480 switch (*error_handler) {
8481 case _Py_ERROR_STRICT:
8482 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8483 return -1;
8484
8485 case _Py_ERROR_REPLACE:
8486 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8487 x = charmapencode_output('?', mapping, res, respos);
8488 if (x==enc_EXCEPTION) {
8489 return -1;
8490 }
8491 else if (x==enc_FAILED) {
8492 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8493 return -1;
8494 }
8495 }
8496 /* fall through */
8497 case _Py_ERROR_IGNORE:
8498 *inpos = collendpos;
8499 break;
8500
8501 case _Py_ERROR_XMLCHARREFREPLACE:
8502 /* generate replacement (temporarily (mis)uses p) */
8503 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8504 char buffer[2+29+1+1];
8505 char *cp;
8506 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8507 for (cp = buffer; *cp; ++cp) {
8508 x = charmapencode_output(*cp, mapping, res, respos);
8509 if (x==enc_EXCEPTION)
8510 return -1;
8511 else if (x==enc_FAILED) {
8512 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8513 return -1;
8514 }
8515 }
8516 }
8517 *inpos = collendpos;
8518 break;
8519
8520 default:
8521 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8522 encoding, reason, unicode, exceptionObject,
8523 collstartpos, collendpos, &newpos);
8524 if (repunicode == NULL)
8525 return -1;
8526 if (PyBytes_Check(repunicode)) {
8527 /* Directly copy bytes result to output. */
8528 Py_ssize_t outsize = PyBytes_Size(*res);
8529 Py_ssize_t requiredsize;
8530 repsize = PyBytes_Size(repunicode);
8531 requiredsize = *respos + repsize;
8532 if (requiredsize > outsize)
8533 /* Make room for all additional bytes. */
8534 if (charmapencode_resize(res, respos, requiredsize)) {
8535 Py_DECREF(repunicode);
8536 return -1;
8537 }
8538 memcpy(PyBytes_AsString(*res) + *respos,
8539 PyBytes_AsString(repunicode), repsize);
8540 *respos += repsize;
8541 *inpos = newpos;
8542 Py_DECREF(repunicode);
8543 break;
8544 }
8545 /* generate replacement */
8546 if (PyUnicode_READY(repunicode) == -1) {
8547 Py_DECREF(repunicode);
8548 return -1;
8549 }
8550 repsize = PyUnicode_GET_LENGTH(repunicode);
8551 data = PyUnicode_DATA(repunicode);
8552 kind = PyUnicode_KIND(repunicode);
8553 for (index = 0; index < repsize; index++) {
8554 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8555 x = charmapencode_output(repch, mapping, res, respos);
8556 if (x==enc_EXCEPTION) {
8557 Py_DECREF(repunicode);
8558 return -1;
8559 }
8560 else if (x==enc_FAILED) {
8561 Py_DECREF(repunicode);
8562 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8563 return -1;
8564 }
8565 }
8566 *inpos = newpos;
8567 Py_DECREF(repunicode);
8568 }
8569 return 0;
8570 }
8571
8572 PyObject *
_PyUnicode_EncodeCharmap(PyObject * unicode,PyObject * mapping,const char * errors)8573 _PyUnicode_EncodeCharmap(PyObject *unicode,
8574 PyObject *mapping,
8575 const char *errors)
8576 {
8577 /* output object */
8578 PyObject *res = NULL;
8579 /* current input position */
8580 Py_ssize_t inpos = 0;
8581 Py_ssize_t size;
8582 /* current output position */
8583 Py_ssize_t respos = 0;
8584 PyObject *error_handler_obj = NULL;
8585 PyObject *exc = NULL;
8586 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8587 void *data;
8588 int kind;
8589
8590 if (PyUnicode_READY(unicode) == -1)
8591 return NULL;
8592 size = PyUnicode_GET_LENGTH(unicode);
8593 data = PyUnicode_DATA(unicode);
8594 kind = PyUnicode_KIND(unicode);
8595
8596 /* Default to Latin-1 */
8597 if (mapping == NULL)
8598 return unicode_encode_ucs1(unicode, errors, 256);
8599
8600 /* allocate enough for a simple encoding without
8601 replacements, if we need more, we'll resize */
8602 res = PyBytes_FromStringAndSize(NULL, size);
8603 if (res == NULL)
8604 goto onError;
8605 if (size == 0)
8606 return res;
8607
8608 while (inpos<size) {
8609 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8610 /* try to encode it */
8611 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8612 if (x==enc_EXCEPTION) /* error */
8613 goto onError;
8614 if (x==enc_FAILED) { /* unencodable character */
8615 if (charmap_encoding_error(unicode, &inpos, mapping,
8616 &exc,
8617 &error_handler, &error_handler_obj, errors,
8618 &res, &respos)) {
8619 goto onError;
8620 }
8621 }
8622 else
8623 /* done with this character => adjust input position */
8624 ++inpos;
8625 }
8626
8627 /* Resize if we allocated to much */
8628 if (respos<PyBytes_GET_SIZE(res))
8629 if (_PyBytes_Resize(&res, respos) < 0)
8630 goto onError;
8631
8632 Py_XDECREF(exc);
8633 Py_XDECREF(error_handler_obj);
8634 return res;
8635
8636 onError:
8637 Py_XDECREF(res);
8638 Py_XDECREF(exc);
8639 Py_XDECREF(error_handler_obj);
8640 return NULL;
8641 }
8642
8643 /* Deprecated */
8644 PyObject *
PyUnicode_EncodeCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)8645 PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8646 Py_ssize_t size,
8647 PyObject *mapping,
8648 const char *errors)
8649 {
8650 PyObject *result;
8651 PyObject *unicode = PyUnicode_FromWideChar(p, size);
8652 if (unicode == NULL)
8653 return NULL;
8654 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8655 Py_DECREF(unicode);
8656 return result;
8657 }
8658
8659 PyObject *
PyUnicode_AsCharmapString(PyObject * unicode,PyObject * mapping)8660 PyUnicode_AsCharmapString(PyObject *unicode,
8661 PyObject *mapping)
8662 {
8663 if (!PyUnicode_Check(unicode) || mapping == NULL) {
8664 PyErr_BadArgument();
8665 return NULL;
8666 }
8667 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8668 }
8669
8670 /* create or adjust a UnicodeTranslateError */
8671 static void
make_translate_exception(PyObject ** exceptionObject,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)8672 make_translate_exception(PyObject **exceptionObject,
8673 PyObject *unicode,
8674 Py_ssize_t startpos, Py_ssize_t endpos,
8675 const char *reason)
8676 {
8677 if (*exceptionObject == NULL) {
8678 *exceptionObject = _PyUnicodeTranslateError_Create(
8679 unicode, startpos, endpos, reason);
8680 }
8681 else {
8682 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8683 goto onError;
8684 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8685 goto onError;
8686 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8687 goto onError;
8688 return;
8689 onError:
8690 Py_CLEAR(*exceptionObject);
8691 }
8692 }
8693
8694 /* error handling callback helper:
8695 build arguments, call the callback and check the arguments,
8696 put the result into newpos and return the replacement string, which
8697 has to be freed by the caller */
8698 static PyObject *
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)8699 unicode_translate_call_errorhandler(const char *errors,
8700 PyObject **errorHandler,
8701 const char *reason,
8702 PyObject *unicode, PyObject **exceptionObject,
8703 Py_ssize_t startpos, Py_ssize_t endpos,
8704 Py_ssize_t *newpos)
8705 {
8706 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
8707
8708 Py_ssize_t i_newpos;
8709 PyObject *restuple;
8710 PyObject *resunicode;
8711
8712 if (*errorHandler == NULL) {
8713 *errorHandler = PyCodec_LookupError(errors);
8714 if (*errorHandler == NULL)
8715 return NULL;
8716 }
8717
8718 make_translate_exception(exceptionObject,
8719 unicode, startpos, endpos, reason);
8720 if (*exceptionObject == NULL)
8721 return NULL;
8722
8723 restuple = PyObject_CallFunctionObjArgs(
8724 *errorHandler, *exceptionObject, NULL);
8725 if (restuple == NULL)
8726 return NULL;
8727 if (!PyTuple_Check(restuple)) {
8728 PyErr_SetString(PyExc_TypeError, &argparse[3]);
8729 Py_DECREF(restuple);
8730 return NULL;
8731 }
8732 if (!PyArg_ParseTuple(restuple, argparse,
8733 &resunicode, &i_newpos)) {
8734 Py_DECREF(restuple);
8735 return NULL;
8736 }
8737 if (i_newpos<0)
8738 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8739 else
8740 *newpos = i_newpos;
8741 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8742 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8743 Py_DECREF(restuple);
8744 return NULL;
8745 }
8746 Py_INCREF(resunicode);
8747 Py_DECREF(restuple);
8748 return resunicode;
8749 }
8750
8751 /* Lookup the character ch in the mapping and put the result in result,
8752 which must be decrefed by the caller.
8753 Return 0 on success, -1 on error */
8754 static int
charmaptranslate_lookup(Py_UCS4 c,PyObject * mapping,PyObject ** result)8755 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8756 {
8757 PyObject *w = PyLong_FromLong((long)c);
8758 PyObject *x;
8759
8760 if (w == NULL)
8761 return -1;
8762 x = PyObject_GetItem(mapping, w);
8763 Py_DECREF(w);
8764 if (x == NULL) {
8765 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8766 /* No mapping found means: use 1:1 mapping. */
8767 PyErr_Clear();
8768 *result = NULL;
8769 return 0;
8770 } else
8771 return -1;
8772 }
8773 else if (x == Py_None) {
8774 *result = x;
8775 return 0;
8776 }
8777 else if (PyLong_Check(x)) {
8778 long value = PyLong_AS_LONG(x);
8779 if (value < 0 || value > MAX_UNICODE) {
8780 PyErr_Format(PyExc_ValueError,
8781 "character mapping must be in range(0x%x)",
8782 MAX_UNICODE+1);
8783 Py_DECREF(x);
8784 return -1;
8785 }
8786 *result = x;
8787 return 0;
8788 }
8789 else if (PyUnicode_Check(x)) {
8790 *result = x;
8791 return 0;
8792 }
8793 else {
8794 /* wrong return value */
8795 PyErr_SetString(PyExc_TypeError,
8796 "character mapping must return integer, None or str");
8797 Py_DECREF(x);
8798 return -1;
8799 }
8800 }
8801
8802 /* lookup the character, write the result into the writer.
8803 Return 1 if the result was written into the writer, return 0 if the mapping
8804 was undefined, raise an exception return -1 on error. */
8805 static int
charmaptranslate_output(Py_UCS4 ch,PyObject * mapping,_PyUnicodeWriter * writer)8806 charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8807 _PyUnicodeWriter *writer)
8808 {
8809 PyObject *item;
8810
8811 if (charmaptranslate_lookup(ch, mapping, &item))
8812 return -1;
8813
8814 if (item == NULL) {
8815 /* not found => default to 1:1 mapping */
8816 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8817 return -1;
8818 }
8819 return 1;
8820 }
8821
8822 if (item == Py_None) {
8823 Py_DECREF(item);
8824 return 0;
8825 }
8826
8827 if (PyLong_Check(item)) {
8828 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8829 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8830 used it */
8831 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8832 Py_DECREF(item);
8833 return -1;
8834 }
8835 Py_DECREF(item);
8836 return 1;
8837 }
8838
8839 if (!PyUnicode_Check(item)) {
8840 Py_DECREF(item);
8841 return -1;
8842 }
8843
8844 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8845 Py_DECREF(item);
8846 return -1;
8847 }
8848
8849 Py_DECREF(item);
8850 return 1;
8851 }
8852
8853 static int
unicode_fast_translate_lookup(PyObject * mapping,Py_UCS1 ch,Py_UCS1 * translate)8854 unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8855 Py_UCS1 *translate)
8856 {
8857 PyObject *item = NULL;
8858 int ret = 0;
8859
8860 if (charmaptranslate_lookup(ch, mapping, &item)) {
8861 return -1;
8862 }
8863
8864 if (item == Py_None) {
8865 /* deletion */
8866 translate[ch] = 0xfe;
8867 }
8868 else if (item == NULL) {
8869 /* not found => default to 1:1 mapping */
8870 translate[ch] = ch;
8871 return 1;
8872 }
8873 else if (PyLong_Check(item)) {
8874 long replace = PyLong_AS_LONG(item);
8875 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8876 used it */
8877 if (127 < replace) {
8878 /* invalid character or character outside ASCII:
8879 skip the fast translate */
8880 goto exit;
8881 }
8882 translate[ch] = (Py_UCS1)replace;
8883 }
8884 else if (PyUnicode_Check(item)) {
8885 Py_UCS4 replace;
8886
8887 if (PyUnicode_READY(item) == -1) {
8888 Py_DECREF(item);
8889 return -1;
8890 }
8891 if (PyUnicode_GET_LENGTH(item) != 1)
8892 goto exit;
8893
8894 replace = PyUnicode_READ_CHAR(item, 0);
8895 if (replace > 127)
8896 goto exit;
8897 translate[ch] = (Py_UCS1)replace;
8898 }
8899 else {
8900 /* not None, NULL, long or unicode */
8901 goto exit;
8902 }
8903 ret = 1;
8904
8905 exit:
8906 Py_DECREF(item);
8907 return ret;
8908 }
8909
8910 /* Fast path for ascii => ascii translation. Return 1 if the whole string
8911 was translated into writer, return 0 if the input string was partially
8912 translated into writer, raise an exception and return -1 on error. */
8913 static int
unicode_fast_translate(PyObject * input,PyObject * mapping,_PyUnicodeWriter * writer,int ignore,Py_ssize_t * input_pos)8914 unicode_fast_translate(PyObject *input, PyObject *mapping,
8915 _PyUnicodeWriter *writer, int ignore,
8916 Py_ssize_t *input_pos)
8917 {
8918 Py_UCS1 ascii_table[128], ch, ch2;
8919 Py_ssize_t len;
8920 Py_UCS1 *in, *end, *out;
8921 int res = 0;
8922
8923 len = PyUnicode_GET_LENGTH(input);
8924
8925 memset(ascii_table, 0xff, 128);
8926
8927 in = PyUnicode_1BYTE_DATA(input);
8928 end = in + len;
8929
8930 assert(PyUnicode_IS_ASCII(writer->buffer));
8931 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8932 out = PyUnicode_1BYTE_DATA(writer->buffer);
8933
8934 for (; in < end; in++) {
8935 ch = *in;
8936 ch2 = ascii_table[ch];
8937 if (ch2 == 0xff) {
8938 int translate = unicode_fast_translate_lookup(mapping, ch,
8939 ascii_table);
8940 if (translate < 0)
8941 return -1;
8942 if (translate == 0)
8943 goto exit;
8944 ch2 = ascii_table[ch];
8945 }
8946 if (ch2 == 0xfe) {
8947 if (ignore)
8948 continue;
8949 goto exit;
8950 }
8951 assert(ch2 < 128);
8952 *out = ch2;
8953 out++;
8954 }
8955 res = 1;
8956
8957 exit:
8958 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8959 *input_pos = in - PyUnicode_1BYTE_DATA(input);
8960 return res;
8961 }
8962
8963 static PyObject *
_PyUnicode_TranslateCharmap(PyObject * input,PyObject * mapping,const char * errors)8964 _PyUnicode_TranslateCharmap(PyObject *input,
8965 PyObject *mapping,
8966 const char *errors)
8967 {
8968 /* input object */
8969 char *data;
8970 Py_ssize_t size, i;
8971 int kind;
8972 /* output buffer */
8973 _PyUnicodeWriter writer;
8974 /* error handler */
8975 const char *reason = "character maps to <undefined>";
8976 PyObject *errorHandler = NULL;
8977 PyObject *exc = NULL;
8978 int ignore;
8979 int res;
8980
8981 if (mapping == NULL) {
8982 PyErr_BadArgument();
8983 return NULL;
8984 }
8985
8986 if (PyUnicode_READY(input) == -1)
8987 return NULL;
8988 data = (char*)PyUnicode_DATA(input);
8989 kind = PyUnicode_KIND(input);
8990 size = PyUnicode_GET_LENGTH(input);
8991
8992 if (size == 0)
8993 return PyUnicode_FromObject(input);
8994
8995 /* allocate enough for a simple 1:1 translation without
8996 replacements, if we need more, we'll resize */
8997 _PyUnicodeWriter_Init(&writer);
8998 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
8999 goto onError;
9000
9001 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9002
9003 if (PyUnicode_READY(input) == -1)
9004 return NULL;
9005 if (PyUnicode_IS_ASCII(input)) {
9006 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9007 if (res < 0) {
9008 _PyUnicodeWriter_Dealloc(&writer);
9009 return NULL;
9010 }
9011 if (res == 1)
9012 return _PyUnicodeWriter_Finish(&writer);
9013 }
9014 else {
9015 i = 0;
9016 }
9017
9018 while (i<size) {
9019 /* try to encode it */
9020 int translate;
9021 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9022 Py_ssize_t newpos;
9023 /* startpos for collecting untranslatable chars */
9024 Py_ssize_t collstart;
9025 Py_ssize_t collend;
9026 Py_UCS4 ch;
9027
9028 ch = PyUnicode_READ(kind, data, i);
9029 translate = charmaptranslate_output(ch, mapping, &writer);
9030 if (translate < 0)
9031 goto onError;
9032
9033 if (translate != 0) {
9034 /* it worked => adjust input pointer */
9035 ++i;
9036 continue;
9037 }
9038
9039 /* untranslatable character */
9040 collstart = i;
9041 collend = i+1;
9042
9043 /* find all untranslatable characters */
9044 while (collend < size) {
9045 PyObject *x;
9046 ch = PyUnicode_READ(kind, data, collend);
9047 if (charmaptranslate_lookup(ch, mapping, &x))
9048 goto onError;
9049 Py_XDECREF(x);
9050 if (x != Py_None)
9051 break;
9052 ++collend;
9053 }
9054
9055 if (ignore) {
9056 i = collend;
9057 }
9058 else {
9059 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9060 reason, input, &exc,
9061 collstart, collend, &newpos);
9062 if (repunicode == NULL)
9063 goto onError;
9064 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9065 Py_DECREF(repunicode);
9066 goto onError;
9067 }
9068 Py_DECREF(repunicode);
9069 i = newpos;
9070 }
9071 }
9072 Py_XDECREF(exc);
9073 Py_XDECREF(errorHandler);
9074 return _PyUnicodeWriter_Finish(&writer);
9075
9076 onError:
9077 _PyUnicodeWriter_Dealloc(&writer);
9078 Py_XDECREF(exc);
9079 Py_XDECREF(errorHandler);
9080 return NULL;
9081 }
9082
9083 /* Deprecated. Use PyUnicode_Translate instead. */
9084 PyObject *
PyUnicode_TranslateCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)9085 PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9086 Py_ssize_t size,
9087 PyObject *mapping,
9088 const char *errors)
9089 {
9090 PyObject *result;
9091 PyObject *unicode = PyUnicode_FromWideChar(p, size);
9092 if (!unicode)
9093 return NULL;
9094 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9095 Py_DECREF(unicode);
9096 return result;
9097 }
9098
9099 PyObject *
PyUnicode_Translate(PyObject * str,PyObject * mapping,const char * errors)9100 PyUnicode_Translate(PyObject *str,
9101 PyObject *mapping,
9102 const char *errors)
9103 {
9104 if (ensure_unicode(str) < 0)
9105 return NULL;
9106 return _PyUnicode_TranslateCharmap(str, mapping, errors);
9107 }
9108
9109 PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject * unicode)9110 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9111 {
9112 if (!PyUnicode_Check(unicode)) {
9113 PyErr_BadInternalCall();
9114 return NULL;
9115 }
9116 if (PyUnicode_READY(unicode) == -1)
9117 return NULL;
9118 if (PyUnicode_IS_ASCII(unicode)) {
9119 /* If the string is already ASCII, just return the same string */
9120 Py_INCREF(unicode);
9121 return unicode;
9122 }
9123
9124 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9125 PyObject *result = PyUnicode_New(len, 127);
9126 if (result == NULL) {
9127 return NULL;
9128 }
9129
9130 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9131 int kind = PyUnicode_KIND(unicode);
9132 const void *data = PyUnicode_DATA(unicode);
9133 Py_ssize_t i;
9134 for (i = 0; i < len; ++i) {
9135 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9136 if (ch < 127) {
9137 out[i] = ch;
9138 }
9139 else if (Py_UNICODE_ISSPACE(ch)) {
9140 out[i] = ' ';
9141 }
9142 else {
9143 int decimal = Py_UNICODE_TODECIMAL(ch);
9144 if (decimal < 0) {
9145 out[i] = '?';
9146 out[i+1] = '\0';
9147 _PyUnicode_LENGTH(result) = i + 1;
9148 break;
9149 }
9150 out[i] = '0' + decimal;
9151 }
9152 }
9153
9154 assert(_PyUnicode_CheckConsistency(result, 1));
9155 return result;
9156 }
9157
9158 PyObject *
PyUnicode_TransformDecimalToASCII(Py_UNICODE * s,Py_ssize_t length)9159 PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9160 Py_ssize_t length)
9161 {
9162 PyObject *decimal;
9163 Py_ssize_t i;
9164 Py_UCS4 maxchar;
9165 enum PyUnicode_Kind kind;
9166 void *data;
9167
9168 maxchar = 127;
9169 for (i = 0; i < length; i++) {
9170 Py_UCS4 ch = s[i];
9171 if (ch > 127) {
9172 int decimal = Py_UNICODE_TODECIMAL(ch);
9173 if (decimal >= 0)
9174 ch = '0' + decimal;
9175 maxchar = Py_MAX(maxchar, ch);
9176 }
9177 }
9178
9179 /* Copy to a new string */
9180 decimal = PyUnicode_New(length, maxchar);
9181 if (decimal == NULL)
9182 return decimal;
9183 kind = PyUnicode_KIND(decimal);
9184 data = PyUnicode_DATA(decimal);
9185 /* Iterate over code points */
9186 for (i = 0; i < length; i++) {
9187 Py_UCS4 ch = s[i];
9188 if (ch > 127) {
9189 int decimal = Py_UNICODE_TODECIMAL(ch);
9190 if (decimal >= 0)
9191 ch = '0' + decimal;
9192 }
9193 PyUnicode_WRITE(kind, data, i, ch);
9194 }
9195 return unicode_result(decimal);
9196 }
9197 /* --- Decimal Encoder ---------------------------------------------------- */
9198
9199 int
PyUnicode_EncodeDecimal(Py_UNICODE * s,Py_ssize_t length,char * output,const char * errors)9200 PyUnicode_EncodeDecimal(Py_UNICODE *s,
9201 Py_ssize_t length,
9202 char *output,
9203 const char *errors)
9204 {
9205 PyObject *unicode;
9206 Py_ssize_t i;
9207 enum PyUnicode_Kind kind;
9208 void *data;
9209
9210 if (output == NULL) {
9211 PyErr_BadArgument();
9212 return -1;
9213 }
9214
9215 unicode = PyUnicode_FromWideChar(s, length);
9216 if (unicode == NULL)
9217 return -1;
9218
9219 kind = PyUnicode_KIND(unicode);
9220 data = PyUnicode_DATA(unicode);
9221
9222 for (i=0; i < length; ) {
9223 PyObject *exc;
9224 Py_UCS4 ch;
9225 int decimal;
9226 Py_ssize_t startpos;
9227
9228 ch = PyUnicode_READ(kind, data, i);
9229
9230 if (Py_UNICODE_ISSPACE(ch)) {
9231 *output++ = ' ';
9232 i++;
9233 continue;
9234 }
9235 decimal = Py_UNICODE_TODECIMAL(ch);
9236 if (decimal >= 0) {
9237 *output++ = '0' + decimal;
9238 i++;
9239 continue;
9240 }
9241 if (0 < ch && ch < 256) {
9242 *output++ = (char)ch;
9243 i++;
9244 continue;
9245 }
9246
9247 startpos = i;
9248 exc = NULL;
9249 raise_encode_exception(&exc, "decimal", unicode,
9250 startpos, startpos+1,
9251 "invalid decimal Unicode string");
9252 Py_XDECREF(exc);
9253 Py_DECREF(unicode);
9254 return -1;
9255 }
9256 /* 0-terminate the output string */
9257 *output++ = '\0';
9258 Py_DECREF(unicode);
9259 return 0;
9260 }
9261
9262 /* --- Helpers ------------------------------------------------------------ */
9263
/* Helper macro to fix up start/end slice values in place.

   `start` and `end` must be modifiable lvalues; `len` is the length of the
   sequence being sliced.  An `end` greater than `len` is clipped to `len`.
   Negative values are taken relative to `len` and clipped at 0.  A positive
   `start` is left unchanged, even if it is greater than `len`.

   The body is wrapped in do { } while (0) so that the macro behaves as one
   statement (safe in unbraced if/else), and every argument is parenthesized
   so that arbitrary expressions may be passed as `len`.  Arguments are
   evaluated more than once. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9278
9279 static Py_ssize_t
any_find_slice(PyObject * s1,PyObject * s2,Py_ssize_t start,Py_ssize_t end,int direction)9280 any_find_slice(PyObject* s1, PyObject* s2,
9281 Py_ssize_t start,
9282 Py_ssize_t end,
9283 int direction)
9284 {
9285 int kind1, kind2;
9286 void *buf1, *buf2;
9287 Py_ssize_t len1, len2, result;
9288
9289 kind1 = PyUnicode_KIND(s1);
9290 kind2 = PyUnicode_KIND(s2);
9291 if (kind1 < kind2)
9292 return -1;
9293
9294 len1 = PyUnicode_GET_LENGTH(s1);
9295 len2 = PyUnicode_GET_LENGTH(s2);
9296 ADJUST_INDICES(start, end, len1);
9297 if (end - start < len2)
9298 return -1;
9299
9300 buf1 = PyUnicode_DATA(s1);
9301 buf2 = PyUnicode_DATA(s2);
9302 if (len2 == 1) {
9303 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9304 result = findchar((const char *)buf1 + kind1*start,
9305 kind1, end - start, ch, direction);
9306 if (result == -1)
9307 return -1;
9308 else
9309 return start + result;
9310 }
9311
9312 if (kind2 != kind1) {
9313 buf2 = _PyUnicode_AsKind(s2, kind1);
9314 if (!buf2)
9315 return -2;
9316 }
9317
9318 if (direction > 0) {
9319 switch (kind1) {
9320 case PyUnicode_1BYTE_KIND:
9321 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9322 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9323 else
9324 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9325 break;
9326 case PyUnicode_2BYTE_KIND:
9327 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9328 break;
9329 case PyUnicode_4BYTE_KIND:
9330 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9331 break;
9332 default:
9333 Py_UNREACHABLE();
9334 }
9335 }
9336 else {
9337 switch (kind1) {
9338 case PyUnicode_1BYTE_KIND:
9339 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9340 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9341 else
9342 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9343 break;
9344 case PyUnicode_2BYTE_KIND:
9345 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9346 break;
9347 case PyUnicode_4BYTE_KIND:
9348 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9349 break;
9350 default:
9351 Py_UNREACHABLE();
9352 }
9353 }
9354
9355 if (kind2 != kind1)
9356 PyMem_Free(buf2);
9357
9358 return result;
9359 }
9360
9361 /* _PyUnicode_InsertThousandsGrouping() helper functions */
9362 #include "stringlib/localeutil.h"
9363
9364 /**
9365 * InsertThousandsGrouping:
9366 * @writer: Unicode writer.
9367 * @n_buffer: Number of characters in @buffer.
9368 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9369 * @d_pos: Start of digits string.
9370 * @n_digits: The number of digits in the string, in which we want
9371 * to put the grouping chars.
9372 * @min_width: The minimum width of the digits in the output string.
9373 * Output will be zero-padded on the left to fill.
9374 * @grouping: see definition in localeconv().
9375 * @thousands_sep: see definition in localeconv().
9376 *
9377 * There are 2 modes: counting and filling. If @writer is NULL,
9378 * we are in counting mode, else filling mode.
9379 * If counting, the required buffer size is returned.
9380 * If filling, we know the buffer will be large enough, so we don't
9381 * need to pass in the buffer size.
9382 * Inserts thousand grouping characters (as defined by grouping and
9383 * thousands_sep) into @writer.
9384 *
9385 * Return value: -1 on error, number of characters otherwise.
9386 **/
9387 Py_ssize_t
_PyUnicode_InsertThousandsGrouping(_PyUnicodeWriter * writer,Py_ssize_t n_buffer,PyObject * digits,Py_ssize_t d_pos,Py_ssize_t n_digits,Py_ssize_t min_width,const char * grouping,PyObject * thousands_sep,Py_UCS4 * maxchar)9388 _PyUnicode_InsertThousandsGrouping(
9389 _PyUnicodeWriter *writer,
9390 Py_ssize_t n_buffer,
9391 PyObject *digits,
9392 Py_ssize_t d_pos,
9393 Py_ssize_t n_digits,
9394 Py_ssize_t min_width,
9395 const char *grouping,
9396 PyObject *thousands_sep,
9397 Py_UCS4 *maxchar)
9398 {
9399 min_width = Py_MAX(0, min_width);
9400 if (writer) {
9401 assert(digits != NULL);
9402 assert(maxchar == NULL);
9403 }
9404 else {
9405 assert(digits == NULL);
9406 assert(maxchar != NULL);
9407 }
9408 assert(0 <= d_pos);
9409 assert(0 <= n_digits);
9410 assert(grouping != NULL);
9411
9412 if (digits != NULL) {
9413 if (PyUnicode_READY(digits) == -1) {
9414 return -1;
9415 }
9416 }
9417 if (PyUnicode_READY(thousands_sep) == -1) {
9418 return -1;
9419 }
9420
9421 Py_ssize_t count = 0;
9422 Py_ssize_t n_zeros;
9423 int loop_broken = 0;
9424 int use_separator = 0; /* First time through, don't append the
9425 separator. They only go between
9426 groups. */
9427 Py_ssize_t buffer_pos;
9428 Py_ssize_t digits_pos;
9429 Py_ssize_t len;
9430 Py_ssize_t n_chars;
9431 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9432 be looked at */
9433 /* A generator that returns all of the grouping widths, until it
9434 returns 0. */
9435 GroupGenerator groupgen;
9436 GroupGenerator_init(&groupgen, grouping);
9437 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9438
9439 /* if digits are not grouped, thousands separator
9440 should be an empty string */
9441 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9442
9443 digits_pos = d_pos + n_digits;
9444 if (writer) {
9445 buffer_pos = writer->pos + n_buffer;
9446 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9447 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
9448 }
9449 else {
9450 buffer_pos = n_buffer;
9451 }
9452
9453 if (!writer) {
9454 *maxchar = 127;
9455 }
9456
9457 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9458 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9459 n_zeros = Py_MAX(0, len - remaining);
9460 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9461
9462 /* Use n_zero zero's and n_chars chars */
9463
9464 /* Count only, don't do anything. */
9465 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9466
9467 /* Copy into the writer. */
9468 InsertThousandsGrouping_fill(writer, &buffer_pos,
9469 digits, &digits_pos,
9470 n_chars, n_zeros,
9471 use_separator ? thousands_sep : NULL,
9472 thousands_sep_len, maxchar);
9473
9474 /* Use a separator next time. */
9475 use_separator = 1;
9476
9477 remaining -= n_chars;
9478 min_width -= len;
9479
9480 if (remaining <= 0 && min_width <= 0) {
9481 loop_broken = 1;
9482 break;
9483 }
9484 min_width -= thousands_sep_len;
9485 }
9486 if (!loop_broken) {
9487 /* We left the loop without using a break statement. */
9488
9489 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9490 n_zeros = Py_MAX(0, len - remaining);
9491 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9492
9493 /* Use n_zero zero's and n_chars chars */
9494 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9495
9496 /* Copy into the writer. */
9497 InsertThousandsGrouping_fill(writer, &buffer_pos,
9498 digits, &digits_pos,
9499 n_chars, n_zeros,
9500 use_separator ? thousands_sep : NULL,
9501 thousands_sep_len, maxchar);
9502 }
9503 return count;
9504 }
9505
9506
9507 Py_ssize_t
PyUnicode_Count(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)9508 PyUnicode_Count(PyObject *str,
9509 PyObject *substr,
9510 Py_ssize_t start,
9511 Py_ssize_t end)
9512 {
9513 Py_ssize_t result;
9514 int kind1, kind2;
9515 void *buf1 = NULL, *buf2 = NULL;
9516 Py_ssize_t len1, len2;
9517
9518 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9519 return -1;
9520
9521 kind1 = PyUnicode_KIND(str);
9522 kind2 = PyUnicode_KIND(substr);
9523 if (kind1 < kind2)
9524 return 0;
9525
9526 len1 = PyUnicode_GET_LENGTH(str);
9527 len2 = PyUnicode_GET_LENGTH(substr);
9528 ADJUST_INDICES(start, end, len1);
9529 if (end - start < len2)
9530 return 0;
9531
9532 buf1 = PyUnicode_DATA(str);
9533 buf2 = PyUnicode_DATA(substr);
9534 if (kind2 != kind1) {
9535 buf2 = _PyUnicode_AsKind(substr, kind1);
9536 if (!buf2)
9537 goto onError;
9538 }
9539
9540 switch (kind1) {
9541 case PyUnicode_1BYTE_KIND:
9542 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
9543 result = asciilib_count(
9544 ((Py_UCS1*)buf1) + start, end - start,
9545 buf2, len2, PY_SSIZE_T_MAX
9546 );
9547 else
9548 result = ucs1lib_count(
9549 ((Py_UCS1*)buf1) + start, end - start,
9550 buf2, len2, PY_SSIZE_T_MAX
9551 );
9552 break;
9553 case PyUnicode_2BYTE_KIND:
9554 result = ucs2lib_count(
9555 ((Py_UCS2*)buf1) + start, end - start,
9556 buf2, len2, PY_SSIZE_T_MAX
9557 );
9558 break;
9559 case PyUnicode_4BYTE_KIND:
9560 result = ucs4lib_count(
9561 ((Py_UCS4*)buf1) + start, end - start,
9562 buf2, len2, PY_SSIZE_T_MAX
9563 );
9564 break;
9565 default:
9566 Py_UNREACHABLE();
9567 }
9568
9569 if (kind2 != kind1)
9570 PyMem_Free(buf2);
9571
9572 return result;
9573 onError:
9574 if (kind2 != kind1 && buf2)
9575 PyMem_Free(buf2);
9576 return -1;
9577 }
9578
9579 Py_ssize_t
PyUnicode_Find(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9580 PyUnicode_Find(PyObject *str,
9581 PyObject *substr,
9582 Py_ssize_t start,
9583 Py_ssize_t end,
9584 int direction)
9585 {
9586 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9587 return -2;
9588
9589 return any_find_slice(str, substr, start, end, direction);
9590 }
9591
9592 Py_ssize_t
PyUnicode_FindChar(PyObject * str,Py_UCS4 ch,Py_ssize_t start,Py_ssize_t end,int direction)9593 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9594 Py_ssize_t start, Py_ssize_t end,
9595 int direction)
9596 {
9597 int kind;
9598 Py_ssize_t len, result;
9599 if (PyUnicode_READY(str) == -1)
9600 return -2;
9601 len = PyUnicode_GET_LENGTH(str);
9602 ADJUST_INDICES(start, end, len);
9603 if (end - start < 1)
9604 return -1;
9605 kind = PyUnicode_KIND(str);
9606 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9607 kind, end-start, ch, direction);
9608 if (result == -1)
9609 return -1;
9610 else
9611 return start + result;
9612 }
9613
9614 static int
tailmatch(PyObject * self,PyObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)9615 tailmatch(PyObject *self,
9616 PyObject *substring,
9617 Py_ssize_t start,
9618 Py_ssize_t end,
9619 int direction)
9620 {
9621 int kind_self;
9622 int kind_sub;
9623 void *data_self;
9624 void *data_sub;
9625 Py_ssize_t offset;
9626 Py_ssize_t i;
9627 Py_ssize_t end_sub;
9628
9629 if (PyUnicode_READY(self) == -1 ||
9630 PyUnicode_READY(substring) == -1)
9631 return -1;
9632
9633 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9634 end -= PyUnicode_GET_LENGTH(substring);
9635 if (end < start)
9636 return 0;
9637
9638 if (PyUnicode_GET_LENGTH(substring) == 0)
9639 return 1;
9640
9641 kind_self = PyUnicode_KIND(self);
9642 data_self = PyUnicode_DATA(self);
9643 kind_sub = PyUnicode_KIND(substring);
9644 data_sub = PyUnicode_DATA(substring);
9645 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9646
9647 if (direction > 0)
9648 offset = end;
9649 else
9650 offset = start;
9651
9652 if (PyUnicode_READ(kind_self, data_self, offset) ==
9653 PyUnicode_READ(kind_sub, data_sub, 0) &&
9654 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9655 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9656 /* If both are of the same kind, memcmp is sufficient */
9657 if (kind_self == kind_sub) {
9658 return ! memcmp((char *)data_self +
9659 (offset * PyUnicode_KIND(substring)),
9660 data_sub,
9661 PyUnicode_GET_LENGTH(substring) *
9662 PyUnicode_KIND(substring));
9663 }
9664 /* otherwise we have to compare each character by first accessing it */
9665 else {
9666 /* We do not need to compare 0 and len(substring)-1 because
9667 the if statement above ensured already that they are equal
9668 when we end up here. */
9669 for (i = 1; i < end_sub; ++i) {
9670 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9671 PyUnicode_READ(kind_sub, data_sub, i))
9672 return 0;
9673 }
9674 return 1;
9675 }
9676 }
9677
9678 return 0;
9679 }
9680
9681 Py_ssize_t
PyUnicode_Tailmatch(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9682 PyUnicode_Tailmatch(PyObject *str,
9683 PyObject *substr,
9684 Py_ssize_t start,
9685 Py_ssize_t end,
9686 int direction)
9687 {
9688 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9689 return -1;
9690
9691 return tailmatch(str, substr, start, end, direction);
9692 }
9693
9694 static PyObject *
ascii_upper_or_lower(PyObject * self,int lower)9695 ascii_upper_or_lower(PyObject *self, int lower)
9696 {
9697 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9698 char *resdata, *data = PyUnicode_DATA(self);
9699 PyObject *res;
9700
9701 res = PyUnicode_New(len, 127);
9702 if (res == NULL)
9703 return NULL;
9704 resdata = PyUnicode_DATA(res);
9705 if (lower)
9706 _Py_bytes_lower(resdata, data, len);
9707 else
9708 _Py_bytes_upper(resdata, data, len);
9709 return res;
9710 }
9711
9712 static Py_UCS4
handle_capital_sigma(int kind,void * data,Py_ssize_t length,Py_ssize_t i)9713 handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
9714 {
9715 Py_ssize_t j;
9716 int final_sigma;
9717 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
9718 /* U+03A3 is in the Final_Sigma context when, it is found like this:
9719
9720 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9721
9722 where ! is a negation and \p{xxx} is a character with property xxx.
9723 */
9724 for (j = i - 1; j >= 0; j--) {
9725 c = PyUnicode_READ(kind, data, j);
9726 if (!_PyUnicode_IsCaseIgnorable(c))
9727 break;
9728 }
9729 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9730 if (final_sigma) {
9731 for (j = i + 1; j < length; j++) {
9732 c = PyUnicode_READ(kind, data, j);
9733 if (!_PyUnicode_IsCaseIgnorable(c))
9734 break;
9735 }
9736 final_sigma = j == length || !_PyUnicode_IsCased(c);
9737 }
9738 return (final_sigma) ? 0x3C2 : 0x3C3;
9739 }
9740
9741 static int
lower_ucs4(int kind,void * data,Py_ssize_t length,Py_ssize_t i,Py_UCS4 c,Py_UCS4 * mapped)9742 lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9743 Py_UCS4 c, Py_UCS4 *mapped)
9744 {
9745 /* Obscure special case. */
9746 if (c == 0x3A3) {
9747 mapped[0] = handle_capital_sigma(kind, data, length, i);
9748 return 1;
9749 }
9750 return _PyUnicode_ToLowerFull(c, mapped);
9751 }
9752
9753 static Py_ssize_t
do_capitalize(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9754 do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9755 {
9756 Py_ssize_t i, k = 0;
9757 int n_res, j;
9758 Py_UCS4 c, mapped[3];
9759
9760 c = PyUnicode_READ(kind, data, 0);
9761 n_res = _PyUnicode_ToTitleFull(c, mapped);
9762 for (j = 0; j < n_res; j++) {
9763 *maxchar = Py_MAX(*maxchar, mapped[j]);
9764 res[k++] = mapped[j];
9765 }
9766 for (i = 1; i < length; i++) {
9767 c = PyUnicode_READ(kind, data, i);
9768 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9769 for (j = 0; j < n_res; j++) {
9770 *maxchar = Py_MAX(*maxchar, mapped[j]);
9771 res[k++] = mapped[j];
9772 }
9773 }
9774 return k;
9775 }
9776
9777 static Py_ssize_t
do_swapcase(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9778 do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9779 Py_ssize_t i, k = 0;
9780
9781 for (i = 0; i < length; i++) {
9782 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9783 int n_res, j;
9784 if (Py_UNICODE_ISUPPER(c)) {
9785 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9786 }
9787 else if (Py_UNICODE_ISLOWER(c)) {
9788 n_res = _PyUnicode_ToUpperFull(c, mapped);
9789 }
9790 else {
9791 n_res = 1;
9792 mapped[0] = c;
9793 }
9794 for (j = 0; j < n_res; j++) {
9795 *maxchar = Py_MAX(*maxchar, mapped[j]);
9796 res[k++] = mapped[j];
9797 }
9798 }
9799 return k;
9800 }
9801
9802 static Py_ssize_t
do_upper_or_lower(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar,int lower)9803 do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9804 Py_UCS4 *maxchar, int lower)
9805 {
9806 Py_ssize_t i, k = 0;
9807
9808 for (i = 0; i < length; i++) {
9809 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9810 int n_res, j;
9811 if (lower)
9812 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9813 else
9814 n_res = _PyUnicode_ToUpperFull(c, mapped);
9815 for (j = 0; j < n_res; j++) {
9816 *maxchar = Py_MAX(*maxchar, mapped[j]);
9817 res[k++] = mapped[j];
9818 }
9819 }
9820 return k;
9821 }
9822
9823 static Py_ssize_t
do_upper(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9824 do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9825 {
9826 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9827 }
9828
9829 static Py_ssize_t
do_lower(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9830 do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9831 {
9832 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9833 }
9834
9835 static Py_ssize_t
do_casefold(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9836 do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9837 {
9838 Py_ssize_t i, k = 0;
9839
9840 for (i = 0; i < length; i++) {
9841 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9842 Py_UCS4 mapped[3];
9843 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9844 for (j = 0; j < n_res; j++) {
9845 *maxchar = Py_MAX(*maxchar, mapped[j]);
9846 res[k++] = mapped[j];
9847 }
9848 }
9849 return k;
9850 }
9851
9852 static Py_ssize_t
do_title(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9853 do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9854 {
9855 Py_ssize_t i, k = 0;
9856 int previous_is_cased;
9857
9858 previous_is_cased = 0;
9859 for (i = 0; i < length; i++) {
9860 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9861 Py_UCS4 mapped[3];
9862 int n_res, j;
9863
9864 if (previous_is_cased)
9865 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9866 else
9867 n_res = _PyUnicode_ToTitleFull(c, mapped);
9868
9869 for (j = 0; j < n_res; j++) {
9870 *maxchar = Py_MAX(*maxchar, mapped[j]);
9871 res[k++] = mapped[j];
9872 }
9873
9874 previous_is_cased = _PyUnicode_IsCased(c);
9875 }
9876 return k;
9877 }
9878
9879 static PyObject *
case_operation(PyObject * self,Py_ssize_t (* perform)(int,void *,Py_ssize_t,Py_UCS4 *,Py_UCS4 *))9880 case_operation(PyObject *self,
9881 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9882 {
9883 PyObject *res = NULL;
9884 Py_ssize_t length, newlength = 0;
9885 int kind, outkind;
9886 void *data, *outdata;
9887 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9888
9889 assert(PyUnicode_IS_READY(self));
9890
9891 kind = PyUnicode_KIND(self);
9892 data = PyUnicode_DATA(self);
9893 length = PyUnicode_GET_LENGTH(self);
9894 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9895 PyErr_SetString(PyExc_OverflowError, "string is too long");
9896 return NULL;
9897 }
9898 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9899 if (tmp == NULL)
9900 return PyErr_NoMemory();
9901 newlength = perform(kind, data, length, tmp, &maxchar);
9902 res = PyUnicode_New(newlength, maxchar);
9903 if (res == NULL)
9904 goto leave;
9905 tmpend = tmp + newlength;
9906 outdata = PyUnicode_DATA(res);
9907 outkind = PyUnicode_KIND(res);
9908 switch (outkind) {
9909 case PyUnicode_1BYTE_KIND:
9910 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9911 break;
9912 case PyUnicode_2BYTE_KIND:
9913 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9914 break;
9915 case PyUnicode_4BYTE_KIND:
9916 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9917 break;
9918 default:
9919 Py_UNREACHABLE();
9920 }
9921 leave:
9922 PyMem_FREE(tmp);
9923 return res;
9924 }
9925
9926 PyObject *
PyUnicode_Join(PyObject * separator,PyObject * seq)9927 PyUnicode_Join(PyObject *separator, PyObject *seq)
9928 {
9929 PyObject *res;
9930 PyObject *fseq;
9931 Py_ssize_t seqlen;
9932 PyObject **items;
9933
9934 fseq = PySequence_Fast(seq, "can only join an iterable");
9935 if (fseq == NULL) {
9936 return NULL;
9937 }
9938
9939 /* NOTE: the following code can't call back into Python code,
9940 * so we are sure that fseq won't be mutated.
9941 */
9942
9943 items = PySequence_Fast_ITEMS(fseq);
9944 seqlen = PySequence_Fast_GET_SIZE(fseq);
9945 res = _PyUnicode_JoinArray(separator, items, seqlen);
9946 Py_DECREF(fseq);
9947 return res;
9948 }
9949
9950 PyObject *
_PyUnicode_JoinArray(PyObject * separator,PyObject * const * items,Py_ssize_t seqlen)9951 _PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
9952 {
9953 PyObject *res = NULL; /* the result */
9954 PyObject *sep = NULL;
9955 Py_ssize_t seplen;
9956 PyObject *item;
9957 Py_ssize_t sz, i, res_offset;
9958 Py_UCS4 maxchar;
9959 Py_UCS4 item_maxchar;
9960 int use_memcpy;
9961 unsigned char *res_data = NULL, *sep_data = NULL;
9962 PyObject *last_obj;
9963 unsigned int kind = 0;
9964
9965 /* If empty sequence, return u"". */
9966 if (seqlen == 0) {
9967 _Py_RETURN_UNICODE_EMPTY();
9968 }
9969
9970 /* If singleton sequence with an exact Unicode, return that. */
9971 last_obj = NULL;
9972 if (seqlen == 1) {
9973 if (PyUnicode_CheckExact(items[0])) {
9974 res = items[0];
9975 Py_INCREF(res);
9976 return res;
9977 }
9978 seplen = 0;
9979 maxchar = 0;
9980 }
9981 else {
9982 /* Set up sep and seplen */
9983 if (separator == NULL) {
9984 /* fall back to a blank space separator */
9985 sep = PyUnicode_FromOrdinal(' ');
9986 if (!sep)
9987 goto onError;
9988 seplen = 1;
9989 maxchar = 32;
9990 }
9991 else {
9992 if (!PyUnicode_Check(separator)) {
9993 PyErr_Format(PyExc_TypeError,
9994 "separator: expected str instance,"
9995 " %.80s found",
9996 Py_TYPE(separator)->tp_name);
9997 goto onError;
9998 }
9999 if (PyUnicode_READY(separator))
10000 goto onError;
10001 sep = separator;
10002 seplen = PyUnicode_GET_LENGTH(separator);
10003 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10004 /* inc refcount to keep this code path symmetric with the
10005 above case of a blank separator */
10006 Py_INCREF(sep);
10007 }
10008 last_obj = sep;
10009 }
10010
10011 /* There are at least two things to join, or else we have a subclass
10012 * of str in the sequence.
10013 * Do a pre-pass to figure out the total amount of space we'll
10014 * need (sz), and see whether all argument are strings.
10015 */
10016 sz = 0;
10017 #ifdef Py_DEBUG
10018 use_memcpy = 0;
10019 #else
10020 use_memcpy = 1;
10021 #endif
10022 for (i = 0; i < seqlen; i++) {
10023 size_t add_sz;
10024 item = items[i];
10025 if (!PyUnicode_Check(item)) {
10026 PyErr_Format(PyExc_TypeError,
10027 "sequence item %zd: expected str instance,"
10028 " %.80s found",
10029 i, Py_TYPE(item)->tp_name);
10030 goto onError;
10031 }
10032 if (PyUnicode_READY(item) == -1)
10033 goto onError;
10034 add_sz = PyUnicode_GET_LENGTH(item);
10035 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10036 maxchar = Py_MAX(maxchar, item_maxchar);
10037 if (i != 0) {
10038 add_sz += seplen;
10039 }
10040 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10041 PyErr_SetString(PyExc_OverflowError,
10042 "join() result is too long for a Python string");
10043 goto onError;
10044 }
10045 sz += add_sz;
10046 if (use_memcpy && last_obj != NULL) {
10047 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10048 use_memcpy = 0;
10049 }
10050 last_obj = item;
10051 }
10052
10053 res = PyUnicode_New(sz, maxchar);
10054 if (res == NULL)
10055 goto onError;
10056
10057 /* Catenate everything. */
10058 #ifdef Py_DEBUG
10059 use_memcpy = 0;
10060 #else
10061 if (use_memcpy) {
10062 res_data = PyUnicode_1BYTE_DATA(res);
10063 kind = PyUnicode_KIND(res);
10064 if (seplen != 0)
10065 sep_data = PyUnicode_1BYTE_DATA(sep);
10066 }
10067 #endif
10068 if (use_memcpy) {
10069 for (i = 0; i < seqlen; ++i) {
10070 Py_ssize_t itemlen;
10071 item = items[i];
10072
10073 /* Copy item, and maybe the separator. */
10074 if (i && seplen != 0) {
10075 memcpy(res_data,
10076 sep_data,
10077 kind * seplen);
10078 res_data += kind * seplen;
10079 }
10080
10081 itemlen = PyUnicode_GET_LENGTH(item);
10082 if (itemlen != 0) {
10083 memcpy(res_data,
10084 PyUnicode_DATA(item),
10085 kind * itemlen);
10086 res_data += kind * itemlen;
10087 }
10088 }
10089 assert(res_data == PyUnicode_1BYTE_DATA(res)
10090 + kind * PyUnicode_GET_LENGTH(res));
10091 }
10092 else {
10093 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10094 Py_ssize_t itemlen;
10095 item = items[i];
10096
10097 /* Copy item, and maybe the separator. */
10098 if (i && seplen != 0) {
10099 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10100 res_offset += seplen;
10101 }
10102
10103 itemlen = PyUnicode_GET_LENGTH(item);
10104 if (itemlen != 0) {
10105 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10106 res_offset += itemlen;
10107 }
10108 }
10109 assert(res_offset == PyUnicode_GET_LENGTH(res));
10110 }
10111
10112 Py_XDECREF(sep);
10113 assert(_PyUnicode_CheckConsistency(res, 1));
10114 return res;
10115
10116 onError:
10117 Py_XDECREF(sep);
10118 Py_XDECREF(res);
10119 return NULL;
10120 }
10121
10122 void
_PyUnicode_FastFill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10123 _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10124 Py_UCS4 fill_char)
10125 {
10126 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10127 void *data = PyUnicode_DATA(unicode);
10128 assert(PyUnicode_IS_READY(unicode));
10129 assert(unicode_modifiable(unicode));
10130 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10131 assert(start >= 0);
10132 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10133 unicode_fill(kind, data, fill_char, start, length);
10134 }
10135
10136 Py_ssize_t
PyUnicode_Fill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10137 PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10138 Py_UCS4 fill_char)
10139 {
10140 Py_ssize_t maxlen;
10141
10142 if (!PyUnicode_Check(unicode)) {
10143 PyErr_BadInternalCall();
10144 return -1;
10145 }
10146 if (PyUnicode_READY(unicode) == -1)
10147 return -1;
10148 if (unicode_check_modifiable(unicode))
10149 return -1;
10150
10151 if (start < 0) {
10152 PyErr_SetString(PyExc_IndexError, "string index out of range");
10153 return -1;
10154 }
10155 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10156 PyErr_SetString(PyExc_ValueError,
10157 "fill character is bigger than "
10158 "the string maximum character");
10159 return -1;
10160 }
10161
10162 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10163 length = Py_MIN(maxlen, length);
10164 if (length <= 0)
10165 return 0;
10166
10167 _PyUnicode_FastFill(unicode, start, length, fill_char);
10168 return length;
10169 }
10170
10171 static PyObject *
pad(PyObject * self,Py_ssize_t left,Py_ssize_t right,Py_UCS4 fill)10172 pad(PyObject *self,
10173 Py_ssize_t left,
10174 Py_ssize_t right,
10175 Py_UCS4 fill)
10176 {
10177 PyObject *u;
10178 Py_UCS4 maxchar;
10179 int kind;
10180 void *data;
10181
10182 if (left < 0)
10183 left = 0;
10184 if (right < 0)
10185 right = 0;
10186
10187 if (left == 0 && right == 0)
10188 return unicode_result_unchanged(self);
10189
10190 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10191 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10192 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10193 return NULL;
10194 }
10195 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10196 maxchar = Py_MAX(maxchar, fill);
10197 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10198 if (!u)
10199 return NULL;
10200
10201 kind = PyUnicode_KIND(u);
10202 data = PyUnicode_DATA(u);
10203 if (left)
10204 unicode_fill(kind, data, fill, 0, left);
10205 if (right)
10206 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10207 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10208 assert(_PyUnicode_CheckConsistency(u, 1));
10209 return u;
10210 }
10211
10212 PyObject *
PyUnicode_Splitlines(PyObject * string,int keepends)10213 PyUnicode_Splitlines(PyObject *string, int keepends)
10214 {
10215 PyObject *list;
10216
10217 if (ensure_unicode(string) < 0)
10218 return NULL;
10219
10220 switch (PyUnicode_KIND(string)) {
10221 case PyUnicode_1BYTE_KIND:
10222 if (PyUnicode_IS_ASCII(string))
10223 list = asciilib_splitlines(
10224 string, PyUnicode_1BYTE_DATA(string),
10225 PyUnicode_GET_LENGTH(string), keepends);
10226 else
10227 list = ucs1lib_splitlines(
10228 string, PyUnicode_1BYTE_DATA(string),
10229 PyUnicode_GET_LENGTH(string), keepends);
10230 break;
10231 case PyUnicode_2BYTE_KIND:
10232 list = ucs2lib_splitlines(
10233 string, PyUnicode_2BYTE_DATA(string),
10234 PyUnicode_GET_LENGTH(string), keepends);
10235 break;
10236 case PyUnicode_4BYTE_KIND:
10237 list = ucs4lib_splitlines(
10238 string, PyUnicode_4BYTE_DATA(string),
10239 PyUnicode_GET_LENGTH(string), keepends);
10240 break;
10241 default:
10242 Py_UNREACHABLE();
10243 }
10244 return list;
10245 }
10246
10247 static PyObject *
split(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10248 split(PyObject *self,
10249 PyObject *substring,
10250 Py_ssize_t maxcount)
10251 {
10252 int kind1, kind2;
10253 void *buf1, *buf2;
10254 Py_ssize_t len1, len2;
10255 PyObject* out;
10256
10257 if (maxcount < 0)
10258 maxcount = PY_SSIZE_T_MAX;
10259
10260 if (PyUnicode_READY(self) == -1)
10261 return NULL;
10262
10263 if (substring == NULL)
10264 switch (PyUnicode_KIND(self)) {
10265 case PyUnicode_1BYTE_KIND:
10266 if (PyUnicode_IS_ASCII(self))
10267 return asciilib_split_whitespace(
10268 self, PyUnicode_1BYTE_DATA(self),
10269 PyUnicode_GET_LENGTH(self), maxcount
10270 );
10271 else
10272 return ucs1lib_split_whitespace(
10273 self, PyUnicode_1BYTE_DATA(self),
10274 PyUnicode_GET_LENGTH(self), maxcount
10275 );
10276 case PyUnicode_2BYTE_KIND:
10277 return ucs2lib_split_whitespace(
10278 self, PyUnicode_2BYTE_DATA(self),
10279 PyUnicode_GET_LENGTH(self), maxcount
10280 );
10281 case PyUnicode_4BYTE_KIND:
10282 return ucs4lib_split_whitespace(
10283 self, PyUnicode_4BYTE_DATA(self),
10284 PyUnicode_GET_LENGTH(self), maxcount
10285 );
10286 default:
10287 Py_UNREACHABLE();
10288 }
10289
10290 if (PyUnicode_READY(substring) == -1)
10291 return NULL;
10292
10293 kind1 = PyUnicode_KIND(self);
10294 kind2 = PyUnicode_KIND(substring);
10295 len1 = PyUnicode_GET_LENGTH(self);
10296 len2 = PyUnicode_GET_LENGTH(substring);
10297 if (kind1 < kind2 || len1 < len2) {
10298 out = PyList_New(1);
10299 if (out == NULL)
10300 return NULL;
10301 Py_INCREF(self);
10302 PyList_SET_ITEM(out, 0, self);
10303 return out;
10304 }
10305 buf1 = PyUnicode_DATA(self);
10306 buf2 = PyUnicode_DATA(substring);
10307 if (kind2 != kind1) {
10308 buf2 = _PyUnicode_AsKind(substring, kind1);
10309 if (!buf2)
10310 return NULL;
10311 }
10312
10313 switch (kind1) {
10314 case PyUnicode_1BYTE_KIND:
10315 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10316 out = asciilib_split(
10317 self, buf1, len1, buf2, len2, maxcount);
10318 else
10319 out = ucs1lib_split(
10320 self, buf1, len1, buf2, len2, maxcount);
10321 break;
10322 case PyUnicode_2BYTE_KIND:
10323 out = ucs2lib_split(
10324 self, buf1, len1, buf2, len2, maxcount);
10325 break;
10326 case PyUnicode_4BYTE_KIND:
10327 out = ucs4lib_split(
10328 self, buf1, len1, buf2, len2, maxcount);
10329 break;
10330 default:
10331 out = NULL;
10332 }
10333 if (kind2 != kind1)
10334 PyMem_Free(buf2);
10335 return out;
10336 }
10337
10338 static PyObject *
rsplit(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10339 rsplit(PyObject *self,
10340 PyObject *substring,
10341 Py_ssize_t maxcount)
10342 {
10343 int kind1, kind2;
10344 void *buf1, *buf2;
10345 Py_ssize_t len1, len2;
10346 PyObject* out;
10347
10348 if (maxcount < 0)
10349 maxcount = PY_SSIZE_T_MAX;
10350
10351 if (PyUnicode_READY(self) == -1)
10352 return NULL;
10353
10354 if (substring == NULL)
10355 switch (PyUnicode_KIND(self)) {
10356 case PyUnicode_1BYTE_KIND:
10357 if (PyUnicode_IS_ASCII(self))
10358 return asciilib_rsplit_whitespace(
10359 self, PyUnicode_1BYTE_DATA(self),
10360 PyUnicode_GET_LENGTH(self), maxcount
10361 );
10362 else
10363 return ucs1lib_rsplit_whitespace(
10364 self, PyUnicode_1BYTE_DATA(self),
10365 PyUnicode_GET_LENGTH(self), maxcount
10366 );
10367 case PyUnicode_2BYTE_KIND:
10368 return ucs2lib_rsplit_whitespace(
10369 self, PyUnicode_2BYTE_DATA(self),
10370 PyUnicode_GET_LENGTH(self), maxcount
10371 );
10372 case PyUnicode_4BYTE_KIND:
10373 return ucs4lib_rsplit_whitespace(
10374 self, PyUnicode_4BYTE_DATA(self),
10375 PyUnicode_GET_LENGTH(self), maxcount
10376 );
10377 default:
10378 Py_UNREACHABLE();
10379 }
10380
10381 if (PyUnicode_READY(substring) == -1)
10382 return NULL;
10383
10384 kind1 = PyUnicode_KIND(self);
10385 kind2 = PyUnicode_KIND(substring);
10386 len1 = PyUnicode_GET_LENGTH(self);
10387 len2 = PyUnicode_GET_LENGTH(substring);
10388 if (kind1 < kind2 || len1 < len2) {
10389 out = PyList_New(1);
10390 if (out == NULL)
10391 return NULL;
10392 Py_INCREF(self);
10393 PyList_SET_ITEM(out, 0, self);
10394 return out;
10395 }
10396 buf1 = PyUnicode_DATA(self);
10397 buf2 = PyUnicode_DATA(substring);
10398 if (kind2 != kind1) {
10399 buf2 = _PyUnicode_AsKind(substring, kind1);
10400 if (!buf2)
10401 return NULL;
10402 }
10403
10404 switch (kind1) {
10405 case PyUnicode_1BYTE_KIND:
10406 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10407 out = asciilib_rsplit(
10408 self, buf1, len1, buf2, len2, maxcount);
10409 else
10410 out = ucs1lib_rsplit(
10411 self, buf1, len1, buf2, len2, maxcount);
10412 break;
10413 case PyUnicode_2BYTE_KIND:
10414 out = ucs2lib_rsplit(
10415 self, buf1, len1, buf2, len2, maxcount);
10416 break;
10417 case PyUnicode_4BYTE_KIND:
10418 out = ucs4lib_rsplit(
10419 self, buf1, len1, buf2, len2, maxcount);
10420 break;
10421 default:
10422 out = NULL;
10423 }
10424 if (kind2 != kind1)
10425 PyMem_Free(buf2);
10426 return out;
10427 }
10428
10429 static Py_ssize_t
anylib_find(int kind,PyObject * str1,void * buf1,Py_ssize_t len1,PyObject * str2,void * buf2,Py_ssize_t len2,Py_ssize_t offset)10430 anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10431 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10432 {
10433 switch (kind) {
10434 case PyUnicode_1BYTE_KIND:
10435 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10436 return asciilib_find(buf1, len1, buf2, len2, offset);
10437 else
10438 return ucs1lib_find(buf1, len1, buf2, len2, offset);
10439 case PyUnicode_2BYTE_KIND:
10440 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10441 case PyUnicode_4BYTE_KIND:
10442 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10443 }
10444 Py_UNREACHABLE();
10445 }
10446
10447 static Py_ssize_t
anylib_count(int kind,PyObject * sstr,void * sbuf,Py_ssize_t slen,PyObject * str1,void * buf1,Py_ssize_t len1,Py_ssize_t maxcount)10448 anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10449 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10450 {
10451 switch (kind) {
10452 case PyUnicode_1BYTE_KIND:
10453 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10454 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10455 else
10456 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10457 case PyUnicode_2BYTE_KIND:
10458 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10459 case PyUnicode_4BYTE_KIND:
10460 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10461 }
10462 Py_UNREACHABLE();
10463 }
10464
10465 static void
replace_1char_inplace(PyObject * u,Py_ssize_t pos,Py_UCS4 u1,Py_UCS4 u2,Py_ssize_t maxcount)10466 replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10467 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10468 {
10469 int kind = PyUnicode_KIND(u);
10470 void *data = PyUnicode_DATA(u);
10471 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10472 if (kind == PyUnicode_1BYTE_KIND) {
10473 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10474 (Py_UCS1 *)data + len,
10475 u1, u2, maxcount);
10476 }
10477 else if (kind == PyUnicode_2BYTE_KIND) {
10478 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10479 (Py_UCS2 *)data + len,
10480 u1, u2, maxcount);
10481 }
10482 else {
10483 assert(kind == PyUnicode_4BYTE_KIND);
10484 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10485 (Py_UCS4 *)data + len,
10486 u1, u2, maxcount);
10487 }
10488 }
10489
10490 static PyObject *
replace(PyObject * self,PyObject * str1,PyObject * str2,Py_ssize_t maxcount)10491 replace(PyObject *self, PyObject *str1,
10492 PyObject *str2, Py_ssize_t maxcount)
10493 {
10494 PyObject *u;
10495 char *sbuf = PyUnicode_DATA(self);
10496 char *buf1 = PyUnicode_DATA(str1);
10497 char *buf2 = PyUnicode_DATA(str2);
10498 int srelease = 0, release1 = 0, release2 = 0;
10499 int skind = PyUnicode_KIND(self);
10500 int kind1 = PyUnicode_KIND(str1);
10501 int kind2 = PyUnicode_KIND(str2);
10502 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10503 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10504 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10505 int mayshrink;
10506 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10507
10508 if (maxcount < 0)
10509 maxcount = PY_SSIZE_T_MAX;
10510 else if (maxcount == 0 || slen == 0)
10511 goto nothing;
10512
10513 if (str1 == str2)
10514 goto nothing;
10515
10516 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10517 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10518 if (maxchar < maxchar_str1)
10519 /* substring too wide to be present */
10520 goto nothing;
10521 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10522 /* Replacing str1 with str2 may cause a maxchar reduction in the
10523 result string. */
10524 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10525 maxchar = Py_MAX(maxchar, maxchar_str2);
10526
10527 if (len1 == len2) {
10528 /* same length */
10529 if (len1 == 0)
10530 goto nothing;
10531 if (len1 == 1) {
10532 /* replace characters */
10533 Py_UCS4 u1, u2;
10534 Py_ssize_t pos;
10535
10536 u1 = PyUnicode_READ(kind1, buf1, 0);
10537 pos = findchar(sbuf, skind, slen, u1, 1);
10538 if (pos < 0)
10539 goto nothing;
10540 u2 = PyUnicode_READ(kind2, buf2, 0);
10541 u = PyUnicode_New(slen, maxchar);
10542 if (!u)
10543 goto error;
10544
10545 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10546 replace_1char_inplace(u, pos, u1, u2, maxcount);
10547 }
10548 else {
10549 int rkind = skind;
10550 char *res;
10551 Py_ssize_t i;
10552
10553 if (kind1 < rkind) {
10554 /* widen substring */
10555 buf1 = _PyUnicode_AsKind(str1, rkind);
10556 if (!buf1) goto error;
10557 release1 = 1;
10558 }
10559 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10560 if (i < 0)
10561 goto nothing;
10562 if (rkind > kind2) {
10563 /* widen replacement */
10564 buf2 = _PyUnicode_AsKind(str2, rkind);
10565 if (!buf2) goto error;
10566 release2 = 1;
10567 }
10568 else if (rkind < kind2) {
10569 /* widen self and buf1 */
10570 rkind = kind2;
10571 if (release1) PyMem_Free(buf1);
10572 release1 = 0;
10573 sbuf = _PyUnicode_AsKind(self, rkind);
10574 if (!sbuf) goto error;
10575 srelease = 1;
10576 buf1 = _PyUnicode_AsKind(str1, rkind);
10577 if (!buf1) goto error;
10578 release1 = 1;
10579 }
10580 u = PyUnicode_New(slen, maxchar);
10581 if (!u)
10582 goto error;
10583 assert(PyUnicode_KIND(u) == rkind);
10584 res = PyUnicode_DATA(u);
10585
10586 memcpy(res, sbuf, rkind * slen);
10587 /* change everything in-place, starting with this one */
10588 memcpy(res + rkind * i,
10589 buf2,
10590 rkind * len2);
10591 i += len1;
10592
10593 while ( --maxcount > 0) {
10594 i = anylib_find(rkind, self,
10595 sbuf+rkind*i, slen-i,
10596 str1, buf1, len1, i);
10597 if (i == -1)
10598 break;
10599 memcpy(res + rkind * i,
10600 buf2,
10601 rkind * len2);
10602 i += len1;
10603 }
10604 }
10605 }
10606 else {
10607 Py_ssize_t n, i, j, ires;
10608 Py_ssize_t new_size;
10609 int rkind = skind;
10610 char *res;
10611
10612 if (kind1 < rkind) {
10613 /* widen substring */
10614 buf1 = _PyUnicode_AsKind(str1, rkind);
10615 if (!buf1) goto error;
10616 release1 = 1;
10617 }
10618 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10619 if (n == 0)
10620 goto nothing;
10621 if (kind2 < rkind) {
10622 /* widen replacement */
10623 buf2 = _PyUnicode_AsKind(str2, rkind);
10624 if (!buf2) goto error;
10625 release2 = 1;
10626 }
10627 else if (kind2 > rkind) {
10628 /* widen self and buf1 */
10629 rkind = kind2;
10630 sbuf = _PyUnicode_AsKind(self, rkind);
10631 if (!sbuf) goto error;
10632 srelease = 1;
10633 if (release1) PyMem_Free(buf1);
10634 release1 = 0;
10635 buf1 = _PyUnicode_AsKind(str1, rkind);
10636 if (!buf1) goto error;
10637 release1 = 1;
10638 }
10639 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10640 PyUnicode_GET_LENGTH(str1))); */
10641 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10642 PyErr_SetString(PyExc_OverflowError,
10643 "replace string is too long");
10644 goto error;
10645 }
10646 new_size = slen + n * (len2 - len1);
10647 if (new_size == 0) {
10648 _Py_INCREF_UNICODE_EMPTY();
10649 if (!unicode_empty)
10650 goto error;
10651 u = unicode_empty;
10652 goto done;
10653 }
10654 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10655 PyErr_SetString(PyExc_OverflowError,
10656 "replace string is too long");
10657 goto error;
10658 }
10659 u = PyUnicode_New(new_size, maxchar);
10660 if (!u)
10661 goto error;
10662 assert(PyUnicode_KIND(u) == rkind);
10663 res = PyUnicode_DATA(u);
10664 ires = i = 0;
10665 if (len1 > 0) {
10666 while (n-- > 0) {
10667 /* look for next match */
10668 j = anylib_find(rkind, self,
10669 sbuf + rkind * i, slen-i,
10670 str1, buf1, len1, i);
10671 if (j == -1)
10672 break;
10673 else if (j > i) {
10674 /* copy unchanged part [i:j] */
10675 memcpy(res + rkind * ires,
10676 sbuf + rkind * i,
10677 rkind * (j-i));
10678 ires += j - i;
10679 }
10680 /* copy substitution string */
10681 if (len2 > 0) {
10682 memcpy(res + rkind * ires,
10683 buf2,
10684 rkind * len2);
10685 ires += len2;
10686 }
10687 i = j + len1;
10688 }
10689 if (i < slen)
10690 /* copy tail [i:] */
10691 memcpy(res + rkind * ires,
10692 sbuf + rkind * i,
10693 rkind * (slen-i));
10694 }
10695 else {
10696 /* interleave */
10697 while (n > 0) {
10698 memcpy(res + rkind * ires,
10699 buf2,
10700 rkind * len2);
10701 ires += len2;
10702 if (--n <= 0)
10703 break;
10704 memcpy(res + rkind * ires,
10705 sbuf + rkind * i,
10706 rkind);
10707 ires++;
10708 i++;
10709 }
10710 memcpy(res + rkind * ires,
10711 sbuf + rkind * i,
10712 rkind * (slen-i));
10713 }
10714 }
10715
10716 if (mayshrink) {
10717 unicode_adjust_maxchar(&u);
10718 if (u == NULL)
10719 goto error;
10720 }
10721
10722 done:
10723 if (srelease)
10724 PyMem_FREE(sbuf);
10725 if (release1)
10726 PyMem_FREE(buf1);
10727 if (release2)
10728 PyMem_FREE(buf2);
10729 assert(_PyUnicode_CheckConsistency(u, 1));
10730 return u;
10731
10732 nothing:
10733 /* nothing to replace; return original string (when possible) */
10734 if (srelease)
10735 PyMem_FREE(sbuf);
10736 if (release1)
10737 PyMem_FREE(buf1);
10738 if (release2)
10739 PyMem_FREE(buf2);
10740 return unicode_result_unchanged(self);
10741
10742 error:
10743 if (srelease && sbuf)
10744 PyMem_FREE(sbuf);
10745 if (release1 && buf1)
10746 PyMem_FREE(buf1);
10747 if (release2 && buf2)
10748 PyMem_FREE(buf2);
10749 return NULL;
10750 }
10751
10752 /* --- Unicode Object Methods --------------------------------------------- */
10753
10754 /*[clinic input]
10755 str.title as unicode_title
10756
10757 Return a version of the string where each word is titlecased.
10758
10759 More specifically, words start with uppercased characters and all remaining
10760 cased characters have lower case.
10761 [clinic start generated code]*/
10762
10763 static PyObject *
unicode_title_impl(PyObject * self)10764 unicode_title_impl(PyObject *self)
10765 /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
10766 {
10767 if (PyUnicode_READY(self) == -1)
10768 return NULL;
10769 return case_operation(self, do_title);
10770 }
10771
10772 /*[clinic input]
10773 str.capitalize as unicode_capitalize
10774
10775 Return a capitalized version of the string.
10776
10777 More specifically, make the first character have upper case and the rest lower
10778 case.
10779 [clinic start generated code]*/
10780
10781 static PyObject *
unicode_capitalize_impl(PyObject * self)10782 unicode_capitalize_impl(PyObject *self)
10783 /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
10784 {
10785 if (PyUnicode_READY(self) == -1)
10786 return NULL;
10787 if (PyUnicode_GET_LENGTH(self) == 0)
10788 return unicode_result_unchanged(self);
10789 return case_operation(self, do_capitalize);
10790 }
10791
10792 /*[clinic input]
10793 str.casefold as unicode_casefold
10794
10795 Return a version of the string suitable for caseless comparisons.
10796 [clinic start generated code]*/
10797
10798 static PyObject *
unicode_casefold_impl(PyObject * self)10799 unicode_casefold_impl(PyObject *self)
10800 /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
10801 {
10802 if (PyUnicode_READY(self) == -1)
10803 return NULL;
10804 if (PyUnicode_IS_ASCII(self))
10805 return ascii_upper_or_lower(self, 1);
10806 return case_operation(self, do_casefold);
10807 }
10808
10809
10810 /* Argument converter. Accepts a single Unicode character. */
10811
10812 static int
convert_uc(PyObject * obj,void * addr)10813 convert_uc(PyObject *obj, void *addr)
10814 {
10815 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10816
10817 if (!PyUnicode_Check(obj)) {
10818 PyErr_Format(PyExc_TypeError,
10819 "The fill character must be a unicode character, "
10820 "not %.100s", Py_TYPE(obj)->tp_name);
10821 return 0;
10822 }
10823 if (PyUnicode_READY(obj) < 0)
10824 return 0;
10825 if (PyUnicode_GET_LENGTH(obj) != 1) {
10826 PyErr_SetString(PyExc_TypeError,
10827 "The fill character must be exactly one character long");
10828 return 0;
10829 }
10830 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10831 return 1;
10832 }
10833
10834 /*[clinic input]
10835 str.center as unicode_center
10836
10837 width: Py_ssize_t
10838 fillchar: Py_UCS4 = ' '
10839 /
10840
10841 Return a centered string of length width.
10842
10843 Padding is done using the specified fill character (default is a space).
10844 [clinic start generated code]*/
10845
10846 static PyObject *
unicode_center_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)10847 unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10848 /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10849 {
10850 Py_ssize_t marg, left;
10851
10852 if (PyUnicode_READY(self) == -1)
10853 return NULL;
10854
10855 if (PyUnicode_GET_LENGTH(self) >= width)
10856 return unicode_result_unchanged(self);
10857
10858 marg = width - PyUnicode_GET_LENGTH(self);
10859 left = marg / 2 + (marg & width & 1);
10860
10861 return pad(self, left, marg - left, fillchar);
10862 }
10863
10864 /* This function assumes that str1 and str2 are readied by the caller. */
10865
10866 static int
unicode_compare(PyObject * str1,PyObject * str2)10867 unicode_compare(PyObject *str1, PyObject *str2)
10868 {
10869 #define COMPARE(TYPE1, TYPE2) \
10870 do { \
10871 TYPE1* p1 = (TYPE1 *)data1; \
10872 TYPE2* p2 = (TYPE2 *)data2; \
10873 TYPE1* end = p1 + len; \
10874 Py_UCS4 c1, c2; \
10875 for (; p1 != end; p1++, p2++) { \
10876 c1 = *p1; \
10877 c2 = *p2; \
10878 if (c1 != c2) \
10879 return (c1 < c2) ? -1 : 1; \
10880 } \
10881 } \
10882 while (0)
10883
10884 int kind1, kind2;
10885 void *data1, *data2;
10886 Py_ssize_t len1, len2, len;
10887
10888 kind1 = PyUnicode_KIND(str1);
10889 kind2 = PyUnicode_KIND(str2);
10890 data1 = PyUnicode_DATA(str1);
10891 data2 = PyUnicode_DATA(str2);
10892 len1 = PyUnicode_GET_LENGTH(str1);
10893 len2 = PyUnicode_GET_LENGTH(str2);
10894 len = Py_MIN(len1, len2);
10895
10896 switch(kind1) {
10897 case PyUnicode_1BYTE_KIND:
10898 {
10899 switch(kind2) {
10900 case PyUnicode_1BYTE_KIND:
10901 {
10902 int cmp = memcmp(data1, data2, len);
10903 /* normalize result of memcmp() into the range [-1; 1] */
10904 if (cmp < 0)
10905 return -1;
10906 if (cmp > 0)
10907 return 1;
10908 break;
10909 }
10910 case PyUnicode_2BYTE_KIND:
10911 COMPARE(Py_UCS1, Py_UCS2);
10912 break;
10913 case PyUnicode_4BYTE_KIND:
10914 COMPARE(Py_UCS1, Py_UCS4);
10915 break;
10916 default:
10917 Py_UNREACHABLE();
10918 }
10919 break;
10920 }
10921 case PyUnicode_2BYTE_KIND:
10922 {
10923 switch(kind2) {
10924 case PyUnicode_1BYTE_KIND:
10925 COMPARE(Py_UCS2, Py_UCS1);
10926 break;
10927 case PyUnicode_2BYTE_KIND:
10928 {
10929 COMPARE(Py_UCS2, Py_UCS2);
10930 break;
10931 }
10932 case PyUnicode_4BYTE_KIND:
10933 COMPARE(Py_UCS2, Py_UCS4);
10934 break;
10935 default:
10936 Py_UNREACHABLE();
10937 }
10938 break;
10939 }
10940 case PyUnicode_4BYTE_KIND:
10941 {
10942 switch(kind2) {
10943 case PyUnicode_1BYTE_KIND:
10944 COMPARE(Py_UCS4, Py_UCS1);
10945 break;
10946 case PyUnicode_2BYTE_KIND:
10947 COMPARE(Py_UCS4, Py_UCS2);
10948 break;
10949 case PyUnicode_4BYTE_KIND:
10950 {
10951 #if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10952 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10953 /* normalize result of wmemcmp() into the range [-1; 1] */
10954 if (cmp < 0)
10955 return -1;
10956 if (cmp > 0)
10957 return 1;
10958 #else
10959 COMPARE(Py_UCS4, Py_UCS4);
10960 #endif
10961 break;
10962 }
10963 default:
10964 Py_UNREACHABLE();
10965 }
10966 break;
10967 }
10968 default:
10969 Py_UNREACHABLE();
10970 }
10971
10972 if (len1 == len2)
10973 return 0;
10974 if (len1 < len2)
10975 return -1;
10976 else
10977 return 1;
10978
10979 #undef COMPARE
10980 }
10981
10982 static int
unicode_compare_eq(PyObject * str1,PyObject * str2)10983 unicode_compare_eq(PyObject *str1, PyObject *str2)
10984 {
10985 int kind;
10986 void *data1, *data2;
10987 Py_ssize_t len;
10988 int cmp;
10989
10990 len = PyUnicode_GET_LENGTH(str1);
10991 if (PyUnicode_GET_LENGTH(str2) != len)
10992 return 0;
10993 kind = PyUnicode_KIND(str1);
10994 if (PyUnicode_KIND(str2) != kind)
10995 return 0;
10996 data1 = PyUnicode_DATA(str1);
10997 data2 = PyUnicode_DATA(str2);
10998
10999 cmp = memcmp(data1, data2, len * kind);
11000 return (cmp == 0);
11001 }
11002
11003
11004 int
PyUnicode_Compare(PyObject * left,PyObject * right)11005 PyUnicode_Compare(PyObject *left, PyObject *right)
11006 {
11007 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11008 if (PyUnicode_READY(left) == -1 ||
11009 PyUnicode_READY(right) == -1)
11010 return -1;
11011
11012 /* a string is equal to itself */
11013 if (left == right)
11014 return 0;
11015
11016 return unicode_compare(left, right);
11017 }
11018 PyErr_Format(PyExc_TypeError,
11019 "Can't compare %.100s and %.100s",
11020 left->ob_type->tp_name,
11021 right->ob_type->tp_name);
11022 return -1;
11023 }
11024
11025 int
PyUnicode_CompareWithASCIIString(PyObject * uni,const char * str)11026 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11027 {
11028 Py_ssize_t i;
11029 int kind;
11030 Py_UCS4 chr;
11031 const unsigned char *ustr = (const unsigned char *)str;
11032
11033 assert(_PyUnicode_CHECK(uni));
11034 if (!PyUnicode_IS_READY(uni)) {
11035 const wchar_t *ws = _PyUnicode_WSTR(uni);
11036 /* Compare Unicode string and source character set string */
11037 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11038 if (chr != ustr[i])
11039 return (chr < ustr[i]) ? -1 : 1;
11040 }
11041 /* This check keeps Python strings that end in '\0' from comparing equal
11042 to C strings identical up to that point. */
11043 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11044 return 1; /* uni is longer */
11045 if (ustr[i])
11046 return -1; /* str is longer */
11047 return 0;
11048 }
11049 kind = PyUnicode_KIND(uni);
11050 if (kind == PyUnicode_1BYTE_KIND) {
11051 const void *data = PyUnicode_1BYTE_DATA(uni);
11052 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11053 size_t len, len2 = strlen(str);
11054 int cmp;
11055
11056 len = Py_MIN(len1, len2);
11057 cmp = memcmp(data, str, len);
11058 if (cmp != 0) {
11059 if (cmp < 0)
11060 return -1;
11061 else
11062 return 1;
11063 }
11064 if (len1 > len2)
11065 return 1; /* uni is longer */
11066 if (len1 < len2)
11067 return -1; /* str is longer */
11068 return 0;
11069 }
11070 else {
11071 void *data = PyUnicode_DATA(uni);
11072 /* Compare Unicode string and source character set string */
11073 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11074 if (chr != (unsigned char)str[i])
11075 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11076 /* This check keeps Python strings that end in '\0' from comparing equal
11077 to C strings identical up to that point. */
11078 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11079 return 1; /* uni is longer */
11080 if (str[i])
11081 return -1; /* str is longer */
11082 return 0;
11083 }
11084 }
11085
11086 static int
non_ready_unicode_equal_to_ascii_string(PyObject * unicode,const char * str)11087 non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11088 {
11089 size_t i, len;
11090 const wchar_t *p;
11091 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11092 if (strlen(str) != len)
11093 return 0;
11094 p = _PyUnicode_WSTR(unicode);
11095 assert(p);
11096 for (i = 0; i < len; i++) {
11097 unsigned char c = (unsigned char)str[i];
11098 if (c >= 128 || p[i] != (wchar_t)c)
11099 return 0;
11100 }
11101 return 1;
11102 }
11103
11104 int
_PyUnicode_EqualToASCIIString(PyObject * unicode,const char * str)11105 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11106 {
11107 size_t len;
11108 assert(_PyUnicode_CHECK(unicode));
11109 assert(str);
11110 #ifndef NDEBUG
11111 for (const char *p = str; *p; p++) {
11112 assert((unsigned char)*p < 128);
11113 }
11114 #endif
11115 if (PyUnicode_READY(unicode) == -1) {
11116 /* Memory error or bad data */
11117 PyErr_Clear();
11118 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11119 }
11120 if (!PyUnicode_IS_ASCII(unicode))
11121 return 0;
11122 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11123 return strlen(str) == len &&
11124 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11125 }
11126
11127 int
_PyUnicode_EqualToASCIIId(PyObject * left,_Py_Identifier * right)11128 _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11129 {
11130 PyObject *right_uni;
11131 Py_hash_t hash;
11132
11133 assert(_PyUnicode_CHECK(left));
11134 assert(right->string);
11135 #ifndef NDEBUG
11136 for (const char *p = right->string; *p; p++) {
11137 assert((unsigned char)*p < 128);
11138 }
11139 #endif
11140
11141 if (PyUnicode_READY(left) == -1) {
11142 /* memory error or bad data */
11143 PyErr_Clear();
11144 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11145 }
11146
11147 if (!PyUnicode_IS_ASCII(left))
11148 return 0;
11149
11150 right_uni = _PyUnicode_FromId(right); /* borrowed */
11151 if (right_uni == NULL) {
11152 /* memory error or bad data */
11153 PyErr_Clear();
11154 return _PyUnicode_EqualToASCIIString(left, right->string);
11155 }
11156
11157 if (left == right_uni)
11158 return 1;
11159
11160 if (PyUnicode_CHECK_INTERNED(left))
11161 return 0;
11162
11163 assert(_PyUnicode_HASH(right_uni) != -1);
11164 hash = _PyUnicode_HASH(left);
11165 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11166 return 0;
11167
11168 return unicode_compare_eq(left, right_uni);
11169 }
11170
11171 PyObject *
PyUnicode_RichCompare(PyObject * left,PyObject * right,int op)11172 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11173 {
11174 int result;
11175
11176 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11177 Py_RETURN_NOTIMPLEMENTED;
11178
11179 if (PyUnicode_READY(left) == -1 ||
11180 PyUnicode_READY(right) == -1)
11181 return NULL;
11182
11183 if (left == right) {
11184 switch (op) {
11185 case Py_EQ:
11186 case Py_LE:
11187 case Py_GE:
11188 /* a string is equal to itself */
11189 Py_RETURN_TRUE;
11190 case Py_NE:
11191 case Py_LT:
11192 case Py_GT:
11193 Py_RETURN_FALSE;
11194 default:
11195 PyErr_BadArgument();
11196 return NULL;
11197 }
11198 }
11199 else if (op == Py_EQ || op == Py_NE) {
11200 result = unicode_compare_eq(left, right);
11201 result ^= (op == Py_NE);
11202 return PyBool_FromLong(result);
11203 }
11204 else {
11205 result = unicode_compare(left, right);
11206 Py_RETURN_RICHCOMPARE(result, 0, op);
11207 }
11208 }
11209
/* Non-static entry point that forwards both arguments to unicode_eq() and
   returns its result unchanged.
   NOTE(review): unicode_eq() is defined outside this section; presumably it
   returns non-zero when the two strings are equal -- confirm there. */
int
_PyUnicode_EQ(PyObject *aa, PyObject *bb)
{
    return unicode_eq(aa, bb);
}
11215
11216 int
PyUnicode_Contains(PyObject * str,PyObject * substr)11217 PyUnicode_Contains(PyObject *str, PyObject *substr)
11218 {
11219 int kind1, kind2;
11220 void *buf1, *buf2;
11221 Py_ssize_t len1, len2;
11222 int result;
11223
11224 if (!PyUnicode_Check(substr)) {
11225 PyErr_Format(PyExc_TypeError,
11226 "'in <string>' requires string as left operand, not %.100s",
11227 Py_TYPE(substr)->tp_name);
11228 return -1;
11229 }
11230 if (PyUnicode_READY(substr) == -1)
11231 return -1;
11232 if (ensure_unicode(str) < 0)
11233 return -1;
11234
11235 kind1 = PyUnicode_KIND(str);
11236 kind2 = PyUnicode_KIND(substr);
11237 if (kind1 < kind2)
11238 return 0;
11239 len1 = PyUnicode_GET_LENGTH(str);
11240 len2 = PyUnicode_GET_LENGTH(substr);
11241 if (len1 < len2)
11242 return 0;
11243 buf1 = PyUnicode_DATA(str);
11244 buf2 = PyUnicode_DATA(substr);
11245 if (len2 == 1) {
11246 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11247 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11248 return result;
11249 }
11250 if (kind2 != kind1) {
11251 buf2 = _PyUnicode_AsKind(substr, kind1);
11252 if (!buf2)
11253 return -1;
11254 }
11255
11256 switch (kind1) {
11257 case PyUnicode_1BYTE_KIND:
11258 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11259 break;
11260 case PyUnicode_2BYTE_KIND:
11261 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11262 break;
11263 case PyUnicode_4BYTE_KIND:
11264 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11265 break;
11266 default:
11267 Py_UNREACHABLE();
11268 }
11269
11270 if (kind2 != kind1)
11271 PyMem_Free(buf2);
11272
11273 return result;
11274 }
11275
11276 /* Concat to string or Unicode object giving a new Unicode object. */
11277
11278 PyObject *
PyUnicode_Concat(PyObject * left,PyObject * right)11279 PyUnicode_Concat(PyObject *left, PyObject *right)
11280 {
11281 PyObject *result;
11282 Py_UCS4 maxchar, maxchar2;
11283 Py_ssize_t left_len, right_len, new_len;
11284
11285 if (ensure_unicode(left) < 0)
11286 return NULL;
11287
11288 if (!PyUnicode_Check(right)) {
11289 PyErr_Format(PyExc_TypeError,
11290 "can only concatenate str (not \"%.200s\") to str",
11291 right->ob_type->tp_name);
11292 return NULL;
11293 }
11294 if (PyUnicode_READY(right) < 0)
11295 return NULL;
11296
11297 /* Shortcuts */
11298 if (left == unicode_empty)
11299 return PyUnicode_FromObject(right);
11300 if (right == unicode_empty)
11301 return PyUnicode_FromObject(left);
11302
11303 left_len = PyUnicode_GET_LENGTH(left);
11304 right_len = PyUnicode_GET_LENGTH(right);
11305 if (left_len > PY_SSIZE_T_MAX - right_len) {
11306 PyErr_SetString(PyExc_OverflowError,
11307 "strings are too large to concat");
11308 return NULL;
11309 }
11310 new_len = left_len + right_len;
11311
11312 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11313 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11314 maxchar = Py_MAX(maxchar, maxchar2);
11315
11316 /* Concat the two Unicode strings */
11317 result = PyUnicode_New(new_len, maxchar);
11318 if (result == NULL)
11319 return NULL;
11320 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11321 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11322 assert(_PyUnicode_CheckConsistency(result, 1));
11323 return result;
11324 }
11325
11326 void
PyUnicode_Append(PyObject ** p_left,PyObject * right)11327 PyUnicode_Append(PyObject **p_left, PyObject *right)
11328 {
11329 PyObject *left, *res;
11330 Py_UCS4 maxchar, maxchar2;
11331 Py_ssize_t left_len, right_len, new_len;
11332
11333 if (p_left == NULL) {
11334 if (!PyErr_Occurred())
11335 PyErr_BadInternalCall();
11336 return;
11337 }
11338 left = *p_left;
11339 if (right == NULL || left == NULL
11340 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11341 if (!PyErr_Occurred())
11342 PyErr_BadInternalCall();
11343 goto error;
11344 }
11345
11346 if (PyUnicode_READY(left) == -1)
11347 goto error;
11348 if (PyUnicode_READY(right) == -1)
11349 goto error;
11350
11351 /* Shortcuts */
11352 if (left == unicode_empty) {
11353 Py_DECREF(left);
11354 Py_INCREF(right);
11355 *p_left = right;
11356 return;
11357 }
11358 if (right == unicode_empty)
11359 return;
11360
11361 left_len = PyUnicode_GET_LENGTH(left);
11362 right_len = PyUnicode_GET_LENGTH(right);
11363 if (left_len > PY_SSIZE_T_MAX - right_len) {
11364 PyErr_SetString(PyExc_OverflowError,
11365 "strings are too large to concat");
11366 goto error;
11367 }
11368 new_len = left_len + right_len;
11369
11370 if (unicode_modifiable(left)
11371 && PyUnicode_CheckExact(right)
11372 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11373 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11374 to change the structure size, but characters are stored just after
11375 the structure, and so it requires to move all characters which is
11376 not so different than duplicating the string. */
11377 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11378 {
11379 /* append inplace */
11380 if (unicode_resize(p_left, new_len) != 0)
11381 goto error;
11382
11383 /* copy 'right' into the newly allocated area of 'left' */
11384 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11385 }
11386 else {
11387 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11388 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11389 maxchar = Py_MAX(maxchar, maxchar2);
11390
11391 /* Concat the two Unicode strings */
11392 res = PyUnicode_New(new_len, maxchar);
11393 if (res == NULL)
11394 goto error;
11395 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11396 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11397 Py_DECREF(left);
11398 *p_left = res;
11399 }
11400 assert(_PyUnicode_CheckConsistency(*p_left, 1));
11401 return;
11402
11403 error:
11404 Py_CLEAR(*p_left);
11405 }
11406
/* Append right to *pleft via PyUnicode_Append(), then release one reference
   to right.  Py_XDECREF is used, so a NULL right is tolerated here; the
   decref happens whether or not the append succeeded. */
void
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
{
    PyUnicode_Append(pleft, right);
    Py_XDECREF(right);
}
11413
11414 /*
11415 Wraps stringlib_parse_args_finds() and additionally ensures that the
11416 first argument is a unicode object.
11417 */
11418
11419 static inline int
parse_args_finds_unicode(const char * function_name,PyObject * args,PyObject ** substring,Py_ssize_t * start,Py_ssize_t * end)11420 parse_args_finds_unicode(const char * function_name, PyObject *args,
11421 PyObject **substring,
11422 Py_ssize_t *start, Py_ssize_t *end)
11423 {
11424 if(stringlib_parse_args_finds(function_name, args, substring,
11425 start, end)) {
11426 if (ensure_unicode(*substring) < 0)
11427 return 0;
11428 return 1;
11429 }
11430 return 0;
11431 }
11432
11433 PyDoc_STRVAR(count__doc__,
11434 "S.count(sub[, start[, end]]) -> int\n\
11435 \n\
11436 Return the number of non-overlapping occurrences of substring sub in\n\
11437 string S[start:end]. Optional arguments start and end are\n\
11438 interpreted as in slice notation.");
11439
11440 static PyObject *
unicode_count(PyObject * self,PyObject * args)11441 unicode_count(PyObject *self, PyObject *args)
11442 {
11443 PyObject *substring = NULL; /* initialize to fix a compiler warning */
11444 Py_ssize_t start = 0;
11445 Py_ssize_t end = PY_SSIZE_T_MAX;
11446 PyObject *result;
11447 int kind1, kind2;
11448 void *buf1, *buf2;
11449 Py_ssize_t len1, len2, iresult;
11450
11451 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
11452 return NULL;
11453
11454 kind1 = PyUnicode_KIND(self);
11455 kind2 = PyUnicode_KIND(substring);
11456 if (kind1 < kind2)
11457 return PyLong_FromLong(0);
11458
11459 len1 = PyUnicode_GET_LENGTH(self);
11460 len2 = PyUnicode_GET_LENGTH(substring);
11461 ADJUST_INDICES(start, end, len1);
11462 if (end - start < len2)
11463 return PyLong_FromLong(0);
11464
11465 buf1 = PyUnicode_DATA(self);
11466 buf2 = PyUnicode_DATA(substring);
11467 if (kind2 != kind1) {
11468 buf2 = _PyUnicode_AsKind(substring, kind1);
11469 if (!buf2)
11470 return NULL;
11471 }
11472 switch (kind1) {
11473 case PyUnicode_1BYTE_KIND:
11474 iresult = ucs1lib_count(
11475 ((Py_UCS1*)buf1) + start, end - start,
11476 buf2, len2, PY_SSIZE_T_MAX
11477 );
11478 break;
11479 case PyUnicode_2BYTE_KIND:
11480 iresult = ucs2lib_count(
11481 ((Py_UCS2*)buf1) + start, end - start,
11482 buf2, len2, PY_SSIZE_T_MAX
11483 );
11484 break;
11485 case PyUnicode_4BYTE_KIND:
11486 iresult = ucs4lib_count(
11487 ((Py_UCS4*)buf1) + start, end - start,
11488 buf2, len2, PY_SSIZE_T_MAX
11489 );
11490 break;
11491 default:
11492 Py_UNREACHABLE();
11493 }
11494
11495 result = PyLong_FromSsize_t(iresult);
11496
11497 if (kind2 != kind1)
11498 PyMem_Free(buf2);
11499
11500 return result;
11501 }
11502
11503 /*[clinic input]
11504 str.encode as unicode_encode
11505
11506 encoding: str(c_default="NULL") = 'utf-8'
11507 The encoding in which to encode the string.
11508 errors: str(c_default="NULL") = 'strict'
11509 The error handling scheme to use for encoding errors.
11510 The default is 'strict' meaning that encoding errors raise a
11511 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11512 'xmlcharrefreplace' as well as any other name registered with
11513 codecs.register_error that can handle UnicodeEncodeErrors.
11514
11515 Encode the string using the codec registered for encoding.
11516 [clinic start generated code]*/
11517
static PyObject *
unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
{
    /* Thin wrapper: encoding and errors are passed through unchanged (the
       clinic declaration above gives both a C default of NULL) and the
       result of PyUnicode_AsEncodedString() is returned as is. */
    return PyUnicode_AsEncodedString(self, encoding, errors);
}
11524
11525 /*[clinic input]
11526 str.expandtabs as unicode_expandtabs
11527
11528 tabsize: int = 8
11529
11530 Return a copy where all tab characters are expanded using spaces.
11531
11532 If tabsize is not given, a tab size of 8 characters is assumed.
11533 [clinic start generated code]*/
11534
11535 static PyObject *
unicode_expandtabs_impl(PyObject * self,int tabsize)11536 unicode_expandtabs_impl(PyObject *self, int tabsize)
11537 /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
11538 {
11539 Py_ssize_t i, j, line_pos, src_len, incr;
11540 Py_UCS4 ch;
11541 PyObject *u;
11542 void *src_data, *dest_data;
11543 int kind;
11544 int found;
11545
11546 if (PyUnicode_READY(self) == -1)
11547 return NULL;
11548
11549 /* First pass: determine size of output string */
11550 src_len = PyUnicode_GET_LENGTH(self);
11551 i = j = line_pos = 0;
11552 kind = PyUnicode_KIND(self);
11553 src_data = PyUnicode_DATA(self);
11554 found = 0;
11555 for (; i < src_len; i++) {
11556 ch = PyUnicode_READ(kind, src_data, i);
11557 if (ch == '\t') {
11558 found = 1;
11559 if (tabsize > 0) {
11560 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11561 if (j > PY_SSIZE_T_MAX - incr)
11562 goto overflow;
11563 line_pos += incr;
11564 j += incr;
11565 }
11566 }
11567 else {
11568 if (j > PY_SSIZE_T_MAX - 1)
11569 goto overflow;
11570 line_pos++;
11571 j++;
11572 if (ch == '\n' || ch == '\r')
11573 line_pos = 0;
11574 }
11575 }
11576 if (!found)
11577 return unicode_result_unchanged(self);
11578
11579 /* Second pass: create output string and fill it */
11580 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11581 if (!u)
11582 return NULL;
11583 dest_data = PyUnicode_DATA(u);
11584
11585 i = j = line_pos = 0;
11586
11587 for (; i < src_len; i++) {
11588 ch = PyUnicode_READ(kind, src_data, i);
11589 if (ch == '\t') {
11590 if (tabsize > 0) {
11591 incr = tabsize - (line_pos % tabsize);
11592 line_pos += incr;
11593 unicode_fill(kind, dest_data, ' ', j, incr);
11594 j += incr;
11595 }
11596 }
11597 else {
11598 line_pos++;
11599 PyUnicode_WRITE(kind, dest_data, j, ch);
11600 j++;
11601 if (ch == '\n' || ch == '\r')
11602 line_pos = 0;
11603 }
11604 }
11605 assert (j == PyUnicode_GET_LENGTH(u));
11606 return unicode_result(u);
11607
11608 overflow:
11609 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11610 return NULL;
11611 }
11612
11613 PyDoc_STRVAR(find__doc__,
11614 "S.find(sub[, start[, end]]) -> int\n\
11615 \n\
11616 Return the lowest index in S where substring sub is found,\n\
11617 such that sub is contained within S[start:end]. Optional\n\
11618 arguments start and end are interpreted as in slice notation.\n\
11619 \n\
11620 Return -1 on failure.");
11621
11622 static PyObject *
unicode_find(PyObject * self,PyObject * args)11623 unicode_find(PyObject *self, PyObject *args)
11624 {
11625 /* initialize variables to prevent gcc warning */
11626 PyObject *substring = NULL;
11627 Py_ssize_t start = 0;
11628 Py_ssize_t end = 0;
11629 Py_ssize_t result;
11630
11631 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
11632 return NULL;
11633
11634 if (PyUnicode_READY(self) == -1)
11635 return NULL;
11636
11637 result = any_find_slice(self, substring, start, end, 1);
11638
11639 if (result == -2)
11640 return NULL;
11641
11642 return PyLong_FromSsize_t(result);
11643 }
11644
11645 static PyObject *
unicode_getitem(PyObject * self,Py_ssize_t index)11646 unicode_getitem(PyObject *self, Py_ssize_t index)
11647 {
11648 void *data;
11649 enum PyUnicode_Kind kind;
11650 Py_UCS4 ch;
11651
11652 if (!PyUnicode_Check(self)) {
11653 PyErr_BadArgument();
11654 return NULL;
11655 }
11656 if (PyUnicode_READY(self) == -1) {
11657 return NULL;
11658 }
11659 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11660 PyErr_SetString(PyExc_IndexError, "string index out of range");
11661 return NULL;
11662 }
11663 kind = PyUnicode_KIND(self);
11664 data = PyUnicode_DATA(self);
11665 ch = PyUnicode_READ(kind, data, index);
11666 return unicode_char(ch);
11667 }
11668
11669 /* Believe it or not, this produces the same value for ASCII strings
11670 as bytes_hash(). */
11671 static Py_hash_t
unicode_hash(PyObject * self)11672 unicode_hash(PyObject *self)
11673 {
11674 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
11675
11676 #ifdef Py_DEBUG
11677 assert(_Py_HashSecret_Initialized);
11678 #endif
11679 if (_PyUnicode_HASH(self) != -1)
11680 return _PyUnicode_HASH(self);
11681 if (PyUnicode_READY(self) == -1)
11682 return -1;
11683
11684 x = _Py_HashBytes(PyUnicode_DATA(self),
11685 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11686 _PyUnicode_HASH(self) = x;
11687 return x;
11688 }
11689
11690 PyDoc_STRVAR(index__doc__,
11691 "S.index(sub[, start[, end]]) -> int\n\
11692 \n\
11693 Return the lowest index in S where substring sub is found,\n\
11694 such that sub is contained within S[start:end]. Optional\n\
11695 arguments start and end are interpreted as in slice notation.\n\
11696 \n\
11697 Raises ValueError when the substring is not found.");
11698
11699 static PyObject *
unicode_index(PyObject * self,PyObject * args)11700 unicode_index(PyObject *self, PyObject *args)
11701 {
11702 /* initialize variables to prevent gcc warning */
11703 Py_ssize_t result;
11704 PyObject *substring = NULL;
11705 Py_ssize_t start = 0;
11706 Py_ssize_t end = 0;
11707
11708 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
11709 return NULL;
11710
11711 if (PyUnicode_READY(self) == -1)
11712 return NULL;
11713
11714 result = any_find_slice(self, substring, start, end, 1);
11715
11716 if (result == -2)
11717 return NULL;
11718
11719 if (result < 0) {
11720 PyErr_SetString(PyExc_ValueError, "substring not found");
11721 return NULL;
11722 }
11723
11724 return PyLong_FromSsize_t(result);
11725 }
11726
11727 /*[clinic input]
11728 str.isascii as unicode_isascii
11729
11730 Return True if all characters in the string are ASCII, False otherwise.
11731
11732 ASCII characters have code points in the range U+0000-U+007F.
11733 Empty string is ASCII too.
11734 [clinic start generated code]*/
11735
11736 static PyObject *
unicode_isascii_impl(PyObject * self)11737 unicode_isascii_impl(PyObject *self)
11738 /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11739 {
11740 if (PyUnicode_READY(self) == -1) {
11741 return NULL;
11742 }
11743 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11744 }
11745
11746 /*[clinic input]
11747 str.islower as unicode_islower
11748
11749 Return True if the string is a lowercase string, False otherwise.
11750
11751 A string is lowercase if all cased characters in the string are lowercase and
11752 there is at least one cased character in the string.
11753 [clinic start generated code]*/
11754
11755 static PyObject *
unicode_islower_impl(PyObject * self)11756 unicode_islower_impl(PyObject *self)
11757 /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
11758 {
11759 Py_ssize_t i, length;
11760 int kind;
11761 void *data;
11762 int cased;
11763
11764 if (PyUnicode_READY(self) == -1)
11765 return NULL;
11766 length = PyUnicode_GET_LENGTH(self);
11767 kind = PyUnicode_KIND(self);
11768 data = PyUnicode_DATA(self);
11769
11770 /* Shortcut for single character strings */
11771 if (length == 1)
11772 return PyBool_FromLong(
11773 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11774
11775 /* Special case for empty strings */
11776 if (length == 0)
11777 Py_RETURN_FALSE;
11778
11779 cased = 0;
11780 for (i = 0; i < length; i++) {
11781 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11782
11783 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11784 Py_RETURN_FALSE;
11785 else if (!cased && Py_UNICODE_ISLOWER(ch))
11786 cased = 1;
11787 }
11788 return PyBool_FromLong(cased);
11789 }
11790
11791 /*[clinic input]
11792 str.isupper as unicode_isupper
11793
11794 Return True if the string is an uppercase string, False otherwise.
11795
11796 A string is uppercase if all cased characters in the string are uppercase and
11797 there is at least one cased character in the string.
11798 [clinic start generated code]*/
11799
11800 static PyObject *
unicode_isupper_impl(PyObject * self)11801 unicode_isupper_impl(PyObject *self)
11802 /*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
11803 {
11804 Py_ssize_t i, length;
11805 int kind;
11806 void *data;
11807 int cased;
11808
11809 if (PyUnicode_READY(self) == -1)
11810 return NULL;
11811 length = PyUnicode_GET_LENGTH(self);
11812 kind = PyUnicode_KIND(self);
11813 data = PyUnicode_DATA(self);
11814
11815 /* Shortcut for single character strings */
11816 if (length == 1)
11817 return PyBool_FromLong(
11818 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11819
11820 /* Special case for empty strings */
11821 if (length == 0)
11822 Py_RETURN_FALSE;
11823
11824 cased = 0;
11825 for (i = 0; i < length; i++) {
11826 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11827
11828 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11829 Py_RETURN_FALSE;
11830 else if (!cased && Py_UNICODE_ISUPPER(ch))
11831 cased = 1;
11832 }
11833 return PyBool_FromLong(cased);
11834 }
11835
11836 /*[clinic input]
11837 str.istitle as unicode_istitle
11838
11839 Return True if the string is a title-cased string, False otherwise.
11840
11841 In a title-cased string, upper- and title-case characters may only
11842 follow uncased characters and lowercase characters only cased ones.
11843 [clinic start generated code]*/
11844
11845 static PyObject *
unicode_istitle_impl(PyObject * self)11846 unicode_istitle_impl(PyObject *self)
11847 /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
11848 {
11849 Py_ssize_t i, length;
11850 int kind;
11851 void *data;
11852 int cased, previous_is_cased;
11853
11854 if (PyUnicode_READY(self) == -1)
11855 return NULL;
11856 length = PyUnicode_GET_LENGTH(self);
11857 kind = PyUnicode_KIND(self);
11858 data = PyUnicode_DATA(self);
11859
11860 /* Shortcut for single character strings */
11861 if (length == 1) {
11862 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11863 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11864 (Py_UNICODE_ISUPPER(ch) != 0));
11865 }
11866
11867 /* Special case for empty strings */
11868 if (length == 0)
11869 Py_RETURN_FALSE;
11870
11871 cased = 0;
11872 previous_is_cased = 0;
11873 for (i = 0; i < length; i++) {
11874 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11875
11876 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11877 if (previous_is_cased)
11878 Py_RETURN_FALSE;
11879 previous_is_cased = 1;
11880 cased = 1;
11881 }
11882 else if (Py_UNICODE_ISLOWER(ch)) {
11883 if (!previous_is_cased)
11884 Py_RETURN_FALSE;
11885 previous_is_cased = 1;
11886 cased = 1;
11887 }
11888 else
11889 previous_is_cased = 0;
11890 }
11891 return PyBool_FromLong(cased);
11892 }
11893
11894 /*[clinic input]
11895 str.isspace as unicode_isspace
11896
11897 Return True if the string is a whitespace string, False otherwise.
11898
11899 A string is whitespace if all characters in the string are whitespace and there
11900 is at least one character in the string.
11901 [clinic start generated code]*/
11902
11903 static PyObject *
unicode_isspace_impl(PyObject * self)11904 unicode_isspace_impl(PyObject *self)
11905 /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
11906 {
11907 Py_ssize_t i, length;
11908 int kind;
11909 void *data;
11910
11911 if (PyUnicode_READY(self) == -1)
11912 return NULL;
11913 length = PyUnicode_GET_LENGTH(self);
11914 kind = PyUnicode_KIND(self);
11915 data = PyUnicode_DATA(self);
11916
11917 /* Shortcut for single character strings */
11918 if (length == 1)
11919 return PyBool_FromLong(
11920 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11921
11922 /* Special case for empty strings */
11923 if (length == 0)
11924 Py_RETURN_FALSE;
11925
11926 for (i = 0; i < length; i++) {
11927 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11928 if (!Py_UNICODE_ISSPACE(ch))
11929 Py_RETURN_FALSE;
11930 }
11931 Py_RETURN_TRUE;
11932 }
11933
11934 /*[clinic input]
11935 str.isalpha as unicode_isalpha
11936
11937 Return True if the string is an alphabetic string, False otherwise.
11938
11939 A string is alphabetic if all characters in the string are alphabetic and there
11940 is at least one character in the string.
11941 [clinic start generated code]*/
11942
11943 static PyObject *
unicode_isalpha_impl(PyObject * self)11944 unicode_isalpha_impl(PyObject *self)
11945 /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
11946 {
11947 Py_ssize_t i, length;
11948 int kind;
11949 void *data;
11950
11951 if (PyUnicode_READY(self) == -1)
11952 return NULL;
11953 length = PyUnicode_GET_LENGTH(self);
11954 kind = PyUnicode_KIND(self);
11955 data = PyUnicode_DATA(self);
11956
11957 /* Shortcut for single character strings */
11958 if (length == 1)
11959 return PyBool_FromLong(
11960 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11961
11962 /* Special case for empty strings */
11963 if (length == 0)
11964 Py_RETURN_FALSE;
11965
11966 for (i = 0; i < length; i++) {
11967 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11968 Py_RETURN_FALSE;
11969 }
11970 Py_RETURN_TRUE;
11971 }
11972
11973 /*[clinic input]
11974 str.isalnum as unicode_isalnum
11975
11976 Return True if the string is an alpha-numeric string, False otherwise.
11977
11978 A string is alpha-numeric if all characters in the string are alpha-numeric and
11979 there is at least one character in the string.
11980 [clinic start generated code]*/
11981
11982 static PyObject *
unicode_isalnum_impl(PyObject * self)11983 unicode_isalnum_impl(PyObject *self)
11984 /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
11985 {
11986 int kind;
11987 void *data;
11988 Py_ssize_t len, i;
11989
11990 if (PyUnicode_READY(self) == -1)
11991 return NULL;
11992
11993 kind = PyUnicode_KIND(self);
11994 data = PyUnicode_DATA(self);
11995 len = PyUnicode_GET_LENGTH(self);
11996
11997 /* Shortcut for single character strings */
11998 if (len == 1) {
11999 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12000 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12001 }
12002
12003 /* Special case for empty strings */
12004 if (len == 0)
12005 Py_RETURN_FALSE;
12006
12007 for (i = 0; i < len; i++) {
12008 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12009 if (!Py_UNICODE_ISALNUM(ch))
12010 Py_RETURN_FALSE;
12011 }
12012 Py_RETURN_TRUE;
12013 }
12014
12015 /*[clinic input]
12016 str.isdecimal as unicode_isdecimal
12017
12018 Return True if the string is a decimal string, False otherwise.
12019
12020 A string is a decimal string if all characters in the string are decimal and
12021 there is at least one character in the string.
12022 [clinic start generated code]*/
12023
12024 static PyObject *
unicode_isdecimal_impl(PyObject * self)12025 unicode_isdecimal_impl(PyObject *self)
12026 /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
12027 {
12028 Py_ssize_t i, length;
12029 int kind;
12030 void *data;
12031
12032 if (PyUnicode_READY(self) == -1)
12033 return NULL;
12034 length = PyUnicode_GET_LENGTH(self);
12035 kind = PyUnicode_KIND(self);
12036 data = PyUnicode_DATA(self);
12037
12038 /* Shortcut for single character strings */
12039 if (length == 1)
12040 return PyBool_FromLong(
12041 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12042
12043 /* Special case for empty strings */
12044 if (length == 0)
12045 Py_RETURN_FALSE;
12046
12047 for (i = 0; i < length; i++) {
12048 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12049 Py_RETURN_FALSE;
12050 }
12051 Py_RETURN_TRUE;
12052 }
12053
12054 /*[clinic input]
12055 str.isdigit as unicode_isdigit
12056
12057 Return True if the string is a digit string, False otherwise.
12058
12059 A string is a digit string if all characters in the string are digits and there
12060 is at least one character in the string.
12061 [clinic start generated code]*/
12062
12063 static PyObject *
unicode_isdigit_impl(PyObject * self)12064 unicode_isdigit_impl(PyObject *self)
12065 /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
12066 {
12067 Py_ssize_t i, length;
12068 int kind;
12069 void *data;
12070
12071 if (PyUnicode_READY(self) == -1)
12072 return NULL;
12073 length = PyUnicode_GET_LENGTH(self);
12074 kind = PyUnicode_KIND(self);
12075 data = PyUnicode_DATA(self);
12076
12077 /* Shortcut for single character strings */
12078 if (length == 1) {
12079 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12080 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12081 }
12082
12083 /* Special case for empty strings */
12084 if (length == 0)
12085 Py_RETURN_FALSE;
12086
12087 for (i = 0; i < length; i++) {
12088 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12089 Py_RETURN_FALSE;
12090 }
12091 Py_RETURN_TRUE;
12092 }
12093
12094 /*[clinic input]
12095 str.isnumeric as unicode_isnumeric
12096
12097 Return True if the string is a numeric string, False otherwise.
12098
12099 A string is numeric if all characters in the string are numeric and there is at
12100 least one character in the string.
12101 [clinic start generated code]*/
12102
12103 static PyObject *
unicode_isnumeric_impl(PyObject * self)12104 unicode_isnumeric_impl(PyObject *self)
12105 /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
12106 {
12107 Py_ssize_t i, length;
12108 int kind;
12109 void *data;
12110
12111 if (PyUnicode_READY(self) == -1)
12112 return NULL;
12113 length = PyUnicode_GET_LENGTH(self);
12114 kind = PyUnicode_KIND(self);
12115 data = PyUnicode_DATA(self);
12116
12117 /* Shortcut for single character strings */
12118 if (length == 1)
12119 return PyBool_FromLong(
12120 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12121
12122 /* Special case for empty strings */
12123 if (length == 0)
12124 Py_RETURN_FALSE;
12125
12126 for (i = 0; i < length; i++) {
12127 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12128 Py_RETURN_FALSE;
12129 }
12130 Py_RETURN_TRUE;
12131 }
12132
12133 int
PyUnicode_IsIdentifier(PyObject * self)12134 PyUnicode_IsIdentifier(PyObject *self)
12135 {
12136 int kind;
12137 void *data;
12138 Py_ssize_t i;
12139 Py_UCS4 first;
12140
12141 if (PyUnicode_READY(self) == -1) {
12142 Py_FatalError("identifier not ready");
12143 return 0;
12144 }
12145
12146 /* Special case for empty strings */
12147 if (PyUnicode_GET_LENGTH(self) == 0)
12148 return 0;
12149 kind = PyUnicode_KIND(self);
12150 data = PyUnicode_DATA(self);
12151
12152 /* PEP 3131 says that the first character must be in
12153 XID_Start and subsequent characters in XID_Continue,
12154 and for the ASCII range, the 2.x rules apply (i.e
12155 start with letters and underscore, continue with
12156 letters, digits, underscore). However, given the current
12157 definition of XID_Start and XID_Continue, it is sufficient
12158 to check just for these, except that _ must be allowed
12159 as starting an identifier. */
12160 first = PyUnicode_READ(kind, data, 0);
12161 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
12162 return 0;
12163
12164 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
12165 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
12166 return 0;
12167 return 1;
12168 }
12169
12170 /*[clinic input]
12171 str.isidentifier as unicode_isidentifier
12172
12173 Return True if the string is a valid Python identifier, False otherwise.
12174
12175 Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12176 such as "def" or "class".
12177 [clinic start generated code]*/
12178
12179 static PyObject *
unicode_isidentifier_impl(PyObject * self)12180 unicode_isidentifier_impl(PyObject *self)
12181 /*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
12182 {
12183 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12184 }
12185
12186 /*[clinic input]
12187 str.isprintable as unicode_isprintable
12188
12189 Return True if the string is printable, False otherwise.
12190
12191 A string is printable if all of its characters are considered printable in
12192 repr() or if it is empty.
12193 [clinic start generated code]*/
12194
12195 static PyObject *
unicode_isprintable_impl(PyObject * self)12196 unicode_isprintable_impl(PyObject *self)
12197 /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
12198 {
12199 Py_ssize_t i, length;
12200 int kind;
12201 void *data;
12202
12203 if (PyUnicode_READY(self) == -1)
12204 return NULL;
12205 length = PyUnicode_GET_LENGTH(self);
12206 kind = PyUnicode_KIND(self);
12207 data = PyUnicode_DATA(self);
12208
12209 /* Shortcut for single character strings */
12210 if (length == 1)
12211 return PyBool_FromLong(
12212 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12213
12214 for (i = 0; i < length; i++) {
12215 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12216 Py_RETURN_FALSE;
12217 }
12218 }
12219 Py_RETURN_TRUE;
12220 }
12221
12222 /*[clinic input]
12223 str.join as unicode_join
12224
12225 iterable: object
12226 /
12227
12228 Concatenate any number of strings.
12229
12230 The string whose method is called is inserted in between each given string.
12231 The result is returned as a new string.
12232
12233 Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12234 [clinic start generated code]*/
12235
12236 static PyObject *
unicode_join(PyObject * self,PyObject * iterable)12237 unicode_join(PyObject *self, PyObject *iterable)
12238 /*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
12239 {
12240 return PyUnicode_Join(self, iterable);
12241 }
12242
12243 static Py_ssize_t
unicode_length(PyObject * self)12244 unicode_length(PyObject *self)
12245 {
12246 if (PyUnicode_READY(self) == -1)
12247 return -1;
12248 return PyUnicode_GET_LENGTH(self);
12249 }
12250
12251 /*[clinic input]
12252 str.ljust as unicode_ljust
12253
12254 width: Py_ssize_t
12255 fillchar: Py_UCS4 = ' '
12256 /
12257
12258 Return a left-justified string of length width.
12259
12260 Padding is done using the specified fill character (default is a space).
12261 [clinic start generated code]*/
12262
12263 static PyObject *
unicode_ljust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12264 unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12265 /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12266 {
12267 if (PyUnicode_READY(self) == -1)
12268 return NULL;
12269
12270 if (PyUnicode_GET_LENGTH(self) >= width)
12271 return unicode_result_unchanged(self);
12272
12273 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12274 }
12275
12276 /*[clinic input]
12277 str.lower as unicode_lower
12278
12279 Return a copy of the string converted to lowercase.
12280 [clinic start generated code]*/
12281
12282 static PyObject *
unicode_lower_impl(PyObject * self)12283 unicode_lower_impl(PyObject *self)
12284 /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12285 {
12286 if (PyUnicode_READY(self) == -1)
12287 return NULL;
12288 if (PyUnicode_IS_ASCII(self))
12289 return ascii_upper_or_lower(self, 1);
12290 return case_operation(self, do_lower);
12291 }
12292
/* Selectors for the strip family: which end(s) of the string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the selectors above (used in error messages).
   Both the table slots and the characters they point at are read-only. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

#define STRIPNAME(i) (stripfuncnames[i])
12301
12302 /* externally visible for str.strip(unicode) */
12303 PyObject *
_PyUnicode_XStrip(PyObject * self,int striptype,PyObject * sepobj)12304 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12305 {
12306 void *data;
12307 int kind;
12308 Py_ssize_t i, j, len;
12309 BLOOM_MASK sepmask;
12310 Py_ssize_t seplen;
12311
12312 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12313 return NULL;
12314
12315 kind = PyUnicode_KIND(self);
12316 data = PyUnicode_DATA(self);
12317 len = PyUnicode_GET_LENGTH(self);
12318 seplen = PyUnicode_GET_LENGTH(sepobj);
12319 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12320 PyUnicode_DATA(sepobj),
12321 seplen);
12322
12323 i = 0;
12324 if (striptype != RIGHTSTRIP) {
12325 while (i < len) {
12326 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12327 if (!BLOOM(sepmask, ch))
12328 break;
12329 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12330 break;
12331 i++;
12332 }
12333 }
12334
12335 j = len;
12336 if (striptype != LEFTSTRIP) {
12337 j--;
12338 while (j >= i) {
12339 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12340 if (!BLOOM(sepmask, ch))
12341 break;
12342 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12343 break;
12344 j--;
12345 }
12346
12347 j++;
12348 }
12349
12350 return PyUnicode_Substring(self, i, j);
12351 }
12352
12353 PyObject*
PyUnicode_Substring(PyObject * self,Py_ssize_t start,Py_ssize_t end)12354 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12355 {
12356 unsigned char *data;
12357 int kind;
12358 Py_ssize_t length;
12359
12360 if (PyUnicode_READY(self) == -1)
12361 return NULL;
12362
12363 length = PyUnicode_GET_LENGTH(self);
12364 end = Py_MIN(end, length);
12365
12366 if (start == 0 && end == length)
12367 return unicode_result_unchanged(self);
12368
12369 if (start < 0 || end < 0) {
12370 PyErr_SetString(PyExc_IndexError, "string index out of range");
12371 return NULL;
12372 }
12373 if (start >= length || end < start)
12374 _Py_RETURN_UNICODE_EMPTY();
12375
12376 length = end - start;
12377 if (PyUnicode_IS_ASCII(self)) {
12378 data = PyUnicode_1BYTE_DATA(self);
12379 return _PyUnicode_FromASCII((char*)(data + start), length);
12380 }
12381 else {
12382 kind = PyUnicode_KIND(self);
12383 data = PyUnicode_1BYTE_DATA(self);
12384 return PyUnicode_FromKindAndData(kind,
12385 data + kind * start,
12386 length);
12387 }
12388 }
12389
12390 static PyObject *
do_strip(PyObject * self,int striptype)12391 do_strip(PyObject *self, int striptype)
12392 {
12393 Py_ssize_t len, i, j;
12394
12395 if (PyUnicode_READY(self) == -1)
12396 return NULL;
12397
12398 len = PyUnicode_GET_LENGTH(self);
12399
12400 if (PyUnicode_IS_ASCII(self)) {
12401 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12402
12403 i = 0;
12404 if (striptype != RIGHTSTRIP) {
12405 while (i < len) {
12406 Py_UCS1 ch = data[i];
12407 if (!_Py_ascii_whitespace[ch])
12408 break;
12409 i++;
12410 }
12411 }
12412
12413 j = len;
12414 if (striptype != LEFTSTRIP) {
12415 j--;
12416 while (j >= i) {
12417 Py_UCS1 ch = data[j];
12418 if (!_Py_ascii_whitespace[ch])
12419 break;
12420 j--;
12421 }
12422 j++;
12423 }
12424 }
12425 else {
12426 int kind = PyUnicode_KIND(self);
12427 void *data = PyUnicode_DATA(self);
12428
12429 i = 0;
12430 if (striptype != RIGHTSTRIP) {
12431 while (i < len) {
12432 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12433 if (!Py_UNICODE_ISSPACE(ch))
12434 break;
12435 i++;
12436 }
12437 }
12438
12439 j = len;
12440 if (striptype != LEFTSTRIP) {
12441 j--;
12442 while (j >= i) {
12443 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12444 if (!Py_UNICODE_ISSPACE(ch))
12445 break;
12446 j--;
12447 }
12448 j++;
12449 }
12450 }
12451
12452 return PyUnicode_Substring(self, i, j);
12453 }
12454
12455
12456 static PyObject *
do_argstrip(PyObject * self,int striptype,PyObject * sep)12457 do_argstrip(PyObject *self, int striptype, PyObject *sep)
12458 {
12459 if (sep != Py_None) {
12460 if (PyUnicode_Check(sep))
12461 return _PyUnicode_XStrip(self, striptype, sep);
12462 else {
12463 PyErr_Format(PyExc_TypeError,
12464 "%s arg must be None or str",
12465 STRIPNAME(striptype));
12466 return NULL;
12467 }
12468 }
12469
12470 return do_strip(self, striptype);
12471 }
12472
12473
12474 /*[clinic input]
12475 str.strip as unicode_strip
12476
12477 chars: object = None
12478 /
12479
12480 Return a copy of the string with leading and trailing whitespace removed.
12481
12482 If chars is given and not None, remove characters in chars instead.
12483 [clinic start generated code]*/
12484
12485 static PyObject *
unicode_strip_impl(PyObject * self,PyObject * chars)12486 unicode_strip_impl(PyObject *self, PyObject *chars)
12487 /*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
12488 {
12489 return do_argstrip(self, BOTHSTRIP, chars);
12490 }
12491
12492
12493 /*[clinic input]
12494 str.lstrip as unicode_lstrip
12495
12496 chars: object = None
12497 /
12498
12499 Return a copy of the string with leading whitespace removed.
12500
12501 If chars is given and not None, remove characters in chars instead.
12502 [clinic start generated code]*/
12503
12504 static PyObject *
unicode_lstrip_impl(PyObject * self,PyObject * chars)12505 unicode_lstrip_impl(PyObject *self, PyObject *chars)
12506 /*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
12507 {
12508 return do_argstrip(self, LEFTSTRIP, chars);
12509 }
12510
12511
12512 /*[clinic input]
12513 str.rstrip as unicode_rstrip
12514
12515 chars: object = None
12516 /
12517
12518 Return a copy of the string with trailing whitespace removed.
12519
12520 If chars is given and not None, remove characters in chars instead.
12521 [clinic start generated code]*/
12522
12523 static PyObject *
unicode_rstrip_impl(PyObject * self,PyObject * chars)12524 unicode_rstrip_impl(PyObject *self, PyObject *chars)
12525 /*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
12526 {
12527 return do_argstrip(self, RIGHTSTRIP, chars);
12528 }
12529
12530
12531 static PyObject*
unicode_repeat(PyObject * str,Py_ssize_t len)12532 unicode_repeat(PyObject *str, Py_ssize_t len)
12533 {
12534 PyObject *u;
12535 Py_ssize_t nchars, n;
12536
12537 if (len < 1)
12538 _Py_RETURN_UNICODE_EMPTY();
12539
12540 /* no repeat, return original string */
12541 if (len == 1)
12542 return unicode_result_unchanged(str);
12543
12544 if (PyUnicode_READY(str) == -1)
12545 return NULL;
12546
12547 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12548 PyErr_SetString(PyExc_OverflowError,
12549 "repeated string is too long");
12550 return NULL;
12551 }
12552 nchars = len * PyUnicode_GET_LENGTH(str);
12553
12554 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12555 if (!u)
12556 return NULL;
12557 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12558
12559 if (PyUnicode_GET_LENGTH(str) == 1) {
12560 const int kind = PyUnicode_KIND(str);
12561 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12562 if (kind == PyUnicode_1BYTE_KIND) {
12563 void *to = PyUnicode_DATA(u);
12564 memset(to, (unsigned char)fill_char, len);
12565 }
12566 else if (kind == PyUnicode_2BYTE_KIND) {
12567 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12568 for (n = 0; n < len; ++n)
12569 ucs2[n] = fill_char;
12570 } else {
12571 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12572 assert(kind == PyUnicode_4BYTE_KIND);
12573 for (n = 0; n < len; ++n)
12574 ucs4[n] = fill_char;
12575 }
12576 }
12577 else {
12578 /* number of characters copied this far */
12579 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
12580 const Py_ssize_t char_size = PyUnicode_KIND(str);
12581 char *to = (char *) PyUnicode_DATA(u);
12582 memcpy(to, PyUnicode_DATA(str),
12583 PyUnicode_GET_LENGTH(str) * char_size);
12584 while (done < nchars) {
12585 n = (done <= nchars-done) ? done : nchars-done;
12586 memcpy(to + (done * char_size), to, n * char_size);
12587 done += n;
12588 }
12589 }
12590
12591 assert(_PyUnicode_CheckConsistency(u, 1));
12592 return u;
12593 }
12594
12595 PyObject *
PyUnicode_Replace(PyObject * str,PyObject * substr,PyObject * replstr,Py_ssize_t maxcount)12596 PyUnicode_Replace(PyObject *str,
12597 PyObject *substr,
12598 PyObject *replstr,
12599 Py_ssize_t maxcount)
12600 {
12601 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12602 ensure_unicode(replstr) < 0)
12603 return NULL;
12604 return replace(str, substr, replstr, maxcount);
12605 }
12606
12607 /*[clinic input]
12608 str.replace as unicode_replace
12609
12610 old: unicode
12611 new: unicode
12612 count: Py_ssize_t = -1
12613 Maximum number of occurrences to replace.
12614 -1 (the default value) means replace all occurrences.
12615 /
12616
12617 Return a copy with all occurrences of substring old replaced by new.
12618
12619 If the optional argument count is given, only the first count occurrences are
12620 replaced.
12621 [clinic start generated code]*/
12622
12623 static PyObject *
unicode_replace_impl(PyObject * self,PyObject * old,PyObject * new,Py_ssize_t count)12624 unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12625 Py_ssize_t count)
12626 /*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
12627 {
12628 if (PyUnicode_READY(self) == -1)
12629 return NULL;
12630 return replace(self, old, new, count);
12631 }
12632
12633 static PyObject *
unicode_repr(PyObject * unicode)12634 unicode_repr(PyObject *unicode)
12635 {
12636 PyObject *repr;
12637 Py_ssize_t isize;
12638 Py_ssize_t osize, squote, dquote, i, o;
12639 Py_UCS4 max, quote;
12640 int ikind, okind, unchanged;
12641 void *idata, *odata;
12642
12643 if (PyUnicode_READY(unicode) == -1)
12644 return NULL;
12645
12646 isize = PyUnicode_GET_LENGTH(unicode);
12647 idata = PyUnicode_DATA(unicode);
12648
12649 /* Compute length of output, quote characters, and
12650 maximum character */
12651 osize = 0;
12652 max = 127;
12653 squote = dquote = 0;
12654 ikind = PyUnicode_KIND(unicode);
12655 for (i = 0; i < isize; i++) {
12656 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12657 Py_ssize_t incr = 1;
12658 switch (ch) {
12659 case '\'': squote++; break;
12660 case '"': dquote++; break;
12661 case '\\': case '\t': case '\r': case '\n':
12662 incr = 2;
12663 break;
12664 default:
12665 /* Fast-path ASCII */
12666 if (ch < ' ' || ch == 0x7f)
12667 incr = 4; /* \xHH */
12668 else if (ch < 0x7f)
12669 ;
12670 else if (Py_UNICODE_ISPRINTABLE(ch))
12671 max = ch > max ? ch : max;
12672 else if (ch < 0x100)
12673 incr = 4; /* \xHH */
12674 else if (ch < 0x10000)
12675 incr = 6; /* \uHHHH */
12676 else
12677 incr = 10; /* \uHHHHHHHH */
12678 }
12679 if (osize > PY_SSIZE_T_MAX - incr) {
12680 PyErr_SetString(PyExc_OverflowError,
12681 "string is too long to generate repr");
12682 return NULL;
12683 }
12684 osize += incr;
12685 }
12686
12687 quote = '\'';
12688 unchanged = (osize == isize);
12689 if (squote) {
12690 unchanged = 0;
12691 if (dquote)
12692 /* Both squote and dquote present. Use squote,
12693 and escape them */
12694 osize += squote;
12695 else
12696 quote = '"';
12697 }
12698 osize += 2; /* quotes */
12699
12700 repr = PyUnicode_New(osize, max);
12701 if (repr == NULL)
12702 return NULL;
12703 okind = PyUnicode_KIND(repr);
12704 odata = PyUnicode_DATA(repr);
12705
12706 PyUnicode_WRITE(okind, odata, 0, quote);
12707 PyUnicode_WRITE(okind, odata, osize-1, quote);
12708 if (unchanged) {
12709 _PyUnicode_FastCopyCharacters(repr, 1,
12710 unicode, 0,
12711 isize);
12712 }
12713 else {
12714 for (i = 0, o = 1; i < isize; i++) {
12715 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12716
12717 /* Escape quotes and backslashes */
12718 if ((ch == quote) || (ch == '\\')) {
12719 PyUnicode_WRITE(okind, odata, o++, '\\');
12720 PyUnicode_WRITE(okind, odata, o++, ch);
12721 continue;
12722 }
12723
12724 /* Map special whitespace to '\t', \n', '\r' */
12725 if (ch == '\t') {
12726 PyUnicode_WRITE(okind, odata, o++, '\\');
12727 PyUnicode_WRITE(okind, odata, o++, 't');
12728 }
12729 else if (ch == '\n') {
12730 PyUnicode_WRITE(okind, odata, o++, '\\');
12731 PyUnicode_WRITE(okind, odata, o++, 'n');
12732 }
12733 else if (ch == '\r') {
12734 PyUnicode_WRITE(okind, odata, o++, '\\');
12735 PyUnicode_WRITE(okind, odata, o++, 'r');
12736 }
12737
12738 /* Map non-printable US ASCII to '\xhh' */
12739 else if (ch < ' ' || ch == 0x7F) {
12740 PyUnicode_WRITE(okind, odata, o++, '\\');
12741 PyUnicode_WRITE(okind, odata, o++, 'x');
12742 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12743 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12744 }
12745
12746 /* Copy ASCII characters as-is */
12747 else if (ch < 0x7F) {
12748 PyUnicode_WRITE(okind, odata, o++, ch);
12749 }
12750
12751 /* Non-ASCII characters */
12752 else {
12753 /* Map Unicode whitespace and control characters
12754 (categories Z* and C* except ASCII space)
12755 */
12756 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12757 PyUnicode_WRITE(okind, odata, o++, '\\');
12758 /* Map 8-bit characters to '\xhh' */
12759 if (ch <= 0xff) {
12760 PyUnicode_WRITE(okind, odata, o++, 'x');
12761 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12762 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12763 }
12764 /* Map 16-bit characters to '\uxxxx' */
12765 else if (ch <= 0xffff) {
12766 PyUnicode_WRITE(okind, odata, o++, 'u');
12767 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12768 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12769 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12770 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12771 }
12772 /* Map 21-bit characters to '\U00xxxxxx' */
12773 else {
12774 PyUnicode_WRITE(okind, odata, o++, 'U');
12775 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12776 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12777 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12778 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12779 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12780 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12781 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12782 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12783 }
12784 }
12785 /* Copy characters as-is */
12786 else {
12787 PyUnicode_WRITE(okind, odata, o++, ch);
12788 }
12789 }
12790 }
12791 }
12792 /* Closing quote already added at the beginning */
12793 assert(_PyUnicode_CheckConsistency(repr, 1));
12794 return repr;
12795 }
12796
12797 PyDoc_STRVAR(rfind__doc__,
12798 "S.rfind(sub[, start[, end]]) -> int\n\
12799 \n\
12800 Return the highest index in S where substring sub is found,\n\
12801 such that sub is contained within S[start:end]. Optional\n\
12802 arguments start and end are interpreted as in slice notation.\n\
12803 \n\
12804 Return -1 on failure.");
12805
12806 static PyObject *
unicode_rfind(PyObject * self,PyObject * args)12807 unicode_rfind(PyObject *self, PyObject *args)
12808 {
12809 /* initialize variables to prevent gcc warning */
12810 PyObject *substring = NULL;
12811 Py_ssize_t start = 0;
12812 Py_ssize_t end = 0;
12813 Py_ssize_t result;
12814
12815 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
12816 return NULL;
12817
12818 if (PyUnicode_READY(self) == -1)
12819 return NULL;
12820
12821 result = any_find_slice(self, substring, start, end, -1);
12822
12823 if (result == -2)
12824 return NULL;
12825
12826 return PyLong_FromSsize_t(result);
12827 }
12828
12829 PyDoc_STRVAR(rindex__doc__,
12830 "S.rindex(sub[, start[, end]]) -> int\n\
12831 \n\
12832 Return the highest index in S where substring sub is found,\n\
12833 such that sub is contained within S[start:end]. Optional\n\
12834 arguments start and end are interpreted as in slice notation.\n\
12835 \n\
12836 Raises ValueError when the substring is not found.");
12837
12838 static PyObject *
unicode_rindex(PyObject * self,PyObject * args)12839 unicode_rindex(PyObject *self, PyObject *args)
12840 {
12841 /* initialize variables to prevent gcc warning */
12842 PyObject *substring = NULL;
12843 Py_ssize_t start = 0;
12844 Py_ssize_t end = 0;
12845 Py_ssize_t result;
12846
12847 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
12848 return NULL;
12849
12850 if (PyUnicode_READY(self) == -1)
12851 return NULL;
12852
12853 result = any_find_slice(self, substring, start, end, -1);
12854
12855 if (result == -2)
12856 return NULL;
12857
12858 if (result < 0) {
12859 PyErr_SetString(PyExc_ValueError, "substring not found");
12860 return NULL;
12861 }
12862
12863 return PyLong_FromSsize_t(result);
12864 }
12865
12866 /*[clinic input]
12867 str.rjust as unicode_rjust
12868
12869 width: Py_ssize_t
12870 fillchar: Py_UCS4 = ' '
12871 /
12872
12873 Return a right-justified string of length width.
12874
12875 Padding is done using the specified fill character (default is a space).
12876 [clinic start generated code]*/
12877
12878 static PyObject *
unicode_rjust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12879 unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12880 /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12881 {
12882 if (PyUnicode_READY(self) == -1)
12883 return NULL;
12884
12885 if (PyUnicode_GET_LENGTH(self) >= width)
12886 return unicode_result_unchanged(self);
12887
12888 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12889 }
12890
12891 PyObject *
PyUnicode_Split(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)12892 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12893 {
12894 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12895 return NULL;
12896
12897 return split(s, sep, maxsplit);
12898 }
12899
12900 /*[clinic input]
12901 str.split as unicode_split
12902
12903 sep: object = None
12904 The delimiter according which to split the string.
12905 None (the default value) means split according to any whitespace,
12906 and discard empty strings from the result.
12907 maxsplit: Py_ssize_t = -1
12908 Maximum number of splits to do.
12909 -1 (the default value) means no limit.
12910
12911 Return a list of the words in the string, using sep as the delimiter string.
12912 [clinic start generated code]*/
12913
12914 static PyObject *
unicode_split_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)12915 unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12916 /*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
12917 {
12918 if (sep == Py_None)
12919 return split(self, NULL, maxsplit);
12920 if (PyUnicode_Check(sep))
12921 return split(self, sep, maxsplit);
12922
12923 PyErr_Format(PyExc_TypeError,
12924 "must be str or None, not %.100s",
12925 Py_TYPE(sep)->tp_name);
12926 return NULL;
12927 }
12928
12929 PyObject *
PyUnicode_Partition(PyObject * str_obj,PyObject * sep_obj)12930 PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
12931 {
12932 PyObject* out;
12933 int kind1, kind2;
12934 void *buf1, *buf2;
12935 Py_ssize_t len1, len2;
12936
12937 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12938 return NULL;
12939
12940 kind1 = PyUnicode_KIND(str_obj);
12941 kind2 = PyUnicode_KIND(sep_obj);
12942 len1 = PyUnicode_GET_LENGTH(str_obj);
12943 len2 = PyUnicode_GET_LENGTH(sep_obj);
12944 if (kind1 < kind2 || len1 < len2) {
12945 _Py_INCREF_UNICODE_EMPTY();
12946 if (!unicode_empty)
12947 out = NULL;
12948 else {
12949 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12950 Py_DECREF(unicode_empty);
12951 }
12952 return out;
12953 }
12954 buf1 = PyUnicode_DATA(str_obj);
12955 buf2 = PyUnicode_DATA(sep_obj);
12956 if (kind2 != kind1) {
12957 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12958 if (!buf2)
12959 return NULL;
12960 }
12961
12962 switch (kind1) {
12963 case PyUnicode_1BYTE_KIND:
12964 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12965 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12966 else
12967 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12968 break;
12969 case PyUnicode_2BYTE_KIND:
12970 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12971 break;
12972 case PyUnicode_4BYTE_KIND:
12973 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12974 break;
12975 default:
12976 Py_UNREACHABLE();
12977 }
12978
12979 if (kind2 != kind1)
12980 PyMem_Free(buf2);
12981
12982 return out;
12983 }
12984
12985
12986 PyObject *
PyUnicode_RPartition(PyObject * str_obj,PyObject * sep_obj)12987 PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
12988 {
12989 PyObject* out;
12990 int kind1, kind2;
12991 void *buf1, *buf2;
12992 Py_ssize_t len1, len2;
12993
12994 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12995 return NULL;
12996
12997 kind1 = PyUnicode_KIND(str_obj);
12998 kind2 = PyUnicode_KIND(sep_obj);
12999 len1 = PyUnicode_GET_LENGTH(str_obj);
13000 len2 = PyUnicode_GET_LENGTH(sep_obj);
13001 if (kind1 < kind2 || len1 < len2) {
13002 _Py_INCREF_UNICODE_EMPTY();
13003 if (!unicode_empty)
13004 out = NULL;
13005 else {
13006 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13007 Py_DECREF(unicode_empty);
13008 }
13009 return out;
13010 }
13011 buf1 = PyUnicode_DATA(str_obj);
13012 buf2 = PyUnicode_DATA(sep_obj);
13013 if (kind2 != kind1) {
13014 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13015 if (!buf2)
13016 return NULL;
13017 }
13018
13019 switch (kind1) {
13020 case PyUnicode_1BYTE_KIND:
13021 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13022 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13023 else
13024 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13025 break;
13026 case PyUnicode_2BYTE_KIND:
13027 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13028 break;
13029 case PyUnicode_4BYTE_KIND:
13030 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13031 break;
13032 default:
13033 Py_UNREACHABLE();
13034 }
13035
13036 if (kind2 != kind1)
13037 PyMem_Free(buf2);
13038
13039 return out;
13040 }
13041
13042 /*[clinic input]
13043 str.partition as unicode_partition
13044
13045 sep: object
13046 /
13047
13048 Partition the string into three parts using the given separator.
13049
13050 This will search for the separator in the string. If the separator is found,
13051 returns a 3-tuple containing the part before the separator, the separator
13052 itself, and the part after it.
13053
13054 If the separator is not found, returns a 3-tuple containing the original string
13055 and two empty strings.
13056 [clinic start generated code]*/
13057
13058 static PyObject *
unicode_partition(PyObject * self,PyObject * sep)13059 unicode_partition(PyObject *self, PyObject *sep)
13060 /*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
13061 {
13062 return PyUnicode_Partition(self, sep);
13063 }
13064
13065 /*[clinic input]
13066 str.rpartition as unicode_rpartition = str.partition
13067
13068 Partition the string into three parts using the given separator.
13069
13070 This will search for the separator in the string, starting at the end. If
13071 the separator is found, returns a 3-tuple containing the part before the
13072 separator, the separator itself, and the part after it.
13073
13074 If the separator is not found, returns a 3-tuple containing two empty strings
13075 and the original string.
13076 [clinic start generated code]*/
13077
13078 static PyObject *
unicode_rpartition(PyObject * self,PyObject * sep)13079 unicode_rpartition(PyObject *self, PyObject *sep)
13080 /*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
13081 {
13082 return PyUnicode_RPartition(self, sep);
13083 }
13084
13085 PyObject *
PyUnicode_RSplit(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13086 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13087 {
13088 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13089 return NULL;
13090
13091 return rsplit(s, sep, maxsplit);
13092 }
13093
13094 /*[clinic input]
13095 str.rsplit as unicode_rsplit = str.split
13096
13097 Return a list of the words in the string, using sep as the delimiter string.
13098
13099 Splits are done starting at the end of the string and working to the front.
13100 [clinic start generated code]*/
13101
13102 static PyObject *
unicode_rsplit_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13103 unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13104 /*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
13105 {
13106 if (sep == Py_None)
13107 return rsplit(self, NULL, maxsplit);
13108 if (PyUnicode_Check(sep))
13109 return rsplit(self, sep, maxsplit);
13110
13111 PyErr_Format(PyExc_TypeError,
13112 "must be str or None, not %.100s",
13113 Py_TYPE(sep)->tp_name);
13114 return NULL;
13115 }
13116
13117 /*[clinic input]
13118 str.splitlines as unicode_splitlines
13119
13120 keepends: bool(accept={int}) = False
13121
13122 Return a list of the lines in the string, breaking at line boundaries.
13123
13124 Line breaks are not included in the resulting list unless keepends is given and
13125 true.
13126 [clinic start generated code]*/
13127
13128 static PyObject *
unicode_splitlines_impl(PyObject * self,int keepends)13129 unicode_splitlines_impl(PyObject *self, int keepends)
13130 /*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
13131 {
13132 return PyUnicode_Splitlines(self, keepends);
13133 }
13134
13135 static
unicode_str(PyObject * self)13136 PyObject *unicode_str(PyObject *self)
13137 {
13138 return unicode_result_unchanged(self);
13139 }
13140
13141 /*[clinic input]
13142 str.swapcase as unicode_swapcase
13143
13144 Convert uppercase characters to lowercase and lowercase characters to uppercase.
13145 [clinic start generated code]*/
13146
13147 static PyObject *
unicode_swapcase_impl(PyObject * self)13148 unicode_swapcase_impl(PyObject *self)
13149 /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
13150 {
13151 if (PyUnicode_READY(self) == -1)
13152 return NULL;
13153 return case_operation(self, do_swapcase);
13154 }
13155
13156 /*[clinic input]
13157
13158 @staticmethod
13159 str.maketrans as unicode_maketrans
13160
13161 x: object
13162
13163 y: unicode=NULL
13164
13165 z: unicode=NULL
13166
13167 /
13168
13169 Return a translation table usable for str.translate().
13170
13171 If there is only one argument, it must be a dictionary mapping Unicode
13172 ordinals (integers) or characters to Unicode ordinals, strings or None.
13173 Character keys will be then converted to ordinals.
13174 If there are two arguments, they must be strings of equal length, and
13175 in the resulting dictionary, each character in x will be mapped to the
13176 character at the same position in y. If there is a third argument, it
13177 must be a string, whose characters will be mapped to None in the result.
13178 [clinic start generated code]*/
13179
13180 static PyObject *
unicode_maketrans_impl(PyObject * x,PyObject * y,PyObject * z)13181 unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
13182 /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
13183 {
13184 PyObject *new = NULL, *key, *value;
13185 Py_ssize_t i = 0;
13186 int res;
13187
13188 new = PyDict_New();
13189 if (!new)
13190 return NULL;
13191 if (y != NULL) {
13192 int x_kind, y_kind, z_kind;
13193 void *x_data, *y_data, *z_data;
13194
13195 /* x must be a string too, of equal length */
13196 if (!PyUnicode_Check(x)) {
13197 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13198 "be a string if there is a second argument");
13199 goto err;
13200 }
13201 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13202 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13203 "arguments must have equal length");
13204 goto err;
13205 }
13206 /* create entries for translating chars in x to those in y */
13207 x_kind = PyUnicode_KIND(x);
13208 y_kind = PyUnicode_KIND(y);
13209 x_data = PyUnicode_DATA(x);
13210 y_data = PyUnicode_DATA(y);
13211 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13212 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13213 if (!key)
13214 goto err;
13215 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13216 if (!value) {
13217 Py_DECREF(key);
13218 goto err;
13219 }
13220 res = PyDict_SetItem(new, key, value);
13221 Py_DECREF(key);
13222 Py_DECREF(value);
13223 if (res < 0)
13224 goto err;
13225 }
13226 /* create entries for deleting chars in z */
13227 if (z != NULL) {
13228 z_kind = PyUnicode_KIND(z);
13229 z_data = PyUnicode_DATA(z);
13230 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13231 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13232 if (!key)
13233 goto err;
13234 res = PyDict_SetItem(new, key, Py_None);
13235 Py_DECREF(key);
13236 if (res < 0)
13237 goto err;
13238 }
13239 }
13240 } else {
13241 int kind;
13242 void *data;
13243
13244 /* x must be a dict */
13245 if (!PyDict_CheckExact(x)) {
13246 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13247 "to maketrans it must be a dict");
13248 goto err;
13249 }
13250 /* copy entries into the new dict, converting string keys to int keys */
13251 while (PyDict_Next(x, &i, &key, &value)) {
13252 if (PyUnicode_Check(key)) {
13253 /* convert string keys to integer keys */
13254 PyObject *newkey;
13255 if (PyUnicode_GET_LENGTH(key) != 1) {
13256 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13257 "table must be of length 1");
13258 goto err;
13259 }
13260 kind = PyUnicode_KIND(key);
13261 data = PyUnicode_DATA(key);
13262 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13263 if (!newkey)
13264 goto err;
13265 res = PyDict_SetItem(new, newkey, value);
13266 Py_DECREF(newkey);
13267 if (res < 0)
13268 goto err;
13269 } else if (PyLong_Check(key)) {
13270 /* just keep integer keys */
13271 if (PyDict_SetItem(new, key, value) < 0)
13272 goto err;
13273 } else {
13274 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13275 "be strings or integers");
13276 goto err;
13277 }
13278 }
13279 }
13280 return new;
13281 err:
13282 Py_DECREF(new);
13283 return NULL;
13284 }
13285
13286 /*[clinic input]
13287 str.translate as unicode_translate
13288
13289 table: object
13290 Translation table, which must be a mapping of Unicode ordinals to
13291 Unicode ordinals, strings, or None.
13292 /
13293
13294 Replace each character in the string using the given translation table.
13295
13296 The table must implement lookup/indexing via __getitem__, for instance a
13297 dictionary or list. If this operation raises LookupError, the character is
13298 left untouched. Characters mapped to None are deleted.
13299 [clinic start generated code]*/
13300
13301 static PyObject *
unicode_translate(PyObject * self,PyObject * table)13302 unicode_translate(PyObject *self, PyObject *table)
13303 /*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
13304 {
13305 return _PyUnicode_TranslateCharmap(self, table, "ignore");
13306 }
13307
13308 /*[clinic input]
13309 str.upper as unicode_upper
13310
13311 Return a copy of the string converted to uppercase.
13312 [clinic start generated code]*/
13313
13314 static PyObject *
unicode_upper_impl(PyObject * self)13315 unicode_upper_impl(PyObject *self)
13316 /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
13317 {
13318 if (PyUnicode_READY(self) == -1)
13319 return NULL;
13320 if (PyUnicode_IS_ASCII(self))
13321 return ascii_upper_or_lower(self, 0);
13322 return case_operation(self, do_upper);
13323 }
13324
13325 /*[clinic input]
13326 str.zfill as unicode_zfill
13327
13328 width: Py_ssize_t
13329 /
13330
13331 Pad a numeric string with zeros on the left, to fill a field of the given width.
13332
13333 The string is never truncated.
13334 [clinic start generated code]*/
13335
13336 static PyObject *
unicode_zfill_impl(PyObject * self,Py_ssize_t width)13337 unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13338 /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
13339 {
13340 Py_ssize_t fill;
13341 PyObject *u;
13342 int kind;
13343 void *data;
13344 Py_UCS4 chr;
13345
13346 if (PyUnicode_READY(self) == -1)
13347 return NULL;
13348
13349 if (PyUnicode_GET_LENGTH(self) >= width)
13350 return unicode_result_unchanged(self);
13351
13352 fill = width - PyUnicode_GET_LENGTH(self);
13353
13354 u = pad(self, fill, 0, '0');
13355
13356 if (u == NULL)
13357 return NULL;
13358
13359 kind = PyUnicode_KIND(u);
13360 data = PyUnicode_DATA(u);
13361 chr = PyUnicode_READ(kind, data, fill);
13362
13363 if (chr == '+' || chr == '-') {
13364 /* move sign to beginning of string */
13365 PyUnicode_WRITE(kind, data, 0, chr);
13366 PyUnicode_WRITE(kind, data, fill, '0');
13367 }
13368
13369 assert(_PyUnicode_CheckConsistency(u, 1));
13370 return u;
13371 }
13372
/* Compiled out by the `#if 0` guard.  The matching "_decimal2ascii" entry
   in unicode_methods[] is disabled the same way and is described there as
   being used only for debugging the implementation. */
#if 0
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    /* Thin wrapper: returns PyUnicode_TransformDecimalAndSpaceToASCII(self). */
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
13380
/* Docstring for str.startswith().  The method is registered by hand in
   unicode_methods[] with METH_VARARGS, so the signature line at the top of
   the text is written out manually. */
PyDoc_STRVAR(startswith__doc__,
"S.startswith(prefix[, start[, end]]) -> bool\n\
\n\
Return True if S starts with the specified prefix, False otherwise.\n\
With optional start, test S beginning at that position.\n\
With optional end, stop comparing S at that position.\n\
prefix can also be a tuple of strings to try.");
13388
13389 static PyObject *
unicode_startswith(PyObject * self,PyObject * args)13390 unicode_startswith(PyObject *self,
13391 PyObject *args)
13392 {
13393 PyObject *subobj;
13394 PyObject *substring;
13395 Py_ssize_t start = 0;
13396 Py_ssize_t end = PY_SSIZE_T_MAX;
13397 int result;
13398
13399 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13400 return NULL;
13401 if (PyTuple_Check(subobj)) {
13402 Py_ssize_t i;
13403 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13404 substring = PyTuple_GET_ITEM(subobj, i);
13405 if (!PyUnicode_Check(substring)) {
13406 PyErr_Format(PyExc_TypeError,
13407 "tuple for startswith must only contain str, "
13408 "not %.100s",
13409 Py_TYPE(substring)->tp_name);
13410 return NULL;
13411 }
13412 result = tailmatch(self, substring, start, end, -1);
13413 if (result == -1)
13414 return NULL;
13415 if (result) {
13416 Py_RETURN_TRUE;
13417 }
13418 }
13419 /* nothing matched */
13420 Py_RETURN_FALSE;
13421 }
13422 if (!PyUnicode_Check(subobj)) {
13423 PyErr_Format(PyExc_TypeError,
13424 "startswith first arg must be str or "
13425 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13426 return NULL;
13427 }
13428 result = tailmatch(self, subobj, start, end, -1);
13429 if (result == -1)
13430 return NULL;
13431 return PyBool_FromLong(result);
13432 }
13433
13434
/* Docstring for str.endswith().  The method is registered by hand in
   unicode_methods[] with METH_VARARGS, so the signature line at the top of
   the text is written out manually. */
PyDoc_STRVAR(endswith__doc__,
"S.endswith(suffix[, start[, end]]) -> bool\n\
\n\
Return True if S ends with the specified suffix, False otherwise.\n\
With optional start, test S beginning at that position.\n\
With optional end, stop comparing S at that position.\n\
suffix can also be a tuple of strings to try.");
13442
13443 static PyObject *
unicode_endswith(PyObject * self,PyObject * args)13444 unicode_endswith(PyObject *self,
13445 PyObject *args)
13446 {
13447 PyObject *subobj;
13448 PyObject *substring;
13449 Py_ssize_t start = 0;
13450 Py_ssize_t end = PY_SSIZE_T_MAX;
13451 int result;
13452
13453 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13454 return NULL;
13455 if (PyTuple_Check(subobj)) {
13456 Py_ssize_t i;
13457 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13458 substring = PyTuple_GET_ITEM(subobj, i);
13459 if (!PyUnicode_Check(substring)) {
13460 PyErr_Format(PyExc_TypeError,
13461 "tuple for endswith must only contain str, "
13462 "not %.100s",
13463 Py_TYPE(substring)->tp_name);
13464 return NULL;
13465 }
13466 result = tailmatch(self, substring, start, end, +1);
13467 if (result == -1)
13468 return NULL;
13469 if (result) {
13470 Py_RETURN_TRUE;
13471 }
13472 }
13473 Py_RETURN_FALSE;
13474 }
13475 if (!PyUnicode_Check(subobj)) {
13476 PyErr_Format(PyExc_TypeError,
13477 "endswith first arg must be str or "
13478 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13479 return NULL;
13480 }
13481 result = tailmatch(self, subobj, start, end, +1);
13482 if (result == -1)
13483 return NULL;
13484 return PyBool_FromLong(result);
13485 }
13486
13487 static inline void
_PyUnicodeWriter_Update(_PyUnicodeWriter * writer)13488 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13489 {
13490 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13491 writer->data = PyUnicode_DATA(writer->buffer);
13492
13493 if (!writer->readonly) {
13494 writer->kind = PyUnicode_KIND(writer->buffer);
13495 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13496 }
13497 else {
13498 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13499 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13500 writer->kind = PyUnicode_WCHAR_KIND;
13501 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13502
13503 /* Copy-on-write mode: set buffer size to 0 so
13504 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13505 * next write. */
13506 writer->size = 0;
13507 }
13508 }
13509
13510 void
_PyUnicodeWriter_Init(_PyUnicodeWriter * writer)13511 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13512 {
13513 memset(writer, 0, sizeof(*writer));
13514
13515 /* ASCII is the bare minimum */
13516 writer->min_char = 127;
13517
13518 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13519 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13520 writer->kind = PyUnicode_WCHAR_KIND;
13521 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13522 }
13523
/* Make room in the writer for `length` more characters whose largest code
   point is `maxchar`.

   Depending on the current state this allocates the first buffer, grows
   the existing one (replacing it with a wider copy when `maxchar` exceeds
   writer->maxchar or the buffer is read-only), or only widens it.  The
   cached fields are refreshed through _PyUnicodeWriter_Update() before
   returning.

   Returns 0 on success.  Returns -1 on failure; PyErr_NoMemory() is
   raised here when pos + length would exceed PY_SSIZE_T_MAX, other
   failures come from PyUnicode_New() / resize_compact(). */
int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                 Py_ssize_t length, Py_UCS4 maxchar)
{
    Py_ssize_t newlen;
    PyObject *newbuffer;

    assert(maxchar <= MAX_UNICODE);

    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
    assert((maxchar > writer->maxchar && length >= 0)
           || length > 0);

    /* guard the addition below against Py_ssize_t overflow */
    if (length > PY_SSIZE_T_MAX - writer->pos) {
        PyErr_NoMemory();
        return -1;
    }
    newlen = writer->pos + length;

    /* never allocate narrower than the configured minimum character */
    maxchar = Py_MAX(maxchar, writer->min_char);

    if (writer->buffer == NULL) {
        /* first allocation */
        assert(!writer->readonly);
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        writer->buffer = PyUnicode_New(newlen, maxchar);
        if (writer->buffer == NULL)
            return -1;
    }
    else if (newlen > writer->size) {
        /* existing buffer is too short */
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        if (maxchar > writer->maxchar || writer->readonly) {
            /* resize + widen */
            maxchar = Py_MAX(maxchar, writer->maxchar);
            newbuffer = PyUnicode_New(newlen, maxchar);
            if (newbuffer == NULL)
                return -1;
            /* carry over the characters written so far, then drop the
               writer's reference to the old buffer */
            _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                          writer->buffer, 0, writer->pos);
            Py_DECREF(writer->buffer);
            writer->readonly = 0;
        }
        else {
            /* same width is enough: resize via resize_compact() */
            newbuffer = resize_compact(writer->buffer, newlen);
            if (newbuffer == NULL)
                return -1;
        }
        writer->buffer = newbuffer;
    }
    else if (maxchar > writer->maxchar) {
        /* long enough already, but too narrow: allocate a wider buffer of
           the same size and copy the written prefix into it */
        assert(!writer->readonly);
        newbuffer = PyUnicode_New(writer->size, maxchar);
        if (newbuffer == NULL)
            return -1;
        _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                      writer->buffer, 0, writer->pos);
        Py_SETREF(writer->buffer, newbuffer);
    }
    _PyUnicodeWriter_Update(writer);
    return 0;

    /* the macro is not needed past this function */
#undef OVERALLOCATE_FACTOR
}
13600
13601 int
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter * writer,enum PyUnicode_Kind kind)13602 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13603 enum PyUnicode_Kind kind)
13604 {
13605 Py_UCS4 maxchar;
13606
13607 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13608 assert(writer->kind < kind);
13609
13610 switch (kind)
13611 {
13612 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13613 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13614 case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
13615 default:
13616 Py_UNREACHABLE();
13617 }
13618
13619 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13620 }
13621
13622 static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter * writer,Py_UCS4 ch)13623 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13624 {
13625 assert(ch <= MAX_UNICODE);
13626 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13627 return -1;
13628 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13629 writer->pos++;
13630 return 0;
13631 }
13632
13633 int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter * writer,Py_UCS4 ch)13634 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13635 {
13636 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13637 }
13638
13639 int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter * writer,PyObject * str)13640 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13641 {
13642 Py_UCS4 maxchar;
13643 Py_ssize_t len;
13644
13645 if (PyUnicode_READY(str) == -1)
13646 return -1;
13647 len = PyUnicode_GET_LENGTH(str);
13648 if (len == 0)
13649 return 0;
13650 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13651 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13652 if (writer->buffer == NULL && !writer->overallocate) {
13653 assert(_PyUnicode_CheckConsistency(str, 1));
13654 writer->readonly = 1;
13655 Py_INCREF(str);
13656 writer->buffer = str;
13657 _PyUnicodeWriter_Update(writer);
13658 writer->pos += len;
13659 return 0;
13660 }
13661 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13662 return -1;
13663 }
13664 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13665 str, 0, len);
13666 writer->pos += len;
13667 return 0;
13668 }
13669
13670 int
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t start,Py_ssize_t end)13671 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13672 Py_ssize_t start, Py_ssize_t end)
13673 {
13674 Py_UCS4 maxchar;
13675 Py_ssize_t len;
13676
13677 if (PyUnicode_READY(str) == -1)
13678 return -1;
13679
13680 assert(0 <= start);
13681 assert(end <= PyUnicode_GET_LENGTH(str));
13682 assert(start <= end);
13683
13684 if (end == 0)
13685 return 0;
13686
13687 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13688 return _PyUnicodeWriter_WriteStr(writer, str);
13689
13690 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13691 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13692 else
13693 maxchar = writer->maxchar;
13694 len = end - start;
13695
13696 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13697 return -1;
13698
13699 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13700 str, start, len);
13701 writer->pos += len;
13702 return 0;
13703 }
13704
13705 int
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter * writer,const char * ascii,Py_ssize_t len)13706 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13707 const char *ascii, Py_ssize_t len)
13708 {
13709 if (len == -1)
13710 len = strlen(ascii);
13711
13712 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
13713
13714 if (writer->buffer == NULL && !writer->overallocate) {
13715 PyObject *str;
13716
13717 str = _PyUnicode_FromASCII(ascii, len);
13718 if (str == NULL)
13719 return -1;
13720
13721 writer->readonly = 1;
13722 writer->buffer = str;
13723 _PyUnicodeWriter_Update(writer);
13724 writer->pos += len;
13725 return 0;
13726 }
13727
13728 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13729 return -1;
13730
13731 switch (writer->kind)
13732 {
13733 case PyUnicode_1BYTE_KIND:
13734 {
13735 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13736 Py_UCS1 *data = writer->data;
13737
13738 memcpy(data + writer->pos, str, len);
13739 break;
13740 }
13741 case PyUnicode_2BYTE_KIND:
13742 {
13743 _PyUnicode_CONVERT_BYTES(
13744 Py_UCS1, Py_UCS2,
13745 ascii, ascii + len,
13746 (Py_UCS2 *)writer->data + writer->pos);
13747 break;
13748 }
13749 case PyUnicode_4BYTE_KIND:
13750 {
13751 _PyUnicode_CONVERT_BYTES(
13752 Py_UCS1, Py_UCS4,
13753 ascii, ascii + len,
13754 (Py_UCS4 *)writer->data + writer->pos);
13755 break;
13756 }
13757 default:
13758 Py_UNREACHABLE();
13759 }
13760
13761 writer->pos += len;
13762 return 0;
13763 }
13764
13765 int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter * writer,const char * str,Py_ssize_t len)13766 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13767 const char *str, Py_ssize_t len)
13768 {
13769 Py_UCS4 maxchar;
13770
13771 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
13772 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13773 return -1;
13774 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13775 writer->pos += len;
13776 return 0;
13777 }
13778
13779 PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter * writer)13780 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
13781 {
13782 PyObject *str;
13783
13784 if (writer->pos == 0) {
13785 Py_CLEAR(writer->buffer);
13786 _Py_RETURN_UNICODE_EMPTY();
13787 }
13788
13789 str = writer->buffer;
13790 writer->buffer = NULL;
13791
13792 if (writer->readonly) {
13793 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13794 return str;
13795 }
13796
13797 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13798 PyObject *str2;
13799 str2 = resize_compact(str, writer->pos);
13800 if (str2 == NULL) {
13801 Py_DECREF(str);
13802 return NULL;
13803 }
13804 str = str2;
13805 }
13806
13807 assert(_PyUnicode_CheckConsistency(str, 1));
13808 return unicode_result_ready(str);
13809 }
13810
13811 void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter * writer)13812 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
13813 {
13814 Py_CLEAR(writer->buffer);
13815 }
13816
13817 #include "stringlib/unicode_format.h"
13818
/* Docstring for str.format(), attached to the do_string_format entry in
   unicode_methods[]. */
PyDoc_STRVAR(format__doc__,
"S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

/* Docstring for str.format_map(), attached to the do_string_format_map
   entry in unicode_methods[]. */
PyDoc_STRVAR(format_map__doc__,
"S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
13830
13831 /*[clinic input]
13832 str.__format__ as unicode___format__
13833
13834 format_spec: unicode
13835 /
13836
13837 Return a formatted version of the string as described by format_spec.
13838 [clinic start generated code]*/
13839
13840 static PyObject *
unicode___format___impl(PyObject * self,PyObject * format_spec)13841 unicode___format___impl(PyObject *self, PyObject *format_spec)
13842 /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
13843 {
13844 _PyUnicodeWriter writer;
13845 int ret;
13846
13847 if (PyUnicode_READY(self) == -1)
13848 return NULL;
13849 _PyUnicodeWriter_Init(&writer);
13850 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13851 self, format_spec, 0,
13852 PyUnicode_GET_LENGTH(format_spec));
13853 if (ret == -1) {
13854 _PyUnicodeWriter_Dealloc(&writer);
13855 return NULL;
13856 }
13857 return _PyUnicodeWriter_Finish(&writer);
13858 }
13859
13860 /*[clinic input]
13861 str.__sizeof__ as unicode_sizeof
13862
13863 Return the size of the string in memory, in bytes.
13864 [clinic start generated code]*/
13865
13866 static PyObject *
unicode_sizeof_impl(PyObject * self)13867 unicode_sizeof_impl(PyObject *self)
13868 /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
13869 {
13870 Py_ssize_t size;
13871
13872 /* If it's a compact object, account for base structure +
13873 character data. */
13874 if (PyUnicode_IS_COMPACT_ASCII(self))
13875 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13876 else if (PyUnicode_IS_COMPACT(self))
13877 size = sizeof(PyCompactUnicodeObject) +
13878 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
13879 else {
13880 /* If it is a two-block object, account for base object, and
13881 for character block if present. */
13882 size = sizeof(PyUnicodeObject);
13883 if (_PyUnicode_DATA_ANY(self))
13884 size += (PyUnicode_GET_LENGTH(self) + 1) *
13885 PyUnicode_KIND(self);
13886 }
13887 /* If the wstr pointer is present, account for it unless it is shared
13888 with the data pointer. Check if the data is not shared. */
13889 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13890 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13891 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13892 size += PyUnicode_UTF8_LENGTH(self) + 1;
13893
13894 return PyLong_FromSsize_t(size);
13895 }
13896
13897 static PyObject *
unicode_getnewargs(PyObject * v,PyObject * Py_UNUSED (ignored))13898 unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
13899 {
13900 PyObject *copy = _PyUnicode_Copy(v);
13901 if (!copy)
13902 return NULL;
13903 return Py_BuildValue("(N)", copy);
13904 }
13905
/* Method table for the str type.

   Entries written as UNICODE_*_METHODDEF are macros defined outside this
   section; the brace-initialised entries are spelled out by hand and pair a
   C function with one of the *__doc__ strings.  The table ends with the
   {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* do_string_format takes keyword arguments, hence the intermediate
       void(*)(void) cast before the cast to PyCFunction */
    {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* no docstring is supplied for this entry */
    {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
13962
13963 static PyObject *
unicode_mod(PyObject * v,PyObject * w)13964 unicode_mod(PyObject *v, PyObject *w)
13965 {
13966 if (!PyUnicode_Check(v))
13967 Py_RETURN_NOTIMPLEMENTED;
13968 return PyUnicode_Format(v, w);
13969 }
13970
/* Number protocol slots for str.  Only nb_remainder is filled in (with
   unicode_mod); the slots after it are left out of the initialiser and are
   therefore zero. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
13977
/* Sequence protocol slots for str: length, concatenation, repetition,
   item access and containment are provided; the slice and item-assignment
   slots are 0. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
13988
13989 static PyObject*
unicode_subscript(PyObject * self,PyObject * item)13990 unicode_subscript(PyObject* self, PyObject* item)
13991 {
13992 if (PyUnicode_READY(self) == -1)
13993 return NULL;
13994
13995 if (PyIndex_Check(item)) {
13996 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13997 if (i == -1 && PyErr_Occurred())
13998 return NULL;
13999 if (i < 0)
14000 i += PyUnicode_GET_LENGTH(self);
14001 return unicode_getitem(self, i);
14002 } else if (PySlice_Check(item)) {
14003 Py_ssize_t start, stop, step, slicelength, i;
14004 size_t cur;
14005 PyObject *result;
14006 void *src_data, *dest_data;
14007 int src_kind, dest_kind;
14008 Py_UCS4 ch, max_char, kind_limit;
14009
14010 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
14011 return NULL;
14012 }
14013 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14014 &start, &stop, step);
14015
14016 if (slicelength <= 0) {
14017 _Py_RETURN_UNICODE_EMPTY();
14018 } else if (start == 0 && step == 1 &&
14019 slicelength == PyUnicode_GET_LENGTH(self)) {
14020 return unicode_result_unchanged(self);
14021 } else if (step == 1) {
14022 return PyUnicode_Substring(self,
14023 start, start + slicelength);
14024 }
14025 /* General case */
14026 src_kind = PyUnicode_KIND(self);
14027 src_data = PyUnicode_DATA(self);
14028 if (!PyUnicode_IS_ASCII(self)) {
14029 kind_limit = kind_maxchar_limit(src_kind);
14030 max_char = 0;
14031 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14032 ch = PyUnicode_READ(src_kind, src_data, cur);
14033 if (ch > max_char) {
14034 max_char = ch;
14035 if (max_char >= kind_limit)
14036 break;
14037 }
14038 }
14039 }
14040 else
14041 max_char = 127;
14042 result = PyUnicode_New(slicelength, max_char);
14043 if (result == NULL)
14044 return NULL;
14045 dest_kind = PyUnicode_KIND(result);
14046 dest_data = PyUnicode_DATA(result);
14047
14048 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14049 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14050 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
14051 }
14052 assert(_PyUnicode_CheckConsistency(result, 1));
14053 return result;
14054 } else {
14055 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14056 return NULL;
14057 }
14058 }
14059
/* Mapping protocol slots for str: length and subscripting are provided;
   the subscript-assignment slot is 0. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
14065
14066
14067 /* Helpers for PyUnicode_Format() */
14068
/* Working state shared by the PyUnicode_Format() helpers. */
struct unicode_formatter_t {
    /* Argument source.  unicode_format_getnextarg() returns `args` itself
       when arglen is negative and PyTuple_GetItem(args, argidx) otherwise. */
    PyObject *args;
    /* NOTE(review): presumably non-zero when this struct holds its own
       reference to `args` -- confirm against the code that sets it. */
    int args_owned;
    /* arglen: bound that argidx is compared against (negative means `args`
       is handed out whole); argidx: position of the next argument. */
    Py_ssize_t arglen, argidx;
    /* NOTE(review): presumably the mapping used for %(name)s lookups --
       confirm against the users of this field. */
    PyObject *dict;

    /* NOTE(review): the fmt* fields look like the format string plus its
       kind, data pointer, remaining count and read position -- not used in
       the nearby helpers, confirm against the parsing code. */
    enum PyUnicode_Kind fmtkind;
    Py_ssize_t fmtcnt, fmtpos;
    void *fmtdata;
    PyObject *fmtstr;

    /* Output accumulator. */
    _PyUnicodeWriter writer;
};
14082
/* One parsed conversion specifier for the PyUnicode_Format() helpers. */
struct unicode_format_arg_t {
    /* Conversion character; formatfloat() passes it to
       PyOS_double_to_string() as the format code. */
    Py_UCS4 ch;
    /* Bit flags; formatfloat() tests F_ALT here. */
    int flags;
    /* NOTE(review): presumably the minimum field width -- not used in the
       nearby helpers, confirm. */
    Py_ssize_t width;
    /* Precision; formatfloat() treats a negative value as "use 6". */
    int prec;
    /* NOTE(review): presumably records sign handling for numeric output --
       confirm against the code that sets it. */
    int sign;
};
14090
14091 static PyObject *
unicode_format_getnextarg(struct unicode_formatter_t * ctx)14092 unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14093 {
14094 Py_ssize_t argidx = ctx->argidx;
14095
14096 if (argidx < ctx->arglen) {
14097 ctx->argidx++;
14098 if (ctx->arglen < 0)
14099 return ctx->args;
14100 else
14101 return PyTuple_GetItem(ctx->args, argidx);
14102 }
14103 PyErr_SetString(PyExc_TypeError,
14104 "not enough arguments for format string");
14105 return NULL;
14106 }
14107
/* formatfloat() below: when no writer is given, the string stored in
   *p_output is a new reference to a PyUnicode object. */
14109
14110 /* Format a float into the writer if the writer is not NULL, or into *p_output
14111 otherwise.
14112
14113 Return 0 on success, raise an exception and return -1 on error. */
14114 static int
formatfloat(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14115 formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14116 PyObject **p_output,
14117 _PyUnicodeWriter *writer)
14118 {
14119 char *p;
14120 double x;
14121 Py_ssize_t len;
14122 int prec;
14123 int dtoa_flags;
14124
14125 x = PyFloat_AsDouble(v);
14126 if (x == -1.0 && PyErr_Occurred())
14127 return -1;
14128
14129 prec = arg->prec;
14130 if (prec < 0)
14131 prec = 6;
14132
14133 if (arg->flags & F_ALT)
14134 dtoa_flags = Py_DTSF_ALT;
14135 else
14136 dtoa_flags = 0;
14137 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14138 if (p == NULL)
14139 return -1;
14140 len = strlen(p);
14141 if (writer) {
14142 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14143 PyMem_Free(p);
14144 return -1;
14145 }
14146 }
14147 else
14148 *p_output = _PyUnicode_FromASCII(p, len);
14149 PyMem_Free(p);
14150 return 0;
14151 }
14152
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the alternate form, for Python ints.
 * Return value:  a new PyUnicodeObject*, or NULL if error.
 * The output string is of the form
 *         "-"? ("0x" | "0X" | "0o")? digit+
 * The base marker is present only for o, x and X conversions when alt is
 * non-zero.  The case of hex digits will be correct.
 * There will be at least prec digits, zero-filled on the left if
 * necessary to get that many.
 *  val       object to be converted (must satisfy PyLong_Check)
 *  alt       non-zero to keep the base marker (the F_ALT flag of callers)
 *  prec      minimum number of digits; 0-fill on left if needed
 *  type      a character in [diuoxX]; i and u act the same as d
 *
 * Raises OverflowError when prec is larger than INT_MAX-3.
 *
 * CAUTION:  o, x and X conversions can produce a '-' sign for negative
 * values.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* Avoid exceeding SSIZE_T_MAX: numnondigits + prec is computed below
       and numnondigits is at most 3 (sign plus a two-character marker). */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    switch (type) {
    default:
        Py_UNREACHABLE();
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless F_ALT.  buf is advanced past the two
       marker characters and the sign, if any, is rewritten in front of the
       digits; buf then no longer equals the start of result's data. */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to reach the minimum number of digits.
       The padded text is built in a temporary bytes object, which
       replaces result. */
    if (prec > numdigits) {
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        /* sign and base marker, then the zero fill, then the digits */
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_DECREF(result);
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* If result is the temporary bytes object, or buf no longer points at
       the start of result's data, build the final string from buf. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14298
14299 /* Format an integer or a float as an integer.
14300 * Return 1 if the number has been formatted into the writer,
14301 * 0 if the number has been formatted into *p_output
14302 * -1 and raise an exception on error */
14303 static int
mainformatlong(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14304 mainformatlong(PyObject *v,
14305 struct unicode_format_arg_t *arg,
14306 PyObject **p_output,
14307 _PyUnicodeWriter *writer)
14308 {
14309 PyObject *iobj, *res;
14310 char type = (char)arg->ch;
14311
14312 if (!PyNumber_Check(v))
14313 goto wrongtype;
14314
14315 /* make sure number is a type of integer for o, x, and X */
14316 if (!PyLong_Check(v)) {
14317 if (type == 'o' || type == 'x' || type == 'X') {
14318 iobj = PyNumber_Index(v);
14319 if (iobj == NULL) {
14320 if (PyErr_ExceptionMatches(PyExc_TypeError))
14321 goto wrongtype;
14322 return -1;
14323 }
14324 }
14325 else {
14326 iobj = PyNumber_Long(v);
14327 if (iobj == NULL ) {
14328 if (PyErr_ExceptionMatches(PyExc_TypeError))
14329 goto wrongtype;
14330 return -1;
14331 }
14332 }
14333 assert(PyLong_Check(iobj));
14334 }
14335 else {
14336 iobj = v;
14337 Py_INCREF(iobj);
14338 }
14339
14340 if (PyLong_CheckExact(v)
14341 && arg->width == -1 && arg->prec == -1
14342 && !(arg->flags & (F_SIGN | F_BLANK))
14343 && type != 'X')
14344 {
14345 /* Fast path */
14346 int alternate = arg->flags & F_ALT;
14347 int base;
14348
14349 switch(type)
14350 {
14351 default:
14352 Py_UNREACHABLE();
14353 case 'd':
14354 case 'i':
14355 case 'u':
14356 base = 10;
14357 break;
14358 case 'o':
14359 base = 8;
14360 break;
14361 case 'x':
14362 case 'X':
14363 base = 16;
14364 break;
14365 }
14366
14367 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14368 Py_DECREF(iobj);
14369 return -1;
14370 }
14371 Py_DECREF(iobj);
14372 return 1;
14373 }
14374
14375 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14376 Py_DECREF(iobj);
14377 if (res == NULL)
14378 return -1;
14379 *p_output = res;
14380 return 0;
14381
14382 wrongtype:
14383 switch(type)
14384 {
14385 case 'o':
14386 case 'x':
14387 case 'X':
14388 PyErr_Format(PyExc_TypeError,
14389 "%%%c format: an integer is required, "
14390 "not %.200s",
14391 type, Py_TYPE(v)->tp_name);
14392 break;
14393 default:
14394 PyErr_Format(PyExc_TypeError,
14395 "%%%c format: a number is required, "
14396 "not %.200s",
14397 type, Py_TYPE(v)->tp_name);
14398 break;
14399 }
14400 return -1;
14401 }
14402
14403 static Py_UCS4
formatchar(PyObject * v)14404 formatchar(PyObject *v)
14405 {
14406 /* presume that the buffer is at least 3 characters long */
14407 if (PyUnicode_Check(v)) {
14408 if (PyUnicode_GET_LENGTH(v) == 1) {
14409 return PyUnicode_READ_CHAR(v, 0);
14410 }
14411 goto onError;
14412 }
14413 else {
14414 PyObject *iobj;
14415 long x;
14416 /* make sure number is a type of integer */
14417 if (!PyLong_Check(v)) {
14418 iobj = PyNumber_Index(v);
14419 if (iobj == NULL) {
14420 goto onError;
14421 }
14422 x = PyLong_AsLong(iobj);
14423 Py_DECREF(iobj);
14424 }
14425 else {
14426 x = PyLong_AsLong(v);
14427 }
14428 if (x == -1 && PyErr_Occurred())
14429 goto onError;
14430
14431 if (x < 0 || x > MAX_UNICODE) {
14432 PyErr_SetString(PyExc_OverflowError,
14433 "%c arg not in range(0x110000)");
14434 return (Py_UCS4) -1;
14435 }
14436
14437 return (Py_UCS4) x;
14438 }
14439
14440 onError:
14441 PyErr_SetString(PyExc_TypeError,
14442 "%c requires int or char");
14443 return (Py_UCS4) -1;
14444 }
14445
/* Parse options of an argument: flags, width, precision.
   Handle also "%(name)" syntax.

   On entry arg->ch holds the character that follows '%' and ctx->fmtpos
   is the index of that character.  On success arg->ch holds the conversion
   character, ctx->fmtpos is the index just after it, and arg->flags,
   arg->width and arg->prec reflect what was parsed.  A "%(name)" key
   replaces ctx->args with the value looked up in ctx->dict.

   Return 0 on success.
   Raise an exception and return -1 on error. */
static int
unicode_format_arg_parse(struct unicode_formatter_t *ctx,
                         struct unicode_format_arg_t *arg)
{
/* Read the format character at the current position (no bounds check). */
#define FORMAT_READ(ctx) \
        PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)

    PyObject *v;

    if (arg->ch == '(') {
        /* Get argument value from a dictionary. Example: "%(name)s". */
        Py_ssize_t keystart;
        Py_ssize_t keylen;
        PyObject *key;
        int pcount = 1;

        if (ctx->dict == NULL) {
            PyErr_SetString(PyExc_TypeError,
                            "format requires a mapping");
            return -1;
        }
        /* Step over the opening parenthesis. */
        ++ctx->fmtpos;
        --ctx->fmtcnt;
        keystart = ctx->fmtpos;
        /* Skip over balanced parentheses */
        while (pcount > 0 && --ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            if (arg->ch == ')')
                --pcount;
            else if (arg->ch == '(')
                ++pcount;
            ctx->fmtpos++;
        }
        /* fmtpos is now one past the closing parenthesis. */
        keylen = ctx->fmtpos - keystart - 1;
        if (ctx->fmtcnt < 0 || pcount > 0) {
            PyErr_SetString(PyExc_ValueError,
                            "incomplete format key");
            return -1;
        }
        key = PyUnicode_Substring(ctx->fmtstr,
                                  keystart, keystart + keylen);
        if (key == NULL)
            return -1;
        /* Release the value of a previous "%(key)" lookup, if any. */
        if (ctx->args_owned) {
            ctx->args_owned = 0;
            Py_DECREF(ctx->args);
        }
        ctx->args = PyObject_GetItem(ctx->dict, key);
        Py_DECREF(key);
        if (ctx->args == NULL)
            return -1;
        ctx->args_owned = 1;
        /* The looked-up value is consumed as a single argument. */
        ctx->arglen = -1;
        ctx->argidx = -2;
    }

    /* Parse flags. Example: "%+i" => flags=F_SIGN. */
    while (--ctx->fmtcnt >= 0) {
        arg->ch = FORMAT_READ(ctx);
        ctx->fmtpos++;
        switch (arg->ch) {
        case '-': arg->flags |= F_LJUST; continue;
        case '+': arg->flags |= F_SIGN; continue;
        case ' ': arg->flags |= F_BLANK; continue;
        case '#': arg->flags |= F_ALT; continue;
        case '0': arg->flags |= F_ZERO; continue;
        }
        break;
    }

    /* Parse width. Example: "%10s" => width=10 */
    if (arg->ch == '*') {
        /* Width is taken from the next argument. */
        v = unicode_format_getnextarg(ctx);
        if (v == NULL)
            return -1;
        if (!PyLong_Check(v)) {
            PyErr_SetString(PyExc_TypeError,
                            "* wants int");
            return -1;
        }
        arg->width = PyLong_AsSsize_t(v);
        if (arg->width == -1 && PyErr_Occurred())
            return -1;
        if (arg->width < 0) {
            /* A negative width means left justification.
               NOTE(review): negating PY_SSIZE_T_MIN overflows a signed
               type -- confirm whether that value needs to be rejected. */
            arg->flags |= F_LJUST;
            arg->width = -arg->width;
        }
        if (--ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            ctx->fmtpos++;
        }
    }
    else if (arg->ch >= '0' && arg->ch <= '9') {
        arg->width = arg->ch - '0';
        while (--ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            ctx->fmtpos++;
            if (arg->ch < '0' || arg->ch > '9')
                break;
            /* Since arg->ch is unsigned, the RHS would end up as unsigned,
               mixing signed and unsigned comparison. Since arg->ch is between
               '0' and '9', casting to int is safe. */
            if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return -1;
            }
            arg->width = arg->width*10 + (arg->ch - '0');
        }
    }

    /* Parse precision. Example: "%.3f" => prec=3 */
    if (arg->ch == '.') {
        arg->prec = 0;
        if (--ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            ctx->fmtpos++;
        }
        if (arg->ch == '*') {
            /* Precision is taken from the next argument; negative values
               are clamped to 0. */
            v = unicode_format_getnextarg(ctx);
            if (v == NULL)
                return -1;
            if (!PyLong_Check(v)) {
                PyErr_SetString(PyExc_TypeError,
                                "* wants int");
                return -1;
            }
            arg->prec = _PyLong_AsInt(v);
            if (arg->prec == -1 && PyErr_Occurred())
                return -1;
            if (arg->prec < 0)
                arg->prec = 0;
            if (--ctx->fmtcnt >= 0) {
                arg->ch = FORMAT_READ(ctx);
                ctx->fmtpos++;
            }
        }
        else if (arg->ch >= '0' && arg->ch <= '9') {
            arg->prec = arg->ch - '0';
            while (--ctx->fmtcnt >= 0) {
                arg->ch = FORMAT_READ(ctx);
                ctx->fmtpos++;
                if (arg->ch < '0' || arg->ch > '9')
                    break;
                if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return -1;
                }
                arg->prec = arg->prec*10 + (arg->ch - '0');
            }
        }
    }

    /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
    if (ctx->fmtcnt >= 0) {
        if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
            if (--ctx->fmtcnt >= 0) {
                arg->ch = FORMAT_READ(ctx);
                ctx->fmtpos++;
            }
        }
    }
    /* The format string ended before a conversion character was read. */
    if (ctx->fmtcnt < 0) {
        PyErr_SetString(PyExc_ValueError,
                        "incomplete format");
        return -1;
    }
    return 0;

#undef FORMAT_READ
}
14624
14625 /* Format one argument. Supported conversion specifiers:
14626
14627 - "s", "r", "a": any type
14628 - "i", "d", "u": int or float
14629 - "o", "x", "X": int
14630 - "e", "E", "f", "F", "g", "G": float
14631 - "c": int or str (1 character)
14632
14633 When possible, the output is written directly into the Unicode writer
14634 (ctx->writer). A string is created when padding is required.
14635
14636 Return 0 if the argument has been formatted into *p_str,
14637 1 if the argument has been written into ctx->writer,
14638 -1 on error. */
14639 static int
unicode_format_arg_format(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject ** p_str)14640 unicode_format_arg_format(struct unicode_formatter_t *ctx,
14641 struct unicode_format_arg_t *arg,
14642 PyObject **p_str)
14643 {
14644 PyObject *v;
14645 _PyUnicodeWriter *writer = &ctx->writer;
14646
14647 if (ctx->fmtcnt == 0)
14648 ctx->writer.overallocate = 0;
14649
14650 v = unicode_format_getnextarg(ctx);
14651 if (v == NULL)
14652 return -1;
14653
14654
14655 switch (arg->ch) {
14656 case 's':
14657 case 'r':
14658 case 'a':
14659 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14660 /* Fast path */
14661 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14662 return -1;
14663 return 1;
14664 }
14665
14666 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14667 *p_str = v;
14668 Py_INCREF(*p_str);
14669 }
14670 else {
14671 if (arg->ch == 's')
14672 *p_str = PyObject_Str(v);
14673 else if (arg->ch == 'r')
14674 *p_str = PyObject_Repr(v);
14675 else
14676 *p_str = PyObject_ASCII(v);
14677 }
14678 break;
14679
14680 case 'i':
14681 case 'd':
14682 case 'u':
14683 case 'o':
14684 case 'x':
14685 case 'X':
14686 {
14687 int ret = mainformatlong(v, arg, p_str, writer);
14688 if (ret != 0)
14689 return ret;
14690 arg->sign = 1;
14691 break;
14692 }
14693
14694 case 'e':
14695 case 'E':
14696 case 'f':
14697 case 'F':
14698 case 'g':
14699 case 'G':
14700 if (arg->width == -1 && arg->prec == -1
14701 && !(arg->flags & (F_SIGN | F_BLANK)))
14702 {
14703 /* Fast path */
14704 if (formatfloat(v, arg, NULL, writer) == -1)
14705 return -1;
14706 return 1;
14707 }
14708
14709 arg->sign = 1;
14710 if (formatfloat(v, arg, p_str, NULL) == -1)
14711 return -1;
14712 break;
14713
14714 case 'c':
14715 {
14716 Py_UCS4 ch = formatchar(v);
14717 if (ch == (Py_UCS4) -1)
14718 return -1;
14719 if (arg->width == -1 && arg->prec == -1) {
14720 /* Fast path */
14721 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14722 return -1;
14723 return 1;
14724 }
14725 *p_str = PyUnicode_FromOrdinal(ch);
14726 break;
14727 }
14728
14729 default:
14730 PyErr_Format(PyExc_ValueError,
14731 "unsupported format character '%c' (0x%x) "
14732 "at index %zd",
14733 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14734 (int)arg->ch,
14735 ctx->fmtpos - 1);
14736 return -1;
14737 }
14738 if (*p_str == NULL)
14739 return -1;
14740 assert (PyUnicode_Check(*p_str));
14741 return 0;
14742 }
14743
/* Write the formatted string str into ctx->writer, applying the width,
   precision (for "s", "r" and "a"), sign, fill and alternate-form prefix
   described by arg.

   arg->width and arg->sign are used as scratch state and are modified.

   Return 0 on success, raise an exception and return -1 on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero fill is only honoured when arg->sign is set. */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path: no padding, truncation or sign insertion needed */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* The sign already in str is written separately: skip it. */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer: work out the largest character to be written */
    maxchar = writer->maxchar;
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    /* One extra character is needed when the sign does not fit in width. */
    buflen = arg->width;
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The prefix is accounted for separately from the digits. */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right if needed; right padding always uses spaces */
    if (arg->width > len) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
14898
14899 /* Helper of PyUnicode_Format(): format one arg.
14900 Return 0 on success, raise an exception and return -1 on error. */
14901 static int
unicode_format_arg(struct unicode_formatter_t * ctx)14902 unicode_format_arg(struct unicode_formatter_t *ctx)
14903 {
14904 struct unicode_format_arg_t arg;
14905 PyObject *str;
14906 int ret;
14907
14908 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14909 if (arg.ch == '%') {
14910 ctx->fmtpos++;
14911 ctx->fmtcnt--;
14912 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14913 return -1;
14914 return 0;
14915 }
14916 arg.flags = 0;
14917 arg.width = -1;
14918 arg.prec = -1;
14919 arg.sign = 0;
14920 str = NULL;
14921
14922 ret = unicode_format_arg_parse(ctx, &arg);
14923 if (ret == -1)
14924 return -1;
14925
14926 ret = unicode_format_arg_format(ctx, &arg, &str);
14927 if (ret == -1)
14928 return -1;
14929
14930 if (ret != 1) {
14931 ret = unicode_format_arg_output(ctx, &arg, str);
14932 Py_DECREF(str);
14933 if (ret == -1)
14934 return -1;
14935 }
14936
14937 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
14938 PyErr_SetString(PyExc_TypeError,
14939 "not all arguments converted during string formatting");
14940 return -1;
14941 }
14942 return 0;
14943 }
14944
14945 PyObject *
PyUnicode_Format(PyObject * format,PyObject * args)14946 PyUnicode_Format(PyObject *format, PyObject *args)
14947 {
14948 struct unicode_formatter_t ctx;
14949
14950 if (format == NULL || args == NULL) {
14951 PyErr_BadInternalCall();
14952 return NULL;
14953 }
14954
14955 if (ensure_unicode(format) < 0)
14956 return NULL;
14957
14958 ctx.fmtstr = format;
14959 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14960 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14961 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14962 ctx.fmtpos = 0;
14963
14964 _PyUnicodeWriter_Init(&ctx.writer);
14965 ctx.writer.min_length = ctx.fmtcnt + 100;
14966 ctx.writer.overallocate = 1;
14967
14968 if (PyTuple_Check(args)) {
14969 ctx.arglen = PyTuple_Size(args);
14970 ctx.argidx = 0;
14971 }
14972 else {
14973 ctx.arglen = -1;
14974 ctx.argidx = -2;
14975 }
14976 ctx.args_owned = 0;
14977 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
14978 ctx.dict = args;
14979 else
14980 ctx.dict = NULL;
14981 ctx.args = args;
14982
14983 while (--ctx.fmtcnt >= 0) {
14984 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14985 Py_ssize_t nonfmtpos;
14986
14987 nonfmtpos = ctx.fmtpos++;
14988 while (ctx.fmtcnt >= 0 &&
14989 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14990 ctx.fmtpos++;
14991 ctx.fmtcnt--;
14992 }
14993 if (ctx.fmtcnt < 0) {
14994 ctx.fmtpos--;
14995 ctx.writer.overallocate = 0;
14996 }
14997
14998 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14999 nonfmtpos, ctx.fmtpos) < 0)
15000 goto onError;
15001 }
15002 else {
15003 ctx.fmtpos++;
15004 if (unicode_format_arg(&ctx) == -1)
15005 goto onError;
15006 }
15007 }
15008
15009 if (ctx.argidx < ctx.arglen && !ctx.dict) {
15010 PyErr_SetString(PyExc_TypeError,
15011 "not all arguments converted during string formatting");
15012 goto onError;
15013 }
15014
15015 if (ctx.args_owned) {
15016 Py_DECREF(ctx.args);
15017 }
15018 return _PyUnicodeWriter_Finish(&ctx.writer);
15019
15020 onError:
15021 _PyUnicodeWriter_Dealloc(&ctx.writer);
15022 if (ctx.args_owned) {
15023 Py_DECREF(ctx.args);
15024 }
15025 return NULL;
15026 }
15027
15028 static PyObject *
15029 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15030
15031 static PyObject *
unicode_new(PyTypeObject * type,PyObject * args,PyObject * kwds)15032 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15033 {
15034 PyObject *x = NULL;
15035 static char *kwlist[] = {"object", "encoding", "errors", 0};
15036 char *encoding = NULL;
15037 char *errors = NULL;
15038
15039 if (type != &PyUnicode_Type)
15040 return unicode_subtype_new(type, args, kwds);
15041 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
15042 kwlist, &x, &encoding, &errors))
15043 return NULL;
15044 if (x == NULL)
15045 _Py_RETURN_UNICODE_EMPTY();
15046 if (encoding == NULL && errors == NULL)
15047 return PyObject_Str(x);
15048 else
15049 return PyUnicode_FromEncodedObject(x, encoding, errors);
15050 }
15051
15052 static PyObject *
unicode_subtype_new(PyTypeObject * type,PyObject * args,PyObject * kwds)15053 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15054 {
15055 PyObject *unicode, *self;
15056 Py_ssize_t length, char_size;
15057 int share_wstr, share_utf8;
15058 unsigned int kind;
15059 void *data;
15060
15061 assert(PyType_IsSubtype(type, &PyUnicode_Type));
15062
15063 unicode = unicode_new(&PyUnicode_Type, args, kwds);
15064 if (unicode == NULL)
15065 return NULL;
15066 assert(_PyUnicode_CHECK(unicode));
15067 if (PyUnicode_READY(unicode) == -1) {
15068 Py_DECREF(unicode);
15069 return NULL;
15070 }
15071
15072 self = type->tp_alloc(type, 0);
15073 if (self == NULL) {
15074 Py_DECREF(unicode);
15075 return NULL;
15076 }
15077 kind = PyUnicode_KIND(unicode);
15078 length = PyUnicode_GET_LENGTH(unicode);
15079
15080 _PyUnicode_LENGTH(self) = length;
15081 #ifdef Py_DEBUG
15082 _PyUnicode_HASH(self) = -1;
15083 #else
15084 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15085 #endif
15086 _PyUnicode_STATE(self).interned = 0;
15087 _PyUnicode_STATE(self).kind = kind;
15088 _PyUnicode_STATE(self).compact = 0;
15089 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
15090 _PyUnicode_STATE(self).ready = 1;
15091 _PyUnicode_WSTR(self) = NULL;
15092 _PyUnicode_UTF8_LENGTH(self) = 0;
15093 _PyUnicode_UTF8(self) = NULL;
15094 _PyUnicode_WSTR_LENGTH(self) = 0;
15095 _PyUnicode_DATA_ANY(self) = NULL;
15096
15097 share_utf8 = 0;
15098 share_wstr = 0;
15099 if (kind == PyUnicode_1BYTE_KIND) {
15100 char_size = 1;
15101 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15102 share_utf8 = 1;
15103 }
15104 else if (kind == PyUnicode_2BYTE_KIND) {
15105 char_size = 2;
15106 if (sizeof(wchar_t) == 2)
15107 share_wstr = 1;
15108 }
15109 else {
15110 assert(kind == PyUnicode_4BYTE_KIND);
15111 char_size = 4;
15112 if (sizeof(wchar_t) == 4)
15113 share_wstr = 1;
15114 }
15115
15116 /* Ensure we won't overflow the length. */
15117 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15118 PyErr_NoMemory();
15119 goto onError;
15120 }
15121 data = PyObject_MALLOC((length + 1) * char_size);
15122 if (data == NULL) {
15123 PyErr_NoMemory();
15124 goto onError;
15125 }
15126
15127 _PyUnicode_DATA_ANY(self) = data;
15128 if (share_utf8) {
15129 _PyUnicode_UTF8_LENGTH(self) = length;
15130 _PyUnicode_UTF8(self) = data;
15131 }
15132 if (share_wstr) {
15133 _PyUnicode_WSTR_LENGTH(self) = length;
15134 _PyUnicode_WSTR(self) = (wchar_t *)data;
15135 }
15136
15137 memcpy(data, PyUnicode_DATA(unicode),
15138 kind * (length + 1));
15139 assert(_PyUnicode_CheckConsistency(self, 1));
15140 #ifdef Py_DEBUG
15141 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15142 #endif
15143 Py_DECREF(unicode);
15144 return self;
15145
15146 onError:
15147 Py_DECREF(unicode);
15148 Py_DECREF(self);
15149 return NULL;
15150 }
15151
15152 PyDoc_STRVAR(unicode_doc,
15153 "str(object='') -> str\n\
15154 str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
15155 \n\
15156 Create a new string object from the given object. If encoding or\n\
15157 errors is specified, then the object must expose a data buffer\n\
15158 that will be decoded using the given encoding and error handler.\n\
15159 Otherwise, returns the result of object.__str__() (if defined)\n\
15160 or repr(object).\n\
15161 encoding defaults to sys.getdefaultencoding().\n\
15162 errors defaults to 'strict'.");
15163
/* Forward declaration: unicode_iter() is defined further down, after the
   iterator type, but is needed for the tp_iter slot below. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of the built-in `str` type.  The initializer is positional,
   so every slot is listed in order; a 0 leaves the slot unset here.
   PyType_Ready() is called on this object from _PyUnicode_Init(). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                        /* tp_name */
    sizeof(PyUnicodeObject),      /* tp_basicsize */
    0,                            /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,  /* tp_dealloc */
    0,                            /* tp_vectorcall_offset */
    0,                            /* tp_getattr */
    0,                            /* tp_setattr */
    0,                            /* tp_as_async */
    unicode_repr,                 /* tp_repr */
    &unicode_as_number,           /* tp_as_number */
    &unicode_as_sequence,         /* tp_as_sequence */
    &unicode_as_mapping,          /* tp_as_mapping */
    (hashfunc) unicode_hash,      /* tp_hash */
    0,                            /* tp_call */
    (reprfunc) unicode_str,       /* tp_str */
    PyObject_GenericGetAttr,      /* tp_getattro */
    0,                            /* tp_setattro */
    0,                            /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,                  /* tp_doc */
    0,                            /* tp_traverse */
    0,                            /* tp_clear */
    PyUnicode_RichCompare,        /* tp_richcompare */
    0,                            /* tp_weaklistoffset */
    unicode_iter,                 /* tp_iter */
    0,                            /* tp_iternext */
    unicode_methods,              /* tp_methods */
    0,                            /* tp_members */
    0,                            /* tp_getset */
    &PyBaseObject_Type,           /* tp_base */
    0,                            /* tp_dict */
    0,                            /* tp_descr_get */
    0,                            /* tp_descr_set */
    0,                            /* tp_dictoffset */
    0,                            /* tp_init */
    0,                            /* tp_alloc */
    unicode_new,                  /* tp_new */
    PyObject_Del,                 /* tp_free */
};
15209
15210 /* Initialize the Unicode implementation */
15211
15212 PyStatus
_PyUnicode_Init(void)15213 _PyUnicode_Init(void)
15214 {
15215 /* XXX - move this array to unicodectype.c ? */
15216 Py_UCS2 linebreak[] = {
15217 0x000A, /* LINE FEED */
15218 0x000D, /* CARRIAGE RETURN */
15219 0x001C, /* FILE SEPARATOR */
15220 0x001D, /* GROUP SEPARATOR */
15221 0x001E, /* RECORD SEPARATOR */
15222 0x0085, /* NEXT LINE */
15223 0x2028, /* LINE SEPARATOR */
15224 0x2029, /* PARAGRAPH SEPARATOR */
15225 };
15226
15227 /* Init the implementation */
15228 _Py_INCREF_UNICODE_EMPTY();
15229 if (!unicode_empty) {
15230 return _PyStatus_ERR("Can't create empty string");
15231 }
15232 Py_DECREF(unicode_empty);
15233
15234 if (PyType_Ready(&PyUnicode_Type) < 0) {
15235 return _PyStatus_ERR("Can't initialize unicode type");
15236 }
15237
15238 /* initialize the linebreak bloom filter */
15239 bloom_linebreak = make_bloom_mask(
15240 PyUnicode_2BYTE_KIND, linebreak,
15241 Py_ARRAY_LENGTH(linebreak));
15242
15243 if (PyType_Ready(&EncodingMapType) < 0) {
15244 return _PyStatus_ERR("Can't initialize encoding map type");
15245 }
15246 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15247 return _PyStatus_ERR("Can't initialize field name iterator type");
15248 }
15249 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15250 return _PyStatus_ERR("Can't initialize formatter iter type");
15251 }
15252 return _PyStatus_OK();
15253 }
15254
15255 /* Finalize the Unicode implementation */
15256
/* This implementation performs no work and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int result = 0;
    return result;
}
15262
15263
/* Intern the string referenced by *p, in place.

   If an equal string is already stored in the `interned` dict, *p is
   replaced by that stored string: a new reference to it is taken and the
   reference previously held in *p is released.  Otherwise *p itself is
   stored in the dict and marked SSTATE_INTERNED_MORTAL.

   Nothing is done for instances of str subclasses or for strings that are
   already interned.  Failures to create or update the dict are swallowed:
   the exception is cleared and *p is left as it was.  In non-debug builds
   a NULL or non-str *p is silently ignored; debug builds assert instead. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    PyObject *s = *p;
    PyObject *t;
#ifdef Py_DEBUG
    assert(s != NULL);
    assert(_PyUnicode_CHECK(s));
#else
    if (s == NULL || !PyUnicode_Check(s))
        return;
#endif
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    /* The dict of interned strings is created on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* Maps s to itself unless an equal key is already present; t is the
       value stored in the dict after the call. */
    t = PyDict_SetDefault(interned, s, s);
    if (t == NULL) {
        PyErr_Clear();
        return;
    }
    if (t != s) {
        /* An equal string was interned earlier: hand that one out. */
        Py_INCREF(t);
        Py_SETREF(*p, t);
        return;
    }
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
}
15304
15305 void
PyUnicode_InternImmortal(PyObject ** p)15306 PyUnicode_InternImmortal(PyObject **p)
15307 {
15308 PyUnicode_InternInPlace(p);
15309 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15310 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15311 Py_INCREF(*p);
15312 }
15313 }
15314
15315 PyObject *
PyUnicode_InternFromString(const char * cp)15316 PyUnicode_InternFromString(const char *cp)
15317 {
15318 PyObject *s = PyUnicode_FromString(cp);
15319 if (s == NULL)
15320 return NULL;
15321 PyUnicode_InternInPlace(&s);
15322 return s;
15323 }
15324
15325
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Undo string interning so that memory checkers report less noise.

   Every key of the `interned` dict gets its reference count raised again
   (by two for mortal strings, by one for immortal ones) and is marked
   SSTATE_NOT_INTERNED; the dict is then cleared and released.  Does
   nothing if there is no interned dict or its keys cannot be listed.
   Only compiled when WITH_VALGRIND or __INSURE__ is defined. */
static void
unicode_release_interned(void)
{
    PyObject *keys;
    PyObject *s;
    Py_ssize_t i, n;
    /* Character totals; only reported when INTERNED_STATS is defined. */
    Py_ssize_t immortal_size = 0, mortal_size = 0;

    if (interned == NULL || !PyDict_Check(interned))
        return;
    keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    n = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            n);
#endif
    for (i = 0; i < n; i++) {
        s = PyList_GET_ITEM(keys, i);
        /* A failure to ready an interned string is treated as impossible. */
        if (PyUnicode_READY(s) == -1) {
            Py_UNREACHABLE();
        }
        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_NOT_INTERNED:
            /* XXX Shouldn't happen */
            break;
        case SSTATE_INTERNED_IMMORTAL:
            Py_REFCNT(s) += 1;
            immortal_size += PyUnicode_GET_LENGTH(s);
            break;
        case SSTATE_INTERNED_MORTAL:
            Py_REFCNT(s) += 2;
            mortal_size += PyUnicode_GET_LENGTH(s);
            break;
        default:
            Py_FatalError("Inconsistent interned string state.");
        }
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
15385
15386
15387 /********************* Unicode Iterator **************************/
15388
/* Instance layout of str_iterator objects. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;   /* Index of the next character to return */
    PyObject *it_seq;      /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15394
/* tp_dealloc of str_iterator: stop GC tracking, release the reference to
   the iterated string (if any is left) and free the iterator object. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    /* Untrack first, before the object's contents are torn down. */
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
15402
/* tp_traverse of str_iterator: report the iterated string to the visitor.
   Py_VISIT uses the `visit` and `arg` parameters by name. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
15409
15410 static PyObject *
unicodeiter_next(unicodeiterobject * it)15411 unicodeiter_next(unicodeiterobject *it)
15412 {
15413 PyObject *seq, *item;
15414
15415 assert(it != NULL);
15416 seq = it->it_seq;
15417 if (seq == NULL)
15418 return NULL;
15419 assert(_PyUnicode_CHECK(seq));
15420
15421 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15422 int kind = PyUnicode_KIND(seq);
15423 void *data = PyUnicode_DATA(seq);
15424 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15425 item = PyUnicode_FromOrdinal(chr);
15426 if (item != NULL)
15427 ++it->it_index;
15428 return item;
15429 }
15430
15431 it->it_seq = NULL;
15432 Py_DECREF(seq);
15433 return NULL;
15434 }
15435
15436 static PyObject *
unicodeiter_len(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15437 unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15438 {
15439 Py_ssize_t len = 0;
15440 if (it->it_seq)
15441 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15442 return PyLong_FromSsize_t(len);
15443 }
15444
15445 PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15446
15447 static PyObject *
unicodeiter_reduce(unicodeiterobject * it,PyObject * Py_UNUSED (ignored))15448 unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15449 {
15450 _Py_IDENTIFIER(iter);
15451 if (it->it_seq != NULL) {
15452 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
15453 it->it_seq, it->it_index);
15454 } else {
15455 PyObject *u = (PyObject *)_PyUnicode_New(0);
15456 if (u == NULL)
15457 return NULL;
15458 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
15459 }
15460 }
15461
15462 PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15463
15464 static PyObject *
unicodeiter_setstate(unicodeiterobject * it,PyObject * state)15465 unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15466 {
15467 Py_ssize_t index = PyLong_AsSsize_t(state);
15468 if (index == -1 && PyErr_Occurred())
15469 return NULL;
15470 if (it->it_seq != NULL) {
15471 if (index < 0)
15472 index = 0;
15473 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15474 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15475 it->it_index = index;
15476 }
15477 Py_RETURN_NONE;
15478 }
15479
15480 PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15481
/* Method table of str_iterator: the length hint plus the
   __reduce__/__setstate__ pair whose docstrings describe pickling and
   unpickling. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,              NULL}           /* sentinel */
};
15491
/* Type object of str_iterator, the iterator created by unicode_iter().
   The type has the Py_TPFLAGS_HAVE_GC flag and a traverse function for
   the string it references.  Slots after the last one listed are not
   given in this initializer. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_vectorcall_offset */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_as_async */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
15524
15525 static PyObject *
unicode_iter(PyObject * seq)15526 unicode_iter(PyObject *seq)
15527 {
15528 unicodeiterobject *it;
15529
15530 if (!PyUnicode_Check(seq)) {
15531 PyErr_BadInternalCall();
15532 return NULL;
15533 }
15534 if (PyUnicode_READY(seq) == -1)
15535 return NULL;
15536 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15537 if (it == NULL)
15538 return NULL;
15539 it->it_index = 0;
15540 Py_INCREF(seq);
15541 it->it_seq = seq;
15542 _PyObject_GC_TRACK(it);
15543 return (PyObject *)it;
15544 }
15545
15546
15547 size_t
Py_UNICODE_strlen(const Py_UNICODE * u)15548 Py_UNICODE_strlen(const Py_UNICODE *u)
15549 {
15550 return wcslen(u);
15551 }
15552
15553 Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE * s1,const Py_UNICODE * s2)15554 Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15555 {
15556 Py_UNICODE *u = s1;
15557 while ((*u++ = *s2++));
15558 return s1;
15559 }
15560
15561 Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE * s1,const Py_UNICODE * s2,size_t n)15562 Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15563 {
15564 Py_UNICODE *u = s1;
15565 while ((*u++ = *s2++))
15566 if (n-- == 0)
15567 break;
15568 return s1;
15569 }
15570
15571 Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE * s1,const Py_UNICODE * s2)15572 Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15573 {
15574 Py_UNICODE *u1 = s1;
15575 u1 += wcslen(u1);
15576 while ((*u1++ = *s2++));
15577 return s1;
15578 }
15579
15580 int
Py_UNICODE_strcmp(const Py_UNICODE * s1,const Py_UNICODE * s2)15581 Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15582 {
15583 while (*s1 && *s2 && *s1 == *s2)
15584 s1++, s2++;
15585 if (*s1 && *s2)
15586 return (*s1 < *s2) ? -1 : +1;
15587 if (*s1)
15588 return 1;
15589 if (*s2)
15590 return -1;
15591 return 0;
15592 }
15593
15594 int
Py_UNICODE_strncmp(const Py_UNICODE * s1,const Py_UNICODE * s2,size_t n)15595 Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15596 {
15597 Py_UNICODE u1, u2;
15598 for (; n != 0; n--) {
15599 u1 = *s1;
15600 u2 = *s2;
15601 if (u1 != u2)
15602 return (u1 < u2) ? -1 : +1;
15603 if (u1 == '\0')
15604 return 0;
15605 s1++;
15606 s2++;
15607 }
15608 return 0;
15609 }
15610
15611 Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE * s,Py_UNICODE c)15612 Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15613 {
15614 const Py_UNICODE *p;
15615 for (p = s; *p; p++)
15616 if (*p == c)
15617 return (Py_UNICODE*)p;
15618 return NULL;
15619 }
15620
15621 Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE * s,Py_UNICODE c)15622 Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15623 {
15624 const Py_UNICODE *p;
15625 p = s + wcslen(s);
15626 while (p != s) {
15627 p--;
15628 if (*p == c)
15629 return (Py_UNICODE*)p;
15630 }
15631 return NULL;
15632 }
15633
15634 Py_UNICODE*
PyUnicode_AsUnicodeCopy(PyObject * unicode)15635 PyUnicode_AsUnicodeCopy(PyObject *unicode)
15636 {
15637 Py_UNICODE *u, *copy;
15638 Py_ssize_t len, size;
15639
15640 if (!PyUnicode_Check(unicode)) {
15641 PyErr_BadArgument();
15642 return NULL;
15643 }
15644 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
15645 if (u == NULL)
15646 return NULL;
15647 /* Ensure we won't overflow the size. */
15648 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
15649 PyErr_NoMemory();
15650 return NULL;
15651 }
15652 size = len + 1; /* copy the null character */
15653 size *= sizeof(Py_UNICODE);
15654 copy = PyMem_Malloc(size);
15655 if (copy == NULL) {
15656 PyErr_NoMemory();
15657 return NULL;
15658 }
15659 memcpy(copy, u, size);
15660 return copy;
15661 }
15662
15663
15664 static int
encode_wstr_utf8(wchar_t * wstr,char ** str,const char * name)15665 encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
15666 {
15667 int res;
15668 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15669 if (res == -2) {
15670 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15671 return -1;
15672 }
15673 if (res < 0) {
15674 PyErr_NoMemory();
15675 return -1;
15676 }
15677 return 0;
15678 }
15679
15680
15681 static int
config_get_codec_name(wchar_t ** config_encoding)15682 config_get_codec_name(wchar_t **config_encoding)
15683 {
15684 char *encoding;
15685 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15686 return -1;
15687 }
15688
15689 PyObject *name_obj = NULL;
15690 PyObject *codec = _PyCodec_Lookup(encoding);
15691 PyMem_RawFree(encoding);
15692
15693 if (!codec)
15694 goto error;
15695
15696 name_obj = PyObject_GetAttrString(codec, "name");
15697 Py_CLEAR(codec);
15698 if (!name_obj) {
15699 goto error;
15700 }
15701
15702 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15703 Py_DECREF(name_obj);
15704 if (wname == NULL) {
15705 goto error;
15706 }
15707
15708 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15709 if (raw_wname == NULL) {
15710 PyMem_Free(wname);
15711 PyErr_NoMemory();
15712 goto error;
15713 }
15714
15715 PyMem_RawFree(*config_encoding);
15716 *config_encoding = raw_wname;
15717
15718 PyMem_Free(wname);
15719 return 0;
15720
15721 error:
15722 Py_XDECREF(codec);
15723 Py_XDECREF(name_obj);
15724 return -1;
15725 }
15726
15727
15728 static PyStatus
init_stdio_encoding(PyThreadState * tstate)15729 init_stdio_encoding(PyThreadState *tstate)
15730 {
15731 /* Update the stdio encoding to the normalized Python codec name. */
15732 PyConfig *config = &tstate->interp->config;
15733 if (config_get_codec_name(&config->stdio_encoding) < 0) {
15734 return _PyStatus_ERR("failed to get the Python codec name "
15735 "of the stdio encoding");
15736 }
15737 return _PyStatus_OK();
15738 }
15739
15740
15741 static int
init_fs_codec(PyInterpreterState * interp)15742 init_fs_codec(PyInterpreterState *interp)
15743 {
15744 PyConfig *config = &interp->config;
15745
15746 _Py_error_handler error_handler;
15747 error_handler = get_error_handler_wide(config->filesystem_errors);
15748 if (error_handler == _Py_ERROR_UNKNOWN) {
15749 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15750 return -1;
15751 }
15752
15753 char *encoding, *errors;
15754 if (encode_wstr_utf8(config->filesystem_encoding,
15755 &encoding,
15756 "filesystem_encoding") < 0) {
15757 return -1;
15758 }
15759
15760 if (encode_wstr_utf8(config->filesystem_errors,
15761 &errors,
15762 "filesystem_errors") < 0) {
15763 PyMem_RawFree(encoding);
15764 return -1;
15765 }
15766
15767 PyMem_RawFree(interp->fs_codec.encoding);
15768 interp->fs_codec.encoding = encoding;
15769 PyMem_RawFree(interp->fs_codec.errors);
15770 interp->fs_codec.errors = errors;
15771 interp->fs_codec.error_handler = error_handler;
15772
15773 /* At this point, PyUnicode_EncodeFSDefault() and
15774 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15775 the C implementation of the filesystem encoding. */
15776
15777 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15778 global configuration variables. */
15779 if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
15780 interp->fs_codec.errors) < 0) {
15781 PyErr_NoMemory();
15782 return -1;
15783 }
15784 return 0;
15785 }
15786
15787
15788 static PyStatus
init_fs_encoding(PyThreadState * tstate)15789 init_fs_encoding(PyThreadState *tstate)
15790 {
15791 PyInterpreterState *interp = tstate->interp;
15792
15793 /* Update the filesystem encoding to the normalized Python codec name.
15794 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15795 (Python codec name). */
15796 PyConfig *config = &interp->config;
15797 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
15798 _Py_DumpPathConfig(tstate);
15799 return _PyStatus_ERR("failed to get the Python codec "
15800 "of the filesystem encoding");
15801 }
15802
15803 if (init_fs_codec(interp) < 0) {
15804 return _PyStatus_ERR("cannot initialize filesystem codec");
15805 }
15806 return _PyStatus_OK();
15807 }
15808
15809
15810 PyStatus
_PyUnicode_InitEncodings(PyThreadState * tstate)15811 _PyUnicode_InitEncodings(PyThreadState *tstate)
15812 {
15813 PyStatus status = init_fs_encoding(tstate);
15814 if (_PyStatus_EXCEPTION(status)) {
15815 return status;
15816 }
15817
15818 return init_stdio_encoding(tstate);
15819 }
15820
15821
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   "mbcs" with the "replace" error handler and reinstall the filesystem
   codec.  Returns the result of init_fs_codec(), or -1 with MemoryError
   set if the new configuration strings cannot be allocated (the
   configuration is left untouched in that case). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();

    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");

    if (new_encoding != NULL && new_errors != NULL) {
        PyConfig *config = &interp->config;

        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = new_encoding;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = new_errors;

        return init_fs_codec(interp);
    }

    /* At least one duplication failed; release whichever succeeded. */
    PyMem_RawFree(new_encoding);
    PyMem_RawFree(new_errors);
    PyErr_NoMemory();
    return -1;
}
#endif
15847
15848
/* Finalize the Unicode implementation: release the objects cached by this
   file and free the filesystem codec strings of the current
   interpreter. */
void
_PyUnicode_Fini(void)
{
#if defined(WITH_VALGRIND) || defined(__INSURE__)
    /* Insure++ is a memory analysis tool that aids in discovering
     * memory leaks and other memory problems.  On Python exit, the
     * interned string dictionaries are flagged as being in use at exit
     * (which it is).  Under normal circumstances, this is fine because
     * the memory will be automatically reclaimed by the system.  Under
     * memory debugging, it's a huge source of useless noise, so we
     * trade off slower shutdown for less distraction in the memory
     * reports.  -baw
     */
    unicode_release_interned();
#endif /* WITH_VALGRIND || __INSURE__ */

    /* Drop the shared empty string. */
    Py_CLEAR(unicode_empty);

    /* Drop the 256 entries of the unicode_latin1 cache. */
    for (Py_ssize_t i = 0; i < 256; i++) {
        Py_CLEAR(unicode_latin1[i]);
    }
    _PyUnicode_ClearStaticStrings();
    (void)PyUnicode_ClearFreeList();

    /* Free the filesystem codec strings and reset the pointers to NULL. */
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
    PyMem_RawFree(interp->fs_codec.encoding);
    interp->fs_codec.encoding = NULL;
    PyMem_RawFree(interp->fs_codec.errors);
    interp->fs_codec.errors = NULL;
}
15879
15880
15881 /* A _string module, to export formatter_parser and formatter_field_name_split
15882 to the string.Formatter class implemented in Python. */
15883
/* Functions exported by the _string module; each takes exactly one
   argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
15891
/* Module definition of the _string module.  Field names follow the
   PyModuleDef structure of the Python C API. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,              /* m_base */
    "_string",                          /* m_name */
    PyDoc_STR("string helper module"),  /* m_doc */
    0,                                  /* m_size */
    _string_methods,                    /* m_methods */
    NULL,                               /* m_slots */
    NULL,                               /* m_traverse */
    NULL,                               /* m_clear */
    NULL                                /* m_free */
};
15903
15904 PyMODINIT_FUNC
PyInit__string(void)15905 PyInit__string(void)
15906 {
15907 return PyModule_Create(&_string_module);
15908 }
15909
15910
15911 #ifdef __cplusplus
15912 }
15913 #endif
15914