1 /*
2
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6 Major speed upgrades to the method implementations at the Reykjavik
7 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9 Copyright (c) Corporation for National Research Initiatives.
10
11 --------------------------------------------------------------------
12 The original string type implementation is:
13
14 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
16
17 By obtaining, using, and/or copying this software and/or its
18 associated documentation, you agree that you have read, understood,
19 and will comply with the following terms and conditions:
20
21 Permission to use, copy, modify, and distribute this software and its
22 associated documentation for any purpose and without fee is hereby
23 granted, provided that the above copyright notice appears in all
24 copies, and that both that copyright notice and this permission notice
25 appear in supporting documentation, and that the name of Secret Labs
26 AB or the author not be used in advertising or publicity pertaining to
27 distribution of the software without specific, written prior
28 permission.
29
30 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37 --------------------------------------------------------------------
38
39 */
40
41 #define PY_SSIZE_T_CLEAN
42 #include "Python.h"
43 #include "internal/pystate.h"
44 #include "ucnhash.h"
45 #include "bytes_methods.h"
46 #include "stringlib/eq.h"
47
48 #ifdef MS_WINDOWS
49 #include <windows.h>
50 #endif
51
52 /*[clinic input]
53 class str "PyObject *" "&PyUnicode_Type"
54 [clinic start generated code]*/
55 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
56
57 /*[python input]
58 class Py_UCS4_converter(CConverter):
59 type = 'Py_UCS4'
60 converter = 'convert_uc'
61
62 def converter_init(self):
63 if self.default is not unspecified:
64 self.c_default = ascii(self.default)
65 if len(self.c_default) > 4 or self.c_default[0] != "'":
66 self.c_default = hex(ord(self.default))
67
68 [python start generated code]*/
69 /*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
70
71 /* --- Globals ------------------------------------------------------------
72
73 NOTE: In the interpreter's initialization phase, some globals are currently
74 initialized dynamically as needed. In the process Unicode objects may
75 be created before the Unicode type is ready.
76
77 */
78
79
80 #ifdef __cplusplus
81 extern "C" {
82 #endif
83
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Sanity check used by the accessor macros of this file.  Debug builds
   (Py_DEBUG) run _PyUnicode_CheckConsistency() without the content check;
   other builds only perform the PyUnicode_Check() type test. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
92
/* Field accessors.  The macros whose name starts with an underscore are raw
   struct accesses (usable as lvalues); PyUnicode_UTF8, PyUnicode_UTF8_LENGTH,
   _PyUnicode_KIND and _PyUnicode_GET_LENGTH additionally assert
   _PyUnicode_CHECK(op). */

/* Raw utf8 field; only meaningful for objects laid out as
   PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 buffer of a ready string: for compact ASCII strings this is the
   memory located directly after the PyASCIIObject header, otherwise the
   utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Length of the UTF-8 form of a ready string: the string length itself for
   compact ASCII strings, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw wstr pointer. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
/* Raw wstr_length field (stored in PyCompactUnicodeObject, not in
   PyASCIIObject). */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* Raw length field. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
/* Raw state bit-field structure. */
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
/* Raw cached hash field. */
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* state.kind, after asserting _PyUnicode_CHECK(op); unlike
   PyUnicode_UTF8 above it does not assert that the string is ready. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* length, after asserting _PyUnicode_CHECK(op). */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
127
/* File-local replacement for PyUnicode_READY: asserts _PyUnicode_CHECK(op),
   evaluates to 0 when the string is already ready and otherwise to the
   result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                        \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
134
/* True if the utf8 pointer of op is the very buffer that holds its
   character data.  Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op is the very buffer that holds its
   character data.  The macro only refers to its own parameter, so it works
   whatever the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
142
/* True if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, the utf8
   pointer is set, and it differs from the character data pointer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is set and either the
   string is not ready yet or wstr differs from the character data
   pointer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
156
/* Generic helper macro copying a run of characters while converting every
   element from from_type to to_type.  from_type and to_type have to be
   valid type names; begin and end delimit the source run and should be of
   type "from_type *"; to should be of type "to_type *" and points to the
   buffer receiving the converted characters.  The largest multiple of four
   characters is copied four per iteration, the remaining tail one character
   at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                            \
        to_type *_cb_dst = (to_type *)(to);                         \
        const from_type *_cb_src = (from_type *)(begin);            \
        const from_type *_cb_stop = (from_type *)(end);             \
        Py_ssize_t _cb_count = (_cb_stop) - (_cb_src);              \
        const from_type *_cb_bulk_stop =                            \
            _cb_src + _Py_SIZE_ROUND_DOWN(_cb_count, 4);            \
        for (; _cb_src < (_cb_bulk_stop);                           \
               _cb_src += 4, _cb_dst += 4) {                        \
            _cb_dst[0] = (to_type) _cb_src[0];                      \
            _cb_dst[1] = (to_type) _cb_src[1];                      \
            _cb_dst[2] = (to_type) _cb_src[2];                      \
            _cb_dst[3] = (to_type) _cb_src[3];                      \
        }                                                           \
        for (; _cb_src < (_cb_stop); ++_cb_src, ++_cb_dst)          \
            *_cb_dst = (to_type) *_cb_src;                          \
    } while (0)
180
/* Platform-dependent overallocation factor.
   NOTE(review): the values read as divisors (1/2 = 50%, 1/4 = 25%) given
   the percentages quoted below -- confirm at the use sites, which are not
   part of this section. */
#ifdef MS_WINDOWS
   /* On Windows, overallocate by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocate by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
188
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned = NULL;

/* The empty Unicode object is shared to improve performance.  It starts out
   as NULL and is created on demand by _Py_INCREF_UNICODE_EMPTY(). */
static PyObject *unicode_empty = NULL;
201
/* Take a new reference to the shared empty string, creating it first if it
   does not exist yet.  On creation the object is INCREF'ed once more on top
   of the reference returned by PyUnicode_New(), so the unicode_empty global
   keeps its own reference.  If the creation fails, unicode_empty stays NULL
   and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)

/* Return a new reference to the shared empty string from the *enclosing*
   function (the macro contains a return statement).  The returned value is
   NULL when the empty string could not be created. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
220
/* Store the character `value` into `length` consecutive characters of the
   buffer `data`, beginning at character index `start`.  `kind` selects the
   character width and must be one of the 1/2/4 byte kinds (never
   PyUnicode_WCHAR_KIND).  In builds with assertions enabled, `start` must
   be non-negative and `value` must fit the selected width.

   Every argument is parenthesized where it is used, and the temporaries
   carry a `fill_` prefix and `_` suffix so that argument expressions
   mentioning common names such as `ch`, `to` or `end` cannot be captured by
   the macro's own locals.  `value`, `kind` and `start` are evaluated more
   than once when assertions are enabled. */
#define FILL(kind, data, value, start, length) \
    do { \
        assert(0 <= (start)); \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch (kind) { \
        case PyUnicode_1BYTE_KIND: { \
            assert((value) <= 0xff); \
            Py_UCS1 fill_ch_ = (unsigned char)(value); \
            Py_UCS1 *fill_to_ = (Py_UCS1 *)(data) + (start); \
            memset(fill_to_, fill_ch_, (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            assert((value) <= 0xffff); \
            Py_UCS2 fill_ch_ = (Py_UCS2)(value); \
            Py_UCS2 *fill_to_ = (Py_UCS2 *)(data) + (start); \
            const Py_UCS2 *fill_end_ = fill_to_ + (length); \
            for (; fill_to_ < fill_end_; ++fill_to_) *fill_to_ = fill_ch_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            assert((value) <= MAX_UNICODE); \
            Py_UCS4 fill_ch_ = (value); \
            Py_UCS4 *fill_to_ = (Py_UCS4 *)(data) + (start); \
            const Py_UCS4 *fill_end_ = fill_to_ + (length); \
            for (; fill_to_ < fill_end_; ++fill_to_) *fill_to_ = fill_ch_; \
            break; \
        } \
        default: Py_UNREACHABLE(); \
        } \
    } while (0)
252
253
/* Forward declaration: the inline single-character writer is defined later
   in the file but is needed before that point. */
static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);

/* List of static strings (head pointer of a list of _Py_Identifier
   structures; starts out empty). */
static _Py_Identifier *static_strings = NULL;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by code point (0..255); a NULL
   entry means that no shared object has been stored for that character
   yet. */
static PyObject *unicode_latin1[256] = {NULL};
264
/* Fast detection of the most frequent whitespace characters.
   The table is indexed by code point (0..127); an entry of 1 marks a
   whitespace character.  Only the non-zero entries are spelled out, every
   other entry is implicitly zero. */
const unsigned char _Py_ascii_whitespace[] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
    [0x7F] = 0      /* highest index: keeps the table at 128 entries */
};
295
/* forward declarations of helpers that are used before their definition */
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
static PyObject* get_latin1_char(unsigned char ch);
static int unicode_modifiable(PyObject *unicode);


/* Constructors building a string object from a buffer of `size` characters
   of the given fixed width. */
static PyObject *
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);

/* Encoder error handling helpers; see their definitions for the exact
   contract of the in/out parameters. */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       PyObject *unicode, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       PyObject *unicode,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
321
/* Fast detection of line breaks among the ASCII characters.
   The table is indexed by code point (0..127); an entry of 1 marks a line
   break character.  Only the non-zero entries are spelled out, every other
   entry is implicitly zero. */
static const unsigned char ascii_linebreak[] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x7F] = 0      /* highest index: keeps the table at 128 entries */
};
349
/* Forward declaration of the converter named by the Py_UCS4_converter
   Argument Clinic class (converter = 'convert_uc') near the top of this
   file; presumably it has to be visible before the generated clinic code
   included below -- TODO confirm against the generated header. */
static int convert_uc(PyObject *obj, void *addr);

#include "clinic/unicodeobject.c.h"
353
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_BACKSLASHREPLACE,
    _Py_ERROR_SURROGATEPASS,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Translate the name of a codec error handler into its _Py_error_handler
   constant.

   errors: handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL and for "strict", the matching
   constant for the other handlers listed in the table below, and
   _Py_ERROR_OTHER for every other name.  The comparison is exact
   (case-sensitive). */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"replace",           _Py_ERROR_REPLACE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"backslashreplace",  _Py_ERROR_BACKSLASHREPLACE},
        {"surrogatepass",     _Py_ERROR_SURROGATEPASS},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE},
    };
    size_t idx;

    if (errors == NULL) {
        return _Py_ERROR_STRICT;
    }
    for (idx = 0; idx < sizeof(known_handlers) / sizeof(known_handlers[0]); idx++) {
        if (strcmp(errors, known_handlers[idx].name) == 0) {
            return known_handlers[idx].handler;
        }
    }
    return _Py_ERROR_OTHER;
}
392
393 /* The max unicode value is always 0x10FFFF while using the PEP-393 API.
394 This function is kept for backward compatibility with the old API. */
395 Py_UNICODE
PyUnicode_GetMax(void)396 PyUnicode_GetMax(void)
397 {
398 #ifdef Py_UNICODE_WIDE
399 return 0x10FFFF;
400 #else
401 /* This is actually an illegal character, so it should
402 not be passed to unichr. */
403 return 0xFFFF;
404 #endif
405 }
406
#ifdef Py_DEBUG
/* Debug-only invariant checker for a Unicode object.

   op:            object to check; must satisfy PyUnicode_Check().
   check_content: when non-zero (and the string is not of the legacy wchar
                  kind) the characters are scanned as well, to verify that
                  the narrowest suitable kind is used and that the buffer is
                  null-terminated.

   Every violated invariant trips an assert(); the function returns 1
   otherwise, so that it can itself be used inside assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: always 1 byte per character and ready */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII: the characters follow the
               PyCompactUnicodeObject header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            /* not compact: the characters live in a separate buffer */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* string that is not ready yet: only wstr is set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr shares the data buffer exactly when the character
               width equals sizeof(wchar_t) */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
                )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* an absent buffer must come with a zero length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        /* find the largest character of the string */
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character following the last one must be a null character */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
522
523 static PyObject*
unicode_result_wchar(PyObject * unicode)524 unicode_result_wchar(PyObject *unicode)
525 {
526 #ifndef Py_DEBUG
527 Py_ssize_t len;
528
529 len = _PyUnicode_WSTR_LENGTH(unicode);
530 if (len == 0) {
531 Py_DECREF(unicode);
532 _Py_RETURN_UNICODE_EMPTY();
533 }
534
535 if (len == 1) {
536 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
537 if ((Py_UCS4)ch < 256) {
538 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
539 Py_DECREF(unicode);
540 return latin1_char;
541 }
542 }
543
544 if (_PyUnicode_Ready(unicode) < 0) {
545 Py_DECREF(unicode);
546 return NULL;
547 }
548 #else
549 assert(Py_REFCNT(unicode) == 1);
550
551 /* don't make the result ready in debug mode to ensure that the caller
552 makes the string ready before using it */
553 assert(_PyUnicode_CheckConsistency(unicode, 1));
554 #endif
555 return unicode;
556 }
557
558 static PyObject*
unicode_result_ready(PyObject * unicode)559 unicode_result_ready(PyObject *unicode)
560 {
561 Py_ssize_t length;
562
563 length = PyUnicode_GET_LENGTH(unicode);
564 if (length == 0) {
565 if (unicode != unicode_empty) {
566 Py_DECREF(unicode);
567 _Py_RETURN_UNICODE_EMPTY();
568 }
569 return unicode_empty;
570 }
571
572 if (length == 1) {
573 void *data = PyUnicode_DATA(unicode);
574 int kind = PyUnicode_KIND(unicode);
575 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
576 if (ch < 256) {
577 PyObject *latin1_char = unicode_latin1[ch];
578 if (latin1_char != NULL) {
579 if (unicode != latin1_char) {
580 Py_INCREF(latin1_char);
581 Py_DECREF(unicode);
582 }
583 return latin1_char;
584 }
585 else {
586 assert(_PyUnicode_CheckConsistency(unicode, 1));
587 Py_INCREF(unicode);
588 unicode_latin1[ch] = unicode;
589 return unicode;
590 }
591 }
592 }
593
594 assert(_PyUnicode_CheckConsistency(unicode, 1));
595 return unicode;
596 }
597
598 static PyObject*
unicode_result(PyObject * unicode)599 unicode_result(PyObject *unicode)
600 {
601 assert(_PyUnicode_CHECK(unicode));
602 if (PyUnicode_IS_READY(unicode))
603 return unicode_result_ready(unicode);
604 else
605 return unicode_result_wchar(unicode);
606 }
607
608 static PyObject*
unicode_result_unchanged(PyObject * unicode)609 unicode_result_unchanged(PyObject *unicode)
610 {
611 if (PyUnicode_CheckExact(unicode)) {
612 if (PyUnicode_READY(unicode) == -1)
613 return NULL;
614 Py_INCREF(unicode);
615 return unicode;
616 }
617 else
618 /* Subtype -- return genuine unicode string with the same value. */
619 return _PyUnicode_Copy(unicode);
620 }
621
622 /* Implementation of the "backslashreplace" error handler for 8-bit encodings:
623 ASCII, Latin1, UTF-8, etc. */
624 static char*
backslashreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)625 backslashreplace(_PyBytesWriter *writer, char *str,
626 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
627 {
628 Py_ssize_t size, i;
629 Py_UCS4 ch;
630 enum PyUnicode_Kind kind;
631 void *data;
632
633 assert(PyUnicode_IS_READY(unicode));
634 kind = PyUnicode_KIND(unicode);
635 data = PyUnicode_DATA(unicode);
636
637 size = 0;
638 /* determine replacement size */
639 for (i = collstart; i < collend; ++i) {
640 Py_ssize_t incr;
641
642 ch = PyUnicode_READ(kind, data, i);
643 if (ch < 0x100)
644 incr = 2+2;
645 else if (ch < 0x10000)
646 incr = 2+4;
647 else {
648 assert(ch <= MAX_UNICODE);
649 incr = 2+8;
650 }
651 if (size > PY_SSIZE_T_MAX - incr) {
652 PyErr_SetString(PyExc_OverflowError,
653 "encoded result is too long for a Python string");
654 return NULL;
655 }
656 size += incr;
657 }
658
659 str = _PyBytesWriter_Prepare(writer, str, size);
660 if (str == NULL)
661 return NULL;
662
663 /* generate replacement */
664 for (i = collstart; i < collend; ++i) {
665 ch = PyUnicode_READ(kind, data, i);
666 *str++ = '\\';
667 if (ch >= 0x00010000) {
668 *str++ = 'U';
669 *str++ = Py_hexdigits[(ch>>28)&0xf];
670 *str++ = Py_hexdigits[(ch>>24)&0xf];
671 *str++ = Py_hexdigits[(ch>>20)&0xf];
672 *str++ = Py_hexdigits[(ch>>16)&0xf];
673 *str++ = Py_hexdigits[(ch>>12)&0xf];
674 *str++ = Py_hexdigits[(ch>>8)&0xf];
675 }
676 else if (ch >= 0x100) {
677 *str++ = 'u';
678 *str++ = Py_hexdigits[(ch>>12)&0xf];
679 *str++ = Py_hexdigits[(ch>>8)&0xf];
680 }
681 else
682 *str++ = 'x';
683 *str++ = Py_hexdigits[(ch>>4)&0xf];
684 *str++ = Py_hexdigits[ch&0xf];
685 }
686 return str;
687 }
688
689 /* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
690 ASCII, Latin1, UTF-8, etc. */
691 static char*
xmlcharrefreplace(_PyBytesWriter * writer,char * str,PyObject * unicode,Py_ssize_t collstart,Py_ssize_t collend)692 xmlcharrefreplace(_PyBytesWriter *writer, char *str,
693 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
694 {
695 Py_ssize_t size, i;
696 Py_UCS4 ch;
697 enum PyUnicode_Kind kind;
698 void *data;
699
700 assert(PyUnicode_IS_READY(unicode));
701 kind = PyUnicode_KIND(unicode);
702 data = PyUnicode_DATA(unicode);
703
704 size = 0;
705 /* determine replacement size */
706 for (i = collstart; i < collend; ++i) {
707 Py_ssize_t incr;
708
709 ch = PyUnicode_READ(kind, data, i);
710 if (ch < 10)
711 incr = 2+1+1;
712 else if (ch < 100)
713 incr = 2+2+1;
714 else if (ch < 1000)
715 incr = 2+3+1;
716 else if (ch < 10000)
717 incr = 2+4+1;
718 else if (ch < 100000)
719 incr = 2+5+1;
720 else if (ch < 1000000)
721 incr = 2+6+1;
722 else {
723 assert(ch <= MAX_UNICODE);
724 incr = 2+7+1;
725 }
726 if (size > PY_SSIZE_T_MAX - incr) {
727 PyErr_SetString(PyExc_OverflowError,
728 "encoded result is too long for a Python string");
729 return NULL;
730 }
731 size += incr;
732 }
733
734 str = _PyBytesWriter_Prepare(writer, str, size);
735 if (str == NULL)
736 return NULL;
737
738 /* generate replacement */
739 for (i = collstart; i < collend; ++i) {
740 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
741 }
742 return str;
743 }
744
745 /* --- Bloom Filters ----------------------------------------------------- */
746
747 /* stuff to implement simple "bloom filters" for Unicode characters.
748 to keep things simple, we use a single bitmask, using the least 5
749 bits from each unicode characters as the bit index. */
750
751 /* the linebreak mask is set up by Unicode_Init below */
752
753 #if LONG_BIT >= 128
754 #define BLOOM_WIDTH 128
755 #elif LONG_BIT >= 64
756 #define BLOOM_WIDTH 64
757 #elif LONG_BIT >= 32
758 #define BLOOM_WIDTH 32
759 #else
760 #error "LONG_BIT is smaller than 32"
761 #endif
762
763 #define BLOOM_MASK unsigned long
764
765 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
766
767 #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
768
769 #define BLOOM_LINEBREAK(ch) \
770 ((ch) < 128U ? ascii_linebreak[(ch)] : \
771 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
772
773 static inline BLOOM_MASK
make_bloom_mask(int kind,void * ptr,Py_ssize_t len)774 make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
775 {
776 #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
777 do { \
778 TYPE *data = (TYPE *)PTR; \
779 TYPE *end = data + LEN; \
780 Py_UCS4 ch; \
781 for (; data != end; data++) { \
782 ch = *data; \
783 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
784 } \
785 break; \
786 } while (0)
787
788 /* calculate simple bloom-style bitmask for a given unicode string */
789
790 BLOOM_MASK mask;
791
792 mask = 0;
793 switch (kind) {
794 case PyUnicode_1BYTE_KIND:
795 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
796 break;
797 case PyUnicode_2BYTE_KIND:
798 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
799 break;
800 case PyUnicode_4BYTE_KIND:
801 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
802 break;
803 default:
804 Py_UNREACHABLE();
805 }
806 return mask;
807
808 #undef BLOOM_UPDATE
809 }
810
811 static int
ensure_unicode(PyObject * obj)812 ensure_unicode(PyObject *obj)
813 {
814 if (!PyUnicode_Check(obj)) {
815 PyErr_Format(PyExc_TypeError,
816 "must be str, not %.100s",
817 Py_TYPE(obj)->tp_name);
818 return -1;
819 }
820 return PyUnicode_READY(obj);
821 }
822
823 /* Compilation of templated routines */
824
825 #include "stringlib/asciilib.h"
826 #include "stringlib/fastsearch.h"
827 #include "stringlib/partition.h"
828 #include "stringlib/split.h"
829 #include "stringlib/count.h"
830 #include "stringlib/find.h"
831 #include "stringlib/find_max_char.h"
832 #include "stringlib/undef.h"
833
834 #include "stringlib/ucs1lib.h"
835 #include "stringlib/fastsearch.h"
836 #include "stringlib/partition.h"
837 #include "stringlib/split.h"
838 #include "stringlib/count.h"
839 #include "stringlib/find.h"
840 #include "stringlib/replace.h"
841 #include "stringlib/find_max_char.h"
842 #include "stringlib/undef.h"
843
844 #include "stringlib/ucs2lib.h"
845 #include "stringlib/fastsearch.h"
846 #include "stringlib/partition.h"
847 #include "stringlib/split.h"
848 #include "stringlib/count.h"
849 #include "stringlib/find.h"
850 #include "stringlib/replace.h"
851 #include "stringlib/find_max_char.h"
852 #include "stringlib/undef.h"
853
854 #include "stringlib/ucs4lib.h"
855 #include "stringlib/fastsearch.h"
856 #include "stringlib/partition.h"
857 #include "stringlib/split.h"
858 #include "stringlib/count.h"
859 #include "stringlib/find.h"
860 #include "stringlib/replace.h"
861 #include "stringlib/find_max_char.h"
862 #include "stringlib/undef.h"
863
864 #include "stringlib/unicodedefs.h"
865 #include "stringlib/fastsearch.h"
866 #include "stringlib/count.h"
867 #include "stringlib/find.h"
868 #include "stringlib/undef.h"
869
870 /* --- Unicode Object ----------------------------------------------------- */
871
872 static inline Py_ssize_t
findchar(const void * s,int kind,Py_ssize_t size,Py_UCS4 ch,int direction)873 findchar(const void *s, int kind,
874 Py_ssize_t size, Py_UCS4 ch,
875 int direction)
876 {
877 switch (kind) {
878 case PyUnicode_1BYTE_KIND:
879 if ((Py_UCS1) ch != ch)
880 return -1;
881 if (direction > 0)
882 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
883 else
884 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
885 case PyUnicode_2BYTE_KIND:
886 if ((Py_UCS2) ch != ch)
887 return -1;
888 if (direction > 0)
889 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
890 else
891 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
892 case PyUnicode_4BYTE_KIND:
893 if (direction > 0)
894 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
895 else
896 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
897 default:
898 Py_UNREACHABLE();
899 }
900 }
901
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters a resize has added, that is the
   range [old_length, current length), with 0xff bytes so that code reading
   them before they are initialized is detected earlier.  Nothing is written
   when the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* the kind value is used as the number of bytes per character */
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
920
/* Resize a compact string to `length` characters by reallocating the whole
   object (header and characters form a single memory block).

   unicode: modifiable, ready, compact string (all three are asserted)
   length:  new number of characters

   Returns the resized object, which may live at a different address than
   `unicode`, or NULL with MemoryError set when the new size overflows or
   the reallocation fails.  A separately allocated UTF-8 buffer is released
   before the reallocation is attempted, so it is gone even if the
   reallocation then fails. */
static PyObject*
resize_compact(PyObject *unicode, Py_ssize_t length)
{
    Py_ssize_t char_size;
    Py_ssize_t struct_size;
    Py_ssize_t new_size;
    int share_wstr;
    PyObject *new_unicode;
#ifdef Py_DEBUG
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

    assert(unicode_modifiable(unicode));
    assert(PyUnicode_IS_READY(unicode));
    assert(PyUnicode_IS_COMPACT(unicode));

    /* the kind value is used as the number of bytes per character */
    char_size = PyUnicode_KIND(unicode);
    if (PyUnicode_IS_ASCII(unicode))
        struct_size = sizeof(PyASCIIObject);
    else
        struct_size = sizeof(PyCompactUnicodeObject);
    /* remember the sharing now: the data pointer changes if the block
       moves */
    share_wstr = _PyUnicode_SHARE_WSTR(unicode);

    /* overflow check for struct_size + (length + 1) * char_size */
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    new_size = (struct_size + (length + 1) * char_size);

    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
        PyObject_DEL(_PyUnicode_UTF8(unicode));
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
    }
    /* unregister the object from the reference bookkeeping while its
       address may change; it is registered again below */
    _Py_DEC_REFTOTAL;
    _Py_ForgetReference(unicode);

    new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
    if (new_unicode == NULL) {
        /* the old block is still valid: register it again */
        _Py_NewReference(unicode);
        PyErr_NoMemory();
        return NULL;
    }
    unicode = new_unicode;
    _Py_NewReference(unicode);

    _PyUnicode_LENGTH(unicode) = length;
    if (share_wstr) {
        /* wstr aliased the character data: point it at the new location */
        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = length;
    }
    else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
        /* drop the separately allocated wstr buffer */
        PyObject_DEL(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
#ifdef Py_DEBUG
    unicode_fill_invalid(unicode, old_length);
#endif
    /* write the terminating null character */
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
                    length, 0);
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return unicode;
}
987
/* Resize a non-compact string in place: the object itself stays where it
   is, only its separately allocated buffers are reallocated.

   unicode: non-compact string with a reference count of 1 (both asserted)
   length:  new number of characters

   For a ready string the character buffer is resized first; a wstr buffer
   that is not shared with the character data is resized afterwards.  For a
   string that is not ready only the wstr buffer is resized.

   Returns 0 on success and -1 with MemoryError set on overflow or
   allocation failure.
   NOTE(review): if the character buffer was resized successfully and the
   later wstr reallocation fails, the length fields have already been
   updated -- confirm that callers tolerate this. */
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    Py_ssize_t new_size;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        int share_wstr, share_utf8;
        void *data;
#ifdef Py_DEBUG
        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

        data = _PyUnicode_DATA_ANY(unicode);
        /* the kind value is used as the number of bytes per character */
        char_size = PyUnicode_KIND(unicode);
        /* remember which representations alias the character buffer so
           that they can follow it after the reallocation */
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);

        /* overflow check for (length + 1) * char_size */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        new_size = (length + 1) * char_size;

        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_DEL(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        /* NOTE(review): the (PyObject *) cast is applied to a raw character
           buffer; the result is stored in a void pointer. */
        data = (PyObject *)PyObject_REALLOC(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* write the terminating null character */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
#ifdef Py_DEBUG
        unicode_fill_invalid(unicode, old_length);
#endif
        /* done unless a separate wstr buffer still has to be resized */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    new_size = sizeof(wchar_t) * (length + 1);
    wstr =  _PyUnicode_WSTR(unicode);
    wstr = PyObject_REALLOC(wstr, new_size);
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    /* null-terminate the resized wstr buffer */
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
1066
1067 static PyObject*
resize_copy(PyObject * unicode,Py_ssize_t length)1068 resize_copy(PyObject *unicode, Py_ssize_t length)
1069 {
1070 Py_ssize_t copy_length;
1071 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1072 PyObject *copy;
1073
1074 assert(PyUnicode_IS_READY(unicode));
1075
1076 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1077 if (copy == NULL)
1078 return NULL;
1079
1080 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1081 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1082 return copy;
1083 }
1084 else {
1085 PyObject *w;
1086
1087 w = (PyObject*)_PyUnicode_New(length);
1088 if (w == NULL)
1089 return NULL;
1090 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1091 copy_length = Py_MIN(copy_length, length);
1092 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1093 copy_length * sizeof(wchar_t));
1094 return w;
1095 }
1096 }
1097
/* We allocate one more character than requested to make sure the string
   is U+0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
1106
1107 static PyUnicodeObject *
_PyUnicode_New(Py_ssize_t length)1108 _PyUnicode_New(Py_ssize_t length)
1109 {
1110 PyUnicodeObject *unicode;
1111 size_t new_size;
1112
1113 /* Optimization for empty strings */
1114 if (length == 0 && unicode_empty != NULL) {
1115 Py_INCREF(unicode_empty);
1116 return (PyUnicodeObject*)unicode_empty;
1117 }
1118
1119 /* Ensure we won't overflow the size. */
1120 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
1121 return (PyUnicodeObject *)PyErr_NoMemory();
1122 }
1123 if (length < 0) {
1124 PyErr_SetString(PyExc_SystemError,
1125 "Negative size passed to _PyUnicode_New");
1126 return NULL;
1127 }
1128
1129 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1130 if (unicode == NULL)
1131 return NULL;
1132 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
1133
1134 _PyUnicode_WSTR_LENGTH(unicode) = length;
1135 _PyUnicode_HASH(unicode) = -1;
1136 _PyUnicode_STATE(unicode).interned = 0;
1137 _PyUnicode_STATE(unicode).kind = 0;
1138 _PyUnicode_STATE(unicode).compact = 0;
1139 _PyUnicode_STATE(unicode).ready = 0;
1140 _PyUnicode_STATE(unicode).ascii = 0;
1141 _PyUnicode_DATA_ANY(unicode) = NULL;
1142 _PyUnicode_LENGTH(unicode) = 0;
1143 _PyUnicode_UTF8(unicode) = NULL;
1144 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1145
1146 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1147 if (!_PyUnicode_WSTR(unicode)) {
1148 Py_DECREF(unicode);
1149 PyErr_NoMemory();
1150 return NULL;
1151 }
1152
1153 /* Initialize the first element to guard against cases where
1154 * the caller fails before initializing str -- unicode_resize()
1155 * reads str[0], and the Keep-Alive optimization can keep memory
1156 * allocated for str alive across a call to unicode_dealloc(unicode).
1157 * We don't want unicode_resize to read uninitialized memory in
1158 * that case.
1159 */
1160 _PyUnicode_WSTR(unicode)[0] = 0;
1161 _PyUnicode_WSTR(unicode)[length] = 0;
1162
1163 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
1164 return unicode;
1165 }
1166
1167 static const char*
unicode_kind_name(PyObject * unicode)1168 unicode_kind_name(PyObject *unicode)
1169 {
1170 /* don't check consistency: unicode_kind_name() is called from
1171 _PyUnicode_Dump() */
1172 if (!PyUnicode_IS_COMPACT(unicode))
1173 {
1174 if (!PyUnicode_IS_READY(unicode))
1175 return "wstr";
1176 switch (PyUnicode_KIND(unicode))
1177 {
1178 case PyUnicode_1BYTE_KIND:
1179 if (PyUnicode_IS_ASCII(unicode))
1180 return "legacy ascii";
1181 else
1182 return "legacy latin1";
1183 case PyUnicode_2BYTE_KIND:
1184 return "legacy UCS2";
1185 case PyUnicode_4BYTE_KIND:
1186 return "legacy UCS4";
1187 default:
1188 return "<legacy invalid kind>";
1189 }
1190 }
1191 assert(PyUnicode_IS_READY(unicode));
1192 switch (PyUnicode_KIND(unicode)) {
1193 case PyUnicode_1BYTE_KIND:
1194 if (PyUnicode_IS_ASCII(unicode))
1195 return "ascii";
1196 else
1197 return "latin1";
1198 case PyUnicode_2BYTE_KIND:
1199 return "UCS2";
1200 case PyUnicode_4BYTE_KIND:
1201 return "UCS4";
1202 default:
1203 return "<invalid compact kind>";
1204 }
1205 }
1206
1207 #ifdef Py_DEBUG
1208 /* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1212
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
_PyUnicode_data(void * unicode)1216 void *_PyUnicode_data(void *unicode){
1217 printf("obj %p\n", unicode);
1218 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1219 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1220 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1221 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1222 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1223 return PyUnicode_DATA(unicode);
1224 }
1225
1226 void
_PyUnicode_Dump(PyObject * op)1227 _PyUnicode_Dump(PyObject *op)
1228 {
1229 PyASCIIObject *ascii = (PyASCIIObject *)op;
1230 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1231 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1232 void *data;
1233
1234 if (ascii->state.compact)
1235 {
1236 if (ascii->state.ascii)
1237 data = (ascii + 1);
1238 else
1239 data = (compact + 1);
1240 }
1241 else
1242 data = unicode->data.any;
1243 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1244 unicode_kind_name(op), ascii->length);
1245
1246 if (ascii->wstr == data)
1247 printf("shared ");
1248 printf("wstr=%p", ascii->wstr);
1249
1250 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1251 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
1252 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1253 printf("shared ");
1254 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1255 compact->utf8, compact->utf8_length);
1256 }
1257 printf(", data=%p\n", data);
1258 }
1259 #endif
1260
1261 PyObject *
PyUnicode_New(Py_ssize_t size,Py_UCS4 maxchar)1262 PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1263 {
1264 PyObject *obj;
1265 PyCompactUnicodeObject *unicode;
1266 void *data;
1267 enum PyUnicode_Kind kind;
1268 int is_sharing, is_ascii;
1269 Py_ssize_t char_size;
1270 Py_ssize_t struct_size;
1271
1272 /* Optimization for empty strings */
1273 if (size == 0 && unicode_empty != NULL) {
1274 Py_INCREF(unicode_empty);
1275 return unicode_empty;
1276 }
1277
1278 is_ascii = 0;
1279 is_sharing = 0;
1280 struct_size = sizeof(PyCompactUnicodeObject);
1281 if (maxchar < 128) {
1282 kind = PyUnicode_1BYTE_KIND;
1283 char_size = 1;
1284 is_ascii = 1;
1285 struct_size = sizeof(PyASCIIObject);
1286 }
1287 else if (maxchar < 256) {
1288 kind = PyUnicode_1BYTE_KIND;
1289 char_size = 1;
1290 }
1291 else if (maxchar < 65536) {
1292 kind = PyUnicode_2BYTE_KIND;
1293 char_size = 2;
1294 if (sizeof(wchar_t) == 2)
1295 is_sharing = 1;
1296 }
1297 else {
1298 if (maxchar > MAX_UNICODE) {
1299 PyErr_SetString(PyExc_SystemError,
1300 "invalid maximum character passed to PyUnicode_New");
1301 return NULL;
1302 }
1303 kind = PyUnicode_4BYTE_KIND;
1304 char_size = 4;
1305 if (sizeof(wchar_t) == 4)
1306 is_sharing = 1;
1307 }
1308
1309 /* Ensure we won't overflow the size. */
1310 if (size < 0) {
1311 PyErr_SetString(PyExc_SystemError,
1312 "Negative size passed to PyUnicode_New");
1313 return NULL;
1314 }
1315 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1316 return PyErr_NoMemory();
1317
1318 /* Duplicated allocation code from _PyObject_New() instead of a call to
1319 * PyObject_New() so we are able to allocate space for the object and
1320 * it's data buffer.
1321 */
1322 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1323 if (obj == NULL)
1324 return PyErr_NoMemory();
1325 obj = PyObject_INIT(obj, &PyUnicode_Type);
1326 if (obj == NULL)
1327 return NULL;
1328
1329 unicode = (PyCompactUnicodeObject *)obj;
1330 if (is_ascii)
1331 data = ((PyASCIIObject*)obj) + 1;
1332 else
1333 data = unicode + 1;
1334 _PyUnicode_LENGTH(unicode) = size;
1335 _PyUnicode_HASH(unicode) = -1;
1336 _PyUnicode_STATE(unicode).interned = 0;
1337 _PyUnicode_STATE(unicode).kind = kind;
1338 _PyUnicode_STATE(unicode).compact = 1;
1339 _PyUnicode_STATE(unicode).ready = 1;
1340 _PyUnicode_STATE(unicode).ascii = is_ascii;
1341 if (is_ascii) {
1342 ((char*)data)[size] = 0;
1343 _PyUnicode_WSTR(unicode) = NULL;
1344 }
1345 else if (kind == PyUnicode_1BYTE_KIND) {
1346 ((char*)data)[size] = 0;
1347 _PyUnicode_WSTR(unicode) = NULL;
1348 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1349 unicode->utf8 = NULL;
1350 unicode->utf8_length = 0;
1351 }
1352 else {
1353 unicode->utf8 = NULL;
1354 unicode->utf8_length = 0;
1355 if (kind == PyUnicode_2BYTE_KIND)
1356 ((Py_UCS2*)data)[size] = 0;
1357 else /* kind == PyUnicode_4BYTE_KIND */
1358 ((Py_UCS4*)data)[size] = 0;
1359 if (is_sharing) {
1360 _PyUnicode_WSTR_LENGTH(unicode) = size;
1361 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1362 }
1363 else {
1364 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1365 _PyUnicode_WSTR(unicode) = NULL;
1366 }
1367 }
1368 #ifdef Py_DEBUG
1369 unicode_fill_invalid((PyObject*)unicode, 0);
1370 #endif
1371 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1372 return obj;
1373 }
1374
1375 #if SIZEOF_WCHAR_T == 2
1376 /* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1377 will decode surrogate pairs, the other conversions are implemented as macros
1378 for efficiency.
1379
1380 This function assumes that unicode can hold one more code point than wstr
1381 characters for a terminating null character. */
1382 static void
unicode_convert_wchar_to_ucs4(const wchar_t * begin,const wchar_t * end,PyObject * unicode)1383 unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1384 PyObject *unicode)
1385 {
1386 const wchar_t *iter;
1387 Py_UCS4 *ucs4_out;
1388
1389 assert(unicode != NULL);
1390 assert(_PyUnicode_CHECK(unicode));
1391 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1392 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1393
1394 for (iter = begin; iter < end; ) {
1395 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1396 _PyUnicode_GET_LENGTH(unicode)));
1397 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1398 && (iter+1) < end
1399 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1400 {
1401 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1402 iter += 2;
1403 }
1404 else {
1405 *ucs4_out++ = *iter;
1406 iter++;
1407 }
1408 }
1409 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1410 _PyUnicode_GET_LENGTH(unicode)));
1411
1412 }
1413 #endif
1414
1415 static int
unicode_check_modifiable(PyObject * unicode)1416 unicode_check_modifiable(PyObject *unicode)
1417 {
1418 if (!unicode_modifiable(unicode)) {
1419 PyErr_SetString(PyExc_SystemError,
1420 "Cannot modify a string currently used");
1421 return -1;
1422 }
1423 return 0;
1424 }
1425
1426 static int
_copy_characters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many,int check_maxchar)1427 _copy_characters(PyObject *to, Py_ssize_t to_start,
1428 PyObject *from, Py_ssize_t from_start,
1429 Py_ssize_t how_many, int check_maxchar)
1430 {
1431 unsigned int from_kind, to_kind;
1432 void *from_data, *to_data;
1433
1434 assert(0 <= how_many);
1435 assert(0 <= from_start);
1436 assert(0 <= to_start);
1437 assert(PyUnicode_Check(from));
1438 assert(PyUnicode_IS_READY(from));
1439 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1440
1441 assert(PyUnicode_Check(to));
1442 assert(PyUnicode_IS_READY(to));
1443 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1444
1445 if (how_many == 0)
1446 return 0;
1447
1448 from_kind = PyUnicode_KIND(from);
1449 from_data = PyUnicode_DATA(from);
1450 to_kind = PyUnicode_KIND(to);
1451 to_data = PyUnicode_DATA(to);
1452
1453 #ifdef Py_DEBUG
1454 if (!check_maxchar
1455 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1456 {
1457 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1458 Py_UCS4 ch;
1459 Py_ssize_t i;
1460 for (i=0; i < how_many; i++) {
1461 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1462 assert(ch <= to_maxchar);
1463 }
1464 }
1465 #endif
1466
1467 if (from_kind == to_kind) {
1468 if (check_maxchar
1469 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1470 {
1471 /* Writing Latin-1 characters into an ASCII string requires to
1472 check that all written characters are pure ASCII */
1473 Py_UCS4 max_char;
1474 max_char = ucs1lib_find_max_char(from_data,
1475 (Py_UCS1*)from_data + how_many);
1476 if (max_char >= 128)
1477 return -1;
1478 }
1479 memcpy((char*)to_data + to_kind * to_start,
1480 (char*)from_data + from_kind * from_start,
1481 to_kind * how_many);
1482 }
1483 else if (from_kind == PyUnicode_1BYTE_KIND
1484 && to_kind == PyUnicode_2BYTE_KIND)
1485 {
1486 _PyUnicode_CONVERT_BYTES(
1487 Py_UCS1, Py_UCS2,
1488 PyUnicode_1BYTE_DATA(from) + from_start,
1489 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1490 PyUnicode_2BYTE_DATA(to) + to_start
1491 );
1492 }
1493 else if (from_kind == PyUnicode_1BYTE_KIND
1494 && to_kind == PyUnicode_4BYTE_KIND)
1495 {
1496 _PyUnicode_CONVERT_BYTES(
1497 Py_UCS1, Py_UCS4,
1498 PyUnicode_1BYTE_DATA(from) + from_start,
1499 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1500 PyUnicode_4BYTE_DATA(to) + to_start
1501 );
1502 }
1503 else if (from_kind == PyUnicode_2BYTE_KIND
1504 && to_kind == PyUnicode_4BYTE_KIND)
1505 {
1506 _PyUnicode_CONVERT_BYTES(
1507 Py_UCS2, Py_UCS4,
1508 PyUnicode_2BYTE_DATA(from) + from_start,
1509 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1510 PyUnicode_4BYTE_DATA(to) + to_start
1511 );
1512 }
1513 else {
1514 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1515
1516 if (!check_maxchar) {
1517 if (from_kind == PyUnicode_2BYTE_KIND
1518 && to_kind == PyUnicode_1BYTE_KIND)
1519 {
1520 _PyUnicode_CONVERT_BYTES(
1521 Py_UCS2, Py_UCS1,
1522 PyUnicode_2BYTE_DATA(from) + from_start,
1523 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1524 PyUnicode_1BYTE_DATA(to) + to_start
1525 );
1526 }
1527 else if (from_kind == PyUnicode_4BYTE_KIND
1528 && to_kind == PyUnicode_1BYTE_KIND)
1529 {
1530 _PyUnicode_CONVERT_BYTES(
1531 Py_UCS4, Py_UCS1,
1532 PyUnicode_4BYTE_DATA(from) + from_start,
1533 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1534 PyUnicode_1BYTE_DATA(to) + to_start
1535 );
1536 }
1537 else if (from_kind == PyUnicode_4BYTE_KIND
1538 && to_kind == PyUnicode_2BYTE_KIND)
1539 {
1540 _PyUnicode_CONVERT_BYTES(
1541 Py_UCS4, Py_UCS2,
1542 PyUnicode_4BYTE_DATA(from) + from_start,
1543 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1544 PyUnicode_2BYTE_DATA(to) + to_start
1545 );
1546 }
1547 else {
1548 Py_UNREACHABLE();
1549 }
1550 }
1551 else {
1552 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1553 Py_UCS4 ch;
1554 Py_ssize_t i;
1555
1556 for (i=0; i < how_many; i++) {
1557 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1558 if (ch > to_maxchar)
1559 return -1;
1560 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1561 }
1562 }
1563 }
1564 return 0;
1565 }
1566
1567 void
_PyUnicode_FastCopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1568 _PyUnicode_FastCopyCharacters(
1569 PyObject *to, Py_ssize_t to_start,
1570 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1571 {
1572 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1573 }
1574
1575 Py_ssize_t
PyUnicode_CopyCharacters(PyObject * to,Py_ssize_t to_start,PyObject * from,Py_ssize_t from_start,Py_ssize_t how_many)1576 PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1577 PyObject *from, Py_ssize_t from_start,
1578 Py_ssize_t how_many)
1579 {
1580 int err;
1581
1582 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1583 PyErr_BadInternalCall();
1584 return -1;
1585 }
1586
1587 if (PyUnicode_READY(from) == -1)
1588 return -1;
1589 if (PyUnicode_READY(to) == -1)
1590 return -1;
1591
1592 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1593 PyErr_SetString(PyExc_IndexError, "string index out of range");
1594 return -1;
1595 }
1596 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1597 PyErr_SetString(PyExc_IndexError, "string index out of range");
1598 return -1;
1599 }
1600 if (how_many < 0) {
1601 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1602 return -1;
1603 }
1604 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1605 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1606 PyErr_Format(PyExc_SystemError,
1607 "Cannot write %zi characters at %zi "
1608 "in a string of %zi characters",
1609 how_many, to_start, PyUnicode_GET_LENGTH(to));
1610 return -1;
1611 }
1612
1613 if (how_many == 0)
1614 return 0;
1615
1616 if (unicode_check_modifiable(to))
1617 return -1;
1618
1619 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1620 if (err) {
1621 PyErr_Format(PyExc_SystemError,
1622 "Cannot copy %s characters "
1623 "into a string of %s characters",
1624 unicode_kind_name(from),
1625 unicode_kind_name(to));
1626 return -1;
1627 }
1628 return how_many;
1629 }
1630
1631 /* Find the maximum code point and count the number of surrogate pairs so a
1632 correct string length can be computed before converting a string to UCS4.
1633 This function counts single surrogates as a character and not as a pair.
1634
1635 Return 0 on success, or -1 on error. */
1636 static int
find_maxchar_surrogates(const wchar_t * begin,const wchar_t * end,Py_UCS4 * maxchar,Py_ssize_t * num_surrogates)1637 find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1638 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1639 {
1640 const wchar_t *iter;
1641 Py_UCS4 ch;
1642
1643 assert(num_surrogates != NULL && maxchar != NULL);
1644 *num_surrogates = 0;
1645 *maxchar = 0;
1646
1647 for (iter = begin; iter < end; ) {
1648 #if SIZEOF_WCHAR_T == 2
1649 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1650 && (iter+1) < end
1651 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1652 {
1653 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1654 ++(*num_surrogates);
1655 iter += 2;
1656 }
1657 else
1658 #endif
1659 {
1660 ch = *iter;
1661 iter++;
1662 }
1663 if (ch > *maxchar) {
1664 *maxchar = ch;
1665 if (*maxchar > MAX_UNICODE) {
1666 PyErr_Format(PyExc_ValueError,
1667 "character U+%x is not in range [U+0000; U+10ffff]",
1668 ch);
1669 return -1;
1670 }
1671 }
1672 }
1673 return 0;
1674 }
1675
/* Convert a legacy string that only has a wchar_t (wstr) representation
   into a ready string with a canonical 1, 2 or 4 byte data buffer.

   The narrowest kind able to hold the largest character is selected.
   Depending on the kind and on SIZEOF_WCHAR_T, the wstr buffer is either
   converted into a newly allocated data buffer and freed, or reused
   directly as the data buffer.

   Return 0 on success.  Return -1 with an exception set if the string
   contains a character above MAX_UNICODE or if an allocation fails. */
int
_PyUnicode_Ready(PyObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

    /* Scan the wstr buffer once to learn the largest character and the
       number of surrogate pairs. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    if (maxchar < 256) {
        /* 1 byte per character: narrow into a new buffer, then drop wstr. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the data buffer is also used as the UTF-8
               representation. */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode.  */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        /* Narrow into a new 2-byte buffer, then drop wstr. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* Each surrogate pair collapses into a single character. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        /* Guard the 4 * (length + 1) size computation against overflow. */
        if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is not 2 bytes wide here: the wstr buffer is reused
           directly as the 4-byte data buffer. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1804
1805 static void
unicode_dealloc(PyObject * unicode)1806 unicode_dealloc(PyObject *unicode)
1807 {
1808 switch (PyUnicode_CHECK_INTERNED(unicode)) {
1809 case SSTATE_NOT_INTERNED:
1810 break;
1811
1812 case SSTATE_INTERNED_MORTAL:
1813 /* revive dead object temporarily for DelItem */
1814 Py_REFCNT(unicode) = 3;
1815 if (PyDict_DelItem(interned, unicode) != 0)
1816 Py_FatalError(
1817 "deletion of interned string failed");
1818 break;
1819
1820 case SSTATE_INTERNED_IMMORTAL:
1821 Py_FatalError("Immortal interned string died.");
1822 /* fall through */
1823
1824 default:
1825 Py_FatalError("Inconsistent interned string state.");
1826 }
1827
1828 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1829 PyObject_DEL(_PyUnicode_WSTR(unicode));
1830 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1831 PyObject_DEL(_PyUnicode_UTF8(unicode));
1832 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1833 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1834
1835 Py_TYPE(unicode)->tp_free(unicode);
1836 }
1837
1838 #ifdef Py_DEBUG
1839 static int
unicode_is_singleton(PyObject * unicode)1840 unicode_is_singleton(PyObject *unicode)
1841 {
1842 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1843 if (unicode == unicode_empty)
1844 return 1;
1845 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1846 {
1847 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1848 if (ch < 256 && unicode_latin1[ch] == unicode)
1849 return 1;
1850 }
1851 return 0;
1852 }
1853 #endif
1854
1855 static int
unicode_modifiable(PyObject * unicode)1856 unicode_modifiable(PyObject *unicode)
1857 {
1858 assert(_PyUnicode_CHECK(unicode));
1859 if (Py_REFCNT(unicode) != 1)
1860 return 0;
1861 if (_PyUnicode_HASH(unicode) != -1)
1862 return 0;
1863 if (PyUnicode_CHECK_INTERNED(unicode))
1864 return 0;
1865 if (!PyUnicode_CheckExact(unicode))
1866 return 0;
1867 #ifdef Py_DEBUG
1868 /* singleton refcount is greater than 1 */
1869 assert(!unicode_is_singleton(unicode));
1870 #endif
1871 return 1;
1872 }
1873
1874 static int
unicode_resize(PyObject ** p_unicode,Py_ssize_t length)1875 unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1876 {
1877 PyObject *unicode;
1878 Py_ssize_t old_length;
1879
1880 assert(p_unicode != NULL);
1881 unicode = *p_unicode;
1882
1883 assert(unicode != NULL);
1884 assert(PyUnicode_Check(unicode));
1885 assert(0 <= length);
1886
1887 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1888 old_length = PyUnicode_WSTR_LENGTH(unicode);
1889 else
1890 old_length = PyUnicode_GET_LENGTH(unicode);
1891 if (old_length == length)
1892 return 0;
1893
1894 if (length == 0) {
1895 _Py_INCREF_UNICODE_EMPTY();
1896 if (!unicode_empty)
1897 return -1;
1898 Py_SETREF(*p_unicode, unicode_empty);
1899 return 0;
1900 }
1901
1902 if (!unicode_modifiable(unicode)) {
1903 PyObject *copy = resize_copy(unicode, length);
1904 if (copy == NULL)
1905 return -1;
1906 Py_SETREF(*p_unicode, copy);
1907 return 0;
1908 }
1909
1910 if (PyUnicode_IS_COMPACT(unicode)) {
1911 PyObject *new_unicode = resize_compact(unicode, length);
1912 if (new_unicode == NULL)
1913 return -1;
1914 *p_unicode = new_unicode;
1915 return 0;
1916 }
1917 return resize_inplace(unicode, length);
1918 }
1919
1920 int
PyUnicode_Resize(PyObject ** p_unicode,Py_ssize_t length)1921 PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1922 {
1923 PyObject *unicode;
1924 if (p_unicode == NULL) {
1925 PyErr_BadInternalCall();
1926 return -1;
1927 }
1928 unicode = *p_unicode;
1929 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1930 {
1931 PyErr_BadInternalCall();
1932 return -1;
1933 }
1934 return unicode_resize(p_unicode, length);
1935 }
1936
1937 /* Copy an ASCII or latin1 char* string into a Python Unicode string.
1938
1939 WARNING: The function doesn't copy the terminating null character and
1940 doesn't check the maximum character (may write a latin1 character in an
1941 ASCII string). */
1942 static void
unicode_write_cstr(PyObject * unicode,Py_ssize_t index,const char * str,Py_ssize_t len)1943 unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1944 const char *str, Py_ssize_t len)
1945 {
1946 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1947 void *data = PyUnicode_DATA(unicode);
1948 const char *end = str + len;
1949
1950 switch (kind) {
1951 case PyUnicode_1BYTE_KIND: {
1952 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1953 #ifdef Py_DEBUG
1954 if (PyUnicode_IS_ASCII(unicode)) {
1955 Py_UCS4 maxchar = ucs1lib_find_max_char(
1956 (const Py_UCS1*)str,
1957 (const Py_UCS1*)str + len);
1958 assert(maxchar < 128);
1959 }
1960 #endif
1961 memcpy((char *) data + index, str, len);
1962 break;
1963 }
1964 case PyUnicode_2BYTE_KIND: {
1965 Py_UCS2 *start = (Py_UCS2 *)data + index;
1966 Py_UCS2 *ucs2 = start;
1967 assert(index <= PyUnicode_GET_LENGTH(unicode));
1968
1969 for (; str < end; ++ucs2, ++str)
1970 *ucs2 = (Py_UCS2)*str;
1971
1972 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1973 break;
1974 }
1975 default: {
1976 Py_UCS4 *start = (Py_UCS4 *)data + index;
1977 Py_UCS4 *ucs4 = start;
1978 assert(kind == PyUnicode_4BYTE_KIND);
1979 assert(index <= PyUnicode_GET_LENGTH(unicode));
1980
1981 for (; str < end; ++ucs4, ++str)
1982 *ucs4 = (Py_UCS4)*str;
1983
1984 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1985 }
1986 }
1987 }
1988
1989 static PyObject*
get_latin1_char(unsigned char ch)1990 get_latin1_char(unsigned char ch)
1991 {
1992 PyObject *unicode = unicode_latin1[ch];
1993 if (!unicode) {
1994 unicode = PyUnicode_New(1, ch);
1995 if (!unicode)
1996 return NULL;
1997 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1998 assert(_PyUnicode_CheckConsistency(unicode, 1));
1999 unicode_latin1[ch] = unicode;
2000 }
2001 Py_INCREF(unicode);
2002 return unicode;
2003 }
2004
2005 static PyObject*
unicode_char(Py_UCS4 ch)2006 unicode_char(Py_UCS4 ch)
2007 {
2008 PyObject *unicode;
2009
2010 assert(ch <= MAX_UNICODE);
2011
2012 if (ch < 256)
2013 return get_latin1_char(ch);
2014
2015 unicode = PyUnicode_New(1, ch);
2016 if (unicode == NULL)
2017 return NULL;
2018
2019 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2020 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
2021 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
2022 } else {
2023 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2024 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2025 }
2026 assert(_PyUnicode_CheckConsistency(unicode, 1));
2027 return unicode;
2028 }
2029
2030 PyObject *
PyUnicode_FromUnicode(const Py_UNICODE * u,Py_ssize_t size)2031 PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
2032 {
2033 if (u == NULL)
2034 return (PyObject*)_PyUnicode_New(size);
2035
2036 if (size < 0) {
2037 PyErr_BadInternalCall();
2038 return NULL;
2039 }
2040
2041 return PyUnicode_FromWideChar(u, size);
2042 }
2043
2044 PyObject *
PyUnicode_FromWideChar(const wchar_t * u,Py_ssize_t size)2045 PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2046 {
2047 PyObject *unicode;
2048 Py_UCS4 maxchar = 0;
2049 Py_ssize_t num_surrogates;
2050
2051 if (u == NULL && size != 0) {
2052 PyErr_BadInternalCall();
2053 return NULL;
2054 }
2055
2056 if (size == -1) {
2057 size = wcslen(u);
2058 }
2059
2060 /* If the Unicode data is known at construction time, we can apply
2061 some optimizations which share commonly used objects. */
2062
2063 /* Optimization for empty strings */
2064 if (size == 0)
2065 _Py_RETURN_UNICODE_EMPTY();
2066
2067 /* Single character Unicode objects in the Latin-1 range are
2068 shared when using this constructor */
2069 if (size == 1 && (Py_UCS4)*u < 256)
2070 return get_latin1_char((unsigned char)*u);
2071
2072 /* If not empty and not single character, copy the Unicode data
2073 into the new object */
2074 if (find_maxchar_surrogates(u, u + size,
2075 &maxchar, &num_surrogates) == -1)
2076 return NULL;
2077
2078 unicode = PyUnicode_New(size - num_surrogates, maxchar);
2079 if (!unicode)
2080 return NULL;
2081
2082 switch (PyUnicode_KIND(unicode)) {
2083 case PyUnicode_1BYTE_KIND:
2084 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
2085 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2086 break;
2087 case PyUnicode_2BYTE_KIND:
2088 #if Py_UNICODE_SIZE == 2
2089 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2090 #else
2091 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
2092 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2093 #endif
2094 break;
2095 case PyUnicode_4BYTE_KIND:
2096 #if SIZEOF_WCHAR_T == 2
2097 /* This is the only case which has to process surrogates, thus
2098 a simple copy loop is not enough and we need a function. */
2099 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2100 #else
2101 assert(num_surrogates == 0);
2102 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2103 #endif
2104 break;
2105 default:
2106 Py_UNREACHABLE();
2107 }
2108
2109 return unicode_result(unicode);
2110 }
2111
2112 PyObject *
PyUnicode_FromStringAndSize(const char * u,Py_ssize_t size)2113 PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2114 {
2115 if (size < 0) {
2116 PyErr_SetString(PyExc_SystemError,
2117 "Negative size passed to PyUnicode_FromStringAndSize");
2118 return NULL;
2119 }
2120 if (u != NULL)
2121 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2122 else
2123 return (PyObject *)_PyUnicode_New(size);
2124 }
2125
2126 PyObject *
PyUnicode_FromString(const char * u)2127 PyUnicode_FromString(const char *u)
2128 {
2129 size_t size = strlen(u);
2130 if (size > PY_SSIZE_T_MAX) {
2131 PyErr_SetString(PyExc_OverflowError, "input too long");
2132 return NULL;
2133 }
2134 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2135 }
2136
2137 PyObject *
_PyUnicode_FromId(_Py_Identifier * id)2138 _PyUnicode_FromId(_Py_Identifier *id)
2139 {
2140 if (!id->object) {
2141 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2142 strlen(id->string),
2143 NULL, NULL);
2144 if (!id->object)
2145 return NULL;
2146 PyUnicode_InternInPlace(&id->object);
2147 assert(!id->next);
2148 id->next = static_strings;
2149 static_strings = id;
2150 }
2151 return id->object;
2152 }
2153
2154 void
_PyUnicode_ClearStaticStrings()2155 _PyUnicode_ClearStaticStrings()
2156 {
2157 _Py_Identifier *tmp, *s = static_strings;
2158 while (s) {
2159 Py_CLEAR(s->object);
2160 tmp = s->next;
2161 s->next = NULL;
2162 s = tmp;
2163 }
2164 static_strings = NULL;
2165 }
2166
2167 /* Internal function, doesn't check maximum character */
2168
2169 PyObject*
_PyUnicode_FromASCII(const char * buffer,Py_ssize_t size)2170 _PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2171 {
2172 const unsigned char *s = (const unsigned char *)buffer;
2173 PyObject *unicode;
2174 if (size == 1) {
2175 #ifdef Py_DEBUG
2176 assert((unsigned char)s[0] < 128);
2177 #endif
2178 return get_latin1_char(s[0]);
2179 }
2180 unicode = PyUnicode_New(size, 127);
2181 if (!unicode)
2182 return NULL;
2183 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2184 assert(_PyUnicode_CheckConsistency(unicode, 1));
2185 return unicode;
2186 }
2187
2188 static Py_UCS4
kind_maxchar_limit(unsigned int kind)2189 kind_maxchar_limit(unsigned int kind)
2190 {
2191 switch (kind) {
2192 case PyUnicode_1BYTE_KIND:
2193 return 0x80;
2194 case PyUnicode_2BYTE_KIND:
2195 return 0x100;
2196 case PyUnicode_4BYTE_KIND:
2197 return 0x10000;
2198 default:
2199 Py_UNREACHABLE();
2200 }
2201 }
2202
2203 static PyObject*
_PyUnicode_FromUCS1(const Py_UCS1 * u,Py_ssize_t size)2204 _PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2205 {
2206 PyObject *res;
2207 unsigned char max_char;
2208
2209 if (size == 0)
2210 _Py_RETURN_UNICODE_EMPTY();
2211 assert(size > 0);
2212 if (size == 1)
2213 return get_latin1_char(u[0]);
2214
2215 max_char = ucs1lib_find_max_char(u, u + size);
2216 res = PyUnicode_New(size, max_char);
2217 if (!res)
2218 return NULL;
2219 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2220 assert(_PyUnicode_CheckConsistency(res, 1));
2221 return res;
2222 }
2223
2224 static PyObject*
_PyUnicode_FromUCS2(const Py_UCS2 * u,Py_ssize_t size)2225 _PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2226 {
2227 PyObject *res;
2228 Py_UCS2 max_char;
2229
2230 if (size == 0)
2231 _Py_RETURN_UNICODE_EMPTY();
2232 assert(size > 0);
2233 if (size == 1)
2234 return unicode_char(u[0]);
2235
2236 max_char = ucs2lib_find_max_char(u, u + size);
2237 res = PyUnicode_New(size, max_char);
2238 if (!res)
2239 return NULL;
2240 if (max_char >= 256)
2241 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2242 else {
2243 _PyUnicode_CONVERT_BYTES(
2244 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2245 }
2246 assert(_PyUnicode_CheckConsistency(res, 1));
2247 return res;
2248 }
2249
2250 static PyObject*
_PyUnicode_FromUCS4(const Py_UCS4 * u,Py_ssize_t size)2251 _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2252 {
2253 PyObject *res;
2254 Py_UCS4 max_char;
2255
2256 if (size == 0)
2257 _Py_RETURN_UNICODE_EMPTY();
2258 assert(size > 0);
2259 if (size == 1)
2260 return unicode_char(u[0]);
2261
2262 max_char = ucs4lib_find_max_char(u, u + size);
2263 res = PyUnicode_New(size, max_char);
2264 if (!res)
2265 return NULL;
2266 if (max_char < 256)
2267 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2268 PyUnicode_1BYTE_DATA(res));
2269 else if (max_char < 0x10000)
2270 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2271 PyUnicode_2BYTE_DATA(res));
2272 else
2273 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2274 assert(_PyUnicode_CheckConsistency(res, 1));
2275 return res;
2276 }
2277
2278 PyObject*
PyUnicode_FromKindAndData(int kind,const void * buffer,Py_ssize_t size)2279 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2280 {
2281 if (size < 0) {
2282 PyErr_SetString(PyExc_ValueError, "size must be positive");
2283 return NULL;
2284 }
2285 switch (kind) {
2286 case PyUnicode_1BYTE_KIND:
2287 return _PyUnicode_FromUCS1(buffer, size);
2288 case PyUnicode_2BYTE_KIND:
2289 return _PyUnicode_FromUCS2(buffer, size);
2290 case PyUnicode_4BYTE_KIND:
2291 return _PyUnicode_FromUCS4(buffer, size);
2292 default:
2293 PyErr_SetString(PyExc_SystemError, "invalid kind");
2294 return NULL;
2295 }
2296 }
2297
2298 Py_UCS4
_PyUnicode_FindMaxChar(PyObject * unicode,Py_ssize_t start,Py_ssize_t end)2299 _PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2300 {
2301 enum PyUnicode_Kind kind;
2302 void *startptr, *endptr;
2303
2304 assert(PyUnicode_IS_READY(unicode));
2305 assert(0 <= start);
2306 assert(end <= PyUnicode_GET_LENGTH(unicode));
2307 assert(start <= end);
2308
2309 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2310 return PyUnicode_MAX_CHAR_VALUE(unicode);
2311
2312 if (start == end)
2313 return 127;
2314
2315 if (PyUnicode_IS_ASCII(unicode))
2316 return 127;
2317
2318 kind = PyUnicode_KIND(unicode);
2319 startptr = PyUnicode_DATA(unicode);
2320 endptr = (char *)startptr + end * kind;
2321 startptr = (char *)startptr + start * kind;
2322 switch(kind) {
2323 case PyUnicode_1BYTE_KIND:
2324 return ucs1lib_find_max_char(startptr, endptr);
2325 case PyUnicode_2BYTE_KIND:
2326 return ucs2lib_find_max_char(startptr, endptr);
2327 case PyUnicode_4BYTE_KIND:
2328 return ucs4lib_find_max_char(startptr, endptr);
2329 default:
2330 Py_UNREACHABLE();
2331 }
2332 }
2333
2334 /* Ensure that a string uses the most efficient storage, if it is not the
2335 case: create a new string with of the right kind. Write NULL into *p_unicode
2336 on error. */
2337 static void
unicode_adjust_maxchar(PyObject ** p_unicode)2338 unicode_adjust_maxchar(PyObject **p_unicode)
2339 {
2340 PyObject *unicode, *copy;
2341 Py_UCS4 max_char;
2342 Py_ssize_t len;
2343 unsigned int kind;
2344
2345 assert(p_unicode != NULL);
2346 unicode = *p_unicode;
2347 assert(PyUnicode_IS_READY(unicode));
2348 if (PyUnicode_IS_ASCII(unicode))
2349 return;
2350
2351 len = PyUnicode_GET_LENGTH(unicode);
2352 kind = PyUnicode_KIND(unicode);
2353 if (kind == PyUnicode_1BYTE_KIND) {
2354 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2355 max_char = ucs1lib_find_max_char(u, u + len);
2356 if (max_char >= 128)
2357 return;
2358 }
2359 else if (kind == PyUnicode_2BYTE_KIND) {
2360 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2361 max_char = ucs2lib_find_max_char(u, u + len);
2362 if (max_char >= 256)
2363 return;
2364 }
2365 else {
2366 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2367 assert(kind == PyUnicode_4BYTE_KIND);
2368 max_char = ucs4lib_find_max_char(u, u + len);
2369 if (max_char >= 0x10000)
2370 return;
2371 }
2372 copy = PyUnicode_New(len, max_char);
2373 if (copy != NULL)
2374 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2375 Py_DECREF(unicode);
2376 *p_unicode = copy;
2377 }
2378
2379 PyObject*
_PyUnicode_Copy(PyObject * unicode)2380 _PyUnicode_Copy(PyObject *unicode)
2381 {
2382 Py_ssize_t length;
2383 PyObject *copy;
2384
2385 if (!PyUnicode_Check(unicode)) {
2386 PyErr_BadInternalCall();
2387 return NULL;
2388 }
2389 if (PyUnicode_READY(unicode) == -1)
2390 return NULL;
2391
2392 length = PyUnicode_GET_LENGTH(unicode);
2393 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2394 if (!copy)
2395 return NULL;
2396 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2397
2398 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2399 length * PyUnicode_KIND(unicode));
2400 assert(_PyUnicode_CheckConsistency(copy, 1));
2401 return copy;
2402 }
2403
2404
2405 /* Widen Unicode objects to larger buffers. Don't write terminating null
2406 character. Return NULL on error. */
2407
2408 void*
_PyUnicode_AsKind(PyObject * s,unsigned int kind)2409 _PyUnicode_AsKind(PyObject *s, unsigned int kind)
2410 {
2411 Py_ssize_t len;
2412 void *result;
2413 unsigned int skind;
2414
2415 if (PyUnicode_READY(s) == -1)
2416 return NULL;
2417
2418 len = PyUnicode_GET_LENGTH(s);
2419 skind = PyUnicode_KIND(s);
2420 if (skind >= kind) {
2421 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2422 return NULL;
2423 }
2424 switch (kind) {
2425 case PyUnicode_2BYTE_KIND:
2426 result = PyMem_New(Py_UCS2, len);
2427 if (!result)
2428 return PyErr_NoMemory();
2429 assert(skind == PyUnicode_1BYTE_KIND);
2430 _PyUnicode_CONVERT_BYTES(
2431 Py_UCS1, Py_UCS2,
2432 PyUnicode_1BYTE_DATA(s),
2433 PyUnicode_1BYTE_DATA(s) + len,
2434 result);
2435 return result;
2436 case PyUnicode_4BYTE_KIND:
2437 result = PyMem_New(Py_UCS4, len);
2438 if (!result)
2439 return PyErr_NoMemory();
2440 if (skind == PyUnicode_2BYTE_KIND) {
2441 _PyUnicode_CONVERT_BYTES(
2442 Py_UCS2, Py_UCS4,
2443 PyUnicode_2BYTE_DATA(s),
2444 PyUnicode_2BYTE_DATA(s) + len,
2445 result);
2446 }
2447 else {
2448 assert(skind == PyUnicode_1BYTE_KIND);
2449 _PyUnicode_CONVERT_BYTES(
2450 Py_UCS1, Py_UCS4,
2451 PyUnicode_1BYTE_DATA(s),
2452 PyUnicode_1BYTE_DATA(s) + len,
2453 result);
2454 }
2455 return result;
2456 default:
2457 break;
2458 }
2459 PyErr_SetString(PyExc_SystemError, "invalid kind");
2460 return NULL;
2461 }
2462
2463 static Py_UCS4*
as_ucs4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2464 as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2465 int copy_null)
2466 {
2467 int kind;
2468 void *data;
2469 Py_ssize_t len, targetlen;
2470 if (PyUnicode_READY(string) == -1)
2471 return NULL;
2472 kind = PyUnicode_KIND(string);
2473 data = PyUnicode_DATA(string);
2474 len = PyUnicode_GET_LENGTH(string);
2475 targetlen = len;
2476 if (copy_null)
2477 targetlen++;
2478 if (!target) {
2479 target = PyMem_New(Py_UCS4, targetlen);
2480 if (!target) {
2481 PyErr_NoMemory();
2482 return NULL;
2483 }
2484 }
2485 else {
2486 if (targetsize < targetlen) {
2487 PyErr_Format(PyExc_SystemError,
2488 "string is longer than the buffer");
2489 if (copy_null && 0 < targetsize)
2490 target[0] = 0;
2491 return NULL;
2492 }
2493 }
2494 if (kind == PyUnicode_1BYTE_KIND) {
2495 Py_UCS1 *start = (Py_UCS1 *) data;
2496 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2497 }
2498 else if (kind == PyUnicode_2BYTE_KIND) {
2499 Py_UCS2 *start = (Py_UCS2 *) data;
2500 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2501 }
2502 else {
2503 assert(kind == PyUnicode_4BYTE_KIND);
2504 memcpy(target, data, len * sizeof(Py_UCS4));
2505 }
2506 if (copy_null)
2507 target[len] = 0;
2508 return target;
2509 }
2510
2511 Py_UCS4*
PyUnicode_AsUCS4(PyObject * string,Py_UCS4 * target,Py_ssize_t targetsize,int copy_null)2512 PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2513 int copy_null)
2514 {
2515 if (target == NULL || targetsize < 0) {
2516 PyErr_BadInternalCall();
2517 return NULL;
2518 }
2519 return as_ucs4(string, target, targetsize, copy_null);
2520 }
2521
2522 Py_UCS4*
PyUnicode_AsUCS4Copy(PyObject * string)2523 PyUnicode_AsUCS4Copy(PyObject *string)
2524 {
2525 return as_ucs4(string, NULL, 0, 1);
2526 }
2527
/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, plus 1 for
   the sign.  53/22 is an upper bound for log10(256); the ceiling is
   computed in integer arithmetic as (n*53-1)/22 + 1. */
2531 #define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2532
2533 static int
unicode_fromformat_write_str(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t width,Py_ssize_t precision)2534 unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2535 Py_ssize_t width, Py_ssize_t precision)
2536 {
2537 Py_ssize_t length, fill, arglen;
2538 Py_UCS4 maxchar;
2539
2540 if (PyUnicode_READY(str) == -1)
2541 return -1;
2542
2543 length = PyUnicode_GET_LENGTH(str);
2544 if ((precision == -1 || precision >= length)
2545 && width <= length)
2546 return _PyUnicodeWriter_WriteStr(writer, str);
2547
2548 if (precision != -1)
2549 length = Py_MIN(precision, length);
2550
2551 arglen = Py_MAX(length, width);
2552 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2553 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2554 else
2555 maxchar = writer->maxchar;
2556
2557 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2558 return -1;
2559
2560 if (width > length) {
2561 fill = width - length;
2562 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2563 return -1;
2564 writer->pos += fill;
2565 }
2566
2567 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2568 str, 0, length);
2569 writer->pos += length;
2570 return 0;
2571 }
2572
2573 static int
unicode_fromformat_write_cstr(_PyUnicodeWriter * writer,const char * str,Py_ssize_t width,Py_ssize_t precision)2574 unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2575 Py_ssize_t width, Py_ssize_t precision)
2576 {
2577 /* UTF-8 */
2578 Py_ssize_t length;
2579 PyObject *unicode;
2580 int res;
2581
2582 if (precision == -1) {
2583 length = strlen(str);
2584 }
2585 else {
2586 length = 0;
2587 while (length < precision && str[length]) {
2588 length++;
2589 }
2590 }
2591 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2592 if (unicode == NULL)
2593 return -1;
2594
2595 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2596 Py_DECREF(unicode);
2597 return res;
2598 }
2599
2600 static const char*
unicode_fromformat_arg(_PyUnicodeWriter * writer,const char * f,va_list * vargs)2601 unicode_fromformat_arg(_PyUnicodeWriter *writer,
2602 const char *f, va_list *vargs)
2603 {
2604 const char *p;
2605 Py_ssize_t len;
2606 int zeropad;
2607 Py_ssize_t width;
2608 Py_ssize_t precision;
2609 int longflag;
2610 int longlongflag;
2611 int size_tflag;
2612 Py_ssize_t fill;
2613
2614 p = f;
2615 f++;
2616 zeropad = 0;
2617 if (*f == '0') {
2618 zeropad = 1;
2619 f++;
2620 }
2621
2622 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2623 width = -1;
2624 if (Py_ISDIGIT((unsigned)*f)) {
2625 width = *f - '0';
2626 f++;
2627 while (Py_ISDIGIT((unsigned)*f)) {
2628 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2629 PyErr_SetString(PyExc_ValueError,
2630 "width too big");
2631 return NULL;
2632 }
2633 width = (width * 10) + (*f - '0');
2634 f++;
2635 }
2636 }
2637 precision = -1;
2638 if (*f == '.') {
2639 f++;
2640 if (Py_ISDIGIT((unsigned)*f)) {
2641 precision = (*f - '0');
2642 f++;
2643 while (Py_ISDIGIT((unsigned)*f)) {
2644 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2645 PyErr_SetString(PyExc_ValueError,
2646 "precision too big");
2647 return NULL;
2648 }
2649 precision = (precision * 10) + (*f - '0');
2650 f++;
2651 }
2652 }
2653 if (*f == '%') {
2654 /* "%.3%s" => f points to "3" */
2655 f--;
2656 }
2657 }
2658 if (*f == '\0') {
2659 /* bogus format "%.123" => go backward, f points to "3" */
2660 f--;
2661 }
2662
2663 /* Handle %ld, %lu, %lld and %llu. */
2664 longflag = 0;
2665 longlongflag = 0;
2666 size_tflag = 0;
2667 if (*f == 'l') {
2668 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
2669 longflag = 1;
2670 ++f;
2671 }
2672 else if (f[1] == 'l' &&
2673 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
2674 longlongflag = 1;
2675 f += 2;
2676 }
2677 }
2678 /* handle the size_t flag. */
2679 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
2680 size_tflag = 1;
2681 ++f;
2682 }
2683
2684 if (f[1] == '\0')
2685 writer->overallocate = 0;
2686
2687 switch (*f) {
2688 case 'c':
2689 {
2690 int ordinal = va_arg(*vargs, int);
2691 if (ordinal < 0 || ordinal > MAX_UNICODE) {
2692 PyErr_SetString(PyExc_OverflowError,
2693 "character argument not in range(0x110000)");
2694 return NULL;
2695 }
2696 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2697 return NULL;
2698 break;
2699 }
2700
2701 case 'i':
2702 case 'd':
2703 case 'u':
2704 case 'x':
2705 {
2706 /* used by sprintf */
2707 char buffer[MAX_LONG_LONG_CHARS];
2708 Py_ssize_t arglen;
2709
2710 if (*f == 'u') {
2711 if (longflag)
2712 len = sprintf(buffer, "%lu",
2713 va_arg(*vargs, unsigned long));
2714 else if (longlongflag)
2715 len = sprintf(buffer, "%llu",
2716 va_arg(*vargs, unsigned long long));
2717 else if (size_tflag)
2718 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
2719 va_arg(*vargs, size_t));
2720 else
2721 len = sprintf(buffer, "%u",
2722 va_arg(*vargs, unsigned int));
2723 }
2724 else if (*f == 'x') {
2725 len = sprintf(buffer, "%x", va_arg(*vargs, int));
2726 }
2727 else {
2728 if (longflag)
2729 len = sprintf(buffer, "%li",
2730 va_arg(*vargs, long));
2731 else if (longlongflag)
2732 len = sprintf(buffer, "%lli",
2733 va_arg(*vargs, long long));
2734 else if (size_tflag)
2735 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
2736 va_arg(*vargs, Py_ssize_t));
2737 else
2738 len = sprintf(buffer, "%i",
2739 va_arg(*vargs, int));
2740 }
2741 assert(len >= 0);
2742
2743 if (precision < len)
2744 precision = len;
2745
2746 arglen = Py_MAX(precision, width);
2747 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2748 return NULL;
2749
2750 if (width > precision) {
2751 Py_UCS4 fillchar;
2752 fill = width - precision;
2753 fillchar = zeropad?'0':' ';
2754 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2755 return NULL;
2756 writer->pos += fill;
2757 }
2758 if (precision > len) {
2759 fill = precision - len;
2760 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2761 return NULL;
2762 writer->pos += fill;
2763 }
2764
2765 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2766 return NULL;
2767 break;
2768 }
2769
2770 case 'p':
2771 {
2772 char number[MAX_LONG_LONG_CHARS];
2773
2774 len = sprintf(number, "%p", va_arg(*vargs, void*));
2775 assert(len >= 0);
2776
2777 /* %p is ill-defined: ensure leading 0x. */
2778 if (number[1] == 'X')
2779 number[1] = 'x';
2780 else if (number[1] != 'x') {
2781 memmove(number + 2, number,
2782 strlen(number) + 1);
2783 number[0] = '0';
2784 number[1] = 'x';
2785 len += 2;
2786 }
2787
2788 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2789 return NULL;
2790 break;
2791 }
2792
2793 case 's':
2794 {
2795 /* UTF-8 */
2796 const char *s = va_arg(*vargs, const char*);
2797 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
2798 return NULL;
2799 break;
2800 }
2801
2802 case 'U':
2803 {
2804 PyObject *obj = va_arg(*vargs, PyObject *);
2805 assert(obj && _PyUnicode_CHECK(obj));
2806
2807 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2808 return NULL;
2809 break;
2810 }
2811
2812 case 'V':
2813 {
2814 PyObject *obj = va_arg(*vargs, PyObject *);
2815 const char *str = va_arg(*vargs, const char *);
2816 if (obj) {
2817 assert(_PyUnicode_CHECK(obj));
2818 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2819 return NULL;
2820 }
2821 else {
2822 assert(str != NULL);
2823 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
2824 return NULL;
2825 }
2826 break;
2827 }
2828
2829 case 'S':
2830 {
2831 PyObject *obj = va_arg(*vargs, PyObject *);
2832 PyObject *str;
2833 assert(obj);
2834 str = PyObject_Str(obj);
2835 if (!str)
2836 return NULL;
2837 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
2838 Py_DECREF(str);
2839 return NULL;
2840 }
2841 Py_DECREF(str);
2842 break;
2843 }
2844
2845 case 'R':
2846 {
2847 PyObject *obj = va_arg(*vargs, PyObject *);
2848 PyObject *repr;
2849 assert(obj);
2850 repr = PyObject_Repr(obj);
2851 if (!repr)
2852 return NULL;
2853 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
2854 Py_DECREF(repr);
2855 return NULL;
2856 }
2857 Py_DECREF(repr);
2858 break;
2859 }
2860
2861 case 'A':
2862 {
2863 PyObject *obj = va_arg(*vargs, PyObject *);
2864 PyObject *ascii;
2865 assert(obj);
2866 ascii = PyObject_ASCII(obj);
2867 if (!ascii)
2868 return NULL;
2869 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
2870 Py_DECREF(ascii);
2871 return NULL;
2872 }
2873 Py_DECREF(ascii);
2874 break;
2875 }
2876
2877 case '%':
2878 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2879 return NULL;
2880 break;
2881
2882 default:
2883 /* if we stumble upon an unknown formatting code, copy the rest
2884 of the format string to the output string. (we cannot just
2885 skip the code, since there's no way to know what's in the
2886 argument list) */
2887 len = strlen(p);
2888 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
2889 return NULL;
2890 f = p+len;
2891 return f;
2892 }
2893
2894 f++;
2895 return f;
2896 }
2897
2898 PyObject *
PyUnicode_FromFormatV(const char * format,va_list vargs)2899 PyUnicode_FromFormatV(const char *format, va_list vargs)
2900 {
2901 va_list vargs2;
2902 const char *f;
2903 _PyUnicodeWriter writer;
2904
2905 _PyUnicodeWriter_Init(&writer);
2906 writer.min_length = strlen(format) + 100;
2907 writer.overallocate = 1;
2908
2909 // Copy varags to be able to pass a reference to a subfunction.
2910 va_copy(vargs2, vargs);
2911
2912 for (f = format; *f; ) {
2913 if (*f == '%') {
2914 f = unicode_fromformat_arg(&writer, f, &vargs2);
2915 if (f == NULL)
2916 goto fail;
2917 }
2918 else {
2919 const char *p;
2920 Py_ssize_t len;
2921
2922 p = f;
2923 do
2924 {
2925 if ((unsigned char)*p > 127) {
2926 PyErr_Format(PyExc_ValueError,
2927 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2928 "string, got a non-ASCII byte: 0x%02x",
2929 (unsigned char)*p);
2930 goto fail;
2931 }
2932 p++;
2933 }
2934 while (*p != '\0' && *p != '%');
2935 len = p - f;
2936
2937 if (*p == '\0')
2938 writer.overallocate = 0;
2939
2940 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
2941 goto fail;
2942
2943 f = p;
2944 }
2945 }
2946 va_end(vargs2);
2947 return _PyUnicodeWriter_Finish(&writer);
2948
2949 fail:
2950 va_end(vargs2);
2951 _PyUnicodeWriter_Dealloc(&writer);
2952 return NULL;
2953 }
2954
2955 PyObject *
PyUnicode_FromFormat(const char * format,...)2956 PyUnicode_FromFormat(const char *format, ...)
2957 {
2958 PyObject* ret;
2959 va_list vargs;
2960
2961 #ifdef HAVE_STDARG_PROTOTYPES
2962 va_start(vargs, format);
2963 #else
2964 va_start(vargs);
2965 #endif
2966 ret = PyUnicode_FromFormatV(format, vargs);
2967 va_end(vargs);
2968 return ret;
2969 }
2970
2971 #ifdef HAVE_WCHAR_H
2972
2973 /* Convert a Unicode object to a wide character string.
2974
2975 - If w is NULL: return the number of wide characters (including the null
2976 character) required to convert the unicode object. Ignore size argument.
2977
2978 - Otherwise: return the number of wide characters (excluding the null
2979 character) written into w. Write at most size wide characters (including
2980 the null character). */
2981 Py_ssize_t
PyUnicode_AsWideChar(PyObject * unicode,wchar_t * w,Py_ssize_t size)2982 PyUnicode_AsWideChar(PyObject *unicode,
2983 wchar_t *w,
2984 Py_ssize_t size)
2985 {
2986 Py_ssize_t res;
2987 const wchar_t *wstr;
2988
2989 if (unicode == NULL) {
2990 PyErr_BadInternalCall();
2991 return -1;
2992 }
2993 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2994 if (wstr == NULL)
2995 return -1;
2996
2997 if (w != NULL) {
2998 if (size > res)
2999 size = res + 1;
3000 else
3001 res = size;
3002 memcpy(w, wstr, size * sizeof(wchar_t));
3003 return res;
3004 }
3005 else
3006 return res + 1;
3007 }
3008
3009 wchar_t*
PyUnicode_AsWideCharString(PyObject * unicode,Py_ssize_t * size)3010 PyUnicode_AsWideCharString(PyObject *unicode,
3011 Py_ssize_t *size)
3012 {
3013 const wchar_t *wstr;
3014 wchar_t *buffer;
3015 Py_ssize_t buflen;
3016
3017 if (unicode == NULL) {
3018 PyErr_BadInternalCall();
3019 return NULL;
3020 }
3021
3022 wstr = PyUnicode_AsUnicodeAndSize(unicode, &buflen);
3023 if (wstr == NULL) {
3024 return NULL;
3025 }
3026 if (size == NULL && wcslen(wstr) != (size_t)buflen) {
3027 PyErr_SetString(PyExc_ValueError,
3028 "embedded null character");
3029 return NULL;
3030 }
3031
3032 buffer = PyMem_NEW(wchar_t, buflen + 1);
3033 if (buffer == NULL) {
3034 PyErr_NoMemory();
3035 return NULL;
3036 }
3037 memcpy(buffer, wstr, (buflen + 1) * sizeof(wchar_t));
3038 if (size != NULL)
3039 *size = buflen;
3040 return buffer;
3041 }
3042
3043 #endif /* HAVE_WCHAR_H */
3044
3045 PyObject *
PyUnicode_FromOrdinal(int ordinal)3046 PyUnicode_FromOrdinal(int ordinal)
3047 {
3048 if (ordinal < 0 || ordinal > MAX_UNICODE) {
3049 PyErr_SetString(PyExc_ValueError,
3050 "chr() arg not in range(0x110000)");
3051 return NULL;
3052 }
3053
3054 return unicode_char((Py_UCS4)ordinal);
3055 }
3056
3057 PyObject *
PyUnicode_FromObject(PyObject * obj)3058 PyUnicode_FromObject(PyObject *obj)
3059 {
3060 /* XXX Perhaps we should make this API an alias of
3061 PyObject_Str() instead ?! */
3062 if (PyUnicode_CheckExact(obj)) {
3063 if (PyUnicode_READY(obj) == -1)
3064 return NULL;
3065 Py_INCREF(obj);
3066 return obj;
3067 }
3068 if (PyUnicode_Check(obj)) {
3069 /* For a Unicode subtype that's not a Unicode object,
3070 return a true Unicode object with the same data. */
3071 return _PyUnicode_Copy(obj);
3072 }
3073 PyErr_Format(PyExc_TypeError,
3074 "Can't convert '%.100s' object to str implicitly",
3075 Py_TYPE(obj)->tp_name);
3076 return NULL;
3077 }
3078
3079 PyObject *
PyUnicode_FromEncodedObject(PyObject * obj,const char * encoding,const char * errors)3080 PyUnicode_FromEncodedObject(PyObject *obj,
3081 const char *encoding,
3082 const char *errors)
3083 {
3084 Py_buffer buffer;
3085 PyObject *v;
3086
3087 if (obj == NULL) {
3088 PyErr_BadInternalCall();
3089 return NULL;
3090 }
3091
3092 /* Decoding bytes objects is the most common case and should be fast */
3093 if (PyBytes_Check(obj)) {
3094 if (PyBytes_GET_SIZE(obj) == 0)
3095 _Py_RETURN_UNICODE_EMPTY();
3096 v = PyUnicode_Decode(
3097 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3098 encoding, errors);
3099 return v;
3100 }
3101
3102 if (PyUnicode_Check(obj)) {
3103 PyErr_SetString(PyExc_TypeError,
3104 "decoding str is not supported");
3105 return NULL;
3106 }
3107
3108 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3109 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3110 PyErr_Format(PyExc_TypeError,
3111 "decoding to str: need a bytes-like object, %.80s found",
3112 Py_TYPE(obj)->tp_name);
3113 return NULL;
3114 }
3115
3116 if (buffer.len == 0) {
3117 PyBuffer_Release(&buffer);
3118 _Py_RETURN_UNICODE_EMPTY();
3119 }
3120
3121 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3122 PyBuffer_Release(&buffer);
3123 return v;
3124 }
3125
/* Normalize an encoding name: similar to encodings.normalize_encoding(),
   but also convert to lowercase.  Alphanumeric characters and '.' are
   kept (lowercased); every run of other characters between two kept
   characters becomes a single '_', and such runs at the start or end of
   the name are dropped.  The result is written to `lower`, a buffer of
   `lower_len` bytes, and is null-terminated on success.
   Return 1 on success, or 0 on error (encoding is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst = lower;
    char *dst_end = &lower[lower_len - 1];
    int pending_separator = 0;

    assert(encoding != NULL);

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!(Py_ISALNUM(c) || c == '.')) {
            pending_separator = 1;
            continue;
        }

        /* Emit one '_' for the skipped run, unless nothing has been
           written yet. */
        if (pending_separator && dst != lower) {
            if (dst == dst_end) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_separator = 0;

        if (dst == dst_end) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }

    *dst = '\0';
    return 1;
}
3174
3175 PyObject *
PyUnicode_Decode(const char * s,Py_ssize_t size,const char * encoding,const char * errors)3176 PyUnicode_Decode(const char *s,
3177 Py_ssize_t size,
3178 const char *encoding,
3179 const char *errors)
3180 {
3181 PyObject *buffer = NULL, *unicode;
3182 Py_buffer info;
3183 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3184
3185 if (encoding == NULL) {
3186 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3187 }
3188
3189 /* Shortcuts for common default encodings */
3190 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3191 char *lower = buflower;
3192
3193 /* Fast paths */
3194 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3195 lower += 3;
3196 if (*lower == '_') {
3197 /* Match "utf8" and "utf_8" */
3198 lower++;
3199 }
3200
3201 if (lower[0] == '8' && lower[1] == 0) {
3202 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3203 }
3204 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3205 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3206 }
3207 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3208 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3209 }
3210 }
3211 else {
3212 if (strcmp(lower, "ascii") == 0
3213 || strcmp(lower, "us_ascii") == 0) {
3214 return PyUnicode_DecodeASCII(s, size, errors);
3215 }
3216 #ifdef MS_WINDOWS
3217 else if (strcmp(lower, "mbcs") == 0) {
3218 return PyUnicode_DecodeMBCS(s, size, errors);
3219 }
3220 #endif
3221 else if (strcmp(lower, "latin1") == 0
3222 || strcmp(lower, "latin_1") == 0
3223 || strcmp(lower, "iso_8859_1") == 0
3224 || strcmp(lower, "iso8859_1") == 0) {
3225 return PyUnicode_DecodeLatin1(s, size, errors);
3226 }
3227 }
3228 }
3229
3230 /* Decode via the codec registry */
3231 buffer = NULL;
3232 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3233 goto onError;
3234 buffer = PyMemoryView_FromBuffer(&info);
3235 if (buffer == NULL)
3236 goto onError;
3237 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3238 if (unicode == NULL)
3239 goto onError;
3240 if (!PyUnicode_Check(unicode)) {
3241 PyErr_Format(PyExc_TypeError,
3242 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3243 "use codecs.decode() to decode to arbitrary types",
3244 encoding,
3245 Py_TYPE(unicode)->tp_name);
3246 Py_DECREF(unicode);
3247 goto onError;
3248 }
3249 Py_DECREF(buffer);
3250 return unicode_result(unicode);
3251
3252 onError:
3253 Py_XDECREF(buffer);
3254 return NULL;
3255 }
3256
3257 PyObject *
PyUnicode_AsDecodedObject(PyObject * unicode,const char * encoding,const char * errors)3258 PyUnicode_AsDecodedObject(PyObject *unicode,
3259 const char *encoding,
3260 const char *errors)
3261 {
3262 if (!PyUnicode_Check(unicode)) {
3263 PyErr_BadArgument();
3264 return NULL;
3265 }
3266
3267 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3268 "PyUnicode_AsDecodedObject() is deprecated; "
3269 "use PyCodec_Decode() to decode from str", 1) < 0)
3270 return NULL;
3271
3272 if (encoding == NULL)
3273 encoding = PyUnicode_GetDefaultEncoding();
3274
3275 /* Decode via the codec registry */
3276 return PyCodec_Decode(unicode, encoding, errors);
3277 }
3278
3279 PyObject *
PyUnicode_AsDecodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3280 PyUnicode_AsDecodedUnicode(PyObject *unicode,
3281 const char *encoding,
3282 const char *errors)
3283 {
3284 PyObject *v;
3285
3286 if (!PyUnicode_Check(unicode)) {
3287 PyErr_BadArgument();
3288 goto onError;
3289 }
3290
3291 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3292 "PyUnicode_AsDecodedUnicode() is deprecated; "
3293 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3294 return NULL;
3295
3296 if (encoding == NULL)
3297 encoding = PyUnicode_GetDefaultEncoding();
3298
3299 /* Decode via the codec registry */
3300 v = PyCodec_Decode(unicode, encoding, errors);
3301 if (v == NULL)
3302 goto onError;
3303 if (!PyUnicode_Check(v)) {
3304 PyErr_Format(PyExc_TypeError,
3305 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3306 "use codecs.decode() to decode to arbitrary types",
3307 encoding,
3308 Py_TYPE(unicode)->tp_name);
3309 Py_DECREF(v);
3310 goto onError;
3311 }
3312 return unicode_result(v);
3313
3314 onError:
3315 return NULL;
3316 }
3317
3318 PyObject *
PyUnicode_Encode(const Py_UNICODE * s,Py_ssize_t size,const char * encoding,const char * errors)3319 PyUnicode_Encode(const Py_UNICODE *s,
3320 Py_ssize_t size,
3321 const char *encoding,
3322 const char *errors)
3323 {
3324 PyObject *v, *unicode;
3325
3326 unicode = PyUnicode_FromWideChar(s, size);
3327 if (unicode == NULL)
3328 return NULL;
3329 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3330 Py_DECREF(unicode);
3331 return v;
3332 }
3333
3334 PyObject *
PyUnicode_AsEncodedObject(PyObject * unicode,const char * encoding,const char * errors)3335 PyUnicode_AsEncodedObject(PyObject *unicode,
3336 const char *encoding,
3337 const char *errors)
3338 {
3339 PyObject *v;
3340
3341 if (!PyUnicode_Check(unicode)) {
3342 PyErr_BadArgument();
3343 goto onError;
3344 }
3345
3346 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3347 "PyUnicode_AsEncodedObject() is deprecated; "
3348 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3349 "or PyCodec_Encode() for generic encoding", 1) < 0)
3350 return NULL;
3351
3352 if (encoding == NULL)
3353 encoding = PyUnicode_GetDefaultEncoding();
3354
3355 /* Encode via the codec registry */
3356 v = PyCodec_Encode(unicode, encoding, errors);
3357 if (v == NULL)
3358 goto onError;
3359 return v;
3360
3361 onError:
3362 return NULL;
3363 }
3364
3365 static int
locale_error_handler(const char * errors,int * surrogateescape)3366 locale_error_handler(const char *errors, int *surrogateescape)
3367 {
3368 _Py_error_handler error_handler = get_error_handler(errors);
3369 switch (error_handler)
3370 {
3371 case _Py_ERROR_STRICT:
3372 *surrogateescape = 0;
3373 return 0;
3374 case _Py_ERROR_SURROGATEESCAPE:
3375 *surrogateescape = 1;
3376 return 0;
3377 default:
3378 PyErr_Format(PyExc_ValueError,
3379 "only 'strict' and 'surrogateescape' error handlers "
3380 "are supported, not '%s'",
3381 errors);
3382 return -1;
3383 }
3384 }
3385
3386 static PyObject *
unicode_encode_locale(PyObject * unicode,const char * errors,int current_locale)3387 unicode_encode_locale(PyObject *unicode, const char *errors,
3388 int current_locale)
3389 {
3390 int surrogateescape;
3391 if (locale_error_handler(errors, &surrogateescape) < 0)
3392 return NULL;
3393
3394 Py_ssize_t wlen;
3395 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3396 if (wstr == NULL) {
3397 return NULL;
3398 }
3399
3400 if ((size_t)wlen != wcslen(wstr)) {
3401 PyErr_SetString(PyExc_ValueError, "embedded null character");
3402 PyMem_Free(wstr);
3403 return NULL;
3404 }
3405
3406 char *str;
3407 size_t error_pos;
3408 const char *reason;
3409 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3410 current_locale, surrogateescape);
3411 PyMem_Free(wstr);
3412
3413 if (res != 0) {
3414 if (res == -2) {
3415 PyObject *exc;
3416 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3417 "locale", unicode,
3418 (Py_ssize_t)error_pos,
3419 (Py_ssize_t)(error_pos+1),
3420 reason);
3421 if (exc != NULL) {
3422 PyCodec_StrictErrors(exc);
3423 Py_DECREF(exc);
3424 }
3425 }
3426 else {
3427 PyErr_NoMemory();
3428 }
3429 return NULL;
3430 }
3431
3432 PyObject *bytes = PyBytes_FromString(str);
3433 PyMem_RawFree(str);
3434 return bytes;
3435 }
3436
3437 PyObject *
PyUnicode_EncodeLocale(PyObject * unicode,const char * errors)3438 PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3439 {
3440 return unicode_encode_locale(unicode, errors, 1);
3441 }
3442
3443 PyObject *
PyUnicode_EncodeFSDefault(PyObject * unicode)3444 PyUnicode_EncodeFSDefault(PyObject *unicode)
3445 {
3446 #if defined(__APPLE__)
3447 return _PyUnicode_AsUTF8String(unicode, Py_FileSystemDefaultEncodeErrors);
3448 #else
3449 PyInterpreterState *interp = PyThreadState_GET()->interp;
3450 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3451 cannot use it to encode and decode filenames before it is loaded. Load
3452 the Python codec requires to encode at least its own filename. Use the C
3453 version of the locale codec until the codec registry is initialized and
3454 the Python codec is loaded.
3455
3456 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3457 cannot only rely on it: check also interp->fscodec_initialized for
3458 subinterpreters. */
3459 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3460 return PyUnicode_AsEncodedString(unicode,
3461 Py_FileSystemDefaultEncoding,
3462 Py_FileSystemDefaultEncodeErrors);
3463 }
3464 else {
3465 return unicode_encode_locale(unicode,
3466 Py_FileSystemDefaultEncodeErrors, 0);
3467 }
3468 #endif
3469 }
3470
/* Encode the str object `unicode` using `encoding` and `errors`.

   unicode  -- must be a str object, otherwise PyErr_BadArgument() is raised.
   encoding -- codec name; NULL goes straight to the UTF-8 encoder.
   errors   -- error handler name, forwarded to whichever encoder is used.

   A few common encoding names are recognised after normalization and
   dispatched directly to the C encoders (utf-8/16/32, ascii, latin-1 and,
   on MS_WINDOWS, mbcs).  Everything else goes through the codec registry.

   For the registry path: a bytes result is returned as-is; a bytearray
   result triggers a RuntimeWarning and is copied into a new bytes object;
   any other result type raises TypeError.  Returns NULL with an exception
   set on failure. */
PyObject *
PyUnicode_AsEncodedString(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    PyObject *v;
    char buflower[11];   /* sizeof("iso_8859_1") == 11 (10 characters plus
                            the terminating NUL), longest shortcut */

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL) {
        return _PyUnicode_AsUTF8String(unicode, errors);
    }

    /* Shortcuts for common default encodings.  If the name cannot be
       normalized into buflower, the shortcuts are skipped and the codec
       registry below is used. */
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
        char *lower = buflower;

        /* Fast paths */
        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
            lower += 3;
            if (*lower == '_') {
                /* Match "utf8" and "utf_8" */
                lower++;
            }

            /* Only the exact suffixes "8", "16" and "32" are shortcuts;
               any other "utf..." name falls through to the registry. */
            if (lower[0] == '8' && lower[1] == 0) {
                return _PyUnicode_AsUTF8String(unicode, errors);
            }
            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
            }
            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
            }
        }
        else {
            if (strcmp(lower, "ascii") == 0
                || strcmp(lower, "us_ascii") == 0) {
                return _PyUnicode_AsASCIIString(unicode, errors);
            }
#ifdef MS_WINDOWS
            else if (strcmp(lower, "mbcs") == 0) {
                return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
            }
#endif
            else if (strcmp(lower, "latin1") == 0 ||
                     strcmp(lower, "latin_1") == 0 ||
                     strcmp(lower, "iso_8859_1") == 0 ||
                     strcmp(lower, "iso8859_1") == 0) {
                return _PyUnicode_AsLatin1String(unicode, errors);
            }
        }
    }

    /* Encode via the codec registry */
    v = _PyCodec_EncodeText(unicode, encoding, errors);
    if (v == NULL)
        return NULL;

    /* The normal path */
    if (PyBytes_Check(v))
        return v;

    /* If the codec returns a buffer, raise a warning and convert to bytes */
    if (PyByteArray_Check(v)) {
        int error;
        PyObject *b;

        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
            "encoder %s returned bytearray instead of bytes; "
            "use codecs.encode() to encode to arbitrary types",
            encoding);
        if (error) {
            /* The warning was turned into an exception. */
            Py_DECREF(v);
            return NULL;
        }

        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
                                      PyByteArray_GET_SIZE(v));
        Py_DECREF(v);
        return b;
    }

    /* Neither bytes nor bytearray: reject the codec's result. */
    PyErr_Format(PyExc_TypeError,
                 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
                 "use codecs.encode() to encode to arbitrary types",
                 encoding,
                 Py_TYPE(v)->tp_name);
    Py_DECREF(v);
    return NULL;
}
3566
3567 PyObject *
PyUnicode_AsEncodedUnicode(PyObject * unicode,const char * encoding,const char * errors)3568 PyUnicode_AsEncodedUnicode(PyObject *unicode,
3569 const char *encoding,
3570 const char *errors)
3571 {
3572 PyObject *v;
3573
3574 if (!PyUnicode_Check(unicode)) {
3575 PyErr_BadArgument();
3576 goto onError;
3577 }
3578
3579 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3580 "PyUnicode_AsEncodedUnicode() is deprecated; "
3581 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3582 return NULL;
3583
3584 if (encoding == NULL)
3585 encoding = PyUnicode_GetDefaultEncoding();
3586
3587 /* Encode via the codec registry */
3588 v = PyCodec_Encode(unicode, encoding, errors);
3589 if (v == NULL)
3590 goto onError;
3591 if (!PyUnicode_Check(v)) {
3592 PyErr_Format(PyExc_TypeError,
3593 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3594 "use codecs.encode() to encode to arbitrary types",
3595 encoding,
3596 Py_TYPE(v)->tp_name);
3597 Py_DECREF(v);
3598 goto onError;
3599 }
3600 return v;
3601
3602 onError:
3603 return NULL;
3604 }
3605
3606 static PyObject*
unicode_decode_locale(const char * str,Py_ssize_t len,const char * errors,int current_locale)3607 unicode_decode_locale(const char *str, Py_ssize_t len, const char *errors,
3608 int current_locale)
3609 {
3610 int surrogateescape;
3611 if (locale_error_handler(errors, &surrogateescape) < 0)
3612 return NULL;
3613
3614 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3615 PyErr_SetString(PyExc_ValueError, "embedded null byte");
3616 return NULL;
3617 }
3618
3619 wchar_t *wstr;
3620 size_t wlen;
3621 const char *reason;
3622 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3623 current_locale, surrogateescape);
3624 if (res != 0) {
3625 if (res == -2) {
3626 PyObject *exc;
3627 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3628 "locale", str, len,
3629 (Py_ssize_t)wlen,
3630 (Py_ssize_t)(wlen + 1),
3631 reason);
3632 if (exc != NULL) {
3633 PyCodec_StrictErrors(exc);
3634 Py_DECREF(exc);
3635 }
3636 }
3637 else {
3638 PyErr_NoMemory();
3639 }
3640 return NULL;
3641 }
3642
3643 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3644 PyMem_RawFree(wstr);
3645 return unicode;
3646 }
3647
3648 PyObject*
PyUnicode_DecodeLocaleAndSize(const char * str,Py_ssize_t len,const char * errors)3649 PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3650 const char *errors)
3651 {
3652 return unicode_decode_locale(str, len, errors, 1);
3653 }
3654
3655 PyObject*
PyUnicode_DecodeLocale(const char * str,const char * errors)3656 PyUnicode_DecodeLocale(const char *str, const char *errors)
3657 {
3658 Py_ssize_t size = (Py_ssize_t)strlen(str);
3659 return unicode_decode_locale(str, size, errors, 1);
3660 }
3661
3662
3663 PyObject*
PyUnicode_DecodeFSDefault(const char * s)3664 PyUnicode_DecodeFSDefault(const char *s) {
3665 Py_ssize_t size = (Py_ssize_t)strlen(s);
3666 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3667 }
3668
3669 PyObject*
PyUnicode_DecodeFSDefaultAndSize(const char * s,Py_ssize_t size)3670 PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3671 {
3672 #if defined(__APPLE__)
3673 return PyUnicode_DecodeUTF8Stateful(s, size, Py_FileSystemDefaultEncodeErrors, NULL);
3674 #else
3675 PyInterpreterState *interp = PyThreadState_GET()->interp;
3676 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3677 cannot use it to encode and decode filenames before it is loaded. Load
3678 the Python codec requires to encode at least its own filename. Use the C
3679 version of the locale codec until the codec registry is initialized and
3680 the Python codec is loaded.
3681
3682 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3683 cannot only rely on it: check also interp->fscodec_initialized for
3684 subinterpreters. */
3685 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3686 return PyUnicode_Decode(s, size,
3687 Py_FileSystemDefaultEncoding,
3688 Py_FileSystemDefaultEncodeErrors);
3689 }
3690 else {
3691 return unicode_decode_locale(s, size,
3692 Py_FileSystemDefaultEncodeErrors, 0);
3693 }
3694 #endif
3695 }
3696
3697
3698 int
PyUnicode_FSConverter(PyObject * arg,void * addr)3699 PyUnicode_FSConverter(PyObject* arg, void* addr)
3700 {
3701 PyObject *path = NULL;
3702 PyObject *output = NULL;
3703 Py_ssize_t size;
3704 void *data;
3705 if (arg == NULL) {
3706 Py_DECREF(*(PyObject**)addr);
3707 *(PyObject**)addr = NULL;
3708 return 1;
3709 }
3710 path = PyOS_FSPath(arg);
3711 if (path == NULL) {
3712 return 0;
3713 }
3714 if (PyBytes_Check(path)) {
3715 output = path;
3716 }
3717 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3718 output = PyUnicode_EncodeFSDefault(path);
3719 Py_DECREF(path);
3720 if (!output) {
3721 return 0;
3722 }
3723 assert(PyBytes_Check(output));
3724 }
3725
3726 size = PyBytes_GET_SIZE(output);
3727 data = PyBytes_AS_STRING(output);
3728 if ((size_t)size != strlen(data)) {
3729 PyErr_SetString(PyExc_ValueError, "embedded null byte");
3730 Py_DECREF(output);
3731 return 0;
3732 }
3733 *(PyObject**)addr = output;
3734 return Py_CLEANUP_SUPPORTED;
3735 }
3736
3737
/* Converter that turns a path-like argument into a str object.

   arg  -- the object to convert, or NULL for the cleanup call.
   addr -- really a PyObject **; receives the resulting str object.

   Cleanup call (arg == NULL): drops the reference stored in *addr, resets
   it to NULL and returns 1.
   Conversion: objects supporting the buffer protocol are used directly,
   everything else goes through PyOS_FSPath().  A str is used as-is; bytes
   (or, with a DeprecationWarning, any other buffer object) are converted
   to bytes and decoded with PyUnicode_DecodeFSDefaultAndSize().  Any other
   type raises TypeError, and a result containing an embedded null
   character raises ValueError.  Returns Py_CLEANUP_SUPPORTED on success and
   0 on failure. */
int
PyUnicode_FSDecoder(PyObject* arg, void* addr)
{
    int is_buffer = 0;
    PyObject *path = NULL;
    PyObject *output = NULL;
    if (arg == NULL) {
        Py_DECREF(*(PyObject**)addr);
        *(PyObject**)addr = NULL;
        return 1;
    }

    is_buffer = PyObject_CheckBuffer(arg);
    if (!is_buffer) {
        path = PyOS_FSPath(arg);
        if (path == NULL) {
            return 0;
        }
    }
    else {
        /* Take our own reference so that both branches own `path`. */
        path = arg;
        Py_INCREF(arg);
    }

    if (PyUnicode_Check(path)) {
        if (PyUnicode_READY(path) == -1) {
            Py_DECREF(path);
            return 0;
        }
        /* Ownership of the reference moves from `path` to `output`. */
        output = path;
    }
    else if (PyBytes_Check(path) || is_buffer) {
        PyObject *path_bytes = NULL;

        /* Non-bytes buffer objects are still accepted, but deprecated.
           A non-zero result means the warning was raised as an error. */
        if (!PyBytes_Check(path) &&
            PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
            "path should be string, bytes, or os.PathLike, not %.200s",
            Py_TYPE(arg)->tp_name)) {
            Py_DECREF(path);
            return 0;
        }
        path_bytes = PyBytes_FromObject(path);
        Py_DECREF(path);
        if (!path_bytes) {
            return 0;
        }
        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
                                                  PyBytes_GET_SIZE(path_bytes));
        Py_DECREF(path_bytes);
        if (!output) {
            return 0;
        }
    }
    else {
        PyErr_Format(PyExc_TypeError,
                     "path should be string, bytes, or os.PathLike, not %.200s",
                     Py_TYPE(arg)->tp_name);
        Py_DECREF(path);
        return 0;
    }
    if (PyUnicode_READY(output) == -1) {
        Py_DECREF(output);
        return 0;
    }
    /* Reject results that contain a null character anywhere. */
    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
        PyErr_SetString(PyExc_ValueError, "embedded null character");
        Py_DECREF(output);
        return 0;
    }
    *(PyObject**)addr = output;
    return Py_CLEANUP_SUPPORTED;
}
3811
3812
3813 const char *
PyUnicode_AsUTF8AndSize(PyObject * unicode,Py_ssize_t * psize)3814 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3815 {
3816 PyObject *bytes;
3817
3818 if (!PyUnicode_Check(unicode)) {
3819 PyErr_BadArgument();
3820 return NULL;
3821 }
3822 if (PyUnicode_READY(unicode) == -1)
3823 return NULL;
3824
3825 if (PyUnicode_UTF8(unicode) == NULL) {
3826 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3827 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
3828 if (bytes == NULL)
3829 return NULL;
3830 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3831 if (_PyUnicode_UTF8(unicode) == NULL) {
3832 PyErr_NoMemory();
3833 Py_DECREF(bytes);
3834 return NULL;
3835 }
3836 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3837 memcpy(_PyUnicode_UTF8(unicode),
3838 PyBytes_AS_STRING(bytes),
3839 _PyUnicode_UTF8_LENGTH(unicode) + 1);
3840 Py_DECREF(bytes);
3841 }
3842
3843 if (psize)
3844 *psize = PyUnicode_UTF8_LENGTH(unicode);
3845 return PyUnicode_UTF8(unicode);
3846 }
3847
3848 const char *
PyUnicode_AsUTF8(PyObject * unicode)3849 PyUnicode_AsUTF8(PyObject *unicode)
3850 {
3851 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3852 }
3853
/* Return the wchar_t (wstr) representation of `unicode`.

   If the object has no wstr buffer yet, one is allocated, filled from the
   object's character data (widening 1-byte/2-byte data, or splitting 4-byte
   code points above 0xFFFF into surrogate pairs when wchar_t is 2 bytes),
   null-terminated, and stored in the object.  When `size` is not NULL it
   receives PyUnicode_WSTR_LENGTH(unicode).

   Returns NULL with an exception set if `unicode` is not a str object or if
   memory cannot be allocated. */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    const unsigned char *one_byte;
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (_PyUnicode_WSTR(unicode) == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(unicode) != 0);
        assert(PyUnicode_IS_READY(unicode));

        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            /* First pass: count the code points that do not fit into one
               16-bit wchar_t and therefore need a surrogate pair. */
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
            num_surrogates = 0;

            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* One wchar_t per code point, one more per surrogate pair, plus
               the terminating null. */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;

            /* Second pass: copy, expanding where needed. */
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    assert(*four_bytes <= MAX_UNICODE);
                    /* encode surrogate pair in this case */
                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
                    *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
                }
                else
                    *w = *four_bytes;

                /* Sanity check: writing past the counted length would mean
                   the two passes disagree. */
                if (w > wchar_end) {
                    Py_UNREACHABLE();
                }
            }
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* Guard the size computation below against overflow. */
            if ((size_t)_PyUnicode_LENGTH(unicode) >
                    PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(unicode) + 1));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* The wstr length is only stored for objects that are not
               compact ASCII. */
            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_LENGTH(unicode);

            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
                /* Widen each byte to one wchar_t. */
                one_byte = PyUnicode_1BYTE_DATA(unicode);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                /* Widen each 16-bit unit to one wchar_t. */
                two_bytes = PyUnicode_2BYTE_DATA(unicode);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                PyObject_FREE(_PyUnicode_WSTR(unicode));
                _PyUnicode_WSTR(unicode) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                Py_UNREACHABLE();
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(unicode);
    return _PyUnicode_WSTR(unicode);
}
3970
3971 Py_UNICODE *
PyUnicode_AsUnicode(PyObject * unicode)3972 PyUnicode_AsUnicode(PyObject *unicode)
3973 {
3974 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
3975 }
3976
3977 const Py_UNICODE *
_PyUnicode_AsUnicode(PyObject * unicode)3978 _PyUnicode_AsUnicode(PyObject *unicode)
3979 {
3980 Py_ssize_t size;
3981 const Py_UNICODE *wstr;
3982
3983 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
3984 if (wstr && wcslen(wstr) != (size_t)size) {
3985 PyErr_SetString(PyExc_ValueError, "embedded null character");
3986 return NULL;
3987 }
3988 return wstr;
3989 }
3990
3991
3992 Py_ssize_t
PyUnicode_GetSize(PyObject * unicode)3993 PyUnicode_GetSize(PyObject *unicode)
3994 {
3995 if (!PyUnicode_Check(unicode)) {
3996 PyErr_BadArgument();
3997 goto onError;
3998 }
3999 if (_PyUnicode_WSTR(unicode) == NULL) {
4000 if (PyUnicode_AsUnicode(unicode) == NULL)
4001 goto onError;
4002 }
4003 return PyUnicode_WSTR_LENGTH(unicode);
4004
4005 onError:
4006 return -1;
4007 }
4008
4009 Py_ssize_t
PyUnicode_GetLength(PyObject * unicode)4010 PyUnicode_GetLength(PyObject *unicode)
4011 {
4012 if (!PyUnicode_Check(unicode)) {
4013 PyErr_BadArgument();
4014 return -1;
4015 }
4016 if (PyUnicode_READY(unicode) == -1)
4017 return -1;
4018 return PyUnicode_GET_LENGTH(unicode);
4019 }
4020
4021 Py_UCS4
PyUnicode_ReadChar(PyObject * unicode,Py_ssize_t index)4022 PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4023 {
4024 void *data;
4025 int kind;
4026
4027 if (!PyUnicode_Check(unicode)) {
4028 PyErr_BadArgument();
4029 return (Py_UCS4)-1;
4030 }
4031 if (PyUnicode_READY(unicode) == -1) {
4032 return (Py_UCS4)-1;
4033 }
4034 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4035 PyErr_SetString(PyExc_IndexError, "string index out of range");
4036 return (Py_UCS4)-1;
4037 }
4038 data = PyUnicode_DATA(unicode);
4039 kind = PyUnicode_KIND(unicode);
4040 return PyUnicode_READ(kind, data, index);
4041 }
4042
4043 int
PyUnicode_WriteChar(PyObject * unicode,Py_ssize_t index,Py_UCS4 ch)4044 PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4045 {
4046 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4047 PyErr_BadArgument();
4048 return -1;
4049 }
4050 assert(PyUnicode_IS_READY(unicode));
4051 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4052 PyErr_SetString(PyExc_IndexError, "string index out of range");
4053 return -1;
4054 }
4055 if (unicode_check_modifiable(unicode))
4056 return -1;
4057 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4058 PyErr_SetString(PyExc_ValueError, "character out of range");
4059 return -1;
4060 }
4061 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4062 index, ch);
4063 return 0;
4064 }
4065
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4071
4072 /* create or adjust a UnicodeDecodeError */
4073 static void
make_decode_exception(PyObject ** exceptionObject,const char * encoding,const char * input,Py_ssize_t length,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)4074 make_decode_exception(PyObject **exceptionObject,
4075 const char *encoding,
4076 const char *input, Py_ssize_t length,
4077 Py_ssize_t startpos, Py_ssize_t endpos,
4078 const char *reason)
4079 {
4080 if (*exceptionObject == NULL) {
4081 *exceptionObject = PyUnicodeDecodeError_Create(
4082 encoding, input, length, startpos, endpos, reason);
4083 }
4084 else {
4085 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4086 goto onError;
4087 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4088 goto onError;
4089 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4090 goto onError;
4091 }
4092 return;
4093
4094 onError:
4095 Py_CLEAR(*exceptionObject);
4096 }
4097
4098 #ifdef MS_WINDOWS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   This variant writes into the wstr buffer of *output (which must be of
   PyUnicode_WCHAR_KIND) at position *outpos.

   errors, errorHandler -- handler name and cached handler object; the
       handler is looked up on first use and stored in *errorHandler.
   encoding, reason     -- used to create or update *exceptionObject.
   input, inend         -- bounds of the input buffer; rewritten to point
       into the exception's object, which the callback may have replaced.
   startinpos, endinpos -- error range; *endinpos is set to the position
       the handler asked to resume at.
   inptr                -- set to the input pointer to resume decoding at.
   output, outpos       -- destination object (resized if needed) and write
       position, advanced by the replacement length.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* PyArg_ParseTuple format; the text after the ';' (offset 3) doubles as
       the TypeError message for a non-tuple result. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position is relative to the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow to at least double the current size when that is larger
           than what is strictly required and cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

    /* overflow deliberately falls through into the common cleanup */
  onError:
    Py_XDECREF(restuple);
    return -1;
}
4209 #endif /* MS_WINDOWS */
4210
/* Error handling callback helper for decoders that write through a
   _PyUnicodeWriter.

   Looks up the error handler (caching it in *errorHandler), creates or
   updates *exceptionObject, calls the handler and validates its
   (str, int) result.  On success the replacement string is written to
   `writer`, *input / *inend are rewritten to point into the exception's
   object (which the callback may have replaced), *endinpos is set to the
   resume position and *inptr to the matching input pointer.

   Returns 0 on success, -1 with an exception set on error. */
static int
unicode_decode_call_errorhandler_writer(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
{
    /* PyArg_ParseTuple format; the text after the ';' (offset 3) doubles as
       the TypeError message for a non-tuple result. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t insize;
    Py_ssize_t newpos;
    Py_ssize_t replen;
    Py_ssize_t remain;
    PyObject *inputobj = NULL;
    int need_to_grow = 0;
    const char *new_inptr;

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    /* Number of input bytes after the error range, measured against the
       old input buffer; must be computed before *input is overwritten. */
    remain = *inend - *input - *endinpos;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position is relative to the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    replen = PyUnicode_GET_LENGTH(repunicode);
    if (replen > 1) {
        /* Only the characters beyond the first one add to the minimum
           length already planned for. */
        writer->min_length += replen - 1;
        need_to_grow = 1;
    }
    new_inptr = *input + newpos;
    if (*inend - new_inptr > remain) {
        /* We don't know the decoding algorithm here so we make the worst
           assumption that one byte decodes to one unicode character.
           If unfortunately one byte could decode to more unicode characters,
           the decoder may write out-of-bound then.  Is it possible for the
           algorithms using this function? */
        writer->min_length += *inend - new_inptr - remain;
        need_to_grow = 1;
    }
    if (need_to_grow) {
        writer->overallocate = 1;
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
            goto onError;
    }
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
        goto onError;

    *endinpos = newpos;
    *inptr = new_inptr;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  onError:
    Py_XDECREF(restuple);
    return -1;
}
4310
4311 /* --- UTF-7 Codec -------------------------------------------------------- */
4312
4313 /* See RFC2152 for details. We encode conservatively and decode liberally. */
4314
/* Base-64 helper macros for the UTF-7 codec.  Every macro here may
 * evaluate its argument more than once, so only pass expressions
 * without side effects. */

/* IS_BASE64(c): non-zero when c is one of the 64 base-64 alphabet
 * characters (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' ||                                        \
     ((c) >= '0' && (c) <= '9') ||                                      \
     ((c) >= 'a' && (c) <= 'z') ||                                      \
     ((c) >= 'A' && (c) <= 'Z'))

/* FROM_BASE64(c): the 6-bit value of the base-64 character c.  The
 * caller must already know that IS_BASE64(c) holds; any character
 * outside the alphabet maps to 63, the same value as '/'. */

#define FROM_BASE64(c) \
    (((c) == '+') ? 62 :                                                \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     63)

/* TO_BASE64(n): the base-64 character for the low 6 bits of n; higher
 * bits of n are ignored. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT(c): non-zero when the byte c, met outside a base-64
 * section of a UTF-7 string, decodes to itself.  Decoding is
 * permissive: every ASCII byte qualifies except '+', which opens a
 * base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
4345
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the ASCII code
 * (0..127), identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - the code point to test; only 1..127 can ever qualify
 *              (NUL and everything >= 128 are never direct).
 *   directO  - non-zero if Set O characters may be written directly.
 *   directWS - non-zero if whitespace may be written directly.
 *
 * The range test on c comes first so that utf7_category is never
 * indexed out of bounds.  c is evaluated several times, so it must be
 * free of side effects.  directO and directWS are parenthesized in the
 * expansion so that arguments such as `a || b` keep their meaning. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
4391
/* Decode `size` bytes of UTF-7 starting at `s`.

   Thin wrapper around PyUnicode_DecodeUTF7Stateful() that passes NULL
   for `consumed`; `errors` is forwarded unchanged.  Returns whatever
   the stateful decoder returns. */
PyObject *
PyUnicode_DecodeUTF7(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
}
4399
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * Parameters:
 *   s, size  - the input bytes and their count.
 *   errors   - error handler name, forwarded to
 *              unicode_decode_call_errorhandler_writer().
 *   consumed - if non-NULL, receives the number of input bytes that
 *              were decoded; an unterminated shift sequence at the end
 *              of the input is then not an error but is left
 *              unconsumed (see "return state" below).
 *
 * Returns the decoded string, or NULL if writing to the output or the
 * error handler failed. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;      /* input offset of the '+' / bad byte */
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;            /* non-zero while inside a base-64 section */
    Py_ssize_t shiftOutStart;   /* writer.pos when the current section began */
    unsigned int base64bits = 0;        /* number of pending bits */
    unsigned long base64buffer = 0;     /* the pending bits themselves */
    Py_UCS4 surrogate = 0;      /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      restart:
        /* Also entered via "goto restart" after the end-of-input error
           handler moved s back inside the input. */
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* no low surrogate followed: emit the lone
                               high surrogate as is, then handle outCh
                               below */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A pending high surrogate is only written out when the
                   terminating byte is one that decodes directly;
                   otherwise it is dropped. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
                /* Any other terminator is not consumed here: s is
                   unchanged, so the next iteration sees it again with
                   inShift == 0. */
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            /* byte >= 128 outside a base-64 section */
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* For the two shift-sequence errors, startinpos still holds the
           offset of the '+' that opened the section. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* the handler may have repositioned s before the end of the
               input; if so, resume decoding from there */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* Report only the input before the opening '+' as consumed
               and drop the output produced by the unfinished section. */
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* NOTE(review): presumably the result is rebuilt from
                   the kept characters so that characters only present
                   in the dropped section do not affect it -- confirm. */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4598
4599
4600 PyObject *
_PyUnicode_EncodeUTF7(PyObject * str,int base64SetO,int base64WhiteSpace,const char * errors)4601 _PyUnicode_EncodeUTF7(PyObject *str,
4602 int base64SetO,
4603 int base64WhiteSpace,
4604 const char *errors)
4605 {
4606 int kind;
4607 void *data;
4608 Py_ssize_t len;
4609 PyObject *v;
4610 int inShift = 0;
4611 Py_ssize_t i;
4612 unsigned int base64bits = 0;
4613 unsigned long base64buffer = 0;
4614 char * out;
4615 char * start;
4616
4617 if (PyUnicode_READY(str) == -1)
4618 return NULL;
4619 kind = PyUnicode_KIND(str);
4620 data = PyUnicode_DATA(str);
4621 len = PyUnicode_GET_LENGTH(str);
4622
4623 if (len == 0)
4624 return PyBytes_FromStringAndSize(NULL, 0);
4625
4626 /* It might be possible to tighten this worst case */
4627 if (len > PY_SSIZE_T_MAX / 8)
4628 return PyErr_NoMemory();
4629 v = PyBytes_FromStringAndSize(NULL, len * 8);
4630 if (v == NULL)
4631 return NULL;
4632
4633 start = out = PyBytes_AS_STRING(v);
4634 for (i = 0; i < len; ++i) {
4635 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4636
4637 if (inShift) {
4638 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4639 /* shifting out */
4640 if (base64bits) { /* output remaining bits */
4641 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4642 base64buffer = 0;
4643 base64bits = 0;
4644 }
4645 inShift = 0;
4646 /* Characters not in the BASE64 set implicitly unshift the sequence
4647 so no '-' is required, except if the character is itself a '-' */
4648 if (IS_BASE64(ch) || ch == '-') {
4649 *out++ = '-';
4650 }
4651 *out++ = (char) ch;
4652 }
4653 else {
4654 goto encode_char;
4655 }
4656 }
4657 else { /* not in a shift sequence */
4658 if (ch == '+') {
4659 *out++ = '+';
4660 *out++ = '-';
4661 }
4662 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4663 *out++ = (char) ch;
4664 }
4665 else {
4666 *out++ = '+';
4667 inShift = 1;
4668 goto encode_char;
4669 }
4670 }
4671 continue;
4672 encode_char:
4673 if (ch >= 0x10000) {
4674 assert(ch <= MAX_UNICODE);
4675
4676 /* code first surrogate */
4677 base64bits += 16;
4678 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4679 while (base64bits >= 6) {
4680 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4681 base64bits -= 6;
4682 }
4683 /* prepare second surrogate */
4684 ch = Py_UNICODE_LOW_SURROGATE(ch);
4685 }
4686 base64bits += 16;
4687 base64buffer = (base64buffer << 16) | ch;
4688 while (base64bits >= 6) {
4689 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4690 base64bits -= 6;
4691 }
4692 }
4693 if (base64bits)
4694 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4695 if (inShift)
4696 *out++ = '-';
4697 if (_PyBytes_Resize(&v, out - start) < 0)
4698 return NULL;
4699 return v;
4700 }
4701 PyObject *
PyUnicode_EncodeUTF7(const Py_UNICODE * s,Py_ssize_t size,int base64SetO,int base64WhiteSpace,const char * errors)4702 PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4703 Py_ssize_t size,
4704 int base64SetO,
4705 int base64WhiteSpace,
4706 const char *errors)
4707 {
4708 PyObject *result;
4709 PyObject *tmp = PyUnicode_FromWideChar(s, size);
4710 if (tmp == NULL)
4711 return NULL;
4712 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4713 base64WhiteSpace, errors);
4714 Py_DECREF(tmp);
4715 return result;
4716 }
4717
4718 #undef IS_BASE64
4719 #undef FROM_BASE64
4720 #undef TO_BASE64
4721 #undef DECODE_DIRECT
4722 #undef ENCODE_DIRECT
4723
4724 /* --- UTF-8 Codec -------------------------------------------------------- */
4725
/* Decode `size` bytes of UTF-8 starting at `s`.

   Thin wrapper around PyUnicode_DecodeUTF8Stateful() that passes NULL
   for `consumed`; `errors` is forwarded unchanged.  Returns whatever
   the stateful decoder returns. */
PyObject *
PyUnicode_DecodeUTF8(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
}
4733
4734 #include "stringlib/asciilib.h"
4735 #include "stringlib/codecs.h"
4736 #include "stringlib/undef.h"
4737
4738 #include "stringlib/ucs1lib.h"
4739 #include "stringlib/codecs.h"
4740 #include "stringlib/undef.h"
4741
4742 #include "stringlib/ucs2lib.h"
4743 #include "stringlib/codecs.h"
4744 #include "stringlib/undef.h"
4745
4746 #include "stringlib/ucs4lib.h"
4747 #include "stringlib/codecs.h"
4748 #include "stringlib/undef.h"
4749
/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char.  The mask has the top bit (0x80) of
   every byte set, so `value & ASCII_CHAR_MASK` is non-zero exactly
   when at least one byte of `value` is >= 0x80.  Only 4- and 8-byte
   longs are supported; any other size is a compile-time error. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif
4759
4760 static Py_ssize_t
ascii_decode(const char * start,const char * end,Py_UCS1 * dest)4761 ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4762 {
4763 const char *p = start;
4764 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
4765
4766 /*
4767 * Issue #17237: m68k is a bit different from most architectures in
4768 * that objects do not use "natural alignment" - for example, int and
4769 * long are only aligned at 2-byte boundaries. Therefore the assert()
4770 * won't work; also, tests have shown that skipping the "optimised
4771 * version" will even speed up m68k.
4772 */
4773 #if !defined(__m68k__)
4774 #if SIZEOF_LONG <= SIZEOF_VOID_P
4775 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4776 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4777 /* Fast path, see in STRINGLIB(utf8_decode) for
4778 an explanation. */
4779 /* Help allocation */
4780 const char *_p = p;
4781 Py_UCS1 * q = dest;
4782 while (_p < aligned_end) {
4783 unsigned long value = *(const unsigned long *) _p;
4784 if (value & ASCII_CHAR_MASK)
4785 break;
4786 *((unsigned long *)q) = value;
4787 _p += SIZEOF_LONG;
4788 q += SIZEOF_LONG;
4789 }
4790 p = _p;
4791 while (p < end) {
4792 if ((unsigned char)*p & 0x80)
4793 break;
4794 *q++ = *p++;
4795 }
4796 return p - start;
4797 }
4798 #endif
4799 #endif
4800 while (p < end) {
4801 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4802 for an explanation. */
4803 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4804 /* Help allocation */
4805 const char *_p = p;
4806 while (_p < aligned_end) {
4807 unsigned long value = *(unsigned long *) _p;
4808 if (value & ASCII_CHAR_MASK)
4809 break;
4810 _p += SIZEOF_LONG;
4811 }
4812 p = _p;
4813 if (_p == end)
4814 break;
4815 }
4816 if ((unsigned char)*p & 0x80)
4817 break;
4818 ++p;
4819 }
4820 memcpy(dest, start, p - start);
4821 return p - start;
4822 }
4823
/* Stateful UTF-8 decoder.

   Parameters:
     s, size  - the input bytes and their count.
     errors   - error handler name.  "ignore", "replace" and
                "surrogateescape" are handled inline below; anything
                else goes through unicode_decode_call_errorhandler_writer().
     consumed - if non-NULL, receives the number of input bytes decoded;
                an incomplete sequence at the end of the input is then
                left unconsumed instead of being reported as an error.

   Returns the decoded string, or NULL on failure. */
PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    _PyUnicodeWriter writer;
    const char *starts = s;
    const char *end = s + size;

    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *errmsg = "";
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    /* resolved from `errors` lazily, on the first decoding error */
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        if (consumed)
            *consumed = 1;
        return get_latin1_char((unsigned char)s[0]);
    }

    /* Pre-size the output for one character per input byte. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    /* Copy the leading ASCII run straight into the writer's buffer. */
    writer.pos = ascii_decode(s, end, writer.data);
    s += writer.pos;
    while (s < end) {
        Py_UCS4 ch;
        int kind = writer.kind;

        /* Run the stringlib decoder that matches the writer's current
           storage kind.  As the switch below shows, values 0..4 are
           status codes; any other value is a character that still has
           to be written through the writer. */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (PyUnicode_IS_ASCII(writer.buffer))
                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
            else
                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else if (kind == PyUnicode_2BYTE_KIND) {
            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else {
            assert(kind == PyUnicode_4BYTE_KIND);
            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
        }

        switch (ch) {
        case 0:
            /* Input exhausted, or (stateful mode) an incomplete
               trailing sequence that is left for the next call. */
            if (s == end || consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = s - starts;
            endinpos = end - starts;
            break;
        case 1:
            errmsg = "invalid start byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            break;
        case 2:
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
            {
                /* Truncated surrogate code in range D800-DFFF */
                goto End;
            }
            /* fall through */
        case 3:
        case 4:
            /* the offending range covers ch - 1 bytes */
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
            endinpos = startinpos + ch - 1;
            break;
        default:
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        if (error_handler == _Py_ERROR_UNKNOWN)
            error_handler = get_error_handler(errors);

        switch (error_handler) {
        case _Py_ERROR_IGNORE:
            /* skip the offending bytes */
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_REPLACE:
            /* one U+FFFD for the whole offending range */
            if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
                goto onError;
            s += (endinpos - startinpos);
            break;

        case _Py_ERROR_SURROGATEESCAPE:
        {
            Py_ssize_t i;

            /* each offending byte b becomes the code point 0xDC00 + b,
               which needs at least 2-byte storage */
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
                goto onError;
            for (i=startinpos; i<endinpos; i++) {
                ch = (Py_UCS4)(unsigned char)(starts[i]);
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
                                ch + 0xdc00);
                writer.pos++;
            }
            s += (endinpos - startinpos);
            break;
        }

        default:
            /* generic handler; it may update starts, end and s */
            if (unicode_decode_call_errorhandler_writer(
                    errors, &error_handler_obj,
                    "utf-8", errmsg,
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
        }
    }

End:
    if (consumed)
        *consumed = s - starts;

    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

onError:
    Py_XDECREF(error_handler_obj);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4964
4965
4966 /* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
4967 non-zero, use strict error handler otherwise.
4968
4969 On success, write a pointer to a newly allocated wide character string into
4970 *wstr (use PyMem_RawFree() to free the memory) and write the output length
4971 (in number of wchar_t units) into *wlen (if wlen is set).
4972
4973 On memory allocation failure, return -1.
4974
4975 On decoding error (if surrogateescape is zero), return -2. If wlen is
4976 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
4977 is not NULL, write the decoding error message into *reason. */
4978 int
_Py_DecodeUTF8Ex(const char * s,Py_ssize_t size,wchar_t ** wstr,size_t * wlen,const char ** reason,int surrogateescape)4979 _Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
4980 const char **reason, int surrogateescape)
4981 {
4982 const char *orig_s = s;
4983 const char *e;
4984 wchar_t *unicode;
4985 Py_ssize_t outpos;
4986
4987 /* Note: size will always be longer than the resulting Unicode
4988 character count */
4989 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
4990 return -1;
4991 }
4992
4993 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
4994 if (!unicode) {
4995 return -1;
4996 }
4997
4998 /* Unpack UTF-8 encoded data */
4999 e = s + size;
5000 outpos = 0;
5001 while (s < e) {
5002 Py_UCS4 ch;
5003 #if SIZEOF_WCHAR_T == 4
5004 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5005 #else
5006 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5007 #endif
5008 if (ch > 0xFF) {
5009 #if SIZEOF_WCHAR_T == 4
5010 Py_UNREACHABLE();
5011 #else
5012 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5013 /* write a surrogate pair */
5014 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5015 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5016 #endif
5017 }
5018 else {
5019 if (!ch && s == e)
5020 break;
5021 if (!surrogateescape) {
5022 PyMem_RawFree(unicode );
5023 if (reason != NULL) {
5024 switch (ch) {
5025 case 0:
5026 *reason = "unexpected end of data";
5027 break;
5028 case 1:
5029 *reason = "invalid start byte";
5030 break;
5031 /* 2, 3, 4 */
5032 default:
5033 *reason = "invalid continuation byte";
5034 break;
5035 }
5036 }
5037 if (wlen != NULL) {
5038 *wlen = s - orig_s;
5039 }
5040 return -2;
5041 }
5042 /* surrogateescape */
5043 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5044 }
5045 }
5046 unicode[outpos] = L'\0';
5047 if (wlen) {
5048 *wlen = outpos;
5049 }
5050 *wstr = unicode;
5051 return 0;
5052 }
5053
5054 wchar_t*
_Py_DecodeUTF8_surrogateescape(const char * arg,Py_ssize_t arglen)5055 _Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen)
5056 {
5057 wchar_t *wstr;
5058 int res = _Py_DecodeUTF8Ex(arg, arglen, &wstr, NULL, NULL, 1);
5059 if (res != 0) {
5060 return NULL;
5061 }
5062 return wstr;
5063 }
5064
5065
5066 /* UTF-8 encoder using the surrogateescape error handler .
5067
5068 On success, return 0 and write the newly allocated character string (use
5069 PyMem_Free() to free the memory) into *str.
5070
5071 On encoding failure, return -2 and write the position of the invalid
5072 surrogate character into *error_pos (if error_pos is set) and the decoding
5073 error message into *reason (if reason is set).
5074
5075 On memory allocation failure, return -1. */
5076 int
_Py_EncodeUTF8Ex(const wchar_t * text,char ** str,size_t * error_pos,const char ** reason,int raw_malloc,int surrogateescape)5077 _Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5078 const char **reason, int raw_malloc, int surrogateescape)
5079 {
5080 const Py_ssize_t max_char_size = 4;
5081 Py_ssize_t len = wcslen(text);
5082
5083 assert(len >= 0);
5084
5085 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5086 return -1;
5087 }
5088 char *bytes;
5089 if (raw_malloc) {
5090 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5091 }
5092 else {
5093 bytes = PyMem_Malloc((len + 1) * max_char_size);
5094 }
5095 if (bytes == NULL) {
5096 return -1;
5097 }
5098
5099 char *p = bytes;
5100 Py_ssize_t i;
5101 for (i = 0; i < len; i++) {
5102 Py_UCS4 ch = text[i];
5103
5104 if (ch < 0x80) {
5105 /* Encode ASCII */
5106 *p++ = (char) ch;
5107
5108 }
5109 else if (ch < 0x0800) {
5110 /* Encode Latin-1 */
5111 *p++ = (char)(0xc0 | (ch >> 6));
5112 *p++ = (char)(0x80 | (ch & 0x3f));
5113 }
5114 else if (Py_UNICODE_IS_SURROGATE(ch)) {
5115 /* surrogateescape error handler */
5116 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5117 if (error_pos != NULL) {
5118 *error_pos = (size_t)i;
5119 }
5120 if (reason != NULL) {
5121 *reason = "encoding error";
5122 }
5123 if (raw_malloc) {
5124 PyMem_RawFree(bytes);
5125 }
5126 else {
5127 PyMem_Free(bytes);
5128 }
5129 return -2;
5130 }
5131 *p++ = (char)(ch & 0xff);
5132 }
5133 else if (ch < 0x10000) {
5134 *p++ = (char)(0xe0 | (ch >> 12));
5135 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5136 *p++ = (char)(0x80 | (ch & 0x3f));
5137 }
5138 else { /* ch >= 0x10000 */
5139 assert(ch <= MAX_UNICODE);
5140 /* Encode UCS4 Unicode ordinals */
5141 *p++ = (char)(0xf0 | (ch >> 18));
5142 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5143 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5144 *p++ = (char)(0x80 | (ch & 0x3f));
5145 }
5146 }
5147 *p++ = '\0';
5148
5149 size_t final_size = (p - bytes);
5150 char *bytes2;
5151 if (raw_malloc) {
5152 bytes2 = PyMem_RawRealloc(bytes, final_size);
5153 }
5154 else {
5155 bytes2 = PyMem_Realloc(bytes, final_size);
5156 }
5157 if (bytes2 == NULL) {
5158 if (error_pos != NULL) {
5159 *error_pos = (size_t)-1;
5160 }
5161 if (raw_malloc) {
5162 PyMem_RawFree(bytes);
5163 }
5164 else {
5165 PyMem_Free(bytes);
5166 }
5167 return -1;
5168 }
5169 *str = bytes2;
5170 return 0;
5171 }
5172
5173
5174 /* Primary internal function which creates utf8 encoded bytes objects.
5175
5176 Allocation strategy: if the string is short, convert into a stack buffer
5177 and allocate exactly as much space needed at the end. Else allocate the
5178 maximum possible needed (4 result bytes per Unicode character), and return
5179 the excess memory at the end.
5180 */
5181 PyObject *
_PyUnicode_AsUTF8String(PyObject * unicode,const char * errors)5182 _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5183 {
5184 enum PyUnicode_Kind kind;
5185 void *data;
5186 Py_ssize_t size;
5187
5188 if (!PyUnicode_Check(unicode)) {
5189 PyErr_BadArgument();
5190 return NULL;
5191 }
5192
5193 if (PyUnicode_READY(unicode) == -1)
5194 return NULL;
5195
5196 if (PyUnicode_UTF8(unicode))
5197 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5198 PyUnicode_UTF8_LENGTH(unicode));
5199
5200 kind = PyUnicode_KIND(unicode);
5201 data = PyUnicode_DATA(unicode);
5202 size = PyUnicode_GET_LENGTH(unicode);
5203
5204 switch (kind) {
5205 default:
5206 Py_UNREACHABLE();
5207 case PyUnicode_1BYTE_KIND:
5208 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5209 assert(!PyUnicode_IS_ASCII(unicode));
5210 return ucs1lib_utf8_encoder(unicode, data, size, errors);
5211 case PyUnicode_2BYTE_KIND:
5212 return ucs2lib_utf8_encoder(unicode, data, size, errors);
5213 case PyUnicode_4BYTE_KIND:
5214 return ucs4lib_utf8_encoder(unicode, data, size, errors);
5215 }
5216 }
5217
5218 PyObject *
PyUnicode_EncodeUTF8(const Py_UNICODE * s,Py_ssize_t size,const char * errors)5219 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5220 Py_ssize_t size,
5221 const char *errors)
5222 {
5223 PyObject *v, *unicode;
5224
5225 unicode = PyUnicode_FromWideChar(s, size);
5226 if (unicode == NULL)
5227 return NULL;
5228 v = _PyUnicode_AsUTF8String(unicode, errors);
5229 Py_DECREF(unicode);
5230 return v;
5231 }
5232
/* Encode `unicode` as UTF-8.  Thin wrapper around
   _PyUnicode_AsUTF8String() that passes NULL for `errors`; returns
   whatever that function returns. */
PyObject *
PyUnicode_AsUTF8String(PyObject *unicode)
{
    return _PyUnicode_AsUTF8String(unicode, NULL);
}
5238
5239 /* --- UTF-32 Codec ------------------------------------------------------- */
5240
/* Decode `size` bytes of UTF-32 starting at `s`.

   Thin wrapper around PyUnicode_DecodeUTF32Stateful() that passes NULL
   for `consumed`; `errors` and `byteorder` are forwarded unchanged.
   Returns whatever the stateful decoder returns. */
PyObject *
PyUnicode_DecodeUTF32(const char *s,
                      Py_ssize_t size,
                      const char *errors,
                      int *byteorder)
{
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
}
5249
/* Stateful UTF-32 decoder.

   Parameters:
     s, size   - the input bytes and their count.
     errors    - error handler name, forwarded to
                 unicode_decode_call_errorhandler_writer().
     byteorder - may be NULL.  On entry, *byteorder < 0 selects
                 little-endian, > 0 big-endian and 0 (also used when
                 the pointer is NULL) means: look for a BOM, else use
                 the platform's order.  When it is 0 and the input has
                 at least 4 bytes, the order found (-1, 1, or still 0)
                 is written back.
     consumed  - if non-NULL, receives the number of input bytes
                 decoded; up to 3 trailing bytes of an incomplete code
                 unit are then left unconsumed instead of being
                 reported as "truncated data".

   Returns the decoded string, or NULL on failure. */
PyObject *
PyUnicode_DecodeUTF32Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;     /* read cursor and end of input */
    int le, bo = 0;       /* assume native ordering by default */
    const char *encoding;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    q = (unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 4) {
        /* read the first four bytes as a little-endian value */
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
        if (bom == 0x0000FEFF) {
            bo = -1;
            q += 4;
        }
        else if (bom == 0xFFFE0000) {
            bo = 1;
            q += 4;
        }
        if (byteorder)
            *byteorder = bo;
    }

    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* bo == 0 at this point means no BOM decided the order: fall back
       to the platform's byte order */
#ifdef WORDS_BIGENDIAN
    le = bo < 0;
#else
    le = bo <= 0;
#endif
    encoding = le ? "utf-32-le" : "utf-32-be";

    /* one output character per 4 input bytes, rounded up */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 3) / 4;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        Py_UCS4 ch = 0;
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);

        if (e - q >= 4) {
            /* Fast loop: store code points directly for as long as they
               fit the writer's current maximum character and are not
               surrogates.  On leaving the loop early, ch holds the
               first code point that was not stored. */
            enum PyUnicode_Kind kind = writer.kind;
            void *data = writer.data;
            const unsigned char *last = e - 4;
            Py_ssize_t pos = writer.pos;
            if (le) {
                do {
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            else {
                do {
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            writer.pos = pos;
        }

        if (Py_UNICODE_IS_SURROGATE(ch)) {
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }
        else if (ch <= maxch) {
            /* the fast loop stopped because fewer than 4 bytes remain */
            if (q == e || consumed)
                break;
            /* remaining bytes at the end? (size should be divisible by 4) */
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
        }
        else {
            if (ch < 0x110000) {
                /* valid code point above maxch: write it through the
                   writer, then go round the outer loop again (which
                   re-reads maxch) */
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                    goto onError;
                q += 4;
                continue;
            }
            errmsg = "code point not in range(0x110000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }

        /* The remaining input chars are ignored if the callback
           chooses to skip the input */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                encoding, errmsg,
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
                &writer))
            goto onError;
    }

    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
5394
5395 PyObject *
_PyUnicode_EncodeUTF32(PyObject * str,const char * errors,int byteorder)5396 _PyUnicode_EncodeUTF32(PyObject *str,
5397 const char *errors,
5398 int byteorder)
5399 {
5400 enum PyUnicode_Kind kind;
5401 const void *data;
5402 Py_ssize_t len;
5403 PyObject *v;
5404 uint32_t *out;
5405 #if PY_LITTLE_ENDIAN
5406 int native_ordering = byteorder <= 0;
5407 #else
5408 int native_ordering = byteorder >= 0;
5409 #endif
5410 const char *encoding;
5411 Py_ssize_t nsize, pos;
5412 PyObject *errorHandler = NULL;
5413 PyObject *exc = NULL;
5414 PyObject *rep = NULL;
5415
5416 if (!PyUnicode_Check(str)) {
5417 PyErr_BadArgument();
5418 return NULL;
5419 }
5420 if (PyUnicode_READY(str) == -1)
5421 return NULL;
5422 kind = PyUnicode_KIND(str);
5423 data = PyUnicode_DATA(str);
5424 len = PyUnicode_GET_LENGTH(str);
5425
5426 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5427 return PyErr_NoMemory();
5428 nsize = len + (byteorder == 0);
5429 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5430 if (v == NULL)
5431 return NULL;
5432
5433 /* output buffer is 4-bytes aligned */
5434 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5435 out = (uint32_t *)PyBytes_AS_STRING(v);
5436 if (byteorder == 0)
5437 *out++ = 0xFEFF;
5438 if (len == 0)
5439 goto done;
5440
5441 if (byteorder == -1)
5442 encoding = "utf-32-le";
5443 else if (byteorder == 1)
5444 encoding = "utf-32-be";
5445 else
5446 encoding = "utf-32";
5447
5448 if (kind == PyUnicode_1BYTE_KIND) {
5449 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5450 goto done;
5451 }
5452
5453 pos = 0;
5454 while (pos < len) {
5455 Py_ssize_t repsize, moreunits;
5456
5457 if (kind == PyUnicode_2BYTE_KIND) {
5458 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5459 &out, native_ordering);
5460 }
5461 else {
5462 assert(kind == PyUnicode_4BYTE_KIND);
5463 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5464 &out, native_ordering);
5465 }
5466 if (pos == len)
5467 break;
5468
5469 rep = unicode_encode_call_errorhandler(
5470 errors, &errorHandler,
5471 encoding, "surrogates not allowed",
5472 str, &exc, pos, pos + 1, &pos);
5473 if (!rep)
5474 goto error;
5475
5476 if (PyBytes_Check(rep)) {
5477 repsize = PyBytes_GET_SIZE(rep);
5478 if (repsize & 3) {
5479 raise_encode_exception(&exc, encoding,
5480 str, pos - 1, pos,
5481 "surrogates not allowed");
5482 goto error;
5483 }
5484 moreunits = repsize / 4;
5485 }
5486 else {
5487 assert(PyUnicode_Check(rep));
5488 if (PyUnicode_READY(rep) < 0)
5489 goto error;
5490 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5491 if (!PyUnicode_IS_ASCII(rep)) {
5492 raise_encode_exception(&exc, encoding,
5493 str, pos - 1, pos,
5494 "surrogates not allowed");
5495 goto error;
5496 }
5497 }
5498
5499 /* four bytes are reserved for each surrogate */
5500 if (moreunits > 1) {
5501 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
5502 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
5503 /* integer overflow */
5504 PyErr_NoMemory();
5505 goto error;
5506 }
5507 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
5508 goto error;
5509 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
5510 }
5511
5512 if (PyBytes_Check(rep)) {
5513 memcpy(out, PyBytes_AS_STRING(rep), repsize);
5514 out += moreunits;
5515 } else /* rep is unicode */ {
5516 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5517 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5518 &out, native_ordering);
5519 }
5520
5521 Py_CLEAR(rep);
5522 }
5523
5524 /* Cut back to size actually needed. This is necessary for, for example,
5525 encoding of a string containing isolated surrogates and the 'ignore'
5526 handler is used. */
5527 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5528 if (nsize != PyBytes_GET_SIZE(v))
5529 _PyBytes_Resize(&v, nsize);
5530 Py_XDECREF(errorHandler);
5531 Py_XDECREF(exc);
5532 done:
5533 return v;
5534 error:
5535 Py_XDECREF(rep);
5536 Py_XDECREF(errorHandler);
5537 Py_XDECREF(exc);
5538 Py_XDECREF(v);
5539 return NULL;
5540 }
5541
5542 PyObject *
PyUnicode_EncodeUTF32(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)5543 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5544 Py_ssize_t size,
5545 const char *errors,
5546 int byteorder)
5547 {
5548 PyObject *result;
5549 PyObject *tmp = PyUnicode_FromWideChar(s, size);
5550 if (tmp == NULL)
5551 return NULL;
5552 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5553 Py_DECREF(tmp);
5554 return result;
5555 }
5556
5557 PyObject *
PyUnicode_AsUTF32String(PyObject * unicode)5558 PyUnicode_AsUTF32String(PyObject *unicode)
5559 {
5560 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5561 }
5562
5563 /* --- UTF-16 Codec ------------------------------------------------------- */
5564
5565 PyObject *
PyUnicode_DecodeUTF16(const char * s,Py_ssize_t size,const char * errors,int * byteorder)5566 PyUnicode_DecodeUTF16(const char *s,
5567 Py_ssize_t size,
5568 const char *errors,
5569 int *byteorder)
5570 {
5571 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5572 }
5573
/* Decode `size` bytes at `s` as UTF-16 and return a new Unicode object, or
   NULL with an exception set.

   errors:    error handler name forwarded to
              unicode_decode_call_errorhandler_writer() for malformed input.
   byteorder: optional in/out flag.  A value of 0 (or a NULL pointer) means
              "look for a BOM": a leading FF FE selects -1 (little-endian),
              FE FF selects 1 (big-endian), and the BOM is skipped.  When
              the pointer is non-NULL, the input is at least two bytes and
              the incoming value was 0, the resulting value (still 0 if no
              BOM was found) is stored back.  Without a BOM, 0 decodes in
              the native byte order.
   consumed:  if non-NULL, incomplete data at the end of the input is not
              an error; decoding stops in front of it and *consumed
              receives the number of input bytes that were decoded. */
PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;
    int bo = 0;       /* assume native ordering by default */
    int native_ordering;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    const char *encoding;

    q = (unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 2) {
        /* The first two bytes are read as a little-endian value. */
        const Py_UCS4 bom = (q[1] << 8) | q[0];
        if (bom == 0xFEFF) {
            q += 2;
            bo = -1;
        }
        else if (bom == 0xFFFE) {
            q += 2;
            bo = 1;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* Nothing (left) to decode: empty input or a BOM only. */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* The encoding name is only used for error reporting. */
#if PY_LITTLE_ENDIAN
    native_ordering = bo <= 0;
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
#else
    native_ordering = bo >= 0;
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
#endif

    /* Note: size will always be longer than the resulting Unicode
       character count normally.  Error handler will take care of
       resizing when needed. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 1) / 2;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        /* ch stays 0 when fewer than two bytes remain. */
        Py_UCS4 ch = 0;
        if (e - q >= 2) {
            /* Pick the decoder matching the writer's current storage kind.
               It writes straight into the writer buffer, advances q and
               writer.pos, and returns either a status code (0-3, handled
               in the switch below) or a character it did not store. */
            int kind = writer.kind;
            if (kind == PyUnicode_1BYTE_KIND) {
                if (PyUnicode_IS_ASCII(writer.buffer))
                    ch = asciilib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
                else
                    ch = ucs1lib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
            } else if (kind == PyUnicode_2BYTE_KIND) {
                ch = ucs2lib_utf16_decode(&q, e,
                        (Py_UCS2*)writer.data, &writer.pos,
                        native_ordering);
            } else {
                assert(kind == PyUnicode_4BYTE_KIND);
                ch = ucs4lib_utf16_decode(&q, e,
                        (Py_UCS4*)writer.data, &writer.pos,
                        native_ordering);
            }
        }

        switch (ch)
        {
        case 0:
            /* remaining byte at the end? (size should be even) */
            if (q == e || consumed)
                goto End;
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
        case 1:
            /* Input ended in the middle of a sequence: step back over the
               unit already read so it is reported (or left unconsumed). */
            q -= 2;
            if (consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
        case 2:
            /* The offending unit is the one just before q. */
            errmsg = "illegal encoding";
            startinpos = ((const char *)q) - 2 - starts;
            endinpos = startinpos + 2;
            break;
        case 3:
            /* Only the first of the two units before q is reported. */
            errmsg = "illegal UTF-16 surrogate";
            startinpos = ((const char *)q) - 4 - starts;
            endinpos = startinpos + 2;
            break;
        default:
            /* A character the fast path could not store itself; writing it
               through the writer may widen the buffer. */
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        /* The remaining input chars are ignored if the callback
           chooses to skip the input */
        if (unicode_decode_call_errorhandler_writer(
                errors,
                &errorHandler,
                encoding, errmsg,
                &starts,
                (const char **)&e,
                &startinpos,
                &endinpos,
                &exc,
                (const char **)&q,
                &writer))
            goto onError;
    }

End:
    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
5728
5729 PyObject *
_PyUnicode_EncodeUTF16(PyObject * str,const char * errors,int byteorder)5730 _PyUnicode_EncodeUTF16(PyObject *str,
5731 const char *errors,
5732 int byteorder)
5733 {
5734 enum PyUnicode_Kind kind;
5735 const void *data;
5736 Py_ssize_t len;
5737 PyObject *v;
5738 unsigned short *out;
5739 Py_ssize_t pairs;
5740 #if PY_BIG_ENDIAN
5741 int native_ordering = byteorder >= 0;
5742 #else
5743 int native_ordering = byteorder <= 0;
5744 #endif
5745 const char *encoding;
5746 Py_ssize_t nsize, pos;
5747 PyObject *errorHandler = NULL;
5748 PyObject *exc = NULL;
5749 PyObject *rep = NULL;
5750
5751 if (!PyUnicode_Check(str)) {
5752 PyErr_BadArgument();
5753 return NULL;
5754 }
5755 if (PyUnicode_READY(str) == -1)
5756 return NULL;
5757 kind = PyUnicode_KIND(str);
5758 data = PyUnicode_DATA(str);
5759 len = PyUnicode_GET_LENGTH(str);
5760
5761 pairs = 0;
5762 if (kind == PyUnicode_4BYTE_KIND) {
5763 const Py_UCS4 *in = (const Py_UCS4 *)data;
5764 const Py_UCS4 *end = in + len;
5765 while (in < end) {
5766 if (*in++ >= 0x10000) {
5767 pairs++;
5768 }
5769 }
5770 }
5771 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
5772 return PyErr_NoMemory();
5773 }
5774 nsize = len + pairs + (byteorder == 0);
5775 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
5776 if (v == NULL) {
5777 return NULL;
5778 }
5779
5780 /* output buffer is 2-bytes aligned */
5781 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5782 out = (unsigned short *)PyBytes_AS_STRING(v);
5783 if (byteorder == 0) {
5784 *out++ = 0xFEFF;
5785 }
5786 if (len == 0) {
5787 goto done;
5788 }
5789
5790 if (kind == PyUnicode_1BYTE_KIND) {
5791 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5792 goto done;
5793 }
5794
5795 if (byteorder < 0) {
5796 encoding = "utf-16-le";
5797 }
5798 else if (byteorder > 0) {
5799 encoding = "utf-16-be";
5800 }
5801 else {
5802 encoding = "utf-16";
5803 }
5804
5805 pos = 0;
5806 while (pos < len) {
5807 Py_ssize_t repsize, moreunits;
5808
5809 if (kind == PyUnicode_2BYTE_KIND) {
5810 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5811 &out, native_ordering);
5812 }
5813 else {
5814 assert(kind == PyUnicode_4BYTE_KIND);
5815 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5816 &out, native_ordering);
5817 }
5818 if (pos == len)
5819 break;
5820
5821 rep = unicode_encode_call_errorhandler(
5822 errors, &errorHandler,
5823 encoding, "surrogates not allowed",
5824 str, &exc, pos, pos + 1, &pos);
5825 if (!rep)
5826 goto error;
5827
5828 if (PyBytes_Check(rep)) {
5829 repsize = PyBytes_GET_SIZE(rep);
5830 if (repsize & 1) {
5831 raise_encode_exception(&exc, encoding,
5832 str, pos - 1, pos,
5833 "surrogates not allowed");
5834 goto error;
5835 }
5836 moreunits = repsize / 2;
5837 }
5838 else {
5839 assert(PyUnicode_Check(rep));
5840 if (PyUnicode_READY(rep) < 0)
5841 goto error;
5842 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5843 if (!PyUnicode_IS_ASCII(rep)) {
5844 raise_encode_exception(&exc, encoding,
5845 str, pos - 1, pos,
5846 "surrogates not allowed");
5847 goto error;
5848 }
5849 }
5850
5851 /* two bytes are reserved for each surrogate */
5852 if (moreunits > 1) {
5853 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5854 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
5855 /* integer overflow */
5856 PyErr_NoMemory();
5857 goto error;
5858 }
5859 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
5860 goto error;
5861 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5862 }
5863
5864 if (PyBytes_Check(rep)) {
5865 memcpy(out, PyBytes_AS_STRING(rep), repsize);
5866 out += moreunits;
5867 } else /* rep is unicode */ {
5868 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5869 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5870 &out, native_ordering);
5871 }
5872
5873 Py_CLEAR(rep);
5874 }
5875
5876 /* Cut back to size actually needed. This is necessary for, for example,
5877 encoding of a string containing isolated surrogates and the 'ignore' handler
5878 is used. */
5879 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5880 if (nsize != PyBytes_GET_SIZE(v))
5881 _PyBytes_Resize(&v, nsize);
5882 Py_XDECREF(errorHandler);
5883 Py_XDECREF(exc);
5884 done:
5885 return v;
5886 error:
5887 Py_XDECREF(rep);
5888 Py_XDECREF(errorHandler);
5889 Py_XDECREF(exc);
5890 Py_XDECREF(v);
5891 return NULL;
5892 #undef STORECHAR
5893 }
5894
5895 PyObject *
PyUnicode_EncodeUTF16(const Py_UNICODE * s,Py_ssize_t size,const char * errors,int byteorder)5896 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5897 Py_ssize_t size,
5898 const char *errors,
5899 int byteorder)
5900 {
5901 PyObject *result;
5902 PyObject *tmp = PyUnicode_FromWideChar(s, size);
5903 if (tmp == NULL)
5904 return NULL;
5905 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5906 Py_DECREF(tmp);
5907 return result;
5908 }
5909
5910 PyObject *
PyUnicode_AsUTF16String(PyObject * unicode)5911 PyUnicode_AsUTF16String(PyObject *unicode)
5912 {
5913 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
5914 }
5915
5916 /* --- Unicode Escape Codec ----------------------------------------------- */
5917
/* Character-name lookup API used for \N{...} escapes.  Starts out NULL;
   _PyUnicode_DecodeUnicodeEscape() imports it through
   PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, ...) the first time it is
   needed and keeps the pointer here for later calls. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5919
/* Decode the `size` bytes at `s`, written in the "unicodeescape" encoding,
   into a new Unicode object.  Returns NULL with an exception set on
   failure.

   errors:               error handler name forwarded to
                         unicode_decode_call_errorhandler_writer() for
                         malformed escapes.
   first_invalid_escape: must not be NULL.  Set to NULL on entry; when an
                         unrecognized backslash escape is met for the first
                         time it is pointed at the character following that
                         backslash.  Unrecognized escapes are not errors:
                         the backslash and the character are copied to the
                         output unchanged. */
PyObject *
_PyUnicode_DecodeUnicodeEscape(const char *s,
                               Py_ssize_t size,
                               const char *errors,
                               const char **first_invalid_escape)
{
    const char *starts = s;
    _PyUnicodeWriter writer;
    const char *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* so we can remember if we've seen an invalid escape char or not */
    *first_invalid_escape = NULL;

    if (size == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value.
       (but if the error callback returns a long replacement string
       we'll have to allocate more space) */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
        goto onError;
    }

    end = s + size;
    while (s < end) {
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
        Py_ssize_t startinpos;
        Py_ssize_t endinpos;
        const char *message;

/* Both macros write into space the writer has already reserved (see the
   asserts); WRITE_CHAR falls back to the generic writer call when `ch` does
   not fit the buffer's current maximum character. */
#define WRITE_ASCII_CHAR(ch)                                                  \
            do {                                                              \
                assert(ch <= 127);                                            \
                assert(writer.pos < writer.size);                             \
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
            } while(0)

#define WRITE_CHAR(ch)                                                        \
            do {                                                              \
                if (ch <= writer.maxchar) {                                   \
                    assert(writer.pos < writer.size);                         \
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
                }                                                             \
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
                    goto onError;                                             \
                }                                                             \
            } while(0)

        /* Non-escape characters are interpreted as Unicode ordinals */
        if (c != '\\') {
            WRITE_CHAR(c);
            continue;
        }

        /* Error ranges start at the backslash itself. */
        startinpos = s - starts - 1;
        /* \ - Escapes */
        if (s >= end) {
            message = "\\ at end of string";
            goto error;
        }
        c = (unsigned char) *s++;

        assert(writer.pos < writer.size);
        switch (c) {

            /* single-character escapes; backslash-newline produces no
               output at all */
        case '\n': continue;
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
        case '\'': WRITE_ASCII_CHAR('\''); continue;
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
        /* FF */
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
        case 't': WRITE_ASCII_CHAR('\t'); continue;
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
        /* VT */
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
        /* BEL, not classic C */
        case 'a': WRITE_ASCII_CHAR('\007'); continue;

            /* \OOO (octal) escapes: one to three octal digits */
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
            ch = c - '0';
            if (s < end && '0' <= *s && *s <= '7') {
                ch = (ch<<3) + *s++ - '0';
                if (s < end && '0' <= *s && *s <= '7') {
                    ch = (ch<<3) + *s++ - '0';
                }
            }
            WRITE_CHAR(ch);
            continue;

            /* hex escapes */
            /* \xXX */
        case 'x':
            count = 2;
            message = "truncated \\xXX escape";
            goto hexescape;

            /* \uXXXX */
        case 'u':
            count = 4;
            message = "truncated \\uXXXX escape";
            goto hexescape;

            /* \UXXXXXXXX */
        case 'U':
            count = 8;
            message = "truncated \\UXXXXXXXX escape";
        hexescape:
            /* Accumulate exactly `count` hex digits; stopping early (end of
               input or a non-hex character) leaves count non-zero. */
            for (ch = 0; count && s < end; ++s, --count) {
                c = (unsigned char)*s;
                ch <<= 4;
                if (c >= '0' && c <= '9') {
                    ch += c - '0';
                }
                else if (c >= 'a' && c <= 'f') {
                    ch += c - ('a' - 10);
                }
                else if (c >= 'A' && c <= 'F') {
                    ch += c - ('A' - 10);
                }
                else {
                    break;
                }
            }
            if (count) {
                /* `message` still holds the "truncated ..." text. */
                goto error;
            }

            /* when we get here, ch is a 32-bit unicode character */
            if (ch > MAX_UNICODE) {
                message = "illegal Unicode character";
                goto error;
            }

            WRITE_CHAR(ch);
            continue;

            /* \N{name} */
        case 'N':
            if (ucnhash_CAPI == NULL) {
                /* load the unicode data module */
                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
                                                PyUnicodeData_CAPSULE_NAME, 1);
                if (ucnhash_CAPI == NULL) {
                    /* Not routed through the error handler: this aborts
                       the whole decode with a UnicodeError. */
                    PyErr_SetString(
                        PyExc_UnicodeError,
                        "\\N escapes not supported (can't load unicodedata module)"
                        );
                    goto onError;
                }
            }

            message = "malformed \\N character escape";
            if (s < end && *s == '{') {
                const char *start = ++s;
                size_t namelen;
                /* look for the closing brace */
                while (s < end && *s != '}')
                    s++;
                namelen = s - start;
                /* An empty name or a missing '}' keeps the "malformed"
                   message. */
                if (namelen && s < end) {
                    /* found a name.  look it up in the unicode database */
                    s++;
                    ch = 0xffffffff; /* in case 'getcode' messes up */
                    if (namelen <= INT_MAX &&
                        ucnhash_CAPI->getcode(NULL, start, (int)namelen,
                                              &ch, 0)) {
                        assert(ch <= MAX_UNICODE);
                        WRITE_CHAR(ch);
                        continue;
                    }
                    message = "unknown Unicode character name";
                }
            }
            goto error;

        default:
            if (*first_invalid_escape == NULL) {
                *first_invalid_escape = s-1; /* Back up one char, since we've
                                                already incremented s. */
            }
            WRITE_ASCII_CHAR('\\');
            WRITE_CHAR(c);
            continue;
        }

      error:
        /* Report [startinpos, endinpos) to the error handler.  min_length
           is updated to the current output position plus the remaining
           input before the call. */
        endinpos = s-starts;
        writer.min_length = end - s + writer.pos;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "unicodeescape", message,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer)) {
            goto onError;
        }
        assert(end - s <= writer.size - writer.pos);

#undef WRITE_ASCII_CHAR
#undef WRITE_CHAR
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6144
6145 PyObject *
PyUnicode_DecodeUnicodeEscape(const char * s,Py_ssize_t size,const char * errors)6146 PyUnicode_DecodeUnicodeEscape(const char *s,
6147 Py_ssize_t size,
6148 const char *errors)
6149 {
6150 const char *first_invalid_escape;
6151 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6152 &first_invalid_escape);
6153 if (result == NULL)
6154 return NULL;
6155 if (first_invalid_escape != NULL) {
6156 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6157 "invalid escape sequence '\\%c'",
6158 (unsigned char)*first_invalid_escape) < 0) {
6159 Py_DECREF(result);
6160 return NULL;
6161 }
6162 }
6163 return result;
6164 }
6165
6166 /* Return a Unicode-Escape string version of the Unicode object. */
6167
6168 PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject * unicode)6169 PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6170 {
6171 Py_ssize_t i, len;
6172 PyObject *repr;
6173 char *p;
6174 enum PyUnicode_Kind kind;
6175 void *data;
6176 Py_ssize_t expandsize;
6177
6178 /* Initial allocation is based on the longest-possible character
6179 escape.
6180
6181 For UCS1 strings it's '\xxx', 4 bytes per source character.
6182 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6183 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6184 */
6185
6186 if (!PyUnicode_Check(unicode)) {
6187 PyErr_BadArgument();
6188 return NULL;
6189 }
6190 if (PyUnicode_READY(unicode) == -1) {
6191 return NULL;
6192 }
6193
6194 len = PyUnicode_GET_LENGTH(unicode);
6195 if (len == 0) {
6196 return PyBytes_FromStringAndSize(NULL, 0);
6197 }
6198
6199 kind = PyUnicode_KIND(unicode);
6200 data = PyUnicode_DATA(unicode);
6201 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6202 bytes, and 1 byte characters 4. */
6203 expandsize = kind * 2 + 2;
6204 if (len > PY_SSIZE_T_MAX / expandsize) {
6205 return PyErr_NoMemory();
6206 }
6207 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6208 if (repr == NULL) {
6209 return NULL;
6210 }
6211
6212 p = PyBytes_AS_STRING(repr);
6213 for (i = 0; i < len; i++) {
6214 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6215
6216 /* U+0000-U+00ff range */
6217 if (ch < 0x100) {
6218 if (ch >= ' ' && ch < 127) {
6219 if (ch != '\\') {
6220 /* Copy printable US ASCII as-is */
6221 *p++ = (char) ch;
6222 }
6223 /* Escape backslashes */
6224 else {
6225 *p++ = '\\';
6226 *p++ = '\\';
6227 }
6228 }
6229
6230 /* Map special whitespace to '\t', \n', '\r' */
6231 else if (ch == '\t') {
6232 *p++ = '\\';
6233 *p++ = 't';
6234 }
6235 else if (ch == '\n') {
6236 *p++ = '\\';
6237 *p++ = 'n';
6238 }
6239 else if (ch == '\r') {
6240 *p++ = '\\';
6241 *p++ = 'r';
6242 }
6243
6244 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6245 else {
6246 *p++ = '\\';
6247 *p++ = 'x';
6248 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6249 *p++ = Py_hexdigits[ch & 0x000F];
6250 }
6251 }
6252 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6253 else if (ch < 0x10000) {
6254 *p++ = '\\';
6255 *p++ = 'u';
6256 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6257 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6258 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6259 *p++ = Py_hexdigits[ch & 0x000F];
6260 }
6261 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6262 else {
6263
6264 /* Make sure that the first two digits are zero */
6265 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6266 *p++ = '\\';
6267 *p++ = 'U';
6268 *p++ = '0';
6269 *p++ = '0';
6270 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6271 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6272 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6273 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6274 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6275 *p++ = Py_hexdigits[ch & 0x0000000F];
6276 }
6277 }
6278
6279 assert(p - PyBytes_AS_STRING(repr) > 0);
6280 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6281 return NULL;
6282 }
6283 return repr;
6284 }
6285
6286 PyObject *
PyUnicode_EncodeUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)6287 PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6288 Py_ssize_t size)
6289 {
6290 PyObject *result;
6291 PyObject *tmp = PyUnicode_FromWideChar(s, size);
6292 if (tmp == NULL) {
6293 return NULL;
6294 }
6295
6296 result = PyUnicode_AsUnicodeEscapeString(tmp);
6297 Py_DECREF(tmp);
6298 return result;
6299 }
6300
6301 /* --- Raw Unicode Escape Codec ------------------------------------------- */
6302
/* Decode the `size` bytes at `s`, written in the "rawunicodeescape"
   encoding, into a new Unicode object.  Returns NULL with an exception set
   on failure.

   Only \uXXXX and \UXXXXXXXX are interpreted.  Every other byte is taken as
   the Unicode ordinal of the same value; a backslash followed by any other
   character is copied together with that character, and a backslash that is
   the last byte of the input is copied as an ordinary character.  Malformed
   or out-of-range escapes are passed to the error handler named by
   `errors`. */
PyObject *
PyUnicode_DecodeRawUnicodeEscape(const char *s,
                                 Py_ssize_t size,
                                 const char *errors)
{
    const char *starts = s;
    _PyUnicodeWriter writer;
    const char *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Escaped strings will always be longer than the resulting
       Unicode string, so we start with size here and then reduce the
       length after conversion to the true value. (But decoding error
       handler might have to resize the string) */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
        goto onError;
    }

    end = s + size;
    while (s < end) {
        unsigned char c = (unsigned char) *s++;
        Py_UCS4 ch;
        int count;
        Py_ssize_t startinpos;
        Py_ssize_t endinpos;
        const char *message;

/* Writes into space the writer has already reserved when `ch` fits the
   buffer's current maximum character, otherwise goes through the generic
   writer call. */
#define WRITE_CHAR(ch)                                                        \
            do {                                                              \
                if (ch <= writer.maxchar) {                                   \
                    assert(writer.pos < writer.size);                         \
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
                }                                                             \
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
                    goto onError;                                             \
                }                                                             \
            } while(0)

        /* Non-escape characters are interpreted as Unicode ordinals */
        if (c != '\\' || s >= end) {
            WRITE_CHAR(c);
            continue;
        }

        c = (unsigned char) *s++;
        if (c == 'u') {
            count = 4;
            message = "truncated \\uXXXX escape";
        }
        else if (c == 'U') {
            count = 8;
            message = "truncated \\UXXXXXXXX escape";
        }
        else {
            /* Not an escape: emit the backslash and the character. */
            assert(writer.pos < writer.size);
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
            WRITE_CHAR(c);
            continue;
        }
        /* s is now past "\u" / "\U"; error ranges start at the backslash. */
        startinpos = s - starts - 2;

        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
        for (ch = 0; count && s < end; ++s, --count) {
            c = (unsigned char)*s;
            ch <<= 4;
            if (c >= '0' && c <= '9') {
                ch += c - '0';
            }
            else if (c >= 'a' && c <= 'f') {
                ch += c - ('a' - 10);
            }
            else if (c >= 'A' && c <= 'F') {
                ch += c - ('A' - 10);
            }
            else {
                break;
            }
        }
        /* count == 0 means all digits were read; otherwise `message` still
           holds the "truncated ..." text. */
        if (!count) {
            if (ch <= MAX_UNICODE) {
                WRITE_CHAR(ch);
                continue;
            }
            message = "\\Uxxxxxxxx out of range";
        }

        /* Report [startinpos, endinpos) to the error handler.  min_length is
           updated to the current output position plus the remaining input
           before the call. */
        endinpos = s-starts;
        writer.min_length = end - s + writer.pos;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "rawunicodeescape", message,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer)) {
            goto onError;
        }
        assert(end - s <= writer.size - writer.pos);

#undef WRITE_CHAR
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;

}
6420
6421
6422 PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject * unicode)6423 PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6424 {
6425 PyObject *repr;
6426 char *p;
6427 Py_ssize_t expandsize, pos;
6428 int kind;
6429 void *data;
6430 Py_ssize_t len;
6431
6432 if (!PyUnicode_Check(unicode)) {
6433 PyErr_BadArgument();
6434 return NULL;
6435 }
6436 if (PyUnicode_READY(unicode) == -1) {
6437 return NULL;
6438 }
6439 kind = PyUnicode_KIND(unicode);
6440 data = PyUnicode_DATA(unicode);
6441 len = PyUnicode_GET_LENGTH(unicode);
6442 if (kind == PyUnicode_1BYTE_KIND) {
6443 return PyBytes_FromStringAndSize(data, len);
6444 }
6445
6446 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6447 bytes, and 1 byte characters 4. */
6448 expandsize = kind * 2 + 2;
6449
6450 if (len > PY_SSIZE_T_MAX / expandsize) {
6451 return PyErr_NoMemory();
6452 }
6453 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6454 if (repr == NULL) {
6455 return NULL;
6456 }
6457 if (len == 0) {
6458 return repr;
6459 }
6460
6461 p = PyBytes_AS_STRING(repr);
6462 for (pos = 0; pos < len; pos++) {
6463 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6464
6465 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6466 if (ch < 0x100) {
6467 *p++ = (char) ch;
6468 }
6469 /* U+0000-U+00ff range: Map 16-bit characters to '\uHHHH' */
6470 else if (ch < 0x10000) {
6471 *p++ = '\\';
6472 *p++ = 'u';
6473 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6474 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6475 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6476 *p++ = Py_hexdigits[ch & 15];
6477 }
6478 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6479 else {
6480 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6481 *p++ = '\\';
6482 *p++ = 'U';
6483 *p++ = '0';
6484 *p++ = '0';
6485 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6486 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6487 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6488 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6489 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6490 *p++ = Py_hexdigits[ch & 15];
6491 }
6492 }
6493
6494 assert(p > PyBytes_AS_STRING(repr));
6495 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6496 return NULL;
6497 }
6498 return repr;
6499 }
6500
6501 PyObject *
PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE * s,Py_ssize_t size)6502 PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6503 Py_ssize_t size)
6504 {
6505 PyObject *result;
6506 PyObject *tmp = PyUnicode_FromWideChar(s, size);
6507 if (tmp == NULL)
6508 return NULL;
6509 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6510 Py_DECREF(tmp);
6511 return result;
6512 }
6513
6514 /* --- Unicode Internal Codec ------------------------------------------- */
6515
/* Decode the `size` bytes at `s` as a raw array of Py_UNICODE values (the
   deprecated "unicode_internal" codec) into a new Unicode object.  Returns
   NULL with an exception set on failure.

   A DeprecationWarning is issued on every call; if the warning is raised as
   an error the function returns NULL before doing any work.  A negative
   `size` is rejected with PyErr_BadInternalCall().  Trailing bytes that do
   not make up a whole Py_UNICODE and (when Py_UNICODE_WIDE is defined)
   values above 0x10FFFF are passed to the error handler named by
   `errors`. */
PyObject *
_PyUnicode_DecodeUnicodeInternal(const char *s,
                                 Py_ssize_t size,
                                 const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const char *end;
    const char *reason;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (PyErr_WarnEx(PyExc_DeprecationWarning,
                     "unicode_internal codec has been deprecated",
                     1))
        return NULL;

    if (size < 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (size == 0)
        _Py_RETURN_UNICODE_EMPTY();

    _PyUnicodeWriter_Init(&writer);
    if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
        PyErr_NoMemory();
        goto onError;
    }
    /* Number of Py_UNICODE units in the input, rounded up. */
    writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;

    end = s + size;
    while (s < end) {
        Py_UNICODE uch;
        Py_UCS4 ch;
        if (end - s < Py_UNICODE_SIZE) {
            /* The error range runs from s to the end of the input. */
            endinpos = end-starts;
            reason = "truncated input";
            goto error;
        }
        /* We copy the raw representation one byte at a time because the
           pointer may be unaligned (see test_codeccallbacks). */
        ((char *) &uch)[0] = s[0];
        ((char *) &uch)[1] = s[1];
#ifdef Py_UNICODE_WIDE
        ((char *) &uch)[2] = s[2];
        ((char *) &uch)[3] = s[3];
#endif
        ch = uch;
#ifdef Py_UNICODE_WIDE
        /* We have to sanity check the raw data, otherwise doom looms for
           some malformed UCS-4 data. */
        if (ch > 0x10ffff) {
            /* s has not been advanced yet, so the range covers this unit. */
            endinpos = s - starts + Py_UNICODE_SIZE;
            reason = "illegal code point (> 0x10FFFF)";
            goto error;
        }
#endif
        s += Py_UNICODE_SIZE;
#ifndef Py_UNICODE_WIDE
        /* Narrow Py_UNICODE: a high surrogate immediately followed by a low
           surrogate is combined into one code point; otherwise the unit is
           written out on its own. */
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
        {
            Py_UNICODE uch2;
            ((char *) &uch2)[0] = s[0];
            ((char *) &uch2)[1] = s[1];
            if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
            {
                ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
                s += Py_UNICODE_SIZE;
            }
        }
#endif

        if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
            goto onError;
        continue;

  error:
        /* Reached only through the gotos above, with endinpos and reason
           already set and s still pointing at the offending unit. */
        startinpos = s - starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "unicode_internal", reason,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6615
6616 /* --- Latin-1 Codec ------------------------------------------------------ */
6617
6618 PyObject *
PyUnicode_DecodeLatin1(const char * s,Py_ssize_t size,const char * errors)6619 PyUnicode_DecodeLatin1(const char *s,
6620 Py_ssize_t size,
6621 const char *errors)
6622 {
6623 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6624 return _PyUnicode_FromUCS1((unsigned char*)s, size);
6625 }
6626
6627 /* create or adjust a UnicodeEncodeError */
6628 static void
make_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6629 make_encode_exception(PyObject **exceptionObject,
6630 const char *encoding,
6631 PyObject *unicode,
6632 Py_ssize_t startpos, Py_ssize_t endpos,
6633 const char *reason)
6634 {
6635 if (*exceptionObject == NULL) {
6636 *exceptionObject = PyObject_CallFunction(
6637 PyExc_UnicodeEncodeError, "sOnns",
6638 encoding, unicode, startpos, endpos, reason);
6639 }
6640 else {
6641 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6642 goto onError;
6643 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6644 goto onError;
6645 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6646 goto onError;
6647 return;
6648 onError:
6649 Py_CLEAR(*exceptionObject);
6650 }
6651 }
6652
6653 /* raises a UnicodeEncodeError */
6654 static void
raise_encode_exception(PyObject ** exceptionObject,const char * encoding,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)6655 raise_encode_exception(PyObject **exceptionObject,
6656 const char *encoding,
6657 PyObject *unicode,
6658 Py_ssize_t startpos, Py_ssize_t endpos,
6659 const char *reason)
6660 {
6661 make_encode_exception(exceptionObject,
6662 encoding, unicode, startpos, endpos, reason);
6663 if (*exceptionObject != NULL)
6664 PyCodec_StrictErrors(*exceptionObject);
6665 }
6666
6667 /* error handling callback helper:
6668 build arguments, call the callback and check the arguments,
6669 put the result into newpos and return the replacement string, which
6670 has to be freed by the caller */
6671 static PyObject *
unicode_encode_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * encoding,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)6672 unicode_encode_call_errorhandler(const char *errors,
6673 PyObject **errorHandler,
6674 const char *encoding, const char *reason,
6675 PyObject *unicode, PyObject **exceptionObject,
6676 Py_ssize_t startpos, Py_ssize_t endpos,
6677 Py_ssize_t *newpos)
6678 {
6679 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6680 Py_ssize_t len;
6681 PyObject *restuple;
6682 PyObject *resunicode;
6683
6684 if (*errorHandler == NULL) {
6685 *errorHandler = PyCodec_LookupError(errors);
6686 if (*errorHandler == NULL)
6687 return NULL;
6688 }
6689
6690 if (PyUnicode_READY(unicode) == -1)
6691 return NULL;
6692 len = PyUnicode_GET_LENGTH(unicode);
6693
6694 make_encode_exception(exceptionObject,
6695 encoding, unicode, startpos, endpos, reason);
6696 if (*exceptionObject == NULL)
6697 return NULL;
6698
6699 restuple = PyObject_CallFunctionObjArgs(
6700 *errorHandler, *exceptionObject, NULL);
6701 if (restuple == NULL)
6702 return NULL;
6703 if (!PyTuple_Check(restuple)) {
6704 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6705 Py_DECREF(restuple);
6706 return NULL;
6707 }
6708 if (!PyArg_ParseTuple(restuple, argparse,
6709 &resunicode, newpos)) {
6710 Py_DECREF(restuple);
6711 return NULL;
6712 }
6713 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6714 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6715 Py_DECREF(restuple);
6716 return NULL;
6717 }
6718 if (*newpos<0)
6719 *newpos = len + *newpos;
6720 if (*newpos<0 || *newpos>len) {
6721 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6722 Py_DECREF(restuple);
6723 return NULL;
6724 }
6725 Py_INCREF(resunicode);
6726 Py_DECREF(restuple);
6727 return resunicode;
6728 }
6729
6730 static PyObject *
unicode_encode_ucs1(PyObject * unicode,const char * errors,const Py_UCS4 limit)6731 unicode_encode_ucs1(PyObject *unicode,
6732 const char *errors,
6733 const Py_UCS4 limit)
6734 {
6735 /* input state */
6736 Py_ssize_t pos=0, size;
6737 int kind;
6738 void *data;
6739 /* pointer into the output */
6740 char *str;
6741 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6742 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6743 PyObject *error_handler_obj = NULL;
6744 PyObject *exc = NULL;
6745 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6746 PyObject *rep = NULL;
6747 /* output object */
6748 _PyBytesWriter writer;
6749
6750 if (PyUnicode_READY(unicode) == -1)
6751 return NULL;
6752 size = PyUnicode_GET_LENGTH(unicode);
6753 kind = PyUnicode_KIND(unicode);
6754 data = PyUnicode_DATA(unicode);
6755 /* allocate enough for a simple encoding without
6756 replacements, if we need more, we'll resize */
6757 if (size == 0)
6758 return PyBytes_FromStringAndSize(NULL, 0);
6759
6760 _PyBytesWriter_Init(&writer);
6761 str = _PyBytesWriter_Alloc(&writer, size);
6762 if (str == NULL)
6763 return NULL;
6764
6765 while (pos < size) {
6766 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6767
6768 /* can we encode this? */
6769 if (ch < limit) {
6770 /* no overflow check, because we know that the space is enough */
6771 *str++ = (char)ch;
6772 ++pos;
6773 }
6774 else {
6775 Py_ssize_t newpos, i;
6776 /* startpos for collecting unencodable chars */
6777 Py_ssize_t collstart = pos;
6778 Py_ssize_t collend = collstart + 1;
6779 /* find all unecodable characters */
6780
6781 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
6782 ++collend;
6783
6784 /* Only overallocate the buffer if it's not the last write */
6785 writer.overallocate = (collend < size);
6786
6787 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6788 if (error_handler == _Py_ERROR_UNKNOWN)
6789 error_handler = get_error_handler(errors);
6790
6791 switch (error_handler) {
6792 case _Py_ERROR_STRICT:
6793 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6794 goto onError;
6795
6796 case _Py_ERROR_REPLACE:
6797 memset(str, '?', collend - collstart);
6798 str += (collend - collstart);
6799 /* fall through */
6800 case _Py_ERROR_IGNORE:
6801 pos = collend;
6802 break;
6803
6804 case _Py_ERROR_BACKSLASHREPLACE:
6805 /* subtract preallocated bytes */
6806 writer.min_size -= (collend - collstart);
6807 str = backslashreplace(&writer, str,
6808 unicode, collstart, collend);
6809 if (str == NULL)
6810 goto onError;
6811 pos = collend;
6812 break;
6813
6814 case _Py_ERROR_XMLCHARREFREPLACE:
6815 /* subtract preallocated bytes */
6816 writer.min_size -= (collend - collstart);
6817 str = xmlcharrefreplace(&writer, str,
6818 unicode, collstart, collend);
6819 if (str == NULL)
6820 goto onError;
6821 pos = collend;
6822 break;
6823
6824 case _Py_ERROR_SURROGATEESCAPE:
6825 for (i = collstart; i < collend; ++i) {
6826 ch = PyUnicode_READ(kind, data, i);
6827 if (ch < 0xdc80 || 0xdcff < ch) {
6828 /* Not a UTF-8b surrogate */
6829 break;
6830 }
6831 *str++ = (char)(ch - 0xdc00);
6832 ++pos;
6833 }
6834 if (i >= collend)
6835 break;
6836 collstart = pos;
6837 assert(collstart != collend);
6838 /* fall through */
6839
6840 default:
6841 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6842 encoding, reason, unicode, &exc,
6843 collstart, collend, &newpos);
6844 if (rep == NULL)
6845 goto onError;
6846
6847 /* subtract preallocated bytes */
6848 writer.min_size -= newpos - collstart;
6849
6850 if (PyBytes_Check(rep)) {
6851 /* Directly copy bytes result to output. */
6852 str = _PyBytesWriter_WriteBytes(&writer, str,
6853 PyBytes_AS_STRING(rep),
6854 PyBytes_GET_SIZE(rep));
6855 }
6856 else {
6857 assert(PyUnicode_Check(rep));
6858
6859 if (PyUnicode_READY(rep) < 0)
6860 goto onError;
6861
6862 if (limit == 256 ?
6863 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6864 !PyUnicode_IS_ASCII(rep))
6865 {
6866 /* Not all characters are smaller than limit */
6867 raise_encode_exception(&exc, encoding, unicode,
6868 collstart, collend, reason);
6869 goto onError;
6870 }
6871 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6872 str = _PyBytesWriter_WriteBytes(&writer, str,
6873 PyUnicode_DATA(rep),
6874 PyUnicode_GET_LENGTH(rep));
6875 }
6876 if (str == NULL)
6877 goto onError;
6878
6879 pos = newpos;
6880 Py_CLEAR(rep);
6881 }
6882
6883 /* If overallocation was disabled, ensure that it was the last
6884 write. Otherwise, we missed an optimization */
6885 assert(writer.overallocate || pos == size);
6886 }
6887 }
6888
6889 Py_XDECREF(error_handler_obj);
6890 Py_XDECREF(exc);
6891 return _PyBytesWriter_Finish(&writer, str);
6892
6893 onError:
6894 Py_XDECREF(rep);
6895 _PyBytesWriter_Dealloc(&writer);
6896 Py_XDECREF(error_handler_obj);
6897 Py_XDECREF(exc);
6898 return NULL;
6899 }
6900
6901 /* Deprecated */
6902 PyObject *
PyUnicode_EncodeLatin1(const Py_UNICODE * p,Py_ssize_t size,const char * errors)6903 PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6904 Py_ssize_t size,
6905 const char *errors)
6906 {
6907 PyObject *result;
6908 PyObject *unicode = PyUnicode_FromWideChar(p, size);
6909 if (unicode == NULL)
6910 return NULL;
6911 result = unicode_encode_ucs1(unicode, errors, 256);
6912 Py_DECREF(unicode);
6913 return result;
6914 }
6915
6916 PyObject *
_PyUnicode_AsLatin1String(PyObject * unicode,const char * errors)6917 _PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6918 {
6919 if (!PyUnicode_Check(unicode)) {
6920 PyErr_BadArgument();
6921 return NULL;
6922 }
6923 if (PyUnicode_READY(unicode) == -1)
6924 return NULL;
6925 /* Fast path: if it is a one-byte string, construct
6926 bytes object directly. */
6927 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6928 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6929 PyUnicode_GET_LENGTH(unicode));
6930 /* Non-Latin-1 characters present. Defer to above function to
6931 raise the exception. */
6932 return unicode_encode_ucs1(unicode, errors, 256);
6933 }
6934
6935 PyObject*
PyUnicode_AsLatin1String(PyObject * unicode)6936 PyUnicode_AsLatin1String(PyObject *unicode)
6937 {
6938 return _PyUnicode_AsLatin1String(unicode, NULL);
6939 }
6940
6941 /* --- 7-bit ASCII Codec -------------------------------------------------- */
6942
6943 PyObject *
PyUnicode_DecodeASCII(const char * s,Py_ssize_t size,const char * errors)6944 PyUnicode_DecodeASCII(const char *s,
6945 Py_ssize_t size,
6946 const char *errors)
6947 {
6948 const char *starts = s;
6949 _PyUnicodeWriter writer;
6950 int kind;
6951 void *data;
6952 Py_ssize_t startinpos;
6953 Py_ssize_t endinpos;
6954 Py_ssize_t outpos;
6955 const char *e;
6956 PyObject *error_handler_obj = NULL;
6957 PyObject *exc = NULL;
6958 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6959
6960 if (size == 0)
6961 _Py_RETURN_UNICODE_EMPTY();
6962
6963 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6964 if (size == 1 && (unsigned char)s[0] < 128)
6965 return get_latin1_char((unsigned char)s[0]);
6966
6967 _PyUnicodeWriter_Init(&writer);
6968 writer.min_length = size;
6969 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
6970 return NULL;
6971
6972 e = s + size;
6973 data = writer.data;
6974 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6975 writer.pos = outpos;
6976 if (writer.pos == size)
6977 return _PyUnicodeWriter_Finish(&writer);
6978
6979 s += writer.pos;
6980 kind = writer.kind;
6981 while (s < e) {
6982 unsigned char c = (unsigned char)*s;
6983 if (c < 128) {
6984 PyUnicode_WRITE(kind, data, writer.pos, c);
6985 writer.pos++;
6986 ++s;
6987 continue;
6988 }
6989
6990 /* byte outsize range 0x00..0x7f: call the error handler */
6991
6992 if (error_handler == _Py_ERROR_UNKNOWN)
6993 error_handler = get_error_handler(errors);
6994
6995 switch (error_handler)
6996 {
6997 case _Py_ERROR_REPLACE:
6998 case _Py_ERROR_SURROGATEESCAPE:
6999 /* Fast-path: the error handler only writes one character,
7000 but we may switch to UCS2 at the first write */
7001 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7002 goto onError;
7003 kind = writer.kind;
7004 data = writer.data;
7005
7006 if (error_handler == _Py_ERROR_REPLACE)
7007 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7008 else
7009 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7010 writer.pos++;
7011 ++s;
7012 break;
7013
7014 case _Py_ERROR_IGNORE:
7015 ++s;
7016 break;
7017
7018 default:
7019 startinpos = s-starts;
7020 endinpos = startinpos + 1;
7021 if (unicode_decode_call_errorhandler_writer(
7022 errors, &error_handler_obj,
7023 "ascii", "ordinal not in range(128)",
7024 &starts, &e, &startinpos, &endinpos, &exc, &s,
7025 &writer))
7026 goto onError;
7027 kind = writer.kind;
7028 data = writer.data;
7029 }
7030 }
7031 Py_XDECREF(error_handler_obj);
7032 Py_XDECREF(exc);
7033 return _PyUnicodeWriter_Finish(&writer);
7034
7035 onError:
7036 _PyUnicodeWriter_Dealloc(&writer);
7037 Py_XDECREF(error_handler_obj);
7038 Py_XDECREF(exc);
7039 return NULL;
7040 }
7041
7042 /* Deprecated */
7043 PyObject *
PyUnicode_EncodeASCII(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7044 PyUnicode_EncodeASCII(const Py_UNICODE *p,
7045 Py_ssize_t size,
7046 const char *errors)
7047 {
7048 PyObject *result;
7049 PyObject *unicode = PyUnicode_FromWideChar(p, size);
7050 if (unicode == NULL)
7051 return NULL;
7052 result = unicode_encode_ucs1(unicode, errors, 128);
7053 Py_DECREF(unicode);
7054 return result;
7055 }
7056
7057 PyObject *
_PyUnicode_AsASCIIString(PyObject * unicode,const char * errors)7058 _PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7059 {
7060 if (!PyUnicode_Check(unicode)) {
7061 PyErr_BadArgument();
7062 return NULL;
7063 }
7064 if (PyUnicode_READY(unicode) == -1)
7065 return NULL;
7066 /* Fast path: if it is an ASCII-only string, construct bytes object
7067 directly. Else defer to above function to raise the exception. */
7068 if (PyUnicode_IS_ASCII(unicode))
7069 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7070 PyUnicode_GET_LENGTH(unicode));
7071 return unicode_encode_ucs1(unicode, errors, 128);
7072 }
7073
7074 PyObject *
PyUnicode_AsASCIIString(PyObject * unicode)7075 PyUnicode_AsASCIIString(PyObject *unicode)
7076 {
7077 return _PyUnicode_AsASCIIString(unicode, NULL);
7078 }
7079
7080 #ifdef MS_WINDOWS
7081
7082 /* --- MBCS codecs for Windows -------------------------------------------- */
7083
/* When int is narrower than size_t, the stateful code page decoder below
   splits inputs larger than DECODING_CHUNK_SIZE into several chunks. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
#define DECODING_CHUNK_SIZE (INT_MAX/4)

/* Fallback value, used only when the included headers do not already
   define the flag. */
#ifndef WC_ERR_INVALID_CHARS
# define WC_ERR_INVALID_CHARS 0x0080
#endif
7097
7098 static const char*
code_page_name(UINT code_page,PyObject ** obj)7099 code_page_name(UINT code_page, PyObject **obj)
7100 {
7101 *obj = NULL;
7102 if (code_page == CP_ACP)
7103 return "mbcs";
7104 if (code_page == CP_UTF7)
7105 return "CP_UTF7";
7106 if (code_page == CP_UTF8)
7107 return "CP_UTF8";
7108
7109 *obj = PyBytes_FromFormat("cp%u", code_page);
7110 if (*obj == NULL)
7111 return NULL;
7112 return PyBytes_AS_STRING(*obj);
7113 }
7114
7115 static DWORD
decode_code_page_flags(UINT code_page)7116 decode_code_page_flags(UINT code_page)
7117 {
7118 if (code_page == CP_UTF7) {
7119 /* The CP_UTF7 decoder only supports flags=0 */
7120 return 0;
7121 }
7122 else
7123 return MB_ERR_INVALID_CHARS;
7124 }
7125
7126 /*
7127 * Decode a byte string from a Windows code page into unicode object in strict
7128 * mode.
7129 *
7130 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7131 * OSError and returns -1 on other error.
7132 */
7133 static int
decode_code_page_strict(UINT code_page,PyObject ** v,const char * in,int insize)7134 decode_code_page_strict(UINT code_page,
7135 PyObject **v,
7136 const char *in,
7137 int insize)
7138 {
7139 DWORD flags = MB_ERR_INVALID_CHARS;
7140 wchar_t *out;
7141 DWORD outsize;
7142
7143 /* First get the size of the result */
7144 assert(insize > 0);
7145 while ((outsize = MultiByteToWideChar(code_page, flags,
7146 in, insize, NULL, 0)) <= 0)
7147 {
7148 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7149 goto error;
7150 }
7151 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7152 flags = 0;
7153 }
7154
7155 if (*v == NULL) {
7156 /* Create unicode object */
7157 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
7158 *v = (PyObject*)_PyUnicode_New(outsize);
7159 if (*v == NULL)
7160 return -1;
7161 out = PyUnicode_AS_UNICODE(*v);
7162 }
7163 else {
7164 /* Extend unicode object */
7165 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7166 if (unicode_resize(v, n + outsize) < 0)
7167 return -1;
7168 out = PyUnicode_AS_UNICODE(*v) + n;
7169 }
7170
7171 /* Do the conversion */
7172 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7173 if (outsize <= 0)
7174 goto error;
7175 return insize;
7176
7177 error:
7178 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7179 return -2;
7180 PyErr_SetFromWindowsErr(0);
7181 return -1;
7182 }
7183
7184 /*
7185 * Decode a byte string from a code page into unicode object with an error
7186 * handler.
7187 *
7188 * Returns consumed size if succeed, or raise an OSError or
7189 * UnicodeDecodeError exception and returns -1 on error.
7190 */
7191 static int
decode_code_page_errors(UINT code_page,PyObject ** v,const char * in,const int size,const char * errors,int final)7192 decode_code_page_errors(UINT code_page,
7193 PyObject **v,
7194 const char *in, const int size,
7195 const char *errors, int final)
7196 {
7197 const char *startin = in;
7198 const char *endin = in + size;
7199 DWORD flags = MB_ERR_INVALID_CHARS;
7200 /* Ideally, we should get reason from FormatMessage. This is the Windows
7201 2000 English version of the message. */
7202 const char *reason = "No mapping for the Unicode character exists "
7203 "in the target code page.";
7204 /* each step cannot decode more than 1 character, but a character can be
7205 represented as a surrogate pair */
7206 wchar_t buffer[2], *out;
7207 int insize;
7208 Py_ssize_t outsize;
7209 PyObject *errorHandler = NULL;
7210 PyObject *exc = NULL;
7211 PyObject *encoding_obj = NULL;
7212 const char *encoding;
7213 DWORD err;
7214 int ret = -1;
7215
7216 assert(size > 0);
7217
7218 encoding = code_page_name(code_page, &encoding_obj);
7219 if (encoding == NULL)
7220 return -1;
7221
7222 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7223 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7224 UnicodeDecodeError. */
7225 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7226 if (exc != NULL) {
7227 PyCodec_StrictErrors(exc);
7228 Py_CLEAR(exc);
7229 }
7230 goto error;
7231 }
7232
7233 if (*v == NULL) {
7234 /* Create unicode object */
7235 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7236 PyErr_NoMemory();
7237 goto error;
7238 }
7239 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
7240 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
7241 if (*v == NULL)
7242 goto error;
7243 out = PyUnicode_AS_UNICODE(*v);
7244 }
7245 else {
7246 /* Extend unicode object */
7247 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7248 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7249 PyErr_NoMemory();
7250 goto error;
7251 }
7252 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
7253 goto error;
7254 out = PyUnicode_AS_UNICODE(*v) + n;
7255 }
7256
7257 /* Decode the byte string character per character */
7258 while (in < endin)
7259 {
7260 /* Decode a character */
7261 insize = 1;
7262 do
7263 {
7264 outsize = MultiByteToWideChar(code_page, flags,
7265 in, insize,
7266 buffer, Py_ARRAY_LENGTH(buffer));
7267 if (outsize > 0)
7268 break;
7269 err = GetLastError();
7270 if (err == ERROR_INVALID_FLAGS && flags) {
7271 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7272 flags = 0;
7273 continue;
7274 }
7275 if (err != ERROR_NO_UNICODE_TRANSLATION
7276 && err != ERROR_INSUFFICIENT_BUFFER)
7277 {
7278 PyErr_SetFromWindowsErr(0);
7279 goto error;
7280 }
7281 insize++;
7282 }
7283 /* 4=maximum length of a UTF-8 sequence */
7284 while (insize <= 4 && (in + insize) <= endin);
7285
7286 if (outsize <= 0) {
7287 Py_ssize_t startinpos, endinpos, outpos;
7288
7289 /* last character in partial decode? */
7290 if (in + insize >= endin && !final)
7291 break;
7292
7293 startinpos = in - startin;
7294 endinpos = startinpos + 1;
7295 outpos = out - PyUnicode_AS_UNICODE(*v);
7296 if (unicode_decode_call_errorhandler_wchar(
7297 errors, &errorHandler,
7298 encoding, reason,
7299 &startin, &endin, &startinpos, &endinpos, &exc, &in,
7300 v, &outpos))
7301 {
7302 goto error;
7303 }
7304 out = PyUnicode_AS_UNICODE(*v) + outpos;
7305 }
7306 else {
7307 in += insize;
7308 memcpy(out, buffer, outsize * sizeof(wchar_t));
7309 out += outsize;
7310 }
7311 }
7312
7313 /* write a NUL character at the end */
7314 *out = 0;
7315
7316 /* Extend unicode object */
7317 outsize = out - PyUnicode_AS_UNICODE(*v);
7318 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
7319 if (unicode_resize(v, outsize) < 0)
7320 goto error;
7321 /* (in - startin) <= size and size is an int */
7322 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7323
7324 error:
7325 Py_XDECREF(encoding_obj);
7326 Py_XDECREF(errorHandler);
7327 Py_XDECREF(exc);
7328 return ret;
7329 }
7330
7331 static PyObject *
decode_code_page_stateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7332 decode_code_page_stateful(int code_page,
7333 const char *s, Py_ssize_t size,
7334 const char *errors, Py_ssize_t *consumed)
7335 {
7336 PyObject *v = NULL;
7337 int chunk_size, final, converted, done;
7338
7339 if (code_page < 0) {
7340 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7341 return NULL;
7342 }
7343 if (size < 0) {
7344 PyErr_BadInternalCall();
7345 return NULL;
7346 }
7347
7348 if (consumed)
7349 *consumed = 0;
7350
7351 do
7352 {
7353 #ifdef NEED_RETRY
7354 if (size > DECODING_CHUNK_SIZE) {
7355 chunk_size = DECODING_CHUNK_SIZE;
7356 final = 0;
7357 done = 0;
7358 }
7359 else
7360 #endif
7361 {
7362 chunk_size = (int)size;
7363 final = (consumed == NULL);
7364 done = 1;
7365 }
7366
7367 if (chunk_size == 0 && done) {
7368 if (v != NULL)
7369 break;
7370 _Py_RETURN_UNICODE_EMPTY();
7371 }
7372
7373 converted = decode_code_page_strict(code_page, &v,
7374 s, chunk_size);
7375 if (converted == -2)
7376 converted = decode_code_page_errors(code_page, &v,
7377 s, chunk_size,
7378 errors, final);
7379 assert(converted != 0 || done);
7380
7381 if (converted < 0) {
7382 Py_XDECREF(v);
7383 return NULL;
7384 }
7385
7386 if (consumed)
7387 *consumed += converted;
7388
7389 s += converted;
7390 size -= converted;
7391 } while (!done);
7392
7393 return unicode_result(v);
7394 }
7395
7396 PyObject *
PyUnicode_DecodeCodePageStateful(int code_page,const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7397 PyUnicode_DecodeCodePageStateful(int code_page,
7398 const char *s,
7399 Py_ssize_t size,
7400 const char *errors,
7401 Py_ssize_t *consumed)
7402 {
7403 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7404 }
7405
7406 PyObject *
PyUnicode_DecodeMBCSStateful(const char * s,Py_ssize_t size,const char * errors,Py_ssize_t * consumed)7407 PyUnicode_DecodeMBCSStateful(const char *s,
7408 Py_ssize_t size,
7409 const char *errors,
7410 Py_ssize_t *consumed)
7411 {
7412 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7413 }
7414
7415 PyObject *
PyUnicode_DecodeMBCS(const char * s,Py_ssize_t size,const char * errors)7416 PyUnicode_DecodeMBCS(const char *s,
7417 Py_ssize_t size,
7418 const char *errors)
7419 {
7420 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7421 }
7422
7423 static DWORD
encode_code_page_flags(UINT code_page,const char * errors)7424 encode_code_page_flags(UINT code_page, const char *errors)
7425 {
7426 if (code_page == CP_UTF8) {
7427 return WC_ERR_INVALID_CHARS;
7428 }
7429 else if (code_page == CP_UTF7) {
7430 /* CP_UTF7 only supports flags=0 */
7431 return 0;
7432 }
7433 else {
7434 if (errors != NULL && strcmp(errors, "replace") == 0)
7435 return 0;
7436 else
7437 return WC_NO_BEST_FIT_CHARS;
7438 }
7439 }
7440
7441 /*
7442 * Encode a Unicode string to a Windows code page into a byte string in strict
7443 * mode.
7444 *
7445 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7446 * an OSError and returns -1 on other error.
7447 */
7448 static int
encode_code_page_strict(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t offset,int len,const char * errors)7449 encode_code_page_strict(UINT code_page, PyObject **outbytes,
7450 PyObject *unicode, Py_ssize_t offset, int len,
7451 const char* errors)
7452 {
7453 BOOL usedDefaultChar = FALSE;
7454 BOOL *pusedDefaultChar = &usedDefaultChar;
7455 int outsize;
7456 wchar_t *p;
7457 Py_ssize_t size;
7458 const DWORD flags = encode_code_page_flags(code_page, NULL);
7459 char *out;
7460 /* Create a substring so that we can get the UTF-16 representation
7461 of just the slice under consideration. */
7462 PyObject *substring;
7463
7464 assert(len > 0);
7465
7466 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7467 pusedDefaultChar = &usedDefaultChar;
7468 else
7469 pusedDefaultChar = NULL;
7470
7471 substring = PyUnicode_Substring(unicode, offset, offset+len);
7472 if (substring == NULL)
7473 return -1;
7474 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7475 if (p == NULL) {
7476 Py_DECREF(substring);
7477 return -1;
7478 }
7479 assert(size <= INT_MAX);
7480
7481 /* First get the size of the result */
7482 outsize = WideCharToMultiByte(code_page, flags,
7483 p, (int)size,
7484 NULL, 0,
7485 NULL, pusedDefaultChar);
7486 if (outsize <= 0)
7487 goto error;
7488 /* If we used a default char, then we failed! */
7489 if (pusedDefaultChar && *pusedDefaultChar) {
7490 Py_DECREF(substring);
7491 return -2;
7492 }
7493
7494 if (*outbytes == NULL) {
7495 /* Create string object */
7496 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7497 if (*outbytes == NULL) {
7498 Py_DECREF(substring);
7499 return -1;
7500 }
7501 out = PyBytes_AS_STRING(*outbytes);
7502 }
7503 else {
7504 /* Extend string object */
7505 const Py_ssize_t n = PyBytes_Size(*outbytes);
7506 if (outsize > PY_SSIZE_T_MAX - n) {
7507 PyErr_NoMemory();
7508 Py_DECREF(substring);
7509 return -1;
7510 }
7511 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7512 Py_DECREF(substring);
7513 return -1;
7514 }
7515 out = PyBytes_AS_STRING(*outbytes) + n;
7516 }
7517
7518 /* Do the conversion */
7519 outsize = WideCharToMultiByte(code_page, flags,
7520 p, (int)size,
7521 out, outsize,
7522 NULL, pusedDefaultChar);
7523 Py_CLEAR(substring);
7524 if (outsize <= 0)
7525 goto error;
7526 if (pusedDefaultChar && *pusedDefaultChar)
7527 return -2;
7528 return 0;
7529
7530 error:
7531 Py_XDECREF(substring);
7532 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7533 return -2;
7534 PyErr_SetFromWindowsErr(0);
7535 return -1;
7536 }
7537
7538 /*
7539 * Encode a Unicode string to a Windows code page into a byte string using an
7540 * error handler.
7541 *
7542 * Returns consumed characters if succeed, or raise an OSError and returns
7543 * -1 on other error.
7544 */
7545 static int
encode_code_page_errors(UINT code_page,PyObject ** outbytes,PyObject * unicode,Py_ssize_t unicode_offset,Py_ssize_t insize,const char * errors)7546 encode_code_page_errors(UINT code_page, PyObject **outbytes,
7547 PyObject *unicode, Py_ssize_t unicode_offset,
7548 Py_ssize_t insize, const char* errors)
7549 {
7550 const DWORD flags = encode_code_page_flags(code_page, errors);
7551 Py_ssize_t pos = unicode_offset;
7552 Py_ssize_t endin = unicode_offset + insize;
7553 /* Ideally, we should get reason from FormatMessage. This is the Windows
7554 2000 English version of the message. */
7555 const char *reason = "invalid character";
7556 /* 4=maximum length of a UTF-8 sequence */
7557 char buffer[4];
7558 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7559 Py_ssize_t outsize;
7560 char *out;
7561 PyObject *errorHandler = NULL;
7562 PyObject *exc = NULL;
7563 PyObject *encoding_obj = NULL;
7564 const char *encoding;
7565 Py_ssize_t newpos, newoutsize;
7566 PyObject *rep;
7567 int ret = -1;
7568
7569 assert(insize > 0);
7570
7571 encoding = code_page_name(code_page, &encoding_obj);
7572 if (encoding == NULL)
7573 return -1;
7574
7575 if (errors == NULL || strcmp(errors, "strict") == 0) {
7576 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7577 then we raise a UnicodeEncodeError. */
7578 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7579 if (exc != NULL) {
7580 PyCodec_StrictErrors(exc);
7581 Py_DECREF(exc);
7582 }
7583 Py_XDECREF(encoding_obj);
7584 return -1;
7585 }
7586
7587 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7588 pusedDefaultChar = &usedDefaultChar;
7589 else
7590 pusedDefaultChar = NULL;
7591
7592 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7593 PyErr_NoMemory();
7594 goto error;
7595 }
7596 outsize = insize * Py_ARRAY_LENGTH(buffer);
7597
7598 if (*outbytes == NULL) {
7599 /* Create string object */
7600 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7601 if (*outbytes == NULL)
7602 goto error;
7603 out = PyBytes_AS_STRING(*outbytes);
7604 }
7605 else {
7606 /* Extend string object */
7607 Py_ssize_t n = PyBytes_Size(*outbytes);
7608 if (n > PY_SSIZE_T_MAX - outsize) {
7609 PyErr_NoMemory();
7610 goto error;
7611 }
7612 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7613 goto error;
7614 out = PyBytes_AS_STRING(*outbytes) + n;
7615 }
7616
7617 /* Encode the string character per character */
7618 while (pos < endin)
7619 {
7620 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7621 wchar_t chars[2];
7622 int charsize;
7623 if (ch < 0x10000) {
7624 chars[0] = (wchar_t)ch;
7625 charsize = 1;
7626 }
7627 else {
7628 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7629 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
7630 charsize = 2;
7631 }
7632
7633 outsize = WideCharToMultiByte(code_page, flags,
7634 chars, charsize,
7635 buffer, Py_ARRAY_LENGTH(buffer),
7636 NULL, pusedDefaultChar);
7637 if (outsize > 0) {
7638 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7639 {
7640 pos++;
7641 memcpy(out, buffer, outsize);
7642 out += outsize;
7643 continue;
7644 }
7645 }
7646 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7647 PyErr_SetFromWindowsErr(0);
7648 goto error;
7649 }
7650
7651 rep = unicode_encode_call_errorhandler(
7652 errors, &errorHandler, encoding, reason,
7653 unicode, &exc,
7654 pos, pos + 1, &newpos);
7655 if (rep == NULL)
7656 goto error;
7657 pos = newpos;
7658
7659 if (PyBytes_Check(rep)) {
7660 outsize = PyBytes_GET_SIZE(rep);
7661 if (outsize != 1) {
7662 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7663 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7664 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7665 Py_DECREF(rep);
7666 goto error;
7667 }
7668 out = PyBytes_AS_STRING(*outbytes) + offset;
7669 }
7670 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7671 out += outsize;
7672 }
7673 else {
7674 Py_ssize_t i;
7675 enum PyUnicode_Kind kind;
7676 void *data;
7677
7678 if (PyUnicode_READY(rep) == -1) {
7679 Py_DECREF(rep);
7680 goto error;
7681 }
7682
7683 outsize = PyUnicode_GET_LENGTH(rep);
7684 if (outsize != 1) {
7685 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7686 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7687 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7688 Py_DECREF(rep);
7689 goto error;
7690 }
7691 out = PyBytes_AS_STRING(*outbytes) + offset;
7692 }
7693 kind = PyUnicode_KIND(rep);
7694 data = PyUnicode_DATA(rep);
7695 for (i=0; i < outsize; i++) {
7696 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7697 if (ch > 127) {
7698 raise_encode_exception(&exc,
7699 encoding, unicode,
7700 pos, pos + 1,
7701 "unable to encode error handler result to ASCII");
7702 Py_DECREF(rep);
7703 goto error;
7704 }
7705 *out = (unsigned char)ch;
7706 out++;
7707 }
7708 }
7709 Py_DECREF(rep);
7710 }
7711 /* write a NUL byte */
7712 *out = 0;
7713 outsize = out - PyBytes_AS_STRING(*outbytes);
7714 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7715 if (_PyBytes_Resize(outbytes, outsize) < 0)
7716 goto error;
7717 ret = 0;
7718
7719 error:
7720 Py_XDECREF(encoding_obj);
7721 Py_XDECREF(errorHandler);
7722 Py_XDECREF(exc);
7723 return ret;
7724 }
7725
7726 static PyObject *
encode_code_page(int code_page,PyObject * unicode,const char * errors)7727 encode_code_page(int code_page,
7728 PyObject *unicode,
7729 const char *errors)
7730 {
7731 Py_ssize_t len;
7732 PyObject *outbytes = NULL;
7733 Py_ssize_t offset;
7734 int chunk_len, ret, done;
7735
7736 if (!PyUnicode_Check(unicode)) {
7737 PyErr_BadArgument();
7738 return NULL;
7739 }
7740
7741 if (PyUnicode_READY(unicode) == -1)
7742 return NULL;
7743 len = PyUnicode_GET_LENGTH(unicode);
7744
7745 if (code_page < 0) {
7746 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7747 return NULL;
7748 }
7749
7750 if (len == 0)
7751 return PyBytes_FromStringAndSize(NULL, 0);
7752
7753 offset = 0;
7754 do
7755 {
7756 #ifdef NEED_RETRY
7757 if (len > DECODING_CHUNK_SIZE) {
7758 chunk_len = DECODING_CHUNK_SIZE;
7759 done = 0;
7760 }
7761 else
7762 #endif
7763 {
7764 chunk_len = (int)len;
7765 done = 1;
7766 }
7767
7768 ret = encode_code_page_strict(code_page, &outbytes,
7769 unicode, offset, chunk_len,
7770 errors);
7771 if (ret == -2)
7772 ret = encode_code_page_errors(code_page, &outbytes,
7773 unicode, offset,
7774 chunk_len, errors);
7775 if (ret < 0) {
7776 Py_XDECREF(outbytes);
7777 return NULL;
7778 }
7779
7780 offset += chunk_len;
7781 len -= chunk_len;
7782 } while (!done);
7783
7784 return outbytes;
7785 }
7786
7787 PyObject *
PyUnicode_EncodeMBCS(const Py_UNICODE * p,Py_ssize_t size,const char * errors)7788 PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7789 Py_ssize_t size,
7790 const char *errors)
7791 {
7792 PyObject *unicode, *res;
7793 unicode = PyUnicode_FromWideChar(p, size);
7794 if (unicode == NULL)
7795 return NULL;
7796 res = encode_code_page(CP_ACP, unicode, errors);
7797 Py_DECREF(unicode);
7798 return res;
7799 }
7800
7801 PyObject *
PyUnicode_EncodeCodePage(int code_page,PyObject * unicode,const char * errors)7802 PyUnicode_EncodeCodePage(int code_page,
7803 PyObject *unicode,
7804 const char *errors)
7805 {
7806 return encode_code_page(code_page, unicode, errors);
7807 }
7808
7809 PyObject *
PyUnicode_AsMBCSString(PyObject * unicode)7810 PyUnicode_AsMBCSString(PyObject *unicode)
7811 {
7812 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7813 }
7814
7815 #undef NEED_RETRY
7816
7817 #endif /* MS_WINDOWS */
7818
7819 /* --- Character Mapping Codec -------------------------------------------- */
7820
/* Decode the `size` bytes starting at `s` into `writer`, using the str
   object `mapping` as a lookup table indexed by byte value.

   A byte whose value is not a valid index into the table, or whose table
   entry is 0xFFFE, is treated as an undefined mapping and is handed to the
   error handler selected by `errors`.

   Returns 0 on success and -1 on failure. */
static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    Py_ssize_t maplen;
    enum PyUnicode_Kind mapkind;
    void *mapdata;
    Py_UCS4 x;
    unsigned char ch;

    if (PyUnicode_READY(mapping) == -1)
        return -1;

    maplen = PyUnicode_GET_LENGTH(mapping);
    mapdata = PyUnicode_DATA(mapping);
    mapkind = PyUnicode_KIND(mapping);

    /* one past the last input byte */
    e = s + size;

    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
         * is disabled in encoding aliases, latin1 is preferred because
         * its implementation is faster. */
        Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
        Py_UCS4 maxchar = writer->maxchar;

        assert (writer->kind == PyUnicode_1BYTE_KIND);
        while (s < e) {
            ch = *s;
            x = mapdata_ucs1[ch];
            if (x > maxchar) {
                /* The character does not fit the writer's current maxchar:
                   request maxchar 0xff, then re-read the cached maxchar and
                   data pointer from the writer. */
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
                    goto onError;
                maxchar = writer->maxchar;
                outdata = (Py_UCS1 *)writer->data;
            }
            outdata[writer->pos] = x;
            writer->pos++;
            ++s;
        }
        /* errorHandler and exc are never assigned on this path, so there
           is nothing to release. */
        return 0;
    }

    while (s < e) {
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* Tight loops for a full 2-byte table.  Each one writes straight
               into the writer's buffer and jumps to the Error label below as
               soon as a character needs the generic handling. */
            enum PyUnicode_Kind outkind = writer->kind;
            Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
            if (outkind == PyUnicode_1BYTE_KIND) {
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
                Py_UCS4 maxchar = writer->maxchar;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* too wide for the current buffer (this also catches
                       0xFFFE): fall back to the generic code with x set */
                    if (x > maxchar)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                /* all input consumed: leave the outer loop */
                break;
            }
            else if (outkind == PyUnicode_2BYTE_KIND) {
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* undefined mapping: let the generic code report it */
                    if (x == 0xFFFE)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                /* all input consumed: leave the outer loop */
                break;
            }
        }
        /* Generic path: one bounds-checked table lookup per byte. */
        ch = *s;

        if (ch < maplen)
            x = PyUnicode_READ(mapkind, mapdata, ch);
        else
            x = 0xfffe; /* invalid value */
Error:
        /* Reached by falling through from the lookup above, or by goto from
           the 2-byte loops with x already holding the looked-up value. */
        if (x == 0xfffe)
        {
            /* undefined mapping */
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }
            /* s is passed by address and is not advanced here.
               NOTE(review): presumably the handler repositions s -- confirm. */
            continue;
        }

        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
            goto onError;
        ++s;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
7939
/* Decode the `size` bytes starting at `s` into `writer`, looking each byte
   value up in `mapping` with PyObject_GetItem().

   Accepted lookup results:
     - an int: the code point to write (must be in 0..MAX_UNICODE);
     - a str: written as-is (any length);
     - None, the value 0xFFFE, or a LookupError from the lookup: the byte is
       undefined and is handed to the error handler selected by `errors`.
   Any other result type raises TypeError.

   Returns 0 on success and -1 on failure. */
static int
charmap_decode_mapping(const char *s,
                       Py_ssize_t size,
                       PyObject *mapping,
                       const char *errors,
                       _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    unsigned char ch;
    PyObject *key, *item = NULL;

    /* one past the last input byte */
    e = s + size;

    while (s < e) {
        ch = *s;

        /* Get mapping (char ordinal -> integer, Unicode char or None) */
        key = PyLong_FromLong((long)ch);
        if (key == NULL)
            goto onError;

        item = PyObject_GetItem(mapping, key);
        Py_DECREF(key);
        if (item == NULL) {
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                /* No mapping found means: mapping is undefined. */
                PyErr_Clear();
                goto Undefined;
            } else
                goto onError;
        }

        /* Apply mapping */
        if (item == Py_None)
            goto Undefined;
        if (PyLong_Check(item)) {
            long value = PyLong_AS_LONG(item);
            /* 0xFFFE is the "undefined" marker, not a character to emit */
            if (value == 0xFFFE)
                goto Undefined;
            if (value < 0 || value > MAX_UNICODE) {
                PyErr_Format(PyExc_TypeError,
                             "character mapping must be in range(0x%lx)",
                             (unsigned long)MAX_UNICODE + 1);
                goto onError;
            }

            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
                goto onError;
        }
        else if (PyUnicode_Check(item)) {
            if (PyUnicode_READY(item) == -1)
                goto onError;
            if (PyUnicode_GET_LENGTH(item) == 1) {
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
                /* a single U+FFFE also means "undefined" */
                if (value == 0xFFFE)
                    goto Undefined;
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
                    goto onError;
            }
            else {
                /* The replacement is not exactly one character, so the
                   output length no longer matches the input length; turn
                   on the writer's overallocate flag before writing it. */
                writer->overallocate = 1;
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
                    goto onError;
            }
        }
        else {
            /* wrong return value */
            PyErr_SetString(PyExc_TypeError,
                            "character mapping must return integer, None or str");
            goto onError;
        }
        /* done with this byte: drop the lookup result and advance */
        Py_CLEAR(item);
        ++s;
        continue;

Undefined:
        /* undefined mapping */
        /* item may be NULL here (failed lookup) or hold a reference
           (None / 0xFFFE); Py_CLEAR handles both. */
        Py_CLEAR(item);
        startinpos = s-starts;
        endinpos = startinpos+1;
        /* s is passed by address and is not advanced here.
           NOTE(review): presumably the handler repositions s -- confirm. */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "charmap", "character maps to <undefined>",
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                writer)) {
            goto onError;
        }
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(item);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
8041
8042 PyObject *
PyUnicode_DecodeCharmap(const char * s,Py_ssize_t size,PyObject * mapping,const char * errors)8043 PyUnicode_DecodeCharmap(const char *s,
8044 Py_ssize_t size,
8045 PyObject *mapping,
8046 const char *errors)
8047 {
8048 _PyUnicodeWriter writer;
8049
8050 /* Default to Latin-1 */
8051 if (mapping == NULL)
8052 return PyUnicode_DecodeLatin1(s, size, errors);
8053
8054 if (size == 0)
8055 _Py_RETURN_UNICODE_EMPTY();
8056 _PyUnicodeWriter_Init(&writer);
8057 writer.min_length = size;
8058 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8059 goto onError;
8060
8061 if (PyUnicode_CheckExact(mapping)) {
8062 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8063 goto onError;
8064 }
8065 else {
8066 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8067 goto onError;
8068 }
8069 return _PyUnicodeWriter_Finish(&writer);
8070
8071 onError:
8072 _PyUnicodeWriter_Dealloc(&writer);
8073 return NULL;
8074 }
8075
8076 /* Charmap encoding: the lookup table */
8077
/* Three-level lookup trie used by the charmap encoder.

   The object is allocated with extra trailing space: level23 is declared
   with one element but actually holds 16*count2 + 128*count3 bytes. */
struct encoding_map {
    PyObject_HEAD
    /* indexed by (code point >> 11); 0xFF means "no level-2 block" */
    unsigned char level1[32];
    /* number of 16-entry level-2 blocks and 128-entry level-3 blocks */
    int count2, count3;
    /* level-2 blocks (16*count2 bytes, 0xFF = unused) immediately followed
       by level-3 blocks (128*count3 bytes, 0 = unmapped) */
    unsigned char level23[1];
};
8084
8085 static PyObject*
encoding_map_size(PyObject * obj,PyObject * args)8086 encoding_map_size(PyObject *obj, PyObject* args)
8087 {
8088 struct encoding_map *map = (struct encoding_map*)obj;
8089 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
8090 128*map->count3);
8091 }
8092
8093 static PyMethodDef encoding_map_methods[] = {
8094 {"size", encoding_map_size, METH_NOARGS,
8095 PyDoc_STR("Return the size (in bytes) of this object") },
8096 { 0 }
8097 };
8098
8099 static void
encoding_map_dealloc(PyObject * o)8100 encoding_map_dealloc(PyObject* o)
8101 {
8102 PyObject_FREE(o);
8103 }
8104
/* Type object for the EncodingMap trie built by
   PyUnicode_BuildEncodingMap().  Only the name, basic size, deallocator,
   flags and method table are filled in; every other slot is 0.
   NOTE(review): the type pointer is NULL in the static initializer;
   presumably it is completed by PyType_Ready() elsewhere -- confirm. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
8148
8149 PyObject*
PyUnicode_BuildEncodingMap(PyObject * string)8150 PyUnicode_BuildEncodingMap(PyObject* string)
8151 {
8152 PyObject *result;
8153 struct encoding_map *mresult;
8154 int i;
8155 int need_dict = 0;
8156 unsigned char level1[32];
8157 unsigned char level2[512];
8158 unsigned char *mlevel1, *mlevel2, *mlevel3;
8159 int count2 = 0, count3 = 0;
8160 int kind;
8161 void *data;
8162 Py_ssize_t length;
8163 Py_UCS4 ch;
8164
8165 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8166 PyErr_BadArgument();
8167 return NULL;
8168 }
8169 kind = PyUnicode_KIND(string);
8170 data = PyUnicode_DATA(string);
8171 length = PyUnicode_GET_LENGTH(string);
8172 length = Py_MIN(length, 256);
8173 memset(level1, 0xFF, sizeof level1);
8174 memset(level2, 0xFF, sizeof level2);
8175
8176 /* If there isn't a one-to-one mapping of NULL to \0,
8177 or if there are non-BMP characters, we need to use
8178 a mapping dictionary. */
8179 if (PyUnicode_READ(kind, data, 0) != 0)
8180 need_dict = 1;
8181 for (i = 1; i < length; i++) {
8182 int l1, l2;
8183 ch = PyUnicode_READ(kind, data, i);
8184 if (ch == 0 || ch > 0xFFFF) {
8185 need_dict = 1;
8186 break;
8187 }
8188 if (ch == 0xFFFE)
8189 /* unmapped character */
8190 continue;
8191 l1 = ch >> 11;
8192 l2 = ch >> 7;
8193 if (level1[l1] == 0xFF)
8194 level1[l1] = count2++;
8195 if (level2[l2] == 0xFF)
8196 level2[l2] = count3++;
8197 }
8198
8199 if (count2 >= 0xFF || count3 >= 0xFF)
8200 need_dict = 1;
8201
8202 if (need_dict) {
8203 PyObject *result = PyDict_New();
8204 PyObject *key, *value;
8205 if (!result)
8206 return NULL;
8207 for (i = 0; i < length; i++) {
8208 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
8209 value = PyLong_FromLong(i);
8210 if (!key || !value)
8211 goto failed1;
8212 if (PyDict_SetItem(result, key, value) == -1)
8213 goto failed1;
8214 Py_DECREF(key);
8215 Py_DECREF(value);
8216 }
8217 return result;
8218 failed1:
8219 Py_XDECREF(key);
8220 Py_XDECREF(value);
8221 Py_DECREF(result);
8222 return NULL;
8223 }
8224
8225 /* Create a three-level trie */
8226 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8227 16*count2 + 128*count3 - 1);
8228 if (!result)
8229 return PyErr_NoMemory();
8230 PyObject_Init(result, &EncodingMapType);
8231 mresult = (struct encoding_map*)result;
8232 mresult->count2 = count2;
8233 mresult->count3 = count3;
8234 mlevel1 = mresult->level1;
8235 mlevel2 = mresult->level23;
8236 mlevel3 = mresult->level23 + 16*count2;
8237 memcpy(mlevel1, level1, 32);
8238 memset(mlevel2, 0xFF, 16*count2);
8239 memset(mlevel3, 0, 128*count3);
8240 count3 = 0;
8241 for (i = 1; i < length; i++) {
8242 int o1, o2, o3, i2, i3;
8243 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8244 if (ch == 0xFFFE)
8245 /* unmapped character */
8246 continue;
8247 o1 = ch>>11;
8248 o2 = (ch>>7) & 0xF;
8249 i2 = 16*mlevel1[o1] + o2;
8250 if (mlevel2[i2] == 0xFF)
8251 mlevel2[i2] = count3++;
8252 o3 = ch & 0x7F;
8253 i3 = 128*mlevel2[i2] + o3;
8254 mlevel3[i3] = i;
8255 }
8256 return result;
8257 }
8258
8259 static int
encoding_map_lookup(Py_UCS4 c,PyObject * mapping)8260 encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8261 {
8262 struct encoding_map *map = (struct encoding_map*)mapping;
8263 int l1 = c>>11;
8264 int l2 = (c>>7) & 0xF;
8265 int l3 = c & 0x7F;
8266 int i;
8267
8268 if (c > 0xFFFF)
8269 return -1;
8270 if (c == 0)
8271 return 0;
8272 /* level 1*/
8273 i = map->level1[l1];
8274 if (i == 0xFF) {
8275 return -1;
8276 }
8277 /* level 2*/
8278 i = map->level23[16*i+l2];
8279 if (i == 0xFF) {
8280 return -1;
8281 }
8282 /* level 3 */
8283 i = map->level23[16*map->count2 + 128*i + l3];
8284 if (i == 0) {
8285 return -1;
8286 }
8287 return i;
8288 }
8289
8290 /* Lookup the character ch in the mapping. If the character
8291 can't be found, Py_None is returned (or NULL, if another
8292 error occurred). */
8293 static PyObject *
charmapencode_lookup(Py_UCS4 c,PyObject * mapping)8294 charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8295 {
8296 PyObject *w = PyLong_FromLong((long)c);
8297 PyObject *x;
8298
8299 if (w == NULL)
8300 return NULL;
8301 x = PyObject_GetItem(mapping, w);
8302 Py_DECREF(w);
8303 if (x == NULL) {
8304 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8305 /* No mapping found means: mapping is undefined. */
8306 PyErr_Clear();
8307 Py_RETURN_NONE;
8308 } else
8309 return NULL;
8310 }
8311 else if (x == Py_None)
8312 return x;
8313 else if (PyLong_Check(x)) {
8314 long value = PyLong_AS_LONG(x);
8315 if (value < 0 || value > 255) {
8316 PyErr_SetString(PyExc_TypeError,
8317 "character mapping must be in range(256)");
8318 Py_DECREF(x);
8319 return NULL;
8320 }
8321 return x;
8322 }
8323 else if (PyBytes_Check(x))
8324 return x;
8325 else {
8326 /* wrong return value */
8327 PyErr_Format(PyExc_TypeError,
8328 "character mapping must return integer, bytes or None, not %.400s",
8329 x->ob_type->tp_name);
8330 Py_DECREF(x);
8331 return NULL;
8332 }
8333 }
8334
8335 static int
charmapencode_resize(PyObject ** outobj,Py_ssize_t * outpos,Py_ssize_t requiredsize)8336 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8337 {
8338 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8339 /* exponentially overallocate to minimize reallocations */
8340 if (requiredsize < 2*outsize)
8341 requiredsize = 2*outsize;
8342 if (_PyBytes_Resize(outobj, requiredsize))
8343 return -1;
8344 return 0;
8345 }
8346
/* Outcome of charmapencode_output():
     enc_SUCCESS   - the character was encoded and written to the output;
     enc_FAILED    - the mapping is undefined for the character, nothing
                     was written;
     enc_EXCEPTION - the lookup or the output resize failed. */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
} charmapencode_result;
8350 /* lookup the character, put the result in the output string and adjust
8351 various state variables. Resize the output bytes object if not enough
8352 space is available. Return a new reference to the object that
8353 was put in the output buffer, or Py_None, if the mapping was undefined
8354 (in which case no character was written) or NULL, if a
8355 reallocation error occurred. The caller must decref the result */
8356 static charmapencode_result
charmapencode_output(Py_UCS4 c,PyObject * mapping,PyObject ** outobj,Py_ssize_t * outpos)8357 charmapencode_output(Py_UCS4 c, PyObject *mapping,
8358 PyObject **outobj, Py_ssize_t *outpos)
8359 {
8360 PyObject *rep;
8361 char *outstart;
8362 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8363
8364 if (Py_TYPE(mapping) == &EncodingMapType) {
8365 int res = encoding_map_lookup(c, mapping);
8366 Py_ssize_t requiredsize = *outpos+1;
8367 if (res == -1)
8368 return enc_FAILED;
8369 if (outsize<requiredsize)
8370 if (charmapencode_resize(outobj, outpos, requiredsize))
8371 return enc_EXCEPTION;
8372 outstart = PyBytes_AS_STRING(*outobj);
8373 outstart[(*outpos)++] = (char)res;
8374 return enc_SUCCESS;
8375 }
8376
8377 rep = charmapencode_lookup(c, mapping);
8378 if (rep==NULL)
8379 return enc_EXCEPTION;
8380 else if (rep==Py_None) {
8381 Py_DECREF(rep);
8382 return enc_FAILED;
8383 } else {
8384 if (PyLong_Check(rep)) {
8385 Py_ssize_t requiredsize = *outpos+1;
8386 if (outsize<requiredsize)
8387 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8388 Py_DECREF(rep);
8389 return enc_EXCEPTION;
8390 }
8391 outstart = PyBytes_AS_STRING(*outobj);
8392 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8393 }
8394 else {
8395 const char *repchars = PyBytes_AS_STRING(rep);
8396 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8397 Py_ssize_t requiredsize = *outpos+repsize;
8398 if (outsize<requiredsize)
8399 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8400 Py_DECREF(rep);
8401 return enc_EXCEPTION;
8402 }
8403 outstart = PyBytes_AS_STRING(*outobj);
8404 memcpy(outstart + *outpos, repchars, repsize);
8405 *outpos += repsize;
8406 }
8407 }
8408 Py_DECREF(rep);
8409 return enc_SUCCESS;
8410 }
8411
/* Handle an unencodable character met by the charmap encoder at *inpos.

   The error range [collstartpos, collendpos) is first extended over the
   directly following characters that `mapping` cannot encode either.  The
   handler named by `errors` is then applied; its identity is cached in
   *error_handler on first use.  Replacement output is appended to the bytes
   object *res at offset *respos and *inpos is moved past the handled range.

   Return 0 on success, -1 on error */
static int
charmap_encoding_error(
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
    PyObject **exceptionObject,
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
    PyObject **res, Py_ssize_t *respos)
{
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    Py_ssize_t size, repsize;
    Py_ssize_t newpos;
    enum PyUnicode_Kind kind;
    void *data;
    Py_ssize_t index;
    /* start and end of the run of unencodable chars being collected */
    Py_ssize_t collstartpos = *inpos;
    Py_ssize_t collendpos = *inpos+1;
    Py_ssize_t collpos;
    const char *encoding = "charmap";
    const char *reason = "character maps to <undefined>";
    charmapencode_result x;
    Py_UCS4 ch;
    int val;

    if (PyUnicode_READY(unicode) == -1)
        return -1;
    size = PyUnicode_GET_LENGTH(unicode);
    /* find all unencodable characters */
    while (collendpos < size) {
        PyObject *rep;
        if (Py_TYPE(mapping) == &EncodingMapType) {
            /* trie lookup: -1 means "unencodable", keep collecting */
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
            val = encoding_map_lookup(ch, mapping);
            if (val != -1)
                break;
            ++collendpos;
            continue;
        }

        /* generic mapping: Py_None means "unencodable", keep collecting */
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
        rep = charmapencode_lookup(ch, mapping);
        if (rep==NULL)
            return -1;
        else if (rep!=Py_None) {
            Py_DECREF(rep);
            break;
        }
        Py_DECREF(rep);
        ++collendpos;
    }
    /* cache callback name lookup
     * (if not done yet, i.e. it's the first error) */
    if (*error_handler == _Py_ERROR_UNKNOWN)
        *error_handler = get_error_handler(errors);

    switch (*error_handler) {
    case _Py_ERROR_STRICT:
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
        return -1;

    case _Py_ERROR_REPLACE:
        /* emit one '?' per collected character; '?' itself has to be
           encodable through the mapping */
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
            x = charmapencode_output('?', mapping, res, respos);
            if (x==enc_EXCEPTION) {
                return -1;
            }
            else if (x==enc_FAILED) {
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        /* fall through */
    case _Py_ERROR_IGNORE:
        /* skip the whole collected range */
        *inpos = collendpos;
        break;

    case _Py_ERROR_XMLCHARREFREPLACE:
        /* generate replacement: "&#<decimal code point>;" for each
           collected character, encoded through the mapping */
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
            /* sized as "&#" + up to 29 digits + ";" + terminating NUL */
            char buffer[2+29+1+1];
            char *cp;
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
            for (cp = buffer; *cp; ++cp) {
                x = charmapencode_output(*cp, mapping, res, respos);
                if (x==enc_EXCEPTION)
                    return -1;
                else if (x==enc_FAILED) {
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                    return -1;
                }
            }
        }
        *inpos = collendpos;
        break;

    default:
        /* every other handler goes through the registered callback */
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
                                                      encoding, reason, unicode, exceptionObject,
                                                      collstartpos, collendpos, &newpos);
        if (repunicode == NULL)
            return -1;
        if (PyBytes_Check(repunicode)) {
            /* Directly copy bytes result to output. */
            Py_ssize_t outsize = PyBytes_Size(*res);
            Py_ssize_t requiredsize;
            repsize = PyBytes_Size(repunicode);
            requiredsize = *respos + repsize;
            if (requiredsize > outsize)
                /* Make room for all additional bytes. */
                if (charmapencode_resize(res, respos, requiredsize)) {
                    Py_DECREF(repunicode);
                    return -1;
                }
            memcpy(PyBytes_AsString(*res) + *respos,
                   PyBytes_AsString(repunicode),  repsize);
            *respos += repsize;
            /* continue at the position chosen by the callback */
            *inpos = newpos;
            Py_DECREF(repunicode);
            break;
        }
        /* generate replacement */
        /* str result: each replacement character is itself encoded through
           the mapping; an unencodable one raises the original error */
        if (PyUnicode_READY(repunicode) == -1) {
            Py_DECREF(repunicode);
            return -1;
        }
        repsize = PyUnicode_GET_LENGTH(repunicode);
        data = PyUnicode_DATA(repunicode);
        kind = PyUnicode_KIND(repunicode);
        for (index = 0; index < repsize; index++) {
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
            x = charmapencode_output(repch, mapping, res, respos);
            if (x==enc_EXCEPTION) {
                Py_DECREF(repunicode);
                return -1;
            }
            else if (x==enc_FAILED) {
                Py_DECREF(repunicode);
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        /* continue at the position chosen by the callback */
        *inpos = newpos;
        Py_DECREF(repunicode);
    }
    return 0;
}
8559
8560 PyObject *
_PyUnicode_EncodeCharmap(PyObject * unicode,PyObject * mapping,const char * errors)8561 _PyUnicode_EncodeCharmap(PyObject *unicode,
8562 PyObject *mapping,
8563 const char *errors)
8564 {
8565 /* output object */
8566 PyObject *res = NULL;
8567 /* current input position */
8568 Py_ssize_t inpos = 0;
8569 Py_ssize_t size;
8570 /* current output position */
8571 Py_ssize_t respos = 0;
8572 PyObject *error_handler_obj = NULL;
8573 PyObject *exc = NULL;
8574 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8575 void *data;
8576 int kind;
8577
8578 if (PyUnicode_READY(unicode) == -1)
8579 return NULL;
8580 size = PyUnicode_GET_LENGTH(unicode);
8581 data = PyUnicode_DATA(unicode);
8582 kind = PyUnicode_KIND(unicode);
8583
8584 /* Default to Latin-1 */
8585 if (mapping == NULL)
8586 return unicode_encode_ucs1(unicode, errors, 256);
8587
8588 /* allocate enough for a simple encoding without
8589 replacements, if we need more, we'll resize */
8590 res = PyBytes_FromStringAndSize(NULL, size);
8591 if (res == NULL)
8592 goto onError;
8593 if (size == 0)
8594 return res;
8595
8596 while (inpos<size) {
8597 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8598 /* try to encode it */
8599 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8600 if (x==enc_EXCEPTION) /* error */
8601 goto onError;
8602 if (x==enc_FAILED) { /* unencodable character */
8603 if (charmap_encoding_error(unicode, &inpos, mapping,
8604 &exc,
8605 &error_handler, &error_handler_obj, errors,
8606 &res, &respos)) {
8607 goto onError;
8608 }
8609 }
8610 else
8611 /* done with this character => adjust input position */
8612 ++inpos;
8613 }
8614
8615 /* Resize if we allocated to much */
8616 if (respos<PyBytes_GET_SIZE(res))
8617 if (_PyBytes_Resize(&res, respos) < 0)
8618 goto onError;
8619
8620 Py_XDECREF(exc);
8621 Py_XDECREF(error_handler_obj);
8622 return res;
8623
8624 onError:
8625 Py_XDECREF(res);
8626 Py_XDECREF(exc);
8627 Py_XDECREF(error_handler_obj);
8628 return NULL;
8629 }
8630
8631 /* Deprecated */
8632 PyObject *
PyUnicode_EncodeCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)8633 PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8634 Py_ssize_t size,
8635 PyObject *mapping,
8636 const char *errors)
8637 {
8638 PyObject *result;
8639 PyObject *unicode = PyUnicode_FromWideChar(p, size);
8640 if (unicode == NULL)
8641 return NULL;
8642 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8643 Py_DECREF(unicode);
8644 return result;
8645 }
8646
8647 PyObject *
PyUnicode_AsCharmapString(PyObject * unicode,PyObject * mapping)8648 PyUnicode_AsCharmapString(PyObject *unicode,
8649 PyObject *mapping)
8650 {
8651 if (!PyUnicode_Check(unicode) || mapping == NULL) {
8652 PyErr_BadArgument();
8653 return NULL;
8654 }
8655 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8656 }
8657
8658 /* create or adjust a UnicodeTranslateError */
8659 static void
make_translate_exception(PyObject ** exceptionObject,PyObject * unicode,Py_ssize_t startpos,Py_ssize_t endpos,const char * reason)8660 make_translate_exception(PyObject **exceptionObject,
8661 PyObject *unicode,
8662 Py_ssize_t startpos, Py_ssize_t endpos,
8663 const char *reason)
8664 {
8665 if (*exceptionObject == NULL) {
8666 *exceptionObject = _PyUnicodeTranslateError_Create(
8667 unicode, startpos, endpos, reason);
8668 }
8669 else {
8670 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8671 goto onError;
8672 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8673 goto onError;
8674 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8675 goto onError;
8676 return;
8677 onError:
8678 Py_CLEAR(*exceptionObject);
8679 }
8680 }
8681
8682 /* error handling callback helper:
8683 build arguments, call the callback and check the arguments,
8684 put the result into newpos and return the replacement string, which
8685 has to be freed by the caller */
8686 static PyObject *
unicode_translate_call_errorhandler(const char * errors,PyObject ** errorHandler,const char * reason,PyObject * unicode,PyObject ** exceptionObject,Py_ssize_t startpos,Py_ssize_t endpos,Py_ssize_t * newpos)8687 unicode_translate_call_errorhandler(const char *errors,
8688 PyObject **errorHandler,
8689 const char *reason,
8690 PyObject *unicode, PyObject **exceptionObject,
8691 Py_ssize_t startpos, Py_ssize_t endpos,
8692 Py_ssize_t *newpos)
8693 {
8694 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
8695
8696 Py_ssize_t i_newpos;
8697 PyObject *restuple;
8698 PyObject *resunicode;
8699
8700 if (*errorHandler == NULL) {
8701 *errorHandler = PyCodec_LookupError(errors);
8702 if (*errorHandler == NULL)
8703 return NULL;
8704 }
8705
8706 make_translate_exception(exceptionObject,
8707 unicode, startpos, endpos, reason);
8708 if (*exceptionObject == NULL)
8709 return NULL;
8710
8711 restuple = PyObject_CallFunctionObjArgs(
8712 *errorHandler, *exceptionObject, NULL);
8713 if (restuple == NULL)
8714 return NULL;
8715 if (!PyTuple_Check(restuple)) {
8716 PyErr_SetString(PyExc_TypeError, &argparse[3]);
8717 Py_DECREF(restuple);
8718 return NULL;
8719 }
8720 if (!PyArg_ParseTuple(restuple, argparse,
8721 &resunicode, &i_newpos)) {
8722 Py_DECREF(restuple);
8723 return NULL;
8724 }
8725 if (i_newpos<0)
8726 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8727 else
8728 *newpos = i_newpos;
8729 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8730 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8731 Py_DECREF(restuple);
8732 return NULL;
8733 }
8734 Py_INCREF(resunicode);
8735 Py_DECREF(restuple);
8736 return resunicode;
8737 }
8738
8739 /* Lookup the character ch in the mapping and put the result in result,
8740 which must be decrefed by the caller.
8741 Return 0 on success, -1 on error */
8742 static int
charmaptranslate_lookup(Py_UCS4 c,PyObject * mapping,PyObject ** result)8743 charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8744 {
8745 PyObject *w = PyLong_FromLong((long)c);
8746 PyObject *x;
8747
8748 if (w == NULL)
8749 return -1;
8750 x = PyObject_GetItem(mapping, w);
8751 Py_DECREF(w);
8752 if (x == NULL) {
8753 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8754 /* No mapping found means: use 1:1 mapping. */
8755 PyErr_Clear();
8756 *result = NULL;
8757 return 0;
8758 } else
8759 return -1;
8760 }
8761 else if (x == Py_None) {
8762 *result = x;
8763 return 0;
8764 }
8765 else if (PyLong_Check(x)) {
8766 long value = PyLong_AS_LONG(x);
8767 if (value < 0 || value > MAX_UNICODE) {
8768 PyErr_Format(PyExc_ValueError,
8769 "character mapping must be in range(0x%x)",
8770 MAX_UNICODE+1);
8771 Py_DECREF(x);
8772 return -1;
8773 }
8774 *result = x;
8775 return 0;
8776 }
8777 else if (PyUnicode_Check(x)) {
8778 *result = x;
8779 return 0;
8780 }
8781 else {
8782 /* wrong return value */
8783 PyErr_SetString(PyExc_TypeError,
8784 "character mapping must return integer, None or str");
8785 Py_DECREF(x);
8786 return -1;
8787 }
8788 }
8789
8790 /* lookup the character, write the result into the writer.
8791 Return 1 if the result was written into the writer, return 0 if the mapping
8792 was undefined, raise an exception return -1 on error. */
8793 static int
charmaptranslate_output(Py_UCS4 ch,PyObject * mapping,_PyUnicodeWriter * writer)8794 charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8795 _PyUnicodeWriter *writer)
8796 {
8797 PyObject *item;
8798
8799 if (charmaptranslate_lookup(ch, mapping, &item))
8800 return -1;
8801
8802 if (item == NULL) {
8803 /* not found => default to 1:1 mapping */
8804 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8805 return -1;
8806 }
8807 return 1;
8808 }
8809
8810 if (item == Py_None) {
8811 Py_DECREF(item);
8812 return 0;
8813 }
8814
8815 if (PyLong_Check(item)) {
8816 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8817 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8818 used it */
8819 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8820 Py_DECREF(item);
8821 return -1;
8822 }
8823 Py_DECREF(item);
8824 return 1;
8825 }
8826
8827 if (!PyUnicode_Check(item)) {
8828 Py_DECREF(item);
8829 return -1;
8830 }
8831
8832 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8833 Py_DECREF(item);
8834 return -1;
8835 }
8836
8837 Py_DECREF(item);
8838 return 1;
8839 }
8840
8841 static int
unicode_fast_translate_lookup(PyObject * mapping,Py_UCS1 ch,Py_UCS1 * translate)8842 unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8843 Py_UCS1 *translate)
8844 {
8845 PyObject *item = NULL;
8846 int ret = 0;
8847
8848 if (charmaptranslate_lookup(ch, mapping, &item)) {
8849 return -1;
8850 }
8851
8852 if (item == Py_None) {
8853 /* deletion */
8854 translate[ch] = 0xfe;
8855 }
8856 else if (item == NULL) {
8857 /* not found => default to 1:1 mapping */
8858 translate[ch] = ch;
8859 return 1;
8860 }
8861 else if (PyLong_Check(item)) {
8862 long replace = PyLong_AS_LONG(item);
8863 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8864 used it */
8865 if (127 < replace) {
8866 /* invalid character or character outside ASCII:
8867 skip the fast translate */
8868 goto exit;
8869 }
8870 translate[ch] = (Py_UCS1)replace;
8871 }
8872 else if (PyUnicode_Check(item)) {
8873 Py_UCS4 replace;
8874
8875 if (PyUnicode_READY(item) == -1) {
8876 Py_DECREF(item);
8877 return -1;
8878 }
8879 if (PyUnicode_GET_LENGTH(item) != 1)
8880 goto exit;
8881
8882 replace = PyUnicode_READ_CHAR(item, 0);
8883 if (replace > 127)
8884 goto exit;
8885 translate[ch] = (Py_UCS1)replace;
8886 }
8887 else {
8888 /* not None, NULL, long or unicode */
8889 goto exit;
8890 }
8891 ret = 1;
8892
8893 exit:
8894 Py_DECREF(item);
8895 return ret;
8896 }
8897
8898 /* Fast path for ascii => ascii translation. Return 1 if the whole string
8899 was translated into writer, return 0 if the input string was partially
8900 translated into writer, raise an exception and return -1 on error. */
8901 static int
unicode_fast_translate(PyObject * input,PyObject * mapping,_PyUnicodeWriter * writer,int ignore,Py_ssize_t * input_pos)8902 unicode_fast_translate(PyObject *input, PyObject *mapping,
8903 _PyUnicodeWriter *writer, int ignore,
8904 Py_ssize_t *input_pos)
8905 {
8906 Py_UCS1 ascii_table[128], ch, ch2;
8907 Py_ssize_t len;
8908 Py_UCS1 *in, *end, *out;
8909 int res = 0;
8910
8911 len = PyUnicode_GET_LENGTH(input);
8912
8913 memset(ascii_table, 0xff, 128);
8914
8915 in = PyUnicode_1BYTE_DATA(input);
8916 end = in + len;
8917
8918 assert(PyUnicode_IS_ASCII(writer->buffer));
8919 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8920 out = PyUnicode_1BYTE_DATA(writer->buffer);
8921
8922 for (; in < end; in++) {
8923 ch = *in;
8924 ch2 = ascii_table[ch];
8925 if (ch2 == 0xff) {
8926 int translate = unicode_fast_translate_lookup(mapping, ch,
8927 ascii_table);
8928 if (translate < 0)
8929 return -1;
8930 if (translate == 0)
8931 goto exit;
8932 ch2 = ascii_table[ch];
8933 }
8934 if (ch2 == 0xfe) {
8935 if (ignore)
8936 continue;
8937 goto exit;
8938 }
8939 assert(ch2 < 128);
8940 *out = ch2;
8941 out++;
8942 }
8943 res = 1;
8944
8945 exit:
8946 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8947 *input_pos = in - PyUnicode_1BYTE_DATA(input);
8948 return res;
8949 }
8950
8951 static PyObject *
_PyUnicode_TranslateCharmap(PyObject * input,PyObject * mapping,const char * errors)8952 _PyUnicode_TranslateCharmap(PyObject *input,
8953 PyObject *mapping,
8954 const char *errors)
8955 {
8956 /* input object */
8957 char *data;
8958 Py_ssize_t size, i;
8959 int kind;
8960 /* output buffer */
8961 _PyUnicodeWriter writer;
8962 /* error handler */
8963 const char *reason = "character maps to <undefined>";
8964 PyObject *errorHandler = NULL;
8965 PyObject *exc = NULL;
8966 int ignore;
8967 int res;
8968
8969 if (mapping == NULL) {
8970 PyErr_BadArgument();
8971 return NULL;
8972 }
8973
8974 if (PyUnicode_READY(input) == -1)
8975 return NULL;
8976 data = (char*)PyUnicode_DATA(input);
8977 kind = PyUnicode_KIND(input);
8978 size = PyUnicode_GET_LENGTH(input);
8979
8980 if (size == 0)
8981 return PyUnicode_FromObject(input);
8982
8983 /* allocate enough for a simple 1:1 translation without
8984 replacements, if we need more, we'll resize */
8985 _PyUnicodeWriter_Init(&writer);
8986 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
8987 goto onError;
8988
8989 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8990
8991 if (PyUnicode_READY(input) == -1)
8992 return NULL;
8993 if (PyUnicode_IS_ASCII(input)) {
8994 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8995 if (res < 0) {
8996 _PyUnicodeWriter_Dealloc(&writer);
8997 return NULL;
8998 }
8999 if (res == 1)
9000 return _PyUnicodeWriter_Finish(&writer);
9001 }
9002 else {
9003 i = 0;
9004 }
9005
9006 while (i<size) {
9007 /* try to encode it */
9008 int translate;
9009 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9010 Py_ssize_t newpos;
9011 /* startpos for collecting untranslatable chars */
9012 Py_ssize_t collstart;
9013 Py_ssize_t collend;
9014 Py_UCS4 ch;
9015
9016 ch = PyUnicode_READ(kind, data, i);
9017 translate = charmaptranslate_output(ch, mapping, &writer);
9018 if (translate < 0)
9019 goto onError;
9020
9021 if (translate != 0) {
9022 /* it worked => adjust input pointer */
9023 ++i;
9024 continue;
9025 }
9026
9027 /* untranslatable character */
9028 collstart = i;
9029 collend = i+1;
9030
9031 /* find all untranslatable characters */
9032 while (collend < size) {
9033 PyObject *x;
9034 ch = PyUnicode_READ(kind, data, collend);
9035 if (charmaptranslate_lookup(ch, mapping, &x))
9036 goto onError;
9037 Py_XDECREF(x);
9038 if (x != Py_None)
9039 break;
9040 ++collend;
9041 }
9042
9043 if (ignore) {
9044 i = collend;
9045 }
9046 else {
9047 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9048 reason, input, &exc,
9049 collstart, collend, &newpos);
9050 if (repunicode == NULL)
9051 goto onError;
9052 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9053 Py_DECREF(repunicode);
9054 goto onError;
9055 }
9056 Py_DECREF(repunicode);
9057 i = newpos;
9058 }
9059 }
9060 Py_XDECREF(exc);
9061 Py_XDECREF(errorHandler);
9062 return _PyUnicodeWriter_Finish(&writer);
9063
9064 onError:
9065 _PyUnicodeWriter_Dealloc(&writer);
9066 Py_XDECREF(exc);
9067 Py_XDECREF(errorHandler);
9068 return NULL;
9069 }
9070
9071 /* Deprecated. Use PyUnicode_Translate instead. */
9072 PyObject *
PyUnicode_TranslateCharmap(const Py_UNICODE * p,Py_ssize_t size,PyObject * mapping,const char * errors)9073 PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9074 Py_ssize_t size,
9075 PyObject *mapping,
9076 const char *errors)
9077 {
9078 PyObject *result;
9079 PyObject *unicode = PyUnicode_FromWideChar(p, size);
9080 if (!unicode)
9081 return NULL;
9082 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9083 Py_DECREF(unicode);
9084 return result;
9085 }
9086
9087 PyObject *
PyUnicode_Translate(PyObject * str,PyObject * mapping,const char * errors)9088 PyUnicode_Translate(PyObject *str,
9089 PyObject *mapping,
9090 const char *errors)
9091 {
9092 if (ensure_unicode(str) < 0)
9093 return NULL;
9094 return _PyUnicode_TranslateCharmap(str, mapping, errors);
9095 }
9096
9097 PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject * unicode)9098 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9099 {
9100 if (!PyUnicode_Check(unicode)) {
9101 PyErr_BadInternalCall();
9102 return NULL;
9103 }
9104 if (PyUnicode_READY(unicode) == -1)
9105 return NULL;
9106 if (PyUnicode_IS_ASCII(unicode)) {
9107 /* If the string is already ASCII, just return the same string */
9108 Py_INCREF(unicode);
9109 return unicode;
9110 }
9111
9112 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9113 PyObject *result = PyUnicode_New(len, 127);
9114 if (result == NULL) {
9115 return NULL;
9116 }
9117
9118 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9119 int kind = PyUnicode_KIND(unicode);
9120 const void *data = PyUnicode_DATA(unicode);
9121 Py_ssize_t i;
9122 for (i = 0; i < len; ++i) {
9123 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9124 if (ch < 127) {
9125 out[i] = ch;
9126 }
9127 else if (Py_UNICODE_ISSPACE(ch)) {
9128 out[i] = ' ';
9129 }
9130 else {
9131 int decimal = Py_UNICODE_TODECIMAL(ch);
9132 if (decimal < 0) {
9133 out[i] = '?';
9134 out[i+1] = '\0';
9135 _PyUnicode_LENGTH(result) = i + 1;
9136 break;
9137 }
9138 out[i] = '0' + decimal;
9139 }
9140 }
9141
9142 assert(_PyUnicode_CheckConsistency(result, 1));
9143 return result;
9144 }
9145
9146 PyObject *
PyUnicode_TransformDecimalToASCII(Py_UNICODE * s,Py_ssize_t length)9147 PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9148 Py_ssize_t length)
9149 {
9150 PyObject *decimal;
9151 Py_ssize_t i;
9152 Py_UCS4 maxchar;
9153 enum PyUnicode_Kind kind;
9154 void *data;
9155
9156 maxchar = 127;
9157 for (i = 0; i < length; i++) {
9158 Py_UCS4 ch = s[i];
9159 if (ch > 127) {
9160 int decimal = Py_UNICODE_TODECIMAL(ch);
9161 if (decimal >= 0)
9162 ch = '0' + decimal;
9163 maxchar = Py_MAX(maxchar, ch);
9164 }
9165 }
9166
9167 /* Copy to a new string */
9168 decimal = PyUnicode_New(length, maxchar);
9169 if (decimal == NULL)
9170 return decimal;
9171 kind = PyUnicode_KIND(decimal);
9172 data = PyUnicode_DATA(decimal);
9173 /* Iterate over code points */
9174 for (i = 0; i < length; i++) {
9175 Py_UCS4 ch = s[i];
9176 if (ch > 127) {
9177 int decimal = Py_UNICODE_TODECIMAL(ch);
9178 if (decimal >= 0)
9179 ch = '0' + decimal;
9180 }
9181 PyUnicode_WRITE(kind, data, i, ch);
9182 }
9183 return unicode_result(decimal);
9184 }
9185 /* --- Decimal Encoder ---------------------------------------------------- */
9186
9187 int
PyUnicode_EncodeDecimal(Py_UNICODE * s,Py_ssize_t length,char * output,const char * errors)9188 PyUnicode_EncodeDecimal(Py_UNICODE *s,
9189 Py_ssize_t length,
9190 char *output,
9191 const char *errors)
9192 {
9193 PyObject *unicode;
9194 Py_ssize_t i;
9195 enum PyUnicode_Kind kind;
9196 void *data;
9197
9198 if (output == NULL) {
9199 PyErr_BadArgument();
9200 return -1;
9201 }
9202
9203 unicode = PyUnicode_FromWideChar(s, length);
9204 if (unicode == NULL)
9205 return -1;
9206
9207 kind = PyUnicode_KIND(unicode);
9208 data = PyUnicode_DATA(unicode);
9209
9210 for (i=0; i < length; ) {
9211 PyObject *exc;
9212 Py_UCS4 ch;
9213 int decimal;
9214 Py_ssize_t startpos;
9215
9216 ch = PyUnicode_READ(kind, data, i);
9217
9218 if (Py_UNICODE_ISSPACE(ch)) {
9219 *output++ = ' ';
9220 i++;
9221 continue;
9222 }
9223 decimal = Py_UNICODE_TODECIMAL(ch);
9224 if (decimal >= 0) {
9225 *output++ = '0' + decimal;
9226 i++;
9227 continue;
9228 }
9229 if (0 < ch && ch < 256) {
9230 *output++ = (char)ch;
9231 i++;
9232 continue;
9233 }
9234
9235 startpos = i;
9236 exc = NULL;
9237 raise_encode_exception(&exc, "decimal", unicode,
9238 startpos, startpos+1,
9239 "invalid decimal Unicode string");
9240 Py_XDECREF(exc);
9241 Py_DECREF(unicode);
9242 return -1;
9243 }
9244 /* 0-terminate the output string */
9245 *output++ = '\0';
9246 Py_DECREF(unicode);
9247 return 0;
9248 }
9249
9250 /* --- Helpers ------------------------------------------------------------ */
9251
/* Helper macro to fix up start/end slice values against a length.

   end is capped at len; a negative end has len added and is then floored
   at 0.  A negative start has len added and is then floored at 0 (start
   is not capped at len).  start and end must be modifiable lvalues; each
   argument may be evaluated more than once.

   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement (safe in an unbraced if/else), and every argument is
   parenthesized. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9266
9267 static Py_ssize_t
any_find_slice(PyObject * s1,PyObject * s2,Py_ssize_t start,Py_ssize_t end,int direction)9268 any_find_slice(PyObject* s1, PyObject* s2,
9269 Py_ssize_t start,
9270 Py_ssize_t end,
9271 int direction)
9272 {
9273 int kind1, kind2;
9274 void *buf1, *buf2;
9275 Py_ssize_t len1, len2, result;
9276
9277 kind1 = PyUnicode_KIND(s1);
9278 kind2 = PyUnicode_KIND(s2);
9279 if (kind1 < kind2)
9280 return -1;
9281
9282 len1 = PyUnicode_GET_LENGTH(s1);
9283 len2 = PyUnicode_GET_LENGTH(s2);
9284 ADJUST_INDICES(start, end, len1);
9285 if (end - start < len2)
9286 return -1;
9287
9288 buf1 = PyUnicode_DATA(s1);
9289 buf2 = PyUnicode_DATA(s2);
9290 if (len2 == 1) {
9291 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9292 result = findchar((const char *)buf1 + kind1*start,
9293 kind1, end - start, ch, direction);
9294 if (result == -1)
9295 return -1;
9296 else
9297 return start + result;
9298 }
9299
9300 if (kind2 != kind1) {
9301 buf2 = _PyUnicode_AsKind(s2, kind1);
9302 if (!buf2)
9303 return -2;
9304 }
9305
9306 if (direction > 0) {
9307 switch (kind1) {
9308 case PyUnicode_1BYTE_KIND:
9309 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9310 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9311 else
9312 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9313 break;
9314 case PyUnicode_2BYTE_KIND:
9315 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9316 break;
9317 case PyUnicode_4BYTE_KIND:
9318 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9319 break;
9320 default:
9321 Py_UNREACHABLE();
9322 }
9323 }
9324 else {
9325 switch (kind1) {
9326 case PyUnicode_1BYTE_KIND:
9327 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9328 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9329 else
9330 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9331 break;
9332 case PyUnicode_2BYTE_KIND:
9333 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9334 break;
9335 case PyUnicode_4BYTE_KIND:
9336 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9337 break;
9338 default:
9339 Py_UNREACHABLE();
9340 }
9341 }
9342
9343 if (kind2 != kind1)
9344 PyMem_Free(buf2);
9345
9346 return result;
9347 }
9348
9349 /* _PyUnicode_InsertThousandsGrouping() helper functions */
9350 #include "stringlib/localeutil.h"
9351
9352 /**
9353 * InsertThousandsGrouping:
9354 * @writer: Unicode writer.
9355 * @n_buffer: Number of characters in @buffer.
9356 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9357 * @d_pos: Start of digits string.
9358 * @n_digits: The number of digits in the string, in which we want
9359 * to put the grouping chars.
9360 * @min_width: The minimum width of the digits in the output string.
9361 * Output will be zero-padded on the left to fill.
9362 * @grouping: see definition in localeconv().
9363 * @thousands_sep: see definition in localeconv().
9364 *
9365 * There are 2 modes: counting and filling. If @writer is NULL,
9366 * we are in counting mode, else filling mode.
9367 * If counting, the required buffer size is returned.
9368 * If filling, we know the buffer will be large enough, so we don't
9369 * need to pass in the buffer size.
9370 * Inserts thousand grouping characters (as defined by grouping and
9371 * thousands_sep) into @writer.
9372 *
9373 * Return value: -1 on error, number of characters otherwise.
9374 **/
9375 Py_ssize_t
_PyUnicode_InsertThousandsGrouping(_PyUnicodeWriter * writer,Py_ssize_t n_buffer,PyObject * digits,Py_ssize_t d_pos,Py_ssize_t n_digits,Py_ssize_t min_width,const char * grouping,PyObject * thousands_sep,Py_UCS4 * maxchar)9376 _PyUnicode_InsertThousandsGrouping(
9377 _PyUnicodeWriter *writer,
9378 Py_ssize_t n_buffer,
9379 PyObject *digits,
9380 Py_ssize_t d_pos,
9381 Py_ssize_t n_digits,
9382 Py_ssize_t min_width,
9383 const char *grouping,
9384 PyObject *thousands_sep,
9385 Py_UCS4 *maxchar)
9386 {
9387 min_width = Py_MAX(0, min_width);
9388 if (writer) {
9389 assert(digits != NULL);
9390 assert(maxchar == NULL);
9391 }
9392 else {
9393 assert(digits == NULL);
9394 assert(maxchar != NULL);
9395 }
9396 assert(0 <= d_pos);
9397 assert(0 <= n_digits);
9398 assert(grouping != NULL);
9399
9400 if (digits != NULL) {
9401 if (PyUnicode_READY(digits) == -1) {
9402 return -1;
9403 }
9404 }
9405 if (PyUnicode_READY(thousands_sep) == -1) {
9406 return -1;
9407 }
9408
9409 Py_ssize_t count = 0;
9410 Py_ssize_t n_zeros;
9411 int loop_broken = 0;
9412 int use_separator = 0; /* First time through, don't append the
9413 separator. They only go between
9414 groups. */
9415 Py_ssize_t buffer_pos;
9416 Py_ssize_t digits_pos;
9417 Py_ssize_t len;
9418 Py_ssize_t n_chars;
9419 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9420 be looked at */
9421 /* A generator that returns all of the grouping widths, until it
9422 returns 0. */
9423 GroupGenerator groupgen;
9424 GroupGenerator_init(&groupgen, grouping);
9425 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9426
9427 /* if digits are not grouped, thousands separator
9428 should be an empty string */
9429 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9430
9431 digits_pos = d_pos + n_digits;
9432 if (writer) {
9433 buffer_pos = writer->pos + n_buffer;
9434 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9435 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
9436 }
9437 else {
9438 buffer_pos = n_buffer;
9439 }
9440
9441 if (!writer) {
9442 *maxchar = 127;
9443 }
9444
9445 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9446 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9447 n_zeros = Py_MAX(0, len - remaining);
9448 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9449
9450 /* Use n_zero zero's and n_chars chars */
9451
9452 /* Count only, don't do anything. */
9453 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9454
9455 /* Copy into the writer. */
9456 InsertThousandsGrouping_fill(writer, &buffer_pos,
9457 digits, &digits_pos,
9458 n_chars, n_zeros,
9459 use_separator ? thousands_sep : NULL,
9460 thousands_sep_len, maxchar);
9461
9462 /* Use a separator next time. */
9463 use_separator = 1;
9464
9465 remaining -= n_chars;
9466 min_width -= len;
9467
9468 if (remaining <= 0 && min_width <= 0) {
9469 loop_broken = 1;
9470 break;
9471 }
9472 min_width -= thousands_sep_len;
9473 }
9474 if (!loop_broken) {
9475 /* We left the loop without using a break statement. */
9476
9477 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9478 n_zeros = Py_MAX(0, len - remaining);
9479 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9480
9481 /* Use n_zero zero's and n_chars chars */
9482 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9483
9484 /* Copy into the writer. */
9485 InsertThousandsGrouping_fill(writer, &buffer_pos,
9486 digits, &digits_pos,
9487 n_chars, n_zeros,
9488 use_separator ? thousands_sep : NULL,
9489 thousands_sep_len, maxchar);
9490 }
9491 return count;
9492 }
9493
9494
9495 Py_ssize_t
PyUnicode_Count(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end)9496 PyUnicode_Count(PyObject *str,
9497 PyObject *substr,
9498 Py_ssize_t start,
9499 Py_ssize_t end)
9500 {
9501 Py_ssize_t result;
9502 int kind1, kind2;
9503 void *buf1 = NULL, *buf2 = NULL;
9504 Py_ssize_t len1, len2;
9505
9506 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9507 return -1;
9508
9509 kind1 = PyUnicode_KIND(str);
9510 kind2 = PyUnicode_KIND(substr);
9511 if (kind1 < kind2)
9512 return 0;
9513
9514 len1 = PyUnicode_GET_LENGTH(str);
9515 len2 = PyUnicode_GET_LENGTH(substr);
9516 ADJUST_INDICES(start, end, len1);
9517 if (end - start < len2)
9518 return 0;
9519
9520 buf1 = PyUnicode_DATA(str);
9521 buf2 = PyUnicode_DATA(substr);
9522 if (kind2 != kind1) {
9523 buf2 = _PyUnicode_AsKind(substr, kind1);
9524 if (!buf2)
9525 goto onError;
9526 }
9527
9528 switch (kind1) {
9529 case PyUnicode_1BYTE_KIND:
9530 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
9531 result = asciilib_count(
9532 ((Py_UCS1*)buf1) + start, end - start,
9533 buf2, len2, PY_SSIZE_T_MAX
9534 );
9535 else
9536 result = ucs1lib_count(
9537 ((Py_UCS1*)buf1) + start, end - start,
9538 buf2, len2, PY_SSIZE_T_MAX
9539 );
9540 break;
9541 case PyUnicode_2BYTE_KIND:
9542 result = ucs2lib_count(
9543 ((Py_UCS2*)buf1) + start, end - start,
9544 buf2, len2, PY_SSIZE_T_MAX
9545 );
9546 break;
9547 case PyUnicode_4BYTE_KIND:
9548 result = ucs4lib_count(
9549 ((Py_UCS4*)buf1) + start, end - start,
9550 buf2, len2, PY_SSIZE_T_MAX
9551 );
9552 break;
9553 default:
9554 Py_UNREACHABLE();
9555 }
9556
9557 if (kind2 != kind1)
9558 PyMem_Free(buf2);
9559
9560 return result;
9561 onError:
9562 if (kind2 != kind1 && buf2)
9563 PyMem_Free(buf2);
9564 return -1;
9565 }
9566
9567 Py_ssize_t
PyUnicode_Find(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9568 PyUnicode_Find(PyObject *str,
9569 PyObject *substr,
9570 Py_ssize_t start,
9571 Py_ssize_t end,
9572 int direction)
9573 {
9574 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9575 return -2;
9576
9577 return any_find_slice(str, substr, start, end, direction);
9578 }
9579
9580 Py_ssize_t
PyUnicode_FindChar(PyObject * str,Py_UCS4 ch,Py_ssize_t start,Py_ssize_t end,int direction)9581 PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9582 Py_ssize_t start, Py_ssize_t end,
9583 int direction)
9584 {
9585 int kind;
9586 Py_ssize_t len, result;
9587 if (PyUnicode_READY(str) == -1)
9588 return -2;
9589 len = PyUnicode_GET_LENGTH(str);
9590 ADJUST_INDICES(start, end, len);
9591 if (end - start < 1)
9592 return -1;
9593 kind = PyUnicode_KIND(str);
9594 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9595 kind, end-start, ch, direction);
9596 if (result == -1)
9597 return -1;
9598 else
9599 return start + result;
9600 }
9601
9602 static int
tailmatch(PyObject * self,PyObject * substring,Py_ssize_t start,Py_ssize_t end,int direction)9603 tailmatch(PyObject *self,
9604 PyObject *substring,
9605 Py_ssize_t start,
9606 Py_ssize_t end,
9607 int direction)
9608 {
9609 int kind_self;
9610 int kind_sub;
9611 void *data_self;
9612 void *data_sub;
9613 Py_ssize_t offset;
9614 Py_ssize_t i;
9615 Py_ssize_t end_sub;
9616
9617 if (PyUnicode_READY(self) == -1 ||
9618 PyUnicode_READY(substring) == -1)
9619 return -1;
9620
9621 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9622 end -= PyUnicode_GET_LENGTH(substring);
9623 if (end < start)
9624 return 0;
9625
9626 if (PyUnicode_GET_LENGTH(substring) == 0)
9627 return 1;
9628
9629 kind_self = PyUnicode_KIND(self);
9630 data_self = PyUnicode_DATA(self);
9631 kind_sub = PyUnicode_KIND(substring);
9632 data_sub = PyUnicode_DATA(substring);
9633 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9634
9635 if (direction > 0)
9636 offset = end;
9637 else
9638 offset = start;
9639
9640 if (PyUnicode_READ(kind_self, data_self, offset) ==
9641 PyUnicode_READ(kind_sub, data_sub, 0) &&
9642 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9643 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9644 /* If both are of the same kind, memcmp is sufficient */
9645 if (kind_self == kind_sub) {
9646 return ! memcmp((char *)data_self +
9647 (offset * PyUnicode_KIND(substring)),
9648 data_sub,
9649 PyUnicode_GET_LENGTH(substring) *
9650 PyUnicode_KIND(substring));
9651 }
9652 /* otherwise we have to compare each character by first accessing it */
9653 else {
9654 /* We do not need to compare 0 and len(substring)-1 because
9655 the if statement above ensured already that they are equal
9656 when we end up here. */
9657 for (i = 1; i < end_sub; ++i) {
9658 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9659 PyUnicode_READ(kind_sub, data_sub, i))
9660 return 0;
9661 }
9662 return 1;
9663 }
9664 }
9665
9666 return 0;
9667 }
9668
9669 Py_ssize_t
PyUnicode_Tailmatch(PyObject * str,PyObject * substr,Py_ssize_t start,Py_ssize_t end,int direction)9670 PyUnicode_Tailmatch(PyObject *str,
9671 PyObject *substr,
9672 Py_ssize_t start,
9673 Py_ssize_t end,
9674 int direction)
9675 {
9676 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9677 return -1;
9678
9679 return tailmatch(str, substr, start, end, direction);
9680 }
9681
9682 static PyObject *
ascii_upper_or_lower(PyObject * self,int lower)9683 ascii_upper_or_lower(PyObject *self, int lower)
9684 {
9685 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9686 char *resdata, *data = PyUnicode_DATA(self);
9687 PyObject *res;
9688
9689 res = PyUnicode_New(len, 127);
9690 if (res == NULL)
9691 return NULL;
9692 resdata = PyUnicode_DATA(res);
9693 if (lower)
9694 _Py_bytes_lower(resdata, data, len);
9695 else
9696 _Py_bytes_upper(resdata, data, len);
9697 return res;
9698 }
9699
9700 static Py_UCS4
handle_capital_sigma(int kind,void * data,Py_ssize_t length,Py_ssize_t i)9701 handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
9702 {
9703 Py_ssize_t j;
9704 int final_sigma;
9705 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
9706 /* U+03A3 is in the Final_Sigma context when, it is found like this:
9707
9708 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9709
9710 where ! is a negation and \p{xxx} is a character with property xxx.
9711 */
9712 for (j = i - 1; j >= 0; j--) {
9713 c = PyUnicode_READ(kind, data, j);
9714 if (!_PyUnicode_IsCaseIgnorable(c))
9715 break;
9716 }
9717 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9718 if (final_sigma) {
9719 for (j = i + 1; j < length; j++) {
9720 c = PyUnicode_READ(kind, data, j);
9721 if (!_PyUnicode_IsCaseIgnorable(c))
9722 break;
9723 }
9724 final_sigma = j == length || !_PyUnicode_IsCased(c);
9725 }
9726 return (final_sigma) ? 0x3C2 : 0x3C3;
9727 }
9728
9729 static int
lower_ucs4(int kind,void * data,Py_ssize_t length,Py_ssize_t i,Py_UCS4 c,Py_UCS4 * mapped)9730 lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9731 Py_UCS4 c, Py_UCS4 *mapped)
9732 {
9733 /* Obscure special case. */
9734 if (c == 0x3A3) {
9735 mapped[0] = handle_capital_sigma(kind, data, length, i);
9736 return 1;
9737 }
9738 return _PyUnicode_ToLowerFull(c, mapped);
9739 }
9740
9741 static Py_ssize_t
do_capitalize(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9742 do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9743 {
9744 Py_ssize_t i, k = 0;
9745 int n_res, j;
9746 Py_UCS4 c, mapped[3];
9747
9748 c = PyUnicode_READ(kind, data, 0);
9749 n_res = _PyUnicode_ToUpperFull(c, mapped);
9750 for (j = 0; j < n_res; j++) {
9751 *maxchar = Py_MAX(*maxchar, mapped[j]);
9752 res[k++] = mapped[j];
9753 }
9754 for (i = 1; i < length; i++) {
9755 c = PyUnicode_READ(kind, data, i);
9756 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9757 for (j = 0; j < n_res; j++) {
9758 *maxchar = Py_MAX(*maxchar, mapped[j]);
9759 res[k++] = mapped[j];
9760 }
9761 }
9762 return k;
9763 }
9764
9765 static Py_ssize_t
do_swapcase(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9766 do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9767 Py_ssize_t i, k = 0;
9768
9769 for (i = 0; i < length; i++) {
9770 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9771 int n_res, j;
9772 if (Py_UNICODE_ISUPPER(c)) {
9773 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9774 }
9775 else if (Py_UNICODE_ISLOWER(c)) {
9776 n_res = _PyUnicode_ToUpperFull(c, mapped);
9777 }
9778 else {
9779 n_res = 1;
9780 mapped[0] = c;
9781 }
9782 for (j = 0; j < n_res; j++) {
9783 *maxchar = Py_MAX(*maxchar, mapped[j]);
9784 res[k++] = mapped[j];
9785 }
9786 }
9787 return k;
9788 }
9789
9790 static Py_ssize_t
do_upper_or_lower(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar,int lower)9791 do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9792 Py_UCS4 *maxchar, int lower)
9793 {
9794 Py_ssize_t i, k = 0;
9795
9796 for (i = 0; i < length; i++) {
9797 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9798 int n_res, j;
9799 if (lower)
9800 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9801 else
9802 n_res = _PyUnicode_ToUpperFull(c, mapped);
9803 for (j = 0; j < n_res; j++) {
9804 *maxchar = Py_MAX(*maxchar, mapped[j]);
9805 res[k++] = mapped[j];
9806 }
9807 }
9808 return k;
9809 }
9810
9811 static Py_ssize_t
do_upper(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9812 do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9813 {
9814 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9815 }
9816
9817 static Py_ssize_t
do_lower(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9818 do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9819 {
9820 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9821 }
9822
9823 static Py_ssize_t
do_casefold(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9824 do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9825 {
9826 Py_ssize_t i, k = 0;
9827
9828 for (i = 0; i < length; i++) {
9829 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9830 Py_UCS4 mapped[3];
9831 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9832 for (j = 0; j < n_res; j++) {
9833 *maxchar = Py_MAX(*maxchar, mapped[j]);
9834 res[k++] = mapped[j];
9835 }
9836 }
9837 return k;
9838 }
9839
9840 static Py_ssize_t
do_title(int kind,void * data,Py_ssize_t length,Py_UCS4 * res,Py_UCS4 * maxchar)9841 do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9842 {
9843 Py_ssize_t i, k = 0;
9844 int previous_is_cased;
9845
9846 previous_is_cased = 0;
9847 for (i = 0; i < length; i++) {
9848 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9849 Py_UCS4 mapped[3];
9850 int n_res, j;
9851
9852 if (previous_is_cased)
9853 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9854 else
9855 n_res = _PyUnicode_ToTitleFull(c, mapped);
9856
9857 for (j = 0; j < n_res; j++) {
9858 *maxchar = Py_MAX(*maxchar, mapped[j]);
9859 res[k++] = mapped[j];
9860 }
9861
9862 previous_is_cased = _PyUnicode_IsCased(c);
9863 }
9864 return k;
9865 }
9866
9867 static PyObject *
case_operation(PyObject * self,Py_ssize_t (* perform)(int,void *,Py_ssize_t,Py_UCS4 *,Py_UCS4 *))9868 case_operation(PyObject *self,
9869 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9870 {
9871 PyObject *res = NULL;
9872 Py_ssize_t length, newlength = 0;
9873 int kind, outkind;
9874 void *data, *outdata;
9875 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9876
9877 assert(PyUnicode_IS_READY(self));
9878
9879 kind = PyUnicode_KIND(self);
9880 data = PyUnicode_DATA(self);
9881 length = PyUnicode_GET_LENGTH(self);
9882 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9883 PyErr_SetString(PyExc_OverflowError, "string is too long");
9884 return NULL;
9885 }
9886 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9887 if (tmp == NULL)
9888 return PyErr_NoMemory();
9889 newlength = perform(kind, data, length, tmp, &maxchar);
9890 res = PyUnicode_New(newlength, maxchar);
9891 if (res == NULL)
9892 goto leave;
9893 tmpend = tmp + newlength;
9894 outdata = PyUnicode_DATA(res);
9895 outkind = PyUnicode_KIND(res);
9896 switch (outkind) {
9897 case PyUnicode_1BYTE_KIND:
9898 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9899 break;
9900 case PyUnicode_2BYTE_KIND:
9901 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9902 break;
9903 case PyUnicode_4BYTE_KIND:
9904 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9905 break;
9906 default:
9907 Py_UNREACHABLE();
9908 }
9909 leave:
9910 PyMem_FREE(tmp);
9911 return res;
9912 }
9913
9914 PyObject *
PyUnicode_Join(PyObject * separator,PyObject * seq)9915 PyUnicode_Join(PyObject *separator, PyObject *seq)
9916 {
9917 PyObject *res;
9918 PyObject *fseq;
9919 Py_ssize_t seqlen;
9920 PyObject **items;
9921
9922 fseq = PySequence_Fast(seq, "can only join an iterable");
9923 if (fseq == NULL) {
9924 return NULL;
9925 }
9926
9927 /* NOTE: the following code can't call back into Python code,
9928 * so we are sure that fseq won't be mutated.
9929 */
9930
9931 items = PySequence_Fast_ITEMS(fseq);
9932 seqlen = PySequence_Fast_GET_SIZE(fseq);
9933 res = _PyUnicode_JoinArray(separator, items, seqlen);
9934 Py_DECREF(fseq);
9935 return res;
9936 }
9937
/* Concatenate the `seqlen` strings in `items`, placing `separator` between
 * consecutive items.
 *
 * separator: str object inserted between items; NULL selects a single
 *            space.  It is only examined when seqlen is at least 2.
 * items:     array of objects; every element must be a str (or subclass)
 *            instance, otherwise TypeError is raised.
 * seqlen:    number of entries in `items`.
 *
 * Returns the joined string, or NULL on failure (the onError path).
 * TypeError and OverflowError are raised explicitly below.
 */
PyObject *
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
{
    PyObject *res = NULL; /* the result */
    PyObject *sep = NULL;
    Py_ssize_t seplen;
    PyObject *item;
    Py_ssize_t sz, i, res_offset;
    Py_UCS4 maxchar;
    Py_UCS4 item_maxchar;
    int use_memcpy;
    unsigned char *res_data = NULL, *sep_data = NULL;
    PyObject *last_obj;
    unsigned int kind = 0;

    /* If empty sequence, return u"". */
    if (seqlen == 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* If singleton sequence with an exact Unicode, return that. */
    last_obj = NULL;
    if (seqlen == 1) {
        if (PyUnicode_CheckExact(items[0])) {
            res = items[0];
            Py_INCREF(res);
            return res;
        }
        /* A single str-subclass item: no separator is needed, so `sep`
           stays NULL and contributes nothing to length or maxchar. */
        seplen = 0;
        maxchar = 0;
    }
    else {
        /* Set up sep and seplen */
        if (separator == NULL) {
            /* fall back to a blank space separator */
            sep = PyUnicode_FromOrdinal(' ');
            if (!sep)
                goto onError;
            seplen = 1;
            maxchar = 32;
        }
        else {
            if (!PyUnicode_Check(separator)) {
                PyErr_Format(PyExc_TypeError,
                             "separator: expected str instance,"
                             " %.80s found",
                             Py_TYPE(separator)->tp_name);
                goto onError;
            }
            if (PyUnicode_READY(separator))
                goto onError;
            sep = separator;
            seplen = PyUnicode_GET_LENGTH(separator);
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
            /* inc refcount to keep this code path symmetric with the
               above case of a blank separator */
            Py_INCREF(sep);
        }
        /* Seed the kind comparison in the pre-pass with the separator. */
        last_obj = sep;
    }

    /* There are at least two things to join, or else we have a subclass
     * of str in the sequence.
     * Do a pre-pass to figure out the total amount of space we'll
     * need (sz), and see whether all arguments are strings.
     */
    sz = 0;
#ifdef Py_DEBUG
    use_memcpy = 0;
#else
    use_memcpy = 1;
#endif
    for (i = 0; i < seqlen; i++) {
        size_t add_sz;
        item = items[i];
        if (!PyUnicode_Check(item)) {
            PyErr_Format(PyExc_TypeError,
                         "sequence item %zd: expected str instance,"
                         " %.80s found",
                         i, Py_TYPE(item)->tp_name);
            goto onError;
        }
        if (PyUnicode_READY(item) == -1)
            goto onError;
        add_sz = PyUnicode_GET_LENGTH(item);
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
        maxchar = Py_MAX(maxchar, item_maxchar);
        /* Every item except the first is preceded by the separator. */
        if (i != 0) {
            add_sz += seplen;
        }
        /* Reject a total length that would not fit in Py_ssize_t. */
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
            PyErr_SetString(PyExc_OverflowError,
                            "join() result is too long for a Python string");
            goto onError;
        }
        sz += add_sz;
        /* The memcpy path stays enabled only while each object has the
           same kind as the one before it (separator included). */
        if (use_memcpy && last_obj != NULL) {
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
                use_memcpy = 0;
        }
        last_obj = item;
    }

    res = PyUnicode_New(sz, maxchar);
    if (res == NULL)
        goto onError;

    /* Catenate everything. */
#ifdef Py_DEBUG
    use_memcpy = 0;
#else
    if (use_memcpy) {
        res_data = PyUnicode_1BYTE_DATA(res);
        kind = PyUnicode_KIND(res);
        if (seplen != 0)
            sep_data = PyUnicode_1BYTE_DATA(sep);
    }
#endif
    if (use_memcpy) {
        /* Raw byte copies: `kind` is used as the per-character byte width
           and res_data advances as a byte cursor. */
        for (i = 0; i < seqlen; ++i) {
            Py_ssize_t itemlen;
            item = items[i];

            /* Copy item, and maybe the separator. */
            if (i && seplen != 0) {
                memcpy(res_data,
                       sep_data,
                       kind * seplen);
                res_data += kind * seplen;
            }

            itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                memcpy(res_data,
                       PyUnicode_DATA(item),
                       kind * itemlen);
                res_data += kind * itemlen;
            }
        }
        assert(res_data == PyUnicode_1BYTE_DATA(res)
                           + kind * PyUnicode_GET_LENGTH(res));
    }
    else {
        /* Character-wise copies through _PyUnicode_FastCopyCharacters();
           res_offset is a character index into the result. */
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
            Py_ssize_t itemlen;
            item = items[i];

            /* Copy item, and maybe the separator. */
            if (i && seplen != 0) {
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
                res_offset += seplen;
            }

            itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
                res_offset += itemlen;
            }
        }
        assert(res_offset == PyUnicode_GET_LENGTH(res));
    }

    /* Drop the reference taken (or created) when the separator was set up;
       sep is NULL in the single-item case. */
    Py_XDECREF(sep);
    assert(_PyUnicode_CheckConsistency(res, 1));
    return res;

  onError:
    Py_XDECREF(sep);
    Py_XDECREF(res);
    return NULL;
}
10109
10110 void
_PyUnicode_FastFill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10111 _PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10112 Py_UCS4 fill_char)
10113 {
10114 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10115 void *data = PyUnicode_DATA(unicode);
10116 assert(PyUnicode_IS_READY(unicode));
10117 assert(unicode_modifiable(unicode));
10118 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10119 assert(start >= 0);
10120 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10121 FILL(kind, data, fill_char, start, length);
10122 }
10123
10124 Py_ssize_t
PyUnicode_Fill(PyObject * unicode,Py_ssize_t start,Py_ssize_t length,Py_UCS4 fill_char)10125 PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10126 Py_UCS4 fill_char)
10127 {
10128 Py_ssize_t maxlen;
10129
10130 if (!PyUnicode_Check(unicode)) {
10131 PyErr_BadInternalCall();
10132 return -1;
10133 }
10134 if (PyUnicode_READY(unicode) == -1)
10135 return -1;
10136 if (unicode_check_modifiable(unicode))
10137 return -1;
10138
10139 if (start < 0) {
10140 PyErr_SetString(PyExc_IndexError, "string index out of range");
10141 return -1;
10142 }
10143 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10144 PyErr_SetString(PyExc_ValueError,
10145 "fill character is bigger than "
10146 "the string maximum character");
10147 return -1;
10148 }
10149
10150 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10151 length = Py_MIN(maxlen, length);
10152 if (length <= 0)
10153 return 0;
10154
10155 _PyUnicode_FastFill(unicode, start, length, fill_char);
10156 return length;
10157 }
10158
10159 static PyObject *
pad(PyObject * self,Py_ssize_t left,Py_ssize_t right,Py_UCS4 fill)10160 pad(PyObject *self,
10161 Py_ssize_t left,
10162 Py_ssize_t right,
10163 Py_UCS4 fill)
10164 {
10165 PyObject *u;
10166 Py_UCS4 maxchar;
10167 int kind;
10168 void *data;
10169
10170 if (left < 0)
10171 left = 0;
10172 if (right < 0)
10173 right = 0;
10174
10175 if (left == 0 && right == 0)
10176 return unicode_result_unchanged(self);
10177
10178 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10179 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10180 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10181 return NULL;
10182 }
10183 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10184 maxchar = Py_MAX(maxchar, fill);
10185 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10186 if (!u)
10187 return NULL;
10188
10189 kind = PyUnicode_KIND(u);
10190 data = PyUnicode_DATA(u);
10191 if (left)
10192 FILL(kind, data, fill, 0, left);
10193 if (right)
10194 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10195 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10196 assert(_PyUnicode_CheckConsistency(u, 1));
10197 return u;
10198 }
10199
10200 PyObject *
PyUnicode_Splitlines(PyObject * string,int keepends)10201 PyUnicode_Splitlines(PyObject *string, int keepends)
10202 {
10203 PyObject *list;
10204
10205 if (ensure_unicode(string) < 0)
10206 return NULL;
10207
10208 switch (PyUnicode_KIND(string)) {
10209 case PyUnicode_1BYTE_KIND:
10210 if (PyUnicode_IS_ASCII(string))
10211 list = asciilib_splitlines(
10212 string, PyUnicode_1BYTE_DATA(string),
10213 PyUnicode_GET_LENGTH(string), keepends);
10214 else
10215 list = ucs1lib_splitlines(
10216 string, PyUnicode_1BYTE_DATA(string),
10217 PyUnicode_GET_LENGTH(string), keepends);
10218 break;
10219 case PyUnicode_2BYTE_KIND:
10220 list = ucs2lib_splitlines(
10221 string, PyUnicode_2BYTE_DATA(string),
10222 PyUnicode_GET_LENGTH(string), keepends);
10223 break;
10224 case PyUnicode_4BYTE_KIND:
10225 list = ucs4lib_splitlines(
10226 string, PyUnicode_4BYTE_DATA(string),
10227 PyUnicode_GET_LENGTH(string), keepends);
10228 break;
10229 default:
10230 Py_UNREACHABLE();
10231 }
10232 return list;
10233 }
10234
10235 static PyObject *
split(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10236 split(PyObject *self,
10237 PyObject *substring,
10238 Py_ssize_t maxcount)
10239 {
10240 int kind1, kind2;
10241 void *buf1, *buf2;
10242 Py_ssize_t len1, len2;
10243 PyObject* out;
10244
10245 if (maxcount < 0)
10246 maxcount = PY_SSIZE_T_MAX;
10247
10248 if (PyUnicode_READY(self) == -1)
10249 return NULL;
10250
10251 if (substring == NULL)
10252 switch (PyUnicode_KIND(self)) {
10253 case PyUnicode_1BYTE_KIND:
10254 if (PyUnicode_IS_ASCII(self))
10255 return asciilib_split_whitespace(
10256 self, PyUnicode_1BYTE_DATA(self),
10257 PyUnicode_GET_LENGTH(self), maxcount
10258 );
10259 else
10260 return ucs1lib_split_whitespace(
10261 self, PyUnicode_1BYTE_DATA(self),
10262 PyUnicode_GET_LENGTH(self), maxcount
10263 );
10264 case PyUnicode_2BYTE_KIND:
10265 return ucs2lib_split_whitespace(
10266 self, PyUnicode_2BYTE_DATA(self),
10267 PyUnicode_GET_LENGTH(self), maxcount
10268 );
10269 case PyUnicode_4BYTE_KIND:
10270 return ucs4lib_split_whitespace(
10271 self, PyUnicode_4BYTE_DATA(self),
10272 PyUnicode_GET_LENGTH(self), maxcount
10273 );
10274 default:
10275 Py_UNREACHABLE();
10276 }
10277
10278 if (PyUnicode_READY(substring) == -1)
10279 return NULL;
10280
10281 kind1 = PyUnicode_KIND(self);
10282 kind2 = PyUnicode_KIND(substring);
10283 len1 = PyUnicode_GET_LENGTH(self);
10284 len2 = PyUnicode_GET_LENGTH(substring);
10285 if (kind1 < kind2 || len1 < len2) {
10286 out = PyList_New(1);
10287 if (out == NULL)
10288 return NULL;
10289 Py_INCREF(self);
10290 PyList_SET_ITEM(out, 0, self);
10291 return out;
10292 }
10293 buf1 = PyUnicode_DATA(self);
10294 buf2 = PyUnicode_DATA(substring);
10295 if (kind2 != kind1) {
10296 buf2 = _PyUnicode_AsKind(substring, kind1);
10297 if (!buf2)
10298 return NULL;
10299 }
10300
10301 switch (kind1) {
10302 case PyUnicode_1BYTE_KIND:
10303 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10304 out = asciilib_split(
10305 self, buf1, len1, buf2, len2, maxcount);
10306 else
10307 out = ucs1lib_split(
10308 self, buf1, len1, buf2, len2, maxcount);
10309 break;
10310 case PyUnicode_2BYTE_KIND:
10311 out = ucs2lib_split(
10312 self, buf1, len1, buf2, len2, maxcount);
10313 break;
10314 case PyUnicode_4BYTE_KIND:
10315 out = ucs4lib_split(
10316 self, buf1, len1, buf2, len2, maxcount);
10317 break;
10318 default:
10319 out = NULL;
10320 }
10321 if (kind2 != kind1)
10322 PyMem_Free(buf2);
10323 return out;
10324 }
10325
10326 static PyObject *
rsplit(PyObject * self,PyObject * substring,Py_ssize_t maxcount)10327 rsplit(PyObject *self,
10328 PyObject *substring,
10329 Py_ssize_t maxcount)
10330 {
10331 int kind1, kind2;
10332 void *buf1, *buf2;
10333 Py_ssize_t len1, len2;
10334 PyObject* out;
10335
10336 if (maxcount < 0)
10337 maxcount = PY_SSIZE_T_MAX;
10338
10339 if (PyUnicode_READY(self) == -1)
10340 return NULL;
10341
10342 if (substring == NULL)
10343 switch (PyUnicode_KIND(self)) {
10344 case PyUnicode_1BYTE_KIND:
10345 if (PyUnicode_IS_ASCII(self))
10346 return asciilib_rsplit_whitespace(
10347 self, PyUnicode_1BYTE_DATA(self),
10348 PyUnicode_GET_LENGTH(self), maxcount
10349 );
10350 else
10351 return ucs1lib_rsplit_whitespace(
10352 self, PyUnicode_1BYTE_DATA(self),
10353 PyUnicode_GET_LENGTH(self), maxcount
10354 );
10355 case PyUnicode_2BYTE_KIND:
10356 return ucs2lib_rsplit_whitespace(
10357 self, PyUnicode_2BYTE_DATA(self),
10358 PyUnicode_GET_LENGTH(self), maxcount
10359 );
10360 case PyUnicode_4BYTE_KIND:
10361 return ucs4lib_rsplit_whitespace(
10362 self, PyUnicode_4BYTE_DATA(self),
10363 PyUnicode_GET_LENGTH(self), maxcount
10364 );
10365 default:
10366 Py_UNREACHABLE();
10367 }
10368
10369 if (PyUnicode_READY(substring) == -1)
10370 return NULL;
10371
10372 kind1 = PyUnicode_KIND(self);
10373 kind2 = PyUnicode_KIND(substring);
10374 len1 = PyUnicode_GET_LENGTH(self);
10375 len2 = PyUnicode_GET_LENGTH(substring);
10376 if (kind1 < kind2 || len1 < len2) {
10377 out = PyList_New(1);
10378 if (out == NULL)
10379 return NULL;
10380 Py_INCREF(self);
10381 PyList_SET_ITEM(out, 0, self);
10382 return out;
10383 }
10384 buf1 = PyUnicode_DATA(self);
10385 buf2 = PyUnicode_DATA(substring);
10386 if (kind2 != kind1) {
10387 buf2 = _PyUnicode_AsKind(substring, kind1);
10388 if (!buf2)
10389 return NULL;
10390 }
10391
10392 switch (kind1) {
10393 case PyUnicode_1BYTE_KIND:
10394 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10395 out = asciilib_rsplit(
10396 self, buf1, len1, buf2, len2, maxcount);
10397 else
10398 out = ucs1lib_rsplit(
10399 self, buf1, len1, buf2, len2, maxcount);
10400 break;
10401 case PyUnicode_2BYTE_KIND:
10402 out = ucs2lib_rsplit(
10403 self, buf1, len1, buf2, len2, maxcount);
10404 break;
10405 case PyUnicode_4BYTE_KIND:
10406 out = ucs4lib_rsplit(
10407 self, buf1, len1, buf2, len2, maxcount);
10408 break;
10409 default:
10410 out = NULL;
10411 }
10412 if (kind2 != kind1)
10413 PyMem_Free(buf2);
10414 return out;
10415 }
10416
10417 static Py_ssize_t
anylib_find(int kind,PyObject * str1,void * buf1,Py_ssize_t len1,PyObject * str2,void * buf2,Py_ssize_t len2,Py_ssize_t offset)10418 anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10419 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10420 {
10421 switch (kind) {
10422 case PyUnicode_1BYTE_KIND:
10423 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10424 return asciilib_find(buf1, len1, buf2, len2, offset);
10425 else
10426 return ucs1lib_find(buf1, len1, buf2, len2, offset);
10427 case PyUnicode_2BYTE_KIND:
10428 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10429 case PyUnicode_4BYTE_KIND:
10430 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10431 }
10432 Py_UNREACHABLE();
10433 }
10434
10435 static Py_ssize_t
anylib_count(int kind,PyObject * sstr,void * sbuf,Py_ssize_t slen,PyObject * str1,void * buf1,Py_ssize_t len1,Py_ssize_t maxcount)10436 anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10437 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10438 {
10439 switch (kind) {
10440 case PyUnicode_1BYTE_KIND:
10441 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10442 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10443 else
10444 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10445 case PyUnicode_2BYTE_KIND:
10446 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10447 case PyUnicode_4BYTE_KIND:
10448 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10449 }
10450 Py_UNREACHABLE();
10451 }
10452
10453 static void
replace_1char_inplace(PyObject * u,Py_ssize_t pos,Py_UCS4 u1,Py_UCS4 u2,Py_ssize_t maxcount)10454 replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10455 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10456 {
10457 int kind = PyUnicode_KIND(u);
10458 void *data = PyUnicode_DATA(u);
10459 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10460 if (kind == PyUnicode_1BYTE_KIND) {
10461 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10462 (Py_UCS1 *)data + len,
10463 u1, u2, maxcount);
10464 }
10465 else if (kind == PyUnicode_2BYTE_KIND) {
10466 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10467 (Py_UCS2 *)data + len,
10468 u1, u2, maxcount);
10469 }
10470 else {
10471 assert(kind == PyUnicode_4BYTE_KIND);
10472 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10473 (Py_UCS4 *)data + len,
10474 u1, u2, maxcount);
10475 }
10476 }
10477
10478 static PyObject *
replace(PyObject * self,PyObject * str1,PyObject * str2,Py_ssize_t maxcount)10479 replace(PyObject *self, PyObject *str1,
10480 PyObject *str2, Py_ssize_t maxcount)
10481 {
10482 PyObject *u;
10483 char *sbuf = PyUnicode_DATA(self);
10484 char *buf1 = PyUnicode_DATA(str1);
10485 char *buf2 = PyUnicode_DATA(str2);
10486 int srelease = 0, release1 = 0, release2 = 0;
10487 int skind = PyUnicode_KIND(self);
10488 int kind1 = PyUnicode_KIND(str1);
10489 int kind2 = PyUnicode_KIND(str2);
10490 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10491 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10492 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10493 int mayshrink;
10494 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10495
10496 if (maxcount < 0)
10497 maxcount = PY_SSIZE_T_MAX;
10498 else if (maxcount == 0 || slen == 0)
10499 goto nothing;
10500
10501 if (str1 == str2)
10502 goto nothing;
10503
10504 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10505 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10506 if (maxchar < maxchar_str1)
10507 /* substring too wide to be present */
10508 goto nothing;
10509 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10510 /* Replacing str1 with str2 may cause a maxchar reduction in the
10511 result string. */
10512 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10513 maxchar = Py_MAX(maxchar, maxchar_str2);
10514
10515 if (len1 == len2) {
10516 /* same length */
10517 if (len1 == 0)
10518 goto nothing;
10519 if (len1 == 1) {
10520 /* replace characters */
10521 Py_UCS4 u1, u2;
10522 Py_ssize_t pos;
10523
10524 u1 = PyUnicode_READ(kind1, buf1, 0);
10525 pos = findchar(sbuf, skind, slen, u1, 1);
10526 if (pos < 0)
10527 goto nothing;
10528 u2 = PyUnicode_READ(kind2, buf2, 0);
10529 u = PyUnicode_New(slen, maxchar);
10530 if (!u)
10531 goto error;
10532
10533 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10534 replace_1char_inplace(u, pos, u1, u2, maxcount);
10535 }
10536 else {
10537 int rkind = skind;
10538 char *res;
10539 Py_ssize_t i;
10540
10541 if (kind1 < rkind) {
10542 /* widen substring */
10543 buf1 = _PyUnicode_AsKind(str1, rkind);
10544 if (!buf1) goto error;
10545 release1 = 1;
10546 }
10547 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10548 if (i < 0)
10549 goto nothing;
10550 if (rkind > kind2) {
10551 /* widen replacement */
10552 buf2 = _PyUnicode_AsKind(str2, rkind);
10553 if (!buf2) goto error;
10554 release2 = 1;
10555 }
10556 else if (rkind < kind2) {
10557 /* widen self and buf1 */
10558 rkind = kind2;
10559 if (release1) PyMem_Free(buf1);
10560 release1 = 0;
10561 sbuf = _PyUnicode_AsKind(self, rkind);
10562 if (!sbuf) goto error;
10563 srelease = 1;
10564 buf1 = _PyUnicode_AsKind(str1, rkind);
10565 if (!buf1) goto error;
10566 release1 = 1;
10567 }
10568 u = PyUnicode_New(slen, maxchar);
10569 if (!u)
10570 goto error;
10571 assert(PyUnicode_KIND(u) == rkind);
10572 res = PyUnicode_DATA(u);
10573
10574 memcpy(res, sbuf, rkind * slen);
10575 /* change everything in-place, starting with this one */
10576 memcpy(res + rkind * i,
10577 buf2,
10578 rkind * len2);
10579 i += len1;
10580
10581 while ( --maxcount > 0) {
10582 i = anylib_find(rkind, self,
10583 sbuf+rkind*i, slen-i,
10584 str1, buf1, len1, i);
10585 if (i == -1)
10586 break;
10587 memcpy(res + rkind * i,
10588 buf2,
10589 rkind * len2);
10590 i += len1;
10591 }
10592 }
10593 }
10594 else {
10595 Py_ssize_t n, i, j, ires;
10596 Py_ssize_t new_size;
10597 int rkind = skind;
10598 char *res;
10599
10600 if (kind1 < rkind) {
10601 /* widen substring */
10602 buf1 = _PyUnicode_AsKind(str1, rkind);
10603 if (!buf1) goto error;
10604 release1 = 1;
10605 }
10606 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10607 if (n == 0)
10608 goto nothing;
10609 if (kind2 < rkind) {
10610 /* widen replacement */
10611 buf2 = _PyUnicode_AsKind(str2, rkind);
10612 if (!buf2) goto error;
10613 release2 = 1;
10614 }
10615 else if (kind2 > rkind) {
10616 /* widen self and buf1 */
10617 rkind = kind2;
10618 sbuf = _PyUnicode_AsKind(self, rkind);
10619 if (!sbuf) goto error;
10620 srelease = 1;
10621 if (release1) PyMem_Free(buf1);
10622 release1 = 0;
10623 buf1 = _PyUnicode_AsKind(str1, rkind);
10624 if (!buf1) goto error;
10625 release1 = 1;
10626 }
10627 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10628 PyUnicode_GET_LENGTH(str1))); */
10629 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10630 PyErr_SetString(PyExc_OverflowError,
10631 "replace string is too long");
10632 goto error;
10633 }
10634 new_size = slen + n * (len2 - len1);
10635 if (new_size == 0) {
10636 _Py_INCREF_UNICODE_EMPTY();
10637 if (!unicode_empty)
10638 goto error;
10639 u = unicode_empty;
10640 goto done;
10641 }
10642 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10643 PyErr_SetString(PyExc_OverflowError,
10644 "replace string is too long");
10645 goto error;
10646 }
10647 u = PyUnicode_New(new_size, maxchar);
10648 if (!u)
10649 goto error;
10650 assert(PyUnicode_KIND(u) == rkind);
10651 res = PyUnicode_DATA(u);
10652 ires = i = 0;
10653 if (len1 > 0) {
10654 while (n-- > 0) {
10655 /* look for next match */
10656 j = anylib_find(rkind, self,
10657 sbuf + rkind * i, slen-i,
10658 str1, buf1, len1, i);
10659 if (j == -1)
10660 break;
10661 else if (j > i) {
10662 /* copy unchanged part [i:j] */
10663 memcpy(res + rkind * ires,
10664 sbuf + rkind * i,
10665 rkind * (j-i));
10666 ires += j - i;
10667 }
10668 /* copy substitution string */
10669 if (len2 > 0) {
10670 memcpy(res + rkind * ires,
10671 buf2,
10672 rkind * len2);
10673 ires += len2;
10674 }
10675 i = j + len1;
10676 }
10677 if (i < slen)
10678 /* copy tail [i:] */
10679 memcpy(res + rkind * ires,
10680 sbuf + rkind * i,
10681 rkind * (slen-i));
10682 }
10683 else {
10684 /* interleave */
10685 while (n > 0) {
10686 memcpy(res + rkind * ires,
10687 buf2,
10688 rkind * len2);
10689 ires += len2;
10690 if (--n <= 0)
10691 break;
10692 memcpy(res + rkind * ires,
10693 sbuf + rkind * i,
10694 rkind);
10695 ires++;
10696 i++;
10697 }
10698 memcpy(res + rkind * ires,
10699 sbuf + rkind * i,
10700 rkind * (slen-i));
10701 }
10702 }
10703
10704 if (mayshrink) {
10705 unicode_adjust_maxchar(&u);
10706 if (u == NULL)
10707 goto error;
10708 }
10709
10710 done:
10711 if (srelease)
10712 PyMem_FREE(sbuf);
10713 if (release1)
10714 PyMem_FREE(buf1);
10715 if (release2)
10716 PyMem_FREE(buf2);
10717 assert(_PyUnicode_CheckConsistency(u, 1));
10718 return u;
10719
10720 nothing:
10721 /* nothing to replace; return original string (when possible) */
10722 if (srelease)
10723 PyMem_FREE(sbuf);
10724 if (release1)
10725 PyMem_FREE(buf1);
10726 if (release2)
10727 PyMem_FREE(buf2);
10728 return unicode_result_unchanged(self);
10729
10730 error:
10731 if (srelease && sbuf)
10732 PyMem_FREE(sbuf);
10733 if (release1 && buf1)
10734 PyMem_FREE(buf1);
10735 if (release2 && buf2)
10736 PyMem_FREE(buf2);
10737 return NULL;
10738 }
10739
10740 /* --- Unicode Object Methods --------------------------------------------- */
10741
10742 /*[clinic input]
10743 str.title as unicode_title
10744
10745 Return a version of the string where each word is titlecased.
10746
10747 More specifically, words start with uppercased characters and all remaining
10748 cased characters have lower case.
10749 [clinic start generated code]*/
10750
10751 static PyObject *
unicode_title_impl(PyObject * self)10752 unicode_title_impl(PyObject *self)
10753 /*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
10754 {
10755 if (PyUnicode_READY(self) == -1)
10756 return NULL;
10757 return case_operation(self, do_title);
10758 }
10759
10760 /*[clinic input]
10761 str.capitalize as unicode_capitalize
10762
10763 Return a capitalized version of the string.
10764
10765 More specifically, make the first character have upper case and the rest lower
10766 case.
10767 [clinic start generated code]*/
10768
10769 static PyObject *
unicode_capitalize_impl(PyObject * self)10770 unicode_capitalize_impl(PyObject *self)
10771 /*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
10772 {
10773 if (PyUnicode_READY(self) == -1)
10774 return NULL;
10775 if (PyUnicode_GET_LENGTH(self) == 0)
10776 return unicode_result_unchanged(self);
10777 return case_operation(self, do_capitalize);
10778 }
10779
10780 /*[clinic input]
10781 str.casefold as unicode_casefold
10782
10783 Return a version of the string suitable for caseless comparisons.
10784 [clinic start generated code]*/
10785
10786 static PyObject *
unicode_casefold_impl(PyObject * self)10787 unicode_casefold_impl(PyObject *self)
10788 /*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
10789 {
10790 if (PyUnicode_READY(self) == -1)
10791 return NULL;
10792 if (PyUnicode_IS_ASCII(self))
10793 return ascii_upper_or_lower(self, 1);
10794 return case_operation(self, do_casefold);
10795 }
10796
10797
10798 /* Argument converter. Accepts a single Unicode character. */
10799
10800 static int
convert_uc(PyObject * obj,void * addr)10801 convert_uc(PyObject *obj, void *addr)
10802 {
10803 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10804
10805 if (!PyUnicode_Check(obj)) {
10806 PyErr_Format(PyExc_TypeError,
10807 "The fill character must be a unicode character, "
10808 "not %.100s", Py_TYPE(obj)->tp_name);
10809 return 0;
10810 }
10811 if (PyUnicode_READY(obj) < 0)
10812 return 0;
10813 if (PyUnicode_GET_LENGTH(obj) != 1) {
10814 PyErr_SetString(PyExc_TypeError,
10815 "The fill character must be exactly one character long");
10816 return 0;
10817 }
10818 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10819 return 1;
10820 }
10821
10822 /*[clinic input]
10823 str.center as unicode_center
10824
10825 width: Py_ssize_t
10826 fillchar: Py_UCS4 = ' '
10827 /
10828
10829 Return a centered string of length width.
10830
10831 Padding is done using the specified fill character (default is a space).
10832 [clinic start generated code]*/
10833
10834 static PyObject *
unicode_center_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)10835 unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10836 /*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10837 {
10838 Py_ssize_t marg, left;
10839
10840 if (PyUnicode_READY(self) == -1)
10841 return NULL;
10842
10843 if (PyUnicode_GET_LENGTH(self) >= width)
10844 return unicode_result_unchanged(self);
10845
10846 marg = width - PyUnicode_GET_LENGTH(self);
10847 left = marg / 2 + (marg & width & 1);
10848
10849 return pad(self, left, marg - left, fillchar);
10850 }
10851
/* Three-way comparison of two strings by code point.
 *
 * This function assumes that str1 and str2 are readied by the caller.
 *
 * The first min(len1, len2) characters are compared pairwise; at the first
 * difference -1 or 1 is returned.  If that common prefix is equal, the
 * shorter string orders first, and 0 is returned for equal lengths.
 */

static int
unicode_compare(PyObject *str1, PyObject *str2)
{
/* Compare `len` characters of data1 (elements of TYPE1) with data2
   (elements of TYPE2).  Both characters are widened to Py_UCS4 before the
   comparison.  Returns from the enclosing function at the first mismatch;
   falls through when the compared range is equal. */
#define COMPARE(TYPE1, TYPE2) \
    do { \
        TYPE1* p1 = (TYPE1 *)data1; \
        TYPE2* p2 = (TYPE2 *)data2; \
        TYPE1* end = p1 + len; \
        Py_UCS4 c1, c2; \
        for (; p1 != end; p1++, p2++) { \
            c1 = *p1; \
            c2 = *p2; \
            if (c1 != c2) \
                return (c1 < c2) ? -1 : 1; \
        } \
    } \
    while (0)

    int kind1, kind2;
    void *data1, *data2;
    Py_ssize_t len1, len2, len;

    kind1 = PyUnicode_KIND(str1);
    kind2 = PyUnicode_KIND(str2);
    data1 = PyUnicode_DATA(str1);
    data2 = PyUnicode_DATA(str2);
    len1 = PyUnicode_GET_LENGTH(str1);
    len2 = PyUnicode_GET_LENGTH(str2);
    /* Only the common prefix is compared character by character. */
    len = Py_MIN(len1, len2);

    /* Dispatch on the pair of storage kinds. */
    switch(kind1) {
    case PyUnicode_1BYTE_KIND:
    {
        switch(kind2) {
        case PyUnicode_1BYTE_KIND:
        {
            /* Same 1-byte width on both sides: compare the raw bytes. */
            int cmp = memcmp(data1, data2, len);
            /* normalize result of memcmp() into the range [-1; 1] */
            if (cmp < 0)
                return -1;
            if (cmp > 0)
                return 1;
            break;
        }
        case PyUnicode_2BYTE_KIND:
            COMPARE(Py_UCS1, Py_UCS2);
            break;
        case PyUnicode_4BYTE_KIND:
            COMPARE(Py_UCS1, Py_UCS4);
            break;
        default:
            Py_UNREACHABLE();
        }
        break;
    }
    case PyUnicode_2BYTE_KIND:
    {
        switch(kind2) {
        case PyUnicode_1BYTE_KIND:
            COMPARE(Py_UCS2, Py_UCS1);
            break;
        case PyUnicode_2BYTE_KIND:
        {
            COMPARE(Py_UCS2, Py_UCS2);
            break;
        }
        case PyUnicode_4BYTE_KIND:
            COMPARE(Py_UCS2, Py_UCS4);
            break;
        default:
            Py_UNREACHABLE();
        }
        break;
    }
    case PyUnicode_4BYTE_KIND:
    {
        switch(kind2) {
        case PyUnicode_1BYTE_KIND:
            COMPARE(Py_UCS4, Py_UCS1);
            break;
        case PyUnicode_2BYTE_KIND:
            COMPARE(Py_UCS4, Py_UCS2);
            break;
        case PyUnicode_4BYTE_KIND:
        {
            /* wmemcmp() is used only when wchar_t is 4 bytes wide;
               otherwise the generic loop is used. */
#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
            /* normalize result of wmemcmp() into the range [-1; 1] */
            if (cmp < 0)
                return -1;
            if (cmp > 0)
                return 1;
#else
            COMPARE(Py_UCS4, Py_UCS4);
#endif
            break;
        }
        default:
            Py_UNREACHABLE();
        }
        break;
    }
    default:
        Py_UNREACHABLE();
    }

    /* The common prefix is equal: the lengths decide. */
    if (len1 == len2)
        return 0;
    if (len1 < len2)
        return -1;
    else
        return 1;

#undef COMPARE
}
10969
10970 static int
unicode_compare_eq(PyObject * str1,PyObject * str2)10971 unicode_compare_eq(PyObject *str1, PyObject *str2)
10972 {
10973 int kind;
10974 void *data1, *data2;
10975 Py_ssize_t len;
10976 int cmp;
10977
10978 len = PyUnicode_GET_LENGTH(str1);
10979 if (PyUnicode_GET_LENGTH(str2) != len)
10980 return 0;
10981 kind = PyUnicode_KIND(str1);
10982 if (PyUnicode_KIND(str2) != kind)
10983 return 0;
10984 data1 = PyUnicode_DATA(str1);
10985 data2 = PyUnicode_DATA(str2);
10986
10987 cmp = memcmp(data1, data2, len * kind);
10988 return (cmp == 0);
10989 }
10990
10991
10992 int
PyUnicode_Compare(PyObject * left,PyObject * right)10993 PyUnicode_Compare(PyObject *left, PyObject *right)
10994 {
10995 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10996 if (PyUnicode_READY(left) == -1 ||
10997 PyUnicode_READY(right) == -1)
10998 return -1;
10999
11000 /* a string is equal to itself */
11001 if (left == right)
11002 return 0;
11003
11004 return unicode_compare(left, right);
11005 }
11006 PyErr_Format(PyExc_TypeError,
11007 "Can't compare %.100s and %.100s",
11008 left->ob_type->tp_name,
11009 right->ob_type->tp_name);
11010 return -1;
11011 }
11012
11013 int
PyUnicode_CompareWithASCIIString(PyObject * uni,const char * str)11014 PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11015 {
11016 Py_ssize_t i;
11017 int kind;
11018 Py_UCS4 chr;
11019 const unsigned char *ustr = (const unsigned char *)str;
11020
11021 assert(_PyUnicode_CHECK(uni));
11022 if (!PyUnicode_IS_READY(uni)) {
11023 const wchar_t *ws = _PyUnicode_WSTR(uni);
11024 /* Compare Unicode string and source character set string */
11025 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11026 if (chr != ustr[i])
11027 return (chr < ustr[i]) ? -1 : 1;
11028 }
11029 /* This check keeps Python strings that end in '\0' from comparing equal
11030 to C strings identical up to that point. */
11031 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11032 return 1; /* uni is longer */
11033 if (ustr[i])
11034 return -1; /* str is longer */
11035 return 0;
11036 }
11037 kind = PyUnicode_KIND(uni);
11038 if (kind == PyUnicode_1BYTE_KIND) {
11039 const void *data = PyUnicode_1BYTE_DATA(uni);
11040 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11041 size_t len, len2 = strlen(str);
11042 int cmp;
11043
11044 len = Py_MIN(len1, len2);
11045 cmp = memcmp(data, str, len);
11046 if (cmp != 0) {
11047 if (cmp < 0)
11048 return -1;
11049 else
11050 return 1;
11051 }
11052 if (len1 > len2)
11053 return 1; /* uni is longer */
11054 if (len1 < len2)
11055 return -1; /* str is longer */
11056 return 0;
11057 }
11058 else {
11059 void *data = PyUnicode_DATA(uni);
11060 /* Compare Unicode string and source character set string */
11061 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11062 if (chr != (unsigned char)str[i])
11063 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11064 /* This check keeps Python strings that end in '\0' from comparing equal
11065 to C strings identical up to that point. */
11066 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11067 return 1; /* uni is longer */
11068 if (str[i])
11069 return -1; /* str is longer */
11070 return 0;
11071 }
11072 }
11073
11074 static int
non_ready_unicode_equal_to_ascii_string(PyObject * unicode,const char * str)11075 non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11076 {
11077 size_t i, len;
11078 const wchar_t *p;
11079 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11080 if (strlen(str) != len)
11081 return 0;
11082 p = _PyUnicode_WSTR(unicode);
11083 assert(p);
11084 for (i = 0; i < len; i++) {
11085 unsigned char c = (unsigned char)str[i];
11086 if (c >= 128 || p[i] != (wchar_t)c)
11087 return 0;
11088 }
11089 return 1;
11090 }
11091
11092 int
_PyUnicode_EqualToASCIIString(PyObject * unicode,const char * str)11093 _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11094 {
11095 size_t len;
11096 assert(_PyUnicode_CHECK(unicode));
11097 assert(str);
11098 #ifndef NDEBUG
11099 for (const char *p = str; *p; p++) {
11100 assert((unsigned char)*p < 128);
11101 }
11102 #endif
11103 if (PyUnicode_READY(unicode) == -1) {
11104 /* Memory error or bad data */
11105 PyErr_Clear();
11106 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11107 }
11108 if (!PyUnicode_IS_ASCII(unicode))
11109 return 0;
11110 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11111 return strlen(str) == len &&
11112 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11113 }
11114
11115 int
_PyUnicode_EqualToASCIIId(PyObject * left,_Py_Identifier * right)11116 _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11117 {
11118 PyObject *right_uni;
11119 Py_hash_t hash;
11120
11121 assert(_PyUnicode_CHECK(left));
11122 assert(right->string);
11123 #ifndef NDEBUG
11124 for (const char *p = right->string; *p; p++) {
11125 assert((unsigned char)*p < 128);
11126 }
11127 #endif
11128
11129 if (PyUnicode_READY(left) == -1) {
11130 /* memory error or bad data */
11131 PyErr_Clear();
11132 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11133 }
11134
11135 if (!PyUnicode_IS_ASCII(left))
11136 return 0;
11137
11138 right_uni = _PyUnicode_FromId(right); /* borrowed */
11139 if (right_uni == NULL) {
11140 /* memory error or bad data */
11141 PyErr_Clear();
11142 return _PyUnicode_EqualToASCIIString(left, right->string);
11143 }
11144
11145 if (left == right_uni)
11146 return 1;
11147
11148 if (PyUnicode_CHECK_INTERNED(left))
11149 return 0;
11150
11151 assert(_PyUnicode_HASH(right_uni) != -1);
11152 hash = _PyUnicode_HASH(left);
11153 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11154 return 0;
11155
11156 return unicode_compare_eq(left, right_uni);
11157 }
11158
11159 PyObject *
PyUnicode_RichCompare(PyObject * left,PyObject * right,int op)11160 PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11161 {
11162 int result;
11163
11164 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11165 Py_RETURN_NOTIMPLEMENTED;
11166
11167 if (PyUnicode_READY(left) == -1 ||
11168 PyUnicode_READY(right) == -1)
11169 return NULL;
11170
11171 if (left == right) {
11172 switch (op) {
11173 case Py_EQ:
11174 case Py_LE:
11175 case Py_GE:
11176 /* a string is equal to itself */
11177 Py_RETURN_TRUE;
11178 case Py_NE:
11179 case Py_LT:
11180 case Py_GT:
11181 Py_RETURN_FALSE;
11182 default:
11183 PyErr_BadArgument();
11184 return NULL;
11185 }
11186 }
11187 else if (op == Py_EQ || op == Py_NE) {
11188 result = unicode_compare_eq(left, right);
11189 result ^= (op == Py_NE);
11190 return PyBool_FromLong(result);
11191 }
11192 else {
11193 result = unicode_compare(left, right);
11194 Py_RETURN_RICHCOMPARE(result, 0, op);
11195 }
11196 }
11197
11198 int
_PyUnicode_EQ(PyObject * aa,PyObject * bb)11199 _PyUnicode_EQ(PyObject *aa, PyObject *bb)
11200 {
11201 return unicode_eq(aa, bb);
11202 }
11203
11204 int
PyUnicode_Contains(PyObject * str,PyObject * substr)11205 PyUnicode_Contains(PyObject *str, PyObject *substr)
11206 {
11207 int kind1, kind2;
11208 void *buf1, *buf2;
11209 Py_ssize_t len1, len2;
11210 int result;
11211
11212 if (!PyUnicode_Check(substr)) {
11213 PyErr_Format(PyExc_TypeError,
11214 "'in <string>' requires string as left operand, not %.100s",
11215 Py_TYPE(substr)->tp_name);
11216 return -1;
11217 }
11218 if (PyUnicode_READY(substr) == -1)
11219 return -1;
11220 if (ensure_unicode(str) < 0)
11221 return -1;
11222
11223 kind1 = PyUnicode_KIND(str);
11224 kind2 = PyUnicode_KIND(substr);
11225 if (kind1 < kind2)
11226 return 0;
11227 len1 = PyUnicode_GET_LENGTH(str);
11228 len2 = PyUnicode_GET_LENGTH(substr);
11229 if (len1 < len2)
11230 return 0;
11231 buf1 = PyUnicode_DATA(str);
11232 buf2 = PyUnicode_DATA(substr);
11233 if (len2 == 1) {
11234 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11235 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11236 return result;
11237 }
11238 if (kind2 != kind1) {
11239 buf2 = _PyUnicode_AsKind(substr, kind1);
11240 if (!buf2)
11241 return -1;
11242 }
11243
11244 switch (kind1) {
11245 case PyUnicode_1BYTE_KIND:
11246 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11247 break;
11248 case PyUnicode_2BYTE_KIND:
11249 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11250 break;
11251 case PyUnicode_4BYTE_KIND:
11252 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11253 break;
11254 default:
11255 Py_UNREACHABLE();
11256 }
11257
11258 if (kind2 != kind1)
11259 PyMem_Free(buf2);
11260
11261 return result;
11262 }
11263
11264 /* Concat to string or Unicode object giving a new Unicode object. */
11265
11266 PyObject *
PyUnicode_Concat(PyObject * left,PyObject * right)11267 PyUnicode_Concat(PyObject *left, PyObject *right)
11268 {
11269 PyObject *result;
11270 Py_UCS4 maxchar, maxchar2;
11271 Py_ssize_t left_len, right_len, new_len;
11272
11273 if (ensure_unicode(left) < 0)
11274 return NULL;
11275
11276 if (!PyUnicode_Check(right)) {
11277 PyErr_Format(PyExc_TypeError,
11278 "can only concatenate str (not \"%.200s\") to str",
11279 right->ob_type->tp_name);
11280 return NULL;
11281 }
11282 if (PyUnicode_READY(right) < 0)
11283 return NULL;
11284
11285 /* Shortcuts */
11286 if (left == unicode_empty)
11287 return PyUnicode_FromObject(right);
11288 if (right == unicode_empty)
11289 return PyUnicode_FromObject(left);
11290
11291 left_len = PyUnicode_GET_LENGTH(left);
11292 right_len = PyUnicode_GET_LENGTH(right);
11293 if (left_len > PY_SSIZE_T_MAX - right_len) {
11294 PyErr_SetString(PyExc_OverflowError,
11295 "strings are too large to concat");
11296 return NULL;
11297 }
11298 new_len = left_len + right_len;
11299
11300 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11301 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11302 maxchar = Py_MAX(maxchar, maxchar2);
11303
11304 /* Concat the two Unicode strings */
11305 result = PyUnicode_New(new_len, maxchar);
11306 if (result == NULL)
11307 return NULL;
11308 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11309 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11310 assert(_PyUnicode_CheckConsistency(result, 1));
11311 return result;
11312 }
11313
11314 void
PyUnicode_Append(PyObject ** p_left,PyObject * right)11315 PyUnicode_Append(PyObject **p_left, PyObject *right)
11316 {
11317 PyObject *left, *res;
11318 Py_UCS4 maxchar, maxchar2;
11319 Py_ssize_t left_len, right_len, new_len;
11320
11321 if (p_left == NULL) {
11322 if (!PyErr_Occurred())
11323 PyErr_BadInternalCall();
11324 return;
11325 }
11326 left = *p_left;
11327 if (right == NULL || left == NULL
11328 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11329 if (!PyErr_Occurred())
11330 PyErr_BadInternalCall();
11331 goto error;
11332 }
11333
11334 if (PyUnicode_READY(left) == -1)
11335 goto error;
11336 if (PyUnicode_READY(right) == -1)
11337 goto error;
11338
11339 /* Shortcuts */
11340 if (left == unicode_empty) {
11341 Py_DECREF(left);
11342 Py_INCREF(right);
11343 *p_left = right;
11344 return;
11345 }
11346 if (right == unicode_empty)
11347 return;
11348
11349 left_len = PyUnicode_GET_LENGTH(left);
11350 right_len = PyUnicode_GET_LENGTH(right);
11351 if (left_len > PY_SSIZE_T_MAX - right_len) {
11352 PyErr_SetString(PyExc_OverflowError,
11353 "strings are too large to concat");
11354 goto error;
11355 }
11356 new_len = left_len + right_len;
11357
11358 if (unicode_modifiable(left)
11359 && PyUnicode_CheckExact(right)
11360 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11361 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11362 to change the structure size, but characters are stored just after
11363 the structure, and so it requires to move all characters which is
11364 not so different than duplicating the string. */
11365 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11366 {
11367 /* append inplace */
11368 if (unicode_resize(p_left, new_len) != 0)
11369 goto error;
11370
11371 /* copy 'right' into the newly allocated area of 'left' */
11372 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11373 }
11374 else {
11375 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11376 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11377 maxchar = Py_MAX(maxchar, maxchar2);
11378
11379 /* Concat the two Unicode strings */
11380 res = PyUnicode_New(new_len, maxchar);
11381 if (res == NULL)
11382 goto error;
11383 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11384 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11385 Py_DECREF(left);
11386 *p_left = res;
11387 }
11388 assert(_PyUnicode_CheckConsistency(*p_left, 1));
11389 return;
11390
11391 error:
11392 Py_CLEAR(*p_left);
11393 }
11394
11395 void
PyUnicode_AppendAndDel(PyObject ** pleft,PyObject * right)11396 PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11397 {
11398 PyUnicode_Append(pleft, right);
11399 Py_XDECREF(right);
11400 }
11401
11402 /*
11403 Wraps stringlib_parse_args_finds() and additionally ensures that the
11404 first argument is a unicode object.
11405 */
11406
11407 static inline int
parse_args_finds_unicode(const char * function_name,PyObject * args,PyObject ** substring,Py_ssize_t * start,Py_ssize_t * end)11408 parse_args_finds_unicode(const char * function_name, PyObject *args,
11409 PyObject **substring,
11410 Py_ssize_t *start, Py_ssize_t *end)
11411 {
11412 if(stringlib_parse_args_finds(function_name, args, substring,
11413 start, end)) {
11414 if (ensure_unicode(*substring) < 0)
11415 return 0;
11416 return 1;
11417 }
11418 return 0;
11419 }
11420
11421 PyDoc_STRVAR(count__doc__,
11422 "S.count(sub[, start[, end]]) -> int\n\
11423 \n\
11424 Return the number of non-overlapping occurrences of substring sub in\n\
11425 string S[start:end]. Optional arguments start and end are\n\
11426 interpreted as in slice notation.");
11427
11428 static PyObject *
unicode_count(PyObject * self,PyObject * args)11429 unicode_count(PyObject *self, PyObject *args)
11430 {
11431 PyObject *substring = NULL; /* initialize to fix a compiler warning */
11432 Py_ssize_t start = 0;
11433 Py_ssize_t end = PY_SSIZE_T_MAX;
11434 PyObject *result;
11435 int kind1, kind2;
11436 void *buf1, *buf2;
11437 Py_ssize_t len1, len2, iresult;
11438
11439 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
11440 return NULL;
11441
11442 kind1 = PyUnicode_KIND(self);
11443 kind2 = PyUnicode_KIND(substring);
11444 if (kind1 < kind2)
11445 return PyLong_FromLong(0);
11446
11447 len1 = PyUnicode_GET_LENGTH(self);
11448 len2 = PyUnicode_GET_LENGTH(substring);
11449 ADJUST_INDICES(start, end, len1);
11450 if (end - start < len2)
11451 return PyLong_FromLong(0);
11452
11453 buf1 = PyUnicode_DATA(self);
11454 buf2 = PyUnicode_DATA(substring);
11455 if (kind2 != kind1) {
11456 buf2 = _PyUnicode_AsKind(substring, kind1);
11457 if (!buf2)
11458 return NULL;
11459 }
11460 switch (kind1) {
11461 case PyUnicode_1BYTE_KIND:
11462 iresult = ucs1lib_count(
11463 ((Py_UCS1*)buf1) + start, end - start,
11464 buf2, len2, PY_SSIZE_T_MAX
11465 );
11466 break;
11467 case PyUnicode_2BYTE_KIND:
11468 iresult = ucs2lib_count(
11469 ((Py_UCS2*)buf1) + start, end - start,
11470 buf2, len2, PY_SSIZE_T_MAX
11471 );
11472 break;
11473 case PyUnicode_4BYTE_KIND:
11474 iresult = ucs4lib_count(
11475 ((Py_UCS4*)buf1) + start, end - start,
11476 buf2, len2, PY_SSIZE_T_MAX
11477 );
11478 break;
11479 default:
11480 Py_UNREACHABLE();
11481 }
11482
11483 result = PyLong_FromSsize_t(iresult);
11484
11485 if (kind2 != kind1)
11486 PyMem_Free(buf2);
11487
11488 return result;
11489 }
11490
11491 /*[clinic input]
11492 str.encode as unicode_encode
11493
11494 encoding: str(c_default="NULL") = 'utf-8'
11495 The encoding in which to encode the string.
11496 errors: str(c_default="NULL") = 'strict'
11497 The error handling scheme to use for encoding errors.
11498 The default is 'strict' meaning that encoding errors raise a
11499 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11500 'xmlcharrefreplace' as well as any other name registered with
11501 codecs.register_error that can handle UnicodeEncodeErrors.
11502
11503 Encode the string using the codec registered for encoding.
11504 [clinic start generated code]*/
11505
11506 static PyObject *
unicode_encode_impl(PyObject * self,const char * encoding,const char * errors)11507 unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
11508 /*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
11509 {
11510 return PyUnicode_AsEncodedString(self, encoding, errors);
11511 }
11512
11513 /*[clinic input]
11514 str.expandtabs as unicode_expandtabs
11515
11516 tabsize: int = 8
11517
11518 Return a copy where all tab characters are expanded using spaces.
11519
11520 If tabsize is not given, a tab size of 8 characters is assumed.
11521 [clinic start generated code]*/
11522
11523 static PyObject *
unicode_expandtabs_impl(PyObject * self,int tabsize)11524 unicode_expandtabs_impl(PyObject *self, int tabsize)
11525 /*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
11526 {
11527 Py_ssize_t i, j, line_pos, src_len, incr;
11528 Py_UCS4 ch;
11529 PyObject *u;
11530 void *src_data, *dest_data;
11531 int kind;
11532 int found;
11533
11534 if (PyUnicode_READY(self) == -1)
11535 return NULL;
11536
11537 /* First pass: determine size of output string */
11538 src_len = PyUnicode_GET_LENGTH(self);
11539 i = j = line_pos = 0;
11540 kind = PyUnicode_KIND(self);
11541 src_data = PyUnicode_DATA(self);
11542 found = 0;
11543 for (; i < src_len; i++) {
11544 ch = PyUnicode_READ(kind, src_data, i);
11545 if (ch == '\t') {
11546 found = 1;
11547 if (tabsize > 0) {
11548 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11549 if (j > PY_SSIZE_T_MAX - incr)
11550 goto overflow;
11551 line_pos += incr;
11552 j += incr;
11553 }
11554 }
11555 else {
11556 if (j > PY_SSIZE_T_MAX - 1)
11557 goto overflow;
11558 line_pos++;
11559 j++;
11560 if (ch == '\n' || ch == '\r')
11561 line_pos = 0;
11562 }
11563 }
11564 if (!found)
11565 return unicode_result_unchanged(self);
11566
11567 /* Second pass: create output string and fill it */
11568 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11569 if (!u)
11570 return NULL;
11571 dest_data = PyUnicode_DATA(u);
11572
11573 i = j = line_pos = 0;
11574
11575 for (; i < src_len; i++) {
11576 ch = PyUnicode_READ(kind, src_data, i);
11577 if (ch == '\t') {
11578 if (tabsize > 0) {
11579 incr = tabsize - (line_pos % tabsize);
11580 line_pos += incr;
11581 FILL(kind, dest_data, ' ', j, incr);
11582 j += incr;
11583 }
11584 }
11585 else {
11586 line_pos++;
11587 PyUnicode_WRITE(kind, dest_data, j, ch);
11588 j++;
11589 if (ch == '\n' || ch == '\r')
11590 line_pos = 0;
11591 }
11592 }
11593 assert (j == PyUnicode_GET_LENGTH(u));
11594 return unicode_result(u);
11595
11596 overflow:
11597 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11598 return NULL;
11599 }
11600
11601 PyDoc_STRVAR(find__doc__,
11602 "S.find(sub[, start[, end]]) -> int\n\
11603 \n\
11604 Return the lowest index in S where substring sub is found,\n\
11605 such that sub is contained within S[start:end]. Optional\n\
11606 arguments start and end are interpreted as in slice notation.\n\
11607 \n\
11608 Return -1 on failure.");
11609
11610 static PyObject *
unicode_find(PyObject * self,PyObject * args)11611 unicode_find(PyObject *self, PyObject *args)
11612 {
11613 /* initialize variables to prevent gcc warning */
11614 PyObject *substring = NULL;
11615 Py_ssize_t start = 0;
11616 Py_ssize_t end = 0;
11617 Py_ssize_t result;
11618
11619 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
11620 return NULL;
11621
11622 if (PyUnicode_READY(self) == -1)
11623 return NULL;
11624
11625 result = any_find_slice(self, substring, start, end, 1);
11626
11627 if (result == -2)
11628 return NULL;
11629
11630 return PyLong_FromSsize_t(result);
11631 }
11632
11633 static PyObject *
unicode_getitem(PyObject * self,Py_ssize_t index)11634 unicode_getitem(PyObject *self, Py_ssize_t index)
11635 {
11636 void *data;
11637 enum PyUnicode_Kind kind;
11638 Py_UCS4 ch;
11639
11640 if (!PyUnicode_Check(self)) {
11641 PyErr_BadArgument();
11642 return NULL;
11643 }
11644 if (PyUnicode_READY(self) == -1) {
11645 return NULL;
11646 }
11647 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11648 PyErr_SetString(PyExc_IndexError, "string index out of range");
11649 return NULL;
11650 }
11651 kind = PyUnicode_KIND(self);
11652 data = PyUnicode_DATA(self);
11653 ch = PyUnicode_READ(kind, data, index);
11654 return unicode_char(ch);
11655 }
11656
11657 /* Believe it or not, this produces the same value for ASCII strings
11658 as bytes_hash(). */
11659 static Py_hash_t
unicode_hash(PyObject * self)11660 unicode_hash(PyObject *self)
11661 {
11662 Py_ssize_t len;
11663 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
11664
11665 #ifdef Py_DEBUG
11666 assert(_Py_HashSecret_Initialized);
11667 #endif
11668 if (_PyUnicode_HASH(self) != -1)
11669 return _PyUnicode_HASH(self);
11670 if (PyUnicode_READY(self) == -1)
11671 return -1;
11672 len = PyUnicode_GET_LENGTH(self);
11673 /*
11674 We make the hash of the empty string be 0, rather than using
11675 (prefix ^ suffix), since this slightly obfuscates the hash secret
11676 */
11677 if (len == 0) {
11678 _PyUnicode_HASH(self) = 0;
11679 return 0;
11680 }
11681 x = _Py_HashBytes(PyUnicode_DATA(self),
11682 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11683 _PyUnicode_HASH(self) = x;
11684 return x;
11685 }
11686
11687 PyDoc_STRVAR(index__doc__,
11688 "S.index(sub[, start[, end]]) -> int\n\
11689 \n\
11690 Return the lowest index in S where substring sub is found, \n\
11691 such that sub is contained within S[start:end]. Optional\n\
11692 arguments start and end are interpreted as in slice notation.\n\
11693 \n\
11694 Raises ValueError when the substring is not found.");
11695
11696 static PyObject *
unicode_index(PyObject * self,PyObject * args)11697 unicode_index(PyObject *self, PyObject *args)
11698 {
11699 /* initialize variables to prevent gcc warning */
11700 Py_ssize_t result;
11701 PyObject *substring = NULL;
11702 Py_ssize_t start = 0;
11703 Py_ssize_t end = 0;
11704
11705 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
11706 return NULL;
11707
11708 if (PyUnicode_READY(self) == -1)
11709 return NULL;
11710
11711 result = any_find_slice(self, substring, start, end, 1);
11712
11713 if (result == -2)
11714 return NULL;
11715
11716 if (result < 0) {
11717 PyErr_SetString(PyExc_ValueError, "substring not found");
11718 return NULL;
11719 }
11720
11721 return PyLong_FromSsize_t(result);
11722 }
11723
11724 /*[clinic input]
11725 str.isascii as unicode_isascii
11726
11727 Return True if all characters in the string are ASCII, False otherwise.
11728
11729 ASCII characters have code points in the range U+0000-U+007F.
11730 Empty string is ASCII too.
11731 [clinic start generated code]*/
11732
11733 static PyObject *
unicode_isascii_impl(PyObject * self)11734 unicode_isascii_impl(PyObject *self)
11735 /*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11736 {
11737 if (PyUnicode_READY(self) == -1) {
11738 return NULL;
11739 }
11740 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11741 }
11742
11743 /*[clinic input]
11744 str.islower as unicode_islower
11745
11746 Return True if the string is a lowercase string, False otherwise.
11747
11748 A string is lowercase if all cased characters in the string are lowercase and
11749 there is at least one cased character in the string.
11750 [clinic start generated code]*/
11751
11752 static PyObject *
unicode_islower_impl(PyObject * self)11753 unicode_islower_impl(PyObject *self)
11754 /*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
11755 {
11756 Py_ssize_t i, length;
11757 int kind;
11758 void *data;
11759 int cased;
11760
11761 if (PyUnicode_READY(self) == -1)
11762 return NULL;
11763 length = PyUnicode_GET_LENGTH(self);
11764 kind = PyUnicode_KIND(self);
11765 data = PyUnicode_DATA(self);
11766
11767 /* Shortcut for single character strings */
11768 if (length == 1)
11769 return PyBool_FromLong(
11770 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11771
11772 /* Special case for empty strings */
11773 if (length == 0)
11774 Py_RETURN_FALSE;
11775
11776 cased = 0;
11777 for (i = 0; i < length; i++) {
11778 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11779
11780 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11781 Py_RETURN_FALSE;
11782 else if (!cased && Py_UNICODE_ISLOWER(ch))
11783 cased = 1;
11784 }
11785 return PyBool_FromLong(cased);
11786 }
11787
11788 /*[clinic input]
11789 str.isupper as unicode_isupper
11790
11791 Return True if the string is an uppercase string, False otherwise.
11792
11793 A string is uppercase if all cased characters in the string are uppercase and
11794 there is at least one cased character in the string.
11795 [clinic start generated code]*/
11796
11797 static PyObject *
unicode_isupper_impl(PyObject * self)11798 unicode_isupper_impl(PyObject *self)
11799 /*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
11800 {
11801 Py_ssize_t i, length;
11802 int kind;
11803 void *data;
11804 int cased;
11805
11806 if (PyUnicode_READY(self) == -1)
11807 return NULL;
11808 length = PyUnicode_GET_LENGTH(self);
11809 kind = PyUnicode_KIND(self);
11810 data = PyUnicode_DATA(self);
11811
11812 /* Shortcut for single character strings */
11813 if (length == 1)
11814 return PyBool_FromLong(
11815 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11816
11817 /* Special case for empty strings */
11818 if (length == 0)
11819 Py_RETURN_FALSE;
11820
11821 cased = 0;
11822 for (i = 0; i < length; i++) {
11823 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11824
11825 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11826 Py_RETURN_FALSE;
11827 else if (!cased && Py_UNICODE_ISUPPER(ch))
11828 cased = 1;
11829 }
11830 return PyBool_FromLong(cased);
11831 }
11832
11833 /*[clinic input]
11834 str.istitle as unicode_istitle
11835
11836 Return True if the string is a title-cased string, False otherwise.
11837
11838 In a title-cased string, upper- and title-case characters may only
11839 follow uncased characters and lowercase characters only cased ones.
11840 [clinic start generated code]*/
11841
11842 static PyObject *
unicode_istitle_impl(PyObject * self)11843 unicode_istitle_impl(PyObject *self)
11844 /*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
11845 {
11846 Py_ssize_t i, length;
11847 int kind;
11848 void *data;
11849 int cased, previous_is_cased;
11850
11851 if (PyUnicode_READY(self) == -1)
11852 return NULL;
11853 length = PyUnicode_GET_LENGTH(self);
11854 kind = PyUnicode_KIND(self);
11855 data = PyUnicode_DATA(self);
11856
11857 /* Shortcut for single character strings */
11858 if (length == 1) {
11859 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11860 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11861 (Py_UNICODE_ISUPPER(ch) != 0));
11862 }
11863
11864 /* Special case for empty strings */
11865 if (length == 0)
11866 Py_RETURN_FALSE;
11867
11868 cased = 0;
11869 previous_is_cased = 0;
11870 for (i = 0; i < length; i++) {
11871 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11872
11873 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11874 if (previous_is_cased)
11875 Py_RETURN_FALSE;
11876 previous_is_cased = 1;
11877 cased = 1;
11878 }
11879 else if (Py_UNICODE_ISLOWER(ch)) {
11880 if (!previous_is_cased)
11881 Py_RETURN_FALSE;
11882 previous_is_cased = 1;
11883 cased = 1;
11884 }
11885 else
11886 previous_is_cased = 0;
11887 }
11888 return PyBool_FromLong(cased);
11889 }
11890
11891 /*[clinic input]
11892 str.isspace as unicode_isspace
11893
11894 Return True if the string is a whitespace string, False otherwise.
11895
11896 A string is whitespace if all characters in the string are whitespace and there
11897 is at least one character in the string.
11898 [clinic start generated code]*/
11899
11900 static PyObject *
unicode_isspace_impl(PyObject * self)11901 unicode_isspace_impl(PyObject *self)
11902 /*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
11903 {
11904 Py_ssize_t i, length;
11905 int kind;
11906 void *data;
11907
11908 if (PyUnicode_READY(self) == -1)
11909 return NULL;
11910 length = PyUnicode_GET_LENGTH(self);
11911 kind = PyUnicode_KIND(self);
11912 data = PyUnicode_DATA(self);
11913
11914 /* Shortcut for single character strings */
11915 if (length == 1)
11916 return PyBool_FromLong(
11917 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11918
11919 /* Special case for empty strings */
11920 if (length == 0)
11921 Py_RETURN_FALSE;
11922
11923 for (i = 0; i < length; i++) {
11924 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11925 if (!Py_UNICODE_ISSPACE(ch))
11926 Py_RETURN_FALSE;
11927 }
11928 Py_RETURN_TRUE;
11929 }
11930
11931 /*[clinic input]
11932 str.isalpha as unicode_isalpha
11933
11934 Return True if the string is an alphabetic string, False otherwise.
11935
11936 A string is alphabetic if all characters in the string are alphabetic and there
11937 is at least one character in the string.
11938 [clinic start generated code]*/
11939
11940 static PyObject *
unicode_isalpha_impl(PyObject * self)11941 unicode_isalpha_impl(PyObject *self)
11942 /*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
11943 {
11944 Py_ssize_t i, length;
11945 int kind;
11946 void *data;
11947
11948 if (PyUnicode_READY(self) == -1)
11949 return NULL;
11950 length = PyUnicode_GET_LENGTH(self);
11951 kind = PyUnicode_KIND(self);
11952 data = PyUnicode_DATA(self);
11953
11954 /* Shortcut for single character strings */
11955 if (length == 1)
11956 return PyBool_FromLong(
11957 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11958
11959 /* Special case for empty strings */
11960 if (length == 0)
11961 Py_RETURN_FALSE;
11962
11963 for (i = 0; i < length; i++) {
11964 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11965 Py_RETURN_FALSE;
11966 }
11967 Py_RETURN_TRUE;
11968 }
11969
11970 /*[clinic input]
11971 str.isalnum as unicode_isalnum
11972
11973 Return True if the string is an alpha-numeric string, False otherwise.
11974
11975 A string is alpha-numeric if all characters in the string are alpha-numeric and
11976 there is at least one character in the string.
11977 [clinic start generated code]*/
11978
11979 static PyObject *
unicode_isalnum_impl(PyObject * self)11980 unicode_isalnum_impl(PyObject *self)
11981 /*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
11982 {
11983 int kind;
11984 void *data;
11985 Py_ssize_t len, i;
11986
11987 if (PyUnicode_READY(self) == -1)
11988 return NULL;
11989
11990 kind = PyUnicode_KIND(self);
11991 data = PyUnicode_DATA(self);
11992 len = PyUnicode_GET_LENGTH(self);
11993
11994 /* Shortcut for single character strings */
11995 if (len == 1) {
11996 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11997 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11998 }
11999
12000 /* Special case for empty strings */
12001 if (len == 0)
12002 Py_RETURN_FALSE;
12003
12004 for (i = 0; i < len; i++) {
12005 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12006 if (!Py_UNICODE_ISALNUM(ch))
12007 Py_RETURN_FALSE;
12008 }
12009 Py_RETURN_TRUE;
12010 }
12011
12012 /*[clinic input]
12013 str.isdecimal as unicode_isdecimal
12014
12015 Return True if the string is a decimal string, False otherwise.
12016
12017 A string is a decimal string if all characters in the string are decimal and
12018 there is at least one character in the string.
12019 [clinic start generated code]*/
12020
12021 static PyObject *
unicode_isdecimal_impl(PyObject * self)12022 unicode_isdecimal_impl(PyObject *self)
12023 /*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
12024 {
12025 Py_ssize_t i, length;
12026 int kind;
12027 void *data;
12028
12029 if (PyUnicode_READY(self) == -1)
12030 return NULL;
12031 length = PyUnicode_GET_LENGTH(self);
12032 kind = PyUnicode_KIND(self);
12033 data = PyUnicode_DATA(self);
12034
12035 /* Shortcut for single character strings */
12036 if (length == 1)
12037 return PyBool_FromLong(
12038 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12039
12040 /* Special case for empty strings */
12041 if (length == 0)
12042 Py_RETURN_FALSE;
12043
12044 for (i = 0; i < length; i++) {
12045 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12046 Py_RETURN_FALSE;
12047 }
12048 Py_RETURN_TRUE;
12049 }
12050
12051 /*[clinic input]
12052 str.isdigit as unicode_isdigit
12053
12054 Return True if the string is a digit string, False otherwise.
12055
12056 A string is a digit string if all characters in the string are digits and there
12057 is at least one character in the string.
12058 [clinic start generated code]*/
12059
12060 static PyObject *
unicode_isdigit_impl(PyObject * self)12061 unicode_isdigit_impl(PyObject *self)
12062 /*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
12063 {
12064 Py_ssize_t i, length;
12065 int kind;
12066 void *data;
12067
12068 if (PyUnicode_READY(self) == -1)
12069 return NULL;
12070 length = PyUnicode_GET_LENGTH(self);
12071 kind = PyUnicode_KIND(self);
12072 data = PyUnicode_DATA(self);
12073
12074 /* Shortcut for single character strings */
12075 if (length == 1) {
12076 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12077 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12078 }
12079
12080 /* Special case for empty strings */
12081 if (length == 0)
12082 Py_RETURN_FALSE;
12083
12084 for (i = 0; i < length; i++) {
12085 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12086 Py_RETURN_FALSE;
12087 }
12088 Py_RETURN_TRUE;
12089 }
12090
12091 /*[clinic input]
12092 str.isnumeric as unicode_isnumeric
12093
12094 Return True if the string is a numeric string, False otherwise.
12095
12096 A string is numeric if all characters in the string are numeric and there is at
12097 least one character in the string.
12098 [clinic start generated code]*/
12099
12100 static PyObject *
unicode_isnumeric_impl(PyObject * self)12101 unicode_isnumeric_impl(PyObject *self)
12102 /*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
12103 {
12104 Py_ssize_t i, length;
12105 int kind;
12106 void *data;
12107
12108 if (PyUnicode_READY(self) == -1)
12109 return NULL;
12110 length = PyUnicode_GET_LENGTH(self);
12111 kind = PyUnicode_KIND(self);
12112 data = PyUnicode_DATA(self);
12113
12114 /* Shortcut for single character strings */
12115 if (length == 1)
12116 return PyBool_FromLong(
12117 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12118
12119 /* Special case for empty strings */
12120 if (length == 0)
12121 Py_RETURN_FALSE;
12122
12123 for (i = 0; i < length; i++) {
12124 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12125 Py_RETURN_FALSE;
12126 }
12127 Py_RETURN_TRUE;
12128 }
12129
12130 int
PyUnicode_IsIdentifier(PyObject * self)12131 PyUnicode_IsIdentifier(PyObject *self)
12132 {
12133 int kind;
12134 void *data;
12135 Py_ssize_t i;
12136 Py_UCS4 first;
12137
12138 if (PyUnicode_READY(self) == -1) {
12139 Py_FatalError("identifier not ready");
12140 return 0;
12141 }
12142
12143 /* Special case for empty strings */
12144 if (PyUnicode_GET_LENGTH(self) == 0)
12145 return 0;
12146 kind = PyUnicode_KIND(self);
12147 data = PyUnicode_DATA(self);
12148
12149 /* PEP 3131 says that the first character must be in
12150 XID_Start and subsequent characters in XID_Continue,
12151 and for the ASCII range, the 2.x rules apply (i.e
12152 start with letters and underscore, continue with
12153 letters, digits, underscore). However, given the current
12154 definition of XID_Start and XID_Continue, it is sufficient
12155 to check just for these, except that _ must be allowed
12156 as starting an identifier. */
12157 first = PyUnicode_READ(kind, data, 0);
12158 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
12159 return 0;
12160
12161 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
12162 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
12163 return 0;
12164 return 1;
12165 }
12166
12167 /*[clinic input]
12168 str.isidentifier as unicode_isidentifier
12169
12170 Return True if the string is a valid Python identifier, False otherwise.
12171
12172 Use keyword.iskeyword() to test for reserved identifiers such as "def" and
12173 "class".
12174 [clinic start generated code]*/
12175
12176 static PyObject *
unicode_isidentifier_impl(PyObject * self)12177 unicode_isidentifier_impl(PyObject *self)
12178 /*[clinic end generated code: output=fe585a9666572905 input=916b0a3c9f57e919]*/
12179 {
12180 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12181 }
12182
12183 /*[clinic input]
12184 str.isprintable as unicode_isprintable
12185
12186 Return True if the string is printable, False otherwise.
12187
12188 A string is printable if all of its characters are considered printable in
12189 repr() or if it is empty.
12190 [clinic start generated code]*/
12191
12192 static PyObject *
unicode_isprintable_impl(PyObject * self)12193 unicode_isprintable_impl(PyObject *self)
12194 /*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
12195 {
12196 Py_ssize_t i, length;
12197 int kind;
12198 void *data;
12199
12200 if (PyUnicode_READY(self) == -1)
12201 return NULL;
12202 length = PyUnicode_GET_LENGTH(self);
12203 kind = PyUnicode_KIND(self);
12204 data = PyUnicode_DATA(self);
12205
12206 /* Shortcut for single character strings */
12207 if (length == 1)
12208 return PyBool_FromLong(
12209 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12210
12211 for (i = 0; i < length; i++) {
12212 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12213 Py_RETURN_FALSE;
12214 }
12215 }
12216 Py_RETURN_TRUE;
12217 }
12218
12219 /*[clinic input]
12220 str.join as unicode_join
12221
12222 iterable: object
12223 /
12224
12225 Concatenate any number of strings.
12226
12227 The string whose method is called is inserted in between each given string.
12228 The result is returned as a new string.
12229
12230 Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12231 [clinic start generated code]*/
12232
12233 static PyObject *
unicode_join(PyObject * self,PyObject * iterable)12234 unicode_join(PyObject *self, PyObject *iterable)
12235 /*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
12236 {
12237 return PyUnicode_Join(self, iterable);
12238 }
12239
12240 static Py_ssize_t
unicode_length(PyObject * self)12241 unicode_length(PyObject *self)
12242 {
12243 if (PyUnicode_READY(self) == -1)
12244 return -1;
12245 return PyUnicode_GET_LENGTH(self);
12246 }
12247
12248 /*[clinic input]
12249 str.ljust as unicode_ljust
12250
12251 width: Py_ssize_t
12252 fillchar: Py_UCS4 = ' '
12253 /
12254
12255 Return a left-justified string of length width.
12256
12257 Padding is done using the specified fill character (default is a space).
12258 [clinic start generated code]*/
12259
12260 static PyObject *
unicode_ljust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12261 unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12262 /*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12263 {
12264 if (PyUnicode_READY(self) == -1)
12265 return NULL;
12266
12267 if (PyUnicode_GET_LENGTH(self) >= width)
12268 return unicode_result_unchanged(self);
12269
12270 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12271 }
12272
12273 /*[clinic input]
12274 str.lower as unicode_lower
12275
12276 Return a copy of the string converted to lowercase.
12277 [clinic start generated code]*/
12278
12279 static PyObject *
unicode_lower_impl(PyObject * self)12280 unicode_lower_impl(PyObject *self)
12281 /*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12282 {
12283 if (PyUnicode_READY(self) == -1)
12284 return NULL;
12285 if (PyUnicode_IS_ASCII(self))
12286 return ascii_upper_or_lower(self, 1);
12287 return case_operation(self, do_lower);
12288 }
12289
/* Strip modes: which end(s) of the string get trimmed. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above; used in error messages.
   Both the characters and the pointers are const so the table cannot be
   modified at run time. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

#define STRIPNAME(i) (stripfuncnames[i])
12298
12299 /* externally visible for str.strip(unicode) */
12300 PyObject *
_PyUnicode_XStrip(PyObject * self,int striptype,PyObject * sepobj)12301 _PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12302 {
12303 void *data;
12304 int kind;
12305 Py_ssize_t i, j, len;
12306 BLOOM_MASK sepmask;
12307 Py_ssize_t seplen;
12308
12309 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12310 return NULL;
12311
12312 kind = PyUnicode_KIND(self);
12313 data = PyUnicode_DATA(self);
12314 len = PyUnicode_GET_LENGTH(self);
12315 seplen = PyUnicode_GET_LENGTH(sepobj);
12316 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12317 PyUnicode_DATA(sepobj),
12318 seplen);
12319
12320 i = 0;
12321 if (striptype != RIGHTSTRIP) {
12322 while (i < len) {
12323 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12324 if (!BLOOM(sepmask, ch))
12325 break;
12326 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12327 break;
12328 i++;
12329 }
12330 }
12331
12332 j = len;
12333 if (striptype != LEFTSTRIP) {
12334 j--;
12335 while (j >= i) {
12336 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12337 if (!BLOOM(sepmask, ch))
12338 break;
12339 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12340 break;
12341 j--;
12342 }
12343
12344 j++;
12345 }
12346
12347 return PyUnicode_Substring(self, i, j);
12348 }
12349
12350 PyObject*
PyUnicode_Substring(PyObject * self,Py_ssize_t start,Py_ssize_t end)12351 PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12352 {
12353 unsigned char *data;
12354 int kind;
12355 Py_ssize_t length;
12356
12357 if (PyUnicode_READY(self) == -1)
12358 return NULL;
12359
12360 length = PyUnicode_GET_LENGTH(self);
12361 end = Py_MIN(end, length);
12362
12363 if (start == 0 && end == length)
12364 return unicode_result_unchanged(self);
12365
12366 if (start < 0 || end < 0) {
12367 PyErr_SetString(PyExc_IndexError, "string index out of range");
12368 return NULL;
12369 }
12370 if (start >= length || end < start)
12371 _Py_RETURN_UNICODE_EMPTY();
12372
12373 length = end - start;
12374 if (PyUnicode_IS_ASCII(self)) {
12375 data = PyUnicode_1BYTE_DATA(self);
12376 return _PyUnicode_FromASCII((char*)(data + start), length);
12377 }
12378 else {
12379 kind = PyUnicode_KIND(self);
12380 data = PyUnicode_1BYTE_DATA(self);
12381 return PyUnicode_FromKindAndData(kind,
12382 data + kind * start,
12383 length);
12384 }
12385 }
12386
12387 static PyObject *
do_strip(PyObject * self,int striptype)12388 do_strip(PyObject *self, int striptype)
12389 {
12390 Py_ssize_t len, i, j;
12391
12392 if (PyUnicode_READY(self) == -1)
12393 return NULL;
12394
12395 len = PyUnicode_GET_LENGTH(self);
12396
12397 if (PyUnicode_IS_ASCII(self)) {
12398 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12399
12400 i = 0;
12401 if (striptype != RIGHTSTRIP) {
12402 while (i < len) {
12403 Py_UCS1 ch = data[i];
12404 if (!_Py_ascii_whitespace[ch])
12405 break;
12406 i++;
12407 }
12408 }
12409
12410 j = len;
12411 if (striptype != LEFTSTRIP) {
12412 j--;
12413 while (j >= i) {
12414 Py_UCS1 ch = data[j];
12415 if (!_Py_ascii_whitespace[ch])
12416 break;
12417 j--;
12418 }
12419 j++;
12420 }
12421 }
12422 else {
12423 int kind = PyUnicode_KIND(self);
12424 void *data = PyUnicode_DATA(self);
12425
12426 i = 0;
12427 if (striptype != RIGHTSTRIP) {
12428 while (i < len) {
12429 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12430 if (!Py_UNICODE_ISSPACE(ch))
12431 break;
12432 i++;
12433 }
12434 }
12435
12436 j = len;
12437 if (striptype != LEFTSTRIP) {
12438 j--;
12439 while (j >= i) {
12440 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12441 if (!Py_UNICODE_ISSPACE(ch))
12442 break;
12443 j--;
12444 }
12445 j++;
12446 }
12447 }
12448
12449 return PyUnicode_Substring(self, i, j);
12450 }
12451
12452
12453 static PyObject *
do_argstrip(PyObject * self,int striptype,PyObject * sep)12454 do_argstrip(PyObject *self, int striptype, PyObject *sep)
12455 {
12456 if (sep != NULL && sep != Py_None) {
12457 if (PyUnicode_Check(sep))
12458 return _PyUnicode_XStrip(self, striptype, sep);
12459 else {
12460 PyErr_Format(PyExc_TypeError,
12461 "%s arg must be None or str",
12462 STRIPNAME(striptype));
12463 return NULL;
12464 }
12465 }
12466
12467 return do_strip(self, striptype);
12468 }
12469
12470
12471 /*[clinic input]
12472 str.strip as unicode_strip
12473
12474 chars: object = None
12475 /
12476
12477 Return a copy of the string with leading and trailing whitespace removed.
12478
12479 If chars is given and not None, remove characters in chars instead.
12480 [clinic start generated code]*/
12481
12482 static PyObject *
unicode_strip_impl(PyObject * self,PyObject * chars)12483 unicode_strip_impl(PyObject *self, PyObject *chars)
12484 /*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
12485 {
12486 return do_argstrip(self, BOTHSTRIP, chars);
12487 }
12488
12489
12490 /*[clinic input]
12491 str.lstrip as unicode_lstrip
12492
12493 chars: object = NULL
12494 /
12495
12496 Return a copy of the string with leading whitespace removed.
12497
12498 If chars is given and not None, remove characters in chars instead.
12499 [clinic start generated code]*/
12500
12501 static PyObject *
unicode_lstrip_impl(PyObject * self,PyObject * chars)12502 unicode_lstrip_impl(PyObject *self, PyObject *chars)
12503 /*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
12504 {
12505 return do_argstrip(self, LEFTSTRIP, chars);
12506 }
12507
12508
12509 /*[clinic input]
12510 str.rstrip as unicode_rstrip
12511
12512 chars: object = NULL
12513 /
12514
12515 Return a copy of the string with trailing whitespace removed.
12516
12517 If chars is given and not None, remove characters in chars instead.
12518 [clinic start generated code]*/
12519
12520 static PyObject *
unicode_rstrip_impl(PyObject * self,PyObject * chars)12521 unicode_rstrip_impl(PyObject *self, PyObject *chars)
12522 /*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
12523 {
12524 return do_argstrip(self, RIGHTSTRIP, chars);
12525 }
12526
12527
12528 static PyObject*
unicode_repeat(PyObject * str,Py_ssize_t len)12529 unicode_repeat(PyObject *str, Py_ssize_t len)
12530 {
12531 PyObject *u;
12532 Py_ssize_t nchars, n;
12533
12534 if (len < 1)
12535 _Py_RETURN_UNICODE_EMPTY();
12536
12537 /* no repeat, return original string */
12538 if (len == 1)
12539 return unicode_result_unchanged(str);
12540
12541 if (PyUnicode_READY(str) == -1)
12542 return NULL;
12543
12544 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12545 PyErr_SetString(PyExc_OverflowError,
12546 "repeated string is too long");
12547 return NULL;
12548 }
12549 nchars = len * PyUnicode_GET_LENGTH(str);
12550
12551 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12552 if (!u)
12553 return NULL;
12554 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12555
12556 if (PyUnicode_GET_LENGTH(str) == 1) {
12557 const int kind = PyUnicode_KIND(str);
12558 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12559 if (kind == PyUnicode_1BYTE_KIND) {
12560 void *to = PyUnicode_DATA(u);
12561 memset(to, (unsigned char)fill_char, len);
12562 }
12563 else if (kind == PyUnicode_2BYTE_KIND) {
12564 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12565 for (n = 0; n < len; ++n)
12566 ucs2[n] = fill_char;
12567 } else {
12568 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12569 assert(kind == PyUnicode_4BYTE_KIND);
12570 for (n = 0; n < len; ++n)
12571 ucs4[n] = fill_char;
12572 }
12573 }
12574 else {
12575 /* number of characters copied this far */
12576 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
12577 const Py_ssize_t char_size = PyUnicode_KIND(str);
12578 char *to = (char *) PyUnicode_DATA(u);
12579 memcpy(to, PyUnicode_DATA(str),
12580 PyUnicode_GET_LENGTH(str) * char_size);
12581 while (done < nchars) {
12582 n = (done <= nchars-done) ? done : nchars-done;
12583 memcpy(to + (done * char_size), to, n * char_size);
12584 done += n;
12585 }
12586 }
12587
12588 assert(_PyUnicode_CheckConsistency(u, 1));
12589 return u;
12590 }
12591
12592 PyObject *
PyUnicode_Replace(PyObject * str,PyObject * substr,PyObject * replstr,Py_ssize_t maxcount)12593 PyUnicode_Replace(PyObject *str,
12594 PyObject *substr,
12595 PyObject *replstr,
12596 Py_ssize_t maxcount)
12597 {
12598 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12599 ensure_unicode(replstr) < 0)
12600 return NULL;
12601 return replace(str, substr, replstr, maxcount);
12602 }
12603
12604 /*[clinic input]
12605 str.replace as unicode_replace
12606
12607 old: unicode
12608 new: unicode
12609 count: Py_ssize_t = -1
12610 Maximum number of occurrences to replace.
12611 -1 (the default value) means replace all occurrences.
12612 /
12613
12614 Return a copy with all occurrences of substring old replaced by new.
12615
12616 If the optional argument count is given, only the first count occurrences are
12617 replaced.
12618 [clinic start generated code]*/
12619
12620 static PyObject *
unicode_replace_impl(PyObject * self,PyObject * old,PyObject * new,Py_ssize_t count)12621 unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12622 Py_ssize_t count)
12623 /*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
12624 {
12625 if (PyUnicode_READY(self) == -1)
12626 return NULL;
12627 return replace(self, old, new, count);
12628 }
12629
12630 static PyObject *
unicode_repr(PyObject * unicode)12631 unicode_repr(PyObject *unicode)
12632 {
12633 PyObject *repr;
12634 Py_ssize_t isize;
12635 Py_ssize_t osize, squote, dquote, i, o;
12636 Py_UCS4 max, quote;
12637 int ikind, okind, unchanged;
12638 void *idata, *odata;
12639
12640 if (PyUnicode_READY(unicode) == -1)
12641 return NULL;
12642
12643 isize = PyUnicode_GET_LENGTH(unicode);
12644 idata = PyUnicode_DATA(unicode);
12645
12646 /* Compute length of output, quote characters, and
12647 maximum character */
12648 osize = 0;
12649 max = 127;
12650 squote = dquote = 0;
12651 ikind = PyUnicode_KIND(unicode);
12652 for (i = 0; i < isize; i++) {
12653 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12654 Py_ssize_t incr = 1;
12655 switch (ch) {
12656 case '\'': squote++; break;
12657 case '"': dquote++; break;
12658 case '\\': case '\t': case '\r': case '\n':
12659 incr = 2;
12660 break;
12661 default:
12662 /* Fast-path ASCII */
12663 if (ch < ' ' || ch == 0x7f)
12664 incr = 4; /* \xHH */
12665 else if (ch < 0x7f)
12666 ;
12667 else if (Py_UNICODE_ISPRINTABLE(ch))
12668 max = ch > max ? ch : max;
12669 else if (ch < 0x100)
12670 incr = 4; /* \xHH */
12671 else if (ch < 0x10000)
12672 incr = 6; /* \uHHHH */
12673 else
12674 incr = 10; /* \uHHHHHHHH */
12675 }
12676 if (osize > PY_SSIZE_T_MAX - incr) {
12677 PyErr_SetString(PyExc_OverflowError,
12678 "string is too long to generate repr");
12679 return NULL;
12680 }
12681 osize += incr;
12682 }
12683
12684 quote = '\'';
12685 unchanged = (osize == isize);
12686 if (squote) {
12687 unchanged = 0;
12688 if (dquote)
12689 /* Both squote and dquote present. Use squote,
12690 and escape them */
12691 osize += squote;
12692 else
12693 quote = '"';
12694 }
12695 osize += 2; /* quotes */
12696
12697 repr = PyUnicode_New(osize, max);
12698 if (repr == NULL)
12699 return NULL;
12700 okind = PyUnicode_KIND(repr);
12701 odata = PyUnicode_DATA(repr);
12702
12703 PyUnicode_WRITE(okind, odata, 0, quote);
12704 PyUnicode_WRITE(okind, odata, osize-1, quote);
12705 if (unchanged) {
12706 _PyUnicode_FastCopyCharacters(repr, 1,
12707 unicode, 0,
12708 isize);
12709 }
12710 else {
12711 for (i = 0, o = 1; i < isize; i++) {
12712 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12713
12714 /* Escape quotes and backslashes */
12715 if ((ch == quote) || (ch == '\\')) {
12716 PyUnicode_WRITE(okind, odata, o++, '\\');
12717 PyUnicode_WRITE(okind, odata, o++, ch);
12718 continue;
12719 }
12720
12721 /* Map special whitespace to '\t', \n', '\r' */
12722 if (ch == '\t') {
12723 PyUnicode_WRITE(okind, odata, o++, '\\');
12724 PyUnicode_WRITE(okind, odata, o++, 't');
12725 }
12726 else if (ch == '\n') {
12727 PyUnicode_WRITE(okind, odata, o++, '\\');
12728 PyUnicode_WRITE(okind, odata, o++, 'n');
12729 }
12730 else if (ch == '\r') {
12731 PyUnicode_WRITE(okind, odata, o++, '\\');
12732 PyUnicode_WRITE(okind, odata, o++, 'r');
12733 }
12734
12735 /* Map non-printable US ASCII to '\xhh' */
12736 else if (ch < ' ' || ch == 0x7F) {
12737 PyUnicode_WRITE(okind, odata, o++, '\\');
12738 PyUnicode_WRITE(okind, odata, o++, 'x');
12739 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12740 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12741 }
12742
12743 /* Copy ASCII characters as-is */
12744 else if (ch < 0x7F) {
12745 PyUnicode_WRITE(okind, odata, o++, ch);
12746 }
12747
12748 /* Non-ASCII characters */
12749 else {
12750 /* Map Unicode whitespace and control characters
12751 (categories Z* and C* except ASCII space)
12752 */
12753 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12754 PyUnicode_WRITE(okind, odata, o++, '\\');
12755 /* Map 8-bit characters to '\xhh' */
12756 if (ch <= 0xff) {
12757 PyUnicode_WRITE(okind, odata, o++, 'x');
12758 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12759 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12760 }
12761 /* Map 16-bit characters to '\uxxxx' */
12762 else if (ch <= 0xffff) {
12763 PyUnicode_WRITE(okind, odata, o++, 'u');
12764 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12765 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12766 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12767 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12768 }
12769 /* Map 21-bit characters to '\U00xxxxxx' */
12770 else {
12771 PyUnicode_WRITE(okind, odata, o++, 'U');
12772 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12773 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12774 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12775 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12776 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12777 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12778 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12779 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12780 }
12781 }
12782 /* Copy characters as-is */
12783 else {
12784 PyUnicode_WRITE(okind, odata, o++, ch);
12785 }
12786 }
12787 }
12788 }
12789 /* Closing quote already added at the beginning */
12790 assert(_PyUnicode_CheckConsistency(repr, 1));
12791 return repr;
12792 }
12793
12794 PyDoc_STRVAR(rfind__doc__,
12795 "S.rfind(sub[, start[, end]]) -> int\n\
12796 \n\
12797 Return the highest index in S where substring sub is found,\n\
12798 such that sub is contained within S[start:end]. Optional\n\
12799 arguments start and end are interpreted as in slice notation.\n\
12800 \n\
12801 Return -1 on failure.");
12802
12803 static PyObject *
unicode_rfind(PyObject * self,PyObject * args)12804 unicode_rfind(PyObject *self, PyObject *args)
12805 {
12806 /* initialize variables to prevent gcc warning */
12807 PyObject *substring = NULL;
12808 Py_ssize_t start = 0;
12809 Py_ssize_t end = 0;
12810 Py_ssize_t result;
12811
12812 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
12813 return NULL;
12814
12815 if (PyUnicode_READY(self) == -1)
12816 return NULL;
12817
12818 result = any_find_slice(self, substring, start, end, -1);
12819
12820 if (result == -2)
12821 return NULL;
12822
12823 return PyLong_FromSsize_t(result);
12824 }
12825
12826 PyDoc_STRVAR(rindex__doc__,
12827 "S.rindex(sub[, start[, end]]) -> int\n\
12828 \n\
12829 Return the highest index in S where substring sub is found,\n\
12830 such that sub is contained within S[start:end]. Optional\n\
12831 arguments start and end are interpreted as in slice notation.\n\
12832 \n\
12833 Raises ValueError when the substring is not found.");
12834
12835 static PyObject *
unicode_rindex(PyObject * self,PyObject * args)12836 unicode_rindex(PyObject *self, PyObject *args)
12837 {
12838 /* initialize variables to prevent gcc warning */
12839 PyObject *substring = NULL;
12840 Py_ssize_t start = 0;
12841 Py_ssize_t end = 0;
12842 Py_ssize_t result;
12843
12844 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
12845 return NULL;
12846
12847 if (PyUnicode_READY(self) == -1)
12848 return NULL;
12849
12850 result = any_find_slice(self, substring, start, end, -1);
12851
12852 if (result == -2)
12853 return NULL;
12854
12855 if (result < 0) {
12856 PyErr_SetString(PyExc_ValueError, "substring not found");
12857 return NULL;
12858 }
12859
12860 return PyLong_FromSsize_t(result);
12861 }
12862
12863 /*[clinic input]
12864 str.rjust as unicode_rjust
12865
12866 width: Py_ssize_t
12867 fillchar: Py_UCS4 = ' '
12868 /
12869
12870 Return a right-justified string of length width.
12871
12872 Padding is done using the specified fill character (default is a space).
12873 [clinic start generated code]*/
12874
12875 static PyObject *
unicode_rjust_impl(PyObject * self,Py_ssize_t width,Py_UCS4 fillchar)12876 unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12877 /*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12878 {
12879 if (PyUnicode_READY(self) == -1)
12880 return NULL;
12881
12882 if (PyUnicode_GET_LENGTH(self) >= width)
12883 return unicode_result_unchanged(self);
12884
12885 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12886 }
12887
12888 PyObject *
PyUnicode_Split(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)12889 PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12890 {
12891 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12892 return NULL;
12893
12894 return split(s, sep, maxsplit);
12895 }
12896
12897 /*[clinic input]
12898 str.split as unicode_split
12899
12900 sep: object = None
12901 The delimiter according which to split the string.
12902 None (the default value) means split according to any whitespace,
12903 and discard empty strings from the result.
12904 maxsplit: Py_ssize_t = -1
12905 Maximum number of splits to do.
12906 -1 (the default value) means no limit.
12907
12908 Return a list of the words in the string, using sep as the delimiter string.
12909 [clinic start generated code]*/
12910
12911 static PyObject *
unicode_split_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)12912 unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12913 /*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
12914 {
12915 if (sep == Py_None)
12916 return split(self, NULL, maxsplit);
12917 if (PyUnicode_Check(sep))
12918 return split(self, sep, maxsplit);
12919
12920 PyErr_Format(PyExc_TypeError,
12921 "must be str or None, not %.100s",
12922 Py_TYPE(sep)->tp_name);
12923 return NULL;
12924 }
12925
12926 PyObject *
PyUnicode_Partition(PyObject * str_obj,PyObject * sep_obj)12927 PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
12928 {
12929 PyObject* out;
12930 int kind1, kind2;
12931 void *buf1, *buf2;
12932 Py_ssize_t len1, len2;
12933
12934 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12935 return NULL;
12936
12937 kind1 = PyUnicode_KIND(str_obj);
12938 kind2 = PyUnicode_KIND(sep_obj);
12939 len1 = PyUnicode_GET_LENGTH(str_obj);
12940 len2 = PyUnicode_GET_LENGTH(sep_obj);
12941 if (kind1 < kind2 || len1 < len2) {
12942 _Py_INCREF_UNICODE_EMPTY();
12943 if (!unicode_empty)
12944 out = NULL;
12945 else {
12946 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12947 Py_DECREF(unicode_empty);
12948 }
12949 return out;
12950 }
12951 buf1 = PyUnicode_DATA(str_obj);
12952 buf2 = PyUnicode_DATA(sep_obj);
12953 if (kind2 != kind1) {
12954 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12955 if (!buf2)
12956 return NULL;
12957 }
12958
12959 switch (kind1) {
12960 case PyUnicode_1BYTE_KIND:
12961 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12962 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12963 else
12964 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12965 break;
12966 case PyUnicode_2BYTE_KIND:
12967 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12968 break;
12969 case PyUnicode_4BYTE_KIND:
12970 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12971 break;
12972 default:
12973 Py_UNREACHABLE();
12974 }
12975
12976 if (kind2 != kind1)
12977 PyMem_Free(buf2);
12978
12979 return out;
12980 }
12981
12982
12983 PyObject *
PyUnicode_RPartition(PyObject * str_obj,PyObject * sep_obj)12984 PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
12985 {
12986 PyObject* out;
12987 int kind1, kind2;
12988 void *buf1, *buf2;
12989 Py_ssize_t len1, len2;
12990
12991 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12992 return NULL;
12993
12994 kind1 = PyUnicode_KIND(str_obj);
12995 kind2 = PyUnicode_KIND(sep_obj);
12996 len1 = PyUnicode_GET_LENGTH(str_obj);
12997 len2 = PyUnicode_GET_LENGTH(sep_obj);
12998 if (kind1 < kind2 || len1 < len2) {
12999 _Py_INCREF_UNICODE_EMPTY();
13000 if (!unicode_empty)
13001 out = NULL;
13002 else {
13003 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13004 Py_DECREF(unicode_empty);
13005 }
13006 return out;
13007 }
13008 buf1 = PyUnicode_DATA(str_obj);
13009 buf2 = PyUnicode_DATA(sep_obj);
13010 if (kind2 != kind1) {
13011 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13012 if (!buf2)
13013 return NULL;
13014 }
13015
13016 switch (kind1) {
13017 case PyUnicode_1BYTE_KIND:
13018 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13019 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13020 else
13021 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13022 break;
13023 case PyUnicode_2BYTE_KIND:
13024 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13025 break;
13026 case PyUnicode_4BYTE_KIND:
13027 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13028 break;
13029 default:
13030 Py_UNREACHABLE();
13031 }
13032
13033 if (kind2 != kind1)
13034 PyMem_Free(buf2);
13035
13036 return out;
13037 }
13038
13039 /*[clinic input]
13040 str.partition as unicode_partition
13041
13042 sep: object
13043 /
13044
13045 Partition the string into three parts using the given separator.
13046
13047 This will search for the separator in the string. If the separator is found,
13048 returns a 3-tuple containing the part before the separator, the separator
13049 itself, and the part after it.
13050
13051 If the separator is not found, returns a 3-tuple containing the original string
13052 and two empty strings.
13053 [clinic start generated code]*/
13054
13055 static PyObject *
unicode_partition(PyObject * self,PyObject * sep)13056 unicode_partition(PyObject *self, PyObject *sep)
13057 /*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
13058 {
13059 return PyUnicode_Partition(self, sep);
13060 }
13061
13062 /*[clinic input]
13063 str.rpartition as unicode_rpartition = str.partition
13064
13065 Partition the string into three parts using the given separator.
13066
13067 This will search for the separator in the string, starting at the end. If
13068 the separator is found, returns a 3-tuple containing the part before the
13069 separator, the separator itself, and the part after it.
13070
13071 If the separator is not found, returns a 3-tuple containing two empty strings
13072 and the original string.
13073 [clinic start generated code]*/
13074
13075 static PyObject *
unicode_rpartition(PyObject * self,PyObject * sep)13076 unicode_rpartition(PyObject *self, PyObject *sep)
13077 /*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
13078 {
13079 return PyUnicode_RPartition(self, sep);
13080 }
13081
13082 PyObject *
PyUnicode_RSplit(PyObject * s,PyObject * sep,Py_ssize_t maxsplit)13083 PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13084 {
13085 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13086 return NULL;
13087
13088 return rsplit(s, sep, maxsplit);
13089 }
13090
13091 /*[clinic input]
13092 str.rsplit as unicode_rsplit = str.split
13093
13094 Return a list of the words in the string, using sep as the delimiter string.
13095
13096 Splits are done starting at the end of the string and working to the front.
13097 [clinic start generated code]*/
13098
13099 static PyObject *
unicode_rsplit_impl(PyObject * self,PyObject * sep,Py_ssize_t maxsplit)13100 unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13101 /*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
13102 {
13103 if (sep == Py_None)
13104 return rsplit(self, NULL, maxsplit);
13105 if (PyUnicode_Check(sep))
13106 return rsplit(self, sep, maxsplit);
13107
13108 PyErr_Format(PyExc_TypeError,
13109 "must be str or None, not %.100s",
13110 Py_TYPE(sep)->tp_name);
13111 return NULL;
13112 }
13113
13114 /*[clinic input]
13115 str.splitlines as unicode_splitlines
13116
13117 keepends: bool(accept={int}) = False
13118
13119 Return a list of the lines in the string, breaking at line boundaries.
13120
13121 Line breaks are not included in the resulting list unless keepends is given and
13122 true.
13123 [clinic start generated code]*/
13124
13125 static PyObject *
unicode_splitlines_impl(PyObject * self,int keepends)13126 unicode_splitlines_impl(PyObject *self, int keepends)
13127 /*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
13128 {
13129 return PyUnicode_Splitlines(self, keepends);
13130 }
13131
13132 static
unicode_str(PyObject * self)13133 PyObject *unicode_str(PyObject *self)
13134 {
13135 return unicode_result_unchanged(self);
13136 }
13137
13138 /*[clinic input]
13139 str.swapcase as unicode_swapcase
13140
13141 Convert uppercase characters to lowercase and lowercase characters to uppercase.
13142 [clinic start generated code]*/
13143
13144 static PyObject *
unicode_swapcase_impl(PyObject * self)13145 unicode_swapcase_impl(PyObject *self)
13146 /*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
13147 {
13148 if (PyUnicode_READY(self) == -1)
13149 return NULL;
13150 return case_operation(self, do_swapcase);
13151 }
13152
13153 /*[clinic input]
13154
13155 @staticmethod
13156 str.maketrans as unicode_maketrans
13157
13158 x: object
13159
13160 y: unicode=NULL
13161
13162 z: unicode=NULL
13163
13164 /
13165
13166 Return a translation table usable for str.translate().
13167
13168 If there is only one argument, it must be a dictionary mapping Unicode
13169 ordinals (integers) or characters to Unicode ordinals, strings or None.
13170 Character keys will be then converted to ordinals.
13171 If there are two arguments, they must be strings of equal length, and
13172 in the resulting dictionary, each character in x will be mapped to the
13173 character at the same position in y. If there is a third argument, it
13174 must be a string, whose characters will be mapped to None in the result.
13175 [clinic start generated code]*/
13176
13177 static PyObject *
unicode_maketrans_impl(PyObject * x,PyObject * y,PyObject * z)13178 unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
13179 /*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
13180 {
13181 PyObject *new = NULL, *key, *value;
13182 Py_ssize_t i = 0;
13183 int res;
13184
13185 new = PyDict_New();
13186 if (!new)
13187 return NULL;
13188 if (y != NULL) {
13189 int x_kind, y_kind, z_kind;
13190 void *x_data, *y_data, *z_data;
13191
13192 /* x must be a string too, of equal length */
13193 if (!PyUnicode_Check(x)) {
13194 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13195 "be a string if there is a second argument");
13196 goto err;
13197 }
13198 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13199 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13200 "arguments must have equal length");
13201 goto err;
13202 }
13203 /* create entries for translating chars in x to those in y */
13204 x_kind = PyUnicode_KIND(x);
13205 y_kind = PyUnicode_KIND(y);
13206 x_data = PyUnicode_DATA(x);
13207 y_data = PyUnicode_DATA(y);
13208 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13209 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13210 if (!key)
13211 goto err;
13212 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13213 if (!value) {
13214 Py_DECREF(key);
13215 goto err;
13216 }
13217 res = PyDict_SetItem(new, key, value);
13218 Py_DECREF(key);
13219 Py_DECREF(value);
13220 if (res < 0)
13221 goto err;
13222 }
13223 /* create entries for deleting chars in z */
13224 if (z != NULL) {
13225 z_kind = PyUnicode_KIND(z);
13226 z_data = PyUnicode_DATA(z);
13227 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13228 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13229 if (!key)
13230 goto err;
13231 res = PyDict_SetItem(new, key, Py_None);
13232 Py_DECREF(key);
13233 if (res < 0)
13234 goto err;
13235 }
13236 }
13237 } else {
13238 int kind;
13239 void *data;
13240
13241 /* x must be a dict */
13242 if (!PyDict_CheckExact(x)) {
13243 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13244 "to maketrans it must be a dict");
13245 goto err;
13246 }
13247 /* copy entries into the new dict, converting string keys to int keys */
13248 while (PyDict_Next(x, &i, &key, &value)) {
13249 if (PyUnicode_Check(key)) {
13250 /* convert string keys to integer keys */
13251 PyObject *newkey;
13252 if (PyUnicode_GET_LENGTH(key) != 1) {
13253 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13254 "table must be of length 1");
13255 goto err;
13256 }
13257 kind = PyUnicode_KIND(key);
13258 data = PyUnicode_DATA(key);
13259 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13260 if (!newkey)
13261 goto err;
13262 res = PyDict_SetItem(new, newkey, value);
13263 Py_DECREF(newkey);
13264 if (res < 0)
13265 goto err;
13266 } else if (PyLong_Check(key)) {
13267 /* just keep integer keys */
13268 if (PyDict_SetItem(new, key, value) < 0)
13269 goto err;
13270 } else {
13271 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13272 "be strings or integers");
13273 goto err;
13274 }
13275 }
13276 }
13277 return new;
13278 err:
13279 Py_DECREF(new);
13280 return NULL;
13281 }
13282
13283 /*[clinic input]
13284 str.translate as unicode_translate
13285
13286 table: object
13287 Translation table, which must be a mapping of Unicode ordinals to
13288 Unicode ordinals, strings, or None.
13289 /
13290
13291 Replace each character in the string using the given translation table.
13292
13293 The table must implement lookup/indexing via __getitem__, for instance a
13294 dictionary or list. If this operation raises LookupError, the character is
13295 left untouched. Characters mapped to None are deleted.
13296 [clinic start generated code]*/
13297
13298 static PyObject *
unicode_translate(PyObject * self,PyObject * table)13299 unicode_translate(PyObject *self, PyObject *table)
13300 /*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
13301 {
13302 return _PyUnicode_TranslateCharmap(self, table, "ignore");
13303 }
13304
13305 /*[clinic input]
13306 str.upper as unicode_upper
13307
13308 Return a copy of the string converted to uppercase.
13309 [clinic start generated code]*/
13310
13311 static PyObject *
unicode_upper_impl(PyObject * self)13312 unicode_upper_impl(PyObject *self)
13313 /*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
13314 {
13315 if (PyUnicode_READY(self) == -1)
13316 return NULL;
13317 if (PyUnicode_IS_ASCII(self))
13318 return ascii_upper_or_lower(self, 0);
13319 return case_operation(self, do_upper);
13320 }
13321
13322 /*[clinic input]
13323 str.zfill as unicode_zfill
13324
13325 width: Py_ssize_t
13326 /
13327
13328 Pad a numeric string with zeros on the left, to fill a field of the given width.
13329
13330 The string is never truncated.
13331 [clinic start generated code]*/
13332
13333 static PyObject *
unicode_zfill_impl(PyObject * self,Py_ssize_t width)13334 unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13335 /*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
13336 {
13337 Py_ssize_t fill;
13338 PyObject *u;
13339 int kind;
13340 void *data;
13341 Py_UCS4 chr;
13342
13343 if (PyUnicode_READY(self) == -1)
13344 return NULL;
13345
13346 if (PyUnicode_GET_LENGTH(self) >= width)
13347 return unicode_result_unchanged(self);
13348
13349 fill = width - PyUnicode_GET_LENGTH(self);
13350
13351 u = pad(self, fill, 0, '0');
13352
13353 if (u == NULL)
13354 return NULL;
13355
13356 kind = PyUnicode_KIND(u);
13357 data = PyUnicode_DATA(u);
13358 chr = PyUnicode_READ(kind, data, fill);
13359
13360 if (chr == '+' || chr == '-') {
13361 /* move sign to beginning of string */
13362 PyUnicode_WRITE(kind, data, 0, chr);
13363 PyUnicode_WRITE(kind, data, fill, '0');
13364 }
13365
13366 assert(_PyUnicode_CheckConsistency(u, 1));
13367 return u;
13368 }
13369
#if 0
/* Compiled out.  Thin wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(); its (equally disabled)
   method-table entry is listed as a debugging aid. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13377
13378 PyDoc_STRVAR(startswith__doc__,
13379 "S.startswith(prefix[, start[, end]]) -> bool\n\
13380 \n\
13381 Return True if S starts with the specified prefix, False otherwise.\n\
13382 With optional start, test S beginning at that position.\n\
13383 With optional end, stop comparing S at that position.\n\
13384 prefix can also be a tuple of strings to try.");
13385
13386 static PyObject *
unicode_startswith(PyObject * self,PyObject * args)13387 unicode_startswith(PyObject *self,
13388 PyObject *args)
13389 {
13390 PyObject *subobj;
13391 PyObject *substring;
13392 Py_ssize_t start = 0;
13393 Py_ssize_t end = PY_SSIZE_T_MAX;
13394 int result;
13395
13396 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13397 return NULL;
13398 if (PyTuple_Check(subobj)) {
13399 Py_ssize_t i;
13400 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13401 substring = PyTuple_GET_ITEM(subobj, i);
13402 if (!PyUnicode_Check(substring)) {
13403 PyErr_Format(PyExc_TypeError,
13404 "tuple for startswith must only contain str, "
13405 "not %.100s",
13406 Py_TYPE(substring)->tp_name);
13407 return NULL;
13408 }
13409 result = tailmatch(self, substring, start, end, -1);
13410 if (result == -1)
13411 return NULL;
13412 if (result) {
13413 Py_RETURN_TRUE;
13414 }
13415 }
13416 /* nothing matched */
13417 Py_RETURN_FALSE;
13418 }
13419 if (!PyUnicode_Check(subobj)) {
13420 PyErr_Format(PyExc_TypeError,
13421 "startswith first arg must be str or "
13422 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13423 return NULL;
13424 }
13425 result = tailmatch(self, subobj, start, end, -1);
13426 if (result == -1)
13427 return NULL;
13428 return PyBool_FromLong(result);
13429 }
13430
13431
13432 PyDoc_STRVAR(endswith__doc__,
13433 "S.endswith(suffix[, start[, end]]) -> bool\n\
13434 \n\
13435 Return True if S ends with the specified suffix, False otherwise.\n\
13436 With optional start, test S beginning at that position.\n\
13437 With optional end, stop comparing S at that position.\n\
13438 suffix can also be a tuple of strings to try.");
13439
13440 static PyObject *
unicode_endswith(PyObject * self,PyObject * args)13441 unicode_endswith(PyObject *self,
13442 PyObject *args)
13443 {
13444 PyObject *subobj;
13445 PyObject *substring;
13446 Py_ssize_t start = 0;
13447 Py_ssize_t end = PY_SSIZE_T_MAX;
13448 int result;
13449
13450 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13451 return NULL;
13452 if (PyTuple_Check(subobj)) {
13453 Py_ssize_t i;
13454 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13455 substring = PyTuple_GET_ITEM(subobj, i);
13456 if (!PyUnicode_Check(substring)) {
13457 PyErr_Format(PyExc_TypeError,
13458 "tuple for endswith must only contain str, "
13459 "not %.100s",
13460 Py_TYPE(substring)->tp_name);
13461 return NULL;
13462 }
13463 result = tailmatch(self, substring, start, end, +1);
13464 if (result == -1)
13465 return NULL;
13466 if (result) {
13467 Py_RETURN_TRUE;
13468 }
13469 }
13470 Py_RETURN_FALSE;
13471 }
13472 if (!PyUnicode_Check(subobj)) {
13473 PyErr_Format(PyExc_TypeError,
13474 "endswith first arg must be str or "
13475 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13476 return NULL;
13477 }
13478 result = tailmatch(self, subobj, start, end, +1);
13479 if (result == -1)
13480 return NULL;
13481 return PyBool_FromLong(result);
13482 }
13483
13484 static inline void
_PyUnicodeWriter_Update(_PyUnicodeWriter * writer)13485 _PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13486 {
13487 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13488 writer->data = PyUnicode_DATA(writer->buffer);
13489
13490 if (!writer->readonly) {
13491 writer->kind = PyUnicode_KIND(writer->buffer);
13492 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13493 }
13494 else {
13495 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13496 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13497 writer->kind = PyUnicode_WCHAR_KIND;
13498 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13499
13500 /* Copy-on-write mode: set buffer size to 0 so
13501 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13502 * next write. */
13503 writer->size = 0;
13504 }
13505 }
13506
13507 void
_PyUnicodeWriter_Init(_PyUnicodeWriter * writer)13508 _PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13509 {
13510 memset(writer, 0, sizeof(*writer));
13511
13512 /* ASCII is the bare minimum */
13513 writer->min_char = 127;
13514
13515 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13516 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13517 writer->kind = PyUnicode_WCHAR_KIND;
13518 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13519 }
13520
13521 int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter * writer,Py_ssize_t length,Py_UCS4 maxchar)13522 _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13523 Py_ssize_t length, Py_UCS4 maxchar)
13524 {
13525 Py_ssize_t newlen;
13526 PyObject *newbuffer;
13527
13528 assert(maxchar <= MAX_UNICODE);
13529
13530 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
13531 assert((maxchar > writer->maxchar && length >= 0)
13532 || length > 0);
13533
13534 if (length > PY_SSIZE_T_MAX - writer->pos) {
13535 PyErr_NoMemory();
13536 return -1;
13537 }
13538 newlen = writer->pos + length;
13539
13540 maxchar = Py_MAX(maxchar, writer->min_char);
13541
13542 if (writer->buffer == NULL) {
13543 assert(!writer->readonly);
13544 if (writer->overallocate
13545 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13546 /* overallocate to limit the number of realloc() */
13547 newlen += newlen / OVERALLOCATE_FACTOR;
13548 }
13549 if (newlen < writer->min_length)
13550 newlen = writer->min_length;
13551
13552 writer->buffer = PyUnicode_New(newlen, maxchar);
13553 if (writer->buffer == NULL)
13554 return -1;
13555 }
13556 else if (newlen > writer->size) {
13557 if (writer->overallocate
13558 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13559 /* overallocate to limit the number of realloc() */
13560 newlen += newlen / OVERALLOCATE_FACTOR;
13561 }
13562 if (newlen < writer->min_length)
13563 newlen = writer->min_length;
13564
13565 if (maxchar > writer->maxchar || writer->readonly) {
13566 /* resize + widen */
13567 maxchar = Py_MAX(maxchar, writer->maxchar);
13568 newbuffer = PyUnicode_New(newlen, maxchar);
13569 if (newbuffer == NULL)
13570 return -1;
13571 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13572 writer->buffer, 0, writer->pos);
13573 Py_DECREF(writer->buffer);
13574 writer->readonly = 0;
13575 }
13576 else {
13577 newbuffer = resize_compact(writer->buffer, newlen);
13578 if (newbuffer == NULL)
13579 return -1;
13580 }
13581 writer->buffer = newbuffer;
13582 }
13583 else if (maxchar > writer->maxchar) {
13584 assert(!writer->readonly);
13585 newbuffer = PyUnicode_New(writer->size, maxchar);
13586 if (newbuffer == NULL)
13587 return -1;
13588 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13589 writer->buffer, 0, writer->pos);
13590 Py_SETREF(writer->buffer, newbuffer);
13591 }
13592 _PyUnicodeWriter_Update(writer);
13593 return 0;
13594
13595 #undef OVERALLOCATE_FACTOR
13596 }
13597
13598 int
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter * writer,enum PyUnicode_Kind kind)13599 _PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13600 enum PyUnicode_Kind kind)
13601 {
13602 Py_UCS4 maxchar;
13603
13604 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13605 assert(writer->kind < kind);
13606
13607 switch (kind)
13608 {
13609 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13610 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13611 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13612 default:
13613 Py_UNREACHABLE();
13614 }
13615
13616 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13617 }
13618
13619 static inline int
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter * writer,Py_UCS4 ch)13620 _PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13621 {
13622 assert(ch <= MAX_UNICODE);
13623 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13624 return -1;
13625 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13626 writer->pos++;
13627 return 0;
13628 }
13629
13630 int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter * writer,Py_UCS4 ch)13631 _PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13632 {
13633 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13634 }
13635
13636 int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter * writer,PyObject * str)13637 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13638 {
13639 Py_UCS4 maxchar;
13640 Py_ssize_t len;
13641
13642 if (PyUnicode_READY(str) == -1)
13643 return -1;
13644 len = PyUnicode_GET_LENGTH(str);
13645 if (len == 0)
13646 return 0;
13647 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13648 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13649 if (writer->buffer == NULL && !writer->overallocate) {
13650 assert(_PyUnicode_CheckConsistency(str, 1));
13651 writer->readonly = 1;
13652 Py_INCREF(str);
13653 writer->buffer = str;
13654 _PyUnicodeWriter_Update(writer);
13655 writer->pos += len;
13656 return 0;
13657 }
13658 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13659 return -1;
13660 }
13661 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13662 str, 0, len);
13663 writer->pos += len;
13664 return 0;
13665 }
13666
13667 int
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter * writer,PyObject * str,Py_ssize_t start,Py_ssize_t end)13668 _PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13669 Py_ssize_t start, Py_ssize_t end)
13670 {
13671 Py_UCS4 maxchar;
13672 Py_ssize_t len;
13673
13674 if (PyUnicode_READY(str) == -1)
13675 return -1;
13676
13677 assert(0 <= start);
13678 assert(end <= PyUnicode_GET_LENGTH(str));
13679 assert(start <= end);
13680
13681 if (end == 0)
13682 return 0;
13683
13684 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13685 return _PyUnicodeWriter_WriteStr(writer, str);
13686
13687 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13688 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13689 else
13690 maxchar = writer->maxchar;
13691 len = end - start;
13692
13693 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13694 return -1;
13695
13696 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13697 str, start, len);
13698 writer->pos += len;
13699 return 0;
13700 }
13701
13702 int
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter * writer,const char * ascii,Py_ssize_t len)13703 _PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13704 const char *ascii, Py_ssize_t len)
13705 {
13706 if (len == -1)
13707 len = strlen(ascii);
13708
13709 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13710
13711 if (writer->buffer == NULL && !writer->overallocate) {
13712 PyObject *str;
13713
13714 str = _PyUnicode_FromASCII(ascii, len);
13715 if (str == NULL)
13716 return -1;
13717
13718 writer->readonly = 1;
13719 writer->buffer = str;
13720 _PyUnicodeWriter_Update(writer);
13721 writer->pos += len;
13722 return 0;
13723 }
13724
13725 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13726 return -1;
13727
13728 switch (writer->kind)
13729 {
13730 case PyUnicode_1BYTE_KIND:
13731 {
13732 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13733 Py_UCS1 *data = writer->data;
13734
13735 memcpy(data + writer->pos, str, len);
13736 break;
13737 }
13738 case PyUnicode_2BYTE_KIND:
13739 {
13740 _PyUnicode_CONVERT_BYTES(
13741 Py_UCS1, Py_UCS2,
13742 ascii, ascii + len,
13743 (Py_UCS2 *)writer->data + writer->pos);
13744 break;
13745 }
13746 case PyUnicode_4BYTE_KIND:
13747 {
13748 _PyUnicode_CONVERT_BYTES(
13749 Py_UCS1, Py_UCS4,
13750 ascii, ascii + len,
13751 (Py_UCS4 *)writer->data + writer->pos);
13752 break;
13753 }
13754 default:
13755 Py_UNREACHABLE();
13756 }
13757
13758 writer->pos += len;
13759 return 0;
13760 }
13761
13762 int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter * writer,const char * str,Py_ssize_t len)13763 _PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13764 const char *str, Py_ssize_t len)
13765 {
13766 Py_UCS4 maxchar;
13767
13768 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13769 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13770 return -1;
13771 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13772 writer->pos += len;
13773 return 0;
13774 }
13775
13776 PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter * writer)13777 _PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
13778 {
13779 PyObject *str;
13780
13781 if (writer->pos == 0) {
13782 Py_CLEAR(writer->buffer);
13783 _Py_RETURN_UNICODE_EMPTY();
13784 }
13785
13786 str = writer->buffer;
13787 writer->buffer = NULL;
13788
13789 if (writer->readonly) {
13790 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13791 return str;
13792 }
13793
13794 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13795 PyObject *str2;
13796 str2 = resize_compact(str, writer->pos);
13797 if (str2 == NULL) {
13798 Py_DECREF(str);
13799 return NULL;
13800 }
13801 str = str2;
13802 }
13803
13804 assert(_PyUnicode_CheckConsistency(str, 1));
13805 return unicode_result_ready(str);
13806 }
13807
13808 void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter * writer)13809 _PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
13810 {
13811 Py_CLEAR(writer->buffer);
13812 }
13813
13814 #include "stringlib/unicode_format.h"
13815
13816 PyDoc_STRVAR(format__doc__,
13817 "S.format(*args, **kwargs) -> str\n\
13818 \n\
13819 Return a formatted version of S, using substitutions from args and kwargs.\n\
13820 The substitutions are identified by braces ('{' and '}').");
13821
13822 PyDoc_STRVAR(format_map__doc__,
13823 "S.format_map(mapping) -> str\n\
13824 \n\
13825 Return a formatted version of S, using substitutions from mapping.\n\
13826 The substitutions are identified by braces ('{' and '}').");
13827
13828 /*[clinic input]
13829 str.__format__ as unicode___format__
13830
13831 format_spec: unicode
13832 /
13833
13834 Return a formatted version of the string as described by format_spec.
13835 [clinic start generated code]*/
13836
13837 static PyObject *
unicode___format___impl(PyObject * self,PyObject * format_spec)13838 unicode___format___impl(PyObject *self, PyObject *format_spec)
13839 /*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
13840 {
13841 _PyUnicodeWriter writer;
13842 int ret;
13843
13844 if (PyUnicode_READY(self) == -1)
13845 return NULL;
13846 _PyUnicodeWriter_Init(&writer);
13847 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13848 self, format_spec, 0,
13849 PyUnicode_GET_LENGTH(format_spec));
13850 if (ret == -1) {
13851 _PyUnicodeWriter_Dealloc(&writer);
13852 return NULL;
13853 }
13854 return _PyUnicodeWriter_Finish(&writer);
13855 }
13856
13857 /*[clinic input]
13858 str.__sizeof__ as unicode_sizeof
13859
13860 Return the size of the string in memory, in bytes.
13861 [clinic start generated code]*/
13862
13863 static PyObject *
unicode_sizeof_impl(PyObject * self)13864 unicode_sizeof_impl(PyObject *self)
13865 /*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
13866 {
13867 Py_ssize_t size;
13868
13869 /* If it's a compact object, account for base structure +
13870 character data. */
13871 if (PyUnicode_IS_COMPACT_ASCII(self))
13872 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13873 else if (PyUnicode_IS_COMPACT(self))
13874 size = sizeof(PyCompactUnicodeObject) +
13875 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
13876 else {
13877 /* If it is a two-block object, account for base object, and
13878 for character block if present. */
13879 size = sizeof(PyUnicodeObject);
13880 if (_PyUnicode_DATA_ANY(self))
13881 size += (PyUnicode_GET_LENGTH(self) + 1) *
13882 PyUnicode_KIND(self);
13883 }
13884 /* If the wstr pointer is present, account for it unless it is shared
13885 with the data pointer. Check if the data is not shared. */
13886 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13887 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13888 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13889 size += PyUnicode_UTF8_LENGTH(self) + 1;
13890
13891 return PyLong_FromSsize_t(size);
13892 }
13893
13894 static PyObject *
unicode_getnewargs(PyObject * v)13895 unicode_getnewargs(PyObject *v)
13896 {
13897 PyObject *copy = _PyUnicode_Copy(v);
13898 if (!copy)
13899 return NULL;
13900 return Py_BuildValue("(N)", copy);
13901 }
13902
13903 static PyMethodDef unicode_methods[] = {
13904 UNICODE_ENCODE_METHODDEF
13905 UNICODE_REPLACE_METHODDEF
13906 UNICODE_SPLIT_METHODDEF
13907 UNICODE_RSPLIT_METHODDEF
13908 UNICODE_JOIN_METHODDEF
13909 UNICODE_CAPITALIZE_METHODDEF
13910 UNICODE_CASEFOLD_METHODDEF
13911 UNICODE_TITLE_METHODDEF
13912 UNICODE_CENTER_METHODDEF
13913 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13914 UNICODE_EXPANDTABS_METHODDEF
13915 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
13916 UNICODE_PARTITION_METHODDEF
13917 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13918 UNICODE_LJUST_METHODDEF
13919 UNICODE_LOWER_METHODDEF
13920 UNICODE_LSTRIP_METHODDEF
13921 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13922 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13923 UNICODE_RJUST_METHODDEF
13924 UNICODE_RSTRIP_METHODDEF
13925 UNICODE_RPARTITION_METHODDEF
13926 UNICODE_SPLITLINES_METHODDEF
13927 UNICODE_STRIP_METHODDEF
13928 UNICODE_SWAPCASE_METHODDEF
13929 UNICODE_TRANSLATE_METHODDEF
13930 UNICODE_UPPER_METHODDEF
13931 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13932 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13933 UNICODE_ISASCII_METHODDEF
13934 UNICODE_ISLOWER_METHODDEF
13935 UNICODE_ISUPPER_METHODDEF
13936 UNICODE_ISTITLE_METHODDEF
13937 UNICODE_ISSPACE_METHODDEF
13938 UNICODE_ISDECIMAL_METHODDEF
13939 UNICODE_ISDIGIT_METHODDEF
13940 UNICODE_ISNUMERIC_METHODDEF
13941 UNICODE_ISALPHA_METHODDEF
13942 UNICODE_ISALNUM_METHODDEF
13943 UNICODE_ISIDENTIFIER_METHODDEF
13944 UNICODE_ISPRINTABLE_METHODDEF
13945 UNICODE_ZFILL_METHODDEF
13946 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
13947 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
13948 UNICODE___FORMAT___METHODDEF
13949 UNICODE_MAKETRANS_METHODDEF
13950 UNICODE_SIZEOF_METHODDEF
13951 #if 0
13952 /* These methods are just used for debugging the implementation. */
13953 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
13954 #endif
13955
13956 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
13957 {NULL, NULL}
13958 };
13959
13960 static PyObject *
unicode_mod(PyObject * v,PyObject * w)13961 unicode_mod(PyObject *v, PyObject *w)
13962 {
13963 if (!PyUnicode_Check(v))
13964 Py_RETURN_NOTIMPLEMENTED;
13965 return PyUnicode_Format(v, w);
13966 }
13967
13968 static PyNumberMethods unicode_as_number = {
13969 0, /*nb_add*/
13970 0, /*nb_subtract*/
13971 0, /*nb_multiply*/
13972 unicode_mod, /*nb_remainder*/
13973 };
13974
13975 static PySequenceMethods unicode_as_sequence = {
13976 (lenfunc) unicode_length, /* sq_length */
13977 PyUnicode_Concat, /* sq_concat */
13978 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13979 (ssizeargfunc) unicode_getitem, /* sq_item */
13980 0, /* sq_slice */
13981 0, /* sq_ass_item */
13982 0, /* sq_ass_slice */
13983 PyUnicode_Contains, /* sq_contains */
13984 };
13985
13986 static PyObject*
unicode_subscript(PyObject * self,PyObject * item)13987 unicode_subscript(PyObject* self, PyObject* item)
13988 {
13989 if (PyUnicode_READY(self) == -1)
13990 return NULL;
13991
13992 if (PyIndex_Check(item)) {
13993 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13994 if (i == -1 && PyErr_Occurred())
13995 return NULL;
13996 if (i < 0)
13997 i += PyUnicode_GET_LENGTH(self);
13998 return unicode_getitem(self, i);
13999 } else if (PySlice_Check(item)) {
14000 Py_ssize_t start, stop, step, slicelength, i;
14001 size_t cur;
14002 PyObject *result;
14003 void *src_data, *dest_data;
14004 int src_kind, dest_kind;
14005 Py_UCS4 ch, max_char, kind_limit;
14006
14007 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
14008 return NULL;
14009 }
14010 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14011 &start, &stop, step);
14012
14013 if (slicelength <= 0) {
14014 _Py_RETURN_UNICODE_EMPTY();
14015 } else if (start == 0 && step == 1 &&
14016 slicelength == PyUnicode_GET_LENGTH(self)) {
14017 return unicode_result_unchanged(self);
14018 } else if (step == 1) {
14019 return PyUnicode_Substring(self,
14020 start, start + slicelength);
14021 }
14022 /* General case */
14023 src_kind = PyUnicode_KIND(self);
14024 src_data = PyUnicode_DATA(self);
14025 if (!PyUnicode_IS_ASCII(self)) {
14026 kind_limit = kind_maxchar_limit(src_kind);
14027 max_char = 0;
14028 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14029 ch = PyUnicode_READ(src_kind, src_data, cur);
14030 if (ch > max_char) {
14031 max_char = ch;
14032 if (max_char >= kind_limit)
14033 break;
14034 }
14035 }
14036 }
14037 else
14038 max_char = 127;
14039 result = PyUnicode_New(slicelength, max_char);
14040 if (result == NULL)
14041 return NULL;
14042 dest_kind = PyUnicode_KIND(result);
14043 dest_data = PyUnicode_DATA(result);
14044
14045 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14046 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14047 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
14048 }
14049 assert(_PyUnicode_CheckConsistency(result, 1));
14050 return result;
14051 } else {
14052 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14053 return NULL;
14054 }
14055 }
14056
14057 static PyMappingMethods unicode_as_mapping = {
14058 (lenfunc)unicode_length, /* mp_length */
14059 (binaryfunc)unicode_subscript, /* mp_subscript */
14060 (objobjargproc)0, /* mp_ass_subscript */
14061 };
14062
14063
14064 /* Helpers for PyUnicode_Format() */
14065
14066 struct unicode_formatter_t {
14067 PyObject *args;
14068 int args_owned;
14069 Py_ssize_t arglen, argidx;
14070 PyObject *dict;
14071
14072 enum PyUnicode_Kind fmtkind;
14073 Py_ssize_t fmtcnt, fmtpos;
14074 void *fmtdata;
14075 PyObject *fmtstr;
14076
14077 _PyUnicodeWriter writer;
14078 };
14079
14080 struct unicode_format_arg_t {
14081 Py_UCS4 ch;
14082 int flags;
14083 Py_ssize_t width;
14084 int prec;
14085 int sign;
14086 };
14087
14088 static PyObject *
unicode_format_getnextarg(struct unicode_formatter_t * ctx)14089 unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14090 {
14091 Py_ssize_t argidx = ctx->argidx;
14092
14093 if (argidx < ctx->arglen) {
14094 ctx->argidx++;
14095 if (ctx->arglen < 0)
14096 return ctx->args;
14097 else
14098 return PyTuple_GetItem(ctx->args, argidx);
14099 }
14100 PyErr_SetString(PyExc_TypeError,
14101 "not enough arguments for format string");
14102 return NULL;
14103 }
14104
14105 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
14106
14107 /* Format a float into the writer if the writer is not NULL, or into *p_output
14108 otherwise.
14109
14110 Return 0 on success, raise an exception and return -1 on error. */
14111 static int
formatfloat(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14112 formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14113 PyObject **p_output,
14114 _PyUnicodeWriter *writer)
14115 {
14116 char *p;
14117 double x;
14118 Py_ssize_t len;
14119 int prec;
14120 int dtoa_flags;
14121
14122 x = PyFloat_AsDouble(v);
14123 if (x == -1.0 && PyErr_Occurred())
14124 return -1;
14125
14126 prec = arg->prec;
14127 if (prec < 0)
14128 prec = 6;
14129
14130 if (arg->flags & F_ALT)
14131 dtoa_flags = Py_DTSF_ALT;
14132 else
14133 dtoa_flags = 0;
14134 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14135 if (p == NULL)
14136 return -1;
14137 len = strlen(p);
14138 if (writer) {
14139 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14140 PyMem_Free(p);
14141 return -1;
14142 }
14143 }
14144 else
14145 *p_output = _PyUnicode_FromASCII(p, len);
14146 PyMem_Free(p);
14147 return 0;
14148 }
14149
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the alternate-form flag, for Python ints.
 * Return value:  a new reference to a str object, or NULL if error.
 * The output string is of the form
 *         "-"? ("0x" | "0X" | "0o")? digit+
 * The base prefix is present only for o, x and X conversions when alt is
 * non-zero.  The case of hex digits will be correct.
 * There will be at least prec digits, zero-filled on the left if
 * necessary to get that many.
 *   val     object to be converted (must satisfy PyLong_Check)
 *   alt     non-zero to keep the base prefix (the caller's F_ALT flag)
 *   prec    minimum number of digits; 0-fill on left if needed
 *   type    a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  o, x and X conversions can produce a '-' sign for negative
 *           values.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* numnondigits + prec is computed in int below and numnondigits is at
       most 3 (sign plus two prefix characters), so reject precisions that
       could overflow that sum. */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    switch (type) {
    default:
        Py_UNREACHABLE();
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        /* Account for the two-character base prefix. */
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        /* Account for the two-character base prefix. */
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    /* The rest of the function does its arithmetic in int. */
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless alt is set: skip two characters and,
       for negative values, re-write the '-' in front of the digits. */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to reach the minimum number of digits.  The
       padded text is built in a temporary bytes object that replaces
       result. */
    if (prec > numdigits) {
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        /* sign and prefix, then the zero fill, then the digits */
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_DECREF(result);
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* If result is the temporary bytes object, or buf no longer points at
       the start of the string data, build a fresh str from buf; otherwise
       only shrink the string when its length changed. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14295
14296 /* Format an integer or a float as an integer.
14297 * Return 1 if the number has been formatted into the writer,
14298 * 0 if the number has been formatted into *p_output
14299 * -1 and raise an exception on error */
14300 static int
mainformatlong(PyObject * v,struct unicode_format_arg_t * arg,PyObject ** p_output,_PyUnicodeWriter * writer)14301 mainformatlong(PyObject *v,
14302 struct unicode_format_arg_t *arg,
14303 PyObject **p_output,
14304 _PyUnicodeWriter *writer)
14305 {
14306 PyObject *iobj, *res;
14307 char type = (char)arg->ch;
14308
14309 if (!PyNumber_Check(v))
14310 goto wrongtype;
14311
14312 /* make sure number is a type of integer for o, x, and X */
14313 if (!PyLong_Check(v)) {
14314 if (type == 'o' || type == 'x' || type == 'X') {
14315 iobj = PyNumber_Index(v);
14316 if (iobj == NULL) {
14317 if (PyErr_ExceptionMatches(PyExc_TypeError))
14318 goto wrongtype;
14319 return -1;
14320 }
14321 }
14322 else {
14323 iobj = PyNumber_Long(v);
14324 if (iobj == NULL ) {
14325 if (PyErr_ExceptionMatches(PyExc_TypeError))
14326 goto wrongtype;
14327 return -1;
14328 }
14329 }
14330 assert(PyLong_Check(iobj));
14331 }
14332 else {
14333 iobj = v;
14334 Py_INCREF(iobj);
14335 }
14336
14337 if (PyLong_CheckExact(v)
14338 && arg->width == -1 && arg->prec == -1
14339 && !(arg->flags & (F_SIGN | F_BLANK))
14340 && type != 'X')
14341 {
14342 /* Fast path */
14343 int alternate = arg->flags & F_ALT;
14344 int base;
14345
14346 switch(type)
14347 {
14348 default:
14349 Py_UNREACHABLE();
14350 case 'd':
14351 case 'i':
14352 case 'u':
14353 base = 10;
14354 break;
14355 case 'o':
14356 base = 8;
14357 break;
14358 case 'x':
14359 case 'X':
14360 base = 16;
14361 break;
14362 }
14363
14364 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14365 Py_DECREF(iobj);
14366 return -1;
14367 }
14368 Py_DECREF(iobj);
14369 return 1;
14370 }
14371
14372 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14373 Py_DECREF(iobj);
14374 if (res == NULL)
14375 return -1;
14376 *p_output = res;
14377 return 0;
14378
14379 wrongtype:
14380 switch(type)
14381 {
14382 case 'o':
14383 case 'x':
14384 case 'X':
14385 PyErr_Format(PyExc_TypeError,
14386 "%%%c format: an integer is required, "
14387 "not %.200s",
14388 type, Py_TYPE(v)->tp_name);
14389 break;
14390 default:
14391 PyErr_Format(PyExc_TypeError,
14392 "%%%c format: a number is required, "
14393 "not %.200s",
14394 type, Py_TYPE(v)->tp_name);
14395 break;
14396 }
14397 return -1;
14398 }
14399
14400 static Py_UCS4
formatchar(PyObject * v)14401 formatchar(PyObject *v)
14402 {
14403 /* presume that the buffer is at least 3 characters long */
14404 if (PyUnicode_Check(v)) {
14405 if (PyUnicode_GET_LENGTH(v) == 1) {
14406 return PyUnicode_READ_CHAR(v, 0);
14407 }
14408 goto onError;
14409 }
14410 else {
14411 PyObject *iobj;
14412 long x;
14413 /* make sure number is a type of integer */
14414 if (!PyLong_Check(v)) {
14415 iobj = PyNumber_Index(v);
14416 if (iobj == NULL) {
14417 goto onError;
14418 }
14419 x = PyLong_AsLong(iobj);
14420 Py_DECREF(iobj);
14421 }
14422 else {
14423 x = PyLong_AsLong(v);
14424 }
14425 if (x == -1 && PyErr_Occurred())
14426 goto onError;
14427
14428 if (x < 0 || x > MAX_UNICODE) {
14429 PyErr_SetString(PyExc_OverflowError,
14430 "%c arg not in range(0x110000)");
14431 return (Py_UCS4) -1;
14432 }
14433
14434 return (Py_UCS4) x;
14435 }
14436
14437 onError:
14438 PyErr_SetString(PyExc_TypeError,
14439 "%c requires int or char");
14440 return (Py_UCS4) -1;
14441 }
14442
/* Parse options of an argument: flags, width, precision.
   Handle also "%(name)" syntax.

   On entry arg->ch holds the character at ctx->fmtpos (set by the caller).
   Every character read here advances ctx->fmtpos and decrements
   ctx->fmtcnt; on success arg->ch holds the conversion character and
   arg->flags, arg->width and arg->prec hold the parsed options.

   Return 0 on success.
   Raise an exception and return -1 on error. */
static int
unicode_format_arg_parse(struct unicode_formatter_t *ctx,
                         struct unicode_format_arg_t *arg)
{
/* Read the format character at the current position without advancing. */
#define FORMAT_READ(ctx) \
        PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)

    PyObject *v;

    if (arg->ch == '(') {
        /* Get argument value from a dictionary. Example: "%(name)s". */
        Py_ssize_t keystart;
        Py_ssize_t keylen;
        PyObject *key;
        int pcount = 1;

        if (ctx->dict == NULL) {
            PyErr_SetString(PyExc_TypeError,
                            "format requires a mapping");
            return -1;
        }
        ++ctx->fmtpos;
        --ctx->fmtcnt;
        keystart = ctx->fmtpos;
        /* Skip over balanced parentheses */
        while (pcount > 0 && --ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            if (arg->ch == ')')
                --pcount;
            else if (arg->ch == '(')
                ++pcount;
            ctx->fmtpos++;
        }
        /* fmtpos is now one past the closing ')' */
        keylen = ctx->fmtpos - keystart - 1;
        if (ctx->fmtcnt < 0 || pcount > 0) {
            PyErr_SetString(PyExc_ValueError,
                            "incomplete format key");
            return -1;
        }
        key = PyUnicode_Substring(ctx->fmtstr,
                                  keystart, keystart + keylen);
        if (key == NULL)
            return -1;
        /* Drop the value fetched for a previous "%(key)" specifier. */
        if (ctx->args_owned) {
            ctx->args_owned = 0;
            Py_DECREF(ctx->args);
        }
        ctx->args = PyObject_GetItem(ctx->dict, key);
        Py_DECREF(key);
        if (ctx->args == NULL)
            return -1;
        /* The looked-up value becomes a single, owned argument. */
        ctx->args_owned = 1;
        ctx->arglen = -1;
        ctx->argidx = -2;
    }

    /* Parse flags. Example: "%+i" => flags=F_SIGN. */
    while (--ctx->fmtcnt >= 0) {
        arg->ch = FORMAT_READ(ctx);
        ctx->fmtpos++;
        switch (arg->ch) {
        case '-': arg->flags |= F_LJUST; continue;
        case '+': arg->flags |= F_SIGN; continue;
        case ' ': arg->flags |= F_BLANK; continue;
        case '#': arg->flags |= F_ALT; continue;
        case '0': arg->flags |= F_ZERO; continue;
        }
        break;
    }

    /* Parse width. Example: "%10s" => width=10 */
    if (arg->ch == '*') {
        /* "*": the width is taken from the next argument */
        v = unicode_format_getnextarg(ctx);
        if (v == NULL)
            return -1;
        if (!PyLong_Check(v)) {
            PyErr_SetString(PyExc_TypeError,
                            "* wants int");
            return -1;
        }
        arg->width = PyLong_AsSsize_t(v);
        if (arg->width == -1 && PyErr_Occurred())
            return -1;
        /* A negative width means left-justify with the absolute value.
           NOTE(review): -arg->width is not representable when width is
           PY_SSIZE_T_MIN -- confirm whether a guard is needed. */
        if (arg->width < 0) {
            arg->flags |= F_LJUST;
            arg->width = -arg->width;
        }
        if (--ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            ctx->fmtpos++;
        }
    }
    else if (arg->ch >= '0' && arg->ch <= '9') {
        arg->width = arg->ch - '0';
        while (--ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            ctx->fmtpos++;
            if (arg->ch < '0' || arg->ch > '9')
                break;
            /* Since arg->ch is unsigned, the RHS would end up as unsigned,
               mixing signed and unsigned comparison. Since arg->ch is between
               '0' and '9', casting to int is safe. */
            if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return -1;
            }
            arg->width = arg->width*10 + (arg->ch - '0');
        }
    }

    /* Parse precision. Example: "%.3f" => prec=3 */
    if (arg->ch == '.') {
        /* A bare "." means precision 0 */
        arg->prec = 0;
        if (--ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            ctx->fmtpos++;
        }
        if (arg->ch == '*') {
            /* ".*": the precision is taken from the next argument */
            v = unicode_format_getnextarg(ctx);
            if (v == NULL)
                return -1;
            if (!PyLong_Check(v)) {
                PyErr_SetString(PyExc_TypeError,
                                "* wants int");
                return -1;
            }
            arg->prec = _PyLong_AsInt(v);
            if (arg->prec == -1 && PyErr_Occurred())
                return -1;
            /* A negative precision is treated as 0 */
            if (arg->prec < 0)
                arg->prec = 0;
            if (--ctx->fmtcnt >= 0) {
                arg->ch = FORMAT_READ(ctx);
                ctx->fmtpos++;
            }
        }
        else if (arg->ch >= '0' && arg->ch <= '9') {
            arg->prec = arg->ch - '0';
            while (--ctx->fmtcnt >= 0) {
                arg->ch = FORMAT_READ(ctx);
                ctx->fmtpos++;
                if (arg->ch < '0' || arg->ch > '9')
                    break;
                if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return -1;
                }
                arg->prec = arg->prec*10 + (arg->ch - '0');
            }
        }
    }

    /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
    if (ctx->fmtcnt >= 0) {
        if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
            if (--ctx->fmtcnt >= 0) {
                arg->ch = FORMAT_READ(ctx);
                ctx->fmtpos++;
            }
        }
    }
    /* The format string ended before a conversion character was read */
    if (ctx->fmtcnt < 0) {
        PyErr_SetString(PyExc_ValueError,
                        "incomplete format");
        return -1;
    }
    return 0;

#undef FORMAT_READ
}
14621
14622 /* Format one argument. Supported conversion specifiers:
14623
14624 - "s", "r", "a": any type
14625 - "i", "d", "u": int or float
14626 - "o", "x", "X": int
14627 - "e", "E", "f", "F", "g", "G": float
14628 - "c": int or str (1 character)
14629
14630 When possible, the output is written directly into the Unicode writer
14631 (ctx->writer). A string is created when padding is required.
14632
14633 Return 0 if the argument has been formatted into *p_str,
14634 1 if the argument has been written into ctx->writer,
14635 -1 on error. */
14636 static int
unicode_format_arg_format(struct unicode_formatter_t * ctx,struct unicode_format_arg_t * arg,PyObject ** p_str)14637 unicode_format_arg_format(struct unicode_formatter_t *ctx,
14638 struct unicode_format_arg_t *arg,
14639 PyObject **p_str)
14640 {
14641 PyObject *v;
14642 _PyUnicodeWriter *writer = &ctx->writer;
14643
14644 if (ctx->fmtcnt == 0)
14645 ctx->writer.overallocate = 0;
14646
14647 v = unicode_format_getnextarg(ctx);
14648 if (v == NULL)
14649 return -1;
14650
14651
14652 switch (arg->ch) {
14653 case 's':
14654 case 'r':
14655 case 'a':
14656 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14657 /* Fast path */
14658 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14659 return -1;
14660 return 1;
14661 }
14662
14663 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14664 *p_str = v;
14665 Py_INCREF(*p_str);
14666 }
14667 else {
14668 if (arg->ch == 's')
14669 *p_str = PyObject_Str(v);
14670 else if (arg->ch == 'r')
14671 *p_str = PyObject_Repr(v);
14672 else
14673 *p_str = PyObject_ASCII(v);
14674 }
14675 break;
14676
14677 case 'i':
14678 case 'd':
14679 case 'u':
14680 case 'o':
14681 case 'x':
14682 case 'X':
14683 {
14684 int ret = mainformatlong(v, arg, p_str, writer);
14685 if (ret != 0)
14686 return ret;
14687 arg->sign = 1;
14688 break;
14689 }
14690
14691 case 'e':
14692 case 'E':
14693 case 'f':
14694 case 'F':
14695 case 'g':
14696 case 'G':
14697 if (arg->width == -1 && arg->prec == -1
14698 && !(arg->flags & (F_SIGN | F_BLANK)))
14699 {
14700 /* Fast path */
14701 if (formatfloat(v, arg, NULL, writer) == -1)
14702 return -1;
14703 return 1;
14704 }
14705
14706 arg->sign = 1;
14707 if (formatfloat(v, arg, p_str, NULL) == -1)
14708 return -1;
14709 break;
14710
14711 case 'c':
14712 {
14713 Py_UCS4 ch = formatchar(v);
14714 if (ch == (Py_UCS4) -1)
14715 return -1;
14716 if (arg->width == -1 && arg->prec == -1) {
14717 /* Fast path */
14718 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14719 return -1;
14720 return 1;
14721 }
14722 *p_str = PyUnicode_FromOrdinal(ch);
14723 break;
14724 }
14725
14726 default:
14727 PyErr_Format(PyExc_ValueError,
14728 "unsupported format character '%c' (0x%x) "
14729 "at index %zd",
14730 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14731 (int)arg->ch,
14732 ctx->fmtpos - 1);
14733 return -1;
14734 }
14735 if (*p_str == NULL)
14736 return -1;
14737 assert (PyUnicode_Check(*p_str));
14738 return 0;
14739 }
14740
/* Write the already formatted string str into ctx->writer, applying the
   options stored in arg: truncation to arg->prec for "s", "r" and "a",
   sign handling for numeric conversions (arg->sign), the "0x"/"0X"/"0o"
   prefix of the alternate form, and padding up to arg->width on the left
   or (with F_LJUST) on the right.

   arg->width and arg->sign are modified while the output is produced.

   Return 0 on success, raise an exception and return -1 on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero padding only applies to conversions that set arg->sign. */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path: no padding, truncation or sign to add */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width: a leading '-' or '+' of str is split off into
       signchar (len and pindex then describe the remaining characters);
       otherwise F_SIGN / F_BLANK select the character to add, and with
       neither set no sign is written. */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer: work out the largest character that will be
       written (fill character and the characters of str) and reserve
       room for the whole field. */
    maxchar = writer->maxchar;
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    buflen = arg->width;
    /* One extra slot for the sign when the digits alone fill the width */
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed.  When padding with spaces the sign is
       written later, after the padding; its slot is still deducted from
       the width here. */
    if (arg->sign) {
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The two prefix characters are accounted for separately from
           the digits in both fill modes. */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right if needed; the right-hand padding is always spaces */
    if (arg->width > len) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
14895
14896 /* Helper of PyUnicode_Format(): format one arg.
14897 Return 0 on success, raise an exception and return -1 on error. */
14898 static int
unicode_format_arg(struct unicode_formatter_t * ctx)14899 unicode_format_arg(struct unicode_formatter_t *ctx)
14900 {
14901 struct unicode_format_arg_t arg;
14902 PyObject *str;
14903 int ret;
14904
14905 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14906 if (arg.ch == '%') {
14907 ctx->fmtpos++;
14908 ctx->fmtcnt--;
14909 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14910 return -1;
14911 return 0;
14912 }
14913 arg.flags = 0;
14914 arg.width = -1;
14915 arg.prec = -1;
14916 arg.sign = 0;
14917 str = NULL;
14918
14919 ret = unicode_format_arg_parse(ctx, &arg);
14920 if (ret == -1)
14921 return -1;
14922
14923 ret = unicode_format_arg_format(ctx, &arg, &str);
14924 if (ret == -1)
14925 return -1;
14926
14927 if (ret != 1) {
14928 ret = unicode_format_arg_output(ctx, &arg, str);
14929 Py_DECREF(str);
14930 if (ret == -1)
14931 return -1;
14932 }
14933
14934 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
14935 PyErr_SetString(PyExc_TypeError,
14936 "not all arguments converted during string formatting");
14937 return -1;
14938 }
14939 return 0;
14940 }
14941
14942 PyObject *
PyUnicode_Format(PyObject * format,PyObject * args)14943 PyUnicode_Format(PyObject *format, PyObject *args)
14944 {
14945 struct unicode_formatter_t ctx;
14946
14947 if (format == NULL || args == NULL) {
14948 PyErr_BadInternalCall();
14949 return NULL;
14950 }
14951
14952 if (ensure_unicode(format) < 0)
14953 return NULL;
14954
14955 ctx.fmtstr = format;
14956 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14957 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14958 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14959 ctx.fmtpos = 0;
14960
14961 _PyUnicodeWriter_Init(&ctx.writer);
14962 ctx.writer.min_length = ctx.fmtcnt + 100;
14963 ctx.writer.overallocate = 1;
14964
14965 if (PyTuple_Check(args)) {
14966 ctx.arglen = PyTuple_Size(args);
14967 ctx.argidx = 0;
14968 }
14969 else {
14970 ctx.arglen = -1;
14971 ctx.argidx = -2;
14972 }
14973 ctx.args_owned = 0;
14974 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
14975 ctx.dict = args;
14976 else
14977 ctx.dict = NULL;
14978 ctx.args = args;
14979
14980 while (--ctx.fmtcnt >= 0) {
14981 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14982 Py_ssize_t nonfmtpos;
14983
14984 nonfmtpos = ctx.fmtpos++;
14985 while (ctx.fmtcnt >= 0 &&
14986 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14987 ctx.fmtpos++;
14988 ctx.fmtcnt--;
14989 }
14990 if (ctx.fmtcnt < 0) {
14991 ctx.fmtpos--;
14992 ctx.writer.overallocate = 0;
14993 }
14994
14995 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14996 nonfmtpos, ctx.fmtpos) < 0)
14997 goto onError;
14998 }
14999 else {
15000 ctx.fmtpos++;
15001 if (unicode_format_arg(&ctx) == -1)
15002 goto onError;
15003 }
15004 }
15005
15006 if (ctx.argidx < ctx.arglen && !ctx.dict) {
15007 PyErr_SetString(PyExc_TypeError,
15008 "not all arguments converted during string formatting");
15009 goto onError;
15010 }
15011
15012 if (ctx.args_owned) {
15013 Py_DECREF(ctx.args);
15014 }
15015 return _PyUnicodeWriter_Finish(&ctx.writer);
15016
15017 onError:
15018 _PyUnicodeWriter_Dealloc(&ctx.writer);
15019 if (ctx.args_owned) {
15020 Py_DECREF(ctx.args);
15021 }
15022 return NULL;
15023 }
15024
15025 static PyObject *
15026 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15027
15028 static PyObject *
unicode_new(PyTypeObject * type,PyObject * args,PyObject * kwds)15029 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15030 {
15031 PyObject *x = NULL;
15032 static char *kwlist[] = {"object", "encoding", "errors", 0};
15033 char *encoding = NULL;
15034 char *errors = NULL;
15035
15036 if (type != &PyUnicode_Type)
15037 return unicode_subtype_new(type, args, kwds);
15038 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
15039 kwlist, &x, &encoding, &errors))
15040 return NULL;
15041 if (x == NULL)
15042 _Py_RETURN_UNICODE_EMPTY();
15043 if (encoding == NULL && errors == NULL)
15044 return PyObject_Str(x);
15045 else
15046 return PyUnicode_FromEncodedObject(x, encoding, errors);
15047 }
15048
15049 static PyObject *
unicode_subtype_new(PyTypeObject * type,PyObject * args,PyObject * kwds)15050 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15051 {
15052 PyObject *unicode, *self;
15053 Py_ssize_t length, char_size;
15054 int share_wstr, share_utf8;
15055 unsigned int kind;
15056 void *data;
15057
15058 assert(PyType_IsSubtype(type, &PyUnicode_Type));
15059
15060 unicode = unicode_new(&PyUnicode_Type, args, kwds);
15061 if (unicode == NULL)
15062 return NULL;
15063 assert(_PyUnicode_CHECK(unicode));
15064 if (PyUnicode_READY(unicode) == -1) {
15065 Py_DECREF(unicode);
15066 return NULL;
15067 }
15068
15069 self = type->tp_alloc(type, 0);
15070 if (self == NULL) {
15071 Py_DECREF(unicode);
15072 return NULL;
15073 }
15074 kind = PyUnicode_KIND(unicode);
15075 length = PyUnicode_GET_LENGTH(unicode);
15076
15077 _PyUnicode_LENGTH(self) = length;
15078 #ifdef Py_DEBUG
15079 _PyUnicode_HASH(self) = -1;
15080 #else
15081 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15082 #endif
15083 _PyUnicode_STATE(self).interned = 0;
15084 _PyUnicode_STATE(self).kind = kind;
15085 _PyUnicode_STATE(self).compact = 0;
15086 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
15087 _PyUnicode_STATE(self).ready = 1;
15088 _PyUnicode_WSTR(self) = NULL;
15089 _PyUnicode_UTF8_LENGTH(self) = 0;
15090 _PyUnicode_UTF8(self) = NULL;
15091 _PyUnicode_WSTR_LENGTH(self) = 0;
15092 _PyUnicode_DATA_ANY(self) = NULL;
15093
15094 share_utf8 = 0;
15095 share_wstr = 0;
15096 if (kind == PyUnicode_1BYTE_KIND) {
15097 char_size = 1;
15098 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15099 share_utf8 = 1;
15100 }
15101 else if (kind == PyUnicode_2BYTE_KIND) {
15102 char_size = 2;
15103 if (sizeof(wchar_t) == 2)
15104 share_wstr = 1;
15105 }
15106 else {
15107 assert(kind == PyUnicode_4BYTE_KIND);
15108 char_size = 4;
15109 if (sizeof(wchar_t) == 4)
15110 share_wstr = 1;
15111 }
15112
15113 /* Ensure we won't overflow the length. */
15114 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15115 PyErr_NoMemory();
15116 goto onError;
15117 }
15118 data = PyObject_MALLOC((length + 1) * char_size);
15119 if (data == NULL) {
15120 PyErr_NoMemory();
15121 goto onError;
15122 }
15123
15124 _PyUnicode_DATA_ANY(self) = data;
15125 if (share_utf8) {
15126 _PyUnicode_UTF8_LENGTH(self) = length;
15127 _PyUnicode_UTF8(self) = data;
15128 }
15129 if (share_wstr) {
15130 _PyUnicode_WSTR_LENGTH(self) = length;
15131 _PyUnicode_WSTR(self) = (wchar_t *)data;
15132 }
15133
15134 memcpy(data, PyUnicode_DATA(unicode),
15135 kind * (length + 1));
15136 assert(_PyUnicode_CheckConsistency(self, 1));
15137 #ifdef Py_DEBUG
15138 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15139 #endif
15140 Py_DECREF(unicode);
15141 return self;
15142
15143 onError:
15144 Py_DECREF(unicode);
15145 Py_DECREF(self);
15146 return NULL;
15147 }
15148
/* Docstring of the str type; installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
15160
/* Defined later in the file; needed here for the tp_iter slot. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of str.  It can be subclassed (Py_TPFLAGS_BASETYPE) and
   carries Py_TPFLAGS_UNICODE_SUBCLASS; instances are created through
   unicode_new(). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                              /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    unicode_repr,                       /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,        /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    PyUnicode_RichCompare,              /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    unicode_iter,                       /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseObject_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
15206
15207 /* Initialize the Unicode implementation */
15208
_PyUnicode_Init(void)15209 int _PyUnicode_Init(void)
15210 {
15211 /* XXX - move this array to unicodectype.c ? */
15212 Py_UCS2 linebreak[] = {
15213 0x000A, /* LINE FEED */
15214 0x000D, /* CARRIAGE RETURN */
15215 0x001C, /* FILE SEPARATOR */
15216 0x001D, /* GROUP SEPARATOR */
15217 0x001E, /* RECORD SEPARATOR */
15218 0x0085, /* NEXT LINE */
15219 0x2028, /* LINE SEPARATOR */
15220 0x2029, /* PARAGRAPH SEPARATOR */
15221 };
15222
15223 /* Init the implementation */
15224 _Py_INCREF_UNICODE_EMPTY();
15225 if (!unicode_empty)
15226 Py_FatalError("Can't create empty string");
15227 Py_DECREF(unicode_empty);
15228
15229 if (PyType_Ready(&PyUnicode_Type) < 0)
15230 Py_FatalError("Can't initialize 'unicode'");
15231
15232 /* initialize the linebreak bloom filter */
15233 bloom_linebreak = make_bloom_mask(
15234 PyUnicode_2BYTE_KIND, linebreak,
15235 Py_ARRAY_LENGTH(linebreak));
15236
15237 if (PyType_Ready(&EncodingMapType) < 0)
15238 Py_FatalError("Can't initialize encoding map type");
15239
15240 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15241 Py_FatalError("Can't initialize field name iterator type");
15242
15243 if (PyType_Ready(&PyFormatterIter_Type) < 0)
15244 Py_FatalError("Can't initialize formatter iter type");
15245
15246 return 0;
15247 }
15248
15249 /* Finalize the Unicode implementation */
15250
/* Report how many free-list entries were released.  This implementation
   releases nothing, so the answer is always 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
15256
15257 void
_PyUnicode_Fini(void)15258 _PyUnicode_Fini(void)
15259 {
15260 int i;
15261
15262 Py_CLEAR(unicode_empty);
15263
15264 for (i = 0; i < 256; i++)
15265 Py_CLEAR(unicode_latin1[i]);
15266 _PyUnicode_ClearStaticStrings();
15267 (void)PyUnicode_ClearFreeList();
15268 }
15269
15270 void
PyUnicode_InternInPlace(PyObject ** p)15271 PyUnicode_InternInPlace(PyObject **p)
15272 {
15273 PyObject *s = *p;
15274 PyObject *t;
15275 #ifdef Py_DEBUG
15276 assert(s != NULL);
15277 assert(_PyUnicode_CHECK(s));
15278 #else
15279 if (s == NULL || !PyUnicode_Check(s))
15280 return;
15281 #endif
15282 /* If it's a subclass, we don't really know what putting
15283 it in the interned dict might do. */
15284 if (!PyUnicode_CheckExact(s))
15285 return;
15286 if (PyUnicode_CHECK_INTERNED(s))
15287 return;
15288 if (interned == NULL) {
15289 interned = PyDict_New();
15290 if (interned == NULL) {
15291 PyErr_Clear(); /* Don't leave an exception */
15292 return;
15293 }
15294 }
15295 Py_ALLOW_RECURSION
15296 t = PyDict_SetDefault(interned, s, s);
15297 Py_END_ALLOW_RECURSION
15298 if (t == NULL) {
15299 PyErr_Clear();
15300 return;
15301 }
15302 if (t != s) {
15303 Py_INCREF(t);
15304 Py_SETREF(*p, t);
15305 return;
15306 }
15307 /* The two references in interned are not counted by refcnt.
15308 The deallocator will take care of this */
15309 Py_REFCNT(s) -= 2;
15310 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
15311 }
15312
15313 void
PyUnicode_InternImmortal(PyObject ** p)15314 PyUnicode_InternImmortal(PyObject **p)
15315 {
15316 PyUnicode_InternInPlace(p);
15317 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15318 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15319 Py_INCREF(*p);
15320 }
15321 }
15322
15323 PyObject *
PyUnicode_InternFromString(const char * cp)15324 PyUnicode_InternFromString(const char *cp)
15325 {
15326 PyObject *s = PyUnicode_FromString(cp);
15327 if (s == NULL)
15328 return NULL;
15329 PyUnicode_InternInPlace(&s);
15330 return s;
15331 }
15332
15333 void
_Py_ReleaseInternedUnicodeStrings(void)15334 _Py_ReleaseInternedUnicodeStrings(void)
15335 {
15336 PyObject *keys;
15337 PyObject *s;
15338 Py_ssize_t i, n;
15339 Py_ssize_t immortal_size = 0, mortal_size = 0;
15340
15341 if (interned == NULL || !PyDict_Check(interned))
15342 return;
15343 keys = PyDict_Keys(interned);
15344 if (keys == NULL || !PyList_Check(keys)) {
15345 PyErr_Clear();
15346 return;
15347 }
15348
15349 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15350 detector, interned unicode strings are not forcibly deallocated;
15351 rather, we give them their stolen references back, and then clear
15352 and DECREF the interned dict. */
15353
15354 n = PyList_GET_SIZE(keys);
15355 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
15356 n);
15357 for (i = 0; i < n; i++) {
15358 s = PyList_GET_ITEM(keys, i);
15359 if (PyUnicode_READY(s) == -1) {
15360 Py_UNREACHABLE();
15361 }
15362 switch (PyUnicode_CHECK_INTERNED(s)) {
15363 case SSTATE_NOT_INTERNED:
15364 /* XXX Shouldn't happen */
15365 break;
15366 case SSTATE_INTERNED_IMMORTAL:
15367 Py_REFCNT(s) += 1;
15368 immortal_size += PyUnicode_GET_LENGTH(s);
15369 break;
15370 case SSTATE_INTERNED_MORTAL:
15371 Py_REFCNT(s) += 2;
15372 mortal_size += PyUnicode_GET_LENGTH(s);
15373 break;
15374 default:
15375 Py_FatalError("Inconsistent interned string state.");
15376 }
15377 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
15378 }
15379 fprintf(stderr, "total size of all interned strings: "
15380 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15381 "mortal/immortal\n", mortal_size, immortal_size);
15382 Py_DECREF(keys);
15383 PyDict_Clear(interned);
15384 Py_CLEAR(interned);
15385 }
15386
15387
15388 /********************* Unicode Iterator **************************/
15389
/* Instance layout of the str iterator created by unicode_iter(). */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* Position of the next character to return */
    PyObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15395
15396 static void
unicodeiter_dealloc(unicodeiterobject * it)15397 unicodeiter_dealloc(unicodeiterobject *it)
15398 {
15399 _PyObject_GC_UNTRACK(it);
15400 Py_XDECREF(it->it_seq);
15401 PyObject_GC_Del(it);
15402 }
15403
15404 static int
unicodeiter_traverse(unicodeiterobject * it,visitproc visit,void * arg)15405 unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15406 {
15407 Py_VISIT(it->it_seq);
15408 return 0;
15409 }
15410
15411 static PyObject *
unicodeiter_next(unicodeiterobject * it)15412 unicodeiter_next(unicodeiterobject *it)
15413 {
15414 PyObject *seq, *item;
15415
15416 assert(it != NULL);
15417 seq = it->it_seq;
15418 if (seq == NULL)
15419 return NULL;
15420 assert(_PyUnicode_CHECK(seq));
15421
15422 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15423 int kind = PyUnicode_KIND(seq);
15424 void *data = PyUnicode_DATA(seq);
15425 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15426 item = PyUnicode_FromOrdinal(chr);
15427 if (item != NULL)
15428 ++it->it_index;
15429 return item;
15430 }
15431
15432 it->it_seq = NULL;
15433 Py_DECREF(seq);
15434 return NULL;
15435 }
15436
15437 static PyObject *
unicodeiter_len(unicodeiterobject * it)15438 unicodeiter_len(unicodeiterobject *it)
15439 {
15440 Py_ssize_t len = 0;
15441 if (it->it_seq)
15442 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15443 return PyLong_FromSsize_t(len);
15444 }
15445
15446 PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15447
15448 static PyObject *
unicodeiter_reduce(unicodeiterobject * it)15449 unicodeiter_reduce(unicodeiterobject *it)
15450 {
15451 if (it->it_seq != NULL) {
15452 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
15453 it->it_seq, it->it_index);
15454 } else {
15455 PyObject *u = (PyObject *)_PyUnicode_New(0);
15456 if (u == NULL)
15457 return NULL;
15458 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
15459 }
15460 }
15461
15462 PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15463
15464 static PyObject *
unicodeiter_setstate(unicodeiterobject * it,PyObject * state)15465 unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15466 {
15467 Py_ssize_t index = PyLong_AsSsize_t(state);
15468 if (index == -1 && PyErr_Occurred())
15469 return NULL;
15470 if (it->it_seq != NULL) {
15471 if (index < 0)
15472 index = 0;
15473 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15474 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15475 it->it_index = index;
15476 }
15477 Py_RETURN_NONE;
15478 }
15479
15480 PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15481
/* Methods exposed on str_iterator objects.  __length_hint__ and __reduce__
   take no arguments (METH_NOARGS); __setstate__ takes exactly one
   (METH_O). */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,              NULL}           /* sentinel */
};
15491
/* Type object for "str_iterator".  The initializer is positional, so the
   entries must stay in PyTypeObject declaration order; slots after the
   last listed one are left zero-initialized. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
15524
15525 static PyObject *
unicode_iter(PyObject * seq)15526 unicode_iter(PyObject *seq)
15527 {
15528 unicodeiterobject *it;
15529
15530 if (!PyUnicode_Check(seq)) {
15531 PyErr_BadInternalCall();
15532 return NULL;
15533 }
15534 if (PyUnicode_READY(seq) == -1)
15535 return NULL;
15536 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15537 if (it == NULL)
15538 return NULL;
15539 it->it_index = 0;
15540 Py_INCREF(seq);
15541 it->it_seq = seq;
15542 _PyObject_GC_TRACK(it);
15543 return (PyObject *)it;
15544 }
15545
15546
15547 size_t
Py_UNICODE_strlen(const Py_UNICODE * u)15548 Py_UNICODE_strlen(const Py_UNICODE *u)
15549 {
15550 return wcslen(u);
15551 }
15552
15553 Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE * s1,const Py_UNICODE * s2)15554 Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15555 {
15556 Py_UNICODE *u = s1;
15557 while ((*u++ = *s2++));
15558 return s1;
15559 }
15560
15561 Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE * s1,const Py_UNICODE * s2,size_t n)15562 Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15563 {
15564 Py_UNICODE *u = s1;
15565 while ((*u++ = *s2++))
15566 if (n-- == 0)
15567 break;
15568 return s1;
15569 }
15570
15571 Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE * s1,const Py_UNICODE * s2)15572 Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15573 {
15574 Py_UNICODE *u1 = s1;
15575 u1 += wcslen(u1);
15576 while ((*u1++ = *s2++));
15577 return s1;
15578 }
15579
15580 int
Py_UNICODE_strcmp(const Py_UNICODE * s1,const Py_UNICODE * s2)15581 Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15582 {
15583 while (*s1 && *s2 && *s1 == *s2)
15584 s1++, s2++;
15585 if (*s1 && *s2)
15586 return (*s1 < *s2) ? -1 : +1;
15587 if (*s1)
15588 return 1;
15589 if (*s2)
15590 return -1;
15591 return 0;
15592 }
15593
15594 int
Py_UNICODE_strncmp(const Py_UNICODE * s1,const Py_UNICODE * s2,size_t n)15595 Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15596 {
15597 Py_UNICODE u1, u2;
15598 for (; n != 0; n--) {
15599 u1 = *s1;
15600 u2 = *s2;
15601 if (u1 != u2)
15602 return (u1 < u2) ? -1 : +1;
15603 if (u1 == '\0')
15604 return 0;
15605 s1++;
15606 s2++;
15607 }
15608 return 0;
15609 }
15610
15611 Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE * s,Py_UNICODE c)15612 Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15613 {
15614 const Py_UNICODE *p;
15615 for (p = s; *p; p++)
15616 if (*p == c)
15617 return (Py_UNICODE*)p;
15618 return NULL;
15619 }
15620
15621 Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE * s,Py_UNICODE c)15622 Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15623 {
15624 const Py_UNICODE *p;
15625 p = s + wcslen(s);
15626 while (p != s) {
15627 p--;
15628 if (*p == c)
15629 return (Py_UNICODE*)p;
15630 }
15631 return NULL;
15632 }
15633
15634 Py_UNICODE*
PyUnicode_AsUnicodeCopy(PyObject * unicode)15635 PyUnicode_AsUnicodeCopy(PyObject *unicode)
15636 {
15637 Py_UNICODE *u, *copy;
15638 Py_ssize_t len, size;
15639
15640 if (!PyUnicode_Check(unicode)) {
15641 PyErr_BadArgument();
15642 return NULL;
15643 }
15644 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
15645 if (u == NULL)
15646 return NULL;
15647 /* Ensure we won't overflow the size. */
15648 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
15649 PyErr_NoMemory();
15650 return NULL;
15651 }
15652 size = len + 1; /* copy the null character */
15653 size *= sizeof(Py_UNICODE);
15654 copy = PyMem_Malloc(size);
15655 if (copy == NULL) {
15656 PyErr_NoMemory();
15657 return NULL;
15658 }
15659 memcpy(copy, u, size);
15660 return copy;
15661 }
15662
15663 /* A _string module, to export formatter_parser and formatter_field_name_split
15664 to the string.Formatter class implemented in Python. */
15665
/* Functions exported by the _string module.  Both take exactly one
   argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
15673
/* Module definition for _string.  Positional initializer; the field names
   in the comments follow the PyModuleDef layout documented in the CPython
   C API. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,                  /* m_base */
    "_string",                              /* m_name */
    PyDoc_STR("string helper module"),      /* m_doc */
    0,                                      /* m_size */
    _string_methods,                        /* m_methods */
    NULL,                                   /* m_slots */
    NULL,                                   /* m_traverse */
    NULL,                                   /* m_clear */
    NULL                                    /* m_free */
};
15685
15686 PyMODINIT_FUNC
PyInit__string(void)15687 PyInit__string(void)
15688 {
15689 return PyModule_Create(&_string_module);
15690 }
15691
15692
15693 #ifdef __cplusplus
15694 }
15695 #endif
15696