1 /* stringlib: codec implementations */
2
/* This header is a template that only makes sense when instantiated for a
   Unicode kind; refuse to compile otherwise. */
#if !STRINGLIB_IS_UNICODE
# error "codecs.h is specific to Unicode"
#endif

/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char.  The mask has bit 7 (0x80) set in every
   byte of the word, so (value & ASCII_CHAR_MASK) is non-zero as soon as
   at least one byte of `value` is >= 0x80. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif

/* 10xxxxxx */
/* True when `ch` lies in 0x80..0xBF, i.e. has the bit pattern of a UTF-8
   continuation byte.  Note that the argument is evaluated twice. */
#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)
19
/* Decode UTF-8 bytes from *inptr (up to, but not including, `end`) into
   dest[*outpos...].

   On return *inptr is set to the first byte that was not consumed and
   *outpos to the number of characters stored in `dest` so far.

   Return value:
     0  - the loop stopped normally: either all input was consumed or the
          input ends in the middle of a multi-byte sequence (in which case
          *inptr points at the start of that incomplete sequence);
     1  - invalid start byte at *inptr;
     2, 3, 4 - the 1st, 2nd or 3rd continuation byte of the sequence that
          starts at *inptr is invalid;
     any other value - a successfully decoded code point that does not fit
          in STRINGLIB_MAX_CHAR.  In that case *inptr already points past
          the sequence and the code point has NOT been written to `dest`.

   NOTE(review): `dest` must be large enough for everything that can be
   written; no bound on the output is checked here. */
Py_LOCAL_INLINE(Py_UCS4)
STRINGLIB(utf8_decode)(const char **inptr, const char *end,
                       STRINGLIB_CHAR *dest,
                       Py_ssize_t *outpos)
{
    Py_UCS4 ch;
    const char *s = *inptr;
    /* Last address at which a whole aligned 'long' read stays below `end`. */
    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
    STRINGLIB_CHAR *p = dest + *outpos;

    while (s < end) {
        ch = (unsigned char)*s;

        if (ch < 0x80) {
            /* Fast path for runs of ASCII characters. Given that common UTF-8
               input will consist of an overwhelming majority of ASCII
               characters, we try to optimize for this case by checking
               as many characters as a C 'long' can contain.
               First, check if we can do an aligned read, as most CPUs have
               a penalty for unaligned reads.
            */
            if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) {
                /* Help register allocation */
                const char *_s = s;
                STRINGLIB_CHAR *_p = p;
                while (_s < aligned_end) {
                    /* Read a whole long at a time (either 4 or 8 bytes),
                       and do a fast unrolled copy if it only contains ASCII
                       characters. */
                    unsigned long value = *(const unsigned long *) _s;
                    if (value & ASCII_CHAR_MASK)
                        break;
                    /* Extract the bytes in memory order: the lowest address
                       is the least significant byte on little-endian
                       machines and the most significant one otherwise. */
#if PY_LITTLE_ENDIAN
                    _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
                    _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
                    _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
                    _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
# if SIZEOF_LONG == 8
                    _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
                    _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
                    _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
                    _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
# endif
#else
# if SIZEOF_LONG == 8
                    _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
                    _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
                    _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
                    _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
                    _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
                    _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
                    _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
                    _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
# else
                    _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
                    _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
                    _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
                    _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
# endif
#endif
                    _s += SIZEOF_LONG;
                    _p += SIZEOF_LONG;
                }
                s = _s;
                p = _p;
                if (s == end)
                    break;
                /* The word loop may have advanced s: reload the current
                   byte, which may or may not be ASCII. */
                ch = (unsigned char)*s;
            }
            if (ch < 0x80) {
                s++;
                *p++ = ch;
                continue;
            }
        }

        if (ch < 0xE0) {
            /* \xC2\x80-\xDF\xBF -- 0080-07FF */
            Py_UCS4 ch2;
            if (ch < 0xC2) {
                /* invalid sequence
                \x80-\xBF -- continuation byte
                \xC0-\xC1 -- fake 0000-007F */
                goto InvalidStart;
            }
            if (end - s < 2) {
                /* unexpected end of data: the caller will decide whether
                   it's an error or not */
                break;
            }
            ch2 = (unsigned char)s[1];
            if (!IS_CONTINUATION_BYTE(ch2))
                /* invalid continuation byte */
                goto InvalidContinuation1;
            /* Combine the payload bits and strip the 110xxxxx / 10xxxxxx
               marker bits of both bytes in a single subtraction. */
            ch = (ch << 6) + ch2 -
                 ((0xC0 << 6) + 0x80);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            s += 2;
            if (STRINGLIB_MAX_CHAR <= 0x007F ||
                (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
                /* Out-of-range */
                goto Return;
            *p++ = ch;
            continue;
        }

        if (ch < 0xF0) {
            /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
            Py_UCS4 ch2, ch3;
            if (end - s < 3) {
                /* unexpected end of data: the caller will decide whether
                   it's an error or not */
                if (end - s < 2)
                    break;
                /* Two bytes are available: still report a second byte that
                   can never be valid, rather than an incomplete sequence. */
                ch2 = (unsigned char)s[1];
                if (!IS_CONTINUATION_BYTE(ch2) ||
                    (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED))
                    /* for clarification see comments below */
                    goto InvalidContinuation1;
                break;
            }
            ch2 = (unsigned char)s[1];
            ch3 = (unsigned char)s[2];
            if (!IS_CONTINUATION_BYTE(ch2)) {
                /* invalid continuation byte */
                goto InvalidContinuation1;
            }
            if (ch == 0xE0) {
                if (ch2 < 0xA0)
                    /* invalid sequence
                       \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
                    goto InvalidContinuation1;
            } else if (ch == 0xED && ch2 >= 0xA0) {
                /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
                   will result in surrogates in range D800-DFFF. Surrogates are
                   not valid UTF-8 so they are rejected.
                   See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
                   (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
                goto InvalidContinuation1;
            }
            if (!IS_CONTINUATION_BYTE(ch3)) {
                /* invalid continuation byte */
                goto InvalidContinuation2;
            }
            ch = (ch << 12) + (ch2 << 6) + ch3 -
                 ((0xE0 << 12) + (0x80 << 6) + 0x80);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            s += 3;
            if (STRINGLIB_MAX_CHAR <= 0x07FF ||
                (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
                /* Out-of-range */
                goto Return;
            *p++ = ch;
            continue;
        }

        if (ch < 0xF5) {
            /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
            Py_UCS4 ch2, ch3, ch4;
            if (end - s < 4) {
                /* unexpected end of data: the caller will decide whether
                   it's an error or not */
                if (end - s < 2)
                    break;
                /* Validate whichever continuation bytes are present so that
                   a definitely-invalid prefix is reported as an error. */
                ch2 = (unsigned char)s[1];
                if (!IS_CONTINUATION_BYTE(ch2) ||
                    (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4))
                    /* for clarification see comments below */
                    goto InvalidContinuation1;
                if (end - s < 3)
                    break;
                ch3 = (unsigned char)s[2];
                if (!IS_CONTINUATION_BYTE(ch3))
                    goto InvalidContinuation2;
                break;
            }
            ch2 = (unsigned char)s[1];
            ch3 = (unsigned char)s[2];
            ch4 = (unsigned char)s[3];
            if (!IS_CONTINUATION_BYTE(ch2)) {
                /* invalid continuation byte */
                goto InvalidContinuation1;
            }
            if (ch == 0xF0) {
                if (ch2 < 0x90)
                    /* invalid sequence
                       \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
                    goto InvalidContinuation1;
            } else if (ch == 0xF4 && ch2 >= 0x90) {
                /* invalid sequence
                   \xF4\x90\x80\x80- -- 110000- overflow */
                goto InvalidContinuation1;
            }
            if (!IS_CONTINUATION_BYTE(ch3)) {
                /* invalid continuation byte */
                goto InvalidContinuation2;
            }
            if (!IS_CONTINUATION_BYTE(ch4)) {
                /* invalid continuation byte */
                goto InvalidContinuation3;
            }
            ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
                 ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
            assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
            s += 4;
            if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
                (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
                /* Out-of-range */
                goto Return;
            *p++ = ch;
            continue;
        }
        /* \xF5-\xFF can never start a valid sequence */
        goto InvalidStart;
    }
    /* Normal loop exit (end of input or truncated trailing sequence). */
    ch = 0;
Return:
    /* Publish the progress made, whatever the reason for stopping. */
    *inptr = s;
    *outpos = p - dest;
    return ch;
InvalidStart:
    ch = 1;
    goto Return;
InvalidContinuation1:
    ch = 2;
    goto Return;
InvalidContinuation2:
    ch = 3;
    goto Return;
InvalidContinuation3:
    ch = 4;
    goto Return;
}
252
253 #undef ASCII_CHAR_MASK
254
255
256 /* UTF-8 encoder specialized for a Unicode kind to avoid the slow
257 PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
258 UCS-1 strings don't need to handle surrogates for example. */
259 Py_LOCAL_INLINE(PyObject *)
STRINGLIB(utf8_encoder)260 STRINGLIB(utf8_encoder)(PyObject *unicode,
261 STRINGLIB_CHAR *data,
262 Py_ssize_t size,
263 _Py_error_handler error_handler,
264 const char *errors)
265 {
266 Py_ssize_t i; /* index into data of next input character */
267 char *p; /* next free byte in output buffer */
268 #if STRINGLIB_SIZEOF_CHAR > 1
269 PyObject *error_handler_obj = NULL;
270 PyObject *exc = NULL;
271 PyObject *rep = NULL;
272 #endif
273 #if STRINGLIB_SIZEOF_CHAR == 1
274 const Py_ssize_t max_char_size = 2;
275 #elif STRINGLIB_SIZEOF_CHAR == 2
276 const Py_ssize_t max_char_size = 3;
277 #else /* STRINGLIB_SIZEOF_CHAR == 4 */
278 const Py_ssize_t max_char_size = 4;
279 #endif
280 _PyBytesWriter writer;
281
282 assert(size >= 0);
283 _PyBytesWriter_Init(&writer);
284
285 if (size > PY_SSIZE_T_MAX / max_char_size) {
286 /* integer overflow */
287 return PyErr_NoMemory();
288 }
289
290 p = _PyBytesWriter_Alloc(&writer, size * max_char_size);
291 if (p == NULL)
292 return NULL;
293
294 for (i = 0; i < size;) {
295 Py_UCS4 ch = data[i++];
296
297 if (ch < 0x80) {
298 /* Encode ASCII */
299 *p++ = (char) ch;
300
301 }
302 else
303 #if STRINGLIB_SIZEOF_CHAR > 1
304 if (ch < 0x0800)
305 #endif
306 {
307 /* Encode Latin-1 */
308 *p++ = (char)(0xc0 | (ch >> 6));
309 *p++ = (char)(0x80 | (ch & 0x3f));
310 }
311 #if STRINGLIB_SIZEOF_CHAR > 1
312 else if (Py_UNICODE_IS_SURROGATE(ch)) {
313 Py_ssize_t startpos, endpos, newpos;
314 Py_ssize_t k;
315 if (error_handler == _Py_ERROR_UNKNOWN) {
316 error_handler = _Py_GetErrorHandler(errors);
317 }
318
319 startpos = i-1;
320 endpos = startpos+1;
321
322 while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos]))
323 endpos++;
324
325 /* Only overallocate the buffer if it's not the last write */
326 writer.overallocate = (endpos < size);
327
328 switch (error_handler)
329 {
330 case _Py_ERROR_REPLACE:
331 memset(p, '?', endpos - startpos);
332 p += (endpos - startpos);
333 /* fall through */
334 case _Py_ERROR_IGNORE:
335 i += (endpos - startpos - 1);
336 break;
337
338 case _Py_ERROR_SURROGATEPASS:
339 for (k=startpos; k<endpos; k++) {
340 ch = data[k];
341 *p++ = (char)(0xe0 | (ch >> 12));
342 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
343 *p++ = (char)(0x80 | (ch & 0x3f));
344 }
345 i += (endpos - startpos - 1);
346 break;
347
348 case _Py_ERROR_BACKSLASHREPLACE:
349 /* subtract preallocated bytes */
350 writer.min_size -= max_char_size * (endpos - startpos);
351 p = backslashreplace(&writer, p,
352 unicode, startpos, endpos);
353 if (p == NULL)
354 goto error;
355 i += (endpos - startpos - 1);
356 break;
357
358 case _Py_ERROR_XMLCHARREFREPLACE:
359 /* subtract preallocated bytes */
360 writer.min_size -= max_char_size * (endpos - startpos);
361 p = xmlcharrefreplace(&writer, p,
362 unicode, startpos, endpos);
363 if (p == NULL)
364 goto error;
365 i += (endpos - startpos - 1);
366 break;
367
368 case _Py_ERROR_SURROGATEESCAPE:
369 for (k=startpos; k<endpos; k++) {
370 ch = data[k];
371 if (!(0xDC80 <= ch && ch <= 0xDCFF))
372 break;
373 *p++ = (char)(ch & 0xff);
374 }
375 if (k >= endpos) {
376 i += (endpos - startpos - 1);
377 break;
378 }
379 startpos = k;
380 assert(startpos < endpos);
381 /* fall through */
382 default:
383 rep = unicode_encode_call_errorhandler(
384 errors, &error_handler_obj, "utf-8", "surrogates not allowed",
385 unicode, &exc, startpos, endpos, &newpos);
386 if (!rep)
387 goto error;
388
389 /* subtract preallocated bytes */
390 writer.min_size -= max_char_size * (newpos - startpos);
391
392 if (PyBytes_Check(rep)) {
393 p = _PyBytesWriter_WriteBytes(&writer, p,
394 PyBytes_AS_STRING(rep),
395 PyBytes_GET_SIZE(rep));
396 }
397 else {
398 /* rep is unicode */
399 if (PyUnicode_READY(rep) < 0)
400 goto error;
401
402 if (!PyUnicode_IS_ASCII(rep)) {
403 raise_encode_exception(&exc, "utf-8", unicode,
404 startpos, endpos,
405 "surrogates not allowed");
406 goto error;
407 }
408
409 p = _PyBytesWriter_WriteBytes(&writer, p,
410 PyUnicode_DATA(rep),
411 PyUnicode_GET_LENGTH(rep));
412 }
413
414 if (p == NULL)
415 goto error;
416 Py_CLEAR(rep);
417
418 i = newpos;
419 }
420
421 /* If overallocation was disabled, ensure that it was the last
422 write. Otherwise, we missed an optimization */
423 assert(writer.overallocate || i == size);
424 }
425 else
426 #if STRINGLIB_SIZEOF_CHAR > 2
427 if (ch < 0x10000)
428 #endif
429 {
430 *p++ = (char)(0xe0 | (ch >> 12));
431 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
432 *p++ = (char)(0x80 | (ch & 0x3f));
433 }
434 #if STRINGLIB_SIZEOF_CHAR > 2
435 else /* ch >= 0x10000 */
436 {
437 assert(ch <= MAX_UNICODE);
438 /* Encode UCS4 Unicode ordinals */
439 *p++ = (char)(0xf0 | (ch >> 18));
440 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
441 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
442 *p++ = (char)(0x80 | (ch & 0x3f));
443 }
444 #endif /* STRINGLIB_SIZEOF_CHAR > 2 */
445 #endif /* STRINGLIB_SIZEOF_CHAR > 1 */
446 }
447
448 #if STRINGLIB_SIZEOF_CHAR > 1
449 Py_XDECREF(error_handler_obj);
450 Py_XDECREF(exc);
451 #endif
452 return _PyBytesWriter_Finish(&writer, p);
453
454 #if STRINGLIB_SIZEOF_CHAR > 1
455 error:
456 Py_XDECREF(rep);
457 Py_XDECREF(error_handler_obj);
458 Py_XDECREF(exc);
459 _PyBytesWriter_Dealloc(&writer);
460 return NULL;
461 #endif
462 }
463
/* The pattern for constructing UCS2-repeated masks: it has the value 1 in
   every 16-bit lane of a C 'long', so multiplying it by a 16-bit constant
   replicates that constant into each lane. */
#if SIZEOF_LONG == 8
# define UCS2_REPEAT_MASK 0x0001000100010001ul
#elif SIZEOF_LONG == 4
# define UCS2_REPEAT_MASK 0x00010001ul
#else
# error C 'long' size should be either 4 or 8!
#endif

/* The mask for fast checking. */
#if STRINGLIB_SIZEOF_CHAR == 1
/* The mask for fast checking of whether a C 'long' contains a
   non-ASCII or non-Latin1 UTF16-encoded characters: every bit that a code
   unit <= STRINGLIB_MAX_CHAR can never have set. */
# define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
#else
/* The mask for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
*/
# define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * 0x8000u)
#endif
/* The mask for fast byte-swapping: selects the low byte of each 16-bit
   lane. */
#define STRIPPED_MASK           (UCS2_REPEAT_MASK * 0x00FFu)
/* Swap bytes within every 16-bit lane of `value`.  Note that the argument
   is evaluated twice. */
#define SWAB(value)             ((((value) >> 8) & STRIPPED_MASK) | \
                                 (((value) & STRIPPED_MASK) << 8))
491
/* Decode UTF-16 code units from *inptr (up to, but not including, `e`) into
   dest[*outpos...].  `native_ordering` is non-zero when the input is in the
   byte order of the host.

   On return *inptr is set to the position reached in the input and *outpos
   to the number of characters stored in `dest` so far.

   Return value:
     0 - the loop stopped normally (fewer than two input bytes remain);
     1 - a high surrogate is followed by fewer than two bytes of input
         (*inptr points past the high surrogate);
     2 - a lone low surrogate was found (*inptr points past it);
     3 - a high surrogate is followed by something that is not a low
         surrogate (*inptr points past both code units);
     any other value - a decoded code point that does not fit in this
         kind's STRINGLIB_CHAR.  *inptr already points past it and it has
         NOT been written to `dest`.

   NOTE(review): `dest` must be large enough; no bound on the output is
   checked here. */
Py_LOCAL_INLINE(Py_UCS4)
STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
                        STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
                        int native_ordering)
{
    Py_UCS4 ch;
    /* Computed from the real end, before `e` is decremented below. */
    const unsigned char *aligned_end =
            (const unsigned char *) _Py_ALIGN_DOWN(e, SIZEOF_LONG);
    const unsigned char *q = *inptr;
    STRINGLIB_CHAR *p = dest + *outpos;
    /* Offsets from q for retrieving byte pairs in the right order. */
#if PY_LITTLE_ENDIAN
    int ihi = !!native_ordering, ilo = !native_ordering;
#else
    int ihi = !native_ordering, ilo = !!native_ordering;
#endif
    /* From here on `q < e` means that at least two bytes (one full code
       unit) are still available at q. */
    --e;

    while (q < e) {
        Py_UCS4 ch2;
        /* First check for possible aligned read of a C 'long'. Unaligned
           reads are more expensive, better to defer to another iteration. */
        if (_Py_IS_ALIGNED(q, SIZEOF_LONG)) {
            /* Fast path for runs of in-range non-surrogate chars. */
            const unsigned char *_q = q;
            while (_q < aligned_end) {
                unsigned long block = * (const unsigned long *) _q;
                if (native_ordering) {
                    /* Can use buffer directly */
                    if (block & FAST_CHAR_MASK)
                        break;
                }
                else {
                    /* Need to byte-swap */
                    if (block & SWAB(FAST_CHAR_MASK))
                        break;
#if STRINGLIB_SIZEOF_CHAR == 1
                    /* A plain shift is enough here: presumably only the low
                       8 bits of each lane survive the STRINGLIB_CHAR casts
                       below, so a full swap is not needed. */
                    block >>= 8;
#else
                    block = SWAB(block);
#endif
                }
                /* Store the 16-bit lanes in memory order. */
#if PY_LITTLE_ENDIAN
# if SIZEOF_LONG == 4
                p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
                p[1] = (STRINGLIB_CHAR)(block >> 16);
# elif SIZEOF_LONG == 8
                p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
                p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
                p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
                p[3] = (STRINGLIB_CHAR)(block >> 48);
# endif
#else
# if SIZEOF_LONG == 4
                p[0] = (STRINGLIB_CHAR)(block >> 16);
                p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
# elif SIZEOF_LONG == 8
                p[0] = (STRINGLIB_CHAR)(block >> 48);
                p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
                p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
                p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
# endif
#endif
                _q += SIZEOF_LONG;
                p += SIZEOF_LONG / 2;
            }
            q = _q;
            if (q >= e)
                break;
        }

        /* Slow path: decode a single code unit. */
        ch = (q[ihi] << 8) | q[ilo];
        q += 2;
        if (!Py_UNICODE_IS_SURROGATE(ch)) {
#if STRINGLIB_SIZEOF_CHAR < 2
            if (ch > STRINGLIB_MAX_CHAR)
                /* Out-of-range */
                goto Return;
#endif
            *p++ = (STRINGLIB_CHAR)ch;
            continue;
        }

        /* UTF-16 code pair: */
        if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
            goto IllegalEncoding;
        if (q >= e)
            goto UnexpectedEnd;
        ch2 = (q[ihi] << 8) | q[ilo];
        q += 2;
        if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
            goto IllegalSurrogate;
        ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
#if STRINGLIB_SIZEOF_CHAR < 4
        /* Out-of-range */
        goto Return;
#else
        *p++ = (STRINGLIB_CHAR)ch;
#endif
    }
    /* Normal loop exit. */
    ch = 0;
Return:
    /* Publish the progress made, whatever the reason for stopping. */
    *inptr = q;
    *outpos = p - dest;
    return ch;
UnexpectedEnd:
    ch = 1;
    goto Return;
IllegalEncoding:
    ch = 2;
    goto Return;
IllegalSurrogate:
    ch = 3;
    goto Return;
}
607 #undef UCS2_REPEAT_MASK
608 #undef FAST_CHAR_MASK
609 #undef STRIPPED_MASK
610 #undef SWAB
611
612
613 #if STRINGLIB_MAX_CHAR >= 0x80
614 Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(utf16_encode)615 STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
616 Py_ssize_t len,
617 unsigned short **outptr,
618 int native_ordering)
619 {
620 unsigned short *out = *outptr;
621 const STRINGLIB_CHAR *end = in + len;
622 #if STRINGLIB_SIZEOF_CHAR == 1
623 if (native_ordering) {
624 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
625 while (in < unrolled_end) {
626 out[0] = in[0];
627 out[1] = in[1];
628 out[2] = in[2];
629 out[3] = in[3];
630 in += 4; out += 4;
631 }
632 while (in < end) {
633 *out++ = *in++;
634 }
635 } else {
636 # define SWAB2(CH) ((CH) << 8) /* high byte is zero */
637 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
638 while (in < unrolled_end) {
639 out[0] = SWAB2(in[0]);
640 out[1] = SWAB2(in[1]);
641 out[2] = SWAB2(in[2]);
642 out[3] = SWAB2(in[3]);
643 in += 4; out += 4;
644 }
645 while (in < end) {
646 Py_UCS4 ch = *in++;
647 *out++ = SWAB2((Py_UCS2)ch);
648 }
649 #undef SWAB2
650 }
651 *outptr = out;
652 return len;
653 #else
654 if (native_ordering) {
655 #if STRINGLIB_MAX_CHAR < 0x10000
656 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
657 while (in < unrolled_end) {
658 /* check if any character is a surrogate character */
659 if (((in[0] ^ 0xd800) &
660 (in[1] ^ 0xd800) &
661 (in[2] ^ 0xd800) &
662 (in[3] ^ 0xd800) & 0xf800) == 0)
663 break;
664 out[0] = in[0];
665 out[1] = in[1];
666 out[2] = in[2];
667 out[3] = in[3];
668 in += 4; out += 4;
669 }
670 #endif
671 while (in < end) {
672 Py_UCS4 ch;
673 ch = *in++;
674 if (ch < 0xd800)
675 *out++ = ch;
676 else if (ch < 0xe000)
677 /* reject surrogate characters (U+D800-U+DFFF) */
678 goto fail;
679 #if STRINGLIB_MAX_CHAR >= 0x10000
680 else if (ch >= 0x10000) {
681 out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
682 out[1] = Py_UNICODE_LOW_SURROGATE(ch);
683 out += 2;
684 }
685 #endif
686 else
687 *out++ = ch;
688 }
689 } else {
690 #define SWAB2(CH) (((CH) << 8) | ((CH) >> 8))
691 #if STRINGLIB_MAX_CHAR < 0x10000
692 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
693 while (in < unrolled_end) {
694 /* check if any character is a surrogate character */
695 if (((in[0] ^ 0xd800) &
696 (in[1] ^ 0xd800) &
697 (in[2] ^ 0xd800) &
698 (in[3] ^ 0xd800) & 0xf800) == 0)
699 break;
700 out[0] = SWAB2(in[0]);
701 out[1] = SWAB2(in[1]);
702 out[2] = SWAB2(in[2]);
703 out[3] = SWAB2(in[3]);
704 in += 4; out += 4;
705 }
706 #endif
707 while (in < end) {
708 Py_UCS4 ch = *in++;
709 if (ch < 0xd800)
710 *out++ = SWAB2((Py_UCS2)ch);
711 else if (ch < 0xe000)
712 /* reject surrogate characters (U+D800-U+DFFF) */
713 goto fail;
714 #if STRINGLIB_MAX_CHAR >= 0x10000
715 else if (ch >= 0x10000) {
716 Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
717 Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
718 out[0] = SWAB2(ch1);
719 out[1] = SWAB2(ch2);
720 out += 2;
721 }
722 #endif
723 else
724 *out++ = SWAB2((Py_UCS2)ch);
725 }
726 #undef SWAB2
727 }
728 *outptr = out;
729 return len;
730 fail:
731 *outptr = out;
732 return len - (end - in + 1);
733 #endif
734 }
735
/* SWAB4(CH, tmp): evaluate to the code point CH with the byte order of its
   32-bit representation reversed.  `tmp` names a Py_UCS4 scratch variable;
   the 1-byte variant does not need it and ignores it. */
#if STRINGLIB_SIZEOF_CHAR == 1
/* Widen to an unsigned 32-bit type before shifting: a Py_UCS1 operand is
   promoted to (signed) int, and shifting a value >= 0x80 left by 24 bits
   would overflow into the sign bit, which is undefined behaviour. */
# define SWAB4(CH, tmp)  ((Py_UCS4)(CH) << 24) /* high bytes are zero */
#elif STRINGLIB_SIZEOF_CHAR == 2
# define SWAB4(CH, tmp)  (tmp = (CH), \
            ((tmp & 0x00FFu) << 24) + ((tmp & 0xFF00u) << 8))
            /* high bytes are zero */
#else
/* Swap the bytes inside each 16-bit half, then swap the two halves. */
# define SWAB4(CH, tmp)  (tmp = (CH), \
            tmp = ((tmp & 0x00FF00FFu) << 8) + ((tmp >> 8) & 0x00FF00FFu), \
            ((tmp & 0x0000FFFFu) << 16) + ((tmp >> 16) & 0x0000FFFFu))
#endif
747 Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(utf32_encode)748 STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,
749 Py_ssize_t len,
750 PY_UINT32_T **outptr,
751 int native_ordering)
752 {
753 PY_UINT32_T *out = *outptr;
754 const STRINGLIB_CHAR *end = in + len;
755 if (native_ordering) {
756 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
757 while (in < unrolled_end) {
758 #if STRINGLIB_SIZEOF_CHAR > 1
759 /* check if any character is a surrogate character */
760 if (((in[0] ^ 0xd800) &
761 (in[1] ^ 0xd800) &
762 (in[2] ^ 0xd800) &
763 (in[3] ^ 0xd800) & 0xf800) == 0)
764 break;
765 #endif
766 out[0] = in[0];
767 out[1] = in[1];
768 out[2] = in[2];
769 out[3] = in[3];
770 in += 4; out += 4;
771 }
772 while (in < end) {
773 Py_UCS4 ch;
774 ch = *in++;
775 #if STRINGLIB_SIZEOF_CHAR > 1
776 if (Py_UNICODE_IS_SURROGATE(ch)) {
777 /* reject surrogate characters (U+D800-U+DFFF) */
778 goto fail;
779 }
780 #endif
781 *out++ = ch;
782 }
783 } else {
784 const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
785 while (in < unrolled_end) {
786 #if STRINGLIB_SIZEOF_CHAR > 1
787 Py_UCS4 ch1, ch2, ch3, ch4;
788 /* check if any character is a surrogate character */
789 if (((in[0] ^ 0xd800) &
790 (in[1] ^ 0xd800) &
791 (in[2] ^ 0xd800) &
792 (in[3] ^ 0xd800) & 0xf800) == 0)
793 break;
794 #endif
795 out[0] = SWAB4(in[0], ch1);
796 out[1] = SWAB4(in[1], ch2);
797 out[2] = SWAB4(in[2], ch3);
798 out[3] = SWAB4(in[3], ch4);
799 in += 4; out += 4;
800 }
801 while (in < end) {
802 Py_UCS4 ch = *in++;
803 #if STRINGLIB_SIZEOF_CHAR > 1
804 if (Py_UNICODE_IS_SURROGATE(ch)) {
805 /* reject surrogate characters (U+D800-U+DFFF) */
806 goto fail;
807 }
808 #endif
809 *out++ = SWAB4(ch, ch);
810 }
811 }
812 *outptr = out;
813 return len;
814 #if STRINGLIB_SIZEOF_CHAR > 1
815 fail:
816 *outptr = out;
817 return len - (end - in + 1);
818 #endif
819 }
820 #undef SWAB4
821
822 #endif
823