1 /* stringlib: codec implementations */
2 
3 #if !STRINGLIB_IS_UNICODE
4 # error "codecs.h is specific to Unicode"
5 #endif
6 
7 /* Mask to quickly check whether a C 'long' contains a
8    non-ASCII, UTF8-encoded char. */
9 #if (SIZEOF_LONG == 8)
10 # define ASCII_CHAR_MASK 0x8080808080808080UL
11 #elif (SIZEOF_LONG == 4)
12 # define ASCII_CHAR_MASK 0x80808080UL
13 #else
14 # error C 'long' size should be either 4 or 8!
15 #endif
16 
17 /* 10xxxxxx */
18 #define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)
19 
20 Py_LOCAL_INLINE(Py_UCS4)
STRINGLIB(utf8_decode)21 STRINGLIB(utf8_decode)(const char **inptr, const char *end,
22                        STRINGLIB_CHAR *dest,
23                        Py_ssize_t *outpos)
24 {
25     Py_UCS4 ch;
26     const char *s = *inptr;
27     const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
28     STRINGLIB_CHAR *p = dest + *outpos;
29 
30     while (s < end) {
31         ch = (unsigned char)*s;
32 
33         if (ch < 0x80) {
34             /* Fast path for runs of ASCII characters. Given that common UTF-8
35                input will consist of an overwhelming majority of ASCII
36                characters, we try to optimize for this case by checking
37                as many characters as a C 'long' can contain.
38                First, check if we can do an aligned read, as most CPUs have
39                a penalty for unaligned reads.
40             */
41             if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) {
42                 /* Help register allocation */
43                 const char *_s = s;
44                 STRINGLIB_CHAR *_p = p;
45                 while (_s < aligned_end) {
46                     /* Read a whole long at a time (either 4 or 8 bytes),
47                        and do a fast unrolled copy if it only contains ASCII
48                        characters. */
49                     unsigned long value = *(const unsigned long *) _s;
50                     if (value & ASCII_CHAR_MASK)
51                         break;
52 #if PY_LITTLE_ENDIAN
53                     _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
54                     _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
55                     _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
56                     _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
57 # if SIZEOF_LONG == 8
58                     _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
59                     _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
60                     _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
61                     _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
62 # endif
63 #else
64 # if SIZEOF_LONG == 8
65                     _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
66                     _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
67                     _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
68                     _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
69                     _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
70                     _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
71                     _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
72                     _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
73 # else
74                     _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
75                     _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
76                     _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
77                     _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
78 # endif
79 #endif
80                     _s += SIZEOF_LONG;
81                     _p += SIZEOF_LONG;
82                 }
83                 s = _s;
84                 p = _p;
85                 if (s == end)
86                     break;
87                 ch = (unsigned char)*s;
88             }
89             if (ch < 0x80) {
90                 s++;
91                 *p++ = ch;
92                 continue;
93             }
94         }
95 
96         if (ch < 0xE0) {
97             /* \xC2\x80-\xDF\xBF -- 0080-07FF */
98             Py_UCS4 ch2;
99             if (ch < 0xC2) {
100                 /* invalid sequence
101                 \x80-\xBF -- continuation byte
102                 \xC0-\xC1 -- fake 0000-007F */
103                 goto InvalidStart;
104             }
105             if (end - s < 2) {
106                 /* unexpected end of data: the caller will decide whether
107                    it's an error or not */
108                 break;
109             }
110             ch2 = (unsigned char)s[1];
111             if (!IS_CONTINUATION_BYTE(ch2))
112                 /* invalid continuation byte */
113                 goto InvalidContinuation1;
114             ch = (ch << 6) + ch2 -
115                  ((0xC0 << 6) + 0x80);
116             assert ((ch > 0x007F) && (ch <= 0x07FF));
117             s += 2;
118             if (STRINGLIB_MAX_CHAR <= 0x007F ||
119                 (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
120                 /* Out-of-range */
121                 goto Return;
122             *p++ = ch;
123             continue;
124         }
125 
126         if (ch < 0xF0) {
127             /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
128             Py_UCS4 ch2, ch3;
129             if (end - s < 3) {
130                 /* unexpected end of data: the caller will decide whether
131                    it's an error or not */
132                 if (end - s < 2)
133                     break;
134                 ch2 = (unsigned char)s[1];
135                 if (!IS_CONTINUATION_BYTE(ch2) ||
136                     (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED))
137                     /* for clarification see comments below */
138                     goto InvalidContinuation1;
139                 break;
140             }
141             ch2 = (unsigned char)s[1];
142             ch3 = (unsigned char)s[2];
143             if (!IS_CONTINUATION_BYTE(ch2)) {
144                 /* invalid continuation byte */
145                 goto InvalidContinuation1;
146             }
147             if (ch == 0xE0) {
148                 if (ch2 < 0xA0)
149                     /* invalid sequence
150                        \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
151                     goto InvalidContinuation1;
152             } else if (ch == 0xED && ch2 >= 0xA0) {
153                 /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
154                    will result in surrogates in range D800-DFFF. Surrogates are
155                    not valid UTF-8 so they are rejected.
156                    See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
157                    (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
158                 goto InvalidContinuation1;
159             }
160             if (!IS_CONTINUATION_BYTE(ch3)) {
161                 /* invalid continuation byte */
162                 goto InvalidContinuation2;
163             }
164             ch = (ch << 12) + (ch2 << 6) + ch3 -
165                  ((0xE0 << 12) + (0x80 << 6) + 0x80);
166             assert ((ch > 0x07FF) && (ch <= 0xFFFF));
167             s += 3;
168             if (STRINGLIB_MAX_CHAR <= 0x07FF ||
169                 (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
170                 /* Out-of-range */
171                 goto Return;
172             *p++ = ch;
173             continue;
174         }
175 
176         if (ch < 0xF5) {
177             /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
178             Py_UCS4 ch2, ch3, ch4;
179             if (end - s < 4) {
180                 /* unexpected end of data: the caller will decide whether
181                    it's an error or not */
182                 if (end - s < 2)
183                     break;
184                 ch2 = (unsigned char)s[1];
185                 if (!IS_CONTINUATION_BYTE(ch2) ||
186                     (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4))
187                     /* for clarification see comments below */
188                     goto InvalidContinuation1;
189                 if (end - s < 3)
190                     break;
191                 ch3 = (unsigned char)s[2];
192                 if (!IS_CONTINUATION_BYTE(ch3))
193                     goto InvalidContinuation2;
194                 break;
195             }
196             ch2 = (unsigned char)s[1];
197             ch3 = (unsigned char)s[2];
198             ch4 = (unsigned char)s[3];
199             if (!IS_CONTINUATION_BYTE(ch2)) {
200                 /* invalid continuation byte */
201                 goto InvalidContinuation1;
202             }
203             if (ch == 0xF0) {
204                 if (ch2 < 0x90)
205                     /* invalid sequence
206                        \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
207                     goto InvalidContinuation1;
208             } else if (ch == 0xF4 && ch2 >= 0x90) {
209                 /* invalid sequence
210                    \xF4\x90\x80\x80- -- 110000- overflow */
211                 goto InvalidContinuation1;
212             }
213             if (!IS_CONTINUATION_BYTE(ch3)) {
214                 /* invalid continuation byte */
215                 goto InvalidContinuation2;
216             }
217             if (!IS_CONTINUATION_BYTE(ch4)) {
218                 /* invalid continuation byte */
219                 goto InvalidContinuation3;
220             }
221             ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
222                  ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
223             assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
224             s += 4;
225             if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
226                 (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
227                 /* Out-of-range */
228                 goto Return;
229             *p++ = ch;
230             continue;
231         }
232         goto InvalidStart;
233     }
234     ch = 0;
235 Return:
236     *inptr = s;
237     *outpos = p - dest;
238     return ch;
239 InvalidStart:
240     ch = 1;
241     goto Return;
242 InvalidContinuation1:
243     ch = 2;
244     goto Return;
245 InvalidContinuation2:
246     ch = 3;
247     goto Return;
248 InvalidContinuation3:
249     ch = 4;
250     goto Return;
251 }
252 
253 #undef ASCII_CHAR_MASK
254 
255 
256 /* UTF-8 encoder specialized for a Unicode kind to avoid the slow
257    PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
258    UCS-1 strings don't need to handle surrogates for example. */
259 Py_LOCAL_INLINE(PyObject *)
STRINGLIB(utf8_encoder)260 STRINGLIB(utf8_encoder)(PyObject *unicode,
261                         STRINGLIB_CHAR *data,
262                         Py_ssize_t size,
263                         _Py_error_handler error_handler,
264                         const char *errors)
265 {
266     Py_ssize_t i;                /* index into data of next input character */
267     char *p;                     /* next free byte in output buffer */
268 #if STRINGLIB_SIZEOF_CHAR > 1
269     PyObject *error_handler_obj = NULL;
270     PyObject *exc = NULL;
271     PyObject *rep = NULL;
272 #endif
273 #if STRINGLIB_SIZEOF_CHAR == 1
274     const Py_ssize_t max_char_size = 2;
275 #elif STRINGLIB_SIZEOF_CHAR == 2
276     const Py_ssize_t max_char_size = 3;
277 #else /*  STRINGLIB_SIZEOF_CHAR == 4 */
278     const Py_ssize_t max_char_size = 4;
279 #endif
280     _PyBytesWriter writer;
281 
282     assert(size >= 0);
283     _PyBytesWriter_Init(&writer);
284 
285     if (size > PY_SSIZE_T_MAX / max_char_size) {
286         /* integer overflow */
287         return PyErr_NoMemory();
288     }
289 
290     p = _PyBytesWriter_Alloc(&writer, size * max_char_size);
291     if (p == NULL)
292         return NULL;
293 
294     for (i = 0; i < size;) {
295         Py_UCS4 ch = data[i++];
296 
297         if (ch < 0x80) {
298             /* Encode ASCII */
299             *p++ = (char) ch;
300 
301         }
302         else
303 #if STRINGLIB_SIZEOF_CHAR > 1
304         if (ch < 0x0800)
305 #endif
306         {
307             /* Encode Latin-1 */
308             *p++ = (char)(0xc0 | (ch >> 6));
309             *p++ = (char)(0x80 | (ch & 0x3f));
310         }
311 #if STRINGLIB_SIZEOF_CHAR > 1
312         else if (Py_UNICODE_IS_SURROGATE(ch)) {
313             Py_ssize_t startpos, endpos, newpos;
314             Py_ssize_t k;
315             if (error_handler == _Py_ERROR_UNKNOWN) {
316                 error_handler = _Py_GetErrorHandler(errors);
317             }
318 
319             startpos = i-1;
320             endpos = startpos+1;
321 
322             while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos]))
323                 endpos++;
324 
325             /* Only overallocate the buffer if it's not the last write */
326             writer.overallocate = (endpos < size);
327 
328             switch (error_handler)
329             {
330             case _Py_ERROR_REPLACE:
331                 memset(p, '?', endpos - startpos);
332                 p += (endpos - startpos);
333                 /* fall through */
334             case _Py_ERROR_IGNORE:
335                 i += (endpos - startpos - 1);
336                 break;
337 
338             case _Py_ERROR_SURROGATEPASS:
339                 for (k=startpos; k<endpos; k++) {
340                     ch = data[k];
341                     *p++ = (char)(0xe0 | (ch >> 12));
342                     *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
343                     *p++ = (char)(0x80 | (ch & 0x3f));
344                 }
345                 i += (endpos - startpos - 1);
346                 break;
347 
348             case _Py_ERROR_BACKSLASHREPLACE:
349                 /* subtract preallocated bytes */
350                 writer.min_size -= max_char_size * (endpos - startpos);
351                 p = backslashreplace(&writer, p,
352                                      unicode, startpos, endpos);
353                 if (p == NULL)
354                     goto error;
355                 i += (endpos - startpos - 1);
356                 break;
357 
358             case _Py_ERROR_XMLCHARREFREPLACE:
359                 /* subtract preallocated bytes */
360                 writer.min_size -= max_char_size * (endpos - startpos);
361                 p = xmlcharrefreplace(&writer, p,
362                                       unicode, startpos, endpos);
363                 if (p == NULL)
364                     goto error;
365                 i += (endpos - startpos - 1);
366                 break;
367 
368             case _Py_ERROR_SURROGATEESCAPE:
369                 for (k=startpos; k<endpos; k++) {
370                     ch = data[k];
371                     if (!(0xDC80 <= ch && ch <= 0xDCFF))
372                         break;
373                     *p++ = (char)(ch & 0xff);
374                 }
375                 if (k >= endpos) {
376                     i += (endpos - startpos - 1);
377                     break;
378                 }
379                 startpos = k;
380                 assert(startpos < endpos);
381                 /* fall through */
382             default:
383                 rep = unicode_encode_call_errorhandler(
384                       errors, &error_handler_obj, "utf-8", "surrogates not allowed",
385                       unicode, &exc, startpos, endpos, &newpos);
386                 if (!rep)
387                     goto error;
388 
389                 /* subtract preallocated bytes */
390                 writer.min_size -= max_char_size * (newpos - startpos);
391 
392                 if (PyBytes_Check(rep)) {
393                     p = _PyBytesWriter_WriteBytes(&writer, p,
394                                                   PyBytes_AS_STRING(rep),
395                                                   PyBytes_GET_SIZE(rep));
396                 }
397                 else {
398                     /* rep is unicode */
399                     if (PyUnicode_READY(rep) < 0)
400                         goto error;
401 
402                     if (!PyUnicode_IS_ASCII(rep)) {
403                         raise_encode_exception(&exc, "utf-8", unicode,
404                                                startpos, endpos,
405                                                "surrogates not allowed");
406                         goto error;
407                     }
408 
409                     p = _PyBytesWriter_WriteBytes(&writer, p,
410                                                   PyUnicode_DATA(rep),
411                                                   PyUnicode_GET_LENGTH(rep));
412                 }
413 
414                 if (p == NULL)
415                     goto error;
416                 Py_CLEAR(rep);
417 
418                 i = newpos;
419             }
420 
421             /* If overallocation was disabled, ensure that it was the last
422                write. Otherwise, we missed an optimization */
423             assert(writer.overallocate || i == size);
424         }
425         else
426 #if STRINGLIB_SIZEOF_CHAR > 2
427         if (ch < 0x10000)
428 #endif
429         {
430             *p++ = (char)(0xe0 | (ch >> 12));
431             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
432             *p++ = (char)(0x80 | (ch & 0x3f));
433         }
434 #if STRINGLIB_SIZEOF_CHAR > 2
435         else /* ch >= 0x10000 */
436         {
437             assert(ch <= MAX_UNICODE);
438             /* Encode UCS4 Unicode ordinals */
439             *p++ = (char)(0xf0 | (ch >> 18));
440             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
441             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
442             *p++ = (char)(0x80 | (ch & 0x3f));
443         }
444 #endif /* STRINGLIB_SIZEOF_CHAR > 2 */
445 #endif /* STRINGLIB_SIZEOF_CHAR > 1 */
446     }
447 
448 #if STRINGLIB_SIZEOF_CHAR > 1
449     Py_XDECREF(error_handler_obj);
450     Py_XDECREF(exc);
451 #endif
452     return _PyBytesWriter_Finish(&writer, p);
453 
454 #if STRINGLIB_SIZEOF_CHAR > 1
455  error:
456     Py_XDECREF(rep);
457     Py_XDECREF(error_handler_obj);
458     Py_XDECREF(exc);
459     _PyBytesWriter_Dealloc(&writer);
460     return NULL;
461 #endif
462 }
463 
464 /* The pattern for constructing UCS2-repeated masks. */
465 #if SIZEOF_LONG == 8
466 # define UCS2_REPEAT_MASK 0x0001000100010001ul
467 #elif SIZEOF_LONG == 4
468 # define UCS2_REPEAT_MASK 0x00010001ul
469 #else
470 # error C 'long' size should be either 4 or 8!
471 #endif
472 
473 /* The mask for fast checking. */
474 #if STRINGLIB_SIZEOF_CHAR == 1
475 /* The mask for fast checking of whether a C 'long' contains a
476    non-ASCII or non-Latin1 UTF16-encoded characters. */
477 # define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
478 #else
479 /* The mask for fast checking of whether a C 'long' may contain
480    UTF16-encoded surrogate characters. This is an efficient heuristic,
481    assuming that non-surrogate characters with a code point >= 0x8000 are
482    rare in most input.
483 */
484 # define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * 0x8000u)
485 #endif
486 /* The mask for fast byte-swapping. */
487 #define STRIPPED_MASK           (UCS2_REPEAT_MASK * 0x00FFu)
488 /* Swap bytes. */
489 #define SWAB(value)             ((((value) >> 8) & STRIPPED_MASK) | \
490                                  (((value) & STRIPPED_MASK) << 8))
491 
492 Py_LOCAL_INLINE(Py_UCS4)
STRINGLIB(utf16_decode)493 STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
494                         STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
495                         int native_ordering)
496 {
497     Py_UCS4 ch;
498     const unsigned char *aligned_end =
499             (const unsigned char *) _Py_ALIGN_DOWN(e, SIZEOF_LONG);
500     const unsigned char *q = *inptr;
501     STRINGLIB_CHAR *p = dest + *outpos;
502     /* Offsets from q for retrieving byte pairs in the right order. */
503 #if PY_LITTLE_ENDIAN
504     int ihi = !!native_ordering, ilo = !native_ordering;
505 #else
506     int ihi = !native_ordering, ilo = !!native_ordering;
507 #endif
508     --e;
509 
510     while (q < e) {
511         Py_UCS4 ch2;
512         /* First check for possible aligned read of a C 'long'. Unaligned
513            reads are more expensive, better to defer to another iteration. */
514         if (_Py_IS_ALIGNED(q, SIZEOF_LONG)) {
515             /* Fast path for runs of in-range non-surrogate chars. */
516             const unsigned char *_q = q;
517             while (_q < aligned_end) {
518                 unsigned long block = * (const unsigned long *) _q;
519                 if (native_ordering) {
520                     /* Can use buffer directly */
521                     if (block & FAST_CHAR_MASK)
522                         break;
523                 }
524                 else {
525                     /* Need to byte-swap */
526                     if (block & SWAB(FAST_CHAR_MASK))
527                         break;
528 #if STRINGLIB_SIZEOF_CHAR == 1
529                     block >>= 8;
530 #else
531                     block = SWAB(block);
532 #endif
533                 }
534 #if PY_LITTLE_ENDIAN
535 # if SIZEOF_LONG == 4
536                 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
537                 p[1] = (STRINGLIB_CHAR)(block >> 16);
538 # elif SIZEOF_LONG == 8
539                 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
540                 p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
541                 p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
542                 p[3] = (STRINGLIB_CHAR)(block >> 48);
543 # endif
544 #else
545 # if SIZEOF_LONG == 4
546                 p[0] = (STRINGLIB_CHAR)(block >> 16);
547                 p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
548 # elif SIZEOF_LONG == 8
549                 p[0] = (STRINGLIB_CHAR)(block >> 48);
550                 p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
551                 p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
552                 p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
553 # endif
554 #endif
555                 _q += SIZEOF_LONG;
556                 p += SIZEOF_LONG / 2;
557             }
558             q = _q;
559             if (q >= e)
560                 break;
561         }
562 
563         ch = (q[ihi] << 8) | q[ilo];
564         q += 2;
565         if (!Py_UNICODE_IS_SURROGATE(ch)) {
566 #if STRINGLIB_SIZEOF_CHAR < 2
567             if (ch > STRINGLIB_MAX_CHAR)
568                 /* Out-of-range */
569                 goto Return;
570 #endif
571             *p++ = (STRINGLIB_CHAR)ch;
572             continue;
573         }
574 
575         /* UTF-16 code pair: */
576         if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
577             goto IllegalEncoding;
578         if (q >= e)
579             goto UnexpectedEnd;
580         ch2 = (q[ihi] << 8) | q[ilo];
581         q += 2;
582         if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
583             goto IllegalSurrogate;
584         ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
585 #if STRINGLIB_SIZEOF_CHAR < 4
586         /* Out-of-range */
587         goto Return;
588 #else
589         *p++ = (STRINGLIB_CHAR)ch;
590 #endif
591     }
592     ch = 0;
593 Return:
594     *inptr = q;
595     *outpos = p - dest;
596     return ch;
597 UnexpectedEnd:
598     ch = 1;
599     goto Return;
600 IllegalEncoding:
601     ch = 2;
602     goto Return;
603 IllegalSurrogate:
604     ch = 3;
605     goto Return;
606 }
607 #undef UCS2_REPEAT_MASK
608 #undef FAST_CHAR_MASK
609 #undef STRIPPED_MASK
610 #undef SWAB
611 
612 
613 #if STRINGLIB_MAX_CHAR >= 0x80
614 Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(utf16_encode)615 STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
616                         Py_ssize_t len,
617                         unsigned short **outptr,
618                         int native_ordering)
619 {
620     unsigned short *out = *outptr;
621     const STRINGLIB_CHAR *end = in + len;
622 #if STRINGLIB_SIZEOF_CHAR == 1
623     if (native_ordering) {
624         const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
625         while (in < unrolled_end) {
626             out[0] = in[0];
627             out[1] = in[1];
628             out[2] = in[2];
629             out[3] = in[3];
630             in += 4; out += 4;
631         }
632         while (in < end) {
633             *out++ = *in++;
634         }
635     } else {
636 # define SWAB2(CH)  ((CH) << 8) /* high byte is zero */
637         const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
638         while (in < unrolled_end) {
639             out[0] = SWAB2(in[0]);
640             out[1] = SWAB2(in[1]);
641             out[2] = SWAB2(in[2]);
642             out[3] = SWAB2(in[3]);
643             in += 4; out += 4;
644         }
645         while (in < end) {
646             Py_UCS4 ch = *in++;
647             *out++ = SWAB2((Py_UCS2)ch);
648         }
649 #undef SWAB2
650     }
651     *outptr = out;
652     return len;
653 #else
654     if (native_ordering) {
655 #if STRINGLIB_MAX_CHAR < 0x10000
656         const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
657         while (in < unrolled_end) {
658             /* check if any character is a surrogate character */
659             if (((in[0] ^ 0xd800) &
660                  (in[1] ^ 0xd800) &
661                  (in[2] ^ 0xd800) &
662                  (in[3] ^ 0xd800) & 0xf800) == 0)
663                 break;
664             out[0] = in[0];
665             out[1] = in[1];
666             out[2] = in[2];
667             out[3] = in[3];
668             in += 4; out += 4;
669         }
670 #endif
671         while (in < end) {
672             Py_UCS4 ch;
673             ch = *in++;
674             if (ch < 0xd800)
675                 *out++ = ch;
676             else if (ch < 0xe000)
677                 /* reject surrogate characters (U+D800-U+DFFF) */
678                 goto fail;
679 #if STRINGLIB_MAX_CHAR >= 0x10000
680             else if (ch >= 0x10000) {
681                 out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
682                 out[1] = Py_UNICODE_LOW_SURROGATE(ch);
683                 out += 2;
684             }
685 #endif
686             else
687                 *out++ = ch;
688         }
689     } else {
690 #define SWAB2(CH)  (((CH) << 8) | ((CH) >> 8))
691 #if STRINGLIB_MAX_CHAR < 0x10000
692         const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
693         while (in < unrolled_end) {
694             /* check if any character is a surrogate character */
695             if (((in[0] ^ 0xd800) &
696                  (in[1] ^ 0xd800) &
697                  (in[2] ^ 0xd800) &
698                  (in[3] ^ 0xd800) & 0xf800) == 0)
699                 break;
700             out[0] = SWAB2(in[0]);
701             out[1] = SWAB2(in[1]);
702             out[2] = SWAB2(in[2]);
703             out[3] = SWAB2(in[3]);
704             in += 4; out += 4;
705         }
706 #endif
707         while (in < end) {
708             Py_UCS4 ch = *in++;
709             if (ch < 0xd800)
710                 *out++ = SWAB2((Py_UCS2)ch);
711             else if (ch < 0xe000)
712                 /* reject surrogate characters (U+D800-U+DFFF) */
713                 goto fail;
714 #if STRINGLIB_MAX_CHAR >= 0x10000
715             else if (ch >= 0x10000) {
716                 Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
717                 Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
718                 out[0] = SWAB2(ch1);
719                 out[1] = SWAB2(ch2);
720                 out += 2;
721             }
722 #endif
723             else
724                 *out++ = SWAB2((Py_UCS2)ch);
725         }
726 #undef SWAB2
727     }
728     *outptr = out;
729     return len;
730   fail:
731     *outptr = out;
732     return len - (end - in + 1);
733 #endif
734 }
735 
/* SWAB4(CH, tmp): value of CH as a byte-swapped 32-bit unit.  tmp is a
   scratch lvalue of an unsigned 32-bit type; the 1-byte variant does not
   use it. */
#if STRINGLIB_SIZEOF_CHAR == 1
/* Convert to 32-bit unsigned before shifting: a 1-byte character promotes
   to (signed) int, and shifting a value >= 0x80 left by 24 would shift
   into the sign bit, which is undefined behaviour. */
# define SWAB4(CH, tmp)  ((PY_UINT32_T)(CH) << 24) /* high bytes are zero */
#elif STRINGLIB_SIZEOF_CHAR == 2
# define SWAB4(CH, tmp)  (tmp = (CH), \
            ((tmp & 0x00FFu) << 24) + ((tmp & 0xFF00u) << 8))
            /* high bytes are zero */
#else
/* Swap the bytes inside each 16-bit half, then swap the two halves. */
# define SWAB4(CH, tmp)  (tmp = (CH), \
            tmp = ((tmp & 0x00FF00FFu) << 8) + ((tmp >> 8) & 0x00FF00FFu), \
            ((tmp & 0x0000FFFFu) << 16) + ((tmp >> 16) & 0x0000FFFFu))
#endif
747 Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(utf32_encode)748 STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,
749                         Py_ssize_t len,
750                         PY_UINT32_T **outptr,
751                         int native_ordering)
752 {
753     PY_UINT32_T *out = *outptr;
754     const STRINGLIB_CHAR *end = in + len;
755     if (native_ordering) {
756         const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
757         while (in < unrolled_end) {
758 #if STRINGLIB_SIZEOF_CHAR > 1
759             /* check if any character is a surrogate character */
760             if (((in[0] ^ 0xd800) &
761                  (in[1] ^ 0xd800) &
762                  (in[2] ^ 0xd800) &
763                  (in[3] ^ 0xd800) & 0xf800) == 0)
764                 break;
765 #endif
766             out[0] = in[0];
767             out[1] = in[1];
768             out[2] = in[2];
769             out[3] = in[3];
770             in += 4; out += 4;
771         }
772         while (in < end) {
773             Py_UCS4 ch;
774             ch = *in++;
775 #if STRINGLIB_SIZEOF_CHAR > 1
776             if (Py_UNICODE_IS_SURROGATE(ch)) {
777                 /* reject surrogate characters (U+D800-U+DFFF) */
778                 goto fail;
779             }
780 #endif
781             *out++ = ch;
782         }
783     } else {
784         const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
785         while (in < unrolled_end) {
786 #if STRINGLIB_SIZEOF_CHAR > 1
787             Py_UCS4 ch1, ch2, ch3, ch4;
788             /* check if any character is a surrogate character */
789             if (((in[0] ^ 0xd800) &
790                  (in[1] ^ 0xd800) &
791                  (in[2] ^ 0xd800) &
792                  (in[3] ^ 0xd800) & 0xf800) == 0)
793                 break;
794 #endif
795             out[0] = SWAB4(in[0], ch1);
796             out[1] = SWAB4(in[1], ch2);
797             out[2] = SWAB4(in[2], ch3);
798             out[3] = SWAB4(in[3], ch4);
799             in += 4; out += 4;
800         }
801         while (in < end) {
802             Py_UCS4 ch = *in++;
803 #if STRINGLIB_SIZEOF_CHAR > 1
804             if (Py_UNICODE_IS_SURROGATE(ch)) {
805                 /* reject surrogate characters (U+D800-U+DFFF) */
806                 goto fail;
807             }
808 #endif
809             *out++ = SWAB4(ch, ch);
810         }
811     }
812     *outptr = out;
813     return len;
814 #if STRINGLIB_SIZEOF_CHAR > 1
815   fail:
816     *outptr = out;
817     return len - (end - in + 1);
818 #endif
819 }
820 #undef SWAB4
821 
822 #endif
823