1 /* ------------------------------------------------------------------------
2 
3    Python Codec Registry and support functions
4 
5 Written by Marc-Andre Lemburg (mal@lemburg.com).
6 
7 Copyright (c) Corporation for National Research Initiatives.
8 
9    ------------------------------------------------------------------------ */
10 
11 #include "Python.h"
12 #include "pycore_pystate.h"
13 #include "ucnhash.h"
14 #include <ctype.h>
15 
16 const char *Py_hexdigits = "0123456789abcdef";
17 
18 /* --- Codec Registry ----------------------------------------------------- */
19 
20 /* Import the standard encodings package which will register the first
21    codec search function.
22 
23    This is done in a lazy way so that the Unicode implementation does
24    not downgrade startup time of scripts not needing it.
25 
26    ImportErrors are silently ignored by this function. Only one try is
27    made.
28 
29 */
30 
31 static int _PyCodecRegistry_Init(void); /* Forward */
32 
/* Append search_function to the current interpreter's codec search path.

   The codec registry is set up first if the search path does not exist
   yet.  search_function must be a non-NULL callable.  Returns -1 if any
   of these checks fail, otherwise the result of PyList_Append(). */
int
PyCodec_Register(PyObject *search_function)
{
    PyInterpreterState *interp = _PyInterpreterState_Get();

    /* Lazily initialize the registry on first use. */
    if (interp->codec_search_path == NULL) {
        if (_PyCodecRegistry_Init()) {
            return -1;
        }
    }
    if (search_function == NULL) {
        PyErr_BadArgument();
        return -1;
    }
    if (!PyCallable_Check(search_function)) {
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
        return -1;
    }
    return PyList_Append(interp->codec_search_path, search_function);
}
51 
52 /* Convert a string to a normalized Python string: all characters are
53    converted to lower case, spaces are replaced with underscores. */
54 
55 static
normalizestring(const char * string)56 PyObject *normalizestring(const char *string)
57 {
58     size_t i;
59     size_t len = strlen(string);
60     char *p;
61     PyObject *v;
62 
63     if (len > PY_SSIZE_T_MAX) {
64         PyErr_SetString(PyExc_OverflowError, "string is too large");
65         return NULL;
66     }
67 
68     p = PyMem_Malloc(len + 1);
69     if (p == NULL)
70         return PyErr_NoMemory();
71     for (i = 0; i < len; i++) {
72         char ch = string[i];
73         if (ch == ' ')
74             ch = '-';
75         else
76             ch = Py_TOLOWER(Py_CHARMASK(ch));
77         p[i] = ch;
78     }
79     p[i] = '\0';
80     v = PyUnicode_FromString(p);
81     PyMem_Free(p);
82     return v;
83 }
84 
85 /* Lookup the given encoding and return a tuple providing the codec
86    facilities.
87 
88    The encoding string is looked up converted to all lower-case
89    characters. This makes encodings looked up through this mechanism
90    effectively case-insensitive.
91 
92    If no codec is found, a LookupError is set and NULL returned.
93 
94    As side effect, this tries to load the encodings package, if not
95    yet done. This is part of the lazy load strategy for the encodings
96    package.
97 
98 */
99 
_PyCodec_Lookup(const char * encoding)100 PyObject *_PyCodec_Lookup(const char *encoding)
101 {
102     PyObject *result, *args = NULL, *v;
103     Py_ssize_t i, len;
104 
105     if (encoding == NULL) {
106         PyErr_BadArgument();
107         goto onError;
108     }
109 
110     PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
111     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
112         goto onError;
113 
114     /* Convert the encoding to a normalized Python string: all
115        characters are converted to lower case, spaces and hyphens are
116        replaced with underscores. */
117     v = normalizestring(encoding);
118     if (v == NULL)
119         goto onError;
120     PyUnicode_InternInPlace(&v);
121 
122     /* First, try to lookup the name in the registry dictionary */
123     result = PyDict_GetItemWithError(interp->codec_search_cache, v);
124     if (result != NULL) {
125         Py_INCREF(result);
126         Py_DECREF(v);
127         return result;
128     }
129     else if (PyErr_Occurred()) {
130         Py_DECREF(v);
131         return NULL;
132     }
133 
134     /* Next, scan the search functions in order of registration */
135     args = PyTuple_New(1);
136     if (args == NULL) {
137         Py_DECREF(v);
138         return NULL;
139     }
140     PyTuple_SET_ITEM(args,0,v);
141 
142     len = PyList_Size(interp->codec_search_path);
143     if (len < 0)
144         goto onError;
145     if (len == 0) {
146         PyErr_SetString(PyExc_LookupError,
147                         "no codec search functions registered: "
148                         "can't find encoding");
149         goto onError;
150     }
151 
152     for (i = 0; i < len; i++) {
153         PyObject *func;
154 
155         func = PyList_GetItem(interp->codec_search_path, i);
156         if (func == NULL)
157             goto onError;
158         result = PyEval_CallObject(func, args);
159         if (result == NULL)
160             goto onError;
161         if (result == Py_None) {
162             Py_DECREF(result);
163             continue;
164         }
165         if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
166             PyErr_SetString(PyExc_TypeError,
167                             "codec search functions must return 4-tuples");
168             Py_DECREF(result);
169             goto onError;
170         }
171         break;
172     }
173     if (i == len) {
174         /* XXX Perhaps we should cache misses too ? */
175         PyErr_Format(PyExc_LookupError,
176                      "unknown encoding: %s", encoding);
177         goto onError;
178     }
179 
180     /* Cache and return the result */
181     if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
182         Py_DECREF(result);
183         goto onError;
184     }
185     Py_DECREF(args);
186     return result;
187 
188  onError:
189     Py_XDECREF(args);
190     return NULL;
191 }
192 
_PyCodec_Forget(const char * encoding)193 int _PyCodec_Forget(const char *encoding)
194 {
195     PyObject *v;
196     int result;
197 
198     PyInterpreterState *interp = _PyInterpreterState_Get();
199     if (interp->codec_search_path == NULL) {
200         return -1;
201     }
202 
203     /* Convert the encoding to a normalized Python string: all
204        characters are converted to lower case, spaces and hyphens are
205        replaced with underscores. */
206     v = normalizestring(encoding);
207     if (v == NULL) {
208         return -1;
209     }
210 
211     /* Drop the named codec from the internal cache */
212     result = PyDict_DelItem(interp->codec_search_cache, v);
213     Py_DECREF(v);
214 
215     return result;
216 }
217 
218 /* Codec registry encoding check API. */
219 
PyCodec_KnownEncoding(const char * encoding)220 int PyCodec_KnownEncoding(const char *encoding)
221 {
222     PyObject *codecs;
223 
224     codecs = _PyCodec_Lookup(encoding);
225     if (!codecs) {
226         PyErr_Clear();
227         return 0;
228     }
229     else {
230         Py_DECREF(codecs);
231         return 1;
232     }
233 }
234 
235 static
args_tuple(PyObject * object,const char * errors)236 PyObject *args_tuple(PyObject *object,
237                      const char *errors)
238 {
239     PyObject *args;
240 
241     args = PyTuple_New(1 + (errors != NULL));
242     if (args == NULL)
243         return NULL;
244     Py_INCREF(object);
245     PyTuple_SET_ITEM(args,0,object);
246     if (errors) {
247         PyObject *v;
248 
249         v = PyUnicode_FromString(errors);
250         if (v == NULL) {
251             Py_DECREF(args);
252             return NULL;
253         }
254         PyTuple_SET_ITEM(args, 1, v);
255     }
256     return args;
257 }
258 
259 /* Helper function to get a codec item */
260 
261 static
codec_getitem(const char * encoding,int index)262 PyObject *codec_getitem(const char *encoding, int index)
263 {
264     PyObject *codecs;
265     PyObject *v;
266 
267     codecs = _PyCodec_Lookup(encoding);
268     if (codecs == NULL)
269         return NULL;
270     v = PyTuple_GET_ITEM(codecs, index);
271     Py_DECREF(codecs);
272     Py_INCREF(v);
273     return v;
274 }
275 
276 /* Helper functions to create an incremental codec. */
277 static
codec_makeincrementalcodec(PyObject * codec_info,const char * errors,const char * attrname)278 PyObject *codec_makeincrementalcodec(PyObject *codec_info,
279                                      const char *errors,
280                                      const char *attrname)
281 {
282     PyObject *ret, *inccodec;
283 
284     inccodec = PyObject_GetAttrString(codec_info, attrname);
285     if (inccodec == NULL)
286         return NULL;
287     if (errors)
288         ret = PyObject_CallFunction(inccodec, "s", errors);
289     else
290         ret = _PyObject_CallNoArg(inccodec);
291     Py_DECREF(inccodec);
292     return ret;
293 }
294 
295 static
codec_getincrementalcodec(const char * encoding,const char * errors,const char * attrname)296 PyObject *codec_getincrementalcodec(const char *encoding,
297                                     const char *errors,
298                                     const char *attrname)
299 {
300     PyObject *codec_info, *ret;
301 
302     codec_info = _PyCodec_Lookup(encoding);
303     if (codec_info == NULL)
304         return NULL;
305     ret = codec_makeincrementalcodec(codec_info, errors, attrname);
306     Py_DECREF(codec_info);
307     return ret;
308 }
309 
310 /* Helper function to create a stream codec. */
311 
312 static
codec_getstreamcodec(const char * encoding,PyObject * stream,const char * errors,const int index)313 PyObject *codec_getstreamcodec(const char *encoding,
314                                PyObject *stream,
315                                const char *errors,
316                                const int index)
317 {
318     PyObject *codecs, *streamcodec, *codeccls;
319 
320     codecs = _PyCodec_Lookup(encoding);
321     if (codecs == NULL)
322         return NULL;
323 
324     codeccls = PyTuple_GET_ITEM(codecs, index);
325     if (errors != NULL)
326         streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
327     else
328         streamcodec = PyObject_CallFunctionObjArgs(codeccls, stream, NULL);
329     Py_DECREF(codecs);
330     return streamcodec;
331 }
332 
333 /* Helpers to work with the result of _PyCodec_Lookup
334 
335  */
/* Create an incremental decoder by calling the "incrementaldecoder"
   attribute of codec_info; errors is passed on when non-NULL. */
PyObject *
_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info, const char *errors)
{
    const char *attrname = "incrementaldecoder";

    return codec_makeincrementalcodec(codec_info, errors, attrname);
}
342 
/* Create an incremental encoder by calling the "incrementalencoder"
   attribute of codec_info; errors is passed on when non-NULL. */
PyObject *
_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info, const char *errors)
{
    const char *attrname = "incrementalencoder";

    return codec_makeincrementalcodec(codec_info, errors, attrname);
}
349 
350 
351 /* Convenience APIs to query the Codec registry.
352 
353    All APIs return a codec object with incremented refcount.
354 
355  */
356 
PyCodec_Encoder(const char * encoding)357 PyObject *PyCodec_Encoder(const char *encoding)
358 {
359     return codec_getitem(encoding, 0);
360 }
361 
PyCodec_Decoder(const char * encoding)362 PyObject *PyCodec_Decoder(const char *encoding)
363 {
364     return codec_getitem(encoding, 1);
365 }
366 
PyCodec_IncrementalEncoder(const char * encoding,const char * errors)367 PyObject *PyCodec_IncrementalEncoder(const char *encoding,
368                                      const char *errors)
369 {
370     return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
371 }
372 
PyCodec_IncrementalDecoder(const char * encoding,const char * errors)373 PyObject *PyCodec_IncrementalDecoder(const char *encoding,
374                                      const char *errors)
375 {
376     return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
377 }
378 
/* Create a stream codec for stream by calling element 2 of the codec
   tuple looked up for encoding. */
PyObject *
PyCodec_StreamReader(const char *encoding,
                     PyObject *stream,
                     const char *errors)
{
    const int reader_index = 2;

    return codec_getstreamcodec(encoding, stream, errors, reader_index);
}
385 
/* Create a stream codec for stream by calling element 3 of the codec
   tuple looked up for encoding. */
PyObject *
PyCodec_StreamWriter(const char *encoding,
                     PyObject *stream,
                     const char *errors)
{
    const int writer_index = 3;

    return codec_getstreamcodec(encoding, stream, errors, writer_index);
}
392 
393 /* Helper that tries to ensure the reported exception chain indicates the
394  * codec that was invoked to trigger the failure without changing the type
395  * of the exception raised.
396  */
static void
wrap_codec_error(const char *operation, const char *encoding)
{
    /* Hand the operation and codec name to _PyErr_TrySetFromCause(),
     * which either replaces the active exception with a suitably updated
     * clone or leaves the original exception untouched.
     */
    _PyErr_TrySetFromCause("%s with '%s' codec failed", operation, encoding);
}
408 
409 /* Encode an object (e.g. a Unicode object) using the given encoding
410    and return the resulting encoded object (usually a Python string).
411 
412    errors is passed to the encoder factory as argument if non-NULL. */
413 
414 static PyObject *
_PyCodec_EncodeInternal(PyObject * object,PyObject * encoder,const char * encoding,const char * errors)415 _PyCodec_EncodeInternal(PyObject *object,
416                         PyObject *encoder,
417                         const char *encoding,
418                         const char *errors)
419 {
420     PyObject *args = NULL, *result = NULL;
421     PyObject *v = NULL;
422 
423     args = args_tuple(object, errors);
424     if (args == NULL)
425         goto onError;
426 
427     result = PyEval_CallObject(encoder, args);
428     if (result == NULL) {
429         wrap_codec_error("encoding", encoding);
430         goto onError;
431     }
432 
433     if (!PyTuple_Check(result) ||
434         PyTuple_GET_SIZE(result) != 2) {
435         PyErr_SetString(PyExc_TypeError,
436                         "encoder must return a tuple (object, integer)");
437         goto onError;
438     }
439     v = PyTuple_GET_ITEM(result,0);
440     Py_INCREF(v);
441     /* We don't check or use the second (integer) entry. */
442 
443     Py_DECREF(args);
444     Py_DECREF(encoder);
445     Py_DECREF(result);
446     return v;
447 
448  onError:
449     Py_XDECREF(result);
450     Py_XDECREF(args);
451     Py_XDECREF(encoder);
452     return NULL;
453 }
454 
455 /* Decode an object (usually a Python string) using the given encoding
456    and return an equivalent object (e.g. a Unicode object).
457 
458    errors is passed to the decoder factory as argument if non-NULL. */
459 
460 static PyObject *
_PyCodec_DecodeInternal(PyObject * object,PyObject * decoder,const char * encoding,const char * errors)461 _PyCodec_DecodeInternal(PyObject *object,
462                         PyObject *decoder,
463                         const char *encoding,
464                         const char *errors)
465 {
466     PyObject *args = NULL, *result = NULL;
467     PyObject *v;
468 
469     args = args_tuple(object, errors);
470     if (args == NULL)
471         goto onError;
472 
473     result = PyEval_CallObject(decoder,args);
474     if (result == NULL) {
475         wrap_codec_error("decoding", encoding);
476         goto onError;
477     }
478     if (!PyTuple_Check(result) ||
479         PyTuple_GET_SIZE(result) != 2) {
480         PyErr_SetString(PyExc_TypeError,
481                         "decoder must return a tuple (object,integer)");
482         goto onError;
483     }
484     v = PyTuple_GET_ITEM(result,0);
485     Py_INCREF(v);
486     /* We don't check or use the second (integer) entry. */
487 
488     Py_DECREF(args);
489     Py_DECREF(decoder);
490     Py_DECREF(result);
491     return v;
492 
493  onError:
494     Py_XDECREF(args);
495     Py_XDECREF(decoder);
496     Py_XDECREF(result);
497     return NULL;
498 }
499 
500 /* Generic encoding/decoding API */
/* Encode object with the codec registered for encoding.  errors is
   passed to the encoder when non-NULL.  Returns NULL on failure. */
PyObject *
PyCodec_Encode(PyObject *object,
               const char *encoding,
               const char *errors)
{
    PyObject *encoder = PyCodec_Encoder(encoding);

    if (encoder != NULL) {
        /* _PyCodec_EncodeInternal() releases the encoder reference. */
        return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
    }
    return NULL;
}
513 
/* Decode object with the codec registered for encoding.  errors is
   passed to the decoder when non-NULL.  Returns NULL on failure. */
PyObject *
PyCodec_Decode(PyObject *object,
               const char *encoding,
               const char *errors)
{
    PyObject *decoder = PyCodec_Decoder(encoding);

    if (decoder != NULL) {
        /* _PyCodec_DecodeInternal() releases the decoder reference. */
        return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
    }
    return NULL;
}
526 
527 /* Text encoding/decoding API */
_PyCodec_LookupTextEncoding(const char * encoding,const char * alternate_command)528 PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
529                                        const char *alternate_command)
530 {
531     _Py_IDENTIFIER(_is_text_encoding);
532     PyObject *codec;
533     PyObject *attr;
534     int is_text_codec;
535 
536     codec = _PyCodec_Lookup(encoding);
537     if (codec == NULL)
538         return NULL;
539 
540     /* Backwards compatibility: assume any raw tuple describes a text
541      * encoding, and the same for anything lacking the private
542      * attribute.
543      */
544     if (!PyTuple_CheckExact(codec)) {
545         if (_PyObject_LookupAttrId(codec, &PyId__is_text_encoding, &attr) < 0) {
546             Py_DECREF(codec);
547             return NULL;
548         }
549         if (attr != NULL) {
550             is_text_codec = PyObject_IsTrue(attr);
551             Py_DECREF(attr);
552             if (is_text_codec <= 0) {
553                 Py_DECREF(codec);
554                 if (!is_text_codec)
555                     PyErr_Format(PyExc_LookupError,
556                                  "'%.400s' is not a text encoding; "
557                                  "use %s to handle arbitrary codecs",
558                                  encoding, alternate_command);
559                 return NULL;
560             }
561         }
562     }
563 
564     /* This appears to be a valid text encoding */
565     return codec;
566 }
567 
568 
569 static
codec_getitem_checked(const char * encoding,const char * alternate_command,int index)570 PyObject *codec_getitem_checked(const char *encoding,
571                                 const char *alternate_command,
572                                 int index)
573 {
574     PyObject *codec;
575     PyObject *v;
576 
577     codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
578     if (codec == NULL)
579         return NULL;
580 
581     v = PyTuple_GET_ITEM(codec, index);
582     Py_INCREF(v);
583     Py_DECREF(codec);
584     return v;
585 }
586 
_PyCodec_TextEncoder(const char * encoding)587 static PyObject * _PyCodec_TextEncoder(const char *encoding)
588 {
589     return codec_getitem_checked(encoding, "codecs.encode()", 0);
590 }
591 
_PyCodec_TextDecoder(const char * encoding)592 static PyObject * _PyCodec_TextDecoder(const char *encoding)
593 {
594     return codec_getitem_checked(encoding, "codecs.decode()", 1);
595 }
596 
/* Encode object with the text codec registered for encoding.  Returns
   NULL if the codec lookup is rejected or the encoding fails. */
PyObject *
_PyCodec_EncodeText(PyObject *object,
                    const char *encoding,
                    const char *errors)
{
    PyObject *encoder = _PyCodec_TextEncoder(encoding);

    if (encoder != NULL) {
        /* _PyCodec_EncodeInternal() releases the encoder reference. */
        return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
    }
    return NULL;
}
609 
/* Decode object with the text codec registered for encoding.  Returns
   NULL if the codec lookup is rejected or the decoding fails. */
PyObject *
_PyCodec_DecodeText(PyObject *object,
                    const char *encoding,
                    const char *errors)
{
    PyObject *decoder = _PyCodec_TextDecoder(encoding);

    if (decoder != NULL) {
        /* _PyCodec_DecodeInternal() releases the decoder reference. */
        return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
    }
    return NULL;
}
622 
623 /* Register the error handling callback function error under the name
624    name. This function will be called by the codec when it encounters
625    an unencodable characters/undecodable bytes and doesn't know the
626    callback name, when name is specified as the error parameter
627    in the call to the encode/decode function.
628    Return 0 on success, -1 on error */
PyCodec_RegisterError(const char * name,PyObject * error)629 int PyCodec_RegisterError(const char *name, PyObject *error)
630 {
631     PyInterpreterState *interp = _PyInterpreterState_Get();
632     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
633         return -1;
634     if (!PyCallable_Check(error)) {
635         PyErr_SetString(PyExc_TypeError, "handler must be callable");
636         return -1;
637     }
638     return PyDict_SetItemString(interp->codec_error_registry,
639                                 name, error);
640 }
641 
642 /* Lookup the error handling callback function registered under the
643    name error. As a special case NULL can be passed, in which case
644    the error handling callback for strict encoding will be returned. */
PyCodec_LookupError(const char * name)645 PyObject *PyCodec_LookupError(const char *name)
646 {
647     PyObject *handler = NULL;
648 
649     PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
650     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
651         return NULL;
652 
653     if (name==NULL)
654         name = "strict";
655     handler = _PyDict_GetItemStringWithError(interp->codec_error_registry, name);
656     if (handler) {
657         Py_INCREF(handler);
658     }
659     else if (!PyErr_Occurred()) {
660         PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
661     }
662     return handler;
663 }
664 
/* Set a TypeError saying that an error callback cannot handle objects
   of exc's type. */
static void
wrong_exception_type(PyObject *exc)
{
    const char *type_name = Py_TYPE(exc)->tp_name;

    PyErr_Format(PyExc_TypeError,
                 "don't know how to handle %.200s in error callback",
                 type_name);
}
671 
/* Error callback that raises exc itself.  A TypeError is set instead
   when exc is not an exception instance.  Always returns NULL. */
PyObject *
PyCodec_StrictErrors(PyObject *exc)
{
    if (!PyExceptionInstance_Check(exc)) {
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
        return NULL;
    }
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    return NULL;
}
680 
681 
/* Error callback that skips the offending range: returns the tuple
   (empty string, end), where end is the end position stored in exc.

   exc must be a UnicodeEncodeError, UnicodeDecodeError or
   UnicodeTranslateError; any other object sets a TypeError. */
PyObject *
PyCodec_IgnoreErrors(PyObject *exc)
{
    Py_ssize_t end;
    int failed;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        failed = PyUnicodeEncodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        failed = PyUnicodeDecodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        failed = PyUnicodeTranslateError_GetEnd(exc, &end);
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    if (failed) {
        return NULL;
    }
    return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
}
704 
705 
/* Error callback that substitutes a replacement for the offending range
   and returns the tuple (replacement, end):

   - UnicodeEncodeError: one '?' per character in [start, end);
   - UnicodeDecodeError: a single Py_UNICODE_REPLACEMENT_CHARACTER;
   - UnicodeTranslateError: one Py_UNICODE_REPLACEMENT_CHARACTER per
     character in [start, end).

   Any other object sets a TypeError and returns NULL. */
PyObject *
PyCodec_ReplaceErrors(PyObject *exc)
{
    Py_ssize_t start, end;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        PyObject *filler;
        Py_ssize_t count, pos;
        int kind;
        void *data;

        if (PyUnicodeEncodeError_GetStart(exc, &start) ||
            PyUnicodeEncodeError_GetEnd(exc, &end)) {
            return NULL;
        }
        count = end - start;
        filler = PyUnicode_New(count, '?');
        if (filler == NULL) {
            return NULL;
        }
        kind = PyUnicode_KIND(filler);
        data = PyUnicode_DATA(filler);
        for (pos = 0; pos < count; pos++) {
            PyUnicode_WRITE(kind, data, pos, '?');
        }
        assert(_PyUnicode_CheckConsistency(filler, 1));
        return Py_BuildValue("(Nn)", filler, end);
    }

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        if (PyUnicodeDecodeError_GetEnd(exc, &end)) {
            return NULL;
        }
        return Py_BuildValue("(Cn)",
                             (int)Py_UNICODE_REPLACEMENT_CHARACTER,
                             end);
    }

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        PyObject *filler;
        Py_ssize_t count, pos;
        int kind;
        void *data;

        if (PyUnicodeTranslateError_GetStart(exc, &start) ||
            PyUnicodeTranslateError_GetEnd(exc, &end)) {
            return NULL;
        }
        count = end - start;
        filler = PyUnicode_New(count, Py_UNICODE_REPLACEMENT_CHARACTER);
        if (filler == NULL) {
            return NULL;
        }
        kind = PyUnicode_KIND(filler);
        data = PyUnicode_DATA(filler);
        for (pos = 0; pos < count; pos++) {
            PyUnicode_WRITE(kind, data, pos, Py_UNICODE_REPLACEMENT_CHARACTER);
        }
        assert(_PyUnicode_CheckConsistency(filler, 1));
        return Py_BuildValue("(Nn)", filler, end);
    }

    wrong_exception_type(exc);
    return NULL;
}
760 
/* Error callback that replaces every character in the offending range
   of a UnicodeEncodeError with a decimal character reference of the
   form "&#NNN;" and returns the tuple (replacement, end).

   Each reference uses 1 to 7 decimal digits.  Objects that are not a
   UnicodeEncodeError set a TypeError and return NULL. */
PyObject *
PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
{
    PyObject *object;
    PyObject *res;
    PyObject *restuple;
    Py_ssize_t start, end, i;
    Py_ssize_t ressize = 0;
    unsigned char *outp;

    if (!PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        wrong_exception_type(exc);
        return NULL;
    }

    if (PyUnicodeEncodeError_GetStart(exc, &start) ||
        PyUnicodeEncodeError_GetEnd(exc, &end)) {
        return NULL;
    }
    object = PyUnicodeEncodeError_GetObject(exc);
    if (object == NULL) {
        return NULL;
    }

    /* Limit the range so the worst-case output size (2+7+1 characters
       per input character) cannot exceed PY_SSIZE_T_MAX. */
    if (end - start > PY_SSIZE_T_MAX / (2+7+1)) {
        end = start + PY_SSIZE_T_MAX / (2+7+1);
    }

    /* First pass: compute the size of the replacement. */
    for (i = start; i < end; ++i) {
        /* object is guaranteed to be "ready" */
        Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
        Py_UCS4 limit = 10;
        int digits = 1;

        while (digits < 7 && ch >= limit) {
            limit *= 10;
            ++digits;
        }
        ressize += 2 + digits + 1;      /* "&#" + digits + ";" */
    }

    /* allocate replacement */
    res = PyUnicode_New(ressize, 127);
    if (res == NULL) {
        Py_DECREF(object);
        return NULL;
    }
    outp = PyUnicode_1BYTE_DATA(res);

    /* Second pass: generate replacement */
    for (i = start; i < end; ++i) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
        int digits = 1;
        int base = 1;       /* place value of the most significant digit */

        while (digits < 7 && ch >= (Py_UCS4)base * 10) {
            base *= 10;
            ++digits;
        }

        *outp++ = '&';
        *outp++ = '#';
        while (digits-- > 0) {
            *outp++ = '0' + ch/base;
            ch %= base;
            base /= 10;
        }
        *outp++ = ';';
    }
    assert(_PyUnicode_CheckConsistency(res, 1));
    restuple = Py_BuildValue("(Nn)", res, end);
    Py_DECREF(object);
    return restuple;
}
858 
/* Error callback that replaces the offending range with backslash
   escapes and returns the tuple (replacement, end).

   - UnicodeDecodeError: every byte in [start, end) becomes "\xNN".
   - UnicodeEncodeError / UnicodeTranslateError: every character in
     [start, end) becomes "\xNN" (below 0x100), "\uNNNN" (below 0x10000)
     or "\UNNNNNNNN" (everything else).

   Any other object sets a TypeError and returns NULL. */
PyObject *
PyCodec_BackslashReplaceErrors(PyObject *exc)
{
    PyObject *object;
    Py_ssize_t i;
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *res;
    unsigned char *outp;
    /* Must be Py_ssize_t: the size can reach (1+1+8) times the length of
       the range, which does not fit in an int for large inputs. */
    Py_ssize_t ressize;
    Py_UCS4 c;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        const unsigned char *p;
        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
            return NULL;
        p = (const unsigned char*)PyBytes_AS_STRING(object);
        /* Four output characters ("\xNN") per input byte. */
        res = PyUnicode_New(4 * (end - start), 127);
        if (res == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        outp = PyUnicode_1BYTE_DATA(res);
        for (i = start; i < end; i++, outp += 4) {
            unsigned char byte = p[i];
            outp[0] = '\\';
            outp[1] = 'x';
            outp[2] = Py_hexdigits[(byte>>4)&0xf];
            outp[3] = Py_hexdigits[byte&0xf];
        }

        assert(_PyUnicode_CheckConsistency(res, 1));
        Py_DECREF(object);
        return Py_BuildValue("(Nn)", res, end);
    }
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeTranslateError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeTranslateError_GetObject(exc)))
            return NULL;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    /* Limit the range so the worst-case output size (1+1+8 characters
       per input character) cannot exceed PY_SSIZE_T_MAX. */
    if (end - start > PY_SSIZE_T_MAX / (1+1+8))
        end = start + PY_SSIZE_T_MAX / (1+1+8);

    /* First pass: compute the size of the replacement. */
    for (i = start, ressize = 0; i < end; ++i) {
        /* object is guaranteed to be "ready" */
        c = PyUnicode_READ_CHAR(object, i);
        if (c >= 0x10000) {
            ressize += 1+1+8;
        }
        else if (c >= 0x100) {
            ressize += 1+1+4;
        }
        else
            ressize += 1+1+2;
    }
    res = PyUnicode_New(ressize, 127);
    if (res == NULL) {
        Py_DECREF(object);
        return NULL;
    }
    outp = PyUnicode_1BYTE_DATA(res);

    /* Second pass: write the escapes.  The wider forms emit their extra
       leading hex digits first; the last two digits are shared by all
       three forms. */
    for (i = start; i < end; ++i) {
        c = PyUnicode_READ_CHAR(object, i);
        *outp++ = '\\';
        if (c >= 0x00010000) {
            *outp++ = 'U';
            *outp++ = Py_hexdigits[(c>>28)&0xf];
            *outp++ = Py_hexdigits[(c>>24)&0xf];
            *outp++ = Py_hexdigits[(c>>20)&0xf];
            *outp++ = Py_hexdigits[(c>>16)&0xf];
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else if (c >= 0x100) {
            *outp++ = 'u';
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else
            *outp++ = 'x';
        *outp++ = Py_hexdigits[(c>>4)&0xf];
        *outp++ = Py_hexdigits[c&0xf];
    }

    assert(_PyUnicode_CheckConsistency(res, 1));
    Py_DECREF(object);
    return Py_BuildValue("(Nn)", res, end);
}
965 
/* Cached pointer to the name lookup C API.  It stays NULL until
   PyCodec_NameReplaceErrors() first needs it, at which point it is filled
   in from the PyUnicodeData_CAPSULE_NAME capsule and reused afterwards. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
967 
/* Error handler implementing 'namereplace'.

   For a UnicodeEncodeError, builds an ASCII replacement string in which
   every character of exc.object[start:end] is written as \N{NAME} when the
   name lookup C API (loaded from the PyUnicodeData_CAPSULE_NAME capsule)
   reports a name for it, and otherwise as \xXX, \uXXXX or \UXXXXXXXX
   depending on the size of its code point.

   Returns a new (replacement, end) tuple, or NULL on failure.  Any other
   exception type is passed to wrong_exception_type() and NULL is returned.

   If the total replacement length would exceed PY_SSIZE_T_MAX, only the
   leading characters that fit are replaced and the returned end position
   is shortened accordingly. */
PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
{
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        PyObject *restuple;
        PyObject *object;
        Py_ssize_t i;
        Py_ssize_t start;
        Py_ssize_t end;
        PyObject *res;
        unsigned char *outp;
        Py_ssize_t ressize;
        int replsize;
        Py_UCS4 c;
        char buffer[256]; /* NAME_MAXLEN */
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
        if (!ucnhash_CAPI) {
            /* load the unicode data module */
            ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
                                            PyUnicodeData_CAPSULE_NAME, 1);
            if (!ucnhash_CAPI) {
                /* We own a reference to object from here on; release it
                   on every failure path. */
                Py_DECREF(object);
                return NULL;
            }
        }
        /* First pass: compute the exact size of the replacement string. */
        for (i = start, ressize = 0; i < end; ++i) {
            /* object is guaranteed to be "ready" */
            c = PyUnicode_READ_CHAR(object, i);
            if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
                /* backslash + 'N' + '{' + name + '}' */
                replsize = 1+1+1+(int)strlen(buffer)+1;
            }
            else if (c >= 0x10000) {
                replsize = 1+1+8;
            }
            else if (c >= 0x100) {
                replsize = 1+1+4;
            }
            else
                replsize = 1+1+2;
            /* Stop early rather than overflow the size counter. */
            if (ressize > PY_SSIZE_T_MAX - replsize)
                break;
            ressize += replsize;
        }
        /* Only the characters that were sized above get replaced. */
        end = i;
        res = PyUnicode_New(ressize, 127);
        if (res == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        /* Second pass: write the escapes. */
        for (i = start, outp = PyUnicode_1BYTE_DATA(res);
            i < end; ++i) {
            c = PyUnicode_READ_CHAR(object, i);
            *outp++ = '\\';
            if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
                size_t namelen = strlen(buffer);
                *outp++ = 'N';
                *outp++ = '{';
                memcpy(outp, buffer, namelen);
                outp += namelen;
                *outp++ = '}';
                continue;
            }
            if (c >= 0x00010000) {
                *outp++ = 'U';
                *outp++ = Py_hexdigits[(c>>28)&0xf];
                *outp++ = Py_hexdigits[(c>>24)&0xf];
                *outp++ = Py_hexdigits[(c>>20)&0xf];
                *outp++ = Py_hexdigits[(c>>16)&0xf];
                *outp++ = Py_hexdigits[(c>>12)&0xf];
                *outp++ = Py_hexdigits[(c>>8)&0xf];
            }
            else if (c >= 0x100) {
                *outp++ = 'u';
                *outp++ = Py_hexdigits[(c>>12)&0xf];
                *outp++ = Py_hexdigits[(c>>8)&0xf];
            }
            else
                *outp++ = 'x';
            /* The two lowest hex digits are common to all three forms. */
            *outp++ = Py_hexdigits[(c>>4)&0xf];
            *outp++ = Py_hexdigits[c&0xf];
        }

        assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
        assert(_PyUnicode_CheckConsistency(res, 1));
        /* "N" hands our reference to res over to the tuple. */
        restuple = Py_BuildValue("(Nn)", res, end);
        Py_DECREF(object);
        return restuple;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }
}
1060 
/* Encoding identifiers returned by get_standard_encoding(); ENC_UNKNOWN
   is returned when the encoding name is not recognized. */
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4
1067 
1068 static int
get_standard_encoding(const char * encoding,int * bytelength)1069 get_standard_encoding(const char *encoding, int *bytelength)
1070 {
1071     if (Py_TOLOWER(encoding[0]) == 'u' &&
1072         Py_TOLOWER(encoding[1]) == 't' &&
1073         Py_TOLOWER(encoding[2]) == 'f') {
1074         encoding += 3;
1075         if (*encoding == '-' || *encoding == '_' )
1076             encoding++;
1077         if (encoding[0] == '8' && encoding[1] == '\0') {
1078             *bytelength = 3;
1079             return ENC_UTF8;
1080         }
1081         else if (encoding[0] == '1' && encoding[1] == '6') {
1082             encoding += 2;
1083             *bytelength = 2;
1084             if (*encoding == '\0') {
1085 #ifdef WORDS_BIGENDIAN
1086                 return ENC_UTF16BE;
1087 #else
1088                 return ENC_UTF16LE;
1089 #endif
1090             }
1091             if (*encoding == '-' || *encoding == '_' )
1092                 encoding++;
1093             if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1094                 if (Py_TOLOWER(encoding[0]) == 'b')
1095                     return ENC_UTF16BE;
1096                 if (Py_TOLOWER(encoding[0]) == 'l')
1097                     return ENC_UTF16LE;
1098             }
1099         }
1100         else if (encoding[0] == '3' && encoding[1] == '2') {
1101             encoding += 2;
1102             *bytelength = 4;
1103             if (*encoding == '\0') {
1104 #ifdef WORDS_BIGENDIAN
1105                 return ENC_UTF32BE;
1106 #else
1107                 return ENC_UTF32LE;
1108 #endif
1109             }
1110             if (*encoding == '-' || *encoding == '_' )
1111                 encoding++;
1112             if (Py_TOLOWER(encoding[1]) == 'e' && encoding[2] == '\0') {
1113                 if (Py_TOLOWER(encoding[0]) == 'b')
1114                     return ENC_UTF32BE;
1115                 if (Py_TOLOWER(encoding[0]) == 'l')
1116                     return ENC_UTF32LE;
1117             }
1118         }
1119     }
1120     else if (strcmp(encoding, "CP_UTF8") == 0) {
1121         *bytelength = 3;
1122         return ENC_UTF8;
1123     }
1124     return ENC_UNKNOWN;
1125 }
1126 
1127 /* This handler is declared static until someone demonstrates
1128    a need to call it directly. */
1129 static PyObject *
PyCodec_SurrogatePassErrors(PyObject * exc)1130 PyCodec_SurrogatePassErrors(PyObject *exc)
1131 {
1132     PyObject *restuple;
1133     PyObject *object;
1134     PyObject *encode;
1135     const char *encoding;
1136     int code;
1137     int bytelength;
1138     Py_ssize_t i;
1139     Py_ssize_t start;
1140     Py_ssize_t end;
1141     PyObject *res;
1142 
1143     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1144         unsigned char *outp;
1145         if (PyUnicodeEncodeError_GetStart(exc, &start))
1146             return NULL;
1147         if (PyUnicodeEncodeError_GetEnd(exc, &end))
1148             return NULL;
1149         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1150             return NULL;
1151         if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1152             Py_DECREF(object);
1153             return NULL;
1154         }
1155         if (!(encoding = PyUnicode_AsUTF8(encode))) {
1156             Py_DECREF(object);
1157             Py_DECREF(encode);
1158             return NULL;
1159         }
1160         code = get_standard_encoding(encoding, &bytelength);
1161         Py_DECREF(encode);
1162         if (code == ENC_UNKNOWN) {
1163             /* Not supported, fail with original exception */
1164             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1165             Py_DECREF(object);
1166             return NULL;
1167         }
1168 
1169         if (end - start > PY_SSIZE_T_MAX / bytelength)
1170             end = start + PY_SSIZE_T_MAX / bytelength;
1171         res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
1172         if (!res) {
1173             Py_DECREF(object);
1174             return NULL;
1175         }
1176         outp = (unsigned char*)PyBytes_AsString(res);
1177         for (i = start; i < end; i++) {
1178             /* object is guaranteed to be "ready" */
1179             Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1180             if (!Py_UNICODE_IS_SURROGATE(ch)) {
1181                 /* Not a surrogate, fail with original exception */
1182                 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1183                 Py_DECREF(res);
1184                 Py_DECREF(object);
1185                 return NULL;
1186             }
1187             switch (code) {
1188             case ENC_UTF8:
1189                 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1190                 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1191                 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1192                 break;
1193             case ENC_UTF16LE:
1194                 *outp++ = (unsigned char) ch;
1195                 *outp++ = (unsigned char)(ch >> 8);
1196                 break;
1197             case ENC_UTF16BE:
1198                 *outp++ = (unsigned char)(ch >> 8);
1199                 *outp++ = (unsigned char) ch;
1200                 break;
1201             case ENC_UTF32LE:
1202                 *outp++ = (unsigned char) ch;
1203                 *outp++ = (unsigned char)(ch >> 8);
1204                 *outp++ = (unsigned char)(ch >> 16);
1205                 *outp++ = (unsigned char)(ch >> 24);
1206                 break;
1207             case ENC_UTF32BE:
1208                 *outp++ = (unsigned char)(ch >> 24);
1209                 *outp++ = (unsigned char)(ch >> 16);
1210                 *outp++ = (unsigned char)(ch >> 8);
1211                 *outp++ = (unsigned char) ch;
1212                 break;
1213             }
1214         }
1215         restuple = Py_BuildValue("(On)", res, end);
1216         Py_DECREF(res);
1217         Py_DECREF(object);
1218         return restuple;
1219     }
1220     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1221         const unsigned char *p;
1222         Py_UCS4 ch = 0;
1223         if (PyUnicodeDecodeError_GetStart(exc, &start))
1224             return NULL;
1225         if (PyUnicodeDecodeError_GetEnd(exc, &end))
1226             return NULL;
1227         if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1228             return NULL;
1229         p = (const unsigned char*)PyBytes_AS_STRING(object);
1230         if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1231             Py_DECREF(object);
1232             return NULL;
1233         }
1234         if (!(encoding = PyUnicode_AsUTF8(encode))) {
1235             Py_DECREF(object);
1236             Py_DECREF(encode);
1237             return NULL;
1238         }
1239         code = get_standard_encoding(encoding, &bytelength);
1240         Py_DECREF(encode);
1241         if (code == ENC_UNKNOWN) {
1242             /* Not supported, fail with original exception */
1243             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1244             Py_DECREF(object);
1245             return NULL;
1246         }
1247 
1248         /* Try decoding a single surrogate character. If
1249            there are more, let the codec call us again. */
1250         p += start;
1251         if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1252             switch (code) {
1253             case ENC_UTF8:
1254                 if ((p[0] & 0xf0) == 0xe0 &&
1255                     (p[1] & 0xc0) == 0x80 &&
1256                     (p[2] & 0xc0) == 0x80) {
1257                     /* it's a three-byte code */
1258                     ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1259                 }
1260                 break;
1261             case ENC_UTF16LE:
1262                 ch = p[1] << 8 | p[0];
1263                 break;
1264             case ENC_UTF16BE:
1265                 ch = p[0] << 8 | p[1];
1266                 break;
1267             case ENC_UTF32LE:
1268                 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1269                 break;
1270             case ENC_UTF32BE:
1271                 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1272                 break;
1273             }
1274         }
1275 
1276         Py_DECREF(object);
1277         if (!Py_UNICODE_IS_SURROGATE(ch)) {
1278             /* it's not a surrogate - fail */
1279             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1280             return NULL;
1281         }
1282         res = PyUnicode_FromOrdinal(ch);
1283         if (res == NULL)
1284             return NULL;
1285         return Py_BuildValue("(Nn)", res, start + bytelength);
1286     }
1287     else {
1288         wrong_exception_type(exc);
1289         return NULL;
1290     }
1291 }
1292 
1293 static PyObject *
PyCodec_SurrogateEscapeErrors(PyObject * exc)1294 PyCodec_SurrogateEscapeErrors(PyObject *exc)
1295 {
1296     PyObject *restuple;
1297     PyObject *object;
1298     Py_ssize_t i;
1299     Py_ssize_t start;
1300     Py_ssize_t end;
1301     PyObject *res;
1302 
1303     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1304         char *outp;
1305         if (PyUnicodeEncodeError_GetStart(exc, &start))
1306             return NULL;
1307         if (PyUnicodeEncodeError_GetEnd(exc, &end))
1308             return NULL;
1309         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1310             return NULL;
1311         res = PyBytes_FromStringAndSize(NULL, end-start);
1312         if (!res) {
1313             Py_DECREF(object);
1314             return NULL;
1315         }
1316         outp = PyBytes_AsString(res);
1317         for (i = start; i < end; i++) {
1318             /* object is guaranteed to be "ready" */
1319             Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1320             if (ch < 0xdc80 || ch > 0xdcff) {
1321                 /* Not a UTF-8b surrogate, fail with original exception */
1322                 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1323                 Py_DECREF(res);
1324                 Py_DECREF(object);
1325                 return NULL;
1326             }
1327             *outp++ = ch - 0xdc00;
1328         }
1329         restuple = Py_BuildValue("(On)", res, end);
1330         Py_DECREF(res);
1331         Py_DECREF(object);
1332         return restuple;
1333     }
1334     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1335         PyObject *str;
1336         const unsigned char *p;
1337         Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
1338         int consumed = 0;
1339         if (PyUnicodeDecodeError_GetStart(exc, &start))
1340             return NULL;
1341         if (PyUnicodeDecodeError_GetEnd(exc, &end))
1342             return NULL;
1343         if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1344             return NULL;
1345         p = (const unsigned char*)PyBytes_AS_STRING(object);
1346         while (consumed < 4 && consumed < end-start) {
1347             /* Refuse to escape ASCII bytes. */
1348             if (p[start+consumed] < 128)
1349                 break;
1350             ch[consumed] = 0xdc00 + p[start+consumed];
1351             consumed++;
1352         }
1353         Py_DECREF(object);
1354         if (!consumed) {
1355             /* codec complained about ASCII byte. */
1356             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1357             return NULL;
1358         }
1359         str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1360         if (str == NULL)
1361             return NULL;
1362         return Py_BuildValue("(Nn)", str, start+consumed);
1363     }
1364     else {
1365         wrong_exception_type(exc);
1366         return NULL;
1367     }
1368 }
1369 
1370 
/* Callable registered under the name "strict": forwards the exception to
   PyCodec_StrictErrors().  `self` is not used. */
static PyObject *
strict_errors(PyObject *self, PyObject *exc)
{
    PyObject *outcome = PyCodec_StrictErrors(exc);
    return outcome;
}
1375 
1376 
/* Callable registered under the name "ignore": forwards the exception to
   PyCodec_IgnoreErrors().  `self` is not used. */
static PyObject *
ignore_errors(PyObject *self, PyObject *exc)
{
    PyObject *outcome = PyCodec_IgnoreErrors(exc);
    return outcome;
}
1381 
1382 
/* Callable registered under the name "replace": forwards the exception to
   PyCodec_ReplaceErrors().  `self` is not used. */
static PyObject *
replace_errors(PyObject *self, PyObject *exc)
{
    PyObject *outcome = PyCodec_ReplaceErrors(exc);
    return outcome;
}
1387 
1388 
/* Callable registered under the name "xmlcharrefreplace": forwards the
   exception to PyCodec_XMLCharRefReplaceErrors().  `self` is not used. */
static PyObject *
xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
{
    PyObject *outcome = PyCodec_XMLCharRefReplaceErrors(exc);
    return outcome;
}
1393 
1394 
/* Callable registered under the name "backslashreplace": forwards the
   exception to PyCodec_BackslashReplaceErrors().  `self` is not used. */
static PyObject *
backslashreplace_errors(PyObject *self, PyObject *exc)
{
    PyObject *outcome = PyCodec_BackslashReplaceErrors(exc);
    return outcome;
}
1399 
/* Callable registered under the name "namereplace": forwards the exception
   to PyCodec_NameReplaceErrors().  `self` is not used. */
static PyObject *
namereplace_errors(PyObject *self, PyObject *exc)
{
    PyObject *outcome = PyCodec_NameReplaceErrors(exc);
    return outcome;
}
1404 
/* Callable registered under the name "surrogatepass": forwards the
   exception to PyCodec_SurrogatePassErrors().  `self` is not used. */
static PyObject *
surrogatepass_errors(PyObject *self, PyObject *exc)
{
    PyObject *outcome = PyCodec_SurrogatePassErrors(exc);
    return outcome;
}
1409 
/* Callable registered under the name "surrogateescape": forwards the
   exception to PyCodec_SurrogateEscapeErrors().  `self` is not used. */
static PyObject *
surrogateescape_errors(PyObject *self, PyObject *exc)
{
    PyObject *outcome = PyCodec_SurrogateEscapeErrors(exc);
    return outcome;
}
1414 
_PyCodecRegistry_Init(void)1415 static int _PyCodecRegistry_Init(void)
1416 {
1417     static struct {
1418         const char *name;
1419         PyMethodDef def;
1420     } methods[] =
1421     {
1422         {
1423             "strict",
1424             {
1425                 "strict_errors",
1426                 strict_errors,
1427                 METH_O,
1428                 PyDoc_STR("Implements the 'strict' error handling, which "
1429                           "raises a UnicodeError on coding errors.")
1430             }
1431         },
1432         {
1433             "ignore",
1434             {
1435                 "ignore_errors",
1436                 ignore_errors,
1437                 METH_O,
1438                 PyDoc_STR("Implements the 'ignore' error handling, which "
1439                           "ignores malformed data and continues.")
1440             }
1441         },
1442         {
1443             "replace",
1444             {
1445                 "replace_errors",
1446                 replace_errors,
1447                 METH_O,
1448                 PyDoc_STR("Implements the 'replace' error handling, which "
1449                           "replaces malformed data with a replacement marker.")
1450             }
1451         },
1452         {
1453             "xmlcharrefreplace",
1454             {
1455                 "xmlcharrefreplace_errors",
1456                 xmlcharrefreplace_errors,
1457                 METH_O,
1458                 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1459                           "which replaces an unencodable character with the "
1460                           "appropriate XML character reference.")
1461             }
1462         },
1463         {
1464             "backslashreplace",
1465             {
1466                 "backslashreplace_errors",
1467                 backslashreplace_errors,
1468                 METH_O,
1469                 PyDoc_STR("Implements the 'backslashreplace' error handling, "
1470                           "which replaces malformed data with a backslashed "
1471                           "escape sequence.")
1472             }
1473         },
1474         {
1475             "namereplace",
1476             {
1477                 "namereplace_errors",
1478                 namereplace_errors,
1479                 METH_O,
1480                 PyDoc_STR("Implements the 'namereplace' error handling, "
1481                           "which replaces an unencodable character with a "
1482                           "\\N{...} escape sequence.")
1483             }
1484         },
1485         {
1486             "surrogatepass",
1487             {
1488                 "surrogatepass",
1489                 surrogatepass_errors,
1490                 METH_O
1491             }
1492         },
1493         {
1494             "surrogateescape",
1495             {
1496                 "surrogateescape",
1497                 surrogateescape_errors,
1498                 METH_O
1499             }
1500         }
1501     };
1502 
1503     PyInterpreterState *interp = _PyInterpreterState_Get();
1504     PyObject *mod;
1505     unsigned i;
1506 
1507     if (interp->codec_search_path != NULL)
1508         return 0;
1509 
1510     interp->codec_search_path = PyList_New(0);
1511     interp->codec_search_cache = PyDict_New();
1512     interp->codec_error_registry = PyDict_New();
1513 
1514     if (interp->codec_error_registry) {
1515         for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1516             PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1517             int res;
1518             if (!func)
1519                 Py_FatalError("can't initialize codec error registry");
1520             res = PyCodec_RegisterError(methods[i].name, func);
1521             Py_DECREF(func);
1522             if (res)
1523                 Py_FatalError("can't initialize codec error registry");
1524         }
1525     }
1526 
1527     if (interp->codec_search_path == NULL ||
1528         interp->codec_search_cache == NULL ||
1529         interp->codec_error_registry == NULL)
1530         Py_FatalError("can't initialize codec registry");
1531 
1532     mod = PyImport_ImportModuleNoBlock("encodings");
1533     if (mod == NULL) {
1534         return -1;
1535     }
1536     Py_DECREF(mod);
1537     interp->codecs_initialized = 1;
1538     return 0;
1539 }
1540