1 /* ------------------------------------------------------------------------
2 
3    Python Codec Registry and support functions
4 
5 Written by Marc-Andre Lemburg (mal@lemburg.com).
6 
7 Copyright (c) Corporation for National Research Initiatives.
8 
9    ------------------------------------------------------------------------ */
10 
11 #include "Python.h"
12 #include <ctype.h>
13 
14 /* --- Codec Registry ----------------------------------------------------- */
15 
16 /* Import the standard encodings package which will register the first
17    codec search function.
18 
19    This is done in a lazy way so that the Unicode implementation does
20    not downgrade startup time of scripts not needing it.
21 
22    ImportErrors are silently ignored by this function. Only one try is
23    made.
24 
25 */
26 
27 static int _PyCodecRegistry_Init(void); /* Forward */
28 
/* Register a codec search function.

   search_function is appended to the interpreter's codec search path,
   after the registry has been set up on first use.

   Returns the result of PyList_Append() on success, and -1 if the
   registry could not be initialized, if search_function is NULL, or if
   search_function is not callable. */
int PyCodec_Register(PyObject *search_function)
{
    PyInterpreterState *state = PyThreadState_GET()->interp;

    /* The registry is created lazily; a non-zero return means failure. */
    if (state->codec_search_path == NULL) {
        if (_PyCodecRegistry_Init() != 0)
            return -1;
    }

    if (search_function == NULL) {
        PyErr_BadArgument();
        return -1;
    }
    if (!PyCallable_Check(search_function)) {
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
        return -1;
    }

    return PyList_Append(state->codec_search_path, search_function);
}
47 
/* Convert a string to a normalized Python string: all characters are
   converted to lower case, spaces are replaced with hyphens. */
50 
51 static
normalizestring(const char * string)52 PyObject *normalizestring(const char *string)
53 {
54     register size_t i;
55     size_t len = strlen(string);
56     char *p;
57     PyObject *v;
58 
59     if (len > PY_SSIZE_T_MAX) {
60         PyErr_SetString(PyExc_OverflowError, "string is too large");
61         return NULL;
62     }
63 
64     v = PyString_FromStringAndSize(NULL, len);
65     if (v == NULL)
66         return NULL;
67     p = PyString_AS_STRING(v);
68     for (i = 0; i < len; i++) {
69         register char ch = string[i];
70         if (ch == ' ')
71             ch = '-';
72         else
73             ch = Py_TOLOWER(Py_CHARMASK(ch));
74         p[i] = ch;
75     }
76     return v;
77 }
78 
79 /* Lookup the given encoding and return a tuple providing the codec
80    facilities.
81 
82    The encoding string is looked up converted to all lower-case
83    characters. This makes encodings looked up through this mechanism
84    effectively case-insensitive.
85 
86    If no codec is found, a LookupError is set and NULL returned.
87 
88    As side effect, this tries to load the encodings package, if not
89    yet done. This is part of the lazy load strategy for the encodings
90    package.
91 
92 */
93 
_PyCodec_Lookup(const char * encoding)94 PyObject *_PyCodec_Lookup(const char *encoding)
95 {
96     PyInterpreterState *interp;
97     PyObject *result, *args = NULL, *v;
98     Py_ssize_t i, len;
99 
100     if (encoding == NULL) {
101         PyErr_BadArgument();
102         goto onError;
103     }
104 
105     interp = PyThreadState_GET()->interp;
106     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
107         goto onError;
108 
109     /* Convert the encoding to a normalized Python string: all
110        characters are converted to lower case, spaces and hyphens are
111        replaced with underscores. */
112     v = normalizestring(encoding);
113     if (v == NULL)
114         goto onError;
115     PyString_InternInPlace(&v);
116 
117     /* First, try to lookup the name in the registry dictionary */
118     result = PyDict_GetItem(interp->codec_search_cache, v);
119     if (result != NULL) {
120         Py_INCREF(result);
121         Py_DECREF(v);
122         return result;
123     }
124 
125     /* Next, scan the search functions in order of registration */
126     args = PyTuple_New(1);
127     if (args == NULL)
128         goto onError;
129     PyTuple_SET_ITEM(args,0,v);
130 
131     len = PyList_Size(interp->codec_search_path);
132     if (len < 0)
133         goto onError;
134     if (len == 0) {
135         PyErr_SetString(PyExc_LookupError,
136                         "no codec search functions registered: "
137                         "can't find encoding");
138         goto onError;
139     }
140 
141     for (i = 0; i < len; i++) {
142         PyObject *func;
143 
144         func = PyList_GetItem(interp->codec_search_path, i);
145         if (func == NULL)
146             goto onError;
147         result = PyEval_CallObject(func, args);
148         if (result == NULL)
149             goto onError;
150         if (result == Py_None) {
151             Py_DECREF(result);
152             continue;
153         }
154         if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
155             PyErr_SetString(PyExc_TypeError,
156                             "codec search functions must return 4-tuples");
157             Py_DECREF(result);
158             goto onError;
159         }
160         break;
161     }
162     if (i == len) {
163         /* XXX Perhaps we should cache misses too ? */
164         PyErr_Format(PyExc_LookupError,
165                      "unknown encoding: %s", encoding);
166         goto onError;
167     }
168 
169     /* Cache and return the result */
170     PyDict_SetItem(interp->codec_search_cache, v, result);
171     Py_DECREF(args);
172     return result;
173 
174  onError:
175     Py_XDECREF(args);
176     return NULL;
177 }
178 
179 static
args_tuple(PyObject * object,const char * errors)180 PyObject *args_tuple(PyObject *object,
181                      const char *errors)
182 {
183     PyObject *args;
184 
185     args = PyTuple_New(1 + (errors != NULL));
186     if (args == NULL)
187         return NULL;
188     Py_INCREF(object);
189     PyTuple_SET_ITEM(args,0,object);
190     if (errors) {
191         PyObject *v;
192 
193         v = PyString_FromString(errors);
194         if (v == NULL) {
195             Py_DECREF(args);
196             return NULL;
197         }
198         PyTuple_SET_ITEM(args, 1, v);
199     }
200     return args;
201 }
202 
203 /* Helper function to get a codec item */
204 
205 static
codec_getitem(const char * encoding,int index)206 PyObject *codec_getitem(const char *encoding, int index)
207 {
208     PyObject *codecs;
209     PyObject *v;
210 
211     codecs = _PyCodec_Lookup(encoding);
212     if (codecs == NULL)
213         return NULL;
214     v = PyTuple_GET_ITEM(codecs, index);
215     Py_DECREF(codecs);
216     Py_INCREF(v);
217     return v;
218 }
219 
220 /* Helper functions to create an incremental codec. */
221 static
codec_makeincrementalcodec(PyObject * codec_info,const char * errors,const char * attrname)222 PyObject *codec_makeincrementalcodec(PyObject *codec_info,
223                                      const char *errors,
224                                      const char *attrname)
225 {
226     PyObject *ret, *inccodec;
227 
228     inccodec = PyObject_GetAttrString(codec_info, attrname);
229     if (inccodec == NULL)
230         return NULL;
231     if (errors)
232         ret = PyObject_CallFunction(inccodec, "s", errors);
233     else
234         ret = PyObject_CallFunction(inccodec, NULL);
235     Py_DECREF(inccodec);
236     return ret;
237 }
238 
239 static
codec_getincrementalcodec(const char * encoding,const char * errors,const char * attrname)240 PyObject *codec_getincrementalcodec(const char *encoding,
241                                     const char *errors,
242                                     const char *attrname)
243 {
244     PyObject *codec_info, *ret;
245 
246     codec_info = _PyCodec_Lookup(encoding);
247     if (codec_info == NULL)
248         return NULL;
249     ret = codec_makeincrementalcodec(codec_info, errors, attrname);
250     Py_DECREF(codec_info);
251     return ret;
252 }
253 
254 /* Helper function to create a stream codec. */
255 
256 static
codec_getstreamcodec(const char * encoding,PyObject * stream,const char * errors,const int index)257 PyObject *codec_getstreamcodec(const char *encoding,
258                                PyObject *stream,
259                                const char *errors,
260                                const int index)
261 {
262     PyObject *codecs, *streamcodec, *codeccls;
263 
264     codecs = _PyCodec_Lookup(encoding);
265     if (codecs == NULL)
266         return NULL;
267 
268     codeccls = PyTuple_GET_ITEM(codecs, index);
269     if (errors != NULL)
270         streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
271     else
272         streamcodec = PyObject_CallFunction(codeccls, "O", stream);
273     Py_DECREF(codecs);
274     return streamcodec;
275 }
276 
277 /* Helpers to work with the result of _PyCodec_Lookup
278 
279  */
/* Create an incremental decoder from the "incrementaldecoder"
   attribute of codec_info; errors is passed on when non-NULL. */
PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
                                             const char *errors)
{
    const char *factory_attr = "incrementaldecoder";

    return codec_makeincrementalcodec(codec_info, errors, factory_attr);
}
286 
/* Create an incremental encoder from the "incrementalencoder"
   attribute of codec_info; errors is passed on when non-NULL. */
PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
                                             const char *errors)
{
    const char *factory_attr = "incrementalencoder";

    return codec_makeincrementalcodec(codec_info, errors, factory_attr);
}
293 
294 
295 /* Convenience APIs to query the Codec registry.
296 
297    All APIs return a codec object with incremented refcount.
298 
299  */
300 
PyCodec_Encoder(const char * encoding)301 PyObject *PyCodec_Encoder(const char *encoding)
302 {
303     return codec_getitem(encoding, 0);
304 }
305 
PyCodec_Decoder(const char * encoding)306 PyObject *PyCodec_Decoder(const char *encoding)
307 {
308     return codec_getitem(encoding, 1);
309 }
310 
PyCodec_IncrementalEncoder(const char * encoding,const char * errors)311 PyObject *PyCodec_IncrementalEncoder(const char *encoding,
312                                      const char *errors)
313 {
314     return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
315 }
316 
PyCodec_IncrementalDecoder(const char * encoding,const char * errors)317 PyObject *PyCodec_IncrementalDecoder(const char *encoding,
318                                      const char *errors)
319 {
320     return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
321 }
322 
/* Create a stream reader for `stream` by calling item 2 of the codec
   tuple registered for `encoding`.  Returns NULL on failure. */
PyObject *PyCodec_StreamReader(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    const int reader_slot = 2;

    return codec_getstreamcodec(encoding, stream, errors, reader_slot);
}
329 
/* Create a stream writer for `stream` by calling item 3 of the codec
   tuple registered for `encoding`.  Returns NULL on failure. */
PyObject *PyCodec_StreamWriter(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    const int writer_slot = 3;

    return codec_getstreamcodec(encoding, stream, errors, writer_slot);
}
336 
337 /* Encode an object (e.g. a Unicode object) using the given encoding
338    and return the resulting encoded object (usually a Python string).
339 
340    errors is passed to the encoder factory as argument if non-NULL. */
341 
342 static PyObject *
_PyCodec_EncodeInternal(PyObject * object,PyObject * encoder,const char * encoding,const char * errors)343 _PyCodec_EncodeInternal(PyObject *object,
344                         PyObject *encoder,
345                         const char *encoding,
346                         const char *errors)
347 {
348     PyObject *args = NULL, *result = NULL;
349     PyObject *v;
350 
351     args = args_tuple(object, errors);
352     if (args == NULL)
353         goto onError;
354 
355     result = PyEval_CallObject(encoder,args);
356     if (result == NULL)
357         goto onError;
358 
359     if (!PyTuple_Check(result) ||
360         PyTuple_GET_SIZE(result) != 2) {
361         PyErr_SetString(PyExc_TypeError,
362                         "encoder must return a tuple (object,integer)");
363         goto onError;
364     }
365     v = PyTuple_GET_ITEM(result,0);
366     Py_INCREF(v);
367     /* We don't check or use the second (integer) entry. */
368 
369     Py_DECREF(args);
370     Py_DECREF(encoder);
371     Py_DECREF(result);
372     return v;
373 
374  onError:
375     Py_XDECREF(result);
376     Py_XDECREF(args);
377     Py_XDECREF(encoder);
378     return NULL;
379 }
380 
381 /* Decode an object (usually a Python string) using the given encoding
382    and return an equivalent object (e.g. a Unicode object).
383 
384    errors is passed to the decoder factory as argument if non-NULL. */
385 
386 static PyObject *
_PyCodec_DecodeInternal(PyObject * object,PyObject * decoder,const char * encoding,const char * errors)387 _PyCodec_DecodeInternal(PyObject *object,
388                         PyObject *decoder,
389                         const char *encoding,
390                         const char *errors)
391 {
392     PyObject *args = NULL, *result = NULL;
393     PyObject *v;
394 
395     args = args_tuple(object, errors);
396     if (args == NULL)
397         goto onError;
398 
399     result = PyEval_CallObject(decoder,args);
400     if (result == NULL)
401         goto onError;
402     if (!PyTuple_Check(result) ||
403         PyTuple_GET_SIZE(result) != 2) {
404         PyErr_SetString(PyExc_TypeError,
405                         "decoder must return a tuple (object,integer)");
406         goto onError;
407     }
408     v = PyTuple_GET_ITEM(result,0);
409     Py_INCREF(v);
410     /* We don't check or use the second (integer) entry. */
411 
412     Py_DECREF(args);
413     Py_DECREF(decoder);
414     Py_DECREF(result);
415     return v;
416 
417  onError:
418     Py_XDECREF(args);
419     Py_XDECREF(decoder);
420     Py_XDECREF(result);
421     return NULL;
422 }
423 
424 /* Generic encoding/decoding API */
/* Encode `object` with the encoder registered for `encoding`.

   Returns the encoded object, or NULL if the encoder cannot be found
   or the encoding step fails. */
PyObject *PyCodec_Encode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *encoder = PyCodec_Encoder(encoding);

    if (encoder != NULL) {
        /* _PyCodec_EncodeInternal releases the encoder reference. */
        return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
    }
    return NULL;
}
437 
/* Decode `object` with the decoder registered for `encoding`.

   Returns the decoded object, or NULL if the decoder cannot be found
   or the decoding step fails. */
PyObject *PyCodec_Decode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *decoder = PyCodec_Decoder(encoding);

    if (decoder != NULL) {
        /* _PyCodec_DecodeInternal releases the decoder reference. */
        return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
    }
    return NULL;
}
450 
451 /* Text encoding/decoding API */
_PyCodec_LookupTextEncoding(const char * encoding,const char * alternate_command)452 PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
453                                        const char *alternate_command)
454 {
455     PyObject *codec;
456     PyObject *attr;
457     int is_text_codec;
458 
459     codec = _PyCodec_Lookup(encoding);
460     if (codec == NULL)
461         return NULL;
462 
463     /* Backwards compatibility: assume any raw tuple describes a text
464      * encoding, and the same for anything lacking the private
465      * attribute.
466      */
467     if (Py_Py3kWarningFlag && !PyTuple_CheckExact(codec)) {
468         attr = PyObject_GetAttrString(codec, "_is_text_encoding");
469         if (attr == NULL) {
470             if (!PyErr_ExceptionMatches(PyExc_AttributeError))
471                 goto onError;
472             PyErr_Clear();
473         } else {
474             is_text_codec = PyObject_IsTrue(attr);
475             Py_DECREF(attr);
476             if (is_text_codec < 0)
477                 goto onError;
478             if (!is_text_codec) {
479                 PyObject *msg = PyString_FromFormat(
480                             "'%.400s' is not a text encoding; "
481                             "use %s to handle arbitrary codecs",
482                             encoding, alternate_command);
483                 if (msg == NULL)
484                     goto onError;
485                 if (PyErr_WarnPy3k(PyString_AS_STRING(msg), 1) < 0) {
486                     Py_DECREF(msg);
487                     goto onError;
488                 }
489                 Py_DECREF(msg);
490             }
491         }
492     }
493 
494     /* This appears to be a valid text encoding */
495     return codec;
496 
497  onError:
498     Py_DECREF(codec);
499     return NULL;
500 }
501 
502 
503 static
codec_getitem_checked(const char * encoding,const char * alternate_command,int index)504 PyObject *codec_getitem_checked(const char *encoding,
505                                 const char *alternate_command,
506                                 int index)
507 {
508     PyObject *codec;
509     PyObject *v;
510 
511     codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
512     if (codec == NULL)
513         return NULL;
514 
515     v = PyTuple_GET_ITEM(codec, index);
516     Py_INCREF(v);
517     Py_DECREF(codec);
518     return v;
519 }
520 
_PyCodec_TextEncoder(const char * encoding)521 static PyObject * _PyCodec_TextEncoder(const char *encoding)
522 {
523     return codec_getitem_checked(encoding, "codecs.encode()", 0);
524 }
525 
_PyCodec_TextDecoder(const char * encoding)526 static PyObject * _PyCodec_TextDecoder(const char *encoding)
527 {
528     return codec_getitem_checked(encoding, "codecs.decode()", 1);
529 }
530 
/* Encode `object` with the encoder for `encoding`, obtained through
   the text-encoding check.

   Returns the encoded object, or NULL if the encoder cannot be
   obtained or the encoding step fails. */
PyObject *_PyCodec_EncodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    PyObject *encoder = _PyCodec_TextEncoder(encoding);

    if (encoder != NULL) {
        /* _PyCodec_EncodeInternal releases the encoder reference. */
        return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
    }
    return NULL;
}
543 
/* Decode `object` with the decoder for `encoding`, obtained through
   the text-encoding check.

   Returns the decoded object, or NULL if the decoder cannot be
   obtained or the decoding step fails. */
PyObject *_PyCodec_DecodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    PyObject *decoder = _PyCodec_TextDecoder(encoding);

    if (decoder != NULL) {
        /* _PyCodec_DecodeInternal releases the decoder reference. */
        return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
    }
    return NULL;
}
556 
/* Register the error handling callback function error under the name
   name. This function will be called by the codec when it encounters
   unencodable characters/undecodable bytes and doesn't know the
   callback name, when name is specified as the error parameter
   in the call to the encode/decode function.
   Return 0 on success, -1 on error */
PyCodec_RegisterError(const char * name,PyObject * error)563 int PyCodec_RegisterError(const char *name, PyObject *error)
564 {
565     PyInterpreterState *interp = PyThreadState_GET()->interp;
566     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
567         return -1;
568     if (!PyCallable_Check(error)) {
569         PyErr_SetString(PyExc_TypeError, "handler must be callable");
570         return -1;
571     }
572     return PyDict_SetItemString(interp->codec_error_registry,
573                                 (char *)name, error);
574 }
575 
576 /* Lookup the error handling callback function registered under the
577    name error. As a special case NULL can be passed, in which case
578    the error handling callback for strict encoding will be returned. */
PyCodec_LookupError(const char * name)579 PyObject *PyCodec_LookupError(const char *name)
580 {
581     PyObject *handler = NULL;
582 
583     PyInterpreterState *interp = PyThreadState_GET()->interp;
584     if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
585         return NULL;
586 
587     if (name==NULL)
588         name = "strict";
589     handler = PyDict_GetItemString(interp->codec_error_registry, (char *)name);
590     if (!handler)
591         PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
592     else
593         Py_INCREF(handler);
594     return handler;
595 }
596 
/* Set a TypeError that names the class of `exc`, for error callbacks
   that were handed an exception they cannot handle.

   If the class, its name, or the string form of the name cannot be
   obtained, the function returns without setting the TypeError. */
static void wrong_exception_type(PyObject *exc)
{
    PyObject *cls;
    PyObject *cls_name;
    PyObject *text;

    cls = PyObject_GetAttrString(exc, "__class__");
    if (cls == NULL)
        return;

    cls_name = PyObject_GetAttrString(cls, "__name__");
    Py_DECREF(cls);
    if (cls_name == NULL)
        return;

    text = PyObject_Str(cls_name);
    Py_DECREF(cls_name);
    if (text == NULL)
        return;

    PyErr_Format(PyExc_TypeError,
        "don't know how to handle %.400s in error callback",
        PyString_AS_STRING(text));
    Py_DECREF(text);
}
615 
/* 'strict' error handler: raise `exc` itself when it is an exception
   instance, otherwise raise TypeError.  Always returns NULL. */
PyObject *PyCodec_StrictErrors(PyObject *exc)
{
    if (!PyExceptionInstance_Check(exc)) {
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
        return NULL;
    }

    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    return NULL;
}
624 
625 
626 #ifdef Py_USING_UNICODE
/* 'ignore' error handler: return the tuple (u'', end), where end is
   the end position stored in the Unicode encode/decode/translate error
   `exc`.

   Returns NULL if the end position cannot be read, or NULL with
   TypeError set (via wrong_exception_type) for any other exception
   type. */
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
{
    Py_ssize_t end;
    int failed;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError))
        failed = PyUnicodeEncodeError_GetEnd(exc, &end);
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError))
        failed = PyUnicodeDecodeError_GetEnd(exc, &end);
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError))
        failed = PyUnicodeTranslateError_GetEnd(exc, &end);
    else {
        wrong_exception_type(exc);
        return NULL;
    }
    if (failed)
        return NULL;

    /* ouch: passing NULL, 0, pos gives None instead of u'', so any
       non-NULL pointer is passed together with a length of 0. */
    return Py_BuildValue("(u#n)", &end, 0, end);
}
650 
651 
PyCodec_ReplaceErrors(PyObject * exc)652 PyObject *PyCodec_ReplaceErrors(PyObject *exc)
653 {
654     PyObject *restuple;
655     Py_ssize_t start;
656     Py_ssize_t end;
657     Py_ssize_t i;
658 
659     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
660         PyObject *res;
661         Py_UNICODE *p;
662         if (PyUnicodeEncodeError_GetStart(exc, &start))
663             return NULL;
664         if (PyUnicodeEncodeError_GetEnd(exc, &end))
665             return NULL;
666         res = PyUnicode_FromUnicode(NULL, end-start);
667         if (res == NULL)
668             return NULL;
669         for (p = PyUnicode_AS_UNICODE(res), i = start;
670             i<end; ++p, ++i)
671             *p = '?';
672         restuple = Py_BuildValue("(On)", res, end);
673         Py_DECREF(res);
674         return restuple;
675     }
676     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
677         Py_UNICODE res = Py_UNICODE_REPLACEMENT_CHARACTER;
678         if (PyUnicodeDecodeError_GetEnd(exc, &end))
679             return NULL;
680         return Py_BuildValue("(u#n)", &res, (Py_ssize_t)1, end);
681     }
682     else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
683         PyObject *res;
684         Py_UNICODE *p;
685         if (PyUnicodeTranslateError_GetStart(exc, &start))
686             return NULL;
687         if (PyUnicodeTranslateError_GetEnd(exc, &end))
688             return NULL;
689         res = PyUnicode_FromUnicode(NULL, end-start);
690         if (res == NULL)
691             return NULL;
692         for (p = PyUnicode_AS_UNICODE(res), i = start;
693             i<end; ++p, ++i)
694             *p = Py_UNICODE_REPLACEMENT_CHARACTER;
695         restuple = Py_BuildValue("(On)", res, end);
696         Py_DECREF(res);
697         return restuple;
698     }
699     else {
700         wrong_exception_type(exc);
701         return NULL;
702     }
703 }
704 
PyCodec_XMLCharRefReplaceErrors(PyObject * exc)705 PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
706 {
707     if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
708         PyObject *restuple;
709         PyObject *object;
710         Py_ssize_t start;
711         Py_ssize_t end;
712         PyObject *res;
713         Py_UNICODE *p;
714         Py_UNICODE *startp;
715         Py_UNICODE *e;
716         Py_UNICODE *outp;
717         Py_ssize_t ressize;
718         if (PyUnicodeEncodeError_GetStart(exc, &start))
719             return NULL;
720         if (PyUnicodeEncodeError_GetEnd(exc, &end))
721             return NULL;
722         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
723             return NULL;
724         startp = PyUnicode_AS_UNICODE(object);
725         if (end - start > PY_SSIZE_T_MAX / (2+7+1)) {
726             end = start + PY_SSIZE_T_MAX / (2+7+1);
727 #ifndef Py_UNICODE_WIDE
728             if (0xD800 <= startp[end - 1] && startp[end - 1] <= 0xDBFF)
729                 end--;
730 #endif
731         }
732         e = startp + end;
733         for (p = startp+start, ressize = 0; p < e;) {
734             Py_UCS4 ch = *p++;
735 #ifndef Py_UNICODE_WIDE
736             if ((0xD800 <= ch && ch <= 0xDBFF) &&
737                 (p < e) &&
738                 (0xDC00 <= *p && *p <= 0xDFFF)) {
739                 ch = ((((ch & 0x03FF) << 10) |
740                        ((Py_UCS4)*p++ & 0x03FF)) + 0x10000);
741             }
742 #endif
743             if (ch < 10)
744                 ressize += 2+1+1;
745             else if (ch < 100)
746                 ressize += 2+2+1;
747             else if (ch < 1000)
748                 ressize += 2+3+1;
749             else if (ch < 10000)
750                 ressize += 2+4+1;
751             else if (ch < 100000)
752                 ressize += 2+5+1;
753             else if (ch < 1000000)
754                 ressize += 2+6+1;
755             else
756                 ressize += 2+7+1;
757         }
758         /* allocate replacement */
759         res = PyUnicode_FromUnicode(NULL, ressize);
760         if (res == NULL) {
761             Py_DECREF(object);
762             return NULL;
763         }
764         /* generate replacement */
765         for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); p < e;) {
766             int digits;
767             int base;
768             Py_UCS4 ch = *p++;
769 #ifndef Py_UNICODE_WIDE
770             if ((0xD800 <= ch && ch <= 0xDBFF) &&
771                 (p < startp+end) &&
772                 (0xDC00 <= *p && *p <= 0xDFFF)) {
773                 ch = ((((ch & 0x03FF) << 10) |
774                        ((Py_UCS4)*p++ & 0x03FF)) + 0x10000);
775             }
776 #endif
777             *outp++ = '&';
778             *outp++ = '#';
779             if (ch < 10) {
780                 digits = 1;
781                 base = 1;
782             }
783             else if (ch < 100) {
784                 digits = 2;
785                 base = 10;
786             }
787             else if (ch < 1000) {
788                 digits = 3;
789                 base = 100;
790             }
791             else if (ch < 10000) {
792                 digits = 4;
793                 base = 1000;
794             }
795             else if (ch < 100000) {
796                 digits = 5;
797                 base = 10000;
798             }
799             else if (ch < 1000000) {
800                 digits = 6;
801                 base = 100000;
802             }
803             else {
804                 digits = 7;
805                 base = 1000000;
806             }
807             while (digits-->0) {
808                 *outp++ = '0' + ch/base;
809                 ch %= base;
810                 base /= 10;
811             }
812             *outp++ = ';';
813         }
814         restuple = Py_BuildValue("(On)", res, end);
815         Py_DECREF(res);
816         Py_DECREF(object);
817         return restuple;
818     }
819     else {
820         wrong_exception_type(exc);
821         return NULL;
822     }
823 }
824 
825 static Py_UNICODE hexdigits[] = {
826     '0', '1', '2', '3', '4', '5', '6', '7',
827     '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
828 };
829 
/* 'backslashreplace' error handler: for a UnicodeEncodeError, return
   (replacement, end) where replacement holds one backslash escape per
   character in the range [start, end) of the string being encoded:
   \xNN for values below 0x100, \uNNNN for larger values and, on wide
   builds, \UNNNNNNNN for values of 0x10000 and above.

   The range is shortened when it is longer than
   PY_SSIZE_T_MAX / (1+1+8) characters; the (possibly adjusted) end is
   the one returned.

   Returns NULL if the exception attributes cannot be read or the
   result cannot be built, or NULL with TypeError set (via
   wrong_exception_type) for any other exception type. */
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{
    PyObject *restuple;
    PyObject *object;
    PyObject *res;
    Py_ssize_t start;
    Py_ssize_t end;
    Py_ssize_t ressize = 0;
    Py_UNICODE *first;
    Py_UNICODE *stop;
    Py_UNICODE *p;
    Py_UNICODE *outp;

    if (!PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        wrong_exception_type(exc);
        return NULL;
    }

    if (PyUnicodeEncodeError_GetStart(exc, &start))
        return NULL;
    if (PyUnicodeEncodeError_GetEnd(exc, &end))
        return NULL;
    object = PyUnicodeEncodeError_GetObject(exc);
    if (object == NULL)
        return NULL;

    /* Each input character needs at most 1+1+8 output characters;
       limit the range so the size computation cannot overflow. */
    if (end - start > PY_SSIZE_T_MAX / (1+1+8))
        end = start + PY_SSIZE_T_MAX / (1+1+8);
    first = PyUnicode_AS_UNICODE(object) + start;
    stop = PyUnicode_AS_UNICODE(object) + end;

    /* First pass: compute the size of the replacement. */
    for (p = first; p < stop; ++p) {
#ifdef Py_UNICODE_WIDE
        if (*p >= 0x00010000) {
            ressize += 1+1+8;
            continue;
        }
#endif
        ressize += (*p >= 0x100) ? 1+1+4 : 1+1+2;
    }

    res = PyUnicode_FromUnicode(NULL, ressize);
    if (res == NULL) {
        Py_DECREF(object);
        return NULL;
    }

    /* Second pass: write backslash, escape letter, then the hex digits
       from the most significant nibble down to the least. */
    outp = PyUnicode_AS_UNICODE(res);
    for (p = first; p < stop; ++p) {
        Py_UNICODE c = *p;
        Py_UNICODE marker;
        int shift;

#ifdef Py_UNICODE_WIDE
        if (c >= 0x00010000) {
            marker = 'U';
            shift = 28;     /* 8 digits */
        }
        else
#endif
        if (c >= 0x100) {
            marker = 'u';
            shift = 12;     /* 4 digits */
        }
        else {
            marker = 'x';
            shift = 4;      /* 2 digits */
        }

        *outp++ = '\\';
        *outp++ = marker;
        for (; shift >= 0; shift -= 4)
            *outp++ = hexdigits[(c>>shift)&0xf];
    }

    restuple = Py_BuildValue("(On)", res, end);
    Py_DECREF(res);
    Py_DECREF(object);
    return restuple;
}
905 #endif
906 
/* Method-table entry point for the "strict" error handler; forwards
   exc to PyCodec_StrictErrors().  self is not used. */
static PyObject *strict_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_StrictErrors(exc);
}
911 
912 
913 #ifdef Py_USING_UNICODE
/* Method-table entry point for the "ignore" error handler; forwards
   exc to PyCodec_IgnoreErrors().  self is not used. */
static PyObject *ignore_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_IgnoreErrors(exc);
}
918 
919 
/* METH_O callback that _PyCodecRegistry_Init() registers under the
   error-handler name "replace".  `self` is not used; `exc` is passed
   unchanged to PyCodec_ReplaceErrors() and that call's result is
   returned as-is. */
static PyObject *
replace_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_ReplaceErrors(exc);

    return result;
}
924 
925 
/* METH_O callback that _PyCodecRegistry_Init() registers under the
   error-handler name "xmlcharrefreplace".  `self` is not used; `exc`
   is passed unchanged to PyCodec_XMLCharRefReplaceErrors() and that
   call's result is returned as-is. */
static PyObject *
xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_XMLCharRefReplaceErrors(exc);

    return result;
}
930 
931 
/* METH_O callback that _PyCodecRegistry_Init() registers under the
   error-handler name "backslashreplace".  `self` is not used; `exc`
   is passed unchanged to PyCodec_BackslashReplaceErrors() and that
   call's result is returned as-is. */
static PyObject *
backslashreplace_errors(PyObject *self, PyObject *exc)
{
    PyObject *result = PyCodec_BackslashReplaceErrors(exc);

    return result;
}
936 #endif
937 
_PyCodecRegistry_Init(void)938 static int _PyCodecRegistry_Init(void)
939 {
940     static struct {
941         char *name;
942         PyMethodDef def;
943     } methods[] =
944     {
945         {
946             "strict",
947             {
948                 "strict_errors",
949                 strict_errors,
950                 METH_O,
951                 PyDoc_STR("Implements the 'strict' error handling, which "
952                           "raises a UnicodeError on coding errors.")
953             }
954         },
955 #ifdef Py_USING_UNICODE
956         {
957             "ignore",
958             {
959                 "ignore_errors",
960                 ignore_errors,
961                 METH_O,
962                 PyDoc_STR("Implements the 'ignore' error handling, which "
963                           "ignores malformed data and continues.")
964             }
965         },
966         {
967             "replace",
968             {
969                 "replace_errors",
970                 replace_errors,
971                 METH_O,
972                 PyDoc_STR("Implements the 'replace' error handling, which "
973                           "replaces malformed data with a replacement marker.")
974             }
975         },
976         {
977             "xmlcharrefreplace",
978             {
979                 "xmlcharrefreplace_errors",
980                 xmlcharrefreplace_errors,
981                 METH_O,
982                 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
983                           "which replaces an unencodable character with the "
984                           "appropriate XML character reference.")
985             }
986         },
987         {
988             "backslashreplace",
989             {
990                 "backslashreplace_errors",
991                 backslashreplace_errors,
992                 METH_O,
993                 PyDoc_STR("Implements the 'backslashreplace' error handling, "
994                           "which replaces an unencodable character with a "
995                           "backslashed escape sequence.")
996             }
997         }
998 #endif
999     };
1000 
1001     PyInterpreterState *interp = PyThreadState_GET()->interp;
1002     PyObject *mod;
1003     unsigned i;
1004 
1005     if (interp->codec_search_path != NULL)
1006         return 0;
1007 
1008     interp->codec_search_path = PyList_New(0);
1009     interp->codec_search_cache = PyDict_New();
1010     interp->codec_error_registry = PyDict_New();
1011 
1012     if (interp->codec_error_registry) {
1013         for (i = 0; i < sizeof(methods)/sizeof(methods[0]); ++i) {
1014             PyObject *func = PyCFunction_New(&methods[i].def, NULL);
1015             int res;
1016             if (!func)
1017                 Py_FatalError("can't initialize codec error registry");
1018             res = PyCodec_RegisterError(methods[i].name, func);
1019             Py_DECREF(func);
1020             if (res)
1021                 Py_FatalError("can't initialize codec error registry");
1022         }
1023     }
1024 
1025     if (interp->codec_search_path == NULL ||
1026         interp->codec_search_cache == NULL ||
1027         interp->codec_error_registry == NULL)
1028         Py_FatalError("can't initialize codec registry");
1029 
1030     mod = PyImport_ImportModuleLevel("encodings", NULL, NULL, NULL, 0);
1031     if (mod == NULL) {
1032         if (PyErr_ExceptionMatches(PyExc_ImportError)) {
1033             /* Ignore ImportErrors... this is done so that
1034                distributions can disable the encodings package. Note
1035                that other errors are not masked, e.g. SystemErrors
1036                raised to inform the user of an error in the Python
1037                configuration are still reported back to the user. */
1038             PyErr_Clear();
1039             return 0;
1040         }
1041         return -1;
1042     }
1043     Py_DECREF(mod);
1044     return 0;
1045 }
1046