1 /* ------------------------------------------------------------------------
2
3 Python Codec Registry and support functions
4
5 Written by Marc-Andre Lemburg (mal@lemburg.com).
6
7 Copyright (c) Corporation for National Research Initiatives.
8
9 ------------------------------------------------------------------------ */
10
11 #include "Python.h"
12 #include "pycore_pystate.h"
13 #include "ucnhash.h"
14 #include <ctype.h>
15
/* Lowercase hexadecimal digit characters: indexing with a value in the
   range 0..15 yields the digit for that value. */
const char *Py_hexdigits = "0123456789abcdef";
17
18 /* --- Codec Registry ----------------------------------------------------- */
19
20 /* Import the standard encodings package which will register the first
21 codec search function.
22
23 This is done in a lazy way so that the Unicode implementation does
24 not downgrade startup time of scripts not needing it.
25
26 ImportErrors are silently ignored by this function. Only one try is
27 made.
28
29 */
30
31 static int _PyCodecRegistry_Init(void); /* Forward */
32
/* Append search_function to the current interpreter's list of codec
   search functions.

   The registry is initialized first if its search path does not exist
   yet. Returns the result of PyList_Append() on success; returns -1 when
   initialization fails, when search_function is NULL, or when it is not
   callable (TypeError). */
int PyCodec_Register(PyObject *search_function)
{
    PyInterpreterState *interp = _PyInterpreterState_Get();

    /* The registry is created lazily, on first use. */
    if (interp->codec_search_path == NULL) {
        if (_PyCodecRegistry_Init() != 0) {
            return -1;
        }
    }

    if (search_function == NULL) {
        PyErr_BadArgument();
        return -1;
    }
    if (!PyCallable_Check(search_function)) {
        PyErr_SetString(PyExc_TypeError, "argument must be callable");
        return -1;
    }

    return PyList_Append(interp->codec_search_path, search_function);
}
51
52 /* Convert a string to a normalized Python string: all characters are
53 converted to lower case, spaces are replaced with underscores. */
54
55 static
normalizestring(const char * string)56 PyObject *normalizestring(const char *string)
57 {
58 size_t i;
59 size_t len = strlen(string);
60 char *p;
61 PyObject *v;
62
63 if (len > PY_SSIZE_T_MAX) {
64 PyErr_SetString(PyExc_OverflowError, "string is too large");
65 return NULL;
66 }
67
68 p = PyMem_Malloc(len + 1);
69 if (p == NULL)
70 return PyErr_NoMemory();
71 for (i = 0; i < len; i++) {
72 char ch = string[i];
73 if (ch == ' ')
74 ch = '-';
75 else
76 ch = Py_TOLOWER(Py_CHARMASK(ch));
77 p[i] = ch;
78 }
79 p[i] = '\0';
80 v = PyUnicode_FromString(p);
81 PyMem_Free(p);
82 return v;
83 }
84
85 /* Lookup the given encoding and return a tuple providing the codec
86 facilities.
87
88 The encoding string is looked up converted to all lower-case
89 characters. This makes encodings looked up through this mechanism
90 effectively case-insensitive.
91
92 If no codec is found, a LookupError is set and NULL returned.
93
94 As side effect, this tries to load the encodings package, if not
95 yet done. This is part of the lazy load strategy for the encodings
96 package.
97
98 */
99
_PyCodec_Lookup(const char * encoding)100 PyObject *_PyCodec_Lookup(const char *encoding)
101 {
102 PyObject *result, *args = NULL, *v;
103 Py_ssize_t i, len;
104
105 if (encoding == NULL) {
106 PyErr_BadArgument();
107 goto onError;
108 }
109
110 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
111 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
112 goto onError;
113
114 /* Convert the encoding to a normalized Python string: all
115 characters are converted to lower case, spaces and hyphens are
116 replaced with underscores. */
117 v = normalizestring(encoding);
118 if (v == NULL)
119 goto onError;
120 PyUnicode_InternInPlace(&v);
121
122 /* First, try to lookup the name in the registry dictionary */
123 result = PyDict_GetItemWithError(interp->codec_search_cache, v);
124 if (result != NULL) {
125 Py_INCREF(result);
126 Py_DECREF(v);
127 return result;
128 }
129 else if (PyErr_Occurred()) {
130 Py_DECREF(v);
131 return NULL;
132 }
133
134 /* Next, scan the search functions in order of registration */
135 args = PyTuple_New(1);
136 if (args == NULL) {
137 Py_DECREF(v);
138 return NULL;
139 }
140 PyTuple_SET_ITEM(args,0,v);
141
142 len = PyList_Size(interp->codec_search_path);
143 if (len < 0)
144 goto onError;
145 if (len == 0) {
146 PyErr_SetString(PyExc_LookupError,
147 "no codec search functions registered: "
148 "can't find encoding");
149 goto onError;
150 }
151
152 for (i = 0; i < len; i++) {
153 PyObject *func;
154
155 func = PyList_GetItem(interp->codec_search_path, i);
156 if (func == NULL)
157 goto onError;
158 result = PyEval_CallObject(func, args);
159 if (result == NULL)
160 goto onError;
161 if (result == Py_None) {
162 Py_DECREF(result);
163 continue;
164 }
165 if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
166 PyErr_SetString(PyExc_TypeError,
167 "codec search functions must return 4-tuples");
168 Py_DECREF(result);
169 goto onError;
170 }
171 break;
172 }
173 if (i == len) {
174 /* XXX Perhaps we should cache misses too ? */
175 PyErr_Format(PyExc_LookupError,
176 "unknown encoding: %s", encoding);
177 goto onError;
178 }
179
180 /* Cache and return the result */
181 if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
182 Py_DECREF(result);
183 goto onError;
184 }
185 Py_DECREF(args);
186 return result;
187
188 onError:
189 Py_XDECREF(args);
190 return NULL;
191 }
192
_PyCodec_Forget(const char * encoding)193 int _PyCodec_Forget(const char *encoding)
194 {
195 PyObject *v;
196 int result;
197
198 PyInterpreterState *interp = _PyInterpreterState_Get();
199 if (interp->codec_search_path == NULL) {
200 return -1;
201 }
202
203 /* Convert the encoding to a normalized Python string: all
204 characters are converted to lower case, spaces and hyphens are
205 replaced with underscores. */
206 v = normalizestring(encoding);
207 if (v == NULL) {
208 return -1;
209 }
210
211 /* Drop the named codec from the internal cache */
212 result = PyDict_DelItem(interp->codec_search_cache, v);
213 Py_DECREF(v);
214
215 return result;
216 }
217
218 /* Codec registry encoding check API. */
219
PyCodec_KnownEncoding(const char * encoding)220 int PyCodec_KnownEncoding(const char *encoding)
221 {
222 PyObject *codecs;
223
224 codecs = _PyCodec_Lookup(encoding);
225 if (!codecs) {
226 PyErr_Clear();
227 return 0;
228 }
229 else {
230 Py_DECREF(codecs);
231 return 1;
232 }
233 }
234
235 static
args_tuple(PyObject * object,const char * errors)236 PyObject *args_tuple(PyObject *object,
237 const char *errors)
238 {
239 PyObject *args;
240
241 args = PyTuple_New(1 + (errors != NULL));
242 if (args == NULL)
243 return NULL;
244 Py_INCREF(object);
245 PyTuple_SET_ITEM(args,0,object);
246 if (errors) {
247 PyObject *v;
248
249 v = PyUnicode_FromString(errors);
250 if (v == NULL) {
251 Py_DECREF(args);
252 return NULL;
253 }
254 PyTuple_SET_ITEM(args, 1, v);
255 }
256 return args;
257 }
258
259 /* Helper function to get a codec item */
260
261 static
codec_getitem(const char * encoding,int index)262 PyObject *codec_getitem(const char *encoding, int index)
263 {
264 PyObject *codecs;
265 PyObject *v;
266
267 codecs = _PyCodec_Lookup(encoding);
268 if (codecs == NULL)
269 return NULL;
270 v = PyTuple_GET_ITEM(codecs, index);
271 Py_DECREF(codecs);
272 Py_INCREF(v);
273 return v;
274 }
275
276 /* Helper functions to create an incremental codec. */
277 static
codec_makeincrementalcodec(PyObject * codec_info,const char * errors,const char * attrname)278 PyObject *codec_makeincrementalcodec(PyObject *codec_info,
279 const char *errors,
280 const char *attrname)
281 {
282 PyObject *ret, *inccodec;
283
284 inccodec = PyObject_GetAttrString(codec_info, attrname);
285 if (inccodec == NULL)
286 return NULL;
287 if (errors)
288 ret = PyObject_CallFunction(inccodec, "s", errors);
289 else
290 ret = _PyObject_CallNoArg(inccodec);
291 Py_DECREF(inccodec);
292 return ret;
293 }
294
295 static
codec_getincrementalcodec(const char * encoding,const char * errors,const char * attrname)296 PyObject *codec_getincrementalcodec(const char *encoding,
297 const char *errors,
298 const char *attrname)
299 {
300 PyObject *codec_info, *ret;
301
302 codec_info = _PyCodec_Lookup(encoding);
303 if (codec_info == NULL)
304 return NULL;
305 ret = codec_makeincrementalcodec(codec_info, errors, attrname);
306 Py_DECREF(codec_info);
307 return ret;
308 }
309
310 /* Helper function to create a stream codec. */
311
312 static
codec_getstreamcodec(const char * encoding,PyObject * stream,const char * errors,const int index)313 PyObject *codec_getstreamcodec(const char *encoding,
314 PyObject *stream,
315 const char *errors,
316 const int index)
317 {
318 PyObject *codecs, *streamcodec, *codeccls;
319
320 codecs = _PyCodec_Lookup(encoding);
321 if (codecs == NULL)
322 return NULL;
323
324 codeccls = PyTuple_GET_ITEM(codecs, index);
325 if (errors != NULL)
326 streamcodec = PyObject_CallFunction(codeccls, "Os", stream, errors);
327 else
328 streamcodec = PyObject_CallFunctionObjArgs(codeccls, stream, NULL);
329 Py_DECREF(codecs);
330 return streamcodec;
331 }
332
333 /* Helpers to work with the result of _PyCodec_Lookup
334
335 */
/* Create an incremental decoder by calling the "incrementaldecoder"
   attribute of codec_info; errors is forwarded when non-NULL. */
PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info,
                                             const char *errors)
{
    PyObject *decoder;

    decoder = codec_makeincrementalcodec(codec_info, errors,
                                         "incrementaldecoder");
    return decoder;
}
342
/* Create an incremental encoder by calling the "incrementalencoder"
   attribute of codec_info; errors is forwarded when non-NULL. */
PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info,
                                             const char *errors)
{
    PyObject *encoder;

    encoder = codec_makeincrementalcodec(codec_info, errors,
                                         "incrementalencoder");
    return encoder;
}
349
350
351 /* Convenience APIs to query the Codec registry.
352
353 All APIs return a codec object with incremented refcount.
354
355 */
356
PyCodec_Encoder(const char * encoding)357 PyObject *PyCodec_Encoder(const char *encoding)
358 {
359 return codec_getitem(encoding, 0);
360 }
361
PyCodec_Decoder(const char * encoding)362 PyObject *PyCodec_Decoder(const char *encoding)
363 {
364 return codec_getitem(encoding, 1);
365 }
366
PyCodec_IncrementalEncoder(const char * encoding,const char * errors)367 PyObject *PyCodec_IncrementalEncoder(const char *encoding,
368 const char *errors)
369 {
370 return codec_getincrementalcodec(encoding, errors, "incrementalencoder");
371 }
372
PyCodec_IncrementalDecoder(const char * encoding,const char * errors)373 PyObject *PyCodec_IncrementalDecoder(const char *encoding,
374 const char *errors)
375 {
376 return codec_getincrementalcodec(encoding, errors, "incrementaldecoder");
377 }
378
/* Create a stream reader for stream by calling item 2 of the codec
   entry for encoding; errors is forwarded when non-NULL. */
PyObject *PyCodec_StreamReader(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    PyObject *reader = codec_getstreamcodec(encoding, stream, errors, 2);

    return reader;
}
385
/* Create a stream writer for stream by calling item 3 of the codec
   entry for encoding; errors is forwarded when non-NULL. */
PyObject *PyCodec_StreamWriter(const char *encoding,
                               PyObject *stream,
                               const char *errors)
{
    PyObject *writer = codec_getstreamcodec(encoding, stream, errors, 3);

    return writer;
}
392
/* Annotate the currently raised exception with the codec operation and
 * encoding name that triggered it, so the reported exception chain points
 * at the codec without the exception type being changed.
 *
 * The work is delegated to _PyErr_TrySetFromCause(), which swaps in an
 * updated clone of the active exception when it can and otherwise leaves
 * the original exception alone.
 */
static void
wrap_codec_error(const char *operation,
                 const char *encoding)
{
    _PyErr_TrySetFromCause("%s with '%s' codec failed",
                           operation,
                           encoding);
}
408
409 /* Encode an object (e.g. a Unicode object) using the given encoding
410 and return the resulting encoded object (usually a Python string).
411
412 errors is passed to the encoder factory as argument if non-NULL. */
413
414 static PyObject *
_PyCodec_EncodeInternal(PyObject * object,PyObject * encoder,const char * encoding,const char * errors)415 _PyCodec_EncodeInternal(PyObject *object,
416 PyObject *encoder,
417 const char *encoding,
418 const char *errors)
419 {
420 PyObject *args = NULL, *result = NULL;
421 PyObject *v = NULL;
422
423 args = args_tuple(object, errors);
424 if (args == NULL)
425 goto onError;
426
427 result = PyEval_CallObject(encoder, args);
428 if (result == NULL) {
429 wrap_codec_error("encoding", encoding);
430 goto onError;
431 }
432
433 if (!PyTuple_Check(result) ||
434 PyTuple_GET_SIZE(result) != 2) {
435 PyErr_SetString(PyExc_TypeError,
436 "encoder must return a tuple (object, integer)");
437 goto onError;
438 }
439 v = PyTuple_GET_ITEM(result,0);
440 Py_INCREF(v);
441 /* We don't check or use the second (integer) entry. */
442
443 Py_DECREF(args);
444 Py_DECREF(encoder);
445 Py_DECREF(result);
446 return v;
447
448 onError:
449 Py_XDECREF(result);
450 Py_XDECREF(args);
451 Py_XDECREF(encoder);
452 return NULL;
453 }
454
455 /* Decode an object (usually a Python string) using the given encoding
456 and return an equivalent object (e.g. a Unicode object).
457
458 errors is passed to the decoder factory as argument if non-NULL. */
459
460 static PyObject *
_PyCodec_DecodeInternal(PyObject * object,PyObject * decoder,const char * encoding,const char * errors)461 _PyCodec_DecodeInternal(PyObject *object,
462 PyObject *decoder,
463 const char *encoding,
464 const char *errors)
465 {
466 PyObject *args = NULL, *result = NULL;
467 PyObject *v;
468
469 args = args_tuple(object, errors);
470 if (args == NULL)
471 goto onError;
472
473 result = PyEval_CallObject(decoder,args);
474 if (result == NULL) {
475 wrap_codec_error("decoding", encoding);
476 goto onError;
477 }
478 if (!PyTuple_Check(result) ||
479 PyTuple_GET_SIZE(result) != 2) {
480 PyErr_SetString(PyExc_TypeError,
481 "decoder must return a tuple (object,integer)");
482 goto onError;
483 }
484 v = PyTuple_GET_ITEM(result,0);
485 Py_INCREF(v);
486 /* We don't check or use the second (integer) entry. */
487
488 Py_DECREF(args);
489 Py_DECREF(decoder);
490 Py_DECREF(result);
491 return v;
492
493 onError:
494 Py_XDECREF(args);
495 Py_XDECREF(decoder);
496 Py_XDECREF(result);
497 return NULL;
498 }
499
500 /* Generic encoding/decoding API */
/* Encode object with the encoder registered for encoding; errors is
   passed to the encoder when non-NULL. Returns the encoded object, or
   NULL if the encoder cannot be found or the encoding step fails. */
PyObject *PyCodec_Encode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *encoder = PyCodec_Encoder(encoding);

    if (encoder == NULL) {
        return NULL;
    }
    /* _PyCodec_EncodeInternal() consumes the encoder reference. */
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
}
513
/* Decode object with the decoder registered for encoding; errors is
   passed to the decoder when non-NULL. Returns the decoded object, or
   NULL if the decoder cannot be found or the decoding step fails. */
PyObject *PyCodec_Decode(PyObject *object,
                         const char *encoding,
                         const char *errors)
{
    PyObject *decoder = PyCodec_Decoder(encoding);

    if (decoder == NULL) {
        return NULL;
    }
    /* _PyCodec_DecodeInternal() consumes the decoder reference. */
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
}
526
527 /* Text encoding/decoding API */
_PyCodec_LookupTextEncoding(const char * encoding,const char * alternate_command)528 PyObject * _PyCodec_LookupTextEncoding(const char *encoding,
529 const char *alternate_command)
530 {
531 _Py_IDENTIFIER(_is_text_encoding);
532 PyObject *codec;
533 PyObject *attr;
534 int is_text_codec;
535
536 codec = _PyCodec_Lookup(encoding);
537 if (codec == NULL)
538 return NULL;
539
540 /* Backwards compatibility: assume any raw tuple describes a text
541 * encoding, and the same for anything lacking the private
542 * attribute.
543 */
544 if (!PyTuple_CheckExact(codec)) {
545 if (_PyObject_LookupAttrId(codec, &PyId__is_text_encoding, &attr) < 0) {
546 Py_DECREF(codec);
547 return NULL;
548 }
549 if (attr != NULL) {
550 is_text_codec = PyObject_IsTrue(attr);
551 Py_DECREF(attr);
552 if (is_text_codec <= 0) {
553 Py_DECREF(codec);
554 if (!is_text_codec)
555 PyErr_Format(PyExc_LookupError,
556 "'%.400s' is not a text encoding; "
557 "use %s to handle arbitrary codecs",
558 encoding, alternate_command);
559 return NULL;
560 }
561 }
562 }
563
564 /* This appears to be a valid text encoding */
565 return codec;
566 }
567
568
569 static
codec_getitem_checked(const char * encoding,const char * alternate_command,int index)570 PyObject *codec_getitem_checked(const char *encoding,
571 const char *alternate_command,
572 int index)
573 {
574 PyObject *codec;
575 PyObject *v;
576
577 codec = _PyCodec_LookupTextEncoding(encoding, alternate_command);
578 if (codec == NULL)
579 return NULL;
580
581 v = PyTuple_GET_ITEM(codec, index);
582 Py_INCREF(v);
583 Py_DECREF(codec);
584 return v;
585 }
586
_PyCodec_TextEncoder(const char * encoding)587 static PyObject * _PyCodec_TextEncoder(const char *encoding)
588 {
589 return codec_getitem_checked(encoding, "codecs.encode()", 0);
590 }
591
_PyCodec_TextDecoder(const char * encoding)592 static PyObject * _PyCodec_TextDecoder(const char *encoding)
593 {
594 return codec_getitem_checked(encoding, "codecs.decode()", 1);
595 }
596
/* Encode object with a text encoding. Same as PyCodec_Encode(), except
   that the encoder is obtained through _PyCodec_TextEncoder(), which
   refuses codecs that are not text encodings. */
PyObject *_PyCodec_EncodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    PyObject *encoder = _PyCodec_TextEncoder(encoding);

    if (encoder == NULL) {
        return NULL;
    }
    /* _PyCodec_EncodeInternal() consumes the encoder reference. */
    return _PyCodec_EncodeInternal(object, encoder, encoding, errors);
}
609
/* Decode object with a text encoding. Same as PyCodec_Decode(), except
   that the decoder is obtained through _PyCodec_TextDecoder(), which
   refuses codecs that are not text encodings. */
PyObject *_PyCodec_DecodeText(PyObject *object,
                              const char *encoding,
                              const char *errors)
{
    PyObject *decoder = _PyCodec_TextDecoder(encoding);

    if (decoder == NULL) {
        return NULL;
    }
    /* _PyCodec_DecodeInternal() consumes the decoder reference. */
    return _PyCodec_DecodeInternal(object, decoder, encoding, errors);
}
622
623 /* Register the error handling callback function error under the name
624 name. This function will be called by the codec when it encounters
625 an unencodable characters/undecodable bytes and doesn't know the
626 callback name, when name is specified as the error parameter
627 in the call to the encode/decode function.
628 Return 0 on success, -1 on error */
PyCodec_RegisterError(const char * name,PyObject * error)629 int PyCodec_RegisterError(const char *name, PyObject *error)
630 {
631 PyInterpreterState *interp = _PyInterpreterState_Get();
632 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
633 return -1;
634 if (!PyCallable_Check(error)) {
635 PyErr_SetString(PyExc_TypeError, "handler must be callable");
636 return -1;
637 }
638 return PyDict_SetItemString(interp->codec_error_registry,
639 name, error);
640 }
641
642 /* Lookup the error handling callback function registered under the
643 name error. As a special case NULL can be passed, in which case
644 the error handling callback for strict encoding will be returned. */
PyCodec_LookupError(const char * name)645 PyObject *PyCodec_LookupError(const char *name)
646 {
647 PyObject *handler = NULL;
648
649 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
650 if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
651 return NULL;
652
653 if (name==NULL)
654 name = "strict";
655 handler = _PyDict_GetItemStringWithError(interp->codec_error_registry, name);
656 if (handler) {
657 Py_INCREF(handler);
658 }
659 else if (!PyErr_Occurred()) {
660 PyErr_Format(PyExc_LookupError, "unknown error handler name '%.400s'", name);
661 }
662 return handler;
663 }
664
/* Raise TypeError naming the type of exc; used by the error callbacks
   when they are handed an object they do not know how to handle. */
static void wrong_exception_type(PyObject *exc)
{
    const char *type_name = Py_TYPE(exc)->tp_name;

    PyErr_Format(PyExc_TypeError,
                 "don't know how to handle %.200s in error callback",
                 type_name);
}
671
/* "strict" error handler: re-raise exc itself. If exc is not an
   exception instance a TypeError is raised instead. Always returns
   NULL. */
PyObject *PyCodec_StrictErrors(PyObject *exc)
{
    if (!PyExceptionInstance_Check(exc)) {
        PyErr_SetString(PyExc_TypeError, "codec must pass exception instance");
        return NULL;
    }
    PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
    return NULL;
}
680
681
/* "ignore" error handler: returns the tuple (empty str, end), where end
   is the `end` attribute of exc. Accepts UnicodeEncodeError,
   UnicodeDecodeError and UnicodeTranslateError instances; anything else
   raises TypeError. Returns NULL on error. */
PyObject *PyCodec_IgnoreErrors(PyObject *exc)
{
    Py_ssize_t end;
    int failed;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        failed = PyUnicodeEncodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        failed = PyUnicodeDecodeError_GetEnd(exc, &end);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        failed = PyUnicodeTranslateError_GetEnd(exc, &end);
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    if (failed) {
        return NULL;
    }
    return Py_BuildValue("(Nn)", PyUnicode_New(0, 0), end);
}
704
705
/* "replace" error handler. Returns (replacement, end):

   - UnicodeEncodeError: a str of (end - start) '?' characters;
   - UnicodeDecodeError: a single Py_UNICODE_REPLACEMENT_CHARACTER;
   - UnicodeTranslateError: a str of (end - start)
     Py_UNICODE_REPLACEMENT_CHARACTER characters.

   Any other object raises TypeError. Returns NULL on error. */
PyObject *PyCodec_ReplaceErrors(PyObject *exc)
{
    Py_ssize_t first, stop, count, pos;
    Py_UCS4 fill;
    PyObject *filler;
    int kind;
    void *data;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &first)) {
            return NULL;
        }
        if (PyUnicodeEncodeError_GetEnd(exc, &stop)) {
            return NULL;
        }
        fill = '?';
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        /* Decoding always yields exactly one replacement character, no
           matter how many bytes were undecodable. */
        if (PyUnicodeDecodeError_GetEnd(exc, &stop)) {
            return NULL;
        }
        return Py_BuildValue("(Cn)",
                             (int)Py_UNICODE_REPLACEMENT_CHARACTER,
                             stop);
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &first)) {
            return NULL;
        }
        if (PyUnicodeTranslateError_GetEnd(exc, &stop)) {
            return NULL;
        }
        fill = Py_UNICODE_REPLACEMENT_CHARACTER;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    /* Encode and translate errors: one fill character per offending
       input character. */
    count = stop - first;
    filler = PyUnicode_New(count, fill);
    if (filler == NULL) {
        return NULL;
    }
    kind = PyUnicode_KIND(filler);
    data = PyUnicode_DATA(filler);
    for (pos = 0; pos < count; pos++) {
        PyUnicode_WRITE(kind, data, pos, fill);
    }
    assert(_PyUnicode_CheckConsistency(filler, 1));
    return Py_BuildValue("(Nn)", filler, stop);
}
760
PyCodec_XMLCharRefReplaceErrors(PyObject * exc)761 PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
762 {
763 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
764 PyObject *restuple;
765 PyObject *object;
766 Py_ssize_t i;
767 Py_ssize_t start;
768 Py_ssize_t end;
769 PyObject *res;
770 unsigned char *outp;
771 Py_ssize_t ressize;
772 Py_UCS4 ch;
773 if (PyUnicodeEncodeError_GetStart(exc, &start))
774 return NULL;
775 if (PyUnicodeEncodeError_GetEnd(exc, &end))
776 return NULL;
777 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
778 return NULL;
779 if (end - start > PY_SSIZE_T_MAX / (2+7+1))
780 end = start + PY_SSIZE_T_MAX / (2+7+1);
781 for (i = start, ressize = 0; i < end; ++i) {
782 /* object is guaranteed to be "ready" */
783 ch = PyUnicode_READ_CHAR(object, i);
784 if (ch<10)
785 ressize += 2+1+1;
786 else if (ch<100)
787 ressize += 2+2+1;
788 else if (ch<1000)
789 ressize += 2+3+1;
790 else if (ch<10000)
791 ressize += 2+4+1;
792 else if (ch<100000)
793 ressize += 2+5+1;
794 else if (ch<1000000)
795 ressize += 2+6+1;
796 else
797 ressize += 2+7+1;
798 }
799 /* allocate replacement */
800 res = PyUnicode_New(ressize, 127);
801 if (res == NULL) {
802 Py_DECREF(object);
803 return NULL;
804 }
805 outp = PyUnicode_1BYTE_DATA(res);
806 /* generate replacement */
807 for (i = start; i < end; ++i) {
808 int digits;
809 int base;
810 ch = PyUnicode_READ_CHAR(object, i);
811 *outp++ = '&';
812 *outp++ = '#';
813 if (ch<10) {
814 digits = 1;
815 base = 1;
816 }
817 else if (ch<100) {
818 digits = 2;
819 base = 10;
820 }
821 else if (ch<1000) {
822 digits = 3;
823 base = 100;
824 }
825 else if (ch<10000) {
826 digits = 4;
827 base = 1000;
828 }
829 else if (ch<100000) {
830 digits = 5;
831 base = 10000;
832 }
833 else if (ch<1000000) {
834 digits = 6;
835 base = 100000;
836 }
837 else {
838 digits = 7;
839 base = 1000000;
840 }
841 while (digits-->0) {
842 *outp++ = '0' + ch/base;
843 ch %= base;
844 base /= 10;
845 }
846 *outp++ = ';';
847 }
848 assert(_PyUnicode_CheckConsistency(res, 1));
849 restuple = Py_BuildValue("(Nn)", res, end);
850 Py_DECREF(object);
851 return restuple;
852 }
853 else {
854 wrong_exception_type(exc);
855 return NULL;
856 }
857 }
858
/* "backslashreplace" error handler.

   - UnicodeDecodeError: every byte in [start, end) becomes "\xNN".
   - UnicodeEncodeError / UnicodeTranslateError: every character in
     [start, end) becomes "\xNN" (below 0x100), "\uNNNN" (below 0x10000)
     or "\UNNNNNNNN" (otherwise).

   Returns (replacement str, end), or NULL with an exception set. Any
   other object raises TypeError. The handled range is shortened when the
   worst-case output size would not fit in a Py_ssize_t, and the returned
   end reflects that. */
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{
    PyObject *object;
    Py_ssize_t i;
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *res;
    unsigned char *outp;
    /* Must be as wide as the sizes it accumulates: the range is clamped
       to PY_SSIZE_T_MAX / 10 characters, which exceeds INT_MAX on 64-bit
       builds. */
    Py_ssize_t ressize;
    Py_UCS4 c;

    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
        const unsigned char *p;
        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeDecodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
            return NULL;
        p = (const unsigned char*)PyBytes_AS_STRING(object);
        /* Each byte expands to 4 characters; keep the product within
           Py_ssize_t, as done for the encode/translate case below. */
        if (end - start > PY_SSIZE_T_MAX / 4)
            end = start + PY_SSIZE_T_MAX / 4;
        res = PyUnicode_New(4 * (end - start), 127);
        if (res == NULL) {
            Py_DECREF(object);
            return NULL;
        }
        outp = PyUnicode_1BYTE_DATA(res);
        for (i = start; i < end; i++, outp += 4) {
            unsigned char byte = p[i];
            outp[0] = '\\';
            outp[1] = 'x';
            outp[2] = Py_hexdigits[(byte>>4)&0xf];
            outp[3] = Py_hexdigits[byte&0xf];
        }

        assert(_PyUnicode_CheckConsistency(res, 1));
        Py_DECREF(object);
        return Py_BuildValue("(Nn)", res, end);
    }
    if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
    }
    else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeTranslateError)) {
        if (PyUnicodeTranslateError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeTranslateError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeTranslateError_GetObject(exc)))
            return NULL;
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }

    /* Worst case is "\U" + 8 hex digits per character. */
    if (end - start > PY_SSIZE_T_MAX / (1+1+8))
        end = start + PY_SSIZE_T_MAX / (1+1+8);

    /* First pass: compute the exact size of the replacement. */
    for (i = start, ressize = 0; i < end; ++i) {
        /* object is guaranteed to be "ready" */
        c = PyUnicode_READ_CHAR(object, i);
        if (c >= 0x10000) {
            ressize += 1+1+8;
        }
        else if (c >= 0x100) {
            ressize += 1+1+4;
        }
        else
            ressize += 1+1+2;
    }
    res = PyUnicode_New(ressize, 127);
    if (res == NULL) {
        Py_DECREF(object);
        return NULL;
    }

    /* Second pass: write the escapes. Each branch emits the prefix
       letter and its leading hex digits; the last two digits are shared
       by all three forms. */
    outp = PyUnicode_1BYTE_DATA(res);
    for (i = start; i < end; ++i) {
        c = PyUnicode_READ_CHAR(object, i);
        *outp++ = '\\';
        if (c >= 0x00010000) {
            *outp++ = 'U';
            *outp++ = Py_hexdigits[(c>>28)&0xf];
            *outp++ = Py_hexdigits[(c>>24)&0xf];
            *outp++ = Py_hexdigits[(c>>20)&0xf];
            *outp++ = Py_hexdigits[(c>>16)&0xf];
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else if (c >= 0x100) {
            *outp++ = 'u';
            *outp++ = Py_hexdigits[(c>>12)&0xf];
            *outp++ = Py_hexdigits[(c>>8)&0xf];
        }
        else
            *outp++ = 'x';
        *outp++ = Py_hexdigits[(c>>4)&0xf];
        *outp++ = Py_hexdigits[c&0xf];
    }

    assert(_PyUnicode_CheckConsistency(res, 1));
    Py_DECREF(object);
    return Py_BuildValue("(Nn)", res, end);
}
965
966 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
967
PyCodec_NameReplaceErrors(PyObject * exc)968 PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
969 {
970 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
971 PyObject *restuple;
972 PyObject *object;
973 Py_ssize_t i;
974 Py_ssize_t start;
975 Py_ssize_t end;
976 PyObject *res;
977 unsigned char *outp;
978 Py_ssize_t ressize;
979 int replsize;
980 Py_UCS4 c;
981 char buffer[256]; /* NAME_MAXLEN */
982 if (PyUnicodeEncodeError_GetStart(exc, &start))
983 return NULL;
984 if (PyUnicodeEncodeError_GetEnd(exc, &end))
985 return NULL;
986 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
987 return NULL;
988 if (!ucnhash_CAPI) {
989 /* load the unicode data module */
990 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
991 PyUnicodeData_CAPSULE_NAME, 1);
992 if (!ucnhash_CAPI)
993 return NULL;
994 }
995 for (i = start, ressize = 0; i < end; ++i) {
996 /* object is guaranteed to be "ready" */
997 c = PyUnicode_READ_CHAR(object, i);
998 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
999 replsize = 1+1+1+(int)strlen(buffer)+1;
1000 }
1001 else if (c >= 0x10000) {
1002 replsize = 1+1+8;
1003 }
1004 else if (c >= 0x100) {
1005 replsize = 1+1+4;
1006 }
1007 else
1008 replsize = 1+1+2;
1009 if (ressize > PY_SSIZE_T_MAX - replsize)
1010 break;
1011 ressize += replsize;
1012 }
1013 end = i;
1014 res = PyUnicode_New(ressize, 127);
1015 if (res==NULL)
1016 return NULL;
1017 for (i = start, outp = PyUnicode_1BYTE_DATA(res);
1018 i < end; ++i) {
1019 c = PyUnicode_READ_CHAR(object, i);
1020 *outp++ = '\\';
1021 if (ucnhash_CAPI->getname(NULL, c, buffer, sizeof(buffer), 1)) {
1022 *outp++ = 'N';
1023 *outp++ = '{';
1024 strcpy((char *)outp, buffer);
1025 outp += strlen(buffer);
1026 *outp++ = '}';
1027 continue;
1028 }
1029 if (c >= 0x00010000) {
1030 *outp++ = 'U';
1031 *outp++ = Py_hexdigits[(c>>28)&0xf];
1032 *outp++ = Py_hexdigits[(c>>24)&0xf];
1033 *outp++ = Py_hexdigits[(c>>20)&0xf];
1034 *outp++ = Py_hexdigits[(c>>16)&0xf];
1035 *outp++ = Py_hexdigits[(c>>12)&0xf];
1036 *outp++ = Py_hexdigits[(c>>8)&0xf];
1037 }
1038 else if (c >= 0x100) {
1039 *outp++ = 'u';
1040 *outp++ = Py_hexdigits[(c>>12)&0xf];
1041 *outp++ = Py_hexdigits[(c>>8)&0xf];
1042 }
1043 else
1044 *outp++ = 'x';
1045 *outp++ = Py_hexdigits[(c>>4)&0xf];
1046 *outp++ = Py_hexdigits[c&0xf];
1047 }
1048
1049 assert(outp == PyUnicode_1BYTE_DATA(res) + ressize);
1050 assert(_PyUnicode_CheckConsistency(res, 1));
1051 restuple = Py_BuildValue("(Nn)", res, end);
1052 Py_DECREF(object);
1053 return restuple;
1054 }
1055 else {
1056 wrong_exception_type(exc);
1057 return NULL;
1058 }
1059 }
1060
/* Result codes of get_standard_encoding(). */
#define ENC_UNKNOWN     -1
#define ENC_UTF8        0
#define ENC_UTF16BE     1
#define ENC_UTF16LE     2
#define ENC_UTF32BE     3
#define ENC_UTF32LE     4

/* Classify the text that follows "utf16" / "utf32" in an encoding name.

   Returns 0 for an empty suffix (native byte order), 'b' or 'l' for an
   optional '-' or '_' followed by "be" or "le" (letters compared
   case-insensitively), and -1 for anything else. */
static int
standard_encoding_byteorder(const char *suffix)
{
    if (*suffix == '\0')
        return 0;
    if (*suffix == '-' || *suffix == '_')
        suffix++;
    /* Make sure suffix[0] is not the terminator before looking at
       suffix[1]; otherwise a name ending right after the separator
       (e.g. "utf-16-") would be read past its end. */
    if (suffix[0] != '\0' &&
        Py_TOLOWER(suffix[1]) == 'e' && suffix[2] == '\0') {
        int order = Py_TOLOWER(suffix[0]);
        if (order == 'b' || order == 'l')
            return order;
    }
    return -1;
}

/* Recognize the spellings of the UTF-8/16/32 encodings.

   Accepted forms are "utf", an optional '-' or '_', then "8", "16" or
   "32"; for 16 and 32 an optional '-' or '_' plus "be" or "le" may
   follow. Letters are matched case-insensitively. The exact string
   "CP_UTF8" is accepted as UTF-8 too. Without a byte-order suffix the
   result depends on WORDS_BIGENDIAN.

   Returns one of the ENC_* codes. *bytelength is set to 3 for UTF-8,
   2 for UTF-16 and 4 for UTF-32; it is also written when a "utf16" or
   "utf32" prefix is followed by an unrecognized suffix (ENC_UNKNOWN is
   returned in that case) and is left untouched otherwise. */
static int
get_standard_encoding(const char *encoding, int *bytelength)
{
    if (Py_TOLOWER(encoding[0]) == 'u' &&
        Py_TOLOWER(encoding[1]) == 't' &&
        Py_TOLOWER(encoding[2]) == 'f') {
        encoding += 3;
        if (*encoding == '-' || *encoding == '_' )
            encoding++;
        if (encoding[0] == '8' && encoding[1] == '\0') {
            *bytelength = 3;
            return ENC_UTF8;
        }
        else if (encoding[0] == '1' && encoding[1] == '6') {
            *bytelength = 2;
            switch (standard_encoding_byteorder(encoding + 2)) {
            case 0:
#ifdef WORDS_BIGENDIAN
                return ENC_UTF16BE;
#else
                return ENC_UTF16LE;
#endif
            case 'b':
                return ENC_UTF16BE;
            case 'l':
                return ENC_UTF16LE;
            default:
                break;
            }
        }
        else if (encoding[0] == '3' && encoding[1] == '2') {
            *bytelength = 4;
            switch (standard_encoding_byteorder(encoding + 2)) {
            case 0:
#ifdef WORDS_BIGENDIAN
                return ENC_UTF32BE;
#else
                return ENC_UTF32LE;
#endif
            case 'b':
                return ENC_UTF32BE;
            case 'l':
                return ENC_UTF32LE;
            default:
                break;
            }
        }
    }
    else if (strcmp(encoding, "CP_UTF8") == 0) {
        *bytelength = 3;
        return ENC_UTF8;
    }
    return ENC_UNKNOWN;
}
1126
1127 /* This handler is declared static until someone demonstrates
1128 a need to call it directly. */
1129 static PyObject *
PyCodec_SurrogatePassErrors(PyObject * exc)1130 PyCodec_SurrogatePassErrors(PyObject *exc)
1131 {
1132 PyObject *restuple;
1133 PyObject *object;
1134 PyObject *encode;
1135 const char *encoding;
1136 int code;
1137 int bytelength;
1138 Py_ssize_t i;
1139 Py_ssize_t start;
1140 Py_ssize_t end;
1141 PyObject *res;
1142
1143 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1144 unsigned char *outp;
1145 if (PyUnicodeEncodeError_GetStart(exc, &start))
1146 return NULL;
1147 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1148 return NULL;
1149 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1150 return NULL;
1151 if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
1152 Py_DECREF(object);
1153 return NULL;
1154 }
1155 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1156 Py_DECREF(object);
1157 Py_DECREF(encode);
1158 return NULL;
1159 }
1160 code = get_standard_encoding(encoding, &bytelength);
1161 Py_DECREF(encode);
1162 if (code == ENC_UNKNOWN) {
1163 /* Not supported, fail with original exception */
1164 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1165 Py_DECREF(object);
1166 return NULL;
1167 }
1168
1169 if (end - start > PY_SSIZE_T_MAX / bytelength)
1170 end = start + PY_SSIZE_T_MAX / bytelength;
1171 res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
1172 if (!res) {
1173 Py_DECREF(object);
1174 return NULL;
1175 }
1176 outp = (unsigned char*)PyBytes_AsString(res);
1177 for (i = start; i < end; i++) {
1178 /* object is guaranteed to be "ready" */
1179 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1180 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1181 /* Not a surrogate, fail with original exception */
1182 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1183 Py_DECREF(res);
1184 Py_DECREF(object);
1185 return NULL;
1186 }
1187 switch (code) {
1188 case ENC_UTF8:
1189 *outp++ = (unsigned char)(0xe0 | (ch >> 12));
1190 *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
1191 *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
1192 break;
1193 case ENC_UTF16LE:
1194 *outp++ = (unsigned char) ch;
1195 *outp++ = (unsigned char)(ch >> 8);
1196 break;
1197 case ENC_UTF16BE:
1198 *outp++ = (unsigned char)(ch >> 8);
1199 *outp++ = (unsigned char) ch;
1200 break;
1201 case ENC_UTF32LE:
1202 *outp++ = (unsigned char) ch;
1203 *outp++ = (unsigned char)(ch >> 8);
1204 *outp++ = (unsigned char)(ch >> 16);
1205 *outp++ = (unsigned char)(ch >> 24);
1206 break;
1207 case ENC_UTF32BE:
1208 *outp++ = (unsigned char)(ch >> 24);
1209 *outp++ = (unsigned char)(ch >> 16);
1210 *outp++ = (unsigned char)(ch >> 8);
1211 *outp++ = (unsigned char) ch;
1212 break;
1213 }
1214 }
1215 restuple = Py_BuildValue("(On)", res, end);
1216 Py_DECREF(res);
1217 Py_DECREF(object);
1218 return restuple;
1219 }
1220 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1221 const unsigned char *p;
1222 Py_UCS4 ch = 0;
1223 if (PyUnicodeDecodeError_GetStart(exc, &start))
1224 return NULL;
1225 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1226 return NULL;
1227 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1228 return NULL;
1229 p = (const unsigned char*)PyBytes_AS_STRING(object);
1230 if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
1231 Py_DECREF(object);
1232 return NULL;
1233 }
1234 if (!(encoding = PyUnicode_AsUTF8(encode))) {
1235 Py_DECREF(object);
1236 Py_DECREF(encode);
1237 return NULL;
1238 }
1239 code = get_standard_encoding(encoding, &bytelength);
1240 Py_DECREF(encode);
1241 if (code == ENC_UNKNOWN) {
1242 /* Not supported, fail with original exception */
1243 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1244 Py_DECREF(object);
1245 return NULL;
1246 }
1247
1248 /* Try decoding a single surrogate character. If
1249 there are more, let the codec call us again. */
1250 p += start;
1251 if (PyBytes_GET_SIZE(object) - start >= bytelength) {
1252 switch (code) {
1253 case ENC_UTF8:
1254 if ((p[0] & 0xf0) == 0xe0 &&
1255 (p[1] & 0xc0) == 0x80 &&
1256 (p[2] & 0xc0) == 0x80) {
1257 /* it's a three-byte code */
1258 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
1259 }
1260 break;
1261 case ENC_UTF16LE:
1262 ch = p[1] << 8 | p[0];
1263 break;
1264 case ENC_UTF16BE:
1265 ch = p[0] << 8 | p[1];
1266 break;
1267 case ENC_UTF32LE:
1268 ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
1269 break;
1270 case ENC_UTF32BE:
1271 ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
1272 break;
1273 }
1274 }
1275
1276 Py_DECREF(object);
1277 if (!Py_UNICODE_IS_SURROGATE(ch)) {
1278 /* it's not a surrogate - fail */
1279 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1280 return NULL;
1281 }
1282 res = PyUnicode_FromOrdinal(ch);
1283 if (res == NULL)
1284 return NULL;
1285 return Py_BuildValue("(Nn)", res, start + bytelength);
1286 }
1287 else {
1288 wrong_exception_type(exc);
1289 return NULL;
1290 }
1291 }
1292
1293 static PyObject *
PyCodec_SurrogateEscapeErrors(PyObject * exc)1294 PyCodec_SurrogateEscapeErrors(PyObject *exc)
1295 {
1296 PyObject *restuple;
1297 PyObject *object;
1298 Py_ssize_t i;
1299 Py_ssize_t start;
1300 Py_ssize_t end;
1301 PyObject *res;
1302
1303 if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeEncodeError)) {
1304 char *outp;
1305 if (PyUnicodeEncodeError_GetStart(exc, &start))
1306 return NULL;
1307 if (PyUnicodeEncodeError_GetEnd(exc, &end))
1308 return NULL;
1309 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
1310 return NULL;
1311 res = PyBytes_FromStringAndSize(NULL, end-start);
1312 if (!res) {
1313 Py_DECREF(object);
1314 return NULL;
1315 }
1316 outp = PyBytes_AsString(res);
1317 for (i = start; i < end; i++) {
1318 /* object is guaranteed to be "ready" */
1319 Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
1320 if (ch < 0xdc80 || ch > 0xdcff) {
1321 /* Not a UTF-8b surrogate, fail with original exception */
1322 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1323 Py_DECREF(res);
1324 Py_DECREF(object);
1325 return NULL;
1326 }
1327 *outp++ = ch - 0xdc00;
1328 }
1329 restuple = Py_BuildValue("(On)", res, end);
1330 Py_DECREF(res);
1331 Py_DECREF(object);
1332 return restuple;
1333 }
1334 else if (PyObject_TypeCheck(exc, (PyTypeObject *)PyExc_UnicodeDecodeError)) {
1335 PyObject *str;
1336 const unsigned char *p;
1337 Py_UCS2 ch[4]; /* decode up to 4 bad bytes. */
1338 int consumed = 0;
1339 if (PyUnicodeDecodeError_GetStart(exc, &start))
1340 return NULL;
1341 if (PyUnicodeDecodeError_GetEnd(exc, &end))
1342 return NULL;
1343 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
1344 return NULL;
1345 p = (const unsigned char*)PyBytes_AS_STRING(object);
1346 while (consumed < 4 && consumed < end-start) {
1347 /* Refuse to escape ASCII bytes. */
1348 if (p[start+consumed] < 128)
1349 break;
1350 ch[consumed] = 0xdc00 + p[start+consumed];
1351 consumed++;
1352 }
1353 Py_DECREF(object);
1354 if (!consumed) {
1355 /* codec complained about ASCII byte. */
1356 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
1357 return NULL;
1358 }
1359 str = PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND, ch, consumed);
1360 if (str == NULL)
1361 return NULL;
1362 return Py_BuildValue("(Nn)", str, start+consumed);
1363 }
1364 else {
1365 wrong_exception_type(exc);
1366 return NULL;
1367 }
1368 }
1369
1370
/* METH_O callable registered as the "strict" error handler: forwards the
   exception object to PyCodec_StrictErrors().  self is not used. */
static PyObject *
strict_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_StrictErrors(exc);
}
1375
1376
/* METH_O callable registered as the "ignore" error handler: forwards the
   exception object to PyCodec_IgnoreErrors().  self is not used. */
static PyObject *
ignore_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_IgnoreErrors(exc);
}
1381
1382
/* METH_O callable registered as the "replace" error handler: forwards the
   exception object to PyCodec_ReplaceErrors().  self is not used. */
static PyObject *
replace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_ReplaceErrors(exc);
}
1387
1388
/* METH_O callable registered as the "xmlcharrefreplace" error handler:
   forwards the exception object to PyCodec_XMLCharRefReplaceErrors().
   self is not used. */
static PyObject *
xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_XMLCharRefReplaceErrors(exc);
}
1393
1394
/* METH_O callable registered as the "backslashreplace" error handler:
   forwards the exception object to PyCodec_BackslashReplaceErrors().
   self is not used. */
static PyObject *
backslashreplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_BackslashReplaceErrors(exc);
}
1399
/* METH_O callable registered as the "namereplace" error handler: forwards
   the exception object to PyCodec_NameReplaceErrors().  self is not used. */
static PyObject *
namereplace_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_NameReplaceErrors(exc);
}
1404
/* METH_O callable registered as the "surrogatepass" error handler: forwards
   the exception object to PyCodec_SurrogatePassErrors().  self is not
   used. */
static PyObject *
surrogatepass_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_SurrogatePassErrors(exc);
}
1409
/* METH_O callable registered as the "surrogateescape" error handler:
   forwards the exception object to PyCodec_SurrogateEscapeErrors().
   self is not used. */
static PyObject *
surrogateescape_errors(PyObject *self, PyObject *exc)
{
    (void)self;
    return PyCodec_SurrogateEscapeErrors(exc);
}
1414
_PyCodecRegistry_Init(void)1415 static int _PyCodecRegistry_Init(void)
1416 {
1417 static struct {
1418 const char *name;
1419 PyMethodDef def;
1420 } methods[] =
1421 {
1422 {
1423 "strict",
1424 {
1425 "strict_errors",
1426 strict_errors,
1427 METH_O,
1428 PyDoc_STR("Implements the 'strict' error handling, which "
1429 "raises a UnicodeError on coding errors.")
1430 }
1431 },
1432 {
1433 "ignore",
1434 {
1435 "ignore_errors",
1436 ignore_errors,
1437 METH_O,
1438 PyDoc_STR("Implements the 'ignore' error handling, which "
1439 "ignores malformed data and continues.")
1440 }
1441 },
1442 {
1443 "replace",
1444 {
1445 "replace_errors",
1446 replace_errors,
1447 METH_O,
1448 PyDoc_STR("Implements the 'replace' error handling, which "
1449 "replaces malformed data with a replacement marker.")
1450 }
1451 },
1452 {
1453 "xmlcharrefreplace",
1454 {
1455 "xmlcharrefreplace_errors",
1456 xmlcharrefreplace_errors,
1457 METH_O,
1458 PyDoc_STR("Implements the 'xmlcharrefreplace' error handling, "
1459 "which replaces an unencodable character with the "
1460 "appropriate XML character reference.")
1461 }
1462 },
1463 {
1464 "backslashreplace",
1465 {
1466 "backslashreplace_errors",
1467 backslashreplace_errors,
1468 METH_O,
1469 PyDoc_STR("Implements the 'backslashreplace' error handling, "
1470 "which replaces malformed data with a backslashed "
1471 "escape sequence.")
1472 }
1473 },
1474 {
1475 "namereplace",
1476 {
1477 "namereplace_errors",
1478 namereplace_errors,
1479 METH_O,
1480 PyDoc_STR("Implements the 'namereplace' error handling, "
1481 "which replaces an unencodable character with a "
1482 "\\N{...} escape sequence.")
1483 }
1484 },
1485 {
1486 "surrogatepass",
1487 {
1488 "surrogatepass",
1489 surrogatepass_errors,
1490 METH_O
1491 }
1492 },
1493 {
1494 "surrogateescape",
1495 {
1496 "surrogateescape",
1497 surrogateescape_errors,
1498 METH_O
1499 }
1500 }
1501 };
1502
1503 PyInterpreterState *interp = _PyInterpreterState_Get();
1504 PyObject *mod;
1505 unsigned i;
1506
1507 if (interp->codec_search_path != NULL)
1508 return 0;
1509
1510 interp->codec_search_path = PyList_New(0);
1511 interp->codec_search_cache = PyDict_New();
1512 interp->codec_error_registry = PyDict_New();
1513
1514 if (interp->codec_error_registry) {
1515 for (i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
1516 PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
1517 int res;
1518 if (!func)
1519 Py_FatalError("can't initialize codec error registry");
1520 res = PyCodec_RegisterError(methods[i].name, func);
1521 Py_DECREF(func);
1522 if (res)
1523 Py_FatalError("can't initialize codec error registry");
1524 }
1525 }
1526
1527 if (interp->codec_search_path == NULL ||
1528 interp->codec_search_cache == NULL ||
1529 interp->codec_error_registry == NULL)
1530 Py_FatalError("can't initialize codec registry");
1531
1532 mod = PyImport_ImportModuleNoBlock("encodings");
1533 if (mod == NULL) {
1534 return -1;
1535 }
1536 Py_DECREF(mod);
1537 interp->codecs_initialized = 1;
1538 return 0;
1539 }
1540