# Copyright 2014 Red Hat, Inc.
# All Rights Reserved.
#
#    Licensed under the Apache License, Version 2.0 (the "License"); you may
#    not use this file except in compliance with the License. You may obtain
#    a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#    Unless required by applicable law or agreed to in writing, software
#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
#    License for the specific language governing permissions and limitations
#    under the License.

import sys


# NOTE(blk-u): This provides a symbol that can be overridden just for this
# module during testing. sys.getfilesystemencoding() is called by coverage so
# mocking it globally caused the coverage job to fail.
_getfilesystemencoding = sys.getfilesystemencoding


def safe_decode(text, incoming=None, errors='strict'):
    """Decode a bytes string to text using `incoming`.

    Text (``str``) input is returned unchanged.

    :param text: The ``str`` or ``bytes`` value to decode
    :param incoming: Text's current encoding. When falsy,
                     ``sys.stdin.encoding`` is used if it is set, otherwise
                     ``sys.getdefaultencoding()``.
    :param errors: Errors handling policy. See here for valid
        values https://docs.python.org/3/library/codecs.html#error-handlers
    :returns: text or a unicode `incoming` encoded
              representation of it.
    :raises TypeError: If text is neither a ``str`` nor a ``bytes`` instance
    """
    if not isinstance(text, (str, bytes)):
        raise TypeError("%s can't be decoded" % type(text))

    if isinstance(text, str):
        return text

    if not incoming:
        incoming = (getattr(sys.stdin, 'encoding', None) or
                    sys.getdefaultencoding())

    try:
        return text.decode(incoming, errors)
    except UnicodeDecodeError:
        # Note(flaper87) If we get here, it means that
        # sys.stdin.encoding / sys.getdefaultencoding
        # didn't return a suitable encoding to decode
        # text. This happens mostly when global LANG
        # var is not set correctly and there's no
        # default encoding. In this case, most likely
        # python will use ASCII or ANSI encoders as
        # default encodings but they won't be capable
        # of decoding non-ASCII characters.
        #
        # Also, UTF-8 is being used since it's an ASCII
        # extension.
        return text.decode('utf-8', errors)


def safe_encode(text, incoming=None,
                encoding='utf-8', errors='strict'):
    """Encode a text/bytes string using `encoding`.

    Text (``str``) input is encoded directly with `encoding`. Bytes input
    is first decoded from `incoming` (see :func:`safe_decode`) and then
    re-encoded, unless it is empty or `incoming` and `encoding` compare
    equal case-insensitively, in which case it is returned unchanged.

    If incoming is not specified, bytes input is expected to be encoded
    with ``sys.stdin.encoding`` when that is set, otherwise with the
    current python's default encoding (``sys.getdefaultencoding()``).

    :param text: The ``str`` or ``bytes`` value to encode
    :param incoming: Text's current encoding
    :param encoding: Expected encoding for text (Default UTF-8)
    :param errors: Errors handling policy. See here for valid
        values https://docs.python.org/3/library/codecs.html#error-handlers
    :returns: text or a bytestring `encoding` encoded
              representation of it.
    :raises TypeError: If text is neither a ``str`` nor a ``bytes`` instance

    See also the to_utf8() function, which is simpler and doesn't depend on
    the locale encoding.
    """
    if not isinstance(text, (str, bytes)):
        raise TypeError("%s can't be encoded" % type(text))

    if not incoming:
        incoming = (getattr(sys.stdin, 'encoding', None) or
                    sys.getdefaultencoding())

    # Avoid case issues in comparisons
    if hasattr(incoming, 'lower'):
        incoming = incoming.lower()
    if hasattr(encoding, 'lower'):
        encoding = encoding.lower()

    if isinstance(text, str):
        return text.encode(encoding, errors)
    elif text and encoding != incoming:
        # Decode text before encoding it with `encoding`
        text = safe_decode(text, incoming, errors)
        return text.encode(encoding, errors)
    else:
        # Empty bytes, or bytes already in the requested encoding.
        return text


def to_utf8(text):
    """Encode Unicode to UTF-8, return bytes unchanged.

    Raise TypeError if text is not a bytes string or a Unicode string.

    .. versionadded:: 3.5
    """
    if isinstance(text, bytes):
        return text
    elif isinstance(text, str):
        return text.encode('utf-8')
    else:
        raise TypeError("bytes or Unicode expected, got %s"
                        % type(text).__name__)


def exception_to_unicode(exc):
    """Get the message of an exception as a Unicode string.

    On Python 3, the exception message is always a Unicode string. On
    Python 2, the exception message is a bytes string *most* of the time.

    If the exception message is a bytes string, try to decode it from UTF-8
    (superset of ASCII), from the locale encoding, or fall back to decoding
    it from ISO-8859-1 (which never fails).

    :param exc: The exception whose message is wanted
    :returns: The value of ``exc.__str__()`` as a ``str``

    .. versionadded:: 1.6
    """
    # Don't call directly str(exc), because it fails with
    # UnicodeEncodeError on Python 2 if exc.__str__() returns a Unicode
    # string not encodable to the default encoding (ASCII)
    msg = exc.__str__()

    if isinstance(msg, str):
        # This should be the default path on Python 3 and an *optional* path
        # on Python 2 (if for some reason the exception message was already
        # in unicode instead of the more typical bytes string); so avoid
        # further converting to unicode in both of these cases.
        return msg

    try:
        # Try to decode from UTF-8 (superset of ASCII). The decoder fails
        # if the string is not a valid UTF-8 string: the UTF-8 codec includes
        # a validation algorithm to ensure the consistency of the codec.
        return msg.decode('utf-8')
    except UnicodeDecodeError:  # nosec
        pass

    # Try the locale encoding, most error messages are encoded to this encoding
    # (ex: os.strerror(errno))
    encoding = _getfilesystemencoding()
    try:
        return msg.decode(encoding)
    except UnicodeDecodeError:  # nosec
        pass

    # The encoding is not ASCII, not UTF-8, nor the locale encoding. Fallback
    # to the ISO-8859-1 encoding which never fails. It will produce mojibake
    # if the message is not encoded to ISO-8859-1, but we don't want a super
    # complex heuristic to get the encoding of an exception message.
    return msg.decode('latin1')