# -*- coding: utf-8 -*-
# config.py
# Copyright (C) 2008, 2009 Michael Trier (mtrier@gmail.com) and contributors
#
# This module is part of GitPython and is released under
# the BSD License: http://www.opensource.org/licenses/bsd-license.php
"""Utilities to help provide compatibility between Python 2 and Python 3."""
# flake8: noqa

import locale
import os
import sys
import codecs


from gitdb.utils.compat import (
    xrange,
    MAXSIZE,    # @UnusedImport
    izip,       # @UnusedImport
)
from gitdb.utils.encoding import (
    string_types,    # @UnusedImport
    text_type,       # @UnusedImport
    force_bytes,     # @UnusedImport
    force_text       # @UnusedImport
)


PY3 = sys.version_info[0] >= 3
is_win = (os.name == 'nt')
is_posix = (os.name == 'posix')
# os.name is never 'darwin' (macOS reports 'posix'), so the platform has to be
# detected through sys.platform instead.
is_darwin = (sys.platform == 'darwin')
defenc = sys.getdefaultencoding()

if PY3:
    import io
    FileType = io.IOBase

    def byte_ord(b):
        """Return ``b`` unchanged (counterpart of ``ord`` in the Python 2 branch)."""
        return b

    def bchr(n):
        """Return a ``bytes`` object of length one holding the integer ``n``."""
        return bytes([n])

    def mviter(d):
        """Return an iterable over the values of the mapping ``d``."""
        return d.values()

    range = xrange  # @ReservedAssignment
    unicode = str
    binary_type = bytes
else:
    FileType = file  # @UndefinedVariable on PY3
    # Usually, this is just ascii, which might not be enough for our encoding needs.
    # Unless it's set specifically, we override it to be utf-8.
    if defenc == 'ascii':
        defenc = 'utf-8'
    byte_ord = ord
    bchr = chr
    unicode = unicode
    binary_type = str
    range = xrange  # @ReservedAssignment

    def mviter(d):
        """Return an iterator over the values of the mapping ``d``."""
        return d.itervalues()


def safe_decode(s):
    """Safely decode a binary string to unicode.

    :param s: text, bytes or None
    :return: ``s`` itself if it is already text, the bytes decoded with
        ``defenc`` and the 'surrogateescape' error handler, or None if ``s`` is None
    :raise TypeError: if ``s`` is neither text, bytes nor None"""
    if isinstance(s, unicode):
        return s
    elif isinstance(s, bytes):
        return s.decode(defenc, 'surrogateescape')
    elif s is not None:
        raise TypeError('Expected bytes or text, but got %r' % (s,))


def safe_encode(s):
    """Safely encode a unicode string to bytes.

    :param s: text, bytes or None
    :return: the text encoded with ``defenc``, ``s`` itself if it is already
        bytes, or None if ``s`` is None
    :raise TypeError: if ``s`` is neither text, bytes nor None"""
    if isinstance(s, unicode):
        return s.encode(defenc)
    elif isinstance(s, bytes):
        return s
    elif s is not None:
        raise TypeError('Expected bytes or text, but got %r' % (s,))


def win_encode(s):
    """Encode unicodes for process arguments on Windows.

    :param s: text, bytes or None
    :return: the text encoded with ``locale.getpreferredencoding(False)``,
        ``s`` itself if it is already bytes, or None if ``s`` is None
    :raise TypeError: if ``s`` is neither text, bytes nor None"""
    if isinstance(s, unicode):
        return s.encode(locale.getpreferredencoding(False))
    elif isinstance(s, bytes):
        return s
    elif s is not None:
        raise TypeError('Expected bytes or text, but got %r' % (s,))


def with_metaclass(meta, *bases):
    """Return a temporary base class; deriving from it builds the subclass via
    ``meta(name, bases, namespace)``, i.e. with metaclass ``meta`` and the given ``bases``.

    copied from https://github.com/Byron/bcore/blob/master/src/python/butility/future.py#L15"""
    class metaclass(meta):
        __call__ = type.__call__
        __init__ = type.__init__

        def __new__(cls, name, nbases, d):
            # nbases is None only for the helper class created at the bottom
            # of with_metaclass() itself.
            if nbases is None:
                return type.__new__(cls, name, (), d)
            # There may be clients who rely on this attribute to be set to a reasonable value, which is why
            # we set the __metaclass__ attribute explicitly - unless the class body already provides one.
            if not PY3 and '__metaclass__' not in d:
                d['__metaclass__'] = meta
            return meta(name, bases, d)
    return metaclass(meta.__name__ + 'Helper', None, {})


## From https://docs.python.org/3.3/howto/pyporting.html
class UnicodeMixin(object):

    """Mixin class to handle defining the proper __str__/__unicode__
    methods in Python 2 or 3.

    ``__str__`` delegates to ``self.__unicode__()``, which this mixin does not
    define itself."""

    if PY3:
        def __str__(self):
            return self.__unicode__()
    else:  # Python 2
        def __str__(self):
            return self.__unicode__().encode(defenc)


# This is Victor Stinner's pure-Python implementation of PEP 383: the "surrogateescape" error
# handler of Python 3.
# Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc

# This code is released under the Python license and the BSD 2-clause license


FS_ERRORS = 'surrogateescape'

# # -- Python 2/3 compatibility -------------------------------------
# FS_ERRORS = 'my_surrogateescape'

def u(text):
    """Return ``text`` as unicode; on Python 2 it is decoded with 'unicode_escape'."""
    if PY3:
        return text
    else:
        return text.decode('unicode_escape')

def b(data):
    """Return ``data`` as bytes; on Python 3 it is encoded with 'latin1'."""
    if PY3:
        return data.encode('latin1')
    else:
        return data

if PY3:
    _unichr = chr

    def bytes_chr(code):
        """Return a ``bytes`` object of length one holding the integer ``code``."""
        return bytes((code,))
else:
    _unichr = unichr
    bytes_chr = chr

def surrogateescape_handler(exc):
    """
    Pure Python implementation of the PEP 383: the "surrogateescape" error
    handler of Python 3. Undecodable bytes will be replaced by a Unicode
    character U+DCxx on decoding, and these are translated into the
    original bytes on encoding.

    :param exc: the UnicodeDecodeError or UnicodeEncodeError handed over by the codec
    :return: tuple of (replacement text, position at which to resume)
    :raise: ``exc`` itself if it cannot be handled
    """
    mystring = exc.object[exc.start:exc.end]

    try:
        if isinstance(exc, UnicodeDecodeError):
            # mystring is a byte-string in this case
            decoded = replace_surrogate_decode(mystring)
        elif isinstance(exc, UnicodeEncodeError):
            # In the case of u'\udcc3'.encode('ascii',
            # 'this_surrogateescape_handler'), both Python 2.x and 3.x raise an
            # exception anyway after this function is called, even though I think
            # it's doing what it should. It seems that the strict encoder is called
            # to encode the unicode string that this function returns ...
            decoded = replace_surrogate_encode(mystring, exc)
        else:
            raise exc
    except NotASurrogateError:
        raise exc
    return (decoded, exc.end)


class NotASurrogateError(Exception):
    """Raised by the replace_surrogate_* helpers for input they cannot translate."""
    pass


def replace_surrogate_encode(mystring, exc):
    """
    Returns a (unicode) string, not the more logical bytes, because the codecs
    register_error functionality expects this.

    :param mystring: the characters the codec failed to encode
    :param exc: the original exception; it is re-raised if ``mystring`` contains
        a character outside of U+DC00-U+DCFF
    """
    decoded = []
    for ch in mystring:
        code = ord(ch)

        # Only U+DC00-U+DCFF can be mapped back to a value in 0x00-0xFF.
        # Anything else - including the surrogates U+D800-U+DBFF, for which
        # the subtraction below would be negative - fails with the original
        # exception.
        if not 0xDC00 <= code <= 0xDCFF:
            raise exc
        decoded.append(_unichr(code - 0xDC00))
    return str().join(decoded)


def replace_surrogate_decode(mybytes):
    """
    Returns a (unicode) string

    Values in 0x80-0xFF are mapped to U+DC80-U+DCFF, values up to 0x7F are
    kept as they are.

    :param mybytes: the bytes the codec failed to decode
    :raise NotASurrogateError: if an item is greater than 0xFF
    """
    decoded = []
    for ch in mybytes:
        # We may be parsing newbytes (in which case ch is an int) or a native
        # str on Py2
        if isinstance(ch, int):
            code = ch
        else:
            code = ord(ch)
        if 0x80 <= code <= 0xFF:
            decoded.append(_unichr(0xDC00 + code))
        elif code <= 0x7F:
            decoded.append(_unichr(code))
        else:
            raise NotASurrogateError
    return str().join(decoded)


def encodefilename(fn):
    """Encode the unicode filename ``fn`` to bytes using ``FS_ENCODING``,
    turning the characters U+DC80-U+DCFF back into the bytes 0x80-0xFF.

    :raise UnicodeEncodeError: for characters that cannot be encoded"""
    if FS_ENCODING == 'ascii':
        # ASCII encoder of Python 2 expects that the error handler returns a
        # Unicode string encodable to ASCII, whereas our surrogateescape error
        # handler has to return bytes in 0x80-0xFF range.
        encoded = []
        for index, ch in enumerate(fn):
            code = ord(ch)
            if code < 128:
                ch = bytes_chr(code)
            elif 0xDC80 <= code <= 0xDCFF:
                ch = bytes_chr(code - 0xDC00)
            else:
                raise UnicodeEncodeError(FS_ENCODING,
                                         fn, index, index+1,
                                         'ordinal not in range(128)')
            encoded.append(ch)
        return bytes().join(encoded)
    elif FS_ENCODING == 'utf-8':
        # UTF-8 encoder of Python 2 encodes surrogates, so U+DC80-U+DCFF
        # doesn't go through our error handler
        encoded = []
        for index, ch in enumerate(fn):
            code = ord(ch)
            if 0xD800 <= code <= 0xDFFF:
                if 0xDC80 <= code <= 0xDCFF:
                    ch = bytes_chr(code - 0xDC00)
                    encoded.append(ch)
                else:
                    raise UnicodeEncodeError(
                        FS_ENCODING,
                        fn, index, index+1, 'surrogates not allowed')
            else:
                ch_utf8 = ch.encode('utf-8')
                encoded.append(ch_utf8)
        return bytes().join(encoded)
    else:
        return fn.encode(FS_ENCODING, FS_ERRORS)

def decodefilename(fn):
    """Decode the bytes filename ``fn`` using ``FS_ENCODING`` and ``FS_ERRORS``."""
    return fn.decode(FS_ENCODING, FS_ERRORS)

FS_ENCODING = 'ascii'
# Sample values matching the encoding above. Nothing in this module reads
# them; they are kept so that the module namespace stays unchanged.
fn = b('[abc\xff]')
encoded = u('[abc\udcff]')
# FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
# FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')


# normalize the filesystem encoding name.
# For example, we expect "utf-8", not "UTF8".
FS_ENCODING = codecs.lookup(FS_ENCODING).name


def register_surrogateescape():
    """
    Registers the surrogateescape error handler on Python 2 (only)

    Does nothing if a handler named ``FS_ERRORS`` is already registered.
    """
    if PY3:
        return
    try:
        codecs.lookup_error(FS_ERRORS)
    except LookupError:
        codecs.register_error(FS_ERRORS, surrogateescape_handler)


# Probe at import time whether decoding with "surrogateescape" works; if it
# fails for any reason, try to register the pure-Python handler defined above.
try:
    b"100644 \x9f\0aaa".decode(defenc, "surrogateescape")
except Exception:
    register_surrogateescape()