"""
This module redefines ``str`` on Python 2.x to be a subclass of the Py2
``unicode`` type that behaves like the Python 3.x ``str``.

The main differences between ``newstr`` and Python 2.x's ``unicode`` type are
the stricter type-checking and absence of a `u''` prefix in the representation.

It is designed to be used together with the ``unicode_literals`` import
as follows:

    >>> from __future__ import unicode_literals
    >>> from builtins import str, isinstance

On Python 3.x and normally on Python 2.x, these expressions hold

    >>> str('blah') is 'blah'
    True
    >>> isinstance('blah', str)
    True

However, on Python 2.x, with this import:

    >>> from __future__ import unicode_literals

the same expressions are False:

    >>> str('blah') is 'blah'
    False
    >>> isinstance('blah', str)
    False

This module is designed to be imported together with ``unicode_literals`` on
Python 2 to bring the meaning of ``str`` back into alignment with unprefixed
string literals (i.e. ``unicode`` subclasses).

Note that ``str()`` (and ``print()``) would then normally call the
``__unicode__`` method on objects in Python 2. To define string
representations of your objects portably across Py3 and Py2, use the
:func:`python_2_unicode_compatible` decorator in :mod:`future.utils`.

"""

try:
    # The ABC aliases in ``collections`` were removed in Python 3.10; the
    # canonical home on Python 3 is ``collections.abc``.
    from collections.abc import Iterable
except ImportError:
    # Python 2 has no ``collections.abc`` module.
    from collections import Iterable
from numbers import Integral, Number

from future.utils import PY3, istext, with_metaclass, isnewbytes
from future.types import no, issubset
from future.types.newobject import newobject


if PY3:
    # We'll probably never use newstr on Py3 anyway...
    unicode = str
    unichr = chr


class BaseNewStr(type):
    """
    Metaclass for ``newstr`` that customises ``isinstance()`` checks.
    """

    def __instancecheck__(cls, instance):
        # For newstr itself, any ``unicode`` object counts as an instance;
        # for subclasses of newstr the ordinary subclass test applies.
        if cls == newstr:
            return isinstance(instance, unicode)
        else:
            return issubclass(instance.__class__, cls)


class newstr(with_metaclass(BaseNewStr, unicode)):
    """
    A backport of the Python 3 str object to Py2
    """
    no_convert_msg = "Can't convert '{0}' object to str implicitly"

    def __new__(cls, *args, **kwargs):
        """
        From the Py3 str docstring:

          str(object='') -> str
          str(bytes_or_buffer[, encoding[, errors]]) -> str

          Create a new string object from the given object. If encoding or
          errors is specified, then the object must expose a data buffer
          that will be decoded using the given encoding and error handler.
          Otherwise, returns the result of object.__str__() (if defined)
          or repr(object).
          encoding defaults to sys.getdefaultencoding().
          errors defaults to 'strict'.

        """
        if len(args) == 0:
            return super(newstr, cls).__new__(cls)
        # Special case: If someone requests str(str(u'abc')), return the same
        # object (same id) for consistency with Py3.3. This is not true for
        # other objects like list or dict.
        elif type(args[0]) == newstr and cls == newstr:
            return args[0]
        elif isinstance(args[0], unicode):
            value = args[0]
        elif isinstance(args[0], bytes):   # i.e. Py2 bytes or newbytes
            if 'encoding' in kwargs or len(args) > 1:
                # An explicit encoding (and optionally errors) was given:
                # decode the byte-string with it.
                value = args[0].decode(*args[1:], **kwargs)
            else:
                value = args[0].__str__()
        else:
            value = args[0]
        return super(newstr, cls).__new__(cls, value)

    def __repr__(self):
        """
        Return the ``unicode`` repr without the u prefix.
        """
        value = super(newstr, self).__repr__()
        # Only strip a leading ``u``. When the underlying repr has no prefix
        # (``unicode`` is aliased to ``str`` under PY3), dropping the first
        # character would remove the opening quote instead.
        if value[:1] == 'u':
            return value[1:]
        return value

    def __getitem__(self, y):
        """
        Return the item or slice selected by ``y`` as a newstr.

        Warning: Python <= 2.7.6 has a bug that causes this method never to be called
        when y is a slice object. Therefore the type of newstr()[:2] is wrong
        (unicode instead of newstr).
        """
        return newstr(super(newstr, self).__getitem__(y))

    def __contains__(self, key):
        """
        Implement ``key in self``.

        Raises TypeError unless ``key`` is a newstr, a ``unicode`` object or
        a ``bytes`` object that is not a newbytes.
        """
        errmsg = "'in <string>' requires string as left operand, not {0}"
        # Don't use isinstance() here because we only want to catch
        # newstr, not Python 2 unicode:
        if type(key) == newstr:
            newkey = key
        elif isinstance(key, unicode) or isinstance(key, bytes) and not isnewbytes(key):
            newkey = newstr(key)
        else:
            raise TypeError(errmsg.format(type(key)))
        return issubset(list(newkey), list(self))

    @no('newbytes')
    def __add__(self, other):
        """Return ``self + other`` as a newstr."""
        return newstr(super(newstr, self).__add__(other))

    @no('newbytes')
    def __radd__(self, left):
        " left + self "
        try:
            return newstr(left) + self
        except Exception:
            # Conversion or concatenation failed: defer to Python's normal
            # operator fallback. ``Exception`` (rather than a bare except)
            # lets KeyboardInterrupt / SystemExit propagate.
            return NotImplemented

    def __mul__(self, other):
        """Return ``self * other`` as a newstr."""
        return newstr(super(newstr, self).__mul__(other))

    def __rmul__(self, other):
        """Return ``other * self`` as a newstr."""
        return newstr(super(newstr, self).__rmul__(other))

    def join(self, iterable):
        """
        Join the items of ``iterable`` with ``self`` as the separator and
        return the result as a newstr.

        Raises TypeError if any item is a newbytes object.
        """
        errmsg = 'sequence item {0}: expected unicode string, found bytes'
        # Materialise the items once. They are inspected below and then
        # handed to unicode.join(); a one-shot iterator (e.g. a generator)
        # would otherwise be exhausted by the check and join to an empty
        # string.
        items = list(iterable)
        for i, item in enumerate(items):
            # isnewbytes() is used rather than isinstance() because
            # __instancecheck__ is being overridden. E.g.
            # isinstance(b'abc', newbytes) is True on Py2.
            if isnewbytes(item):
                raise TypeError(errmsg.format(i))
        # Support use as a staticmethod: str.join('-', ['a', 'b'])
        if type(self) == newstr:
            return newstr(super(newstr, self).join(items))
        else:
            return newstr(super(newstr, newstr(self)).join(items))

    @no('newbytes')
    def find(self, sub, *args):
        """Delegate to ``unicode.find``."""
        return super(newstr, self).find(sub, *args)

    @no('newbytes')
    def rfind(self, sub, *args):
        """Delegate to ``unicode.rfind``."""
        return super(newstr, self).rfind(sub, *args)

    @no('newbytes', (1, 2))
    def replace(self, old, new, *args):
        """Delegate to ``unicode.replace`` and return a newstr."""
        return newstr(super(newstr, self).replace(old, new, *args))

    def decode(self, *args):
        """Always raise AttributeError: newstr has no usable ``decode``."""
        raise AttributeError("decode method has been disabled in newstr")

    def encode(self, encoding='utf-8', errors='strict'):
        """
        Returns bytes

        Encode S using the codec registered for encoding. Default encoding
        is 'utf-8'. errors may be given to set a different error
        handling scheme. Default is 'strict' meaning that encoding errors raise
        a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
        'xmlcharrefreplace' as well as any other name registered with
        codecs.register_error that can handle UnicodeEncodeErrors.
        """
        from future.types.newbytes import newbytes
        # Py2 unicode.encode() takes encoding and errors as optional parameter,
        # not keyword arguments as in Python 3 str.

        # For the surrogateescape error handling mechanism, the
        # codecs.register_error() function seems to be inadequate for an
        # implementation of it when encoding. (Decoding seems fine, however.)
        # For example, in the case of
        #     u'\udcc3'.encode('ascii', 'surrogateescape_handler')
        # after registering the ``surrogateescape_handler`` function in
        # future.utils.surrogateescape, both Python 2.x and 3.x raise an
        # exception anyway after the function is called because the unicode
        # string it has to return isn't encodable strictly as ASCII.

        if errors == 'surrogateescape':
            if encoding == 'utf-16':
                # Known to fail here. See test_encoding_works_normally()
                raise NotImplementedError('FIXME: surrogateescape handling is '
                                          'not yet implemented properly')
            # Encode char by char, building up list of byte-strings
            mybytes = []
            for c in self:
                code = ord(c)
                # NOTE(review): this range also admits 0xD800-0xDBFF, for
                # which ``code - 0xDC00`` is negative -- confirm whether the
                # lower bound should be narrowed.
                if 0xD800 <= code <= 0xDCFF:
                    mybytes.append(newbytes([code - 0xDC00]))
                else:
                    mybytes.append(c.encode(encoding=encoding))
            return newbytes(b'').join(mybytes)
        return newbytes(super(newstr, self).encode(encoding, errors))

    @no('newbytes', 1)
    def startswith(self, prefix, *args):
        """
        Delegate to ``unicode.startswith`` after rejecting any newbytes
        item found inside an iterable ``prefix`` (raises TypeError).
        """
        if isinstance(prefix, Iterable):
            for thing in prefix:
                if isnewbytes(thing):
                    raise TypeError(self.no_convert_msg.format(type(thing)))
        return super(newstr, self).startswith(prefix, *args)

    @no('newbytes', 1)
    def endswith(self, prefix, *args):
        """
        Delegate to ``unicode.endswith`` after rejecting any newbytes
        item found inside an iterable ``prefix`` (raises TypeError).
        """
        # Note we need the decorator above as well as the isnewbytes()
        # check because prefix can be either a bytes object or e.g. a
        # tuple of possible prefixes. (If it's a bytes object, each item
        # in it is an int.)
        if isinstance(prefix, Iterable):
            for thing in prefix:
                if isnewbytes(thing):
                    raise TypeError(self.no_convert_msg.format(type(thing)))
        return super(newstr, self).endswith(prefix, *args)

    @no('newbytes', 1)
    def split(self, sep=None, maxsplit=-1):
        """Delegate to ``unicode.split``; return a list of newstr."""
        # Py2 unicode.split() takes maxsplit as an optional parameter,
        # not as a keyword argument as in Python 3 str.
        parts = super(newstr, self).split(sep, maxsplit)
        return [newstr(part) for part in parts]

    @no('newbytes', 1)
    def rsplit(self, sep=None, maxsplit=-1):
        """Delegate to ``unicode.rsplit``; return a list of newstr."""
        # Py2 unicode.rsplit() takes maxsplit as an optional parameter,
        # not as a keyword argument as in Python 3 str.
        parts = super(newstr, self).rsplit(sep, maxsplit)
        return [newstr(part) for part in parts]

    @no('newbytes', 1)
    def partition(self, sep):
        """Delegate to ``unicode.partition``; return a tuple of newstr."""
        parts = super(newstr, self).partition(sep)
        return tuple(newstr(part) for part in parts)

    @no('newbytes', 1)
    def rpartition(self, sep):
        """Delegate to ``unicode.rpartition``; return a tuple of newstr."""
        parts = super(newstr, self).rpartition(sep)
        return tuple(newstr(part) for part in parts)

    @no('newbytes', 1)
    def index(self, sub, *args):
        """
        Like newstr.find() but raise ValueError when the substring is not
        found.
        """
        pos = self.find(sub, *args)
        if pos == -1:
            raise ValueError('substring not found')
        return pos

    def splitlines(self, keepends=False):
        """
        S.splitlines(keepends=False) -> list of strings

        Return a list of the lines in S, breaking at line boundaries.
        Line breaks are not included in the resulting list unless keepends
        is given and true.
        """
        # Py2 unicode.splitlines() takes keepends as an optional parameter,
        # not as a keyword argument as in Python 3 str.
        parts = super(newstr, self).splitlines(keepends)
        return [newstr(part) for part in parts]

    def __eq__(self, other):
        """
        Compare equal only against ``unicode`` objects or ``bytes`` objects
        that are not newbytes; anything else is unequal.
        """
        if (isinstance(other, unicode) or
            isinstance(other, bytes) and not isnewbytes(other)):
            return super(newstr, self).__eq__(other)
        else:
            return False

    def __ne__(self, other):
        """
        Inverse of ``__eq__``: objects of any other kind are always unequal.
        """
        if (isinstance(other, unicode) or
            isinstance(other, bytes) and not isnewbytes(other)):
            return super(newstr, self).__ne__(other)
        else:
            return True

    unorderable_err = 'unorderable types: str() and {0}'

    def __lt__(self, other):
        """Return ``self < other``; raise TypeError unless ``istext(other)``."""
        if not istext(other):
            raise TypeError(self.unorderable_err.format(type(other)))
        return super(newstr, self).__lt__(other)

    def __le__(self, other):
        """Return ``self <= other``; raise TypeError unless ``istext(other)``."""
        if not istext(other):
            raise TypeError(self.unorderable_err.format(type(other)))
        return super(newstr, self).__le__(other)

    def __gt__(self, other):
        """Return ``self > other``; raise TypeError unless ``istext(other)``."""
        if not istext(other):
            raise TypeError(self.unorderable_err.format(type(other)))
        return super(newstr, self).__gt__(other)

    def __ge__(self, other):
        """Return ``self >= other``; raise TypeError unless ``istext(other)``."""
        if not istext(other):
            raise TypeError(self.unorderable_err.format(type(other)))
        return super(newstr, self).__ge__(other)

    def __getattribute__(self, name):
        """
        A trick to cause the ``hasattr`` builtin-fn to return False for
        the 'decode' method on Py2.
        """
        if name in ['decode', u'decode']:
            raise AttributeError("decode method has been disabled in newstr")
        return super(newstr, self).__getattribute__(name)

    def __native__(self):
        """
        A hook for the future.utils.native() function.
        """
        return unicode(self)

    @staticmethod
    def maketrans(x, y=None, z=None):
        """
        Return a translation table usable for str.translate().

        If there is only one argument, it must be a dictionary mapping Unicode
        ordinals (integers) or characters to Unicode ordinals, strings or None.
        Character keys will be then converted to ordinals.
        If there are two arguments, they must be strings of equal length, and
        in the resulting dictionary, each character in x will be mapped to the
        character at the same position in y. If there is a third argument, it
        must be a string, whose characters will be mapped to None in the result.
        """

        if y is None:
            assert z is None
            if not isinstance(x, dict):
                raise TypeError('if you give only one argument to maketrans it must be a dict')
            result = {}
            for (key, value) in x.items():
                if isinstance(key, Integral):
                    # Already an ordinal: use it as the table key directly
                    # (calling len() / ord() on an integer would fail).
                    result[key] = value
                    continue
                if len(key) > 1:
                    raise ValueError('keys in translate table must be strings or integers')
                result[ord(key)] = value
        else:
            # Both x and y must be text. The parentheses matter: without
            # them ``not`` binds to the first isinstance() call only.
            if not (isinstance(x, unicode) and isinstance(y, unicode)):
                raise TypeError('x and y must be unicode strings')
            if not len(x) == len(y):
                raise ValueError('the first two maketrans arguments must have equal length')
            result = {}
            for (xi, yi) in zip(x, y):
                if len(xi) > 1:
                    raise ValueError('keys in translate table must be strings or integers')
                result[ord(xi)] = ord(yi)

        if z is not None:
            for char in z:
                result[ord(char)] = None
        return result

    def translate(self, table):
        """
        S.translate(table) -> str

        Return a copy of the string S, where all characters have been mapped
        through the given translation table, which must be a mapping of
        Unicode ordinals to Unicode ordinals, strings, or None.
        Unmapped characters are left untouched. Characters mapped to None
        are deleted.
        """
        l = []
        for c in self:
            if ord(c) in table:
                val = table[ord(c)]
                if val is None:
                    continue
                elif isinstance(val, unicode):
                    l.append(val)
                else:
                    # unichr() rather than chr(): Py2's chr() only accepts
                    # 0..255 and returns a byte-string. (Aliased to chr
                    # under PY3 at the top of this module.)
                    l.append(unichr(val))
            else:
                l.append(c)
        return newstr(u''.join(l))

    def isprintable(self):
        """Not implemented yet."""
        raise NotImplementedError('fixme')

    def isidentifier(self):
        """Not implemented yet."""
        raise NotImplementedError('fixme')

    def format_map(self, mapping=None):
        """
        Not implemented yet.

        ``mapping`` is accepted (and ignored) so that a Py3-style call
        ``s.format_map(mapping)`` reaches the NotImplementedError below
        instead of failing with an argument-count TypeError.
        """
        raise NotImplementedError('fixme')


__all__ = ['newstr']