future/types/newstr.py

"""
This module redefines ``str`` on Python 2.x to be a subclass of the Py2
``unicode`` type that behaves like the Python 3.x ``str``.

The main differences between ``newstr`` and Python 2.x's ``unicode`` type are
the stricter type-checking and absence of a `u''` prefix in the representation.

It is designed to be used together with the ``unicode_literals`` import
as follows:

    >>> from __future__ import unicode_literals
    >>> from builtins import str, isinstance

On Python 3.x and normally on Python 2.x, these expressions hold

    >>> str('blah') is 'blah'
    True
    >>> isinstance('blah', str)
    True

However, on Python 2.x, with this import:

    >>> from __future__ import unicode_literals

the same expressions are False:

    >>> str('blah') is 'blah'
    False
    >>> isinstance('blah', str)
    False

This module is designed to be imported together with ``unicode_literals`` on
Python 2 to bring the meaning of ``str`` back into alignment with unprefixed
string literals (i.e. ``unicode`` subclasses).

Note that ``str()`` (and ``print()``) would then normally call the
``__unicode__`` method on objects in Python 2. To define string
representations of your objects portably across Py3 and Py2, use the
:func:`python_2_unicode_compatible` decorator in  :mod:`future.utils`.

"""

from collections import Iterable
from numbers import Number

from future.utils import PY3, istext, with_metaclass, isnewbytes
from future.types import no, issubset
from future.types.newobject import newobject


if PY3:
    # We'll probably never use newstr on Py3 anyway...
    unicode = str


class BaseNewStr(type):
    def __instancecheck__(cls, instance):
        if cls == newstr:
            return isinstance(instance, unicode)
        else:
            return issubclass(instance.__class__, cls)


class newstr(with_metaclass(BaseNewStr, unicode)):
    """
    A backport of the Python 3 str object to Py2
    """
    no_convert_msg = "Can't convert '{0}' object to str implicitly"

    def __new__(cls, *args, **kwargs):
        """
        From the Py3 str docstring:

          str(object='') -> str
          str(bytes_or_buffer[, encoding[, errors]]) -> str

          Create a new string object from the given object. If encoding or
          errors is specified, then the object must expose a data buffer
          that will be decoded using the given encoding and error handler.
          Otherwise, returns the result of object.__str__() (if defined)
          or repr(object).
          encoding defaults to sys.getdefaultencoding().
          errors defaults to 'strict'.

        """
        if len(args) == 0:
            return super(newstr, cls).__new__(cls)
        # Special case: If someone requests str(str(u'abc')), return the same
        # object (same id) for consistency with Py3.3. This is not true for
        # other objects like list or dict.
        elif type(args[0]) == newstr and cls == newstr:
            return args[0]
        elif isinstance(args[0], unicode):
            value = args[0]
        elif isinstance(args[0], bytes):   # i.e. Py2 bytes or newbytes
            if 'encoding' in kwargs or len(args) > 1:
                value = args[0].decode(*args[1:], **kwargs)
            else:
                value = args[0].__str__()
        else:
            value = args[0]
        return super(newstr, cls).__new__(cls, value)

    def __repr__(self):
        """
        Without the u prefix
        """
        value = super(newstr, self).__repr__()
        # assert value[0] == u'u'
        return value[1:]

    def __getitem__(self, y):
        """
        Warning: Python <= 2.7.6 has a bug that causes this method never to be called
        when y is a slice object. Therefore the type of newstr()[:2] is wrong
        (unicode instead of newstr).
        """
        return newstr(super(newstr, self).__getitem__(y))

    def __contains__(self, key):
        errmsg = "'in <string>' requires string as left operand, not {0}"
        # Don't use isinstance() here because we only want to catch
        # newstr, not Python 2 unicode:
        if type(key) == newstr:
            newkey = key
        elif isinstance(key, unicode) or isinstance(key, bytes) and not isnewbytes(key):
            newkey = newstr(key)
        else:
            raise TypeError(errmsg.format(type(key)))
        return issubset(list(newkey), list(self))

    @no('newbytes')
    def __add__(self, other):
        return newstr(super(newstr, self).__add__(other))

    @no('newbytes')
    def __radd__(self, left):
        " left + self "
        try:
            return newstr(left) + self
        except:
            return NotImplemented

    def __mul__(self, other):
        return newstr(super(newstr, self).__mul__(other))

    def __rmul__(self, other):
        return newstr(super(newstr, self).__rmul__(other))

    def join(self, iterable):
        errmsg = 'sequence item {0}: expected unicode string, found bytes'
        for i, item in enumerate(iterable):
            # Here we use type() rather than isinstance() because
            # __instancecheck__ is being overridden. E.g.
            # isinstance(b'abc', newbytes) is True on Py2.
            if isnewbytes(item):
                raise TypeError(errmsg.format(i))
        # Support use as a staticmethod: str.join('-', ['a', 'b'])
        if type(self) == newstr:
            return newstr(super(newstr, self).join(iterable))
        else:
            return newstr(super(newstr, newstr(self)).join(iterable))

    @no('newbytes')
    def find(self, sub, *args):
        return super(newstr, self).find(sub, *args)

    @no('newbytes')
    def rfind(self, sub, *args):
        return super(newstr, self).rfind(sub, *args)

    @no('newbytes', (1, 2))
    def replace(self, old, new, *args):
        return newstr(super(newstr, self).replace(old, new, *args))

    def decode(self, *args):
        raise AttributeError("decode method has been disabled in newstr")

    def encode(self, encoding='utf-8', errors='strict'):
        """
        Returns bytes

        Encode S using the codec registered for encoding. Default encoding
        is 'utf-8'. errors may be given to set a different error
        handling scheme. Default is 'strict' meaning that encoding errors raise
        a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
        'xmlcharrefreplace' as well as any other name registered with
        codecs.register_error that can handle UnicodeEncodeErrors.
        """
        from future.types.newbytes import newbytes
        # Py2 unicode.encode() takes encoding and errors as optional parameter,
        # not keyword arguments as in Python 3 str.

        # For the surrogateescape error handling mechanism, the
        # codecs.register_error() function seems to be inadequate for an
        # implementation of it when encoding. (Decoding seems fine, however.)
        # For example, in the case of
        #     u'\udcc3'.encode('ascii', 'surrogateescape_handler')
        # after registering the ``surrogateescape_handler`` function in
        # future.utils.surrogateescape, both Python 2.x and 3.x raise an
        # exception anyway after the function is called because the unicode
        # string it has to return isn't encodable strictly as ASCII.

        if errors == 'surrogateescape':
            if encoding == 'utf-16':
                # Known to fail here. See test_encoding_works_normally()
                raise NotImplementedError('FIXME: surrogateescape handling is '
                                          'not yet implemented properly')
            # Encode char by char, building up list of byte-strings
            mybytes = []
            for c in self:
                code = ord(c)
                if 0xD800 <= code <= 0xDCFF:
                    mybytes.append(newbytes([code - 0xDC00]))
                else:
                    mybytes.append(c.encode(encoding=encoding))
            return newbytes(b'').join(mybytes)
        return newbytes(super(newstr, self).encode(encoding, errors))

    @no('newbytes', 1)
    def startswith(self, prefix, *args):
        if isinstance(prefix, Iterable):
            for thing in prefix:
                if isnewbytes(thing):
                    raise TypeError(self.no_convert_msg.format(type(thing)))
        return super(newstr, self).startswith(prefix, *args)

    @no('newbytes', 1)
    def endswith(self, prefix, *args):
        # Note we need the decorator above as well as the isnewbytes()
        # check because prefix can be either a bytes object or e.g. a
        # tuple of possible prefixes. (If it's a bytes object, each item
        # in it is an int.)
        if isinstance(prefix, Iterable):
            for thing in prefix:
                if isnewbytes(thing):
                    raise TypeError(self.no_convert_msg.format(type(thing)))
        return super(newstr, self).endswith(prefix, *args)

    @no('newbytes', 1)
    def split(self, sep=None, maxsplit=-1):
        # Py2 unicode.split() takes maxsplit as an optional parameter,
        # not as a keyword argument as in Python 3 str.
        parts = super(newstr, self).split(sep, maxsplit)
        return [newstr(part) for part in parts]

    @no('newbytes', 1)
    def rsplit(self, sep=None, maxsplit=-1):
        # Py2 unicode.rsplit() takes maxsplit as an optional parameter,
        # not as a keyword argument as in Python 3 str.
        parts = super(newstr, self).rsplit(sep, maxsplit)
        return [newstr(part) for part in parts]

    @no('newbytes', 1)
    def partition(self, sep):
        parts = super(newstr, self).partition(sep)
        return tuple(newstr(part) for part in parts)

    @no('newbytes', 1)
    def rpartition(self, sep):
        parts = super(newstr, self).rpartition(sep)
        return tuple(newstr(part) for part in parts)

    @no('newbytes', 1)
    def index(self, sub, *args):
        """
        Like newstr.find() but raise ValueError when the substring is not
        found.
        """
        pos = self.find(sub, *args)
        if pos == -1:
            raise ValueError('substring not found')
        return pos

    def splitlines(self, keepends=False):
        """
        S.splitlines(keepends=False) -> list of strings

        Return a list of the lines in S, breaking at line boundaries.
        Line breaks are not included in the resulting list unless keepends
        is given and true.
        """
        # Py2 unicode.splitlines() takes keepends as an optional parameter,
        # not as a keyword argument as in Python 3 str.
        parts = super(newstr, self).splitlines(keepends)
        return [newstr(part) for part in parts]

    def __eq__(self, other):
        if (isinstance(other, unicode) or
            isinstance(other, bytes) and not isnewbytes(other)):
            return super(newstr, self).__eq__(other)
        else:
            return False

    def __ne__(self, other):
        if (isinstance(other, unicode) or
            isinstance(other, bytes) and not isnewbytes(other)):
            return super(newstr, self).__ne__(other)
        else:
            return True

    unorderable_err = 'unorderable types: str() and {0}'

    def __lt__(self, other):
        if not istext(other):
            raise TypeError(self.unorderable_err.format(type(other)))
        return super(newstr, self).__lt__(other)

    def __le__(self, other):
        if not istext(other):
            raise TypeError(self.unorderable_err.format(type(other)))
        return super(newstr, self).__le__(other)

    def __gt__(self, other):
        if not istext(other):
            raise TypeError(self.unorderable_err.format(type(other)))
        return super(newstr, self).__gt__(other)

    def __ge__(self, other):
        if not istext(other):
            raise TypeError(self.unorderable_err.format(type(other)))
        return super(newstr, self).__ge__(other)

    def __getattribute__(self, name):
        """
        A trick to cause the ``hasattr`` builtin-fn to return False for
        the 'decode' method on Py2.
        """
        if name in ['decode', u'decode']:
            raise AttributeError("decode method has been disabled in newstr")
        return super(newstr, self).__getattribute__(name)

    def __native__(self):
        """
        A hook for the future.utils.native() function.
        """
        return unicode(self)

    @staticmethod
    def maketrans(x, y=None, z=None):
        """
        Return a translation table usable for str.translate().

        If there is only one argument, it must be a dictionary mapping Unicode
        ordinals (integers) or characters to Unicode ordinals, strings or None.
        Character keys will be then converted to ordinals.
        If there are two arguments, they must be strings of equal length, and
        in the resulting dictionary, each character in x will be mapped to the
        character at the same position in y. If there is a third argument, it
        must be a string, whose characters will be mapped to None in the result.
        """

        if y is None:
            assert z is None
            if not isinstance(x, dict):
                raise TypeError('if you give only one argument to maketrans it must be a dict')
            result = {}
            for (key, value) in x.items():
                if len(key) > 1:
                    raise ValueError('keys in translate table must be strings or integers')
                result[ord(key)] = value
        else:
            if not isinstance(x, unicode) and isinstance(y, unicode):
                raise TypeError('x and y must be unicode strings')
            if not len(x) == len(y):
                raise ValueError('the first two maketrans arguments must have equal length')
            result = {}
            for (xi, yi) in zip(x, y):
                if len(xi) > 1:
                    raise ValueError('keys in translate table must be strings or integers')
                result[ord(xi)] = ord(yi)

        if z is not None:
            for char in z:
                result[ord(char)] = None
        return result

    def translate(self, table):
        """
        S.translate(table) -> str

        Return a copy of the string S, where all characters have been mapped
        through the given translation table, which must be a mapping of
        Unicode ordinals to Unicode ordinals, strings, or None.
        Unmapped characters are left untouched. Characters mapped to None
        are deleted.
        """
        l = []
        for c in self:
            if ord(c) in table:
                val = table[ord(c)]
                if val is None:
                    continue
                elif isinstance(val, unicode):
                    l.append(val)
                else:
                    l.append(chr(val))
            else:
                l.append(c)
        return ''.join(l)

    def isprintable(self):
        raise NotImplementedError('fixme')

    def isidentifier(self):
        raise NotImplementedError('fixme')

    def format_map(self):
        raise NotImplementedError('fixme')


__all__ = ['newstr']