# Copyright (c) 2008-2011 testtools developers. See LICENSE for details.

"""Compatibility support for python 2 and 3."""

__metaclass__ = type
__all__ = [
    '_b',
    '_u',
    'advance_iterator',
    'all',
    'BytesIO',
    'classtypes',
    'isbaseexception',
    'istext',
    'str_is_unicode',
    'StringIO',
    'reraise',
    'unicode_output_stream',
    ]

import codecs
import io
import linecache
import locale
import os
import re
import sys
import traceback
import unicodedata

from extras import try_imports

BytesIO = try_imports(['StringIO.StringIO', 'io.BytesIO'])
StringIO = try_imports(['StringIO.StringIO', 'io.StringIO'])

try:
    from testtools import _compat2x as _compat
except (SyntaxError, ImportError):
    from testtools import _compat3x as _compat

reraise = _compat.reraise


__u_doc = """A function version of the 'u' prefix.

This is needed because the u prefix is not usable in Python 3 but is required
in Python 2 to get a unicode object.

To migrate code that was written as u'\u1234' in Python 2 so it runs on both
2 and 3, change it to _u('\u1234'). The Python 3 interpreter decodes the
escape itself and the no-op _u lets the result through; on Python 2 the _u
function decodes the string with unicode-escape.
"""

if sys.version_info > (3, 0):
    import builtins
    def _u(s):
        return s
    _r = ascii
    def _b(s):
        """A byte literal."""
        return s.encode("latin-1")
    advance_iterator = next
    # GZ 2011-08-24: Seems istext() is easy to misuse and makes for bad code.
    def istext(x):
        return isinstance(x, str)
    def classtypes():
        return (type,)
    str_is_unicode = True
else:
    import __builtin__ as builtins
    def _u(s):
        # The double replace mangling going on prepares the string for
        # unicode-escape - \foo is preserved, \u and \U are decoded.
        return (s.replace("\\", "\\\\").replace("\\\\u", "\\u")
            .replace("\\\\U", "\\U").decode("unicode-escape"))
    _r = repr
    def _b(s):
        return s
    advance_iterator = lambda it: it.next()
    def istext(x):
        return isinstance(x, basestring)
    def classtypes():
        import types
        return (type, types.ClassType)
    str_is_unicode = sys.platform == "cli"

_u.__doc__ = __u_doc
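
# Illustrative usage (hypothetical variables, not executed here): _u and _b
# give portable text and byte literals across Python 2 and 3, for example:
#
#     greeting = _u('caf\u00e9')  # unicode object on 2, str on 3
#     payload = _b('\x00\x01')    # str on 2, bytes on 3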


if sys.version_info > (2, 5):
    all = all
    _error_repr = BaseException.__repr__
    def isbaseexception(exception):
        """Return whether exception is a BaseException but not an Exception"""
        return (isinstance(exception, BaseException)
            and not isinstance(exception, Exception))
else:
    def all(iterable):
        """Return True if all items of iterable evaluate as boolean True"""
        for obj in iterable:
            if not obj:
                return False
        return True
    def _error_repr(exception):
        """Format an exception instance as Python 2.5 and later do"""
        return exception.__class__.__name__ + repr(exception.args)
    def isbaseexception(exception):
        """Return whether exception would inherit from BaseException only

        This approximates the hierarchy in Python 2.5 and later, compare the
        difference between the diagrams at the bottom of the pages:
        <http://docs.python.org/release/2.4.4/lib/module-exceptions.html>
        <http://docs.python.org/release/2.5.4/lib/module-exceptions.html>
        """
        return isinstance(exception, (KeyboardInterrupt, SystemExit))
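
# Illustrative behaviour (hypothetical instances, not executed here):
#
#     isbaseexception(KeyboardInterrupt())  # -> True, outside Exception
#     isbaseexception(ValueError("boom"))   # -> False, a regular Exception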


# GZ 2011-08-24: Using isinstance checks like this encourages bad interfaces,
#                there should be better ways to write code needing this.
if not issubclass(getattr(builtins, "bytes", str), str):
    def _isbytes(x):
        return isinstance(x, bytes)
else:
    # Never return True on Pythons that provide the name but not the real type
    def _isbytes(x):
        return False


def _slow_escape(text):
    """Escape unicode ``text`` leaving printable characters unmodified

    The behaviour emulates the Python 3 implementation of repr, see
    unicode_repr in unicodeobject.c and isprintable definition.

    Because this iterates over the input a codepoint at a time, it's slow, and
    does not handle astral characters correctly on Python builds with 16 bit
    rather than 32 bit unicode type.
    """
    output = []
    for c in text:
        o = ord(c)
        if o < 256:
            if o < 32 or 126 < o < 161:
                output.append(c.encode("unicode-escape"))
            elif o == 92:
                # Separate due to bug in unicode-escape codec in Python 2.4
                output.append("\\\\")
            else:
                output.append(c)
        else:
            # To get correct behaviour would need to pair up surrogates here
            if unicodedata.category(c)[0] in "CZ":
                output.append(c.encode("unicode-escape"))
            else:
                output.append(c)
    return "".join(output)


def text_repr(text, multiline=None):
    """Rich repr for ``text`` returning unicode, triple quoted if ``multiline``.
    """
    is_py3k = sys.version_info > (3, 0)
    nl = _isbytes(text) and bytes((0xA,)) or "\n"
    if multiline is None:
        multiline = nl in text
    if not multiline and (is_py3k or not str_is_unicode and type(text) is str):
        # Use normal repr for single line of unicode on Python 3 or bytes
        return repr(text)
    prefix = repr(text[:0])[:-2]
    if multiline:
        # To escape multiline strings, split and process each line in turn,
        # making sure that quotes are not escaped.
        if is_py3k:
            offset = len(prefix) + 1
            lines = []
            for l in text.split(nl):
                r = repr(l)
                q = r[-1]
                lines.append(r[offset:-1].replace("\\" + q, q))
        elif not str_is_unicode and isinstance(text, str):
            lines = [l.encode("string-escape").replace("\\'", "'")
                for l in text.split("\n")]
        else:
            lines = [_slow_escape(l) for l in text.split("\n")]
        # Combine the escaped lines and append two of the closing quotes,
        # then iterate over the result to escape triple quotes correctly.
        _semi_done = "\n".join(lines) + "''"
        p = 0
        while True:
            p = _semi_done.find("'''", p)
            if p == -1:
                break
            _semi_done = "\\".join([_semi_done[:p], _semi_done[p:]])
            p += 2
        return "".join([prefix, "'''\\\n", _semi_done, "'"])
    escaped_text = _slow_escape(text)
    # Determine which quote character to use and if one gets prefixed with a
    # backslash following the same logic Python uses for repr() on strings
    quote = "'"
    if "'" in text:
        if '"' in text:
            escaped_text = escaped_text.replace("'", "\\'")
        else:
            quote = '"'
    return "".join([prefix, quote, escaped_text, quote])


def unicode_output_stream(stream):
    """Get wrapper for given stream that writes any unicode without exception

    Characters that can't be coerced to the encoding of the stream, or 'ascii'
    if a valid encoding is not found, will be replaced. The original stream
    may be returned in situations where a wrapper is determined unneeded.

    The wrapper only allows unicode to be written, not non-ascii bytestrings,
    which is a good thing to ensure sanity and sanitation.
    """
    if (sys.platform == "cli" or
        isinstance(stream, (io.TextIOWrapper, io.StringIO))):
        # Best to never encode before writing in IronPython, or if it is
        # already a TextIO (which in the io library has no encoding
        # attribute).
        return stream
    try:
        writer = codecs.getwriter(stream.encoding or "")
    except (AttributeError, LookupError):
        return codecs.getwriter("ascii")(stream, "replace")
    if writer.__module__.rsplit(".", 1)[1].startswith("utf"):
        # The current stream has a unicode encoding so no error handler is
        # needed
        if sys.version_info > (3, 0):
            return stream
        return writer(stream)
    if sys.version_info > (3, 0):
        # Python 3 doesn't seem to make this easy, handle a common case
        try:
            return stream.__class__(stream.buffer, stream.encoding, "replace",
                stream.newlines, stream.line_buffering)
        except AttributeError:
            pass
    return writer(stream, "replace")
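
# Illustrative usage (hypothetical variable, not executed here): wrap a
# possibly ascii-only stream so non-ascii text is replaced rather than raising
# UnicodeEncodeError:
#
#     out = unicode_output_stream(sys.stdout)
#     out.write(_u("snowman: \u2603\n"))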


# The default source encoding is actually "iso-8859-1" until Python 2.5 but
# using non-ascii causes a deprecation warning in 2.4 and it's cleaner to
# treat all versions the same way
_default_source_encoding = "ascii"

# Pattern specified in <http://www.python.org/dev/peps/pep-0263/>
_cookie_search = re.compile(r"coding[:=]\s*([-\w.]+)").search

def _detect_encoding(lines):
    """Get the encoding of a Python source file from a list of lines as bytes

    This function does less than tokenize.detect_encoding added in Python 3 as
    it does not attempt to raise a SyntaxError when the interpreter would, it
    just wants the encoding of a source file Python has already compiled and
    determined is valid.
    """
    if not lines:
        return _default_source_encoding
    if lines[0].startswith("\xef\xbb\xbf"):
        # Source starting with UTF-8 BOM is either UTF-8 or a SyntaxError
        return "utf-8"
    # Only the first two lines of the source file are examined
    magic = _cookie_search("".join(lines[:2]))
    if magic is None:
        return _default_source_encoding
    encoding = magic.group(1)
    try:
        codecs.lookup(encoding)
    except LookupError:
        # Some codecs raise something other than LookupError if they don't
        # support the given error handler, but not the text ones that could
        # actually be used for Python source code
        return _default_source_encoding
    return encoding
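
# Illustrative behaviour (hypothetical input, not executed here):
#
#     _detect_encoding(["#!/usr/bin/python\n", "# -*- coding: latin-1 -*-\n"])
#     # -> "latin-1"; without a cookie or BOM the result is "ascii"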


class _EncodingTuple(tuple):
    """A tuple type that can have an encoding attribute smuggled on"""


def _get_source_encoding(filename):
    """Detect, cache and return the encoding of Python source at filename"""
    try:
        return linecache.cache[filename].encoding
    except (AttributeError, KeyError):
        encoding = _detect_encoding(linecache.getlines(filename))
        if filename in linecache.cache:
            newtuple = _EncodingTuple(linecache.cache[filename])
            newtuple.encoding = encoding
            linecache.cache[filename] = newtuple
        return encoding


def _get_exception_encoding():
    """Return the encoding we expect messages from the OS to be encoded in"""
    if os.name == "nt":
        # GZ 2010-05-24: Really want the codepage number instead, the error
        #                handling of standard codecs is more deterministic
        return "mbcs"
    # GZ 2010-05-23: We need this call to be after initialisation, but there's
    #                no benefit in asking more than once as it's a global
    #                setting that can change after the message is formatted.
    return locale.getlocale(locale.LC_MESSAGES)[1] or "ascii"


def _exception_to_text(evalue):
    """Try hard to get a sensible text value out of an exception instance"""
    try:
        return unicode(evalue)
    except KeyboardInterrupt:
        raise
    except:
        # Apparently this is what traceback._some_str does. Sigh - RBC 20100623
        pass
    try:
        return str(evalue).decode(_get_exception_encoding(), "replace")
    except KeyboardInterrupt:
        raise
    except:
        # Apparently this is what traceback._some_str does. Sigh - RBC 20100623
        pass
    # Okay, out of ideas, let higher level handle it
    return None
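
# Illustrative behaviour (hypothetical exception, not executed here): on
# Python 2 an exception whose message is a non-ascii byte string still comes
# back as unicode rather than raising UnicodeDecodeError:
#
#     _exception_to_text(OSError(2, "No such file: caf\xc3\xa9"))
#     # -> a unicode string, with undecodable bytes replaced if needed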


def _format_stack_list(stack_lines):
    """Format 'stack_lines' and return a list of unicode strings.

    :param stack_lines: A list of filename, lineno, name, and line variables,
        probably obtained by calling traceback.extract_tb or
        traceback.extract_stack.
    """
    fs_enc = sys.getfilesystemencoding()
    extracted_list = []
    for filename, lineno, name, line in stack_lines:
        extracted_list.append((
            filename.decode(fs_enc, "replace"),
            lineno,
            name.decode("ascii", "replace"),
            line and line.decode(
                _get_source_encoding(filename), "replace")))
    return traceback.format_list(extracted_list)


def _format_exception_only(eclass, evalue):
    """Format the exception part of a traceback.

    :param eclass: The type of the exception being formatted.
    :param evalue: The exception instance.
    :returns: A list of unicode strings.
    """
    list = []
    if evalue is None:
        # Is a (deprecated) string exception
        list.append((eclass + "\n").decode("ascii", "replace"))
        return list
    if isinstance(evalue, SyntaxError):
        # Avoid duplicating the special formatting for SyntaxError here,
        # instead create a new instance with unicode filename and line
        # Potentially gives duff spacing, but that's a pre-existing issue
        try:
            msg, (filename, lineno, offset, line) = evalue
        except (TypeError, ValueError):
            pass # Strange exception instance, fall through to generic code
        else:
            # Errors during parsing give the line from buffer encoded as
            # latin-1 or utf-8 or the encoding of the file depending on the
            # coding and whether the patch for issue #1031213 is applied, so
            # give up on trying to decode it and just read the file again
            if line:
                bytestr = linecache.getline(filename, lineno)
                if bytestr:
                    if lineno == 1 and bytestr.startswith("\xef\xbb\xbf"):
                        bytestr = bytestr[3:]
                    line = bytestr.decode(
                        _get_source_encoding(filename), "replace")
                    del linecache.cache[filename]
                else:
                    line = line.decode("ascii", "replace")
            if filename:
                fs_enc = sys.getfilesystemencoding()
                filename = filename.decode(fs_enc, "replace")
            evalue = eclass(msg, (filename, lineno, offset, line))
            list.extend(traceback.format_exception_only(eclass, evalue))
            return list
    sclass = eclass.__name__
    svalue = _exception_to_text(evalue)
    if svalue:
        list.append("%s: %s\n" % (sclass, svalue))
    elif svalue is None:
        # GZ 2010-05-24: Not a great fallback message, but keep for the moment
        list.append(_u("%s: <unprintable %s object>\n" % (sclass, sclass)))
    else:
        list.append(_u("%s\n" % sclass))
    return list


_TB_HEADER = _u('Traceback (most recent call last):\n')


def _format_exc_info(eclass, evalue, tb, limit=None):
    """Format a stack trace and the exception information as unicode

    Compatibility function for Python 2 which ensures each component of a
    traceback is correctly decoded according to its origins.

    Based on traceback.format_exception and related functions.
    """
    return [_TB_HEADER] \
        + _format_stack_list(traceback.extract_tb(tb, limit)) \
        + _format_exception_only(eclass, evalue)
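
# Illustrative usage (hypothetical call site, not executed here): on Python 2
# this mirrors traceback.format_exception but always yields unicode strings:
#
#     try:
#         raise ValueError(_u("caf\u00e9"))
#     except ValueError:
#         lines = _format_exc_info(*sys.exc_info())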