1# -*- coding: utf-8 -*-
2# Natural Language Toolkit: Compatibility
3#
4# Copyright (C) 2001-2019 NLTK Project
5#
6# URL: <http://nltk.org/>
7# For license information, see LICENSE.TXT
8
9from __future__ import absolute_import, print_function
10import os
11import sys
12from functools import update_wrapper, wraps
13import fractions
14import unicodedata
15
16from six import string_types, text_type
17
18# Python 2/3 compatibility layer. Based on six.
19
# True when the running interpreter's major version is 3; the ``if PY3:``
# statement that follows uses it to choose which set of shims to define.
PY3 = sys.version_info[0] == 3
21
22if PY3:
23
24    def get_im_class(meth):
25        return meth.__self__.__class__
26
    import io

    # Expose the in-memory text and binary stream classes under the names
    # the rest of this module's users import from here.
    StringIO = io.StringIO
    BytesIO = io.BytesIO

    from datetime import timezone

    # tzinfo object representing UTC.
    UTC = timezone.utc

    # Re-exported unchanged; the Python 2 branch defines its own class
    # under the same name.
    from tempfile import TemporaryDirectory
37
38else:
39
40    def get_im_class(meth):
41        return meth.im_class
42
    # Use cStringIO when it can be imported, otherwise fall back to the
    # StringIO module.
    # NOTE(review): this branch later runs an unconditional
    # ``import cStringIO``, so the fallback may never be exercised -- confirm.
    try:
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO
    # On this branch the same class is exposed under both names.
    BytesIO = StringIO
48
    from datetime import tzinfo, timedelta

    # Shared timedelta constants; ZERO is what the tzinfo methods below return.
    ZERO = timedelta(0)
    # NOTE(review): HOUR is not referenced in the visible part of this file --
    # presumably kept for external users; confirm before removing.
    HOUR = timedelta(hours=1)

    # A UTC class for python 2.7
    class UTC(tzinfo):
        """tzinfo implementation with a zero UTC offset and no DST."""

        def utcoffset(self, dt):
            # Offset from UTC is always zero, whatever ``dt`` is.
            return ZERO

        def tzname(self, dt):
            # Fixed zone name.
            return "UTC"

        def dst(self, dt):
            # No daylight-saving adjustment.
            return ZERO

    # Rebind the name to an instance: from here on ``UTC`` is a tzinfo
    # object, not the class defined above.
    UTC = UTC()
68
69    import csv
70    import codecs
71    import cStringIO
72
73    class UnicodeWriter:
74        """
75        A CSV writer which will write rows to CSV file "f",
76        which is encoded in the given encoding.
77        see https://docs.python.org/2/library/csv.html
78        """
79
80        def __init__(
81            self, f, dialect=csv.excel, encoding="utf-8", errors='replace', **kwds
82        ):
83            # Redirect output to a queue
84            self.queue = cStringIO.StringIO()
85            self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
86            self.stream = f
87            encoder_cls = codecs.getincrementalencoder(encoding)
88            self.encoder = encoder_cls(errors=errors)
89
90        def encode(self, data):
91            if isinstance(data, string_types):
92                return data.encode("utf-8")
93            else:
94                return data
95
96        def writerow(self, row):
97            self.writer.writerow([self.encode(s) for s in row])
98            # Fetch UTF-8 output from the queue ...
99            data = self.queue.getvalue()
100            data = data.decode("utf-8")
101            # ... and reencode it into the target encoding
102            data = self.encoder.encode(data, 'replace')
103            # write to the target stream
104            self.stream.write(data)
105            # empty queue
106            self.queue.truncate(0)
107
108    import warnings as _warnings
109    import os as _os
110    from tempfile import mkdtemp
111
    class TemporaryDirectory(object):
        """Create and return a temporary directory.  This has the same
        behavior as mkdtemp but can be used as a context manager.  For
        example:

            with TemporaryDirectory() as tmpdir:
                ...

        Upon exiting the context, the directory and everything contained
        in it are removed.

        http://stackoverflow.com/questions/19296146/tempfile-temporarydirectory-context-manager-in-python-2-7
        """

        def __init__(self, suffix="", prefix="tmp", dir=None):
            """Create the directory; the three arguments are passed to mkdtemp."""
            self._closed = False
            self.name = None  # Handle mkdtemp raising an exception
            self.name = mkdtemp(suffix, prefix, dir)

        def __repr__(self):
            return "<{} {!r}>".format(self.__class__.__name__, self.name)

        def __enter__(self):
            # The context manager yields the directory path, not the object.
            return self.name

        def cleanup(self, _warn=False):
            """
            Remove the directory tree once; later calls are no-ops.

            :param _warn: when true, emit a ``Warning`` after a successful
                removal (used by ``__del__`` to flag implicit cleanup).
            """
            # Nothing to do if mkdtemp never succeeded or cleanup already ran.
            if self.name and not self._closed:
                try:
                    self._rmtree(self.name)
                except (TypeError, AttributeError) as ex:
                    # Issue #10188: Emit a warning on stderr
                    # if the directory could not be cleaned
                    # up due to missing globals
                    if "None" not in str(ex):
                        # Not the "global became None" case: propagate.
                        raise
                    print(
                        "ERROR: {!r} while cleaning up {!r}".format(ex, self),
                        file=sys.stderr,
                    )
                    # _closed stays False on this path.
                    return
                self._closed = True
                if _warn:
                    self._warn("Implicitly cleaning up {!r}".format(self), Warning)

        def __exit__(self, exc, value, tb):
            # Returns None, so an exception raised inside the ``with`` body
            # is not suppressed.
            self.cleanup()

        def __del__(self):
            # Issue a Warning if implicit cleanup needed
            self.cleanup(_warn=True)

        # XXX (ncoghlan): The following code attempts to make
        # this class tolerant of the module nulling out process
        # that happens during CPython interpreter shutdown
        # Alas, it doesn't actually manage it. See issue #10188
        _listdir = staticmethod(_os.listdir)
        _path_join = staticmethod(_os.path.join)
        _isdir = staticmethod(_os.path.isdir)
        _islink = staticmethod(_os.path.islink)
        _remove = staticmethod(_os.remove)
        _rmdir = staticmethod(_os.rmdir)
        # NOTE(review): unlike the helpers above this one is not wrapped in
        # staticmethod; it is called as self._warn(message, category), which
        # presumably relies on warnings.warn not binding as a method -- confirm.
        _warn = _warnings.warn

        def _rmtree(self, path):
            """
            Recursively delete ``path``, ignoring ``OSError`` from individual
            remove/rmdir calls.
            """
            # Essentially a stripped down version of shutil.rmtree.  We can't
            # use globals because they may be None'ed out at shutdown.
            for name in self._listdir(path):
                fullname = self._path_join(path, name)
                try:
                    # Symlinks to directories are removed as files rather
                    # than followed.
                    isdir = self._isdir(fullname) and not self._islink(fullname)
                except OSError:
                    isdir = False
                if isdir:
                    self._rmtree(fullname)
                else:
                    try:
                        self._remove(fullname)
                    except OSError:
                        pass
            try:
                self._rmdir(path)
            except OSError:
                pass
195
196
197# ======= Compatibility for datasets that care about Python versions ========
198
199# The following datasets have a /PY3 subdirectory containing
200# a full copy of the data which has been re-encoded or repickled.
# Each entry is a tuple of path components identifying one dataset.
DATA_UPDATES = [
    ("chunkers", "maxent_ne_chunker"),
    ("help", "tagsets"),
    ("taggers", "maxent_treebank_pos_tagger"),
    ("tokenizers", "punkt"),
]

# The same entries joined with os.path.join; add_py3_data() searches data
# paths for these substrings.
_PY3_DATA_UPDATES = [os.path.join(*path_list) for path_list in DATA_UPDATES]
209
210
def add_py3_data(path):
    """
    Return ``path`` redirected to the ``/PY3`` copy of a dataset.

    Under Python 3, if ``path`` contains one of the ``_PY3_DATA_UPDATES``
    entries and does not already contain ``"/PY3"``, the string ``"/PY3"``
    is inserted right after the first matching entry (or after a ``".zip"``
    that immediately follows it).  Otherwise ``path`` is returned unchanged.

    :param path: the data path to adjust.
    :return: the adjusted path, or ``path`` itself when nothing applies.
    """
    if not PY3:
        return path

    for dataset in _PY3_DATA_UPDATES:
        if dataset not in str(path) or "/PY3" in str(path):
            continue

        # Insertion point: just past the dataset name ...
        insert_at = path.index(dataset) + len(dataset)
        # ... and past a ".zip" suffix when the dataset is a zip archive.
        if path[insert_at : insert_at + 4] == ".zip":
            insert_at += 4
        # Only the first matching dataset is applied.
        return path[:insert_at] + "/PY3" + path[insert_at:]

    return path
221
222
223# for use in adding /PY3 to the second (filename) argument
224# of the file pointers in data.py
def py3_data(init_func):
    """
    Decorate ``init_func`` so its second positional argument is passed
    through ``add_py3_data`` before the call.

    :param init_func: callable taking at least two positional arguments.
    :return: a wrapper carrying ``init_func``'s metadata (via ``wraps``).
    """

    @wraps(init_func)
    def _decorator(*args, **kwargs):
        # Rebuild the positional arguments with only the second one rewritten.
        patched_args = (args[0], add_py3_data(args[1])) + args[2:]
        return init_func(*patched_args, **kwargs)

    return _decorator
231
232
233# ======= Compatibility layer for __str__ and __repr__ ==========
def remove_accents(text):
    """
    Return ``text`` with nonspacing combining marks removed.

    The text is decomposed with NFKD normalization and every character
    whose Unicode category is ``Mn`` is dropped.

    :param text: text to process; a ``bytes`` value is decoded as ASCII first.
    :return: the NFKD-normalized text without ``Mn`` characters.
    """
    if isinstance(text, bytes):
        text = text.decode('ascii')

    # Looked up once into a local because it is called for every character.
    category_of = unicodedata.category
    decomposed = unicodedata.normalize('NFKD', text)
    kept = [char for char in decomposed if category_of(char) != 'Mn']
    return ''.join(kept)
243
244
# Select the best transliteration method:
# ``transliterate`` ends up bound to the first of these that is available:
# Unidecode, text-unidecode, or the local remove_accents() fallback.
try:
    # Older versions of Unidecode are licensed under Artistic License;
    # assume an older version is installed.
    from unidecode import unidecode as transliterate
except ImportError:
    try:
        # text-unidecode implementation is worse than Unidecode
        # implementation so Unidecode is preferred.
        from text_unidecode import unidecode as transliterate
    except ImportError:
        # This transliteration method should be enough
        # for many Western languages.
        transliterate = remove_accents
259
260
def python_2_unicode_compatible(klass):
    """
    Class decorator that defines ``__unicode__`` and fixes ``__repr__``
    and ``__str__`` under Python 2.

    To support Python 2 and 3 with a single code base, define
    ``__str__`` and ``__repr__`` methods returning unicode text and
    apply this decorator to the class.

    The original ``__repr__`` and ``__str__`` remain available as
    ``unicode_repr`` and ``__unicode__`` (under both Python 2 and
    Python 3).  Under Python 3 only these aliases are added; the
    methods themselves are left as they are.

    :param klass: the new-style class to patch in place.
    :return: ``klass`` itself.
    :raises ValueError: if ``klass`` is not a subclass of ``object``.
    """

    if not issubclass(klass, object):
        raise ValueError("This decorator doesn't work for old-style classes")

    # Both __unicode__ and unicode_repr are public because they
    # may be useful in the console under Python 2.x.

    # If __str__ or __repr__ are not overridden in a subclass, they may
    # already have been fixed by this decorator in a parent class, and
    # we shouldn't fix them again.

    str_already_fixed = _was_fixed(klass.__str__)
    if not str_already_fixed:
        klass.__unicode__ = klass.__str__
        if not PY3:
            # Transliterate first, then force the result down to ASCII bytes.
            transliterating_str = _transliterated(klass.__unicode__)
            klass.__str__ = _7bit(transliterating_str)

    repr_already_fixed = _was_fixed(klass.__repr__)
    if not repr_already_fixed:
        klass.unicode_repr = klass.__repr__
        if not PY3:
            klass.__repr__ = _7bit(klass.unicode_repr)

    return klass
296
297
def unicode_repr(obj):
    """
    Return a ``repr`` of ``obj`` that reads the same under Python 2.x
    and Python 3.x.

    Under Python 3 this is simply ``repr(obj)``.  Under Python 2:

    - for objects of classes that were fixed with
      ``@python_2_unicode_compatible`` (anything with a ``unicode_repr``
      attribute) the result is ``obj.unicode_repr()``;
    - for unicode strings the result is ``repr(obj)`` without the
      leading "u" letter;
    - for everything else it is the same as ``repr``.
    """
    if not PY3:
        # Python 2.x only
        if hasattr(obj, 'unicode_repr'):
            return obj.unicode_repr()

        if isinstance(obj, text_type):
            return repr(obj)[1:]  # strip "u" letter from output

    return repr(obj)
317
318
def _transliterated(method):
    """
    Wrap ``method`` so that its result is passed through ``transliterate``.

    The wrapper takes ``__name__`` and ``__doc__`` from ``method``, keeps
    any ``_nltk_compat_7bit`` marker ``method`` carries, and is itself
    marked with ``_nltk_compat_transliterated = True``.

    :param method: a callable taking only ``self``.
    :return: the wrapping function.
    """

    @wraps(method, assigned=["__name__", "__doc__"])
    def wrapper(self):
        return transliterate(method(self))

    # Carry over the 7-bit marker when the wrapped method has one.
    try:
        wrapper._nltk_compat_7bit = method._nltk_compat_7bit
    except AttributeError:
        pass

    wrapper._nltk_compat_transliterated = True
    return wrapper
329
330
def _7bit(method):
    """
    Wrap ``method`` so that its result is encoded to ASCII, with
    non-ASCII characters replaced by backslash escapes.

    The wrapper takes ``__name__`` and ``__doc__`` from ``method``, keeps
    any ``_nltk_compat_transliterated`` marker ``method`` carries, and is
    itself marked with ``_nltk_compat_7bit = True``.

    :param method: a callable taking only ``self`` and returning text.
    :return: the wrapping function.
    """

    @wraps(method, assigned=["__name__", "__doc__"])
    def wrapper(self):
        text = method(self)
        return text.encode('ascii', 'backslashreplace')

    # Carry over the transliteration marker when the wrapped method has one.
    try:
        wrapper._nltk_compat_transliterated = method._nltk_compat_transliterated
    except AttributeError:
        pass

    wrapper._nltk_compat_7bit = True
    return wrapper
342
343
def _was_fixed(method):
    """
    Tell whether ``method`` carries one of the wrapper markers set by
    ``_7bit`` or ``_transliterated``.

    :return: the value of the ``_nltk_compat_7bit`` attribute if it is
        truthy, otherwise the value of ``_nltk_compat_transliterated``
        (``False`` when that attribute is absent).
    """
    seven_bit_marker = getattr(method, "_nltk_compat_7bit", False)
    if seven_bit_marker:
        return seven_bit_marker
    return getattr(method, "_nltk_compat_transliterated", False)
348
349
class Fraction(fractions.Fraction):
    """
    This is a simplified backwards compatible version of fractions.Fraction
    from Python >=3.5. It adds the `_normalize` parameter so that the
    numerator and denominator can be kept exactly as given instead of being
    reduced by their Greatest Common Divisor (gcd), e.g. when the numerator
    is 0.

    This is most probably only used by nltk.translate.bleu_score.py, where
    the numerator and denominator of the different ngram precisions are
    mutable. But the idea of a "mutable" fraction might not be applicable
    to other usages.
    See http://stackoverflow.com/questions/34561265

    This object should be deprecated once NLTK stops supporting Python < 3.5
    See https://github.com/nltk/nltk/issues/1330
    """

    def __new__(cls, numerator=0, denominator=None, _normalize=True):
        """
        Build the fraction, optionally keeping the given terms unreduced.

        :param numerator: anything ``fractions.Fraction`` accepts.
        :param denominator: anything ``fractions.Fraction`` accepts.
        :param _normalize: when false, and ``numerator`` is exactly an
            ``int`` and ``denominator`` is a non-zero integer, the two
            values are stored as given instead of in lowest terms.
        :return: the new instance.
        """
        # Function-scope import: keeps this block self-contained.
        import numbers

        # Let the base class validate the arguments and build the
        # normalized value first.
        self = super(Fraction, cls).__new__(cls, numerator, denominator)
        # To emulate fraction.Fraction.from_float across Python >=2.7,
        # check that numerator is an integer and denominator is not None.
        # The denominator must also be integral: a rational but non-integer
        # denominator (e.g. another Fraction) stored verbatim would leave
        # the instance with a non-integer ``denominator``; for such input
        # the normalized value computed by the base class is kept.
        keep_raw_terms = (
            not _normalize
            and type(numerator) is int  # exact type on purpose: excludes bool
            and denominator
            and isinstance(denominator, numbers.Integral)
        )
        if keep_raw_terms:
            self._numerator = numerator
            self._denominator = denominator
        return self
374