# -*- coding: utf-8 -*-
# Natural Language Toolkit: Compatibility
#
# Copyright (C) 2001-2019 NLTK Project
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

from __future__ import absolute_import, print_function
import os
import sys
from functools import update_wrapper, wraps
import fractions
import unicodedata

try:
    from six import string_types, text_type
except ImportError:
    # six is not installed: fall back to the definitions six itself uses
    # for these two names so that this module stays importable.
    if sys.version_info[0] >= 3:
        string_types = (str,)
        text_type = str
    else:
        string_types = (basestring,)  # noqa: F821 (Python 2 builtin)
        text_type = unicode  # noqa: F821 (Python 2 builtin)

# Python 2/3 compatibility layer. Based on six.

PY3 = sys.version_info[0] == 3

if PY3:

    def get_im_class(meth):
        """Return the class of the object that bound method ``meth`` is bound to."""
        return meth.__self__.__class__

    import io

    StringIO = io.StringIO
    BytesIO = io.BytesIO

    from datetime import timezone

    UTC = timezone.utc

    from tempfile import TemporaryDirectory

else:

    def get_im_class(meth):
        """Return the ``im_class`` attribute of the Python 2 method ``meth``."""
        return meth.im_class

    try:
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO
    BytesIO = StringIO

    from datetime import tzinfo, timedelta

    ZERO = timedelta(0)
    HOUR = timedelta(hours=1)

    # A UTC class for python 2.7
    class UTC(tzinfo):
        """UTC"""

        def utcoffset(self, dt):
            return ZERO

        def tzname(self, dt):
            return "UTC"

        def dst(self, dt):
            return ZERO

    # Replace the class by its single instance, mirroring ``timezone.utc``
    # in the Python 3 branch.
    UTC = UTC()

    import csv
    import codecs
    import cStringIO

    class UnicodeWriter:
        """
        A CSV writer which will write rows to CSV file "f",
        which is encoded in the given encoding.
        see https://docs.python.org/2/library/csv.html
        """

        def __init__(
            self, f, dialect=csv.excel, encoding="utf-8", errors='replace', **kwds
        ):
            # Redirect output to a queue
            self.queue = cStringIO.StringIO()
            self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
            self.stream = f
            encoder_cls = codecs.getincrementalencoder(encoding)
            self.encoder = encoder_cls(errors=errors)

        def encode(self, data):
            """Return ``data`` encoded as UTF-8 if it is a string, else unchanged."""
            if isinstance(data, string_types):
                return data.encode("utf-8")
            else:
                return data

        def writerow(self, row):
            """Write one row to the target stream in the target encoding."""
            self.writer.writerow([self.encode(s) for s in row])
            # Fetch UTF-8 output from the queue ...
            data = self.queue.getvalue()
            data = data.decode("utf-8")
            # ... and reencode it into the target encoding
            # NOTE(review): the second positional parameter of
            # IncrementalEncoder.encode is ``final``, not ``errors``; the
            # error handler was already set in __init__.  The truthy string
            # therefore marks every call as final -- confirm this is intended.
            data = self.encoder.encode(data, 'replace')
            # write to the target stream
            self.stream.write(data)
            # empty queue
            self.queue.truncate(0)

    import warnings as _warnings
    import os as _os
    from tempfile import mkdtemp

    class TemporaryDirectory(object):
        """Create and return a temporary directory.  This has the same
        behavior as mkdtemp but can be used as a context manager.  For
        example:

            with TemporaryDirectory() as tmpdir:
                ...

        Upon exiting the context, the directory and everything contained
        in it are removed.

        http://stackoverflow.com/questions/19296146/tempfile-temporarydirectory-context-manager-in-python-2-7
        """

        def __init__(self, suffix="", prefix="tmp", dir=None):
            self._closed = False
            self.name = None  # Handle mkdtemp raising an exception
            self.name = mkdtemp(suffix, prefix, dir)

        def __repr__(self):
            return "<{} {!r}>".format(self.__class__.__name__, self.name)

        def __enter__(self):
            return self.name

        def cleanup(self, _warn=False):
            """Remove the directory tree once; later calls are no-ops."""
            if self.name and not self._closed:
                try:
                    self._rmtree(self.name)
                except (TypeError, AttributeError) as ex:
                    # Issue #10188: Emit a warning on stderr
                    # if the directory could not be cleaned
                    # up due to missing globals
                    if "None" not in str(ex):
                        raise
                    print(
                        "ERROR: {!r} while cleaning up {!r}".format(ex, self),
                        file=sys.stderr,
                    )
                    return
                self._closed = True
                if _warn:
                    self._warn("Implicitly cleaning up {!r}".format(self), Warning)

        def __exit__(self, exc, value, tb):
            self.cleanup()

        def __del__(self):
            # Issue a Warning if implicit cleanup needed
            self.cleanup(_warn=True)

        # XXX (ncoghlan): The following code attempts to make
        # this class tolerant of the module nulling out process
        # that happens during CPython interpreter shutdown
        # Alas, it doesn't actually manage it. See issue #10188
        _listdir = staticmethod(_os.listdir)
        _path_join = staticmethod(_os.path.join)
        _isdir = staticmethod(_os.path.isdir)
        _islink = staticmethod(_os.path.islink)
        _remove = staticmethod(_os.remove)
        _rmdir = staticmethod(_os.rmdir)
        _warn = _warnings.warn

        def _rmtree(self, path):
            # Essentially a stripped down version of shutil.rmtree.  We can't
            # use globals because they may be None'ed out at shutdown.
            for name in self._listdir(path):
                fullname = self._path_join(path, name)
                try:
                    isdir = self._isdir(fullname) and not self._islink(fullname)
                except OSError:
                    isdir = False
                if isdir:
                    self._rmtree(fullname)
                else:
                    try:
                        self._remove(fullname)
                    except OSError:
                        pass
            try:
                self._rmdir(path)
            except OSError:
                pass


# ======= Compatibility for datasets that care about Python versions ========

# The following datasets have a /PY3 subdirectory containing
# a full copy of the data which has been re-encoded or repickled.
DATA_UPDATES = [
    ("chunkers", "maxent_ne_chunker"),
    ("help", "tagsets"),
    ("taggers", "maxent_treebank_pos_tagger"),
    ("tokenizers", "punkt"),
]

_PY3_DATA_UPDATES = [os.path.join(*path_list) for path_list in DATA_UPDATES]


def add_py3_data(path):
    """
    Return ``path`` redirected to the ``/PY3`` copy of a dataset if needed.

    Under Python 3, if ``path`` contains one of the datasets listed in
    ``DATA_UPDATES`` and does not already contain ``"/PY3"``, the string
    ``"/PY3"`` is inserted right after the dataset name (or after the
    ``".zip"`` suffix that directly follows it).  In every other case,
    including under Python 2, ``path`` is returned unchanged.
    """
    if PY3:
        for item in _PY3_DATA_UPDATES:
            if item in str(path) and "/PY3" not in str(path):
                pos = path.index(item) + len(item)
                if path[pos : pos + 4] == ".zip":
                    pos += 4
                path = path[:pos] + "/PY3" + path[pos:]
                # At most one dataset can match; stop after the rewrite.
                break
    return path


# for use in adding /PY3 to the second (filename) argument
# of the file pointers in data.py
def py3_data(init_func):
    """
    Decorate ``init_func`` so its second positional argument goes
    through ``add_py3_data`` before the call.

    The wrapped callable must be invoked with at least two positional
    arguments; the path cannot be passed by keyword.
    """

    def _decorator(*args, **kwargs):
        args = (args[0], add_py3_data(args[1])) + args[2:]
        return init_func(*args, **kwargs)

    return wraps(init_func)(_decorator)


# ======= Compatibility layer for __str__ and __repr__ ==========
def remove_accents(text):
    """
    Return ``text`` with combining marks removed.

    ``bytes`` input is first decoded as ASCII.  The text is then
    NFKD-normalized and every character whose Unicode category is
    ``Mn`` (nonspacing mark) is dropped.
    """

    if isinstance(text, bytes):
        text = text.decode('ascii')

    category = unicodedata.category  # this gives a small (~10%) speedup
    return ''.join(
        c for c in unicodedata.normalize('NFKD', text) if category(c) != 'Mn'
    )


# Select the best transliteration method:
try:
    # Older versions of Unidecode are licensed under Artistic License;
    # assume an older version is installed.
    from unidecode import unidecode as transliterate
except ImportError:
    try:
        # text-unidecode implementation is worse than Unidecode
        # implementation so Unidecode is preferred.
        from text_unidecode import unidecode as transliterate
    except ImportError:
        # This transliteration method should be enough
        # for many Western languages.
        transliterate = remove_accents


def python_2_unicode_compatible(klass):
    """
    This decorator defines __unicode__ method and fixes
    __repr__ and __str__ methods under Python 2.

    To support Python 2 and 3 with a single code base,
    define __str__ and __repr__ methods returning unicode
    text and apply this decorator to the class.

    Original __repr__ and __str__ would be available
    as unicode_repr and __unicode__ (under both Python 2
    and Python 3).
    """

    if not issubclass(klass, object):
        raise ValueError("This decorator doesn't work for old-style classes")

    # both __unicode__ and unicode_repr are public because they
    # may be useful in console under Python 2.x

    # if __str__ or __repr__ are not overridden in a subclass,
    # they may be already fixed by this decorator in a parent class
    # and we shouldn't fix them again

    if not _was_fixed(klass.__str__):
        klass.__unicode__ = klass.__str__
        if not PY3:
            klass.__str__ = _7bit(_transliterated(klass.__unicode__))

    if not _was_fixed(klass.__repr__):
        klass.unicode_repr = klass.__repr__
        if not PY3:
            klass.__repr__ = _7bit(klass.unicode_repr)

    return klass


def unicode_repr(obj):
    """
    For classes that were fixed with @python_2_unicode_compatible
    ``unicode_repr`` returns ``obj.unicode_repr()``; for unicode strings
    the result is returned without "u" letter (to make output the
    same under Python 2.x and Python 3.x); for other variables
    it is the same as ``repr``.
    """
    if PY3:
        return repr(obj)

    # Python 2.x
    if hasattr(obj, 'unicode_repr'):
        return obj.unicode_repr()

    if isinstance(obj, text_type):
        return repr(obj)[1:]  # strip "u" letter from output

    return repr(obj)


def _transliterated(method):
    """
    Wrap ``method`` so its result is passed through ``transliterate``.

    The wrapper is tagged with ``_nltk_compat_transliterated`` and
    inherits an existing ``_nltk_compat_7bit`` tag, which is how
    ``_was_fixed`` recognizes already-wrapped methods.
    """

    def wrapper(self):
        return transliterate(method(self))

    update_wrapper(wrapper, method, ["__name__", "__doc__"])
    if hasattr(method, "_nltk_compat_7bit"):
        wrapper._nltk_compat_7bit = method._nltk_compat_7bit

    wrapper._nltk_compat_transliterated = True
    return wrapper


def _7bit(method):
    """
    Wrap ``method`` so its result is encoded to ASCII, with characters
    that cannot be encoded replaced by backslash escapes.

    The wrapper is tagged with ``_nltk_compat_7bit`` and inherits an
    existing ``_nltk_compat_transliterated`` tag.
    """

    def wrapper(self):
        return method(self).encode('ascii', 'backslashreplace')

    update_wrapper(wrapper, method, ["__name__", "__doc__"])

    if hasattr(method, "_nltk_compat_transliterated"):
        wrapper._nltk_compat_transliterated = method._nltk_compat_transliterated

    wrapper._nltk_compat_7bit = True
    return wrapper


def _was_fixed(method):
    """Return a truthy value if ``method`` carries either wrapper tag."""
    return getattr(method, "_nltk_compat_7bit", False) or getattr(
        method, "_nltk_compat_transliterated", False
    )


class Fraction(fractions.Fraction):
    """
    This is a simplified backwards compatible version of fractions.Fraction
    from Python >=3.5. It adds the `_normalize` parameter such that it does
    not normalize the denominator to the Greatest Common Divisor (gcd) when
    the numerator is 0.

    This is most probably only used by the nltk.translate.bleu_score.py where
    numerator and denominator of the different ngram precisions are mutable.
    But the idea of "mutable" fraction might not be applicable to other usages,
    See http://stackoverflow.com/questions/34561265

    This object should be deprecated once NLTK stops supporting Python < 3.5
    See https://github.com/nltk/nltk/issues/1330
    """

    def __new__(cls, numerator=0, denominator=None, _normalize=True):
        """
        Build a fraction, optionally keeping the terms un-reduced.

        :param numerator: same meaning as for ``fractions.Fraction``.
        :param denominator: same meaning as for ``fractions.Fraction``.
        :param _normalize: if False, and ``numerator`` is a plain ``int``
            and ``denominator`` is truthy, the given numerator and
            denominator are stored as-is instead of the reduced values.
        :return: the new ``Fraction`` instance.
        """
        instance = super(Fraction, cls).__new__(cls, numerator, denominator)
        # To emulate fraction.Fraction.from_float across Python >=2.7,
        # check that numerator is an integer and denominator is not None.
        # The exact-type test is deliberate: floats, strings and other
        # fractions must keep the values computed by the base constructor.
        if not _normalize and type(numerator) is int and denominator:
            instance._numerator = numerator
            instance._denominator = denominator
        return instance