1# Natural Language Toolkit: Collections 2# 3# Copyright (C) 2001-2019 NLTK Project 4# Author: Steven Bird <stevenbird1@gmail.com> 5# URL: <http://nltk.org/> 6# For license information, see LICENSE.TXT 7from __future__ import print_function, absolute_import 8 9import bisect 10from itertools import islice, chain 11from functools import total_ordering 12# this unused import is for python 2.7 13from collections import defaultdict, deque, Counter 14 15from six import text_type 16 17from nltk.internals import slice_bounds, raise_unorderable_types 18from nltk.compat import python_2_unicode_compatible 19 20 21########################################################################## 22# Ordered Dictionary 23########################################################################## 24 25 26class OrderedDict(dict): 27 def __init__(self, data=None, **kwargs): 28 self._keys = self.keys(data, kwargs.get('keys')) 29 self._default_factory = kwargs.get('default_factory') 30 if data is None: 31 dict.__init__(self) 32 else: 33 dict.__init__(self, data) 34 35 def __delitem__(self, key): 36 dict.__delitem__(self, key) 37 self._keys.remove(key) 38 39 def __getitem__(self, key): 40 try: 41 return dict.__getitem__(self, key) 42 except KeyError: 43 return self.__missing__(key) 44 45 def __iter__(self): 46 return (key for key in self.keys()) 47 48 def __missing__(self, key): 49 if not self._default_factory and key not in self._keys: 50 raise KeyError() 51 return self._default_factory() 52 53 def __setitem__(self, key, item): 54 dict.__setitem__(self, key, item) 55 if key not in self._keys: 56 self._keys.append(key) 57 58 def clear(self): 59 dict.clear(self) 60 self._keys.clear() 61 62 def copy(self): 63 d = dict.copy(self) 64 d._keys = self._keys 65 return d 66 67 def items(self): 68 # returns iterator under python 3 and list under python 2 69 return zip(self.keys(), self.values()) 70 71 def keys(self, data=None, keys=None): 72 if data: 73 if keys: 74 assert isinstance(keys, list) 75 assert len(data) == len(keys) 76 return keys 77 else: 78 assert ( 79 isinstance(data, dict) 80 or isinstance(data, OrderedDict) 81 or isinstance(data, list) 82 ) 83 if isinstance(data, dict) or isinstance(data, OrderedDict): 84 return data.keys() 85 elif isinstance(data, list): 86 return [key for (key, value) in data] 87 elif '_keys' in self.__dict__: 88 return self._keys 89 else: 90 return [] 91 92 def popitem(self): 93 if not self._keys: 94 raise KeyError() 95 96 key = self._keys.pop() 97 value = self[key] 98 del self[key] 99 return (key, value) 100 101 def setdefault(self, key, failobj=None): 102 dict.setdefault(self, key, failobj) 103 if key not in self._keys: 104 self._keys.append(key) 105 106 def update(self, data): 107 dict.update(self, data) 108 for key in self.keys(data): 109 if key not in self._keys: 110 self._keys.append(key) 111 112 def values(self): 113 # returns iterator under python 3 114 return map(self.get, self._keys) 115 116 117###################################################################### 118# Lazy Sequences 119###################################################################### 120 121 122@total_ordering 123@python_2_unicode_compatible 124class AbstractLazySequence(object): 125 """ 126 An abstract base class for read-only sequences whose values are 127 computed as needed. Lazy sequences act like tuples -- they can be 128 indexed, sliced, and iterated over; but they may not be modified. 129 130 The most common application of lazy sequences in NLTK is for 131 corpus view objects, which provide access to the contents of a 132 corpus without loading the entire corpus into memory, by loading 133 pieces of the corpus from disk as needed. 134 135 The result of modifying a mutable element of a lazy sequence is 136 undefined. In particular, the modifications made to the element 137 may or may not persist, depending on whether and when the lazy 138 sequence caches that element's value or reconstructs it from 139 scratch. 140 141 Subclasses are required to define two methods: ``__len__()`` 142 and ``iterate_from()``. 143 """ 144 145 def __len__(self): 146 """ 147 Return the number of tokens in the corpus file underlying this 148 corpus view. 149 """ 150 raise NotImplementedError('should be implemented by subclass') 151 152 def iterate_from(self, start): 153 """ 154 Return an iterator that generates the tokens in the corpus 155 file underlying this corpus view, starting at the token number 156 ``start``. If ``start>=len(self)``, then this iterator will 157 generate no tokens. 158 """ 159 raise NotImplementedError('should be implemented by subclass') 160 161 def __getitem__(self, i): 162 """ 163 Return the *i* th token in the corpus file underlying this 164 corpus view. Negative indices and spans are both supported. 165 """ 166 if isinstance(i, slice): 167 start, stop = slice_bounds(self, i) 168 return LazySubsequence(self, start, stop) 169 else: 170 # Handle negative indices 171 if i < 0: 172 i += len(self) 173 if i < 0: 174 raise IndexError('index out of range') 175 # Use iterate_from to extract it. 176 try: 177 return next(self.iterate_from(i)) 178 except StopIteration: 179 raise IndexError('index out of range') 180 181 def __iter__(self): 182 """Return an iterator that generates the tokens in the corpus 183 file underlying this corpus view.""" 184 return self.iterate_from(0) 185 186 def count(self, value): 187 """Return the number of times this list contains ``value``.""" 188 return sum(1 for elt in self if elt == value) 189 190 def index(self, value, start=None, stop=None): 191 """Return the index of the first occurrence of ``value`` in this 192 list that is greater than or equal to ``start`` and less than 193 ``stop``. Negative start and stop values are treated like negative 194 slice bounds -- i.e., they count from the end of the list.""" 195 start, stop = slice_bounds(self, slice(start, stop)) 196 for i, elt in enumerate(islice(self, start, stop)): 197 if elt == value: 198 return i + start 199 raise ValueError('index(x): x not in list') 200 201 def __contains__(self, value): 202 """Return true if this list contains ``value``.""" 203 return bool(self.count(value)) 204 205 def __add__(self, other): 206 """Return a list concatenating self with other.""" 207 return LazyConcatenation([self, other]) 208 209 def __radd__(self, other): 210 """Return a list concatenating other with self.""" 211 return LazyConcatenation([other, self]) 212 213 def __mul__(self, count): 214 """Return a list concatenating self with itself ``count`` times.""" 215 return LazyConcatenation([self] * count) 216 217 def __rmul__(self, count): 218 """Return a list concatenating self with itself ``count`` times.""" 219 return LazyConcatenation([self] * count) 220 221 _MAX_REPR_SIZE = 60 222 223 def __repr__(self): 224 """ 225 Return a string representation for this corpus view that is 226 similar to a list's representation; but if it would be more 227 than 60 characters long, it is truncated. 228 """ 229 pieces = [] 230 length = 5 231 for elt in self: 232 pieces.append(repr(elt)) 233 length += len(pieces[-1]) + 2 234 if length > self._MAX_REPR_SIZE and len(pieces) > 2: 235 return '[%s, ...]' % text_type(', ').join(pieces[:-1]) 236 return '[%s]' % text_type(', ').join(pieces) 237 238 def __eq__(self, other): 239 return type(self) == type(other) and list(self) == list(other) 240 241 def __ne__(self, other): 242 return not self == other 243 244 def __lt__(self, other): 245 if type(other) != type(self): 246 raise_unorderable_types("<", self, other) 247 return list(self) < list(other) 248 249 def __hash__(self): 250 """ 251 :raise ValueError: Corpus view objects are unhashable. 252 """ 253 raise ValueError('%s objects are unhashable' % self.__class__.__name__) 254 255 256class LazySubsequence(AbstractLazySequence): 257 """ 258 A subsequence produced by slicing a lazy sequence. This slice 259 keeps a reference to its source sequence, and generates its values 260 by looking them up in the source sequence. 261 """ 262 263 MIN_SIZE = 100 264 """ 265 The minimum size for which lazy slices should be created. If 266 ``LazySubsequence()`` is called with a subsequence that is 267 shorter than ``MIN_SIZE``, then a tuple will be returned instead. 268 """ 269 270 def __new__(cls, source, start, stop): 271 """ 272 Construct a new slice from a given underlying sequence. The 273 ``start`` and ``stop`` indices should be absolute indices -- 274 i.e., they should not be negative (for indexing from the back 275 of a list) or greater than the length of ``source``. 276 """ 277 # If the slice is small enough, just use a tuple. 278 if stop - start < cls.MIN_SIZE: 279 return list(islice(source.iterate_from(start), stop - start)) 280 else: 281 return object.__new__(cls) 282 283 def __init__(self, source, start, stop): 284 self._source = source 285 self._start = start 286 self._stop = stop 287 288 def __len__(self): 289 return self._stop - self._start 290 291 def iterate_from(self, start): 292 return islice( 293 self._source.iterate_from(start + self._start), max(0, len(self) - start) 294 ) 295 296 297class LazyConcatenation(AbstractLazySequence): 298 """ 299 A lazy sequence formed by concatenating a list of lists. This 300 underlying list of lists may itself be lazy. ``LazyConcatenation`` 301 maintains an index that it uses to keep track of the relationship 302 between offsets in the concatenated lists and offsets in the 303 sublists. 304 """ 305 306 def __init__(self, list_of_lists): 307 self._list = list_of_lists 308 self._offsets = [0] 309 310 def __len__(self): 311 if len(self._offsets) <= len(self._list): 312 for tok in self.iterate_from(self._offsets[-1]): 313 pass 314 return self._offsets[-1] 315 316 def iterate_from(self, start_index): 317 if start_index < self._offsets[-1]: 318 sublist_index = bisect.bisect_right(self._offsets, start_index) - 1 319 else: 320 sublist_index = len(self._offsets) - 1 321 322 index = self._offsets[sublist_index] 323 324 # Construct an iterator over the sublists. 325 if isinstance(self._list, AbstractLazySequence): 326 sublist_iter = self._list.iterate_from(sublist_index) 327 else: 328 sublist_iter = islice(self._list, sublist_index, None) 329 330 for sublist in sublist_iter: 331 if sublist_index == (len(self._offsets) - 1): 332 assert ( 333 index + len(sublist) >= self._offsets[-1] 334 ), 'offests not monotonic increasing!' 335 self._offsets.append(index + len(sublist)) 336 else: 337 assert self._offsets[sublist_index + 1] == index + len( 338 sublist 339 ), 'inconsistent list value (num elts)' 340 341 for value in sublist[max(0, start_index - index) :]: 342 yield value 343 344 index += len(sublist) 345 sublist_index += 1 346 347 348class LazyMap(AbstractLazySequence): 349 """ 350 A lazy sequence whose elements are formed by applying a given 351 function to each element in one or more underlying lists. The 352 function is applied lazily -- i.e., when you read a value from the 353 list, ``LazyMap`` will calculate that value by applying its 354 function to the underlying lists' value(s). ``LazyMap`` is 355 essentially a lazy version of the Python primitive function 356 ``map``. In particular, the following two expressions are 357 equivalent: 358 359 >>> from nltk.collections import LazyMap 360 >>> function = str 361 >>> sequence = [1,2,3] 362 >>> map(function, sequence) # doctest: +SKIP 363 ['1', '2', '3'] 364 >>> list(LazyMap(function, sequence)) 365 ['1', '2', '3'] 366 367 Like the Python ``map`` primitive, if the source lists do not have 368 equal size, then the value None will be supplied for the 369 'missing' elements. 370 371 Lazy maps can be useful for conserving memory, in cases where 372 individual values take up a lot of space. This is especially true 373 if the underlying list's values are constructed lazily, as is the 374 case with many corpus readers. 375 376 A typical example of a use case for this class is performing 377 feature detection on the tokens in a corpus. Since featuresets 378 are encoded as dictionaries, which can take up a lot of memory, 379 using a ``LazyMap`` can significantly reduce memory usage when 380 training and running classifiers. 381 """ 382 383 def __init__(self, function, *lists, **config): 384 """ 385 :param function: The function that should be applied to 386 elements of ``lists``. It should take as many arguments 387 as there are ``lists``. 388 :param lists: The underlying lists. 389 :param cache_size: Determines the size of the cache used 390 by this lazy map. (default=5) 391 """ 392 if not lists: 393 raise TypeError('LazyMap requires at least two args') 394 395 self._lists = lists 396 self._func = function 397 self._cache_size = config.get('cache_size', 5) 398 self._cache = {} if self._cache_size > 0 else None 399 400 # If you just take bool() of sum() here _all_lazy will be true just 401 # in case n >= 1 list is an AbstractLazySequence. Presumably this 402 # isn't what's intended. 403 self._all_lazy = sum( 404 isinstance(lst, AbstractLazySequence) for lst in lists 405 ) == len(lists) 406 407 def iterate_from(self, index): 408 # Special case: one lazy sublist 409 if len(self._lists) == 1 and self._all_lazy: 410 for value in self._lists[0].iterate_from(index): 411 yield self._func(value) 412 return 413 414 # Special case: one non-lazy sublist 415 elif len(self._lists) == 1: 416 while True: 417 try: 418 yield self._func(self._lists[0][index]) 419 except IndexError: 420 return 421 index += 1 422 423 # Special case: n lazy sublists 424 elif self._all_lazy: 425 iterators = [lst.iterate_from(index) for lst in self._lists] 426 while True: 427 elements = [] 428 for iterator in iterators: 429 try: 430 elements.append(next(iterator)) 431 except: # FIXME: What is this except really catching? StopIteration? 432 elements.append(None) 433 if elements == [None] * len(self._lists): 434 return 435 yield self._func(*elements) 436 index += 1 437 438 # general case 439 else: 440 while True: 441 try: 442 elements = [lst[index] for lst in self._lists] 443 except IndexError: 444 elements = [None] * len(self._lists) 445 for i, lst in enumerate(self._lists): 446 try: 447 elements[i] = lst[index] 448 except IndexError: 449 pass 450 if elements == [None] * len(self._lists): 451 return 452 yield self._func(*elements) 453 index += 1 454 455 def __getitem__(self, index): 456 if isinstance(index, slice): 457 sliced_lists = [lst[index] for lst in self._lists] 458 return LazyMap(self._func, *sliced_lists) 459 else: 460 # Handle negative indices 461 if index < 0: 462 index += len(self) 463 if index < 0: 464 raise IndexError('index out of range') 465 # Check the cache 466 if self._cache is not None and index in self._cache: 467 return self._cache[index] 468 # Calculate the value 469 try: 470 val = next(self.iterate_from(index)) 471 except StopIteration: 472 raise IndexError('index out of range') 473 # Update the cache 474 if self._cache is not None: 475 if len(self._cache) > self._cache_size: 476 self._cache.popitem() # discard random entry 477 self._cache[index] = val 478 # Return the value 479 return val 480 481 def __len__(self): 482 return max(len(lst) for lst in self._lists) 483 484 485class LazyZip(LazyMap): 486 """ 487 A lazy sequence whose elements are tuples, each containing the i-th 488 element from each of the argument sequences. The returned list is 489 truncated in length to the length of the shortest argument sequence. The 490 tuples are constructed lazily -- i.e., when you read a value from the 491 list, ``LazyZip`` will calculate that value by forming a tuple from 492 the i-th element of each of the argument sequences. 493 494 ``LazyZip`` is essentially a lazy version of the Python primitive function 495 ``zip``. In particular, an evaluated LazyZip is equivalent to a zip: 496 497 >>> from nltk.collections import LazyZip 498 >>> sequence1, sequence2 = [1, 2, 3], ['a', 'b', 'c'] 499 >>> zip(sequence1, sequence2) # doctest: +SKIP 500 [(1, 'a'), (2, 'b'), (3, 'c')] 501 >>> list(LazyZip(sequence1, sequence2)) 502 [(1, 'a'), (2, 'b'), (3, 'c')] 503 >>> sequences = [sequence1, sequence2, [6,7,8,9]] 504 >>> list(zip(*sequences)) == list(LazyZip(*sequences)) 505 True 506 507 Lazy zips can be useful for conserving memory in cases where the argument 508 sequences are particularly long. 509 510 A typical example of a use case for this class is combining long sequences 511 of gold standard and predicted values in a classification or tagging task 512 in order to calculate accuracy. By constructing tuples lazily and 513 avoiding the creation of an additional long sequence, memory usage can be 514 significantly reduced. 515 """ 516 517 def __init__(self, *lists): 518 """ 519 :param lists: the underlying lists 520 :type lists: list(list) 521 """ 522 LazyMap.__init__(self, lambda *elts: elts, *lists) 523 524 def iterate_from(self, index): 525 iterator = LazyMap.iterate_from(self, index) 526 while index < len(self): 527 yield next(iterator) 528 index += 1 529 return 530 531 def __len__(self): 532 return min(len(lst) for lst in self._lists) 533 534 535class LazyEnumerate(LazyZip): 536 """ 537 A lazy sequence whose elements are tuples, each ontaining a count (from 538 zero) and a value yielded by underlying sequence. ``LazyEnumerate`` is 539 useful for obtaining an indexed list. The tuples are constructed lazily 540 -- i.e., when you read a value from the list, ``LazyEnumerate`` will 541 calculate that value by forming a tuple from the count of the i-th 542 element and the i-th element of the underlying sequence. 543 544 ``LazyEnumerate`` is essentially a lazy version of the Python primitive 545 function ``enumerate``. In particular, the following two expressions are 546 equivalent: 547 548 >>> from nltk.collections import LazyEnumerate 549 >>> sequence = ['first', 'second', 'third'] 550 >>> list(enumerate(sequence)) 551 [(0, 'first'), (1, 'second'), (2, 'third')] 552 >>> list(LazyEnumerate(sequence)) 553 [(0, 'first'), (1, 'second'), (2, 'third')] 554 555 Lazy enumerations can be useful for conserving memory in cases where the 556 argument sequences are particularly long. 557 558 A typical example of a use case for this class is obtaining an indexed 559 list for a long sequence of values. By constructing tuples lazily and 560 avoiding the creation of an additional long sequence, memory usage can be 561 significantly reduced. 562 """ 563 564 def __init__(self, lst): 565 """ 566 :param lst: the underlying list 567 :type lst: list 568 """ 569 LazyZip.__init__(self, range(len(lst)), lst) 570 571 572class LazyIteratorList(AbstractLazySequence): 573 """ 574 Wraps an iterator, loading its elements on demand 575 and making them subscriptable. 576 __repr__ displays only the first few elements. 577 """ 578 579 def __init__(self, it, known_len=None): 580 self._it = it 581 self._len = known_len 582 self._cache = [] 583 584 def __len__(self): 585 if self._len: 586 return self._len 587 for x in self.iterate_from(len(self._cache)): 588 pass 589 self._len = len(self._cache) 590 return self._len 591 592 def iterate_from(self, start): 593 """Create a new iterator over this list starting at the given offset.""" 594 while len(self._cache) < start: 595 v = next(self._it) 596 self._cache.append(v) 597 i = start 598 while i < len(self._cache): 599 yield self._cache[i] 600 i += 1 601 while True: 602 v = next(self._it) 603 self._cache.append(v) 604 yield v 605 i += 1 606 607 def __add__(self, other): 608 """Return a list concatenating self with other.""" 609 return type(self)(chain(self, other)) 610 611 def __radd__(self, other): 612 """Return a list concatenating other with self.""" 613 return type(self)(chain(other, self)) 614 615 616###################################################################### 617# Trie Implementation 618###################################################################### 619class Trie(dict): 620 """A Trie implementation for strings""" 621 622 LEAF = True 623 624 def __init__(self, strings=None): 625 """Builds a Trie object, which is built around a ``dict`` 626 627 If ``strings`` is provided, it will add the ``strings``, which 628 consist of a ``list`` of ``strings``, to the Trie. 629 Otherwise, it'll construct an empty Trie. 630 631 :param strings: List of strings to insert into the trie 632 (Default is ``None``) 633 :type strings: list(str) 634 635 """ 636 super(Trie, self).__init__() 637 if strings: 638 for string in strings: 639 self.insert(string) 640 641 def insert(self, string): 642 """Inserts ``string`` into the Trie 643 644 :param string: String to insert into the trie 645 :type string: str 646 647 :Example: 648 649 >>> from nltk.collections import Trie 650 >>> trie = Trie(["abc", "def"]) 651 >>> expected = {'a': {'b': {'c': {True: None}}}, \ 652 'd': {'e': {'f': {True: None}}}} 653 >>> trie == expected 654 True 655 656 """ 657 if len(string): 658 self[string[0]].insert(string[1:]) 659 else: 660 # mark the string is complete 661 self[Trie.LEAF] = None 662 663 def __missing__(self, key): 664 self[key] = Trie() 665 return self[key] 666