1# Natural Language Toolkit: Collections
2#
3# Copyright (C) 2001-2019 NLTK Project
4# Author: Steven Bird <stevenbird1@gmail.com>
5# URL: <http://nltk.org/>
6# For license information, see LICENSE.TXT
7from __future__ import print_function, absolute_import
8
9import bisect
10from itertools import islice, chain
11from functools import total_ordering
12# this unused import is for python 2.7
13from collections import defaultdict, deque, Counter
14
15from six import text_type
16
17from nltk.internals import slice_bounds, raise_unorderable_types
18from nltk.compat import python_2_unicode_compatible
19
20
21##########################################################################
22# Ordered Dictionary
23##########################################################################
24
25
class OrderedDict(dict):
    """A dictionary that remembers the order in which its keys were first
    inserted, with optional ``defaultdict``-style behavior via the
    ``default_factory`` keyword argument.

    NOTE(review): this class predates ``collections.OrderedDict`` and has
    several sharp edges (documented inline); it is preserved as-is for
    backward compatibility.
    """

    def __init__(self, data=None, **kwargs):
        """
        :param data: initial contents -- a dict, an OrderedDict, or a
            list of (key, value) pairs.
        :param keys: (keyword) explicit key ordering; must be a list with
            the same length as ``data``.
        :param default_factory: (keyword) zero-argument callable used by
            ``__missing__`` to supply values for absent keys.
        """
        # ``keys()`` doubles as a constructor helper: given arguments it
        # *derives* the initial key ordering rather than reporting this
        # dict's keys.  NOTE(review): when ``data`` is a plain dict this
        # stores a ``dict.keys()`` view rather than a list, so later
        # ``append``/``remove`` calls on ``self._keys`` would fail --
        # confirm callers pass a pair list or an explicit ``keys`` list.
        self._keys = self.keys(data, kwargs.get('keys'))
        self._default_factory = kwargs.get('default_factory')
        if data is None:
            dict.__init__(self)
        else:
            dict.__init__(self, data)

    def __delitem__(self, key):
        # Remove the entry from both the underlying dict and the
        # key-order list.
        dict.__delitem__(self, key)
        self._keys.remove(key)

    def __getitem__(self, key):
        # EAFP lookup; missing keys are delegated to __missing__ so the
        # default_factory (if any) can supply a value.
        try:
            return dict.__getitem__(self, key)
        except KeyError:
            return self.__missing__(key)

    def __iter__(self):
        # Iterate keys in recorded insertion order.
        return (key for key in self.keys())

    def __missing__(self, key):
        # NOTE(review): raises KeyError with no message, and the factory
        # value is returned but *not* stored in the dict (unlike
        # collections.defaultdict).  If ``_default_factory`` is None but
        # ``key`` is in ``_keys``, the final line would raise TypeError --
        # presumably that state never occurs; verify against callers.
        if not self._default_factory and key not in self._keys:
            raise KeyError()
        return self._default_factory()

    def __setitem__(self, key, item):
        dict.__setitem__(self, key, item)
        # Record first-insertion order; re-assignments keep the original
        # position of the key.
        if key not in self._keys:
            self._keys.append(key)

    def clear(self):
        # Empty both the dict contents and the key-order list.
        dict.clear(self)
        self._keys.clear()

    def copy(self):
        # NOTE(review): ``dict.copy`` returns a plain dict, not an
        # OrderedDict, and the ``_keys`` list is shared (not copied) with
        # the original -- confirm this aliasing is intended.
        d = dict.copy(self)
        d._keys = self._keys
        return d

    def items(self):
        """Return (key, value) pairs in insertion order."""
        # returns iterator under python 3 and list under python 2
        return zip(self.keys(), self.values())

    def keys(self, data=None, keys=None):
        """With no arguments, return this dict's keys in insertion order.
        With arguments, derive the key ordering for ``data`` -- a helper
        used by ``__init__`` and ``update``.
        """
        if data:
            if keys:
                assert isinstance(keys, list)
                assert len(data) == len(keys)
                return keys
            else:
                assert (
                    isinstance(data, dict)
                    or isinstance(data, OrderedDict)
                    or isinstance(data, list)
                )
                if isinstance(data, dict) or isinstance(data, OrderedDict):
                    return data.keys()
                elif isinstance(data, list):
                    # Pair list: the ordering is the order of the pairs.
                    return [key for (key, value) in data]
        elif '_keys' in self.__dict__:
            return self._keys
        else:
            # Reached during __init__, before _keys has been assigned.
            return []

    def popitem(self):
        """Remove and return the most recently inserted (key, value) pair.

        :raise KeyError: if the dictionary is empty.
        """
        # NOTE(review): ``key`` is popped from ``_keys`` here, but
        # ``del self[key]`` then calls ``__delitem__``, which attempts a
        # second ``_keys.remove(key)`` -- verify this path is exercised.
        if not self._keys:
            raise KeyError()

        key = self._keys.pop()
        value = self[key]
        del self[key]
        return (key, value)

    def setdefault(self, key, failobj=None):
        # NOTE(review): unlike dict.setdefault, the stored/existing value
        # is not returned (implicitly returns None).
        dict.setdefault(self, key, failobj)
        if key not in self._keys:
            self._keys.append(key)

    def update(self, data):
        # Merge ``data`` in, appending any previously-unseen keys to the
        # insertion-order list.
        dict.update(self, data)
        for key in self.keys(data):
            if key not in self._keys:
                self._keys.append(key)

    def values(self):
        """Return values in key-insertion order."""
        # returns iterator under python 3
        return map(self.get, self._keys)
115
116
117######################################################################
118# Lazy Sequences
119######################################################################
120
121
@total_ordering
@python_2_unicode_compatible
class AbstractLazySequence(object):
    """
    An abstract base class for read-only sequences whose values are
    computed on demand.  A lazy sequence behaves like a tuple: it can be
    indexed, sliced, and iterated over, but never modified.

    The most common use of lazy sequences in NLTK is for corpus view
    objects, which expose the contents of a corpus without reading the
    whole corpus into memory: pieces are loaded from disk only as they
    are needed.

    Mutating a mutable element of a lazy sequence has undefined results.
    Depending on whether and when the sequence caches the element's value
    or reconstructs it from scratch, the mutation may or may not persist.

    Concrete subclasses must implement two methods: ``__len__()`` and
    ``iterate_from()``.
    """

    def __len__(self):
        """
        Return the number of tokens in the corpus file underlying this
        corpus view.
        """
        raise NotImplementedError('should be implemented by subclass')

    def iterate_from(self, start):
        """
        Return an iterator over the tokens in the corpus file underlying
        this corpus view, beginning with token number ``start``.  When
        ``start >= len(self)``, the iterator yields nothing.
        """
        raise NotImplementedError('should be implemented by subclass')

    def __getitem__(self, i):
        """
        Return the *i* th token of this corpus view.  Negative indices
        and slices are both supported.
        """
        if isinstance(i, slice):
            start, stop = slice_bounds(self, i)
            return LazySubsequence(self, start, stop)
        # Normalize a negative index into an absolute offset.
        index = i
        if index < 0:
            index += len(self)
        if index < 0:
            raise IndexError('index out of range')
        # Pull the single requested element via iterate_from().
        try:
            return next(self.iterate_from(index))
        except StopIteration:
            raise IndexError('index out of range')

    def __iter__(self):
        """Return an iterator over the tokens in the corpus file
        underlying this corpus view."""
        return self.iterate_from(0)

    def count(self, value):
        """Return the number of times this list contains ``value``."""
        matches = (1 for elt in self if elt == value)
        return sum(matches)

    def index(self, value, start=None, stop=None):
        """Return the index of the first occurrence of ``value`` in this
        list at a position >= ``start`` and < ``stop``.  Negative start
        and stop values are treated like negative slice bounds -- i.e.,
        they count from the end of the list."""
        start, stop = slice_bounds(self, slice(start, stop))
        position = start
        for elt in islice(self, start, stop):
            if elt == value:
                return position
            position += 1
        raise ValueError('index(x): x not in list')

    def __contains__(self, value):
        """Return true if this list contains ``value``."""
        return self.count(value) > 0

    def __add__(self, other):
        """Return a list concatenating self with other."""
        return LazyConcatenation([self, other])

    def __radd__(self, other):
        """Return a list concatenating other with self."""
        return LazyConcatenation([other, self])

    def __mul__(self, count):
        """Return a list concatenating self with itself ``count`` times."""
        return LazyConcatenation([self] * count)

    def __rmul__(self, count):
        """Return a list concatenating self with itself ``count`` times."""
        return LazyConcatenation([self] * count)

    # Maximum length of the repr string before truncation kicks in.
    _MAX_REPR_SIZE = 60

    def __repr__(self):
        """
        Return a list-like string representation of this corpus view;
        if it would exceed 60 characters, it is truncated with an
        ellipsis.
        """
        sep = text_type(', ')
        pieces = []
        length = 5  # allowance for the surrounding bracket punctuation
        for elt in self:
            piece = repr(elt)
            pieces.append(piece)
            length += len(piece) + 2
            if length > self._MAX_REPR_SIZE and len(pieces) > 2:
                return '[%s, ...]' % sep.join(pieces[:-1])
        return '[%s]' % sep.join(pieces)

    def __eq__(self, other):
        # Only sequences of the exact same type compare equal.
        if type(self) != type(other):
            return False
        return list(self) == list(other)

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        # Ordering is only defined between sequences of the same type.
        if type(other) != type(self):
            raise_unorderable_types("<", self, other)
        return list(self) < list(other)

    def __hash__(self):
        """
        :raise ValueError: Corpus view objects are unhashable.
        """
        raise ValueError('%s objects are unhashable' % self.__class__.__name__)
254
255
class LazySubsequence(AbstractLazySequence):
    """
    A subsequence produced by slicing a lazy sequence.  The slice holds a
    reference to its source sequence and generates its values by looking
    them up there on demand.
    """

    # Minimum slice length for which a lazy view is worthwhile.  Shorter
    # slices are materialized eagerly (as a list) by __new__.
    MIN_SIZE = 100

    def __new__(cls, source, start, stop):
        """
        Construct a new slice over ``source``.  The ``start`` and ``stop``
        indices must be absolute: non-negative and no greater than
        ``len(source)``.  Slices shorter than ``MIN_SIZE`` are returned as
        an eagerly-built list rather than a lazy view.
        """
        length = stop - start
        if length < cls.MIN_SIZE:
            # Too small to bother being lazy about: materialize it now.
            return list(islice(source.iterate_from(start), length))
        return object.__new__(cls)

    def __init__(self, source, start, stop):
        self._source = source
        self._start = start
        self._stop = stop

    def __len__(self):
        return self._stop - self._start

    def iterate_from(self, start):
        # Translate the view-relative offset into a source offset, and cap
        # the iteration at this view's own length.
        remaining = max(0, len(self) - start)
        return islice(self._source.iterate_from(start + self._start), remaining)
295
296
class LazyConcatenation(AbstractLazySequence):
    """
    A lazy sequence formed by concatenating a list of lists.  This
    underlying list of lists may itself be lazy.  ``LazyConcatenation``
    maintains an index that it uses to keep track of the relationship
    between offsets in the concatenated lists and offsets in the
    sublists.
    """

    def __init__(self, list_of_lists):
        """
        :param list_of_lists: the (possibly lazy) sequence of sublists
            to concatenate.
        """
        self._list = list_of_lists
        # _offsets[i] is the position in the concatenation at which
        # sublist i begins; the table is extended lazily as sublists are
        # visited by iterate_from().
        self._offsets = [0]

    def __len__(self):
        # If the offset table is incomplete, iterate to the end to force
        # every sublist's length to be recorded.
        if len(self._offsets) <= len(self._list):
            for _ in self.iterate_from(self._offsets[-1]):
                pass
        return self._offsets[-1]

    def iterate_from(self, start_index):
        """Yield elements of the concatenation starting at the given
        absolute offset, extending the offset table as a side effect."""
        # Locate the sublist containing start_index (or the first
        # not-yet-indexed sublist if start_index is beyond the table).
        if start_index < self._offsets[-1]:
            sublist_index = bisect.bisect_right(self._offsets, start_index) - 1
        else:
            sublist_index = len(self._offsets) - 1

        index = self._offsets[sublist_index]

        # Construct an iterator over the sublists.
        if isinstance(self._list, AbstractLazySequence):
            sublist_iter = self._list.iterate_from(sublist_index)
        else:
            sublist_iter = islice(self._list, sublist_index, None)

        for sublist in sublist_iter:
            if sublist_index == (len(self._offsets) - 1):
                # First visit: record where the next sublist will begin.
                assert (
                    index + len(sublist) >= self._offsets[-1]
                ), 'offsets not monotonic increasing!'
                self._offsets.append(index + len(sublist))
            else:
                # Revisit: the sublist's length must match the table.
                assert self._offsets[sublist_index + 1] == index + len(
                    sublist
                ), 'inconsistent list value (num elts)'

            # Skip any prefix of the first sublist before start_index.
            for value in sublist[max(0, start_index - index):]:
                yield value

            index += len(sublist)
            sublist_index += 1
346
347
class LazyMap(AbstractLazySequence):
    """
    A lazy sequence whose elements are formed by applying a given
    function to each element in one or more underlying lists.  The
    function is applied lazily -- i.e., when you read a value from the
    list, ``LazyMap`` will calculate that value by applying its
    function to the underlying lists' value(s).  ``LazyMap`` is
    essentially a lazy version of the Python primitive function
    ``map``.  In particular, the following two expressions are
    equivalent:

        >>> from nltk.collections import LazyMap
        >>> function = str
        >>> sequence = [1,2,3]
        >>> map(function, sequence) # doctest: +SKIP
        ['1', '2', '3']
        >>> list(LazyMap(function, sequence))
        ['1', '2', '3']

    Like the Python ``map`` primitive, if the source lists do not have
    equal size, then the value None will be supplied for the
    'missing' elements.

    Lazy maps can be useful for conserving memory, in cases where
    individual values take up a lot of space.  This is especially true
    if the underlying list's values are constructed lazily, as is the
    case with many corpus readers.

    A typical example of a use case for this class is performing
    feature detection on the tokens in a corpus.  Since featuresets
    are encoded as dictionaries, which can take up a lot of memory,
    using a ``LazyMap`` can significantly reduce memory usage when
    training and running classifiers.
    """

    def __init__(self, function, *lists, **config):
        """
        :param function: The function that should be applied to
            elements of ``lists``.  It should take as many arguments
            as there are ``lists``.
        :param lists: The underlying lists.
        :param cache_size: Determines the size of the cache used
            by this lazy map.  (default=5)
        """
        if not lists:
            raise TypeError('LazyMap requires at least two args')

        self._lists = lists
        self._func = function
        self._cache_size = config.get('cache_size', 5)
        # Cache maps absolute indices to computed values; disabled when
        # cache_size is non-positive.
        self._cache = {} if self._cache_size > 0 else None

        # True only when *every* underlying list is lazy; this selects
        # which iteration strategy iterate_from() uses.
        self._all_lazy = all(
            isinstance(lst, AbstractLazySequence) for lst in lists
        )

    def iterate_from(self, index):
        """Yield mapped values starting at the given absolute index."""
        # Special case: one lazy sublist
        if len(self._lists) == 1 and self._all_lazy:
            for value in self._lists[0].iterate_from(index):
                yield self._func(value)
            return

        # Special case: one non-lazy sublist
        elif len(self._lists) == 1:
            while True:
                try:
                    yield self._func(self._lists[0][index])
                except IndexError:
                    return
                index += 1

        # Special case: n lazy sublists
        elif self._all_lazy:
            iterators = [lst.iterate_from(index) for lst in self._lists]
            while True:
                elements = []
                for iterator in iterators:
                    try:
                        elements.append(next(iterator))
                    except StopIteration:
                        # Exhausted iterators are padded with None, like
                        # Python 2's map() on unequal-length inputs.
                        elements.append(None)
                if elements == [None] * len(self._lists):
                    # Every sublist is exhausted: stop.
                    return
                yield self._func(*elements)
                index += 1

        # general case
        else:
            while True:
                try:
                    elements = [lst[index] for lst in self._lists]
                except IndexError:
                    # At least one list ended; retry each list separately,
                    # padding the exhausted ones with None.
                    elements = [None] * len(self._lists)
                    for i, lst in enumerate(self._lists):
                        try:
                            elements[i] = lst[index]
                        except IndexError:
                            pass
                    if elements == [None] * len(self._lists):
                        return
                yield self._func(*elements)
                index += 1

    def __getitem__(self, index):
        """Return the mapped value (or a sliced LazyMap) at ``index``."""
        if isinstance(index, slice):
            sliced_lists = [lst[index] for lst in self._lists]
            return LazyMap(self._func, *sliced_lists)
        else:
            # Handle negative indices
            if index < 0:
                index += len(self)
            if index < 0:
                raise IndexError('index out of range')
            # Check the cache
            if self._cache is not None and index in self._cache:
                return self._cache[index]
            # Calculate the value
            try:
                val = next(self.iterate_from(index))
            except StopIteration:
                raise IndexError('index out of range')
            # Update the cache, evicting first so it never exceeds
            # cache_size entries.
            if self._cache is not None:
                if len(self._cache) >= self._cache_size:
                    self._cache.popitem()  # discard an arbitrary entry
                self._cache[index] = val
            # Return the value
            return val

    def __len__(self):
        # Like map() in Python 2, the result is as long as the longest
        # input (shorter inputs are padded with None).
        return max(len(lst) for lst in self._lists)
483
484
class LazyZip(LazyMap):
    """
    A lazy sequence whose elements are tuples, each containing the i-th
    element from each of the argument sequences.  The returned list is
    truncated in length to the length of the shortest argument sequence.
    The tuples are constructed lazily -- i.e., when you read a value from
    the list, ``LazyZip`` will calculate that value by forming a tuple
    from the i-th element of each of the argument sequences.

    ``LazyZip`` is essentially a lazy version of the Python primitive
    function ``zip``.  In particular, an evaluated LazyZip is equivalent
    to a zip:

        >>> from nltk.collections import LazyZip
        >>> sequence1, sequence2 = [1, 2, 3], ['a', 'b', 'c']
        >>> zip(sequence1, sequence2) # doctest: +SKIP
        [(1, 'a'), (2, 'b'), (3, 'c')]
        >>> list(LazyZip(sequence1, sequence2))
        [(1, 'a'), (2, 'b'), (3, 'c')]
        >>> sequences = [sequence1, sequence2, [6,7,8,9]]
        >>> list(zip(*sequences)) == list(LazyZip(*sequences))
        True

    Lazy zips can be useful for conserving memory in cases where the
    argument sequences are particularly long, e.g. when pairing long
    sequences of gold-standard and predicted values to compute accuracy.
    """

    def __init__(self, *lists):
        """
        :param lists: the underlying lists
        :type lists: list(list)
        """
        def make_tuple(*elts):
            # Package one element from each list into a tuple.
            return elts

        LazyMap.__init__(self, make_tuple, *lists)

    def iterate_from(self, index):
        # Delegate to LazyMap, but stop at the shortest list's length
        # instead of padding with None.
        tuples = LazyMap.iterate_from(self, index)
        position = index
        while position < len(self):
            yield next(tuples)
            position += 1
        return

    def __len__(self):
        # Truncate to the shortest underlying list, like zip().
        lengths = [len(lst) for lst in self._lists]
        return min(lengths)
533
534
class LazyEnumerate(LazyZip):
    """
    A lazy sequence whose elements are tuples, each containing a count
    (from zero) and a value yielded by the underlying sequence.
    ``LazyEnumerate`` is useful for obtaining an indexed list.  The
    tuples are constructed lazily -- i.e., when you read a value from
    the list, ``LazyEnumerate`` will calculate that value by forming a
    tuple from the count of the i-th element and the i-th element of the
    underlying sequence.

    ``LazyEnumerate`` is essentially a lazy version of the Python
    primitive function ``enumerate``.  In particular, the following two
    expressions are equivalent:

        >>> from nltk.collections import LazyEnumerate
        >>> sequence = ['first', 'second', 'third']
        >>> list(enumerate(sequence))
        [(0, 'first'), (1, 'second'), (2, 'third')]
        >>> list(LazyEnumerate(sequence))
        [(0, 'first'), (1, 'second'), (2, 'third')]

    Lazy enumerations can be useful for conserving memory in cases where
    the underlying sequence is particularly long, since tuples are built
    lazily rather than materialized as a second long sequence.
    """

    def __init__(self, lst):
        """
        :param lst: the underlying list
        :type lst: list
        """
        # Pair each element with its position by zipping the sequence
        # against the range of its own indices.
        indices = range(len(lst))
        LazyZip.__init__(self, indices, lst)
570
571
class LazyIteratorList(AbstractLazySequence):
    """
    Wraps an iterator, loading its elements on demand
    and making them subscriptable.
    __repr__ displays only the first few elements.
    """

    def __init__(self, it, known_len=None):
        """
        :param it: the iterator to wrap.
        :param known_len: the iterator's length, if already known; this
            lets ``len()`` answer without exhausting the iterator.
        """
        self._it = it
        self._len = known_len
        # Elements consumed from the iterator so far, in order.
        self._cache = []

    def __len__(self):
        # Compare against None explicitly: a known length of 0 is valid
        # and must not trigger a (pointless) re-exhaustion pass.
        if self._len is not None:
            return self._len
        # Exhaust the iterator to learn the total length.
        for _ in self.iterate_from(len(self._cache)):
            pass
        self._len = len(self._cache)
        return self._len

    def iterate_from(self, start):
        """Create a new iterator over this list starting at the given offset."""
        # StopIteration must be caught explicitly in all three loops:
        # letting it escape a generator raises RuntimeError under PEP 479
        # (Python 3.7+).
        while len(self._cache) < start:
            try:
                v = next(self._it)
            except StopIteration:
                return
            self._cache.append(v)
        i = start
        while i < len(self._cache):
            yield self._cache[i]
            i += 1
        while True:
            try:
                v = next(self._it)
            except StopIteration:
                return
            self._cache.append(v)
            yield v

    def __add__(self, other):
        """Return a list concatenating self with other."""
        return type(self)(chain(self, other))

    def __radd__(self, other):
        """Return a list concatenating other with self."""
        return type(self)(chain(other, self))
614
615
616######################################################################
617# Trie Implementation
618######################################################################
class Trie(dict):
    """A Trie implementation for strings"""

    # Sentinel key marking that a complete string terminates at a node.
    LEAF = True

    def __init__(self, strings=None):
        """Builds a Trie object, which is built around a ``dict``

        If ``strings`` is provided, it will add the ``strings``, which
        consist of a ``list`` of ``strings``, to the Trie.
        Otherwise, it'll construct an empty Trie.

        :param strings: List of strings to insert into the trie
            (Default is ``None``)
        :type strings: list(str)

        """
        super(Trie, self).__init__()
        for string in strings or ():
            self.insert(string)

    def insert(self, string):
        """Inserts ``string`` into the Trie

        :param string: String to insert into the trie
        :type string: str

        :Example:

        >>> from nltk.collections import Trie
        >>> trie = Trie(["abc", "def"])
        >>> expected = {'a': {'b': {'c': {True: None}}}, \
                        'd': {'e': {'f': {True: None}}}}
        >>> trie == expected
        True

        """
        # Walk (and extend) the chain of child nodes one character at a
        # time; __missing__ creates intermediate nodes on demand.
        node = self
        for char in string:
            node = node[char]
        # Mark that a complete string terminates at this node.
        node[Trie.LEAF] = None

    def __missing__(self, key):
        # Absent children are created lazily as empty sub-tries.
        child = Trie()
        self[key] = child
        return child
666