1# -*- coding: utf-8 -*-
2# Natural Language Toolkit: A Chart Parser
3#
4# Copyright (C) 2001-2019 NLTK Project
5# Author: Edward Loper <edloper@gmail.com>
6#         Steven Bird <stevenbird1@gmail.com>
7#         Jean Mark Gawron <gawron@mail.sdsu.edu>
8#         Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
9# URL: <http://nltk.org/>
10# For license information, see LICENSE.TXT
11
12"""
13Data classes and parser implementations for "chart parsers", which
14use dynamic programming to efficiently parse a text.  A chart
15parser derives parse trees for a text by iteratively adding "edges"
16to a "chart."  Each edge represents a hypothesis about the tree
17structure for a subsequence of the text.  The chart is a
18"blackboard" for composing and combining these hypotheses.
19
20When a chart parser begins parsing a text, it creates a new (empty)
21chart, spanning the text.  It then incrementally adds new edges to the
22chart.  A set of "chart rules" specifies the conditions under which
23new edges should be added to the chart.  Once the chart reaches a
24stage where none of the chart rules adds any new edges, parsing is
25complete.
26
27Charts are encoded with the ``Chart`` class, and edges are encoded with
28the ``TreeEdge`` and ``LeafEdge`` classes.  The chart parser module
29defines three chart parsers:
30
31  - ``ChartParser`` is a simple and flexible chart parser.  Given a
32    set of chart rules, it will apply those rules to the chart until
33    no more edges are added.
34
35  - ``SteppingChartParser`` is a subclass of ``ChartParser`` that can
36    be used to step through the parsing process.
37"""
38from __future__ import print_function, division, unicode_literals
39
40import itertools
41import re
42import warnings
43from functools import total_ordering
44
45from six.moves import range
46
47from nltk.tree import Tree
48from nltk.grammar import PCFG, is_nonterminal, is_terminal
49from nltk.util import OrderedDict
50from nltk.internals import raise_unorderable_types
51from nltk.compat import python_2_unicode_compatible, unicode_repr
52
53from nltk.parse.api import ParserI
54
55
56########################################################################
57##  Edges
58########################################################################
59
60
61@total_ordering
62class EdgeI(object):
63    """
64    A hypothesis about the structure of part of a sentence.
65    Each edge records the fact that a structure is (partially)
66    consistent with the sentence.  An edge contains:
67
68    - A span, indicating what part of the sentence is
69      consistent with the hypothesized structure.
70    - A left-hand side, specifying what kind of structure is
71      hypothesized.
72    - A right-hand side, specifying the contents of the
73      hypothesized structure.
74    - A dot position, indicating how much of the hypothesized
75      structure is consistent with the sentence.
76
77    Every edge is either complete or incomplete:
78
79    - An edge is complete if its structure is fully consistent
80      with the sentence.
81    - An edge is incomplete if its structure is partially
82      consistent with the sentence.  For every incomplete edge, the
83      span specifies a possible prefix for the edge's structure.
84
85    There are two kinds of edge:
86
87    - A ``TreeEdge`` records which trees have been found to
88      be (partially) consistent with the text.
89    - A ``LeafEdge`` records the tokens occurring in the text.
90
91    The ``EdgeI`` interface provides a common interface to both types
92    of edge, allowing chart parsers to treat them in a uniform manner.
93    """
94
95    def __init__(self):
96        if self.__class__ == EdgeI:
97            raise TypeError('Edge is an abstract interface')
98
99    # ////////////////////////////////////////////////////////////
100    # Span
101    # ////////////////////////////////////////////////////////////
102
103    def span(self):
104        """
105        Return a tuple ``(s, e)``, where ``tokens[s:e]`` is the
106        portion of the sentence that is consistent with this
107        edge's structure.
108
109        :rtype: tuple(int, int)
110        """
111        raise NotImplementedError()
112
113    def start(self):
114        """
115        Return the start index of this edge's span.
116
117        :rtype: int
118        """
119        raise NotImplementedError()
120
121    def end(self):
122        """
123        Return the end index of this edge's span.
124
125        :rtype: int
126        """
127        raise NotImplementedError()
128
129    def length(self):
130        """
131        Return the length of this edge's span.
132
133        :rtype: int
134        """
135        raise NotImplementedError()
136
137    # ////////////////////////////////////////////////////////////
138    # Left Hand Side
139    # ////////////////////////////////////////////////////////////
140
141    def lhs(self):
142        """
143        Return this edge's left-hand side, which specifies what kind
144        of structure is hypothesized by this edge.
145
146        :see: ``TreeEdge`` and ``LeafEdge`` for a description of
147            the left-hand side values for each edge type.
148        """
149        raise NotImplementedError()
150
151    # ////////////////////////////////////////////////////////////
152    # Right Hand Side
153    # ////////////////////////////////////////////////////////////
154
155    def rhs(self):
156        """
157        Return this edge's right-hand side, which specifies
158        the content of the structure hypothesized by this edge.
159
160        :see: ``TreeEdge`` and ``LeafEdge`` for a description of
161            the right-hand side values for each edge type.
162        """
163        raise NotImplementedError()
164
165    def dot(self):
166        """
167        Return this edge's dot position, which indicates how much of
168        the hypothesized structure is consistent with the
169        sentence.  In particular, ``self.rhs[:dot]`` is consistent
170        with ``tokens[self.start():self.end()]``.
171
172        :rtype: int
173        """
174        raise NotImplementedError()
175
176    def nextsym(self):
177        """
178        Return the element of this edge's right-hand side that
179        immediately follows its dot.
180
181        :rtype: Nonterminal or terminal or None
182        """
183        raise NotImplementedError()
184
185    def is_complete(self):
186        """
187        Return True if this edge's structure is fully consistent
188        with the text.
189
190        :rtype: bool
191        """
192        raise NotImplementedError()
193
194    def is_incomplete(self):
195        """
196        Return True if this edge's structure is partially consistent
197        with the text.
198
199        :rtype: bool
200        """
201        raise NotImplementedError()
202
203    # ////////////////////////////////////////////////////////////
204    # Comparisons & hashing
205    # ////////////////////////////////////////////////////////////
206
207    def __eq__(self, other):
208        return (
209            self.__class__ is other.__class__
210            and self._comparison_key == other._comparison_key
211        )
212
213    def __ne__(self, other):
214        return not self == other
215
216    def __lt__(self, other):
217        if not isinstance(other, EdgeI):
218            raise_unorderable_types("<", self, other)
219        if self.__class__ is other.__class__:
220            return self._comparison_key < other._comparison_key
221        else:
222            return self.__class__.__name__ < other.__class__.__name__
223
224    def __hash__(self):
225        try:
226            return self._hash
227        except AttributeError:
228            self._hash = hash(self._comparison_key)
229            return self._hash
230
231
232@python_2_unicode_compatible
233class TreeEdge(EdgeI):
234    """
235    An edge that records the fact that a tree is (partially)
236    consistent with the sentence.  A tree edge consists of:
237
238    - A span, indicating what part of the sentence is
239      consistent with the hypothesized tree.
240    - A left-hand side, specifying the hypothesized tree's node
241      value.
242    - A right-hand side, specifying the hypothesized tree's
243      children.  Each element of the right-hand side is either a
244      terminal, specifying a token with that terminal as its leaf
245      value; or a nonterminal, specifying a subtree with that
246      nonterminal's symbol as its node value.
247    - A dot position, indicating which children are consistent
248      with part of the sentence.  In particular, if ``dot`` is the
249      dot position, ``rhs`` is the right-hand size, ``(start,end)``
250      is the span, and ``sentence`` is the list of tokens in the
251      sentence, then ``tokens[start:end]`` can be spanned by the
252      children specified by ``rhs[:dot]``.
253
254    For more information about edges, see the ``EdgeI`` interface.
255    """
256
257    def __init__(self, span, lhs, rhs, dot=0):
258        """
259        Construct a new ``TreeEdge``.
260
261        :type span: tuple(int, int)
262        :param span: A tuple ``(s, e)``, where ``tokens[s:e]`` is the
263            portion of the sentence that is consistent with the new
264            edge's structure.
265        :type lhs: Nonterminal
266        :param lhs: The new edge's left-hand side, specifying the
267            hypothesized tree's node value.
268        :type rhs: list(Nonterminal and str)
269        :param rhs: The new edge's right-hand side, specifying the
270            hypothesized tree's children.
271        :type dot: int
272        :param dot: The position of the new edge's dot.  This position
273            specifies what prefix of the production's right hand side
274            is consistent with the text.  In particular, if
275            ``sentence`` is the list of tokens in the sentence, then
276            ``okens[span[0]:span[1]]`` can be spanned by the
277            children specified by ``rhs[:dot]``.
278        """
279        self._span = span
280        self._lhs = lhs
281        rhs = tuple(rhs)
282        self._rhs = rhs
283        self._dot = dot
284        self._comparison_key = (span, lhs, rhs, dot)
285
286    @staticmethod
287    def from_production(production, index):
288        """
289        Return a new ``TreeEdge`` formed from the given production.
290        The new edge's left-hand side and right-hand side will
291        be taken from ``production``; its span will be
292        ``(index,index)``; and its dot position will be ``0``.
293
294        :rtype: TreeEdge
295        """
296        return TreeEdge(
297            span=(index, index), lhs=production.lhs(), rhs=production.rhs(), dot=0
298        )
299
300    def move_dot_forward(self, new_end):
301        """
302        Return a new ``TreeEdge`` formed from this edge.
303        The new edge's dot position is increased by ``1``,
304        and its end index will be replaced by ``new_end``.
305
306        :param new_end: The new end index.
307        :type new_end: int
308        :rtype: TreeEdge
309        """
310        return TreeEdge(
311            span=(self._span[0], new_end),
312            lhs=self._lhs,
313            rhs=self._rhs,
314            dot=self._dot + 1,
315        )
316
317    # Accessors
318    def lhs(self):
319        return self._lhs
320
321    def span(self):
322        return self._span
323
324    def start(self):
325        return self._span[0]
326
327    def end(self):
328        return self._span[1]
329
330    def length(self):
331        return self._span[1] - self._span[0]
332
333    def rhs(self):
334        return self._rhs
335
336    def dot(self):
337        return self._dot
338
339    def is_complete(self):
340        return self._dot == len(self._rhs)
341
342    def is_incomplete(self):
343        return self._dot != len(self._rhs)
344
345    def nextsym(self):
346        if self._dot >= len(self._rhs):
347            return None
348        else:
349            return self._rhs[self._dot]
350
351    # String representation
352    def __str__(self):
353        str = '[%s:%s] ' % (self._span[0], self._span[1])
354        str += '%-2r ->' % (self._lhs,)
355
356        for i in range(len(self._rhs)):
357            if i == self._dot:
358                str += ' *'
359            str += ' %s' % unicode_repr(self._rhs[i])
360        if len(self._rhs) == self._dot:
361            str += ' *'
362        return str
363
364    def __repr__(self):
365        return '[Edge: %s]' % self
366
367
368@python_2_unicode_compatible
369class LeafEdge(EdgeI):
370    """
371    An edge that records the fact that a leaf value is consistent with
372    a word in the sentence.  A leaf edge consists of:
373
374    - An index, indicating the position of the word.
375    - A leaf, specifying the word's content.
376
377    A leaf edge's left-hand side is its leaf value, and its right hand
378    side is ``()``.  Its span is ``[index, index+1]``, and its dot
379    position is ``0``.
380    """
381
382    def __init__(self, leaf, index):
383        """
384        Construct a new ``LeafEdge``.
385
386        :param leaf: The new edge's leaf value, specifying the word
387            that is recorded by this edge.
388        :param index: The new edge's index, specifying the position of
389            the word that is recorded by this edge.
390        """
391        self._leaf = leaf
392        self._index = index
393        self._comparison_key = (leaf, index)
394
395    # Accessors
396    def lhs(self):
397        return self._leaf
398
399    def span(self):
400        return (self._index, self._index + 1)
401
402    def start(self):
403        return self._index
404
405    def end(self):
406        return self._index + 1
407
408    def length(self):
409        return 1
410
411    def rhs(self):
412        return ()
413
414    def dot(self):
415        return 0
416
417    def is_complete(self):
418        return True
419
420    def is_incomplete(self):
421        return False
422
423    def nextsym(self):
424        return None
425
426    # String representations
427    def __str__(self):
428        return '[%s:%s] %s' % (self._index, self._index + 1, unicode_repr(self._leaf))
429
430    def __repr__(self):
431        return '[Edge: %s]' % (self)
432
433
434########################################################################
435##  Chart
436########################################################################
437
438
439class Chart(object):
440    """
441    A blackboard for hypotheses about the syntactic constituents of a
442    sentence.  A chart contains a set of edges, and each edge encodes
443    a single hypothesis about the structure of some portion of the
444    sentence.
445
446    The ``select`` method can be used to select a specific collection
447    of edges.  For example ``chart.select(is_complete=True, start=0)``
448    yields all complete edges whose start indices are 0.  To ensure
449    the efficiency of these selection operations, ``Chart`` dynamically
450    creates and maintains an index for each set of attributes that
451    have been selected on.
452
453    In order to reconstruct the trees that are represented by an edge,
454    the chart associates each edge with a set of child pointer lists.
455    A child pointer list is a list of the edges that license an
456    edge's right-hand side.
457
458    :ivar _tokens: The sentence that the chart covers.
459    :ivar _num_leaves: The number of tokens.
460    :ivar _edges: A list of the edges in the chart
461    :ivar _edge_to_cpls: A dictionary mapping each edge to a set
462        of child pointer lists that are associated with that edge.
463    :ivar _indexes: A dictionary mapping tuples of edge attributes
464        to indices, where each index maps the corresponding edge
465        attribute values to lists of edges.
466    """
467
468    def __init__(self, tokens):
469        """
470        Construct a new chart. The chart is initialized with the
471        leaf edges corresponding to the terminal leaves.
472
473        :type tokens: list
474        :param tokens: The sentence that this chart will be used to parse.
475        """
476        # Record the sentence token and the sentence length.
477        self._tokens = tuple(tokens)
478        self._num_leaves = len(self._tokens)
479
480        # Initialise the chart.
481        self.initialize()
482
483    def initialize(self):
484        """
485        Clear the chart.
486        """
487        # A list of edges contained in this chart.
488        self._edges = []
489
490        # The set of child pointer lists associated with each edge.
491        self._edge_to_cpls = {}
492
493        # Indexes mapping attribute values to lists of edges
494        # (used by select()).
495        self._indexes = {}
496
497    # ////////////////////////////////////////////////////////////
498    # Sentence Access
499    # ////////////////////////////////////////////////////////////
500
501    def num_leaves(self):
502        """
503        Return the number of words in this chart's sentence.
504
505        :rtype: int
506        """
507        return self._num_leaves
508
509    def leaf(self, index):
510        """
511        Return the leaf value of the word at the given index.
512
513        :rtype: str
514        """
515        return self._tokens[index]
516
517    def leaves(self):
518        """
519        Return a list of the leaf values of each word in the
520        chart's sentence.
521
522        :rtype: list(str)
523        """
524        return self._tokens
525
526    # ////////////////////////////////////////////////////////////
527    # Edge access
528    # ////////////////////////////////////////////////////////////
529
530    def edges(self):
531        """
532        Return a list of all edges in this chart.  New edges
533        that are added to the chart after the call to edges()
534        will *not* be contained in this list.
535
536        :rtype: list(EdgeI)
537        :see: ``iteredges``, ``select``
538        """
539        return self._edges[:]
540
541    def iteredges(self):
542        """
543        Return an iterator over the edges in this chart.  It is
544        not guaranteed that new edges which are added to the
545        chart before the iterator is exhausted will also be generated.
546
547        :rtype: iter(EdgeI)
548        :see: ``edges``, ``select``
549        """
550        return iter(self._edges)
551
552    # Iterating over the chart yields its edges.
553    __iter__ = iteredges
554
555    def num_edges(self):
556        """
557        Return the number of edges contained in this chart.
558
559        :rtype: int
560        """
561        return len(self._edge_to_cpls)
562
563    def select(self, **restrictions):
564        """
565        Return an iterator over the edges in this chart.  Any
566        new edges that are added to the chart before the iterator
567        is exahusted will also be generated.  ``restrictions``
568        can be used to restrict the set of edges that will be
569        generated.
570
571        :param span: Only generate edges ``e`` where ``e.span()==span``
572        :param start: Only generate edges ``e`` where ``e.start()==start``
573        :param end: Only generate edges ``e`` where ``e.end()==end``
574        :param length: Only generate edges ``e`` where ``e.length()==length``
575        :param lhs: Only generate edges ``e`` where ``e.lhs()==lhs``
576        :param rhs: Only generate edges ``e`` where ``e.rhs()==rhs``
577        :param nextsym: Only generate edges ``e`` where
578            ``e.nextsym()==nextsym``
579        :param dot: Only generate edges ``e`` where ``e.dot()==dot``
580        :param is_complete: Only generate edges ``e`` where
581            ``e.is_complete()==is_complete``
582        :param is_incomplete: Only generate edges ``e`` where
583            ``e.is_incomplete()==is_incomplete``
584        :rtype: iter(EdgeI)
585        """
586        # If there are no restrictions, then return all edges.
587        if restrictions == {}:
588            return iter(self._edges)
589
590        # Find the index corresponding to the given restrictions.
591        restr_keys = sorted(restrictions.keys())
592        restr_keys = tuple(restr_keys)
593
594        # If it doesn't exist, then create it.
595        if restr_keys not in self._indexes:
596            self._add_index(restr_keys)
597
598        vals = tuple(restrictions[key] for key in restr_keys)
599        return iter(self._indexes[restr_keys].get(vals, []))
600
601    def _add_index(self, restr_keys):
602        """
603        A helper function for ``select``, which creates a new index for
604        a given set of attributes (aka restriction keys).
605        """
606        # Make sure it's a valid index.
607        for key in restr_keys:
608            if not hasattr(EdgeI, key):
609                raise ValueError('Bad restriction: %s' % key)
610
611        # Create the index.
612        index = self._indexes[restr_keys] = {}
613
614        # Add all existing edges to the index.
615        for edge in self._edges:
616            vals = tuple(getattr(edge, key)() for key in restr_keys)
617            index.setdefault(vals, []).append(edge)
618
619    def _register_with_indexes(self, edge):
620        """
621        A helper function for ``insert``, which registers the new
622        edge with all existing indexes.
623        """
624        for (restr_keys, index) in self._indexes.items():
625            vals = tuple(getattr(edge, key)() for key in restr_keys)
626            index.setdefault(vals, []).append(edge)
627
628    # ////////////////////////////////////////////////////////////
629    # Edge Insertion
630    # ////////////////////////////////////////////////////////////
631
632    def insert_with_backpointer(self, new_edge, previous_edge, child_edge):
633        """
634        Add a new edge to the chart, using a pointer to the previous edge.
635        """
636        cpls = self.child_pointer_lists(previous_edge)
637        new_cpls = [cpl + (child_edge,) for cpl in cpls]
638        return self.insert(new_edge, *new_cpls)
639
640    def insert(self, edge, *child_pointer_lists):
641        """
642        Add a new edge to the chart, and return True if this operation
643        modified the chart.  In particular, return true iff the chart
644        did not already contain ``edge``, or if it did not already associate
645        ``child_pointer_lists`` with ``edge``.
646
647        :type edge: EdgeI
648        :param edge: The new edge
649        :type child_pointer_lists: sequence of tuple(EdgeI)
650        :param child_pointer_lists: A sequence of lists of the edges that
651            were used to form this edge.  This list is used to reconstruct
652            the trees (or partial trees) that are associated with ``edge``.
653        :rtype: bool
654        """
655        # Is it a new edge?
656        if edge not in self._edge_to_cpls:
657            # Add it to the list of edges.
658            self._append_edge(edge)
659            # Register with indexes.
660            self._register_with_indexes(edge)
661
662        # Get the set of child pointer lists for this edge.
663        cpls = self._edge_to_cpls.setdefault(edge, OrderedDict())
664        chart_was_modified = False
665        for child_pointer_list in child_pointer_lists:
666            child_pointer_list = tuple(child_pointer_list)
667            if child_pointer_list not in cpls:
668                # It's a new CPL; register it, and return true.
669                cpls[child_pointer_list] = True
670                chart_was_modified = True
671        return chart_was_modified
672
673    def _append_edge(self, edge):
674        self._edges.append(edge)
675
676    # ////////////////////////////////////////////////////////////
677    # Tree extraction & child pointer lists
678    # ////////////////////////////////////////////////////////////
679
680    def parses(self, root, tree_class=Tree):
681        """
682        Return an iterator of the complete tree structures that span
683        the entire chart, and whose root node is ``root``.
684        """
685        for edge in self.select(start=0, end=self._num_leaves, lhs=root):
686            for tree in self.trees(edge, tree_class=tree_class, complete=True):
687                yield tree
688
689    def trees(self, edge, tree_class=Tree, complete=False):
690        """
691        Return an iterator of the tree structures that are associated
692        with ``edge``.
693
694        If ``edge`` is incomplete, then the unexpanded children will be
695        encoded as childless subtrees, whose node value is the
696        corresponding terminal or nonterminal.
697
698        :rtype: list(Tree)
699        :note: If two trees share a common subtree, then the same
700            Tree may be used to encode that subtree in
701            both trees.  If you need to eliminate this subtree
702            sharing, then create a deep copy of each tree.
703        """
704        return iter(self._trees(edge, complete, memo={}, tree_class=tree_class))
705
706    def _trees(self, edge, complete, memo, tree_class):
707        """
708        A helper function for ``trees``.
709
710        :param memo: A dictionary used to record the trees that we've
711            generated for each edge, so that when we see an edge more
712            than once, we can reuse the same trees.
713        """
714        # If we've seen this edge before, then reuse our old answer.
715        if edge in memo:
716            return memo[edge]
717
718        # when we're reading trees off the chart, don't use incomplete edges
719        if complete and edge.is_incomplete():
720            return []
721
722        # Leaf edges.
723        if isinstance(edge, LeafEdge):
724            leaf = self._tokens[edge.start()]
725            memo[edge] = [leaf]
726            return [leaf]
727
728        # Until we're done computing the trees for edge, set
729        # memo[edge] to be empty.  This has the effect of filtering
730        # out any cyclic trees (i.e., trees that contain themselves as
731        # descendants), because if we reach this edge via a cycle,
732        # then it will appear that the edge doesn't generate any trees.
733        memo[edge] = []
734        trees = []
735        lhs = edge.lhs().symbol()
736
737        # Each child pointer list can be used to form trees.
738        for cpl in self.child_pointer_lists(edge):
739            # Get the set of child choices for each child pointer.
740            # child_choices[i] is the set of choices for the tree's
741            # ith child.
742            child_choices = [self._trees(cp, complete, memo, tree_class) for cp in cpl]
743
744            # For each combination of children, add a tree.
745            for children in itertools.product(*child_choices):
746                trees.append(tree_class(lhs, children))
747
748        # If the edge is incomplete, then extend it with "partial trees":
749        if edge.is_incomplete():
750            unexpanded = [tree_class(elt, []) for elt in edge.rhs()[edge.dot() :]]
751            for tree in trees:
752                tree.extend(unexpanded)
753
754        # Update the memoization dictionary.
755        memo[edge] = trees
756
757        # Return the list of trees.
758        return trees
759
760    def child_pointer_lists(self, edge):
761        """
762        Return the set of child pointer lists for the given edge.
763        Each child pointer list is a list of edges that have
764        been used to form this edge.
765
766        :rtype: list(list(EdgeI))
767        """
768        # Make a copy, in case they modify it.
769        return self._edge_to_cpls.get(edge, {}).keys()
770
771    # ////////////////////////////////////////////////////////////
772    # Display
773    # ////////////////////////////////////////////////////////////
774    def pretty_format_edge(self, edge, width=None):
775        """
776        Return a pretty-printed string representation of a given edge
777        in this chart.
778
779        :rtype: str
780        :param width: The number of characters allotted to each
781            index in the sentence.
782        """
783        if width is None:
784            width = 50 // (self.num_leaves() + 1)
785        (start, end) = (edge.start(), edge.end())
786
787        str = '|' + ('.' + ' ' * (width - 1)) * start
788
789        # Zero-width edges are "#" if complete, ">" if incomplete
790        if start == end:
791            if edge.is_complete():
792                str += '#'
793            else:
794                str += '>'
795
796        # Spanning complete edges are "[===]"; Other edges are
797        # "[---]" if complete, "[--->" if incomplete
798        elif edge.is_complete() and edge.span() == (0, self._num_leaves):
799            str += '[' + ('=' * width) * (end - start - 1) + '=' * (width - 1) + ']'
800        elif edge.is_complete():
801            str += '[' + ('-' * width) * (end - start - 1) + '-' * (width - 1) + ']'
802        else:
803            str += '[' + ('-' * width) * (end - start - 1) + '-' * (width - 1) + '>'
804
805        str += (' ' * (width - 1) + '.') * (self._num_leaves - end)
806        return str + '| %s' % edge
807
808    def pretty_format_leaves(self, width=None):
809        """
810        Return a pretty-printed string representation of this
811        chart's leaves.  This string can be used as a header
812        for calls to ``pretty_format_edge``.
813        """
814        if width is None:
815            width = 50 // (self.num_leaves() + 1)
816
817        if self._tokens is not None and width > 1:
818            header = '|.'
819            for tok in self._tokens:
820                header += tok[: width - 1].center(width - 1) + '.'
821            header += '|'
822        else:
823            header = ''
824
825        return header
826
827    def pretty_format(self, width=None):
828        """
829        Return a pretty-printed string representation of this chart.
830
831        :param width: The number of characters allotted to each
832            index in the sentence.
833        :rtype: str
834        """
835        if width is None:
836            width = 50 // (self.num_leaves() + 1)
837        # sort edges: primary key=length, secondary key=start index.
838        # (and filter out the token edges)
839        edges = sorted([(e.length(), e.start(), e) for e in self])
840        edges = [e for (_, _, e) in edges]
841
842        return (
843            self.pretty_format_leaves(width)
844            + '\n'
845            + '\n'.join(self.pretty_format_edge(edge, width) for edge in edges)
846        )
847
848    # ////////////////////////////////////////////////////////////
849    # Display: Dot (AT&T Graphviz)
850    # ////////////////////////////////////////////////////////////
851
852    def dot_digraph(self):
853        # Header
854        s = 'digraph nltk_chart {\n'
855        # s += '  size="5,5";\n'
856        s += '  rankdir=LR;\n'
857        s += '  node [height=0.1,width=0.1];\n'
858        s += '  node [style=filled, color="lightgray"];\n'
859
860        # Set up the nodes
861        for y in range(self.num_edges(), -1, -1):
862            if y == 0:
863                s += '  node [style=filled, color="black"];\n'
864            for x in range(self.num_leaves() + 1):
865                if y == 0 or (
866                    x <= self._edges[y - 1].start() or x >= self._edges[y - 1].end()
867                ):
868                    s += '  %04d.%04d [label=""];\n' % (x, y)
869
870        # Add a spacer
871        s += '  x [style=invis]; x->0000.0000 [style=invis];\n'
872
873        # Declare ranks.
874        for x in range(self.num_leaves() + 1):
875            s += '  {rank=same;'
876            for y in range(self.num_edges() + 1):
877                if y == 0 or (
878                    x <= self._edges[y - 1].start() or x >= self._edges[y - 1].end()
879                ):
880                    s += ' %04d.%04d' % (x, y)
881            s += '}\n'
882
883        # Add the leaves
884        s += '  edge [style=invis, weight=100];\n'
885        s += '  node [shape=plaintext]\n'
886        s += '  0000.0000'
887        for x in range(self.num_leaves()):
888            s += '->%s->%04d.0000' % (self.leaf(x), x + 1)
889        s += ';\n\n'
890
891        # Add the edges
892        s += '  edge [style=solid, weight=1];\n'
893        for y, edge in enumerate(self):
894            for x in range(edge.start()):
895                s += '  %04d.%04d -> %04d.%04d [style="invis"];\n' % (
896                    x,
897                    y + 1,
898                    x + 1,
899                    y + 1,
900                )
901            s += '  %04d.%04d -> %04d.%04d [label="%s"];\n' % (
902                edge.start(),
903                y + 1,
904                edge.end(),
905                y + 1,
906                edge,
907            )
908            for x in range(edge.end(), self.num_leaves()):
909                s += '  %04d.%04d -> %04d.%04d [style="invis"];\n' % (
910                    x,
911                    y + 1,
912                    x + 1,
913                    y + 1,
914                )
915        s += '}\n'
916        return s
917
918
919########################################################################
920##  Chart Rules
921########################################################################
922
923
924class ChartRuleI(object):
925    """
926    A rule that specifies what new edges are licensed by any given set
927    of existing edges.  Each chart rule expects a fixed number of
928    edges, as indicated by the class variable ``NUM_EDGES``.  In
929    particular:
930
931    - A chart rule with ``NUM_EDGES=0`` specifies what new edges are
932      licensed, regardless of existing edges.
933    - A chart rule with ``NUM_EDGES=1`` specifies what new edges are
934      licensed by a single existing edge.
935    - A chart rule with ``NUM_EDGES=2`` specifies what new edges are
936      licensed by a pair of existing edges.
937
938    :type NUM_EDGES: int
939    :cvar NUM_EDGES: The number of existing edges that this rule uses
940        to license new edges.  Typically, this number ranges from zero
941        to two.
942    """
943
944    def apply(self, chart, grammar, *edges):
945        """
946        Return a generator that will add edges licensed by this rule
947        and the given edges to the chart, one at a time.  Each
948        time the generator is resumed, it will either add a new
949        edge and yield that edge; or return.
950
951        :type edges: list(EdgeI)
952        :param edges: A set of existing edges.  The number of edges
953            that should be passed to ``apply()`` is specified by the
954            ``NUM_EDGES`` class variable.
955        :rtype: iter(EdgeI)
956        """
957        raise NotImplementedError()
958
959    def apply_everywhere(self, chart, grammar):
960        """
961        Return a generator that will add all edges licensed by
962        this rule, given the edges that are currently in the
963        chart, one at a time.  Each time the generator is resumed,
964        it will either add a new edge and yield that edge; or return.
965
966        :rtype: iter(EdgeI)
967        """
968        raise NotImplementedError()
969
970
971@python_2_unicode_compatible
972class AbstractChartRule(ChartRuleI):
973    """
974    An abstract base class for chart rules.  ``AbstractChartRule``
975    provides:
976
977    - A default implementation for ``apply``.
978    - A default implementation for ``apply_everywhere``,
979      (Currently, this implementation assumes that ``NUM_EDGES``<=3.)
980    - A default implementation for ``__str__``, which returns a
981      name based on the rule's class name.
982    """
983
984    # Subclasses must define apply.
985    def apply(self, chart, grammar, *edges):
986        raise NotImplementedError()
987
988    # Default: loop through the given number of edges, and call
989    # self.apply() for each set of edges.
990    def apply_everywhere(self, chart, grammar):
991        if self.NUM_EDGES == 0:
992            for new_edge in self.apply(chart, grammar):
993                yield new_edge
994
995        elif self.NUM_EDGES == 1:
996            for e1 in chart:
997                for new_edge in self.apply(chart, grammar, e1):
998                    yield new_edge
999
1000        elif self.NUM_EDGES == 2:
1001            for e1 in chart:
1002                for e2 in chart:
1003                    for new_edge in self.apply(chart, grammar, e1, e2):
1004                        yield new_edge
1005
1006        elif self.NUM_EDGES == 3:
1007            for e1 in chart:
1008                for e2 in chart:
1009                    for e3 in chart:
1010                        for new_edge in self.apply(chart, grammar, e1, e2, e3):
1011                            yield new_edge
1012
1013        else:
1014            raise AssertionError('NUM_EDGES>3 is not currently supported')
1015
1016    # Default: return a name based on the class name.
1017    def __str__(self):
1018        # Add spaces between InitialCapsWords.
1019        return re.sub('([a-z])([A-Z])', r'\1 \2', self.__class__.__name__)
1020
1021
1022# ////////////////////////////////////////////////////////////
1023# Fundamental Rule
1024# ////////////////////////////////////////////////////////////
1025
1026
1027class FundamentalRule(AbstractChartRule):
1028    """
1029    A rule that joins two adjacent edges to form a single combined
1030    edge.  In particular, this rule specifies that any pair of edges
1031
1032    - ``[A -> alpha \* B beta][i:j]``
1033    - ``[B -> gamma \*][j:k]``
1034
1035    licenses the edge:
1036
1037    - ``[A -> alpha B * beta][i:j]``
1038    """
1039
1040    NUM_EDGES = 2
1041
1042    def apply(self, chart, grammar, left_edge, right_edge):
1043        # Make sure the rule is applicable.
1044        if not (
1045            left_edge.is_incomplete()
1046            and right_edge.is_complete()
1047            and left_edge.end() == right_edge.start()
1048            and left_edge.nextsym() == right_edge.lhs()
1049        ):
1050            return
1051
1052        # Construct the new edge.
1053        new_edge = left_edge.move_dot_forward(right_edge.end())
1054
1055        # Insert it into the chart.
1056        if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
1057            yield new_edge
1058
1059
1060class SingleEdgeFundamentalRule(FundamentalRule):
1061    """
1062    A rule that joins a given edge with adjacent edges in the chart,
1063    to form combined edges.  In particular, this rule specifies that
1064    either of the edges:
1065
1066    - ``[A -> alpha \* B beta][i:j]``
1067    - ``[B -> gamma \*][j:k]``
1068
1069    licenses the edge:
1070
1071    - ``[A -> alpha B * beta][i:j]``
1072
1073    if the other edge is already in the chart.
1074
1075    :note: This is basically ``FundamentalRule``, with one edge left
1076        unspecified.
1077    """
1078
1079    NUM_EDGES = 1
1080
1081    def apply(self, chart, grammar, edge):
1082        if edge.is_incomplete():
1083            for new_edge in self._apply_incomplete(chart, grammar, edge):
1084                yield new_edge
1085        else:
1086            for new_edge in self._apply_complete(chart, grammar, edge):
1087                yield new_edge
1088
1089    def _apply_complete(self, chart, grammar, right_edge):
1090        for left_edge in chart.select(
1091            end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs()
1092        ):
1093            new_edge = left_edge.move_dot_forward(right_edge.end())
1094            if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
1095                yield new_edge
1096
1097    def _apply_incomplete(self, chart, grammar, left_edge):
1098        for right_edge in chart.select(
1099            start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym()
1100        ):
1101            new_edge = left_edge.move_dot_forward(right_edge.end())
1102            if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
1103                yield new_edge
1104
1105
1106# ////////////////////////////////////////////////////////////
1107# Inserting Terminal Leafs
1108# ////////////////////////////////////////////////////////////
1109
1110
1111class LeafInitRule(AbstractChartRule):
1112    NUM_EDGES = 0
1113
1114    def apply(self, chart, grammar):
1115        for index in range(chart.num_leaves()):
1116            new_edge = LeafEdge(chart.leaf(index), index)
1117            if chart.insert(new_edge, ()):
1118                yield new_edge
1119
1120
1121# ////////////////////////////////////////////////////////////
1122# Top-Down Prediction
1123# ////////////////////////////////////////////////////////////
1124
1125
1126class TopDownInitRule(AbstractChartRule):
1127    """
1128    A rule licensing edges corresponding to the grammar productions for
1129    the grammar's start symbol.  In particular, this rule specifies that
1130    ``[S -> \* alpha][0:i]`` is licensed for each grammar production
1131    ``S -> alpha``, where ``S`` is the grammar's start symbol.
1132    """
1133
1134    NUM_EDGES = 0
1135
1136    def apply(self, chart, grammar):
1137        for prod in grammar.productions(lhs=grammar.start()):
1138            new_edge = TreeEdge.from_production(prod, 0)
1139            if chart.insert(new_edge, ()):
1140                yield new_edge
1141
1142
1143class TopDownPredictRule(AbstractChartRule):
1144    """
1145    A rule licensing edges corresponding to the grammar productions
1146    for the nonterminal following an incomplete edge's dot.  In
1147    particular, this rule specifies that
1148    ``[A -> alpha \* B beta][i:j]`` licenses the edge
1149    ``[B -> \* gamma][j:j]`` for each grammar production ``B -> gamma``.
1150
1151    :note: This rule corresponds to the Predictor Rule in Earley parsing.
1152    """
1153
1154    NUM_EDGES = 1
1155
1156    def apply(self, chart, grammar, edge):
1157        if edge.is_complete():
1158            return
1159        for prod in grammar.productions(lhs=edge.nextsym()):
1160            new_edge = TreeEdge.from_production(prod, edge.end())
1161            if chart.insert(new_edge, ()):
1162                yield new_edge
1163
1164
1165class CachedTopDownPredictRule(TopDownPredictRule):
1166    """
1167    A cached version of ``TopDownPredictRule``.  After the first time
1168    this rule is applied to an edge with a given ``end`` and ``next``,
1169    it will not generate any more edges for edges with that ``end`` and
1170    ``next``.
1171
1172    If ``chart`` or ``grammar`` are changed, then the cache is flushed.
1173    """
1174
1175    def __init__(self):
1176        TopDownPredictRule.__init__(self)
1177        self._done = {}
1178
1179    def apply(self, chart, grammar, edge):
1180        if edge.is_complete():
1181            return
1182        nextsym, index = edge.nextsym(), edge.end()
1183        if not is_nonterminal(nextsym):
1184            return
1185
1186        # If we've already applied this rule to an edge with the same
1187        # next & end, and the chart & grammar have not changed, then
1188        # just return (no new edges to add).
1189        done = self._done.get((nextsym, index), (None, None))
1190        if done[0] is chart and done[1] is grammar:
1191            return
1192
1193        # Add all the edges indicated by the top down expand rule.
1194        for prod in grammar.productions(lhs=nextsym):
1195            # If the left corner in the predicted production is
1196            # leaf, it must match with the input.
1197            if prod.rhs():
1198                first = prod.rhs()[0]
1199                if is_terminal(first):
1200                    if index >= chart.num_leaves() or first != chart.leaf(index):
1201                        continue
1202
1203            new_edge = TreeEdge.from_production(prod, index)
1204            if chart.insert(new_edge, ()):
1205                yield new_edge
1206
1207        # Record the fact that we've applied this rule.
1208        self._done[nextsym, index] = (chart, grammar)
1209
1210
1211# ////////////////////////////////////////////////////////////
1212# Bottom-Up Prediction
1213# ////////////////////////////////////////////////////////////
1214
1215
1216class BottomUpPredictRule(AbstractChartRule):
1217    """
1218    A rule licensing any edge corresponding to a production whose
1219    right-hand side begins with a complete edge's left-hand side.  In
1220    particular, this rule specifies that ``[A -> alpha \*]`` licenses
1221    the edge ``[B -> \* A beta]`` for each grammar production ``B -> A beta``.
1222    """
1223
1224    NUM_EDGES = 1
1225
1226    def apply(self, chart, grammar, edge):
1227        if edge.is_incomplete():
1228            return
1229        for prod in grammar.productions(rhs=edge.lhs()):
1230            new_edge = TreeEdge.from_production(prod, edge.start())
1231            if chart.insert(new_edge, ()):
1232                yield new_edge
1233
1234
1235class BottomUpPredictCombineRule(BottomUpPredictRule):
1236    """
1237    A rule licensing any edge corresponding to a production whose
1238    right-hand side begins with a complete edge's left-hand side.  In
1239    particular, this rule specifies that ``[A -> alpha \*]``
1240    licenses the edge ``[B -> A \* beta]`` for each grammar
1241    production ``B -> A beta``.
1242
1243    :note: This is like ``BottomUpPredictRule``, but it also applies
1244        the ``FundamentalRule`` to the resulting edge.
1245    """
1246
1247    NUM_EDGES = 1
1248
1249    def apply(self, chart, grammar, edge):
1250        if edge.is_incomplete():
1251            return
1252        for prod in grammar.productions(rhs=edge.lhs()):
1253            new_edge = TreeEdge(edge.span(), prod.lhs(), prod.rhs(), 1)
1254            if chart.insert(new_edge, (edge,)):
1255                yield new_edge
1256
1257
1258class EmptyPredictRule(AbstractChartRule):
1259    """
1260    A rule that inserts all empty productions as passive edges,
1261    in every position in the chart.
1262    """
1263
1264    NUM_EDGES = 0
1265
1266    def apply(self, chart, grammar):
1267        for prod in grammar.productions(empty=True):
1268            for index in range(chart.num_leaves() + 1):
1269                new_edge = TreeEdge.from_production(prod, index)
1270                if chart.insert(new_edge, ()):
1271                    yield new_edge
1272
1273
1274########################################################################
1275##  Filtered Bottom Up
1276########################################################################
1277
1278
1279class FilteredSingleEdgeFundamentalRule(SingleEdgeFundamentalRule):
1280    def _apply_complete(self, chart, grammar, right_edge):
1281        end = right_edge.end()
1282        nexttoken = end < chart.num_leaves() and chart.leaf(end)
1283        for left_edge in chart.select(
1284            end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs()
1285        ):
1286            if _bottomup_filter(grammar, nexttoken, left_edge.rhs(), left_edge.dot()):
1287                new_edge = left_edge.move_dot_forward(right_edge.end())
1288                if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
1289                    yield new_edge
1290
1291    def _apply_incomplete(self, chart, grammar, left_edge):
1292        for right_edge in chart.select(
1293            start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym()
1294        ):
1295            end = right_edge.end()
1296            nexttoken = end < chart.num_leaves() and chart.leaf(end)
1297            if _bottomup_filter(grammar, nexttoken, left_edge.rhs(), left_edge.dot()):
1298                new_edge = left_edge.move_dot_forward(right_edge.end())
1299                if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
1300                    yield new_edge
1301
1302
1303class FilteredBottomUpPredictCombineRule(BottomUpPredictCombineRule):
1304    def apply(self, chart, grammar, edge):
1305        if edge.is_incomplete():
1306            return
1307
1308        end = edge.end()
1309        nexttoken = end < chart.num_leaves() and chart.leaf(end)
1310        for prod in grammar.productions(rhs=edge.lhs()):
1311            if _bottomup_filter(grammar, nexttoken, prod.rhs()):
1312                new_edge = TreeEdge(edge.span(), prod.lhs(), prod.rhs(), 1)
1313                if chart.insert(new_edge, (edge,)):
1314                    yield new_edge
1315
1316
1317def _bottomup_filter(grammar, nexttoken, rhs, dot=0):
1318    if len(rhs) <= dot + 1:
1319        return True
1320    _next = rhs[dot + 1]
1321    if is_terminal(_next):
1322        return nexttoken == _next
1323    else:
1324        return grammar.is_leftcorner(_next, nexttoken)
1325
1326
1327########################################################################
1328##  Generic Chart Parser
1329########################################################################
1330
1331TD_STRATEGY = [
1332    LeafInitRule(),
1333    TopDownInitRule(),
1334    CachedTopDownPredictRule(),
1335    SingleEdgeFundamentalRule(),
1336]
1337BU_STRATEGY = [
1338    LeafInitRule(),
1339    EmptyPredictRule(),
1340    BottomUpPredictRule(),
1341    SingleEdgeFundamentalRule(),
1342]
1343BU_LC_STRATEGY = [
1344    LeafInitRule(),
1345    EmptyPredictRule(),
1346    BottomUpPredictCombineRule(),
1347    SingleEdgeFundamentalRule(),
1348]
1349
1350LC_STRATEGY = [
1351    LeafInitRule(),
1352    FilteredBottomUpPredictCombineRule(),
1353    FilteredSingleEdgeFundamentalRule(),
1354]
1355
1356
1357class ChartParser(ParserI):
1358    """
1359    A generic chart parser.  A "strategy", or list of
1360    ``ChartRuleI`` instances, is used to decide what edges to add to
1361    the chart.  In particular, ``ChartParser`` uses the following
1362    algorithm to parse texts:
1363
1364    | Until no new edges are added:
1365    |   For each *rule* in *strategy*:
1366    |     Apply *rule* to any applicable edges in the chart.
1367    | Return any complete parses in the chart
1368    """
1369
1370    def __init__(
1371        self,
1372        grammar,
1373        strategy=BU_LC_STRATEGY,
1374        trace=0,
1375        trace_chart_width=50,
1376        use_agenda=True,
1377        chart_class=Chart,
1378    ):
1379        """
1380        Create a new chart parser, that uses ``grammar`` to parse
1381        texts.
1382
1383        :type grammar: CFG
1384        :param grammar: The grammar used to parse texts.
1385        :type strategy: list(ChartRuleI)
1386        :param strategy: A list of rules that should be used to decide
1387            what edges to add to the chart (top-down strategy by default).
1388        :type trace: int
1389        :param trace: The level of tracing that should be used when
1390            parsing a text.  ``0`` will generate no tracing output;
1391            and higher numbers will produce more verbose tracing
1392            output.
1393        :type trace_chart_width: int
1394        :param trace_chart_width: The default total width reserved for
1395            the chart in trace output.  The remainder of each line will
1396            be used to display edges.
1397        :type use_agenda: bool
1398        :param use_agenda: Use an optimized agenda-based algorithm,
1399            if possible.
1400        :param chart_class: The class that should be used to create
1401            the parse charts.
1402        """
1403        self._grammar = grammar
1404        self._strategy = strategy
1405        self._trace = trace
1406        self._trace_chart_width = trace_chart_width
1407        # If the strategy only consists of axioms (NUM_EDGES==0) and
1408        # inference rules (NUM_EDGES==1), we can use an agenda-based algorithm:
1409        self._use_agenda = use_agenda
1410        self._chart_class = chart_class
1411
1412        self._axioms = []
1413        self._inference_rules = []
1414        for rule in strategy:
1415            if rule.NUM_EDGES == 0:
1416                self._axioms.append(rule)
1417            elif rule.NUM_EDGES == 1:
1418                self._inference_rules.append(rule)
1419            else:
1420                self._use_agenda = False
1421
1422    def grammar(self):
1423        return self._grammar
1424
1425    def _trace_new_edges(self, chart, rule, new_edges, trace, edge_width):
1426        if not trace:
1427            return
1428        print_rule_header = trace > 1
1429        for edge in new_edges:
1430            if print_rule_header:
1431                print('%s:' % rule)
1432                print_rule_header = False
1433            print(chart.pretty_format_edge(edge, edge_width))
1434
1435    def chart_parse(self, tokens, trace=None):
1436        """
1437        Return the final parse ``Chart`` from which all possible
1438        parse trees can be extracted.
1439
1440        :param tokens: The sentence to be parsed
1441        :type tokens: list(str)
1442        :rtype: Chart
1443        """
1444        if trace is None:
1445            trace = self._trace
1446        trace_new_edges = self._trace_new_edges
1447
1448        tokens = list(tokens)
1449        self._grammar.check_coverage(tokens)
1450        chart = self._chart_class(tokens)
1451        grammar = self._grammar
1452
1453        # Width, for printing trace edges.
1454        trace_edge_width = self._trace_chart_width // (chart.num_leaves() + 1)
1455        if trace:
1456            print(chart.pretty_format_leaves(trace_edge_width))
1457
1458        if self._use_agenda:
1459            # Use an agenda-based algorithm.
1460            for axiom in self._axioms:
1461                new_edges = list(axiom.apply(chart, grammar))
1462                trace_new_edges(chart, axiom, new_edges, trace, trace_edge_width)
1463
1464            inference_rules = self._inference_rules
1465            agenda = chart.edges()
1466            # We reverse the initial agenda, since it is a stack
1467            # but chart.edges() functions as a queue.
1468            agenda.reverse()
1469            while agenda:
1470                edge = agenda.pop()
1471                for rule in inference_rules:
1472                    new_edges = list(rule.apply(chart, grammar, edge))
1473                    if trace:
1474                        trace_new_edges(chart, rule, new_edges, trace, trace_edge_width)
1475                    agenda += new_edges
1476
1477        else:
1478            # Do not use an agenda-based algorithm.
1479            edges_added = True
1480            while edges_added:
1481                edges_added = False
1482                for rule in self._strategy:
1483                    new_edges = list(rule.apply_everywhere(chart, grammar))
1484                    edges_added = len(new_edges)
1485                    trace_new_edges(chart, rule, new_edges, trace, trace_edge_width)
1486
1487        # Return the final chart.
1488        return chart
1489
1490    def parse(self, tokens, tree_class=Tree):
1491        chart = self.chart_parse(tokens)
1492        return iter(chart.parses(self._grammar.start(), tree_class=tree_class))
1493
1494
1495class TopDownChartParser(ChartParser):
1496    """
1497    A ``ChartParser`` using a top-down parsing strategy.
1498    See ``ChartParser`` for more information.
1499    """
1500
1501    def __init__(self, grammar, **parser_args):
1502        ChartParser.__init__(self, grammar, TD_STRATEGY, **parser_args)
1503
1504
1505class BottomUpChartParser(ChartParser):
1506    """
1507    A ``ChartParser`` using a bottom-up parsing strategy.
1508    See ``ChartParser`` for more information.
1509    """
1510
1511    def __init__(self, grammar, **parser_args):
1512        if isinstance(grammar, PCFG):
1513            warnings.warn(
1514                "BottomUpChartParser only works for CFG, "
1515                "use BottomUpProbabilisticChartParser instead",
1516                category=DeprecationWarning,
1517            )
1518        ChartParser.__init__(self, grammar, BU_STRATEGY, **parser_args)
1519
1520
1521class BottomUpLeftCornerChartParser(ChartParser):
1522    """
1523    A ``ChartParser`` using a bottom-up left-corner parsing strategy.
1524    This strategy is often more efficient than standard bottom-up.
1525    See ``ChartParser`` for more information.
1526    """
1527
1528    def __init__(self, grammar, **parser_args):
1529        ChartParser.__init__(self, grammar, BU_LC_STRATEGY, **parser_args)
1530
1531
1532class LeftCornerChartParser(ChartParser):
1533    def __init__(self, grammar, **parser_args):
1534        if not grammar.is_nonempty():
1535            raise ValueError(
1536                "LeftCornerParser only works for grammars " "without empty productions."
1537            )
1538        ChartParser.__init__(self, grammar, LC_STRATEGY, **parser_args)
1539
1540
1541########################################################################
1542##  Stepping Chart Parser
1543########################################################################
1544
1545
1546class SteppingChartParser(ChartParser):
1547    """
1548    A ``ChartParser`` that allows you to step through the parsing
1549    process, adding a single edge at a time.  It also allows you to
1550    change the parser's strategy or grammar midway through parsing a
1551    text.
1552
1553    The ``initialize`` method is used to start parsing a text.  ``step``
1554    adds a single edge to the chart.  ``set_strategy`` changes the
1555    strategy used by the chart parser.  ``parses`` returns the set of
1556    parses that has been found by the chart parser.
1557
1558    :ivar _restart: Records whether the parser's strategy, grammar,
1559        or chart has been changed.  If so, then ``step`` must restart
1560        the parsing algorithm.
1561    """
1562
1563    def __init__(self, grammar, strategy=[], trace=0):
1564        self._chart = None
1565        self._current_chartrule = None
1566        self._restart = False
1567        ChartParser.__init__(self, grammar, strategy, trace)
1568
1569    # ////////////////////////////////////////////////////////////
1570    # Initialization
1571    # ////////////////////////////////////////////////////////////
1572
1573    def initialize(self, tokens):
1574        "Begin parsing the given tokens."
1575        self._chart = Chart(list(tokens))
1576        self._restart = True
1577
1578    # ////////////////////////////////////////////////////////////
1579    # Stepping
1580    # ////////////////////////////////////////////////////////////
1581
1582    def step(self):
1583        """
1584        Return a generator that adds edges to the chart, one at a
1585        time.  Each time the generator is resumed, it adds a single
1586        edge and yields that edge.  If no more edges can be added,
1587        then it yields None.
1588
1589        If the parser's strategy, grammar, or chart is changed, then
1590        the generator will continue adding edges using the new
1591        strategy, grammar, or chart.
1592
1593        Note that this generator never terminates, since the grammar
1594        or strategy might be changed to values that would add new
1595        edges.  Instead, it yields None when no more edges can be
1596        added with the current strategy and grammar.
1597        """
1598        if self._chart is None:
1599            raise ValueError('Parser must be initialized first')
1600        while True:
1601            self._restart = False
1602            w = 50 // (self._chart.num_leaves() + 1)
1603
1604            for e in self._parse():
1605                if self._trace > 1:
1606                    print(self._current_chartrule)
1607                if self._trace > 0:
1608                    print(self._chart.pretty_format_edge(e, w))
1609                yield e
1610                if self._restart:
1611                    break
1612            else:
1613                yield None  # No more edges.
1614
1615    def _parse(self):
1616        """
1617        A generator that implements the actual parsing algorithm.
1618        ``step`` iterates through this generator, and restarts it
1619        whenever the parser's strategy, grammar, or chart is modified.
1620        """
1621        chart = self._chart
1622        grammar = self._grammar
1623        edges_added = 1
1624        while edges_added > 0:
1625            edges_added = 0
1626            for rule in self._strategy:
1627                self._current_chartrule = rule
1628                for e in rule.apply_everywhere(chart, grammar):
1629                    edges_added += 1
1630                    yield e
1631
1632    # ////////////////////////////////////////////////////////////
1633    # Accessors
1634    # ////////////////////////////////////////////////////////////
1635
1636    def strategy(self):
1637        "Return the strategy used by this parser."
1638        return self._strategy
1639
1640    def grammar(self):
1641        "Return the grammar used by this parser."
1642        return self._grammar
1643
1644    def chart(self):
1645        "Return the chart that is used by this parser."
1646        return self._chart
1647
1648    def current_chartrule(self):
1649        "Return the chart rule used to generate the most recent edge."
1650        return self._current_chartrule
1651
1652    def parses(self, tree_class=Tree):
1653        "Return the parse trees currently contained in the chart."
1654        return self._chart.parses(self._grammar.start(), tree_class)
1655
1656    # ////////////////////////////////////////////////////////////
1657    # Parser modification
1658    # ////////////////////////////////////////////////////////////
1659
1660    def set_strategy(self, strategy):
1661        """
1662        Change the strategy that the parser uses to decide which edges
1663        to add to the chart.
1664
1665        :type strategy: list(ChartRuleI)
1666        :param strategy: A list of rules that should be used to decide
1667            what edges to add to the chart.
1668        """
1669        if strategy == self._strategy:
1670            return
1671        self._strategy = strategy[:]  # Make a copy.
1672        self._restart = True
1673
1674    def set_grammar(self, grammar):
1675        "Change the grammar used by the parser."
1676        if grammar is self._grammar:
1677            return
1678        self._grammar = grammar
1679        self._restart = True
1680
1681    def set_chart(self, chart):
1682        "Load a given chart into the chart parser."
1683        if chart is self._chart:
1684            return
1685        self._chart = chart
1686        self._restart = True
1687
1688    # ////////////////////////////////////////////////////////////
1689    # Standard parser methods
1690    # ////////////////////////////////////////////////////////////
1691
1692    def parse(self, tokens, tree_class=Tree):
1693        tokens = list(tokens)
1694        self._grammar.check_coverage(tokens)
1695
1696        # Initialize ourselves.
1697        self.initialize(tokens)
1698
1699        # Step until no more edges are generated.
1700        for e in self.step():
1701            if e is None:
1702                break
1703
1704        # Return an iterator of complete parses.
1705        return self.parses(tree_class=tree_class)
1706
1707
1708########################################################################
1709##  Demo Code
1710########################################################################
1711
1712
1713def demo_grammar():
1714    from nltk.grammar import CFG
1715
1716    return CFG.fromstring(
1717        """
1718S  -> NP VP
1719PP -> "with" NP
1720NP -> NP PP
1721VP -> VP PP
1722VP -> Verb NP
1723VP -> Verb
1724NP -> Det Noun
1725NP -> "John"
1726NP -> "I"
1727Det -> "the"
1728Det -> "my"
1729Det -> "a"
1730Noun -> "dog"
1731Noun -> "cookie"
1732Verb -> "ate"
1733Verb -> "saw"
1734Prep -> "with"
1735Prep -> "under"
1736"""
1737    )
1738
1739
1740def demo(
1741    choice=None,
1742    print_times=True,
1743    print_grammar=False,
1744    print_trees=True,
1745    trace=2,
1746    sent='I saw John with a dog with my cookie',
1747    numparses=5,
1748):
1749    """
1750    A demonstration of the chart parsers.
1751    """
1752    import sys, time
1753    from nltk import nonterminals, Production, CFG
1754
1755    # The grammar for ChartParser and SteppingChartParser:
1756    grammar = demo_grammar()
1757    if print_grammar:
1758        print("* Grammar")
1759        print(grammar)
1760
1761    # Tokenize the sample sentence.
1762    print("* Sentence:")
1763    print(sent)
1764    tokens = sent.split()
1765    print(tokens)
1766    print()
1767
1768    # Ask the user which parser to test,
1769    # if the parser wasn't provided as an argument
1770    if choice is None:
1771        print('  1: Top-down chart parser')
1772        print('  2: Bottom-up chart parser')
1773        print('  3: Bottom-up left-corner chart parser')
1774        print('  4: Left-corner chart parser with bottom-up filter')
1775        print('  5: Stepping chart parser (alternating top-down & bottom-up)')
1776        print('  6: All parsers')
1777        print('\nWhich parser (1-6)? ', end=' ')
1778        choice = sys.stdin.readline().strip()
1779        print()
1780
1781    choice = str(choice)
1782    if choice not in "123456":
1783        print('Bad parser number')
1784        return
1785
1786    # Keep track of how long each parser takes.
1787    times = {}
1788
1789    strategies = {
1790        '1': ('Top-down', TD_STRATEGY),
1791        '2': ('Bottom-up', BU_STRATEGY),
1792        '3': ('Bottom-up left-corner', BU_LC_STRATEGY),
1793        '4': ('Filtered left-corner', LC_STRATEGY),
1794    }
1795    choices = []
1796    if choice in strategies:
1797        choices = [choice]
1798    if choice == '6':
1799        choices = "1234"
1800
1801    # Run the requested chart parser(s), except the stepping parser.
1802    for strategy in choices:
1803        print("* Strategy: " + strategies[strategy][0])
1804        print()
1805        cp = ChartParser(grammar, strategies[strategy][1], trace=trace)
1806        t = time.time()
1807        chart = cp.chart_parse(tokens)
1808        parses = list(chart.parses(grammar.start()))
1809
1810        times[strategies[strategy][0]] = time.time() - t
1811        print("Nr edges in chart:", len(chart.edges()))
1812        if numparses:
1813            assert len(parses) == numparses, 'Not all parses found'
1814        if print_trees:
1815            for tree in parses:
1816                print(tree)
1817        else:
1818            print("Nr trees:", len(parses))
1819        print()
1820
1821    # Run the stepping parser, if requested.
1822    if choice in "56":
1823        print("* Strategy: Stepping (top-down vs bottom-up)")
1824        print()
1825        t = time.time()
1826        cp = SteppingChartParser(grammar, trace=trace)
1827        cp.initialize(tokens)
1828        for i in range(5):
1829            print('*** SWITCH TO TOP DOWN')
1830            cp.set_strategy(TD_STRATEGY)
1831            for j, e in enumerate(cp.step()):
1832                if j > 20 or e is None:
1833                    break
1834            print('*** SWITCH TO BOTTOM UP')
1835            cp.set_strategy(BU_STRATEGY)
1836            for j, e in enumerate(cp.step()):
1837                if j > 20 or e is None:
1838                    break
1839        times['Stepping'] = time.time() - t
1840        print("Nr edges in chart:", len(cp.chart().edges()))
1841        if numparses:
1842            assert len(list(cp.parses())) == numparses, 'Not all parses found'
1843        if print_trees:
1844            for tree in cp.parses():
1845                print(tree)
1846        else:
1847            print("Nr trees:", len(list(cp.parses())))
1848        print()
1849
1850    # Print the times of all parsers:
1851    if not (print_times and times):
1852        return
1853    print("* Parsing times")
1854    print()
1855    maxlen = max(len(key) for key in times)
1856    format = '%' + repr(maxlen) + 's parser: %6.3fsec'
1857    times_items = times.items()
1858    for (parser, t) in sorted(times_items, key=lambda a: a[1]):
1859        print(format % (parser, t))
1860
1861
1862if __name__ == '__main__':
1863    demo()
1864