1# Natural Language Toolkit: Shift-Reduce Parser
2#
3# Copyright (C) 2001-2019 NLTK Project
4# Author: Edward Loper <edloper@gmail.com>
5#         Steven Bird <stevenbird1@gmail.com>
6# URL: <http://nltk.org/>
7# For license information, see LICENSE.TXT
8from __future__ import print_function, unicode_literals
9
10from nltk.grammar import Nonterminal
11from nltk.tree import Tree
12from nltk.compat import unicode_repr
13
14from nltk.parse.api import ParserI
15
16##//////////////////////////////////////////////////////
17##  Shift/Reduce Parser
18##//////////////////////////////////////////////////////
19class ShiftReduceParser(ParserI):
20    """
21    A simple bottom-up CFG parser that uses two operations, "shift"
22    and "reduce", to find a single parse for a text.
23
24    ``ShiftReduceParser`` maintains a stack, which records the
25    structure of a portion of the text.  This stack is a list of
26    strings and Trees that collectively cover a portion of
27    the text.  For example, while parsing the sentence "the dog saw
28    the man" with a typical grammar, ``ShiftReduceParser`` will produce
29    the following stack, which covers "the dog saw"::
30
31       [(NP: (Det: 'the') (N: 'dog')), (V: 'saw')]
32
33    ``ShiftReduceParser`` attempts to extend the stack to cover the
34    entire text, and to combine the stack elements into a single tree,
35    producing a complete parse for the sentence.
36
37    Initially, the stack is empty.  It is extended to cover the text,
38    from left to right, by repeatedly applying two operations:
39
40      - "shift" moves a token from the beginning of the text to the
41        end of the stack.
42      - "reduce" uses a CFG production to combine the rightmost stack
43        elements into a single Tree.
44
45    Often, more than one operation can be performed on a given stack.
46    In this case, ``ShiftReduceParser`` uses the following heuristics
47    to decide which operation to perform:
48
49      - Only shift if no reductions are available.
50      - If multiple reductions are available, then apply the reduction
51        whose CFG production is listed earliest in the grammar.
52
53    Note that these heuristics are not guaranteed to choose an
54    operation that leads to a parse of the text.  Also, if multiple
55    parses exists, ``ShiftReduceParser`` will return at most one of
56    them.
57
58    :see: ``nltk.grammar``
59    """
60
61    def __init__(self, grammar, trace=0):
62        """
63        Create a new ``ShiftReduceParser``, that uses ``grammar`` to
64        parse texts.
65
66        :type grammar: Grammar
67        :param grammar: The grammar used to parse texts.
68        :type trace: int
69        :param trace: The level of tracing that should be used when
70            parsing a text.  ``0`` will generate no tracing output;
71            and higher numbers will produce more verbose tracing
72            output.
73        """
74        self._grammar = grammar
75        self._trace = trace
76        self._check_grammar()
77
78    def grammar(self):
79        return self._grammar
80
81    def parse(self, tokens):
82        tokens = list(tokens)
83        self._grammar.check_coverage(tokens)
84
85        # initialize the stack.
86        stack = []
87        remaining_text = tokens
88
89        # Trace output.
90        if self._trace:
91            print('Parsing %r' % " ".join(tokens))
92            self._trace_stack(stack, remaining_text)
93
94        # iterate through the text, pushing the token onto
95        # the stack, then reducing the stack.
96        while len(remaining_text) > 0:
97            self._shift(stack, remaining_text)
98            while self._reduce(stack, remaining_text):
99                pass
100
101        # Did we reduce everything?
102        if len(stack) == 1:
103            # Did we end up with the right category?
104            if stack[0].label() == self._grammar.start().symbol():
105                yield stack[0]
106
107    def _shift(self, stack, remaining_text):
108        """
109        Move a token from the beginning of ``remaining_text`` to the
110        end of ``stack``.
111
112        :type stack: list(str and Tree)
113        :param stack: A list of strings and Trees, encoding
114            the structure of the text that has been parsed so far.
115        :type remaining_text: list(str)
116        :param remaining_text: The portion of the text that is not yet
117            covered by ``stack``.
118        :rtype: None
119        """
120        stack.append(remaining_text[0])
121        remaining_text.remove(remaining_text[0])
122        if self._trace:
123            self._trace_shift(stack, remaining_text)
124
125    def _match_rhs(self, rhs, rightmost_stack):
126        """
127        :rtype: bool
128        :return: true if the right hand side of a CFG production
129            matches the rightmost elements of the stack.  ``rhs``
130            matches ``rightmost_stack`` if they are the same length,
131            and each element of ``rhs`` matches the corresponding
132            element of ``rightmost_stack``.  A nonterminal element of
133            ``rhs`` matches any Tree whose node value is equal
134            to the nonterminal's symbol.  A terminal element of ``rhs``
135            matches any string whose type is equal to the terminal.
136        :type rhs: list(terminal and Nonterminal)
137        :param rhs: The right hand side of a CFG production.
138        :type rightmost_stack: list(string and Tree)
139        :param rightmost_stack: The rightmost elements of the parser's
140            stack.
141        """
142
143        if len(rightmost_stack) != len(rhs):
144            return False
145        for i in range(len(rightmost_stack)):
146            if isinstance(rightmost_stack[i], Tree):
147                if not isinstance(rhs[i], Nonterminal):
148                    return False
149                if rightmost_stack[i].label() != rhs[i].symbol():
150                    return False
151            else:
152                if isinstance(rhs[i], Nonterminal):
153                    return False
154                if rightmost_stack[i] != rhs[i]:
155                    return False
156        return True
157
158    def _reduce(self, stack, remaining_text, production=None):
159        """
160        Find a CFG production whose right hand side matches the
161        rightmost stack elements; and combine those stack elements
162        into a single Tree, with the node specified by the
163        production's left-hand side.  If more than one CFG production
164        matches the stack, then use the production that is listed
165        earliest in the grammar.  The new Tree replaces the
166        elements in the stack.
167
168        :rtype: Production or None
169        :return: If a reduction is performed, then return the CFG
170            production that the reduction is based on; otherwise,
171            return false.
172        :type stack: list(string and Tree)
173        :param stack: A list of strings and Trees, encoding
174            the structure of the text that has been parsed so far.
175        :type remaining_text: list(str)
176        :param remaining_text: The portion of the text that is not yet
177            covered by ``stack``.
178        """
179        if production is None:
180            productions = self._grammar.productions()
181        else:
182            productions = [production]
183
184        # Try each production, in order.
185        for production in productions:
186            rhslen = len(production.rhs())
187
188            # check if the RHS of a production matches the top of the stack
189            if self._match_rhs(production.rhs(), stack[-rhslen:]):
190
191                # combine the tree to reflect the reduction
192                tree = Tree(production.lhs().symbol(), stack[-rhslen:])
193                stack[-rhslen:] = [tree]
194
195                # We reduced something
196                if self._trace:
197                    self._trace_reduce(stack, production, remaining_text)
198                return production
199
200        # We didn't reduce anything
201        return None
202
203    def trace(self, trace=2):
204        """
205        Set the level of tracing output that should be generated when
206        parsing a text.
207
208        :type trace: int
209        :param trace: The trace level.  A trace level of ``0`` will
210            generate no tracing output; and higher trace levels will
211            produce more verbose tracing output.
212        :rtype: None
213        """
214        # 1: just show shifts.
215        # 2: show shifts & reduces
216        # 3: display which tokens & productions are shifed/reduced
217        self._trace = trace
218
219    def _trace_stack(self, stack, remaining_text, marker=' '):
220        """
221        Print trace output displaying the given stack and text.
222
223        :rtype: None
224        :param marker: A character that is printed to the left of the
225            stack.  This is used with trace level 2 to print 'S'
226            before shifted stacks and 'R' before reduced stacks.
227        """
228        s = '  ' + marker + ' [ '
229        for elt in stack:
230            if isinstance(elt, Tree):
231                s += unicode_repr(Nonterminal(elt.label())) + ' '
232            else:
233                s += unicode_repr(elt) + ' '
234        s += '* ' + ' '.join(remaining_text) + ']'
235        print(s)
236
237    def _trace_shift(self, stack, remaining_text):
238        """
239        Print trace output displaying that a token has been shifted.
240
241        :rtype: None
242        """
243        if self._trace > 2:
244            print('Shift %r:' % stack[-1])
245        if self._trace == 2:
246            self._trace_stack(stack, remaining_text, 'S')
247        elif self._trace > 0:
248            self._trace_stack(stack, remaining_text)
249
250    def _trace_reduce(self, stack, production, remaining_text):
251        """
252        Print trace output displaying that ``production`` was used to
253        reduce ``stack``.
254
255        :rtype: None
256        """
257        if self._trace > 2:
258            rhs = " ".join(production.rhs())
259            print('Reduce %r <- %s' % (production.lhs(), rhs))
260        if self._trace == 2:
261            self._trace_stack(stack, remaining_text, 'R')
262        elif self._trace > 1:
263            self._trace_stack(stack, remaining_text)
264
265    def _check_grammar(self):
266        """
267        Check to make sure that all of the CFG productions are
268        potentially useful.  If any productions can never be used,
269        then print a warning.
270
271        :rtype: None
272        """
273        productions = self._grammar.productions()
274
275        # Any production whose RHS is an extension of another production's RHS
276        # will never be used.
277        for i in range(len(productions)):
278            for j in range(i + 1, len(productions)):
279                rhs1 = productions[i].rhs()
280                rhs2 = productions[j].rhs()
281                if rhs1[: len(rhs2)] == rhs2:
282                    print('Warning: %r will never be used' % productions[i])
283
284
285##//////////////////////////////////////////////////////
286##  Stepping Shift/Reduce Parser
287##//////////////////////////////////////////////////////
288class SteppingShiftReduceParser(ShiftReduceParser):
289    """
290    A ``ShiftReduceParser`` that allows you to setp through the parsing
291    process, performing a single operation at a time.  It also allows
292    you to change the parser's grammar midway through parsing a text.
293
294    The ``initialize`` method is used to start parsing a text.
295    ``shift`` performs a single shift operation, and ``reduce`` performs
296    a single reduce operation.  ``step`` will perform a single reduce
297    operation if possible; otherwise, it will perform a single shift
298    operation.  ``parses`` returns the set of parses that have been
299    found by the parser.
300
301    :ivar _history: A list of ``(stack, remaining_text)`` pairs,
302        containing all of the previous states of the parser.  This
303        history is used to implement the ``undo`` operation.
304    :see: ``nltk.grammar``
305    """
306
307    def __init__(self, grammar, trace=0):
308        super(SteppingShiftReduceParser, self).__init__(grammar, trace)
309        self._stack = None
310        self._remaining_text = None
311        self._history = []
312
313    def parse(self, tokens):
314        tokens = list(tokens)
315        self.initialize(tokens)
316        while self.step():
317            pass
318        return self.parses()
319
320    def stack(self):
321        """
322        :return: The parser's stack.
323        :rtype: list(str and Tree)
324        """
325        return self._stack
326
327    def remaining_text(self):
328        """
329        :return: The portion of the text that is not yet covered by the
330            stack.
331        :rtype: list(str)
332        """
333        return self._remaining_text
334
335    def initialize(self, tokens):
336        """
337        Start parsing a given text.  This sets the parser's stack to
338        ``[]`` and sets its remaining text to ``tokens``.
339        """
340        self._stack = []
341        self._remaining_text = tokens
342        self._history = []
343
344    def step(self):
345        """
346        Perform a single parsing operation.  If a reduction is
347        possible, then perform that reduction, and return the
348        production that it is based on.  Otherwise, if a shift is
349        possible, then perform it, and return True.  Otherwise,
350        return False.
351
352        :return: False if no operation was performed; True if a shift was
353            performed; and the CFG production used to reduce if a
354            reduction was performed.
355        :rtype: Production or bool
356        """
357        return self.reduce() or self.shift()
358
359    def shift(self):
360        """
361        Move a token from the beginning of the remaining text to the
362        end of the stack.  If there are no more tokens in the
363        remaining text, then do nothing.
364
365        :return: True if the shift operation was successful.
366        :rtype: bool
367        """
368        if len(self._remaining_text) == 0:
369            return False
370        self._history.append((self._stack[:], self._remaining_text[:]))
371        self._shift(self._stack, self._remaining_text)
372        return True
373
374    def reduce(self, production=None):
375        """
376        Use ``production`` to combine the rightmost stack elements into
377        a single Tree.  If ``production`` does not match the
378        rightmost stack elements, then do nothing.
379
380        :return: The production used to reduce the stack, if a
381            reduction was performed.  If no reduction was performed,
382            return None.
383
384        :rtype: Production or None
385        """
386        self._history.append((self._stack[:], self._remaining_text[:]))
387        return_val = self._reduce(self._stack, self._remaining_text, production)
388
389        if not return_val:
390            self._history.pop()
391        return return_val
392
393    def undo(self):
394        """
395        Return the parser to its state before the most recent
396        shift or reduce operation.  Calling ``undo`` repeatedly return
397        the parser to successively earlier states.  If no shift or
398        reduce operations have been performed, ``undo`` will make no
399        changes.
400
401        :return: true if an operation was successfully undone.
402        :rtype: bool
403        """
404        if len(self._history) == 0:
405            return False
406        (self._stack, self._remaining_text) = self._history.pop()
407        return True
408
409    def reducible_productions(self):
410        """
411        :return: A list of the productions for which reductions are
412            available for the current parser state.
413        :rtype: list(Production)
414        """
415        productions = []
416        for production in self._grammar.productions():
417            rhslen = len(production.rhs())
418            if self._match_rhs(production.rhs(), self._stack[-rhslen:]):
419                productions.append(production)
420        return productions
421
422    def parses(self):
423        """
424        :return: An iterator of the parses that have been found by this
425            parser so far.
426        :rtype: iter(Tree)
427        """
428        if (
429            len(self._remaining_text) == 0
430            and len(self._stack) == 1
431            and self._stack[0].label() == self._grammar.start().symbol()
432        ):
433            yield self._stack[0]
434
435    # copied from nltk.parser
436
437    def set_grammar(self, grammar):
438        """
439        Change the grammar used to parse texts.
440
441        :param grammar: The new grammar.
442        :type grammar: CFG
443        """
444        self._grammar = grammar
445
446
447##//////////////////////////////////////////////////////
448##  Demonstration Code
449##//////////////////////////////////////////////////////
450
451
452def demo():
453    """
454    A demonstration of the shift-reduce parser.
455    """
456
457    from nltk import parse, CFG
458
459    grammar = CFG.fromstring(
460        """
461    S -> NP VP
462    NP -> Det N | Det N PP
463    VP -> V NP | V NP PP
464    PP -> P NP
465    NP -> 'I'
466    N -> 'man' | 'park' | 'telescope' | 'dog'
467    Det -> 'the' | 'a'
468    P -> 'in' | 'with'
469    V -> 'saw'
470    """
471    )
472
473    sent = 'I saw a man in the park'.split()
474
475    parser = parse.ShiftReduceParser(grammar, trace=2)
476    for p in parser.parse(sent):
477        print(p)
478
479
480if __name__ == '__main__':
481    demo()
482