1# Natural Language Toolkit: Recursive Descent Parser
2#
3# Copyright (C) 2001-2019 NLTK Project
4# Author: Edward Loper <edloper@gmail.com>
5#         Steven Bird <stevenbird1@gmail.com>
6# URL: <http://nltk.org/>
7# For license information, see LICENSE.TXT
8from __future__ import print_function, unicode_literals
9
10from nltk.grammar import Nonterminal
11from nltk.tree import Tree, ImmutableTree
12from nltk.compat import unicode_repr
13
14from nltk.parse.api import ParserI
15
16##//////////////////////////////////////////////////////
17##  Recursive Descent Parser
18##//////////////////////////////////////////////////////
19class RecursiveDescentParser(ParserI):
20    """
21    A simple top-down CFG parser that parses texts by recursively
22    expanding the fringe of a Tree, and matching it against a
23    text.
24
25    ``RecursiveDescentParser`` uses a list of tree locations called a
26    "frontier" to remember which subtrees have not yet been expanded
27    and which leaves have not yet been matched against the text.  Each
28    tree location consists of a list of child indices specifying the
29    path from the root of the tree to a subtree or a leaf; see the
30    reference documentation for Tree for more information
31    about tree locations.
32
33    When the parser begins parsing a text, it constructs a tree
34    containing only the start symbol, and a frontier containing the
35    location of the tree's root node.  It then extends the tree to
36    cover the text, using the following recursive procedure:
37
38      - If the frontier is empty, and the text is covered by the tree,
39        then return the tree as a possible parse.
40      - If the frontier is empty, and the text is not covered by the
41        tree, then return no parses.
42      - If the first element of the frontier is a subtree, then
43        use CFG productions to "expand" it.  For each applicable
44        production, add the expanded subtree's children to the
45        frontier, and recursively find all parses that can be
46        generated by the new tree and frontier.
47      - If the first element of the frontier is a token, then "match"
48        it against the next token from the text.  Remove the token
49        from the frontier, and recursively find all parses that can be
50        generated by the new tree and frontier.
51
52    :see: ``nltk.grammar``
53    """
54
55    def __init__(self, grammar, trace=0):
56        """
57        Create a new ``RecursiveDescentParser``, that uses ``grammar``
58        to parse texts.
59
60        :type grammar: CFG
61        :param grammar: The grammar used to parse texts.
62        :type trace: int
63        :param trace: The level of tracing that should be used when
64            parsing a text.  ``0`` will generate no tracing output;
65            and higher numbers will produce more verbose tracing
66            output.
67        """
68        self._grammar = grammar
69        self._trace = trace
70
71    def grammar(self):
72        return self._grammar
73
74    def parse(self, tokens):
75        # Inherit docs from ParserI
76
77        tokens = list(tokens)
78        self._grammar.check_coverage(tokens)
79
80        # Start a recursive descent parse, with an initial tree
81        # containing just the start symbol.
82        start = self._grammar.start().symbol()
83        initial_tree = Tree(start, [])
84        frontier = [()]
85        if self._trace:
86            self._trace_start(initial_tree, frontier, tokens)
87        return self._parse(tokens, initial_tree, frontier)
88
89    def _parse(self, remaining_text, tree, frontier):
90        """
91        Recursively expand and match each elements of ``tree``
92        specified by ``frontier``, to cover ``remaining_text``.  Return
93        a list of all parses found.
94
95        :return: An iterator of all parses that can be generated by
96            matching and expanding the elements of ``tree``
97            specified by ``frontier``.
98        :rtype: iter(Tree)
99        :type tree: Tree
100        :param tree: A partial structure for the text that is
101            currently being parsed.  The elements of ``tree``
102            that are specified by ``frontier`` have not yet been
103            expanded or matched.
104        :type remaining_text: list(str)
105        :param remaining_text: The portion of the text that is not yet
106            covered by ``tree``.
107        :type frontier: list(tuple(int))
108        :param frontier: A list of the locations within ``tree`` of
109            all subtrees that have not yet been expanded, and all
110            leaves that have not yet been matched.  This list sorted
111            in left-to-right order of location within the tree.
112        """
113
114        # If the tree covers the text, and there's nothing left to
115        # expand, then we've found a complete parse; return it.
116        if len(remaining_text) == 0 and len(frontier) == 0:
117            if self._trace:
118                self._trace_succeed(tree, frontier)
119            yield tree
120
121        # If there's still text, but nothing left to expand, we failed.
122        elif len(frontier) == 0:
123            if self._trace:
124                self._trace_backtrack(tree, frontier)
125
126        # If the next element on the frontier is a tree, expand it.
127        elif isinstance(tree[frontier[0]], Tree):
128            for result in self._expand(remaining_text, tree, frontier):
129                yield result
130
131        # If the next element on the frontier is a token, match it.
132        else:
133            for result in self._match(remaining_text, tree, frontier):
134                yield result
135
136    def _match(self, rtext, tree, frontier):
137        """
138        :rtype: iter(Tree)
139        :return: an iterator of all parses that can be generated by
140            matching the first element of ``frontier`` against the
141            first token in ``rtext``.  In particular, if the first
142            element of ``frontier`` has the same type as the first
143            token in ``rtext``, then substitute the token into
144            ``tree``; and return all parses that can be generated by
145            matching and expanding the remaining elements of
146            ``frontier``.  If the first element of ``frontier`` does not
147            have the same type as the first token in ``rtext``, then
148            return empty list.
149
150        :type tree: Tree
151        :param tree: A partial structure for the text that is
152            currently being parsed.  The elements of ``tree``
153            that are specified by ``frontier`` have not yet been
154            expanded or matched.
155        :type rtext: list(str)
156        :param rtext: The portion of the text that is not yet
157            covered by ``tree``.
158        :type frontier: list of tuple of int
159        :param frontier: A list of the locations within ``tree`` of
160            all subtrees that have not yet been expanded, and all
161            leaves that have not yet been matched.
162        """
163
164        tree_leaf = tree[frontier[0]]
165        if len(rtext) > 0 and tree_leaf == rtext[0]:
166            # If it's a terminal that matches rtext[0], then substitute
167            # in the token, and continue parsing.
168            newtree = tree.copy(deep=True)
169            newtree[frontier[0]] = rtext[0]
170            if self._trace:
171                self._trace_match(newtree, frontier[1:], rtext[0])
172            for result in self._parse(rtext[1:], newtree, frontier[1:]):
173                yield result
174        else:
175            # If it's a non-matching terminal, fail.
176            if self._trace:
177                self._trace_backtrack(tree, frontier, rtext[:1])
178
179    def _expand(self, remaining_text, tree, frontier, production=None):
180        """
181        :rtype: iter(Tree)
182        :return: An iterator of all parses that can be generated by
183            expanding the first element of ``frontier`` with
184            ``production``.  In particular, if the first element of
185            ``frontier`` is a subtree whose node type is equal to
186            ``production``'s left hand side, then add a child to that
187            subtree for each element of ``production``'s right hand
188            side; and return all parses that can be generated by
189            matching and expanding the remaining elements of
190            ``frontier``.  If the first element of ``frontier`` is not a
191            subtree whose node type is equal to ``production``'s left
192            hand side, then return an empty list.  If ``production`` is
193            not specified, then return a list of all parses that can
194            be generated by expanding the first element of ``frontier``
195            with *any* CFG production.
196
197        :type tree: Tree
198        :param tree: A partial structure for the text that is
199            currently being parsed.  The elements of ``tree``
200            that are specified by ``frontier`` have not yet been
201            expanded or matched.
202        :type remaining_text: list(str)
203        :param remaining_text: The portion of the text that is not yet
204            covered by ``tree``.
205        :type frontier: list(tuple(int))
206        :param frontier: A list of the locations within ``tree`` of
207            all subtrees that have not yet been expanded, and all
208            leaves that have not yet been matched.
209        """
210
211        if production is None:
212            productions = self._grammar.productions()
213        else:
214            productions = [production]
215
216        for production in productions:
217            lhs = production.lhs().symbol()
218            if lhs == tree[frontier[0]].label():
219                subtree = self._production_to_tree(production)
220                if frontier[0] == ():
221                    newtree = subtree
222                else:
223                    newtree = tree.copy(deep=True)
224                    newtree[frontier[0]] = subtree
225                new_frontier = [
226                    frontier[0] + (i,) for i in range(len(production.rhs()))
227                ]
228                if self._trace:
229                    self._trace_expand(newtree, new_frontier, production)
230                for result in self._parse(
231                    remaining_text, newtree, new_frontier + frontier[1:]
232                ):
233                    yield result
234
235    def _production_to_tree(self, production):
236        """
237        :rtype: Tree
238        :return: The Tree that is licensed by ``production``.
239            In particular, given the production ``[lhs -> elt[1] ... elt[n]]``
240            return a tree that has a node ``lhs.symbol``, and
241            ``n`` children.  For each nonterminal element
242            ``elt[i]`` in the production, the tree token has a
243            childless subtree with node value ``elt[i].symbol``; and
244            for each terminal element ``elt[j]``, the tree token has
245            a leaf token with type ``elt[j]``.
246
247        :param production: The CFG production that licenses the tree
248            token that should be returned.
249        :type production: Production
250        """
251        children = []
252        for elt in production.rhs():
253            if isinstance(elt, Nonterminal):
254                children.append(Tree(elt.symbol(), []))
255            else:
256                # This will be matched.
257                children.append(elt)
258        return Tree(production.lhs().symbol(), children)
259
260    def trace(self, trace=2):
261        """
262        Set the level of tracing output that should be generated when
263        parsing a text.
264
265        :type trace: int
266        :param trace: The trace level.  A trace level of ``0`` will
267            generate no tracing output; and higher trace levels will
268            produce more verbose tracing output.
269        :rtype: None
270        """
271        self._trace = trace
272
273    def _trace_fringe(self, tree, treeloc=None):
274        """
275        Print trace output displaying the fringe of ``tree``.  The
276        fringe of ``tree`` consists of all of its leaves and all of
277        its childless subtrees.
278
279        :rtype: None
280        """
281
282        if treeloc == ():
283            print("*", end=' ')
284        if isinstance(tree, Tree):
285            if len(tree) == 0:
286                print(unicode_repr(Nonterminal(tree.label())), end=' ')
287            for i in range(len(tree)):
288                if treeloc is not None and i == treeloc[0]:
289                    self._trace_fringe(tree[i], treeloc[1:])
290                else:
291                    self._trace_fringe(tree[i])
292        else:
293            print(unicode_repr(tree), end=' ')
294
295    def _trace_tree(self, tree, frontier, operation):
296        """
297        Print trace output displaying the parser's current state.
298
299        :param operation: A character identifying the operation that
300            generated the current state.
301        :rtype: None
302        """
303        if self._trace == 2:
304            print('  %c [' % operation, end=' ')
305        else:
306            print('    [', end=' ')
307        if len(frontier) > 0:
308            self._trace_fringe(tree, frontier[0])
309        else:
310            self._trace_fringe(tree)
311        print(']')
312
313    def _trace_start(self, tree, frontier, text):
314        print('Parsing %r' % " ".join(text))
315        if self._trace > 2:
316            print('Start:')
317        if self._trace > 1:
318            self._trace_tree(tree, frontier, ' ')
319
320    def _trace_expand(self, tree, frontier, production):
321        if self._trace > 2:
322            print('Expand: %s' % production)
323        if self._trace > 1:
324            self._trace_tree(tree, frontier, 'E')
325
326    def _trace_match(self, tree, frontier, tok):
327        if self._trace > 2:
328            print('Match: %r' % tok)
329        if self._trace > 1:
330            self._trace_tree(tree, frontier, 'M')
331
332    def _trace_succeed(self, tree, frontier):
333        if self._trace > 2:
334            print('GOOD PARSE:')
335        if self._trace == 1:
336            print('Found a parse:\n%s' % tree)
337        if self._trace > 1:
338            self._trace_tree(tree, frontier, '+')
339
340    def _trace_backtrack(self, tree, frontier, toks=None):
341        if self._trace > 2:
342            if toks:
343                print('Backtrack: %r match failed' % toks[0])
344            else:
345                print('Backtrack')
346
347
348##//////////////////////////////////////////////////////
349##  Stepping Recursive Descent Parser
350##//////////////////////////////////////////////////////
351class SteppingRecursiveDescentParser(RecursiveDescentParser):
352    """
353    A ``RecursiveDescentParser`` that allows you to step through the
354    parsing process, performing a single operation at a time.
355
356    The ``initialize`` method is used to start parsing a text.
357    ``expand`` expands the first element on the frontier using a single
358    CFG production, and ``match`` matches the first element on the
359    frontier against the next text token. ``backtrack`` undoes the most
360    recent expand or match operation.  ``step`` performs a single
361    expand, match, or backtrack operation.  ``parses`` returns the set
362    of parses that have been found by the parser.
363
364    :ivar _history: A list of ``(rtext, tree, frontier)`` tripples,
365        containing the previous states of the parser.  This history is
366        used to implement the ``backtrack`` operation.
367    :ivar _tried_e: A record of all productions that have been tried
368        for a given tree.  This record is used by ``expand`` to perform
369        the next untried production.
370    :ivar _tried_m: A record of what tokens have been matched for a
371        given tree.  This record is used by ``step`` to decide whether
372        or not to match a token.
373    :see: ``nltk.grammar``
374    """
375
376    def __init__(self, grammar, trace=0):
377        super(SteppingRecursiveDescentParser, self).__init__(grammar, trace)
378        self._rtext = None
379        self._tree = None
380        self._frontier = [()]
381        self._tried_e = {}
382        self._tried_m = {}
383        self._history = []
384        self._parses = []
385
386    # [XX] TEMPORARY HACK WARNING!  This should be replaced with
387    # something nicer when we get the chance.
388    def _freeze(self, tree):
389        c = tree.copy()
390        #        for pos in c.treepositions('leaves'):
391        #            c[pos] = c[pos].freeze()
392        return ImmutableTree.convert(c)
393
394    def parse(self, tokens):
395        tokens = list(tokens)
396        self.initialize(tokens)
397        while self.step() is not None:
398            pass
399        return self.parses()
400
401    def initialize(self, tokens):
402        """
403        Start parsing a given text.  This sets the parser's tree to
404        the start symbol, its frontier to the root node, and its
405        remaining text to ``token['SUBTOKENS']``.
406        """
407
408        self._rtext = tokens
409        start = self._grammar.start().symbol()
410        self._tree = Tree(start, [])
411        self._frontier = [()]
412        self._tried_e = {}
413        self._tried_m = {}
414        self._history = []
415        self._parses = []
416        if self._trace:
417            self._trace_start(self._tree, self._frontier, self._rtext)
418
419    def remaining_text(self):
420        """
421        :return: The portion of the text that is not yet covered by the
422            tree.
423        :rtype: list(str)
424        """
425        return self._rtext
426
427    def frontier(self):
428        """
429        :return: A list of the tree locations of all subtrees that
430            have not yet been expanded, and all leaves that have not
431            yet been matched.
432        :rtype: list(tuple(int))
433        """
434        return self._frontier
435
436    def tree(self):
437        """
438        :return: A partial structure for the text that is
439            currently being parsed.  The elements specified by the
440            frontier have not yet been expanded or matched.
441        :rtype: Tree
442        """
443        return self._tree
444
445    def step(self):
446        """
447        Perform a single parsing operation.  If an untried match is
448        possible, then perform the match, and return the matched
449        token.  If an untried expansion is possible, then perform the
450        expansion, and return the production that it is based on.  If
451        backtracking is possible, then backtrack, and return True.
452        Otherwise, return None.
453
454        :return: None if no operation was performed; a token if a match
455            was performed; a production if an expansion was performed;
456            and True if a backtrack operation was performed.
457        :rtype: Production or String or bool
458        """
459        # Try matching (if we haven't already)
460        if self.untried_match():
461            token = self.match()
462            if token is not None:
463                return token
464
465        # Try expanding.
466        production = self.expand()
467        if production is not None:
468            return production
469
470        # Try backtracking
471        if self.backtrack():
472            self._trace_backtrack(self._tree, self._frontier)
473            return True
474
475        # Nothing left to do.
476        return None
477
478    def expand(self, production=None):
479        """
480        Expand the first element of the frontier.  In particular, if
481        the first element of the frontier is a subtree whose node type
482        is equal to ``production``'s left hand side, then add a child
483        to that subtree for each element of ``production``'s right hand
484        side.  If ``production`` is not specified, then use the first
485        untried expandable production.  If all expandable productions
486        have been tried, do nothing.
487
488        :return: The production used to expand the frontier, if an
489           expansion was performed.  If no expansion was performed,
490           return None.
491        :rtype: Production or None
492        """
493
494        # Make sure we *can* expand.
495        if len(self._frontier) == 0:
496            return None
497        if not isinstance(self._tree[self._frontier[0]], Tree):
498            return None
499
500        # If they didn't specify a production, check all untried ones.
501        if production is None:
502            productions = self.untried_expandable_productions()
503        else:
504            productions = [production]
505
506        parses = []
507        for prod in productions:
508            # Record that we've tried this production now.
509            self._tried_e.setdefault(self._freeze(self._tree), []).append(prod)
510
511            # Try expanding.
512            for _result in self._expand(self._rtext, self._tree, self._frontier, prod):
513                return prod
514
515        # We didn't expand anything.
516        return None
517
518    def match(self):
519        """
520        Match the first element of the frontier.  In particular, if
521        the first element of the frontier has the same type as the
522        next text token, then substitute the text token into the tree.
523
524        :return: The token matched, if a match operation was
525            performed.  If no match was performed, return None
526        :rtype: str or None
527        """
528
529        # Record that we've tried matching this token.
530        tok = self._rtext[0]
531        self._tried_m.setdefault(self._freeze(self._tree), []).append(tok)
532
533        # Make sure we *can* match.
534        if len(self._frontier) == 0:
535            return None
536        if isinstance(self._tree[self._frontier[0]], Tree):
537            return None
538
539        for _result in self._match(self._rtext, self._tree, self._frontier):
540            # Return the token we just matched.
541            return self._history[-1][0][0]
542        return None
543
544    def backtrack(self):
545        """
546        Return the parser to its state before the most recent
547        match or expand operation.  Calling ``undo`` repeatedly return
548        the parser to successively earlier states.  If no match or
549        expand operations have been performed, ``undo`` will make no
550        changes.
551
552        :return: true if an operation was successfully undone.
553        :rtype: bool
554        """
555        if len(self._history) == 0:
556            return False
557        (self._rtext, self._tree, self._frontier) = self._history.pop()
558        return True
559
560    def expandable_productions(self):
561        """
562        :return: A list of all the productions for which expansions
563            are available for the current parser state.
564        :rtype: list(Production)
565        """
566        # Make sure we *can* expand.
567        if len(self._frontier) == 0:
568            return []
569        frontier_child = self._tree[self._frontier[0]]
570        if len(self._frontier) == 0 or not isinstance(frontier_child, Tree):
571            return []
572
573        return [
574            p
575            for p in self._grammar.productions()
576            if p.lhs().symbol() == frontier_child.label()
577        ]
578
579    def untried_expandable_productions(self):
580        """
581        :return: A list of all the untried productions for which
582            expansions are available for the current parser state.
583        :rtype: list(Production)
584        """
585
586        tried_expansions = self._tried_e.get(self._freeze(self._tree), [])
587        return [p for p in self.expandable_productions() if p not in tried_expansions]
588
589    def untried_match(self):
590        """
591        :return: Whether the first element of the frontier is a token
592            that has not yet been matched.
593        :rtype: bool
594        """
595
596        if len(self._rtext) == 0:
597            return False
598        tried_matches = self._tried_m.get(self._freeze(self._tree), [])
599        return self._rtext[0] not in tried_matches
600
601    def currently_complete(self):
602        """
603        :return: Whether the parser's current state represents a
604            complete parse.
605        :rtype: bool
606        """
607        return len(self._frontier) == 0 and len(self._rtext) == 0
608
609    def _parse(self, remaining_text, tree, frontier):
610        """
611        A stub version of ``_parse`` that sets the parsers current
612        state to the given arguments.  In ``RecursiveDescentParser``,
613        the ``_parse`` method is used to recursively continue parsing a
614        text.  ``SteppingRecursiveDescentParser`` overrides it to
615        capture these recursive calls.  It records the parser's old
616        state in the history (to allow for backtracking), and updates
617        the parser's new state using the given arguments.  Finally, it
618        returns ``[1]``, which is used by ``match`` and ``expand`` to
619        detect whether their operations were successful.
620
621        :return: ``[1]``
622        :rtype: list of int
623        """
624        self._history.append((self._rtext, self._tree, self._frontier))
625        self._rtext = remaining_text
626        self._tree = tree
627        self._frontier = frontier
628
629        # Is it a good parse?  If so, record it.
630        if len(frontier) == 0 and len(remaining_text) == 0:
631            self._parses.append(tree)
632            self._trace_succeed(self._tree, self._frontier)
633
634        return [1]
635
636    def parses(self):
637        """
638        :return: An iterator of the parses that have been found by this
639            parser so far.
640        :rtype: list of Tree
641        """
642        return iter(self._parses)
643
644    def set_grammar(self, grammar):
645        """
646        Change the grammar used to parse texts.
647
648        :param grammar: The new grammar.
649        :type grammar: CFG
650        """
651        self._grammar = grammar
652
653
654##//////////////////////////////////////////////////////
655##  Demonstration Code
656##//////////////////////////////////////////////////////
657
658
659def demo():
660    """
661    A demonstration of the recursive descent parser.
662    """
663
664    from nltk import parse, CFG
665
666    grammar = CFG.fromstring(
667        """
668    S -> NP VP
669    NP -> Det N | Det N PP
670    VP -> V NP | V NP PP
671    PP -> P NP
672    NP -> 'I'
673    N -> 'man' | 'park' | 'telescope' | 'dog'
674    Det -> 'the' | 'a'
675    P -> 'in' | 'with'
676    V -> 'saw'
677    """
678    )
679
680    for prod in grammar.productions():
681        print(prod)
682
683    sent = 'I saw a man in the park'.split()
684    parser = parse.RecursiveDescentParser(grammar, trace=2)
685    for p in parser.parse(sent):
686        print(p)
687
688
689if __name__ == '__main__':
690    demo()
691