# Natural Language Toolkit: Recursive Descent Parser
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT
from __future__ import print_function, unicode_literals

from nltk.grammar import Nonterminal
from nltk.tree import Tree, ImmutableTree
from nltk.compat import unicode_repr

from nltk.parse.api import ParserI

##//////////////////////////////////////////////////////
##  Recursive Descent Parser
##//////////////////////////////////////////////////////
class RecursiveDescentParser(ParserI):
    """
    A simple top-down CFG parser that parses texts by recursively
    expanding the fringe of a Tree, and matching it against a
    text.

    ``RecursiveDescentParser`` uses a list of tree locations called a
    "frontier" to remember which subtrees have not yet been expanded
    and which leaves have not yet been matched against the text.  Each
    tree location consists of a list of child indices specifying the
    path from the root of the tree to a subtree or a leaf; see the
    reference documentation for Tree for more information
    about tree locations.

    When the parser begins parsing a text, it constructs a tree
    containing only the start symbol, and a frontier containing the
    location of the tree's root node.  It then extends the tree to
    cover the text, using the following recursive procedure:

      - If the frontier is empty, and the text is covered by the tree,
        then return the tree as a possible parse.
      - If the frontier is empty, and the text is not covered by the
        tree, then return no parses.
      - If the first element of the frontier is a subtree, then
        use CFG productions to "expand" it.  For each applicable
        production, add the expanded subtree's children to the
        frontier, and recursively find all parses that can be
        generated by the new tree and frontier.
      - If the first element of the frontier is a token, then "match"
        it against the next token from the text.  Remove the token
        from the frontier, and recursively find all parses that can be
        generated by the new tree and frontier.

    :see: ``nltk.grammar``
    """

    def __init__(self, grammar, trace=0):
        """
        Create a new ``RecursiveDescentParser``, that uses ``grammar``
        to parse texts.

        :type grammar: CFG
        :param grammar: The grammar used to parse texts.
        :type trace: int
        :param trace: The level of tracing that should be used when
            parsing a text.  ``0`` will generate no tracing output;
            and higher numbers will produce more verbose tracing
            output.
        """
        self._grammar = grammar
        self._trace = trace

    def grammar(self):
        return self._grammar

    def parse(self, tokens):
        # Inherit docs from ParserI

        tokens = list(tokens)
        self._grammar.check_coverage(tokens)

        # Start a recursive descent parse, with an initial tree
        # containing just the start symbol.
        start = self._grammar.start().symbol()
        initial_tree = Tree(start, [])
        frontier = [()]
        if self._trace:
            self._trace_start(initial_tree, frontier, tokens)
        return self._parse(tokens, initial_tree, frontier)

    def _parse(self, remaining_text, tree, frontier):
        """
        Recursively expand and match each elements of ``tree``
        specified by ``frontier``, to cover ``remaining_text``.  Return
        a list of all parses found.

        :return: An iterator of all parses that can be generated by
            matching and expanding the elements of ``tree``
            specified by ``frontier``.
        :rtype: iter(Tree)
        :type tree: Tree
        :param tree: A partial structure for the text that is
            currently being parsed.  The elements of ``tree``
            that are specified by ``frontier`` have not yet been
            expanded or matched.
        :type remaining_text: list(str)
        :param remaining_text: The portion of the text that is not yet
            covered by ``tree``.
        :type frontier: list(tuple(int))
        :param frontier: A list of the locations within ``tree`` of
            all subtrees that have not yet been expanded, and all
            leaves that have not yet been matched.  This list sorted
            in left-to-right order of location within the tree.
        """

        # If the tree covers the text, and there's nothing left to
        # expand, then we've found a complete parse; return it.
        if len(remaining_text) == 0 and len(frontier) == 0:
            if self._trace:
                self._trace_succeed(tree, frontier)
            yield tree

        # If there's still text, but nothing left to expand, we failed.
        elif len(frontier) == 0:
            if self._trace:
                self._trace_backtrack(tree, frontier)

        # If the next element on the frontier is a tree, expand it.
        elif isinstance(tree[frontier[0]], Tree):
            for result in self._expand(remaining_text, tree, frontier):
                yield result

        # If the next element on the frontier is a token, match it.
        else:
            for result in self._match(remaining_text, tree, frontier):
                yield result

    def _match(self, rtext, tree, frontier):
        """
        :rtype: iter(Tree)
        :return: an iterator of all parses that can be generated by
            matching the first element of ``frontier`` against the
            first token in ``rtext``.  In particular, if the first
            element of ``frontier`` has the same type as the first
            token in ``rtext``, then substitute the token into
            ``tree``; and return all parses that can be generated by
            matching and expanding the remaining elements of
            ``frontier``.  If the first element of ``frontier`` does not
            have the same type as the first token in ``rtext``, then
            return empty list.

        :type tree: Tree
        :param tree: A partial structure for the text that is
            currently being parsed.  The elements of ``tree``
            that are specified by ``frontier`` have not yet been
            expanded or matched.
        :type rtext: list(str)
        :param rtext: The portion of the text that is not yet
            covered by ``tree``.
        :type frontier: list of tuple of int
        :param frontier: A list of the locations within ``tree`` of
            all subtrees that have not yet been expanded, and all
            leaves that have not yet been matched.
        """

        tree_leaf = tree[frontier[0]]
        if len(rtext) > 0 and tree_leaf == rtext[0]:
            # If it's a terminal that matches rtext[0], then substitute
            # in the token, and continue parsing.
            newtree = tree.copy(deep=True)
            newtree[frontier[0]] = rtext[0]
            if self._trace:
                self._trace_match(newtree, frontier[1:], rtext[0])
            for result in self._parse(rtext[1:], newtree, frontier[1:]):
                yield result
        else:
            # If it's a non-matching terminal, fail.
            if self._trace:
                self._trace_backtrack(tree, frontier, rtext[:1])

    def _expand(self, remaining_text, tree, frontier, production=None):
        """
        :rtype: iter(Tree)
        :return: An iterator of all parses that can be generated by
            expanding the first element of ``frontier`` with
            ``production``.  In particular, if the first element of
            ``frontier`` is a subtree whose node type is equal to
            ``production``'s left hand side, then add a child to that
            subtree for each element of ``production``'s right hand
            side; and return all parses that can be generated by
            matching and expanding the remaining elements of
            ``frontier``.  If the first element of ``frontier`` is not a
            subtree whose node type is equal to ``production``'s left
            hand side, then return an empty list.  If ``production`` is
            not specified, then return a list of all parses that can
            be generated by expanding the first element of ``frontier``
            with *any* CFG production.

        :type tree: Tree
        :param tree: A partial structure for the text that is
            currently being parsed.  The elements of ``tree``
            that are specified by ``frontier`` have not yet been
            expanded or matched.
        :type remaining_text: list(str)
        :param remaining_text: The portion of the text that is not yet
            covered by ``tree``.
        :type frontier: list(tuple(int))
        :param frontier: A list of the locations within ``tree`` of
            all subtrees that have not yet been expanded, and all
            leaves that have not yet been matched.
        """

        if production is None:
            productions = self._grammar.productions()
        else:
            productions = [production]

        for production in productions:
            lhs = production.lhs().symbol()
            if lhs == tree[frontier[0]].label():
                subtree = self._production_to_tree(production)
                if frontier[0] == ():
                    newtree = subtree
                else:
                    newtree = tree.copy(deep=True)
                    newtree[frontier[0]] = subtree
                new_frontier = [
                    frontier[0] + (i,) for i in range(len(production.rhs()))
                ]
                if self._trace:
                    self._trace_expand(newtree, new_frontier, production)
                for result in self._parse(
                    remaining_text, newtree, new_frontier + frontier[1:]
                ):
                    yield result

    def _production_to_tree(self, production):
        """
        :rtype: Tree
        :return: The Tree that is licensed by ``production``.
            In particular, given the production ``[lhs -> elt[1] ... elt[n]]``
            return a tree that has a node ``lhs.symbol``, and
            ``n`` children.  For each nonterminal element
            ``elt[i]`` in the production, the tree token has a
            childless subtree with node value ``elt[i].symbol``; and
            for each terminal element ``elt[j]``, the tree token has
            a leaf token with type ``elt[j]``.

        :param production: The CFG production that licenses the tree
            token that should be returned.
        :type production: Production
        """
        children = []
        for elt in production.rhs():
            if isinstance(elt, Nonterminal):
                children.append(Tree(elt.symbol(), []))
            else:
                # This will be matched.
                children.append(elt)
        return Tree(production.lhs().symbol(), children)

    def trace(self, trace=2):
        """
        Set the level of tracing output that should be generated when
        parsing a text.

        :type trace: int
        :param trace: The trace level.  A trace level of ``0`` will
            generate no tracing output; and higher trace levels will
            produce more verbose tracing output.
        :rtype: None
        """
        self._trace = trace

    def _trace_fringe(self, tree, treeloc=None):
        """
        Print trace output displaying the fringe of ``tree``.  The
        fringe of ``tree`` consists of all of its leaves and all of
        its childless subtrees.

        :rtype: None
        """

        if treeloc == ():
            print("*", end=' ')
        if isinstance(tree, Tree):
            if len(tree) == 0:
                print(unicode_repr(Nonterminal(tree.label())), end=' ')
            for i in range(len(tree)):
                if treeloc is not None and i == treeloc[0]:
                    self._trace_fringe(tree[i], treeloc[1:])
                else:
                    self._trace_fringe(tree[i])
        else:
            print(unicode_repr(tree), end=' ')

    def _trace_tree(self, tree, frontier, operation):
        """
        Print trace output displaying the parser's current state.

        :param operation: A character identifying the operation that
            generated the current state.
        :rtype: None
        """
        if self._trace == 2:
            print('  %c [' % operation, end=' ')
        else:
            print('    [', end=' ')
        if len(frontier) > 0:
            self._trace_fringe(tree, frontier[0])
        else:
            self._trace_fringe(tree)
        print(']')

    def _trace_start(self, tree, frontier, text):
        print('Parsing %r' % " ".join(text))
        if self._trace > 2:
            print('Start:')
        if self._trace > 1:
            self._trace_tree(tree, frontier, ' ')

    def _trace_expand(self, tree, frontier, production):
        if self._trace > 2:
            print('Expand: %s' % production)
        if self._trace > 1:
            self._trace_tree(tree, frontier, 'E')

    def _trace_match(self, tree, frontier, tok):
        if self._trace > 2:
            print('Match: %r' % tok)
        if self._trace > 1:
            self._trace_tree(tree, frontier, 'M')

    def _trace_succeed(self, tree, frontier):
        if self._trace > 2:
            print('GOOD PARSE:')
        if self._trace == 1:
            print('Found a parse:\n%s' % tree)
        if self._trace > 1:
            self._trace_tree(tree, frontier, '+')

    def _trace_backtrack(self, tree, frontier, toks=None):
        if self._trace > 2:
            if toks:
                print('Backtrack: %r match failed' % toks[0])
            else:
                print('Backtrack')


##//////////////////////////////////////////////////////
##  Stepping Recursive Descent Parser
##//////////////////////////////////////////////////////
class SteppingRecursiveDescentParser(RecursiveDescentParser):
    """
    A ``RecursiveDescentParser`` that allows you to step through the
    parsing process, performing a single operation at a time.

    The ``initialize`` method is used to start parsing a text.
    ``expand`` expands the first element on the frontier using a single
    CFG production, and ``match`` matches the first element on the
    frontier against the next text token.  ``backtrack`` undoes the most
    recent expand or match operation.  ``step`` performs a single
    expand, match, or backtrack operation.  ``parses`` returns the set
    of parses that have been found by the parser.

    :ivar _history: A list of ``(rtext, tree, frontier)`` tripples,
        containing the previous states of the parser.  This history is
        used to implement the ``backtrack`` operation.
    :ivar _tried_e: A record of all productions that have been tried
        for a given tree.  This record is used by ``expand`` to perform
        the next untried production.
    :ivar _tried_m: A record of what tokens have been matched for a
        given tree.  This record is used by ``step`` to decide whether
        or not to match a token.
    :see: ``nltk.grammar``
    """

    def __init__(self, grammar, trace=0):
        super(SteppingRecursiveDescentParser, self).__init__(grammar, trace)
        self._rtext = None
        self._tree = None
        self._frontier = [()]
        self._tried_e = {}
        self._tried_m = {}
        self._history = []
        self._parses = []

    # [XX] TEMPORARY HACK WARNING!  This should be replaced with
    # something nicer when we get the chance.
    def _freeze(self, tree):
        c = tree.copy()
        # for pos in c.treepositions('leaves'):
        #    c[pos] = c[pos].freeze()
        return ImmutableTree.convert(c)

    def parse(self, tokens):
        tokens = list(tokens)
        self.initialize(tokens)
        while self.step() is not None:
            pass
        return self.parses()

    def initialize(self, tokens):
        """
        Start parsing a given text.  This sets the parser's tree to
        the start symbol, its frontier to the root node, and its
        remaining text to ``token['SUBTOKENS']``.
        """

        self._rtext = tokens
        start = self._grammar.start().symbol()
        self._tree = Tree(start, [])
        self._frontier = [()]
        self._tried_e = {}
        self._tried_m = {}
        self._history = []
        self._parses = []
        if self._trace:
            self._trace_start(self._tree, self._frontier, self._rtext)

    def remaining_text(self):
        """
        :return: The portion of the text that is not yet covered by the
            tree.
        :rtype: list(str)
        """
        return self._rtext

    def frontier(self):
        """
        :return: A list of the tree locations of all subtrees that
            have not yet been expanded, and all leaves that have not
            yet been matched.
        :rtype: list(tuple(int))
        """
        return self._frontier

    def tree(self):
        """
        :return: A partial structure for the text that is
            currently being parsed.  The elements specified by the
            frontier have not yet been expanded or matched.
        :rtype: Tree
        """
        return self._tree

    def step(self):
        """
        Perform a single parsing operation.  If an untried match is
        possible, then perform the match, and return the matched
        token.  If an untried expansion is possible, then perform the
        expansion, and return the production that it is based on.  If
        backtracking is possible, then backtrack, and return True.
        Otherwise, return None.

        :return: None if no operation was performed; a token if a match
            was performed; a production if an expansion was performed;
            and True if a backtrack operation was performed.
        :rtype: Production or String or bool
        """
        # Try matching (if we haven't already)
        if self.untried_match():
            token = self.match()
            if token is not None:
                return token

        # Try expanding.
        production = self.expand()
        if production is not None:
            return production

        # Try backtracking
        if self.backtrack():
            self._trace_backtrack(self._tree, self._frontier)
            return True

        # Nothing left to do.
        return None

    def expand(self, production=None):
        """
        Expand the first element of the frontier.  In particular, if
        the first element of the frontier is a subtree whose node type
        is equal to ``production``'s left hand side, then add a child
        to that subtree for each element of ``production``'s right hand
        side.  If ``production`` is not specified, then use the first
        untried expandable production.  If all expandable productions
        have been tried, do nothing.

        :return: The production used to expand the frontier, if an
            expansion was performed.  If no expansion was performed,
            return None.
        :rtype: Production or None
        """

        # Make sure we *can* expand.
        if len(self._frontier) == 0:
            return None
        if not isinstance(self._tree[self._frontier[0]], Tree):
            return None

        # If they didn't specify a production, check all untried ones.
        if production is None:
            productions = self.untried_expandable_productions()
        else:
            productions = [production]

        for prod in productions:
            # Record that we've tried this production now.
            self._tried_e.setdefault(self._freeze(self._tree), []).append(prod)

            # Try expanding.
            for _result in self._expand(self._rtext, self._tree, self._frontier, prod):
                return prod

        # We didn't expand anything.
        return None

    def match(self):
        """
        Match the first element of the frontier.  In particular, if
        the first element of the frontier has the same type as the
        next text token, then substitute the text token into the tree.

        :return: The token matched, if a match operation was
            performed.  If no match was performed, return None
        :rtype: str or None
        """

        # Make sure there is a token left to match.  (Fix: the original
        # indexed self._rtext[0] unconditionally, raising IndexError
        # when match() was called with exhausted input.)
        if len(self._rtext) == 0:
            return None

        # Record that we've tried matching this token.
        tok = self._rtext[0]
        self._tried_m.setdefault(self._freeze(self._tree), []).append(tok)

        # Make sure we *can* match.
        if len(self._frontier) == 0:
            return None
        if isinstance(self._tree[self._frontier[0]], Tree):
            return None

        for _result in self._match(self._rtext, self._tree, self._frontier):
            # Return the token we just matched.
            return self._history[-1][0][0]
        return None

    def backtrack(self):
        """
        Return the parser to its state before the most recent
        match or expand operation.  Calling ``undo`` repeatedly return
        the parser to successively earlier states.  If no match or
        expand operations have been performed, ``undo`` will make no
        changes.

        :return: true if an operation was successfully undone.
        :rtype: bool
        """
        if len(self._history) == 0:
            return False
        (self._rtext, self._tree, self._frontier) = self._history.pop()
        return True

    def expandable_productions(self):
        """
        :return: A list of all the productions for which expansions
            are available for the current parser state.
        :rtype: list(Production)
        """
        # Make sure we *can* expand.  (The frontier-emptiness test was
        # duplicated in the original; it is only needed once.)
        if len(self._frontier) == 0:
            return []
        frontier_child = self._tree[self._frontier[0]]
        if not isinstance(frontier_child, Tree):
            return []

        return [
            p
            for p in self._grammar.productions()
            if p.lhs().symbol() == frontier_child.label()
        ]

    def untried_expandable_productions(self):
        """
        :return: A list of all the untried productions for which
            expansions are available for the current parser state.
        :rtype: list(Production)
        """

        tried_expansions = self._tried_e.get(self._freeze(self._tree), [])
        return [p for p in self.expandable_productions() if p not in tried_expansions]

    def untried_match(self):
        """
        :return: Whether the first element of the frontier is a token
            that has not yet been matched.
        :rtype: bool
        """

        if len(self._rtext) == 0:
            return False
        tried_matches = self._tried_m.get(self._freeze(self._tree), [])
        return self._rtext[0] not in tried_matches

    def currently_complete(self):
        """
        :return: Whether the parser's current state represents a
            complete parse.
        :rtype: bool
        """
        return len(self._frontier) == 0 and len(self._rtext) == 0

    def _parse(self, remaining_text, tree, frontier):
        """
        A stub version of ``_parse`` that sets the parsers current
        state to the given arguments.  In ``RecursiveDescentParser``,
        the ``_parse`` method is used to recursively continue parsing a
        text.  ``SteppingRecursiveDescentParser`` overrides it to
        capture these recursive calls.  It records the parser's old
        state in the history (to allow for backtracking), and updates
        the parser's new state using the given arguments.  Finally, it
        returns ``[1]``, which is used by ``match`` and ``expand`` to
        detect whether their operations were successful.

        :return: ``[1]``
        :rtype: list of int
        """
        self._history.append((self._rtext, self._tree, self._frontier))
        self._rtext = remaining_text
        self._tree = tree
        self._frontier = frontier

        # Is it a good parse?  If so, record it.
        if len(frontier) == 0 and len(remaining_text) == 0:
            self._parses.append(tree)
            self._trace_succeed(self._tree, self._frontier)

        return [1]

    def parses(self):
        """
        :return: An iterator of the parses that have been found by this
            parser so far.
        :rtype: list of Tree
        """
        return iter(self._parses)

    def set_grammar(self, grammar):
        """
        Change the grammar used to parse texts.

        :param grammar: The new grammar.
        :type grammar: CFG
        """
        self._grammar = grammar


##//////////////////////////////////////////////////////
##  Demonstration Code
##//////////////////////////////////////////////////////


def demo():
    """
    A demonstration of the recursive descent parser.
    """

    from nltk import parse, CFG

    grammar = CFG.fromstring(
        """
    S -> NP VP
    NP -> Det N | Det N PP
    VP -> V NP | V NP PP
    PP -> P NP
    NP -> 'I'
    N -> 'man' | 'park' | 'telescope' | 'dog'
    Det -> 'the' | 'a'
    P -> 'in' | 'with'
    V -> 'saw'
    """
    )

    for prod in grammar.productions():
        print(prod)

    sent = 'I saw a man in the park'.split()
    parser = parse.RecursiveDescentParser(grammar, trace=2)
    for p in parser.parse(sent):
        print(p)


if __name__ == '__main__':
    demo()