1# -*- coding: utf-8 -*- 2# Natural Language Toolkit: A Chart Parser 3# 4# Copyright (C) 2001-2019 NLTK Project 5# Author: Edward Loper <edloper@gmail.com> 6# Steven Bird <stevenbird1@gmail.com> 7# Jean Mark Gawron <gawron@mail.sdsu.edu> 8# Peter Ljunglöf <peter.ljunglof@heatherleaf.se> 9# URL: <http://nltk.org/> 10# For license information, see LICENSE.TXT 11 12""" 13Data classes and parser implementations for "chart parsers", which 14use dynamic programming to efficiently parse a text. A chart 15parser derives parse trees for a text by iteratively adding "edges" 16to a "chart." Each edge represents a hypothesis about the tree 17structure for a subsequence of the text. The chart is a 18"blackboard" for composing and combining these hypotheses. 19 20When a chart parser begins parsing a text, it creates a new (empty) 21chart, spanning the text. It then incrementally adds new edges to the 22chart. A set of "chart rules" specifies the conditions under which 23new edges should be added to the chart. Once the chart reaches a 24stage where none of the chart rules adds any new edges, parsing is 25complete. 26 27Charts are encoded with the ``Chart`` class, and edges are encoded with 28the ``TreeEdge`` and ``LeafEdge`` classes. The chart parser module 29defines three chart parsers: 30 31 - ``ChartParser`` is a simple and flexible chart parser. Given a 32 set of chart rules, it will apply those rules to the chart until 33 no more edges are added. 34 35 - ``SteppingChartParser`` is a subclass of ``ChartParser`` that can 36 be used to step through the parsing process. 37""" 38from __future__ import print_function, division, unicode_literals 39 40import itertools 41import re 42import warnings 43from functools import total_ordering 44 45from six.moves import range 46 47from nltk.tree import Tree 48from nltk.grammar import PCFG, is_nonterminal, is_terminal 49from nltk.util import OrderedDict 50from nltk.internals import raise_unorderable_types 51from nltk.compat import python_2_unicode_compatible, unicode_repr 52 53from nltk.parse.api import ParserI 54 55 56######################################################################## 57## Edges 58######################################################################## 59 60 61@total_ordering 62class EdgeI(object): 63 """ 64 A hypothesis about the structure of part of a sentence. 65 Each edge records the fact that a structure is (partially) 66 consistent with the sentence. An edge contains: 67 68 - A span, indicating what part of the sentence is 69 consistent with the hypothesized structure. 70 - A left-hand side, specifying what kind of structure is 71 hypothesized. 72 - A right-hand side, specifying the contents of the 73 hypothesized structure. 74 - A dot position, indicating how much of the hypothesized 75 structure is consistent with the sentence. 76 77 Every edge is either complete or incomplete: 78 79 - An edge is complete if its structure is fully consistent 80 with the sentence. 81 - An edge is incomplete if its structure is partially 82 consistent with the sentence. For every incomplete edge, the 83 span specifies a possible prefix for the edge's structure. 84 85 There are two kinds of edge: 86 87 - A ``TreeEdge`` records which trees have been found to 88 be (partially) consistent with the text. 89 - A ``LeafEdge`` records the tokens occurring in the text. 90 91 The ``EdgeI`` interface provides a common interface to both types 92 of edge, allowing chart parsers to treat them in a uniform manner. 93 """ 94 95 def __init__(self): 96 if self.__class__ == EdgeI: 97 raise TypeError('Edge is an abstract interface') 98 99 # //////////////////////////////////////////////////////////// 100 # Span 101 # //////////////////////////////////////////////////////////// 102 103 def span(self): 104 """ 105 Return a tuple ``(s, e)``, where ``tokens[s:e]`` is the 106 portion of the sentence that is consistent with this 107 edge's structure. 108 109 :rtype: tuple(int, int) 110 """ 111 raise NotImplementedError() 112 113 def start(self): 114 """ 115 Return the start index of this edge's span. 116 117 :rtype: int 118 """ 119 raise NotImplementedError() 120 121 def end(self): 122 """ 123 Return the end index of this edge's span. 124 125 :rtype: int 126 """ 127 raise NotImplementedError() 128 129 def length(self): 130 """ 131 Return the length of this edge's span. 132 133 :rtype: int 134 """ 135 raise NotImplementedError() 136 137 # //////////////////////////////////////////////////////////// 138 # Left Hand Side 139 # //////////////////////////////////////////////////////////// 140 141 def lhs(self): 142 """ 143 Return this edge's left-hand side, which specifies what kind 144 of structure is hypothesized by this edge. 145 146 :see: ``TreeEdge`` and ``LeafEdge`` for a description of 147 the left-hand side values for each edge type. 148 """ 149 raise NotImplementedError() 150 151 # //////////////////////////////////////////////////////////// 152 # Right Hand Side 153 # //////////////////////////////////////////////////////////// 154 155 def rhs(self): 156 """ 157 Return this edge's right-hand side, which specifies 158 the content of the structure hypothesized by this edge. 159 160 :see: ``TreeEdge`` and ``LeafEdge`` for a description of 161 the right-hand side values for each edge type. 162 """ 163 raise NotImplementedError() 164 165 def dot(self): 166 """ 167 Return this edge's dot position, which indicates how much of 168 the hypothesized structure is consistent with the 169 sentence. In particular, ``self.rhs[:dot]`` is consistent 170 with ``tokens[self.start():self.end()]``. 171 172 :rtype: int 173 """ 174 raise NotImplementedError() 175 176 def nextsym(self): 177 """ 178 Return the element of this edge's right-hand side that 179 immediately follows its dot. 180 181 :rtype: Nonterminal or terminal or None 182 """ 183 raise NotImplementedError() 184 185 def is_complete(self): 186 """ 187 Return True if this edge's structure is fully consistent 188 with the text. 189 190 :rtype: bool 191 """ 192 raise NotImplementedError() 193 194 def is_incomplete(self): 195 """ 196 Return True if this edge's structure is partially consistent 197 with the text. 198 199 :rtype: bool 200 """ 201 raise NotImplementedError() 202 203 # //////////////////////////////////////////////////////////// 204 # Comparisons & hashing 205 # //////////////////////////////////////////////////////////// 206 207 def __eq__(self, other): 208 return ( 209 self.__class__ is other.__class__ 210 and self._comparison_key == other._comparison_key 211 ) 212 213 def __ne__(self, other): 214 return not self == other 215 216 def __lt__(self, other): 217 if not isinstance(other, EdgeI): 218 raise_unorderable_types("<", self, other) 219 if self.__class__ is other.__class__: 220 return self._comparison_key < other._comparison_key 221 else: 222 return self.__class__.__name__ < other.__class__.__name__ 223 224 def __hash__(self): 225 try: 226 return self._hash 227 except AttributeError: 228 self._hash = hash(self._comparison_key) 229 return self._hash 230 231 232@python_2_unicode_compatible 233class TreeEdge(EdgeI): 234 """ 235 An edge that records the fact that a tree is (partially) 236 consistent with the sentence. A tree edge consists of: 237 238 - A span, indicating what part of the sentence is 239 consistent with the hypothesized tree. 240 - A left-hand side, specifying the hypothesized tree's node 241 value. 242 - A right-hand side, specifying the hypothesized tree's 243 children. Each element of the right-hand side is either a 244 terminal, specifying a token with that terminal as its leaf 245 value; or a nonterminal, specifying a subtree with that 246 nonterminal's symbol as its node value. 247 - A dot position, indicating which children are consistent 248 with part of the sentence. In particular, if ``dot`` is the 249 dot position, ``rhs`` is the right-hand size, ``(start,end)`` 250 is the span, and ``sentence`` is the list of tokens in the 251 sentence, then ``tokens[start:end]`` can be spanned by the 252 children specified by ``rhs[:dot]``. 253 254 For more information about edges, see the ``EdgeI`` interface. 255 """ 256 257 def __init__(self, span, lhs, rhs, dot=0): 258 """ 259 Construct a new ``TreeEdge``. 260 261 :type span: tuple(int, int) 262 :param span: A tuple ``(s, e)``, where ``tokens[s:e]`` is the 263 portion of the sentence that is consistent with the new 264 edge's structure. 265 :type lhs: Nonterminal 266 :param lhs: The new edge's left-hand side, specifying the 267 hypothesized tree's node value. 268 :type rhs: list(Nonterminal and str) 269 :param rhs: The new edge's right-hand side, specifying the 270 hypothesized tree's children. 271 :type dot: int 272 :param dot: The position of the new edge's dot. This position 273 specifies what prefix of the production's right hand side 274 is consistent with the text. In particular, if 275 ``sentence`` is the list of tokens in the sentence, then 276 ``okens[span[0]:span[1]]`` can be spanned by the 277 children specified by ``rhs[:dot]``. 278 """ 279 self._span = span 280 self._lhs = lhs 281 rhs = tuple(rhs) 282 self._rhs = rhs 283 self._dot = dot 284 self._comparison_key = (span, lhs, rhs, dot) 285 286 @staticmethod 287 def from_production(production, index): 288 """ 289 Return a new ``TreeEdge`` formed from the given production. 290 The new edge's left-hand side and right-hand side will 291 be taken from ``production``; its span will be 292 ``(index,index)``; and its dot position will be ``0``. 293 294 :rtype: TreeEdge 295 """ 296 return TreeEdge( 297 span=(index, index), lhs=production.lhs(), rhs=production.rhs(), dot=0 298 ) 299 300 def move_dot_forward(self, new_end): 301 """ 302 Return a new ``TreeEdge`` formed from this edge. 303 The new edge's dot position is increased by ``1``, 304 and its end index will be replaced by ``new_end``. 305 306 :param new_end: The new end index. 307 :type new_end: int 308 :rtype: TreeEdge 309 """ 310 return TreeEdge( 311 span=(self._span[0], new_end), 312 lhs=self._lhs, 313 rhs=self._rhs, 314 dot=self._dot + 1, 315 ) 316 317 # Accessors 318 def lhs(self): 319 return self._lhs 320 321 def span(self): 322 return self._span 323 324 def start(self): 325 return self._span[0] 326 327 def end(self): 328 return self._span[1] 329 330 def length(self): 331 return self._span[1] - self._span[0] 332 333 def rhs(self): 334 return self._rhs 335 336 def dot(self): 337 return self._dot 338 339 def is_complete(self): 340 return self._dot == len(self._rhs) 341 342 def is_incomplete(self): 343 return self._dot != len(self._rhs) 344 345 def nextsym(self): 346 if self._dot >= len(self._rhs): 347 return None 348 else: 349 return self._rhs[self._dot] 350 351 # String representation 352 def __str__(self): 353 str = '[%s:%s] ' % (self._span[0], self._span[1]) 354 str += '%-2r ->' % (self._lhs,) 355 356 for i in range(len(self._rhs)): 357 if i == self._dot: 358 str += ' *' 359 str += ' %s' % unicode_repr(self._rhs[i]) 360 if len(self._rhs) == self._dot: 361 str += ' *' 362 return str 363 364 def __repr__(self): 365 return '[Edge: %s]' % self 366 367 368@python_2_unicode_compatible 369class LeafEdge(EdgeI): 370 """ 371 An edge that records the fact that a leaf value is consistent with 372 a word in the sentence. A leaf edge consists of: 373 374 - An index, indicating the position of the word. 375 - A leaf, specifying the word's content. 376 377 A leaf edge's left-hand side is its leaf value, and its right hand 378 side is ``()``. Its span is ``[index, index+1]``, and its dot 379 position is ``0``. 380 """ 381 382 def __init__(self, leaf, index): 383 """ 384 Construct a new ``LeafEdge``. 385 386 :param leaf: The new edge's leaf value, specifying the word 387 that is recorded by this edge. 388 :param index: The new edge's index, specifying the position of 389 the word that is recorded by this edge. 390 """ 391 self._leaf = leaf 392 self._index = index 393 self._comparison_key = (leaf, index) 394 395 # Accessors 396 def lhs(self): 397 return self._leaf 398 399 def span(self): 400 return (self._index, self._index + 1) 401 402 def start(self): 403 return self._index 404 405 def end(self): 406 return self._index + 1 407 408 def length(self): 409 return 1 410 411 def rhs(self): 412 return () 413 414 def dot(self): 415 return 0 416 417 def is_complete(self): 418 return True 419 420 def is_incomplete(self): 421 return False 422 423 def nextsym(self): 424 return None 425 426 # String representations 427 def __str__(self): 428 return '[%s:%s] %s' % (self._index, self._index + 1, unicode_repr(self._leaf)) 429 430 def __repr__(self): 431 return '[Edge: %s]' % (self) 432 433 434######################################################################## 435## Chart 436######################################################################## 437 438 439class Chart(object): 440 """ 441 A blackboard for hypotheses about the syntactic constituents of a 442 sentence. A chart contains a set of edges, and each edge encodes 443 a single hypothesis about the structure of some portion of the 444 sentence. 445 446 The ``select`` method can be used to select a specific collection 447 of edges. For example ``chart.select(is_complete=True, start=0)`` 448 yields all complete edges whose start indices are 0. To ensure 449 the efficiency of these selection operations, ``Chart`` dynamically 450 creates and maintains an index for each set of attributes that 451 have been selected on. 452 453 In order to reconstruct the trees that are represented by an edge, 454 the chart associates each edge with a set of child pointer lists. 455 A child pointer list is a list of the edges that license an 456 edge's right-hand side. 457 458 :ivar _tokens: The sentence that the chart covers. 459 :ivar _num_leaves: The number of tokens. 460 :ivar _edges: A list of the edges in the chart 461 :ivar _edge_to_cpls: A dictionary mapping each edge to a set 462 of child pointer lists that are associated with that edge. 463 :ivar _indexes: A dictionary mapping tuples of edge attributes 464 to indices, where each index maps the corresponding edge 465 attribute values to lists of edges. 466 """ 467 468 def __init__(self, tokens): 469 """ 470 Construct a new chart. The chart is initialized with the 471 leaf edges corresponding to the terminal leaves. 472 473 :type tokens: list 474 :param tokens: The sentence that this chart will be used to parse. 475 """ 476 # Record the sentence token and the sentence length. 477 self._tokens = tuple(tokens) 478 self._num_leaves = len(self._tokens) 479 480 # Initialise the chart. 481 self.initialize() 482 483 def initialize(self): 484 """ 485 Clear the chart. 486 """ 487 # A list of edges contained in this chart. 488 self._edges = [] 489 490 # The set of child pointer lists associated with each edge. 491 self._edge_to_cpls = {} 492 493 # Indexes mapping attribute values to lists of edges 494 # (used by select()). 495 self._indexes = {} 496 497 # //////////////////////////////////////////////////////////// 498 # Sentence Access 499 # //////////////////////////////////////////////////////////// 500 501 def num_leaves(self): 502 """ 503 Return the number of words in this chart's sentence. 504 505 :rtype: int 506 """ 507 return self._num_leaves 508 509 def leaf(self, index): 510 """ 511 Return the leaf value of the word at the given index. 512 513 :rtype: str 514 """ 515 return self._tokens[index] 516 517 def leaves(self): 518 """ 519 Return a list of the leaf values of each word in the 520 chart's sentence. 521 522 :rtype: list(str) 523 """ 524 return self._tokens 525 526 # //////////////////////////////////////////////////////////// 527 # Edge access 528 # //////////////////////////////////////////////////////////// 529 530 def edges(self): 531 """ 532 Return a list of all edges in this chart. New edges 533 that are added to the chart after the call to edges() 534 will *not* be contained in this list. 535 536 :rtype: list(EdgeI) 537 :see: ``iteredges``, ``select`` 538 """ 539 return self._edges[:] 540 541 def iteredges(self): 542 """ 543 Return an iterator over the edges in this chart. It is 544 not guaranteed that new edges which are added to the 545 chart before the iterator is exhausted will also be generated. 546 547 :rtype: iter(EdgeI) 548 :see: ``edges``, ``select`` 549 """ 550 return iter(self._edges) 551 552 # Iterating over the chart yields its edges. 553 __iter__ = iteredges 554 555 def num_edges(self): 556 """ 557 Return the number of edges contained in this chart. 558 559 :rtype: int 560 """ 561 return len(self._edge_to_cpls) 562 563 def select(self, **restrictions): 564 """ 565 Return an iterator over the edges in this chart. Any 566 new edges that are added to the chart before the iterator 567 is exahusted will also be generated. ``restrictions`` 568 can be used to restrict the set of edges that will be 569 generated. 570 571 :param span: Only generate edges ``e`` where ``e.span()==span`` 572 :param start: Only generate edges ``e`` where ``e.start()==start`` 573 :param end: Only generate edges ``e`` where ``e.end()==end`` 574 :param length: Only generate edges ``e`` where ``e.length()==length`` 575 :param lhs: Only generate edges ``e`` where ``e.lhs()==lhs`` 576 :param rhs: Only generate edges ``e`` where ``e.rhs()==rhs`` 577 :param nextsym: Only generate edges ``e`` where 578 ``e.nextsym()==nextsym`` 579 :param dot: Only generate edges ``e`` where ``e.dot()==dot`` 580 :param is_complete: Only generate edges ``e`` where 581 ``e.is_complete()==is_complete`` 582 :param is_incomplete: Only generate edges ``e`` where 583 ``e.is_incomplete()==is_incomplete`` 584 :rtype: iter(EdgeI) 585 """ 586 # If there are no restrictions, then return all edges. 587 if restrictions == {}: 588 return iter(self._edges) 589 590 # Find the index corresponding to the given restrictions. 591 restr_keys = sorted(restrictions.keys()) 592 restr_keys = tuple(restr_keys) 593 594 # If it doesn't exist, then create it. 595 if restr_keys not in self._indexes: 596 self._add_index(restr_keys) 597 598 vals = tuple(restrictions[key] for key in restr_keys) 599 return iter(self._indexes[restr_keys].get(vals, [])) 600 601 def _add_index(self, restr_keys): 602 """ 603 A helper function for ``select``, which creates a new index for 604 a given set of attributes (aka restriction keys). 605 """ 606 # Make sure it's a valid index. 607 for key in restr_keys: 608 if not hasattr(EdgeI, key): 609 raise ValueError('Bad restriction: %s' % key) 610 611 # Create the index. 612 index = self._indexes[restr_keys] = {} 613 614 # Add all existing edges to the index. 615 for edge in self._edges: 616 vals = tuple(getattr(edge, key)() for key in restr_keys) 617 index.setdefault(vals, []).append(edge) 618 619 def _register_with_indexes(self, edge): 620 """ 621 A helper function for ``insert``, which registers the new 622 edge with all existing indexes. 623 """ 624 for (restr_keys, index) in self._indexes.items(): 625 vals = tuple(getattr(edge, key)() for key in restr_keys) 626 index.setdefault(vals, []).append(edge) 627 628 # //////////////////////////////////////////////////////////// 629 # Edge Insertion 630 # //////////////////////////////////////////////////////////// 631 632 def insert_with_backpointer(self, new_edge, previous_edge, child_edge): 633 """ 634 Add a new edge to the chart, using a pointer to the previous edge. 635 """ 636 cpls = self.child_pointer_lists(previous_edge) 637 new_cpls = [cpl + (child_edge,) for cpl in cpls] 638 return self.insert(new_edge, *new_cpls) 639 640 def insert(self, edge, *child_pointer_lists): 641 """ 642 Add a new edge to the chart, and return True if this operation 643 modified the chart. In particular, return true iff the chart 644 did not already contain ``edge``, or if it did not already associate 645 ``child_pointer_lists`` with ``edge``. 646 647 :type edge: EdgeI 648 :param edge: The new edge 649 :type child_pointer_lists: sequence of tuple(EdgeI) 650 :param child_pointer_lists: A sequence of lists of the edges that 651 were used to form this edge. This list is used to reconstruct 652 the trees (or partial trees) that are associated with ``edge``. 653 :rtype: bool 654 """ 655 # Is it a new edge? 656 if edge not in self._edge_to_cpls: 657 # Add it to the list of edges. 658 self._append_edge(edge) 659 # Register with indexes. 660 self._register_with_indexes(edge) 661 662 # Get the set of child pointer lists for this edge. 663 cpls = self._edge_to_cpls.setdefault(edge, OrderedDict()) 664 chart_was_modified = False 665 for child_pointer_list in child_pointer_lists: 666 child_pointer_list = tuple(child_pointer_list) 667 if child_pointer_list not in cpls: 668 # It's a new CPL; register it, and return true. 669 cpls[child_pointer_list] = True 670 chart_was_modified = True 671 return chart_was_modified 672 673 def _append_edge(self, edge): 674 self._edges.append(edge) 675 676 # //////////////////////////////////////////////////////////// 677 # Tree extraction & child pointer lists 678 # //////////////////////////////////////////////////////////// 679 680 def parses(self, root, tree_class=Tree): 681 """ 682 Return an iterator of the complete tree structures that span 683 the entire chart, and whose root node is ``root``. 684 """ 685 for edge in self.select(start=0, end=self._num_leaves, lhs=root): 686 for tree in self.trees(edge, tree_class=tree_class, complete=True): 687 yield tree 688 689 def trees(self, edge, tree_class=Tree, complete=False): 690 """ 691 Return an iterator of the tree structures that are associated 692 with ``edge``. 693 694 If ``edge`` is incomplete, then the unexpanded children will be 695 encoded as childless subtrees, whose node value is the 696 corresponding terminal or nonterminal. 697 698 :rtype: list(Tree) 699 :note: If two trees share a common subtree, then the same 700 Tree may be used to encode that subtree in 701 both trees. If you need to eliminate this subtree 702 sharing, then create a deep copy of each tree. 703 """ 704 return iter(self._trees(edge, complete, memo={}, tree_class=tree_class)) 705 706 def _trees(self, edge, complete, memo, tree_class): 707 """ 708 A helper function for ``trees``. 709 710 :param memo: A dictionary used to record the trees that we've 711 generated for each edge, so that when we see an edge more 712 than once, we can reuse the same trees. 713 """ 714 # If we've seen this edge before, then reuse our old answer. 715 if edge in memo: 716 return memo[edge] 717 718 # when we're reading trees off the chart, don't use incomplete edges 719 if complete and edge.is_incomplete(): 720 return [] 721 722 # Leaf edges. 723 if isinstance(edge, LeafEdge): 724 leaf = self._tokens[edge.start()] 725 memo[edge] = [leaf] 726 return [leaf] 727 728 # Until we're done computing the trees for edge, set 729 # memo[edge] to be empty. This has the effect of filtering 730 # out any cyclic trees (i.e., trees that contain themselves as 731 # descendants), because if we reach this edge via a cycle, 732 # then it will appear that the edge doesn't generate any trees. 733 memo[edge] = [] 734 trees = [] 735 lhs = edge.lhs().symbol() 736 737 # Each child pointer list can be used to form trees. 738 for cpl in self.child_pointer_lists(edge): 739 # Get the set of child choices for each child pointer. 740 # child_choices[i] is the set of choices for the tree's 741 # ith child. 742 child_choices = [self._trees(cp, complete, memo, tree_class) for cp in cpl] 743 744 # For each combination of children, add a tree. 745 for children in itertools.product(*child_choices): 746 trees.append(tree_class(lhs, children)) 747 748 # If the edge is incomplete, then extend it with "partial trees": 749 if edge.is_incomplete(): 750 unexpanded = [tree_class(elt, []) for elt in edge.rhs()[edge.dot() :]] 751 for tree in trees: 752 tree.extend(unexpanded) 753 754 # Update the memoization dictionary. 755 memo[edge] = trees 756 757 # Return the list of trees. 758 return trees 759 760 def child_pointer_lists(self, edge): 761 """ 762 Return the set of child pointer lists for the given edge. 763 Each child pointer list is a list of edges that have 764 been used to form this edge. 765 766 :rtype: list(list(EdgeI)) 767 """ 768 # Make a copy, in case they modify it. 769 return self._edge_to_cpls.get(edge, {}).keys() 770 771 # //////////////////////////////////////////////////////////// 772 # Display 773 # //////////////////////////////////////////////////////////// 774 def pretty_format_edge(self, edge, width=None): 775 """ 776 Return a pretty-printed string representation of a given edge 777 in this chart. 778 779 :rtype: str 780 :param width: The number of characters allotted to each 781 index in the sentence. 782 """ 783 if width is None: 784 width = 50 // (self.num_leaves() + 1) 785 (start, end) = (edge.start(), edge.end()) 786 787 str = '|' + ('.' + ' ' * (width - 1)) * start 788 789 # Zero-width edges are "#" if complete, ">" if incomplete 790 if start == end: 791 if edge.is_complete(): 792 str += '#' 793 else: 794 str += '>' 795 796 # Spanning complete edges are "[===]"; Other edges are 797 # "[---]" if complete, "[--->" if incomplete 798 elif edge.is_complete() and edge.span() == (0, self._num_leaves): 799 str += '[' + ('=' * width) * (end - start - 1) + '=' * (width - 1) + ']' 800 elif edge.is_complete(): 801 str += '[' + ('-' * width) * (end - start - 1) + '-' * (width - 1) + ']' 802 else: 803 str += '[' + ('-' * width) * (end - start - 1) + '-' * (width - 1) + '>' 804 805 str += (' ' * (width - 1) + '.') * (self._num_leaves - end) 806 return str + '| %s' % edge 807 808 def pretty_format_leaves(self, width=None): 809 """ 810 Return a pretty-printed string representation of this 811 chart's leaves. This string can be used as a header 812 for calls to ``pretty_format_edge``. 813 """ 814 if width is None: 815 width = 50 // (self.num_leaves() + 1) 816 817 if self._tokens is not None and width > 1: 818 header = '|.' 819 for tok in self._tokens: 820 header += tok[: width - 1].center(width - 1) + '.' 821 header += '|' 822 else: 823 header = '' 824 825 return header 826 827 def pretty_format(self, width=None): 828 """ 829 Return a pretty-printed string representation of this chart. 830 831 :param width: The number of characters allotted to each 832 index in the sentence. 833 :rtype: str 834 """ 835 if width is None: 836 width = 50 // (self.num_leaves() + 1) 837 # sort edges: primary key=length, secondary key=start index. 838 # (and filter out the token edges) 839 edges = sorted([(e.length(), e.start(), e) for e in self]) 840 edges = [e for (_, _, e) in edges] 841 842 return ( 843 self.pretty_format_leaves(width) 844 + '\n' 845 + '\n'.join(self.pretty_format_edge(edge, width) for edge in edges) 846 ) 847 848 # //////////////////////////////////////////////////////////// 849 # Display: Dot (AT&T Graphviz) 850 # //////////////////////////////////////////////////////////// 851 852 def dot_digraph(self): 853 # Header 854 s = 'digraph nltk_chart {\n' 855 # s += ' size="5,5";\n' 856 s += ' rankdir=LR;\n' 857 s += ' node [height=0.1,width=0.1];\n' 858 s += ' node [style=filled, color="lightgray"];\n' 859 860 # Set up the nodes 861 for y in range(self.num_edges(), -1, -1): 862 if y == 0: 863 s += ' node [style=filled, color="black"];\n' 864 for x in range(self.num_leaves() + 1): 865 if y == 0 or ( 866 x <= self._edges[y - 1].start() or x >= self._edges[y - 1].end() 867 ): 868 s += ' %04d.%04d [label=""];\n' % (x, y) 869 870 # Add a spacer 871 s += ' x [style=invis]; x->0000.0000 [style=invis];\n' 872 873 # Declare ranks. 874 for x in range(self.num_leaves() + 1): 875 s += ' {rank=same;' 876 for y in range(self.num_edges() + 1): 877 if y == 0 or ( 878 x <= self._edges[y - 1].start() or x >= self._edges[y - 1].end() 879 ): 880 s += ' %04d.%04d' % (x, y) 881 s += '}\n' 882 883 # Add the leaves 884 s += ' edge [style=invis, weight=100];\n' 885 s += ' node [shape=plaintext]\n' 886 s += ' 0000.0000' 887 for x in range(self.num_leaves()): 888 s += '->%s->%04d.0000' % (self.leaf(x), x + 1) 889 s += ';\n\n' 890 891 # Add the edges 892 s += ' edge [style=solid, weight=1];\n' 893 for y, edge in enumerate(self): 894 for x in range(edge.start()): 895 s += ' %04d.%04d -> %04d.%04d [style="invis"];\n' % ( 896 x, 897 y + 1, 898 x + 1, 899 y + 1, 900 ) 901 s += ' %04d.%04d -> %04d.%04d [label="%s"];\n' % ( 902 edge.start(), 903 y + 1, 904 edge.end(), 905 y + 1, 906 edge, 907 ) 908 for x in range(edge.end(), self.num_leaves()): 909 s += ' %04d.%04d -> %04d.%04d [style="invis"];\n' % ( 910 x, 911 y + 1, 912 x + 1, 913 y + 1, 914 ) 915 s += '}\n' 916 return s 917 918 919######################################################################## 920## Chart Rules 921######################################################################## 922 923 924class ChartRuleI(object): 925 """ 926 A rule that specifies what new edges are licensed by any given set 927 of existing edges. Each chart rule expects a fixed number of 928 edges, as indicated by the class variable ``NUM_EDGES``. In 929 particular: 930 931 - A chart rule with ``NUM_EDGES=0`` specifies what new edges are 932 licensed, regardless of existing edges. 933 - A chart rule with ``NUM_EDGES=1`` specifies what new edges are 934 licensed by a single existing edge. 935 - A chart rule with ``NUM_EDGES=2`` specifies what new edges are 936 licensed by a pair of existing edges. 937 938 :type NUM_EDGES: int 939 :cvar NUM_EDGES: The number of existing edges that this rule uses 940 to license new edges. Typically, this number ranges from zero 941 to two. 942 """ 943 944 def apply(self, chart, grammar, *edges): 945 """ 946 Return a generator that will add edges licensed by this rule 947 and the given edges to the chart, one at a time. Each 948 time the generator is resumed, it will either add a new 949 edge and yield that edge; or return. 950 951 :type edges: list(EdgeI) 952 :param edges: A set of existing edges. The number of edges 953 that should be passed to ``apply()`` is specified by the 954 ``NUM_EDGES`` class variable. 955 :rtype: iter(EdgeI) 956 """ 957 raise NotImplementedError() 958 959 def apply_everywhere(self, chart, grammar): 960 """ 961 Return a generator that will add all edges licensed by 962 this rule, given the edges that are currently in the 963 chart, one at a time. Each time the generator is resumed, 964 it will either add a new edge and yield that edge; or return. 965 966 :rtype: iter(EdgeI) 967 """ 968 raise NotImplementedError() 969 970 971@python_2_unicode_compatible 972class AbstractChartRule(ChartRuleI): 973 """ 974 An abstract base class for chart rules. ``AbstractChartRule`` 975 provides: 976 977 - A default implementation for ``apply``. 978 - A default implementation for ``apply_everywhere``, 979 (Currently, this implementation assumes that ``NUM_EDGES``<=3.) 980 - A default implementation for ``__str__``, which returns a 981 name based on the rule's class name. 982 """ 983 984 # Subclasses must define apply. 985 def apply(self, chart, grammar, *edges): 986 raise NotImplementedError() 987 988 # Default: loop through the given number of edges, and call 989 # self.apply() for each set of edges. 990 def apply_everywhere(self, chart, grammar): 991 if self.NUM_EDGES == 0: 992 for new_edge in self.apply(chart, grammar): 993 yield new_edge 994 995 elif self.NUM_EDGES == 1: 996 for e1 in chart: 997 for new_edge in self.apply(chart, grammar, e1): 998 yield new_edge 999 1000 elif self.NUM_EDGES == 2: 1001 for e1 in chart: 1002 for e2 in chart: 1003 for new_edge in self.apply(chart, grammar, e1, e2): 1004 yield new_edge 1005 1006 elif self.NUM_EDGES == 3: 1007 for e1 in chart: 1008 for e2 in chart: 1009 for e3 in chart: 1010 for new_edge in self.apply(chart, grammar, e1, e2, e3): 1011 yield new_edge 1012 1013 else: 1014 raise AssertionError('NUM_EDGES>3 is not currently supported') 1015 1016 # Default: return a name based on the class name. 1017 def __str__(self): 1018 # Add spaces between InitialCapsWords. 1019 return re.sub('([a-z])([A-Z])', r'\1 \2', self.__class__.__name__) 1020 1021 1022# //////////////////////////////////////////////////////////// 1023# Fundamental Rule 1024# //////////////////////////////////////////////////////////// 1025 1026 1027class FundamentalRule(AbstractChartRule): 1028 """ 1029 A rule that joins two adjacent edges to form a single combined 1030 edge. In particular, this rule specifies that any pair of edges 1031 1032 - ``[A -> alpha \* B beta][i:j]`` 1033 - ``[B -> gamma \*][j:k]`` 1034 1035 licenses the edge: 1036 1037 - ``[A -> alpha B * beta][i:j]`` 1038 """ 1039 1040 NUM_EDGES = 2 1041 1042 def apply(self, chart, grammar, left_edge, right_edge): 1043 # Make sure the rule is applicable. 1044 if not ( 1045 left_edge.is_incomplete() 1046 and right_edge.is_complete() 1047 and left_edge.end() == right_edge.start() 1048 and left_edge.nextsym() == right_edge.lhs() 1049 ): 1050 return 1051 1052 # Construct the new edge. 1053 new_edge = left_edge.move_dot_forward(right_edge.end()) 1054 1055 # Insert it into the chart. 1056 if chart.insert_with_backpointer(new_edge, left_edge, right_edge): 1057 yield new_edge 1058 1059 1060class SingleEdgeFundamentalRule(FundamentalRule): 1061 """ 1062 A rule that joins a given edge with adjacent edges in the chart, 1063 to form combined edges. In particular, this rule specifies that 1064 either of the edges: 1065 1066 - ``[A -> alpha \* B beta][i:j]`` 1067 - ``[B -> gamma \*][j:k]`` 1068 1069 licenses the edge: 1070 1071 - ``[A -> alpha B * beta][i:j]`` 1072 1073 if the other edge is already in the chart. 1074 1075 :note: This is basically ``FundamentalRule``, with one edge left 1076 unspecified. 1077 """ 1078 1079 NUM_EDGES = 1 1080 1081 def apply(self, chart, grammar, edge): 1082 if edge.is_incomplete(): 1083 for new_edge in self._apply_incomplete(chart, grammar, edge): 1084 yield new_edge 1085 else: 1086 for new_edge in self._apply_complete(chart, grammar, edge): 1087 yield new_edge 1088 1089 def _apply_complete(self, chart, grammar, right_edge): 1090 for left_edge in chart.select( 1091 end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs() 1092 ): 1093 new_edge = left_edge.move_dot_forward(right_edge.end()) 1094 if chart.insert_with_backpointer(new_edge, left_edge, right_edge): 1095 yield new_edge 1096 1097 def _apply_incomplete(self, chart, grammar, left_edge): 1098 for right_edge in chart.select( 1099 start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym() 1100 ): 1101 new_edge = left_edge.move_dot_forward(right_edge.end()) 1102 if chart.insert_with_backpointer(new_edge, left_edge, right_edge): 1103 yield new_edge 1104 1105 1106# //////////////////////////////////////////////////////////// 1107# Inserting Terminal Leafs 1108# //////////////////////////////////////////////////////////// 1109 1110 1111class LeafInitRule(AbstractChartRule): 1112 NUM_EDGES = 0 1113 1114 def apply(self, chart, grammar): 1115 for index in range(chart.num_leaves()): 1116 new_edge = LeafEdge(chart.leaf(index), index) 1117 if chart.insert(new_edge, ()): 1118 yield new_edge 1119 1120 1121# //////////////////////////////////////////////////////////// 1122# Top-Down Prediction 1123# //////////////////////////////////////////////////////////// 1124 1125 1126class TopDownInitRule(AbstractChartRule): 1127 """ 1128 A rule licensing edges corresponding to the grammar productions for 1129 the grammar's start symbol. In particular, this rule specifies that 1130 ``[S -> \* alpha][0:i]`` is licensed for each grammar production 1131 ``S -> alpha``, where ``S`` is the grammar's start symbol. 1132 """ 1133 1134 NUM_EDGES = 0 1135 1136 def apply(self, chart, grammar): 1137 for prod in grammar.productions(lhs=grammar.start()): 1138 new_edge = TreeEdge.from_production(prod, 0) 1139 if chart.insert(new_edge, ()): 1140 yield new_edge 1141 1142 1143class TopDownPredictRule(AbstractChartRule): 1144 """ 1145 A rule licensing edges corresponding to the grammar productions 1146 for the nonterminal following an incomplete edge's dot. In 1147 particular, this rule specifies that 1148 ``[A -> alpha \* B beta][i:j]`` licenses the edge 1149 ``[B -> \* gamma][j:j]`` for each grammar production ``B -> gamma``. 1150 1151 :note: This rule corresponds to the Predictor Rule in Earley parsing. 1152 """ 1153 1154 NUM_EDGES = 1 1155 1156 def apply(self, chart, grammar, edge): 1157 if edge.is_complete(): 1158 return 1159 for prod in grammar.productions(lhs=edge.nextsym()): 1160 new_edge = TreeEdge.from_production(prod, edge.end()) 1161 if chart.insert(new_edge, ()): 1162 yield new_edge 1163 1164 1165class CachedTopDownPredictRule(TopDownPredictRule): 1166 """ 1167 A cached version of ``TopDownPredictRule``. After the first time 1168 this rule is applied to an edge with a given ``end`` and ``next``, 1169 it will not generate any more edges for edges with that ``end`` and 1170 ``next``. 1171 1172 If ``chart`` or ``grammar`` are changed, then the cache is flushed. 1173 """ 1174 1175 def __init__(self): 1176 TopDownPredictRule.__init__(self) 1177 self._done = {} 1178 1179 def apply(self, chart, grammar, edge): 1180 if edge.is_complete(): 1181 return 1182 nextsym, index = edge.nextsym(), edge.end() 1183 if not is_nonterminal(nextsym): 1184 return 1185 1186 # If we've already applied this rule to an edge with the same 1187 # next & end, and the chart & grammar have not changed, then 1188 # just return (no new edges to add). 1189 done = self._done.get((nextsym, index), (None, None)) 1190 if done[0] is chart and done[1] is grammar: 1191 return 1192 1193 # Add all the edges indicated by the top down expand rule. 1194 for prod in grammar.productions(lhs=nextsym): 1195 # If the left corner in the predicted production is 1196 # leaf, it must match with the input. 1197 if prod.rhs(): 1198 first = prod.rhs()[0] 1199 if is_terminal(first): 1200 if index >= chart.num_leaves() or first != chart.leaf(index): 1201 continue 1202 1203 new_edge = TreeEdge.from_production(prod, index) 1204 if chart.insert(new_edge, ()): 1205 yield new_edge 1206 1207 # Record the fact that we've applied this rule. 1208 self._done[nextsym, index] = (chart, grammar) 1209 1210 1211# //////////////////////////////////////////////////////////// 1212# Bottom-Up Prediction 1213# //////////////////////////////////////////////////////////// 1214 1215 1216class BottomUpPredictRule(AbstractChartRule): 1217 """ 1218 A rule licensing any edge corresponding to a production whose 1219 right-hand side begins with a complete edge's left-hand side. In 1220 particular, this rule specifies that ``[A -> alpha \*]`` licenses 1221 the edge ``[B -> \* A beta]`` for each grammar production ``B -> A beta``. 1222 """ 1223 1224 NUM_EDGES = 1 1225 1226 def apply(self, chart, grammar, edge): 1227 if edge.is_incomplete(): 1228 return 1229 for prod in grammar.productions(rhs=edge.lhs()): 1230 new_edge = TreeEdge.from_production(prod, edge.start()) 1231 if chart.insert(new_edge, ()): 1232 yield new_edge 1233 1234 1235class BottomUpPredictCombineRule(BottomUpPredictRule): 1236 """ 1237 A rule licensing any edge corresponding to a production whose 1238 right-hand side begins with a complete edge's left-hand side. In 1239 particular, this rule specifies that ``[A -> alpha \*]`` 1240 licenses the edge ``[B -> A \* beta]`` for each grammar 1241 production ``B -> A beta``. 1242 1243 :note: This is like ``BottomUpPredictRule``, but it also applies 1244 the ``FundamentalRule`` to the resulting edge. 1245 """ 1246 1247 NUM_EDGES = 1 1248 1249 def apply(self, chart, grammar, edge): 1250 if edge.is_incomplete(): 1251 return 1252 for prod in grammar.productions(rhs=edge.lhs()): 1253 new_edge = TreeEdge(edge.span(), prod.lhs(), prod.rhs(), 1) 1254 if chart.insert(new_edge, (edge,)): 1255 yield new_edge 1256 1257 1258class EmptyPredictRule(AbstractChartRule): 1259 """ 1260 A rule that inserts all empty productions as passive edges, 1261 in every position in the chart. 1262 """ 1263 1264 NUM_EDGES = 0 1265 1266 def apply(self, chart, grammar): 1267 for prod in grammar.productions(empty=True): 1268 for index in range(chart.num_leaves() + 1): 1269 new_edge = TreeEdge.from_production(prod, index) 1270 if chart.insert(new_edge, ()): 1271 yield new_edge 1272 1273 1274######################################################################## 1275## Filtered Bottom Up 1276######################################################################## 1277 1278 1279class FilteredSingleEdgeFundamentalRule(SingleEdgeFundamentalRule): 1280 def _apply_complete(self, chart, grammar, right_edge): 1281 end = right_edge.end() 1282 nexttoken = end < chart.num_leaves() and chart.leaf(end) 1283 for left_edge in chart.select( 1284 end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs() 1285 ): 1286 if _bottomup_filter(grammar, nexttoken, left_edge.rhs(), left_edge.dot()): 1287 new_edge = left_edge.move_dot_forward(right_edge.end()) 1288 if chart.insert_with_backpointer(new_edge, left_edge, right_edge): 1289 yield new_edge 1290 1291 def _apply_incomplete(self, chart, grammar, left_edge): 1292 for right_edge in chart.select( 1293 start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym() 1294 ): 1295 end = right_edge.end() 1296 nexttoken = end < chart.num_leaves() and chart.leaf(end) 1297 if _bottomup_filter(grammar, nexttoken, left_edge.rhs(), left_edge.dot()): 1298 new_edge = left_edge.move_dot_forward(right_edge.end()) 1299 if chart.insert_with_backpointer(new_edge, left_edge, right_edge): 1300 yield new_edge 1301 1302 1303class FilteredBottomUpPredictCombineRule(BottomUpPredictCombineRule): 1304 def apply(self, chart, grammar, edge): 1305 if edge.is_incomplete(): 1306 return 1307 1308 end = edge.end() 1309 nexttoken = end < chart.num_leaves() and chart.leaf(end) 1310 for prod in grammar.productions(rhs=edge.lhs()): 1311 if _bottomup_filter(grammar, nexttoken, prod.rhs()): 1312 new_edge = TreeEdge(edge.span(), prod.lhs(), prod.rhs(), 1) 1313 if chart.insert(new_edge, (edge,)): 1314 yield new_edge 1315 1316 1317def _bottomup_filter(grammar, nexttoken, rhs, dot=0): 1318 if len(rhs) <= dot + 1: 1319 return True 1320 _next = rhs[dot + 1] 1321 if is_terminal(_next): 1322 return nexttoken == _next 1323 else: 1324 return grammar.is_leftcorner(_next, nexttoken) 1325 1326 1327######################################################################## 1328## Generic Chart Parser 1329######################################################################## 1330 1331TD_STRATEGY = [ 1332 LeafInitRule(), 1333 TopDownInitRule(), 1334 CachedTopDownPredictRule(), 1335 SingleEdgeFundamentalRule(), 1336] 1337BU_STRATEGY = [ 1338 LeafInitRule(), 1339 EmptyPredictRule(), 1340 BottomUpPredictRule(), 1341 SingleEdgeFundamentalRule(), 1342] 1343BU_LC_STRATEGY = [ 1344 LeafInitRule(), 1345 EmptyPredictRule(), 1346 BottomUpPredictCombineRule(), 1347 SingleEdgeFundamentalRule(), 1348] 1349 1350LC_STRATEGY = [ 1351 LeafInitRule(), 1352 FilteredBottomUpPredictCombineRule(), 1353 FilteredSingleEdgeFundamentalRule(), 1354] 1355 1356 1357class ChartParser(ParserI): 1358 """ 1359 A generic chart parser. A "strategy", or list of 1360 ``ChartRuleI`` instances, is used to decide what edges to add to 1361 the chart. In particular, ``ChartParser`` uses the following 1362 algorithm to parse texts: 1363 1364 | Until no new edges are added: 1365 | For each *rule* in *strategy*: 1366 | Apply *rule* to any applicable edges in the chart. 1367 | Return any complete parses in the chart 1368 """ 1369 1370 def __init__( 1371 self, 1372 grammar, 1373 strategy=BU_LC_STRATEGY, 1374 trace=0, 1375 trace_chart_width=50, 1376 use_agenda=True, 1377 chart_class=Chart, 1378 ): 1379 """ 1380 Create a new chart parser, that uses ``grammar`` to parse 1381 texts. 1382 1383 :type grammar: CFG 1384 :param grammar: The grammar used to parse texts. 1385 :type strategy: list(ChartRuleI) 1386 :param strategy: A list of rules that should be used to decide 1387 what edges to add to the chart (top-down strategy by default). 1388 :type trace: int 1389 :param trace: The level of tracing that should be used when 1390 parsing a text. ``0`` will generate no tracing output; 1391 and higher numbers will produce more verbose tracing 1392 output. 1393 :type trace_chart_width: int 1394 :param trace_chart_width: The default total width reserved for 1395 the chart in trace output. The remainder of each line will 1396 be used to display edges. 1397 :type use_agenda: bool 1398 :param use_agenda: Use an optimized agenda-based algorithm, 1399 if possible. 1400 :param chart_class: The class that should be used to create 1401 the parse charts. 1402 """ 1403 self._grammar = grammar 1404 self._strategy = strategy 1405 self._trace = trace 1406 self._trace_chart_width = trace_chart_width 1407 # If the strategy only consists of axioms (NUM_EDGES==0) and 1408 # inference rules (NUM_EDGES==1), we can use an agenda-based algorithm: 1409 self._use_agenda = use_agenda 1410 self._chart_class = chart_class 1411 1412 self._axioms = [] 1413 self._inference_rules = [] 1414 for rule in strategy: 1415 if rule.NUM_EDGES == 0: 1416 self._axioms.append(rule) 1417 elif rule.NUM_EDGES == 1: 1418 self._inference_rules.append(rule) 1419 else: 1420 self._use_agenda = False 1421 1422 def grammar(self): 1423 return self._grammar 1424 1425 def _trace_new_edges(self, chart, rule, new_edges, trace, edge_width): 1426 if not trace: 1427 return 1428 print_rule_header = trace > 1 1429 for edge in new_edges: 1430 if print_rule_header: 1431 print('%s:' % rule) 1432 print_rule_header = False 1433 print(chart.pretty_format_edge(edge, edge_width)) 1434 1435 def chart_parse(self, tokens, trace=None): 1436 """ 1437 Return the final parse ``Chart`` from which all possible 1438 parse trees can be extracted. 1439 1440 :param tokens: The sentence to be parsed 1441 :type tokens: list(str) 1442 :rtype: Chart 1443 """ 1444 if trace is None: 1445 trace = self._trace 1446 trace_new_edges = self._trace_new_edges 1447 1448 tokens = list(tokens) 1449 self._grammar.check_coverage(tokens) 1450 chart = self._chart_class(tokens) 1451 grammar = self._grammar 1452 1453 # Width, for printing trace edges. 1454 trace_edge_width = self._trace_chart_width // (chart.num_leaves() + 1) 1455 if trace: 1456 print(chart.pretty_format_leaves(trace_edge_width)) 1457 1458 if self._use_agenda: 1459 # Use an agenda-based algorithm. 1460 for axiom in self._axioms: 1461 new_edges = list(axiom.apply(chart, grammar)) 1462 trace_new_edges(chart, axiom, new_edges, trace, trace_edge_width) 1463 1464 inference_rules = self._inference_rules 1465 agenda = chart.edges() 1466 # We reverse the initial agenda, since it is a stack 1467 # but chart.edges() functions as a queue. 1468 agenda.reverse() 1469 while agenda: 1470 edge = agenda.pop() 1471 for rule in inference_rules: 1472 new_edges = list(rule.apply(chart, grammar, edge)) 1473 if trace: 1474 trace_new_edges(chart, rule, new_edges, trace, trace_edge_width) 1475 agenda += new_edges 1476 1477 else: 1478 # Do not use an agenda-based algorithm. 1479 edges_added = True 1480 while edges_added: 1481 edges_added = False 1482 for rule in self._strategy: 1483 new_edges = list(rule.apply_everywhere(chart, grammar)) 1484 edges_added = len(new_edges) 1485 trace_new_edges(chart, rule, new_edges, trace, trace_edge_width) 1486 1487 # Return the final chart. 1488 return chart 1489 1490 def parse(self, tokens, tree_class=Tree): 1491 chart = self.chart_parse(tokens) 1492 return iter(chart.parses(self._grammar.start(), tree_class=tree_class)) 1493 1494 1495class TopDownChartParser(ChartParser): 1496 """ 1497 A ``ChartParser`` using a top-down parsing strategy. 1498 See ``ChartParser`` for more information. 1499 """ 1500 1501 def __init__(self, grammar, **parser_args): 1502 ChartParser.__init__(self, grammar, TD_STRATEGY, **parser_args) 1503 1504 1505class BottomUpChartParser(ChartParser): 1506 """ 1507 A ``ChartParser`` using a bottom-up parsing strategy. 1508 See ``ChartParser`` for more information. 1509 """ 1510 1511 def __init__(self, grammar, **parser_args): 1512 if isinstance(grammar, PCFG): 1513 warnings.warn( 1514 "BottomUpChartParser only works for CFG, " 1515 "use BottomUpProbabilisticChartParser instead", 1516 category=DeprecationWarning, 1517 ) 1518 ChartParser.__init__(self, grammar, BU_STRATEGY, **parser_args) 1519 1520 1521class BottomUpLeftCornerChartParser(ChartParser): 1522 """ 1523 A ``ChartParser`` using a bottom-up left-corner parsing strategy. 1524 This strategy is often more efficient than standard bottom-up. 1525 See ``ChartParser`` for more information. 1526 """ 1527 1528 def __init__(self, grammar, **parser_args): 1529 ChartParser.__init__(self, grammar, BU_LC_STRATEGY, **parser_args) 1530 1531 1532class LeftCornerChartParser(ChartParser): 1533 def __init__(self, grammar, **parser_args): 1534 if not grammar.is_nonempty(): 1535 raise ValueError( 1536 "LeftCornerParser only works for grammars " "without empty productions." 1537 ) 1538 ChartParser.__init__(self, grammar, LC_STRATEGY, **parser_args) 1539 1540 1541######################################################################## 1542## Stepping Chart Parser 1543######################################################################## 1544 1545 1546class SteppingChartParser(ChartParser): 1547 """ 1548 A ``ChartParser`` that allows you to step through the parsing 1549 process, adding a single edge at a time. It also allows you to 1550 change the parser's strategy or grammar midway through parsing a 1551 text. 1552 1553 The ``initialize`` method is used to start parsing a text. ``step`` 1554 adds a single edge to the chart. ``set_strategy`` changes the 1555 strategy used by the chart parser. ``parses`` returns the set of 1556 parses that has been found by the chart parser. 1557 1558 :ivar _restart: Records whether the parser's strategy, grammar, 1559 or chart has been changed. If so, then ``step`` must restart 1560 the parsing algorithm. 1561 """ 1562 1563 def __init__(self, grammar, strategy=[], trace=0): 1564 self._chart = None 1565 self._current_chartrule = None 1566 self._restart = False 1567 ChartParser.__init__(self, grammar, strategy, trace) 1568 1569 # //////////////////////////////////////////////////////////// 1570 # Initialization 1571 # //////////////////////////////////////////////////////////// 1572 1573 def initialize(self, tokens): 1574 "Begin parsing the given tokens." 1575 self._chart = Chart(list(tokens)) 1576 self._restart = True 1577 1578 # //////////////////////////////////////////////////////////// 1579 # Stepping 1580 # //////////////////////////////////////////////////////////// 1581 1582 def step(self): 1583 """ 1584 Return a generator that adds edges to the chart, one at a 1585 time. Each time the generator is resumed, it adds a single 1586 edge and yields that edge. If no more edges can be added, 1587 then it yields None. 1588 1589 If the parser's strategy, grammar, or chart is changed, then 1590 the generator will continue adding edges using the new 1591 strategy, grammar, or chart. 1592 1593 Note that this generator never terminates, since the grammar 1594 or strategy might be changed to values that would add new 1595 edges. Instead, it yields None when no more edges can be 1596 added with the current strategy and grammar. 1597 """ 1598 if self._chart is None: 1599 raise ValueError('Parser must be initialized first') 1600 while True: 1601 self._restart = False 1602 w = 50 // (self._chart.num_leaves() + 1) 1603 1604 for e in self._parse(): 1605 if self._trace > 1: 1606 print(self._current_chartrule) 1607 if self._trace > 0: 1608 print(self._chart.pretty_format_edge(e, w)) 1609 yield e 1610 if self._restart: 1611 break 1612 else: 1613 yield None # No more edges. 1614 1615 def _parse(self): 1616 """ 1617 A generator that implements the actual parsing algorithm. 1618 ``step`` iterates through this generator, and restarts it 1619 whenever the parser's strategy, grammar, or chart is modified. 1620 """ 1621 chart = self._chart 1622 grammar = self._grammar 1623 edges_added = 1 1624 while edges_added > 0: 1625 edges_added = 0 1626 for rule in self._strategy: 1627 self._current_chartrule = rule 1628 for e in rule.apply_everywhere(chart, grammar): 1629 edges_added += 1 1630 yield e 1631 1632 # //////////////////////////////////////////////////////////// 1633 # Accessors 1634 # //////////////////////////////////////////////////////////// 1635 1636 def strategy(self): 1637 "Return the strategy used by this parser." 1638 return self._strategy 1639 1640 def grammar(self): 1641 "Return the grammar used by this parser." 1642 return self._grammar 1643 1644 def chart(self): 1645 "Return the chart that is used by this parser." 1646 return self._chart 1647 1648 def current_chartrule(self): 1649 "Return the chart rule used to generate the most recent edge." 1650 return self._current_chartrule 1651 1652 def parses(self, tree_class=Tree): 1653 "Return the parse trees currently contained in the chart." 1654 return self._chart.parses(self._grammar.start(), tree_class) 1655 1656 # //////////////////////////////////////////////////////////// 1657 # Parser modification 1658 # //////////////////////////////////////////////////////////// 1659 1660 def set_strategy(self, strategy): 1661 """ 1662 Change the strategy that the parser uses to decide which edges 1663 to add to the chart. 1664 1665 :type strategy: list(ChartRuleI) 1666 :param strategy: A list of rules that should be used to decide 1667 what edges to add to the chart. 1668 """ 1669 if strategy == self._strategy: 1670 return 1671 self._strategy = strategy[:] # Make a copy. 1672 self._restart = True 1673 1674 def set_grammar(self, grammar): 1675 "Change the grammar used by the parser." 1676 if grammar is self._grammar: 1677 return 1678 self._grammar = grammar 1679 self._restart = True 1680 1681 def set_chart(self, chart): 1682 "Load a given chart into the chart parser." 1683 if chart is self._chart: 1684 return 1685 self._chart = chart 1686 self._restart = True 1687 1688 # //////////////////////////////////////////////////////////// 1689 # Standard parser methods 1690 # //////////////////////////////////////////////////////////// 1691 1692 def parse(self, tokens, tree_class=Tree): 1693 tokens = list(tokens) 1694 self._grammar.check_coverage(tokens) 1695 1696 # Initialize ourselves. 1697 self.initialize(tokens) 1698 1699 # Step until no more edges are generated. 1700 for e in self.step(): 1701 if e is None: 1702 break 1703 1704 # Return an iterator of complete parses. 1705 return self.parses(tree_class=tree_class) 1706 1707 1708######################################################################## 1709## Demo Code 1710######################################################################## 1711 1712 1713def demo_grammar(): 1714 from nltk.grammar import CFG 1715 1716 return CFG.fromstring( 1717 """ 1718S -> NP VP 1719PP -> "with" NP 1720NP -> NP PP 1721VP -> VP PP 1722VP -> Verb NP 1723VP -> Verb 1724NP -> Det Noun 1725NP -> "John" 1726NP -> "I" 1727Det -> "the" 1728Det -> "my" 1729Det -> "a" 1730Noun -> "dog" 1731Noun -> "cookie" 1732Verb -> "ate" 1733Verb -> "saw" 1734Prep -> "with" 1735Prep -> "under" 1736""" 1737 ) 1738 1739 1740def demo( 1741 choice=None, 1742 print_times=True, 1743 print_grammar=False, 1744 print_trees=True, 1745 trace=2, 1746 sent='I saw John with a dog with my cookie', 1747 numparses=5, 1748): 1749 """ 1750 A demonstration of the chart parsers. 1751 """ 1752 import sys, time 1753 from nltk import nonterminals, Production, CFG 1754 1755 # The grammar for ChartParser and SteppingChartParser: 1756 grammar = demo_grammar() 1757 if print_grammar: 1758 print("* Grammar") 1759 print(grammar) 1760 1761 # Tokenize the sample sentence. 1762 print("* Sentence:") 1763 print(sent) 1764 tokens = sent.split() 1765 print(tokens) 1766 print() 1767 1768 # Ask the user which parser to test, 1769 # if the parser wasn't provided as an argument 1770 if choice is None: 1771 print(' 1: Top-down chart parser') 1772 print(' 2: Bottom-up chart parser') 1773 print(' 3: Bottom-up left-corner chart parser') 1774 print(' 4: Left-corner chart parser with bottom-up filter') 1775 print(' 5: Stepping chart parser (alternating top-down & bottom-up)') 1776 print(' 6: All parsers') 1777 print('\nWhich parser (1-6)? ', end=' ') 1778 choice = sys.stdin.readline().strip() 1779 print() 1780 1781 choice = str(choice) 1782 if choice not in "123456": 1783 print('Bad parser number') 1784 return 1785 1786 # Keep track of how long each parser takes. 1787 times = {} 1788 1789 strategies = { 1790 '1': ('Top-down', TD_STRATEGY), 1791 '2': ('Bottom-up', BU_STRATEGY), 1792 '3': ('Bottom-up left-corner', BU_LC_STRATEGY), 1793 '4': ('Filtered left-corner', LC_STRATEGY), 1794 } 1795 choices = [] 1796 if choice in strategies: 1797 choices = [choice] 1798 if choice == '6': 1799 choices = "1234" 1800 1801 # Run the requested chart parser(s), except the stepping parser. 1802 for strategy in choices: 1803 print("* Strategy: " + strategies[strategy][0]) 1804 print() 1805 cp = ChartParser(grammar, strategies[strategy][1], trace=trace) 1806 t = time.time() 1807 chart = cp.chart_parse(tokens) 1808 parses = list(chart.parses(grammar.start())) 1809 1810 times[strategies[strategy][0]] = time.time() - t 1811 print("Nr edges in chart:", len(chart.edges())) 1812 if numparses: 1813 assert len(parses) == numparses, 'Not all parses found' 1814 if print_trees: 1815 for tree in parses: 1816 print(tree) 1817 else: 1818 print("Nr trees:", len(parses)) 1819 print() 1820 1821 # Run the stepping parser, if requested. 1822 if choice in "56": 1823 print("* Strategy: Stepping (top-down vs bottom-up)") 1824 print() 1825 t = time.time() 1826 cp = SteppingChartParser(grammar, trace=trace) 1827 cp.initialize(tokens) 1828 for i in range(5): 1829 print('*** SWITCH TO TOP DOWN') 1830 cp.set_strategy(TD_STRATEGY) 1831 for j, e in enumerate(cp.step()): 1832 if j > 20 or e is None: 1833 break 1834 print('*** SWITCH TO BOTTOM UP') 1835 cp.set_strategy(BU_STRATEGY) 1836 for j, e in enumerate(cp.step()): 1837 if j > 20 or e is None: 1838 break 1839 times['Stepping'] = time.time() - t 1840 print("Nr edges in chart:", len(cp.chart().edges())) 1841 if numparses: 1842 assert len(list(cp.parses())) == numparses, 'Not all parses found' 1843 if print_trees: 1844 for tree in cp.parses(): 1845 print(tree) 1846 else: 1847 print("Nr trees:", len(list(cp.parses()))) 1848 print() 1849 1850 # Print the times of all parsers: 1851 if not (print_times and times): 1852 return 1853 print("* Parsing times") 1854 print() 1855 maxlen = max(len(key) for key in times) 1856 format = '%' + repr(maxlen) + 's parser: %6.3fsec' 1857 times_items = times.items() 1858 for (parser, t) in sorted(times_items, key=lambda a: a[1]): 1859 print(format % (parser, t)) 1860 1861 1862if __name__ == '__main__': 1863 demo() 1864