1# -*- coding: utf-8 -*- 2# Natural Language Toolkit: Text Trees 3# 4# Copyright (C) 2001-2019 NLTK Project 5# Author: Edward Loper <edloper@gmail.com> 6# Steven Bird <stevenbird1@gmail.com> 7# Peter Ljunglöf <peter.ljunglof@gu.se> 8# Nathan Bodenstab <bodenstab@cslu.ogi.edu> (tree transforms) 9# URL: <http://nltk.org/> 10# For license information, see LICENSE.TXT 11 12""" 13Class for representing hierarchical language structures, such as 14syntax trees and morphological trees. 15""" 16from __future__ import print_function, unicode_literals 17 18import re 19from abc import ABCMeta, abstractmethod 20 21from six import string_types, add_metaclass 22 23from nltk.grammar import Production, Nonterminal 24from nltk.probability import ProbabilisticMixIn 25from nltk.util import slice_bounds 26from nltk.compat import python_2_unicode_compatible, unicode_repr 27from nltk.internals import raise_unorderable_types 28 29# TODO: add LabelledTree (can be used for dependency trees) 30 31###################################################################### 32## Trees 33###################################################################### 34 35 36@python_2_unicode_compatible 37class Tree(list): 38 """ 39 A Tree represents a hierarchical grouping of leaves and subtrees. 40 For example, each constituent in a syntax tree is represented by a single Tree. 41 42 A tree's children are encoded as a list of leaves and subtrees, 43 where a leaf is a basic (non-tree) value; and a subtree is a 44 nested Tree. 45 46 >>> from nltk.tree import Tree 47 >>> print(Tree(1, [2, Tree(3, [4]), 5])) 48 (1 2 (3 4) 5) 49 >>> vp = Tree('VP', [Tree('V', ['saw']), 50 ... Tree('NP', ['him'])]) 51 >>> s = Tree('S', [Tree('NP', ['I']), vp]) 52 >>> print(s) 53 (S (NP I) (VP (V saw) (NP him))) 54 >>> print(s[1]) 55 (VP (V saw) (NP him)) 56 >>> print(s[1,1]) 57 (NP him) 58 >>> t = Tree.fromstring("(S (NP I) (VP (V saw) (NP him)))") 59 >>> s == t 60 True 61 >>> t[1][1].set_label('X') 62 >>> t[1][1].label() 63 'X' 64 >>> print(t) 65 (S (NP I) (VP (V saw) (X him))) 66 >>> t[0], t[1,1] = t[1,1], t[0] 67 >>> print(t) 68 (S (X him) (VP (V saw) (NP I))) 69 70 The length of a tree is the number of children it has. 71 72 >>> len(t) 73 2 74 75 The set_label() and label() methods allow individual constituents 76 to be labeled. For example, syntax trees use this label to specify 77 phrase tags, such as "NP" and "VP". 78 79 Several Tree methods use "tree positions" to specify 80 children or descendants of a tree. Tree positions are defined as 81 follows: 82 83 - The tree position *i* specifies a Tree's *i*\ th child. 84 - The tree position ``()`` specifies the Tree itself. 85 - If *p* is the tree position of descendant *d*, then 86 *p+i* specifies the *i*\ th child of *d*. 87 88 I.e., every tree position is either a single index *i*, 89 specifying ``tree[i]``; or a sequence *i1, i2, ..., iN*, 90 specifying ``tree[i1][i2]...[iN]``. 91 92 Construct a new tree. This constructor can be called in one 93 of two ways: 94 95 - ``Tree(label, children)`` constructs a new tree with the 96 specified label and list of children. 97 98 - ``Tree.fromstring(s)`` constructs a new tree by parsing the string ``s``. 99 """ 100 101 def __init__(self, node, children=None): 102 if children is None: 103 raise TypeError( 104 "%s: Expected a node value and child list " % type(self).__name__ 105 ) 106 elif isinstance(children, string_types): 107 raise TypeError( 108 "%s() argument 2 should be a list, not a " 109 "string" % type(self).__name__ 110 ) 111 else: 112 list.__init__(self, children) 113 self._label = node 114 115 # //////////////////////////////////////////////////////////// 116 # Comparison operators 117 # //////////////////////////////////////////////////////////// 118 119 def __eq__(self, other): 120 return self.__class__ is other.__class__ and (self._label, list(self)) == ( 121 other._label, 122 list(other), 123 ) 124 125 def __lt__(self, other): 126 if not isinstance(other, Tree): 127 # raise_unorderable_types("<", self, other) 128 # Sometimes children can be pure strings, 129 # so we need to be able to compare with non-trees: 130 return self.__class__.__name__ < other.__class__.__name__ 131 elif self.__class__ is other.__class__: 132 return (self._label, list(self)) < (other._label, list(other)) 133 else: 134 return self.__class__.__name__ < other.__class__.__name__ 135 136 # @total_ordering doesn't work here, since the class inherits from a builtin class 137 __ne__ = lambda self, other: not self == other 138 __gt__ = lambda self, other: not (self < other or self == other) 139 __le__ = lambda self, other: self < other or self == other 140 __ge__ = lambda self, other: not self < other 141 142 # //////////////////////////////////////////////////////////// 143 # Disabled list operations 144 # //////////////////////////////////////////////////////////// 145 146 def __mul__(self, v): 147 raise TypeError('Tree does not support multiplication') 148 149 def __rmul__(self, v): 150 raise TypeError('Tree does not support multiplication') 151 152 def __add__(self, v): 153 raise TypeError('Tree does not support addition') 154 155 def __radd__(self, v): 156 raise TypeError('Tree does not support addition') 157 158 # //////////////////////////////////////////////////////////// 159 # Indexing (with support for tree positions) 160 # //////////////////////////////////////////////////////////// 161 162 def __getitem__(self, index): 163 if isinstance(index, (int, slice)): 164 return list.__getitem__(self, index) 165 elif isinstance(index, (list, tuple)): 166 if len(index) == 0: 167 return self 168 elif len(index) == 1: 169 return self[index[0]] 170 else: 171 return self[index[0]][index[1:]] 172 else: 173 raise TypeError( 174 "%s indices must be integers, not %s" 175 % (type(self).__name__, type(index).__name__) 176 ) 177 178 def __setitem__(self, index, value): 179 if isinstance(index, (int, slice)): 180 return list.__setitem__(self, index, value) 181 elif isinstance(index, (list, tuple)): 182 if len(index) == 0: 183 raise IndexError('The tree position () may not be ' 'assigned to.') 184 elif len(index) == 1: 185 self[index[0]] = value 186 else: 187 self[index[0]][index[1:]] = value 188 else: 189 raise TypeError( 190 "%s indices must be integers, not %s" 191 % (type(self).__name__, type(index).__name__) 192 ) 193 194 def __delitem__(self, index): 195 if isinstance(index, (int, slice)): 196 return list.__delitem__(self, index) 197 elif isinstance(index, (list, tuple)): 198 if len(index) == 0: 199 raise IndexError('The tree position () may not be deleted.') 200 elif len(index) == 1: 201 del self[index[0]] 202 else: 203 del self[index[0]][index[1:]] 204 else: 205 raise TypeError( 206 "%s indices must be integers, not %s" 207 % (type(self).__name__, type(index).__name__) 208 ) 209 210 # //////////////////////////////////////////////////////////// 211 # Basic tree operations 212 # //////////////////////////////////////////////////////////// 213 214 def _get_node(self): 215 """Outdated method to access the node value; use the label() method instead.""" 216 raise NotImplementedError("Use label() to access a node label.") 217 218 def _set_node(self, value): 219 """Outdated method to set the node value; use the set_label() method instead.""" 220 raise NotImplementedError("Use set_label() method to set a node label.") 221 222 node = property(_get_node, _set_node) 223 224 def label(self): 225 """ 226 Return the node label of the tree. 227 228 >>> t = Tree.fromstring('(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))') 229 >>> t.label() 230 'S' 231 232 :return: the node label (typically a string) 233 :rtype: any 234 """ 235 return self._label 236 237 def set_label(self, label): 238 """ 239 Set the node label of the tree. 240 241 >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") 242 >>> t.set_label("T") 243 >>> print(t) 244 (T (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat)))) 245 246 :param label: the node label (typically a string) 247 :type label: any 248 """ 249 self._label = label 250 251 def leaves(self): 252 """ 253 Return the leaves of the tree. 254 255 >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") 256 >>> t.leaves() 257 ['the', 'dog', 'chased', 'the', 'cat'] 258 259 :return: a list containing this tree's leaves. 260 The order reflects the order of the 261 leaves in the tree's hierarchical structure. 262 :rtype: list 263 """ 264 leaves = [] 265 for child in self: 266 if isinstance(child, Tree): 267 leaves.extend(child.leaves()) 268 else: 269 leaves.append(child) 270 return leaves 271 272 def flatten(self): 273 """ 274 Return a flat version of the tree, with all non-root non-terminals removed. 275 276 >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") 277 >>> print(t.flatten()) 278 (S the dog chased the cat) 279 280 :return: a tree consisting of this tree's root connected directly to 281 its leaves, omitting all intervening non-terminal nodes. 282 :rtype: Tree 283 """ 284 return Tree(self.label(), self.leaves()) 285 286 def height(self): 287 """ 288 Return the height of the tree. 289 290 >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") 291 >>> t.height() 292 5 293 >>> print(t[0,0]) 294 (D the) 295 >>> t[0,0].height() 296 2 297 298 :return: The height of this tree. The height of a tree 299 containing no children is 1; the height of a tree 300 containing only leaves is 2; and the height of any other 301 tree is one plus the maximum of its children's 302 heights. 303 :rtype: int 304 """ 305 max_child_height = 0 306 for child in self: 307 if isinstance(child, Tree): 308 max_child_height = max(max_child_height, child.height()) 309 else: 310 max_child_height = max(max_child_height, 1) 311 return 1 + max_child_height 312 313 def treepositions(self, order='preorder'): 314 """ 315 >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") 316 >>> t.treepositions() # doctest: +ELLIPSIS 317 [(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1,), (1, 0), (1, 0, 0), ...] 318 >>> for pos in t.treepositions('leaves'): 319 ... t[pos] = t[pos][::-1].upper() 320 >>> print(t) 321 (S (NP (D EHT) (N GOD)) (VP (V DESAHC) (NP (D EHT) (N TAC)))) 322 323 :param order: One of: ``preorder``, ``postorder``, ``bothorder``, 324 ``leaves``. 325 """ 326 positions = [] 327 if order in ('preorder', 'bothorder'): 328 positions.append(()) 329 for i, child in enumerate(self): 330 if isinstance(child, Tree): 331 childpos = child.treepositions(order) 332 positions.extend((i,) + p for p in childpos) 333 else: 334 positions.append((i,)) 335 if order in ('postorder', 'bothorder'): 336 positions.append(()) 337 return positions 338 339 def subtrees(self, filter=None): 340 """ 341 Generate all the subtrees of this tree, optionally restricted 342 to trees matching the filter function. 343 344 >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") 345 >>> for s in t.subtrees(lambda t: t.height() == 2): 346 ... print(s) 347 (D the) 348 (N dog) 349 (V chased) 350 (D the) 351 (N cat) 352 353 :type filter: function 354 :param filter: the function to filter all local trees 355 """ 356 if not filter or filter(self): 357 yield self 358 for child in self: 359 if isinstance(child, Tree): 360 for subtree in child.subtrees(filter): 361 yield subtree 362 363 def productions(self): 364 """ 365 Generate the productions that correspond to the non-terminal nodes of the tree. 366 For each subtree of the form (P: C1 C2 ... Cn) this produces a production of the 367 form P -> C1 C2 ... Cn. 368 369 >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") 370 >>> t.productions() 371 [S -> NP VP, NP -> D N, D -> 'the', N -> 'dog', VP -> V NP, V -> 'chased', 372 NP -> D N, D -> 'the', N -> 'cat'] 373 374 :rtype: list(Production) 375 """ 376 377 if not isinstance(self._label, string_types): 378 raise TypeError( 379 'Productions can only be generated from trees having node labels that are strings' 380 ) 381 382 prods = [Production(Nonterminal(self._label), _child_names(self))] 383 for child in self: 384 if isinstance(child, Tree): 385 prods += child.productions() 386 return prods 387 388 def pos(self): 389 """ 390 Return a sequence of pos-tagged words extracted from the tree. 391 392 >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") 393 >>> t.pos() 394 [('the', 'D'), ('dog', 'N'), ('chased', 'V'), ('the', 'D'), ('cat', 'N')] 395 396 :return: a list of tuples containing leaves and pre-terminals (part-of-speech tags). 397 The order reflects the order of the leaves in the tree's hierarchical structure. 398 :rtype: list(tuple) 399 """ 400 pos = [] 401 for child in self: 402 if isinstance(child, Tree): 403 pos.extend(child.pos()) 404 else: 405 pos.append((child, self._label)) 406 return pos 407 408 def leaf_treeposition(self, index): 409 """ 410 :return: The tree position of the ``index``-th leaf in this 411 tree. I.e., if ``tp=self.leaf_treeposition(i)``, then 412 ``self[tp]==self.leaves()[i]``. 413 414 :raise IndexError: If this tree contains fewer than ``index+1`` 415 leaves, or if ``index<0``. 416 """ 417 if index < 0: 418 raise IndexError('index must be non-negative') 419 420 stack = [(self, ())] 421 while stack: 422 value, treepos = stack.pop() 423 if not isinstance(value, Tree): 424 if index == 0: 425 return treepos 426 else: 427 index -= 1 428 else: 429 for i in range(len(value) - 1, -1, -1): 430 stack.append((value[i], treepos + (i,))) 431 432 raise IndexError('index must be less than or equal to len(self)') 433 434 def treeposition_spanning_leaves(self, start, end): 435 """ 436 :return: The tree position of the lowest descendant of this 437 tree that dominates ``self.leaves()[start:end]``. 438 :raise ValueError: if ``end <= start`` 439 """ 440 if end <= start: 441 raise ValueError('end must be greater than start') 442 # Find the tree positions of the start & end leaves, and 443 # take the longest common subsequence. 444 start_treepos = self.leaf_treeposition(start) 445 end_treepos = self.leaf_treeposition(end - 1) 446 # Find the first index where they mismatch: 447 for i in range(len(start_treepos)): 448 if i == len(end_treepos) or start_treepos[i] != end_treepos[i]: 449 return start_treepos[:i] 450 return start_treepos 451 452 # //////////////////////////////////////////////////////////// 453 # Transforms 454 # //////////////////////////////////////////////////////////// 455 456 def chomsky_normal_form( 457 self, 458 factor="right", 459 horzMarkov=None, 460 vertMarkov=0, 461 childChar="|", 462 parentChar="^", 463 ): 464 """ 465 This method can modify a tree in three ways: 466 467 1. Convert a tree into its Chomsky Normal Form (CNF) 468 equivalent -- Every subtree has either two non-terminals 469 or one terminal as its children. This process requires 470 the creation of more"artificial" non-terminal nodes. 471 2. Markov (vertical) smoothing of children in new artificial 472 nodes 473 3. Horizontal (parent) annotation of nodes 474 475 :param factor: Right or left factoring method (default = "right") 476 :type factor: str = [left|right] 477 :param horzMarkov: Markov order for sibling smoothing in artificial nodes (None (default) = include all siblings) 478 :type horzMarkov: int | None 479 :param vertMarkov: Markov order for parent smoothing (0 (default) = no vertical annotation) 480 :type vertMarkov: int | None 481 :param childChar: A string used in construction of the artificial nodes, separating the head of the 482 original subtree from the child nodes that have yet to be expanded (default = "|") 483 :type childChar: str 484 :param parentChar: A string used to separate the node representation from its vertical annotation 485 :type parentChar: str 486 """ 487 from nltk.treetransforms import chomsky_normal_form 488 489 chomsky_normal_form(self, factor, horzMarkov, vertMarkov, childChar, parentChar) 490 491 def un_chomsky_normal_form( 492 self, expandUnary=True, childChar="|", parentChar="^", unaryChar="+" 493 ): 494 """ 495 This method modifies the tree in three ways: 496 497 1. Transforms a tree in Chomsky Normal Form back to its 498 original structure (branching greater than two) 499 2. Removes any parent annotation (if it exists) 500 3. (optional) expands unary subtrees (if previously 501 collapsed with collapseUnary(...) ) 502 503 :param expandUnary: Flag to expand unary or not (default = True) 504 :type expandUnary: bool 505 :param childChar: A string separating the head node from its children in an artificial node (default = "|") 506 :type childChar: str 507 :param parentChar: A sting separating the node label from its parent annotation (default = "^") 508 :type parentChar: str 509 :param unaryChar: A string joining two non-terminals in a unary production (default = "+") 510 :type unaryChar: str 511 """ 512 from nltk.treetransforms import un_chomsky_normal_form 513 514 un_chomsky_normal_form(self, expandUnary, childChar, parentChar, unaryChar) 515 516 def collapse_unary(self, collapsePOS=False, collapseRoot=False, joinChar="+"): 517 """ 518 Collapse subtrees with a single child (ie. unary productions) 519 into a new non-terminal (Tree node) joined by 'joinChar'. 520 This is useful when working with algorithms that do not allow 521 unary productions, and completely removing the unary productions 522 would require loss of useful information. The Tree is modified 523 directly (since it is passed by reference) and no value is returned. 524 525 :param collapsePOS: 'False' (default) will not collapse the parent of leaf nodes (ie. 526 Part-of-Speech tags) since they are always unary productions 527 :type collapsePOS: bool 528 :param collapseRoot: 'False' (default) will not modify the root production 529 if it is unary. For the Penn WSJ treebank corpus, this corresponds 530 to the TOP -> productions. 531 :type collapseRoot: bool 532 :param joinChar: A string used to connect collapsed node values (default = "+") 533 :type joinChar: str 534 """ 535 from nltk.treetransforms import collapse_unary 536 537 collapse_unary(self, collapsePOS, collapseRoot, joinChar) 538 539 # //////////////////////////////////////////////////////////// 540 # Convert, copy 541 # //////////////////////////////////////////////////////////// 542 543 @classmethod 544 def convert(cls, tree): 545 """ 546 Convert a tree between different subtypes of Tree. ``cls`` determines 547 which class will be used to encode the new tree. 548 549 :type tree: Tree 550 :param tree: The tree that should be converted. 551 :return: The new Tree. 552 """ 553 if isinstance(tree, Tree): 554 children = [cls.convert(child) for child in tree] 555 return cls(tree._label, children) 556 else: 557 return tree 558 559 def copy(self, deep=False): 560 if not deep: 561 return type(self)(self._label, self) 562 else: 563 return type(self).convert(self) 564 565 def _frozen_class(self): 566 return ImmutableTree 567 568 def freeze(self, leaf_freezer=None): 569 frozen_class = self._frozen_class() 570 if leaf_freezer is None: 571 newcopy = frozen_class.convert(self) 572 else: 573 newcopy = self.copy(deep=True) 574 for pos in newcopy.treepositions('leaves'): 575 newcopy[pos] = leaf_freezer(newcopy[pos]) 576 newcopy = frozen_class.convert(newcopy) 577 hash(newcopy) # Make sure the leaves are hashable. 578 return newcopy 579 580 # //////////////////////////////////////////////////////////// 581 # Parsing 582 # //////////////////////////////////////////////////////////// 583 584 @classmethod 585 def fromstring( 586 cls, 587 s, 588 brackets='()', 589 read_node=None, 590 read_leaf=None, 591 node_pattern=None, 592 leaf_pattern=None, 593 remove_empty_top_bracketing=False, 594 ): 595 """ 596 Read a bracketed tree string and return the resulting tree. 597 Trees are represented as nested brackettings, such as:: 598 599 (S (NP (NNP John)) (VP (V runs))) 600 601 :type s: str 602 :param s: The string to read 603 604 :type brackets: str (length=2) 605 :param brackets: The bracket characters used to mark the 606 beginning and end of trees and subtrees. 607 608 :type read_node: function 609 :type read_leaf: function 610 :param read_node, read_leaf: If specified, these functions 611 are applied to the substrings of ``s`` corresponding to 612 nodes and leaves (respectively) to obtain the values for 613 those nodes and leaves. They should have the following 614 signature: 615 616 read_node(str) -> value 617 618 For example, these functions could be used to process nodes 619 and leaves whose values should be some type other than 620 string (such as ``FeatStruct``). 621 Note that by default, node strings and leaf strings are 622 delimited by whitespace and brackets; to override this 623 default, use the ``node_pattern`` and ``leaf_pattern`` 624 arguments. 625 626 :type node_pattern: str 627 :type leaf_pattern: str 628 :param node_pattern, leaf_pattern: Regular expression patterns 629 used to find node and leaf substrings in ``s``. By 630 default, both nodes patterns are defined to match any 631 sequence of non-whitespace non-bracket characters. 632 633 :type remove_empty_top_bracketing: bool 634 :param remove_empty_top_bracketing: If the resulting tree has 635 an empty node label, and is length one, then return its 636 single child instead. This is useful for treebank trees, 637 which sometimes contain an extra level of bracketing. 638 639 :return: A tree corresponding to the string representation ``s``. 640 If this class method is called using a subclass of Tree, 641 then it will return a tree of that type. 642 :rtype: Tree 643 """ 644 if not isinstance(brackets, string_types) or len(brackets) != 2: 645 raise TypeError('brackets must be a length-2 string') 646 if re.search('\s', brackets): 647 raise TypeError('whitespace brackets not allowed') 648 # Construct a regexp that will tokenize the string. 649 open_b, close_b = brackets 650 open_pattern, close_pattern = (re.escape(open_b), re.escape(close_b)) 651 if node_pattern is None: 652 node_pattern = '[^\s%s%s]+' % (open_pattern, close_pattern) 653 if leaf_pattern is None: 654 leaf_pattern = '[^\s%s%s]+' % (open_pattern, close_pattern) 655 token_re = re.compile( 656 '%s\s*(%s)?|%s|(%s)' 657 % (open_pattern, node_pattern, close_pattern, leaf_pattern) 658 ) 659 # Walk through each token, updating a stack of trees. 660 stack = [(None, [])] # list of (node, children) tuples 661 for match in token_re.finditer(s): 662 token = match.group() 663 # Beginning of a tree/subtree 664 if token[0] == open_b: 665 if len(stack) == 1 and len(stack[0][1]) > 0: 666 cls._parse_error(s, match, 'end-of-string') 667 label = token[1:].lstrip() 668 if read_node is not None: 669 label = read_node(label) 670 stack.append((label, [])) 671 # End of a tree/subtree 672 elif token == close_b: 673 if len(stack) == 1: 674 if len(stack[0][1]) == 0: 675 cls._parse_error(s, match, open_b) 676 else: 677 cls._parse_error(s, match, 'end-of-string') 678 label, children = stack.pop() 679 stack[-1][1].append(cls(label, children)) 680 # Leaf node 681 else: 682 if len(stack) == 1: 683 cls._parse_error(s, match, open_b) 684 if read_leaf is not None: 685 token = read_leaf(token) 686 stack[-1][1].append(token) 687 688 # check that we got exactly one complete tree. 689 if len(stack) > 1: 690 cls._parse_error(s, 'end-of-string', close_b) 691 elif len(stack[0][1]) == 0: 692 cls._parse_error(s, 'end-of-string', open_b) 693 else: 694 assert stack[0][0] is None 695 assert len(stack[0][1]) == 1 696 tree = stack[0][1][0] 697 698 # If the tree has an extra level with node='', then get rid of 699 # it. E.g.: "((S (NP ...) (VP ...)))" 700 if remove_empty_top_bracketing and tree._label == '' and len(tree) == 1: 701 tree = tree[0] 702 # return the tree. 703 return tree 704 705 @classmethod 706 def _parse_error(cls, s, match, expecting): 707 """ 708 Display a friendly error message when parsing a tree string fails. 709 :param s: The string we're parsing. 710 :param match: regexp match of the problem token. 711 :param expecting: what we expected to see instead. 712 """ 713 # Construct a basic error message 714 if match == 'end-of-string': 715 pos, token = len(s), 'end-of-string' 716 else: 717 pos, token = match.start(), match.group() 718 msg = '%s.read(): expected %r but got %r\n%sat index %d.' % ( 719 cls.__name__, 720 expecting, 721 token, 722 ' ' * 12, 723 pos, 724 ) 725 # Add a display showing the error token itsels: 726 s = s.replace('\n', ' ').replace('\t', ' ') 727 offset = pos 728 if len(s) > pos + 10: 729 s = s[: pos + 10] + '...' 730 if pos > 10: 731 s = '...' + s[pos - 10 :] 732 offset = 13 733 msg += '\n%s"%s"\n%s^' % (' ' * 16, s, ' ' * (17 + offset)) 734 raise ValueError(msg) 735 736 # //////////////////////////////////////////////////////////// 737 # Visualization & String Representation 738 # //////////////////////////////////////////////////////////// 739 740 def draw(self): 741 """ 742 Open a new window containing a graphical diagram of this tree. 743 """ 744 from nltk.draw.tree import draw_trees 745 746 draw_trees(self) 747 748 def pretty_print(self, sentence=None, highlight=(), stream=None, **kwargs): 749 """ 750 Pretty-print this tree as ASCII or Unicode art. 751 For explanation of the arguments, see the documentation for 752 `nltk.treeprettyprinter.TreePrettyPrinter`. 753 """ 754 from nltk.treeprettyprinter import TreePrettyPrinter 755 756 print(TreePrettyPrinter(self, sentence, highlight).text(**kwargs), file=stream) 757 758 def __repr__(self): 759 childstr = ", ".join(unicode_repr(c) for c in self) 760 return '%s(%s, [%s])' % ( 761 type(self).__name__, 762 unicode_repr(self._label), 763 childstr, 764 ) 765 766 def _repr_png_(self): 767 """ 768 Draws and outputs in PNG for ipython. 769 PNG is used instead of PDF, since it can be displayed in the qt console and 770 has wider browser support. 771 """ 772 import os 773 import base64 774 import subprocess 775 import tempfile 776 from nltk.draw.tree import tree_to_treesegment 777 from nltk.draw.util import CanvasFrame 778 from nltk.internals import find_binary 779 780 _canvas_frame = CanvasFrame() 781 widget = tree_to_treesegment(_canvas_frame.canvas(), self) 782 _canvas_frame.add_widget(widget) 783 x, y, w, h = widget.bbox() 784 # print_to_file uses scrollregion to set the width and height of the pdf. 785 _canvas_frame.canvas()['scrollregion'] = (0, 0, w, h) 786 with tempfile.NamedTemporaryFile() as file: 787 in_path = '{0:}.ps'.format(file.name) 788 out_path = '{0:}.png'.format(file.name) 789 _canvas_frame.print_to_file(in_path) 790 _canvas_frame.destroy_widget(widget) 791 subprocess.call( 792 [ 793 find_binary( 794 'gs', 795 binary_names=['gswin32c.exe', 'gswin64c.exe'], 796 env_vars=['PATH'], 797 verbose=False, 798 ) 799 ] 800 + '-q -dEPSCrop -sDEVICE=png16m -r90 -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -dSAFER -dBATCH -dNOPAUSE -sOutputFile={0:} {1:}'.format( 801 out_path, in_path 802 ).split() 803 ) 804 with open(out_path, 'rb') as sr: 805 res = sr.read() 806 os.remove(in_path) 807 os.remove(out_path) 808 return base64.b64encode(res).decode() 809 810 def __str__(self): 811 return self.pformat() 812 813 def pprint(self, **kwargs): 814 """ 815 Print a string representation of this Tree to 'stream' 816 """ 817 818 if "stream" in kwargs: 819 stream = kwargs["stream"] 820 del kwargs["stream"] 821 else: 822 stream = None 823 print(self.pformat(**kwargs), file=stream) 824 825 def pformat(self, margin=70, indent=0, nodesep='', parens='()', quotes=False): 826 """ 827 :return: A pretty-printed string representation of this tree. 828 :rtype: str 829 :param margin: The right margin at which to do line-wrapping. 830 :type margin: int 831 :param indent: The indentation level at which printing 832 begins. This number is used to decide how far to indent 833 subsequent lines. 834 :type indent: int 835 :param nodesep: A string that is used to separate the node 836 from the children. E.g., the default value ``':'`` gives 837 trees like ``(S: (NP: I) (VP: (V: saw) (NP: it)))``. 838 """ 839 840 # Try writing it on one line. 841 s = self._pformat_flat(nodesep, parens, quotes) 842 if len(s) + indent < margin: 843 return s 844 845 # If it doesn't fit on one line, then write it on multi-lines. 846 if isinstance(self._label, string_types): 847 s = '%s%s%s' % (parens[0], self._label, nodesep) 848 else: 849 s = '%s%s%s' % (parens[0], unicode_repr(self._label), nodesep) 850 for child in self: 851 if isinstance(child, Tree): 852 s += ( 853 '\n' 854 + ' ' * (indent + 2) 855 + child.pformat(margin, indent + 2, nodesep, parens, quotes) 856 ) 857 elif isinstance(child, tuple): 858 s += '\n' + ' ' * (indent + 2) + "/".join(child) 859 elif isinstance(child, string_types) and not quotes: 860 s += '\n' + ' ' * (indent + 2) + '%s' % child 861 else: 862 s += '\n' + ' ' * (indent + 2) + unicode_repr(child) 863 return s + parens[1] 864 865 def pformat_latex_qtree(self): 866 r""" 867 Returns a representation of the tree compatible with the 868 LaTeX qtree package. This consists of the string ``\Tree`` 869 followed by the tree represented in bracketed notation. 870 871 For example, the following result was generated from a parse tree of 872 the sentence ``The announcement astounded us``:: 873 874 \Tree [.I'' [.N'' [.D The ] [.N' [.N announcement ] ] ] 875 [.I' [.V'' [.V' [.V astounded ] [.N'' [.N' [.N us ] ] ] ] ] ] ] 876 877 See http://www.ling.upenn.edu/advice/latex.html for the LaTeX 878 style file for the qtree package. 879 880 :return: A latex qtree representation of this tree. 881 :rtype: str 882 """ 883 reserved_chars = re.compile('([#\$%&~_\{\}])') 884 885 pformat = self.pformat(indent=6, nodesep='', parens=('[.', ' ]')) 886 return r'\Tree ' + re.sub(reserved_chars, r'\\\1', pformat) 887 888 def _pformat_flat(self, nodesep, parens, quotes): 889 childstrs = [] 890 for child in self: 891 if isinstance(child, Tree): 892 childstrs.append(child._pformat_flat(nodesep, parens, quotes)) 893 elif isinstance(child, tuple): 894 childstrs.append("/".join(child)) 895 elif isinstance(child, string_types) and not quotes: 896 childstrs.append('%s' % child) 897 else: 898 childstrs.append(unicode_repr(child)) 899 if isinstance(self._label, string_types): 900 return '%s%s%s %s%s' % ( 901 parens[0], 902 self._label, 903 nodesep, 904 " ".join(childstrs), 905 parens[1], 906 ) 907 else: 908 return '%s%s%s %s%s' % ( 909 parens[0], 910 unicode_repr(self._label), 911 nodesep, 912 " ".join(childstrs), 913 parens[1], 914 ) 915 916 917class ImmutableTree(Tree): 918 def __init__(self, node, children=None): 919 super(ImmutableTree, self).__init__(node, children) 920 # Precompute our hash value. This ensures that we're really 921 # immutable. It also means we only have to calculate it once. 922 try: 923 self._hash = hash((self._label, tuple(self))) 924 except (TypeError, ValueError): 925 raise ValueError( 926 "%s: node value and children " "must be immutable" % type(self).__name__ 927 ) 928 929 def __setitem__(self, index, value): 930 raise ValueError('%s may not be modified' % type(self).__name__) 931 932 def __setslice__(self, i, j, value): 933 raise ValueError('%s may not be modified' % type(self).__name__) 934 935 def __delitem__(self, index): 936 raise ValueError('%s may not be modified' % type(self).__name__) 937 938 def __delslice__(self, i, j): 939 raise ValueError('%s may not be modified' % type(self).__name__) 940 941 def __iadd__(self, other): 942 raise ValueError('%s may not be modified' % type(self).__name__) 943 944 def __imul__(self, other): 945 raise ValueError('%s may not be modified' % type(self).__name__) 946 947 def append(self, v): 948 raise ValueError('%s may not be modified' % type(self).__name__) 949 950 def extend(self, v): 951 raise ValueError('%s may not be modified' % type(self).__name__) 952 953 def pop(self, v=None): 954 raise ValueError('%s may not be modified' % type(self).__name__) 955 956 def remove(self, v): 957 raise ValueError('%s may not be modified' % type(self).__name__) 958 959 def reverse(self): 960 raise ValueError('%s may not be modified' % type(self).__name__) 961 962 def sort(self): 963 raise ValueError('%s may not be modified' % type(self).__name__) 964 965 def __hash__(self): 966 return self._hash 967 968 def set_label(self, value): 969 """ 970 Set the node label. This will only succeed the first time the 971 node label is set, which should occur in ImmutableTree.__init__(). 972 """ 973 if hasattr(self, '_label'): 974 raise ValueError('%s may not be modified' % type(self).__name__) 975 self._label = value 976 977 978###################################################################### 979## Parented trees 980###################################################################### 981@add_metaclass(ABCMeta) 982class AbstractParentedTree(Tree): 983 """ 984 An abstract base class for a ``Tree`` that automatically maintains 985 pointers to parent nodes. These parent pointers are updated 986 whenever any change is made to a tree's structure. Two subclasses 987 are currently defined: 988 989 - ``ParentedTree`` is used for tree structures where each subtree 990 has at most one parent. This class should be used in cases 991 where there is no"sharing" of subtrees. 992 993 - ``MultiParentedTree`` is used for tree structures where a 994 subtree may have zero or more parents. This class should be 995 used in cases where subtrees may be shared. 996 997 Subclassing 998 =========== 999 The ``AbstractParentedTree`` class redefines all operations that 1000 modify a tree's structure to call two methods, which are used by 1001 subclasses to update parent information: 1002 1003 - ``_setparent()`` is called whenever a new child is added. 1004 - ``_delparent()`` is called whenever a child is removed. 1005 """ 1006 1007 def __init__(self, node, children=None): 1008 super(AbstractParentedTree, self).__init__(node, children) 1009 # If children is None, the tree is read from node, and 1010 # all parents will be set during parsing. 1011 if children is not None: 1012 # Otherwise we have to set the parent of the children. 1013 # Iterate over self, and *not* children, because children 1014 # might be an iterator. 1015 for i, child in enumerate(self): 1016 if isinstance(child, Tree): 1017 self._setparent(child, i, dry_run=True) 1018 for i, child in enumerate(self): 1019 if isinstance(child, Tree): 1020 self._setparent(child, i) 1021 1022 # //////////////////////////////////////////////////////////// 1023 # Parent management 1024 # //////////////////////////////////////////////////////////// 1025 @abstractmethod 1026 def _setparent(self, child, index, dry_run=False): 1027 """ 1028 Update the parent pointer of ``child`` to point to ``self``. This 1029 method is only called if the type of ``child`` is ``Tree``; 1030 i.e., it is not called when adding a leaf to a tree. This method 1031 is always called before the child is actually added to the 1032 child list of ``self``. 1033 1034 :type child: Tree 1035 :type index: int 1036 :param index: The index of ``child`` in ``self``. 1037 :raise TypeError: If ``child`` is a tree with an impropriate 1038 type. Typically, if ``child`` is a tree, then its type needs 1039 to match the type of ``self``. This prevents mixing of 1040 different tree types (single-parented, multi-parented, and 1041 non-parented). 1042 :param dry_run: If true, the don't actually set the child's 1043 parent pointer; just check for any error conditions, and 1044 raise an exception if one is found. 1045 """ 1046 1047 @abstractmethod 1048 def _delparent(self, child, index): 1049 """ 1050 Update the parent pointer of ``child`` to not point to self. This 1051 method is only called if the type of ``child`` is ``Tree``; i.e., it 1052 is not called when removing a leaf from a tree. This method 1053 is always called before the child is actually removed from the 1054 child list of ``self``. 1055 1056 :type child: Tree 1057 :type index: int 1058 :param index: The index of ``child`` in ``self``. 1059 """ 1060 1061 # //////////////////////////////////////////////////////////// 1062 # Methods that add/remove children 1063 # //////////////////////////////////////////////////////////// 1064 # Every method that adds or removes a child must make 1065 # appropriate calls to _setparent() and _delparent(). 1066 1067 def __delitem__(self, index): 1068 # del ptree[start:stop] 1069 if isinstance(index, slice): 1070 start, stop, step = slice_bounds(self, index, allow_step=True) 1071 # Clear all the children pointers. 1072 for i in range(start, stop, step): 1073 if isinstance(self[i], Tree): 1074 self._delparent(self[i], i) 1075 # Delete the children from our child list. 1076 super(AbstractParentedTree, self).__delitem__(index) 1077 1078 # del ptree[i] 1079 elif isinstance(index, int): 1080 if index < 0: 1081 index += len(self) 1082 if index < 0: 1083 raise IndexError('index out of range') 1084 # Clear the child's parent pointer. 1085 if isinstance(self[index], Tree): 1086 self._delparent(self[index], index) 1087 # Remove the child from our child list. 1088 super(AbstractParentedTree, self).__delitem__(index) 1089 1090 elif isinstance(index, (list, tuple)): 1091 # del ptree[()] 1092 if len(index) == 0: 1093 raise IndexError('The tree position () may not be deleted.') 1094 # del ptree[(i,)] 1095 elif len(index) == 1: 1096 del self[index[0]] 1097 # del ptree[i1, i2, i3] 1098 else: 1099 del self[index[0]][index[1:]] 1100 1101 else: 1102 raise TypeError( 1103 "%s indices must be integers, not %s" 1104 % (type(self).__name__, type(index).__name__) 1105 ) 1106 1107 def __setitem__(self, index, value): 1108 # ptree[start:stop] = value 1109 if isinstance(index, slice): 1110 start, stop, step = slice_bounds(self, index, allow_step=True) 1111 # make a copy of value, in case it's an iterator 1112 if not isinstance(value, (list, tuple)): 1113 value = list(value) 1114 # Check for any error conditions, so we can avoid ending 1115 # up in an inconsistent state if an error does occur. 1116 for i, child in enumerate(value): 1117 if isinstance(child, Tree): 1118 self._setparent(child, start + i * step, dry_run=True) 1119 # clear the child pointers of all parents we're removing 1120 for i in range(start, stop, step): 1121 if isinstance(self[i], Tree): 1122 self._delparent(self[i], i) 1123 # set the child pointers of the new children. We do this 1124 # after clearing *all* child pointers, in case we're e.g. 1125 # reversing the elements in a tree. 1126 for i, child in enumerate(value): 1127 if isinstance(child, Tree): 1128 self._setparent(child, start + i * step) 1129 # finally, update the content of the child list itself. 1130 super(AbstractParentedTree, self).__setitem__(index, value) 1131 1132 # ptree[i] = value 1133 elif isinstance(index, int): 1134 if index < 0: 1135 index += len(self) 1136 if index < 0: 1137 raise IndexError('index out of range') 1138 # if the value is not changing, do nothing. 1139 if value is self[index]: 1140 return 1141 # Set the new child's parent pointer. 1142 if isinstance(value, Tree): 1143 self._setparent(value, index) 1144 # Remove the old child's parent pointer 1145 if isinstance(self[index], Tree): 1146 self._delparent(self[index], index) 1147 # Update our child list. 1148 super(AbstractParentedTree, self).__setitem__(index, value) 1149 1150 elif isinstance(index, (list, tuple)): 1151 # ptree[()] = value 1152 if len(index) == 0: 1153 raise IndexError('The tree position () may not be assigned to.') 1154 # ptree[(i,)] = value 1155 elif len(index) == 1: 1156 self[index[0]] = value 1157 # ptree[i1, i2, i3] = value 1158 else: 1159 self[index[0]][index[1:]] = value 1160 1161 else: 1162 raise TypeError( 1163 "%s indices must be integers, not %s" 1164 % (type(self).__name__, type(index).__name__) 1165 ) 1166 1167 def append(self, child): 1168 if isinstance(child, Tree): 1169 self._setparent(child, len(self)) 1170 super(AbstractParentedTree, self).append(child) 1171 1172 def extend(self, children): 1173 for child in children: 1174 if isinstance(child, Tree): 1175 self._setparent(child, len(self)) 1176 super(AbstractParentedTree, self).append(child) 1177 1178 def insert(self, index, child): 1179 # Handle negative indexes. Note that if index < -len(self), 1180 # we do *not* raise an IndexError, unlike __getitem__. This 1181 # is done for consistency with list.__getitem__ and list.index. 1182 if index < 0: 1183 index += len(self) 1184 if index < 0: 1185 index = 0 1186 # Set the child's parent, and update our child list. 1187 if isinstance(child, Tree): 1188 self._setparent(child, index) 1189 super(AbstractParentedTree, self).insert(index, child) 1190 1191 def pop(self, index=-1): 1192 if index < 0: 1193 index += len(self) 1194 if index < 0: 1195 raise IndexError('index out of range') 1196 if isinstance(self[index], Tree): 1197 self._delparent(self[index], index) 1198 return super(AbstractParentedTree, self).pop(index) 1199 1200 # n.b.: like `list`, this is done by equality, not identity! 1201 # To remove a specific child, use del ptree[i]. 1202 def remove(self, child): 1203 index = self.index(child) 1204 if isinstance(self[index], Tree): 1205 self._delparent(self[index], index) 1206 super(AbstractParentedTree, self).remove(child) 1207 1208 # We need to implement __getslice__ and friends, even though 1209 # they're deprecated, because otherwise list.__getslice__ will get 1210 # called (since we're subclassing from list). Just delegate to 1211 # __getitem__ etc., but use max(0, start) and max(0, stop) because 1212 # because negative indices are already handled *before* 1213 # __getslice__ is called; and we don't want to double-count them. 1214 if hasattr(list, '__getslice__'): 1215 1216 def __getslice__(self, start, stop): 1217 return self.__getitem__(slice(max(0, start), max(0, stop))) 1218 1219 def __delslice__(self, start, stop): 1220 return self.__delitem__(slice(max(0, start), max(0, stop))) 1221 1222 def __setslice__(self, start, stop, value): 1223 return self.__setitem__(slice(max(0, start), max(0, stop)), value) 1224 1225 1226class ParentedTree(AbstractParentedTree): 1227 """ 1228 A ``Tree`` that automatically maintains parent pointers for 1229 single-parented trees. The following are methods for querying 1230 the structure of a parented tree: ``parent``, ``parent_index``, 1231 ``left_sibling``, ``right_sibling``, ``root``, ``treeposition``. 1232 1233 Each ``ParentedTree`` may have at most one parent. In 1234 particular, subtrees may not be shared. Any attempt to reuse a 1235 single ``ParentedTree`` as a child of more than one parent (or 1236 as multiple children of the same parent) will cause a 1237 ``ValueError`` exception to be raised. 1238 1239 ``ParentedTrees`` should never be used in the same tree as ``Trees`` 1240 or ``MultiParentedTrees``. Mixing tree implementations may result 1241 in incorrect parent pointers and in ``TypeError`` exceptions. 1242 """ 1243 1244 def __init__(self, node, children=None): 1245 self._parent = None 1246 """The parent of this Tree, or None if it has no parent.""" 1247 super(ParentedTree, self).__init__(node, children) 1248 if children is None: 1249 # If children is None, the tree is read from node. 1250 # After parsing, the parent of the immediate children 1251 # will point to an intermediate tree, not self. 1252 # We fix this by brute force: 1253 for i, child in enumerate(self): 1254 if isinstance(child, Tree): 1255 child._parent = None 1256 self._setparent(child, i) 1257 1258 def _frozen_class(self): 1259 return ImmutableParentedTree 1260 1261 # ///////////////////////////////////////////////////////////////// 1262 # Methods 1263 # ///////////////////////////////////////////////////////////////// 1264 1265 def parent(self): 1266 """The parent of this tree, or None if it has no parent.""" 1267 return self._parent 1268 1269 def parent_index(self): 1270 """ 1271 The index of this tree in its parent. I.e., 1272 ``ptree.parent()[ptree.parent_index()] is ptree``. Note that 1273 ``ptree.parent_index()`` is not necessarily equal to 1274 ``ptree.parent.index(ptree)``, since the ``index()`` method 1275 returns the first child that is equal to its argument. 1276 """ 1277 if self._parent is None: 1278 return None 1279 for i, child in enumerate(self._parent): 1280 if child is self: 1281 return i 1282 assert False, 'expected to find self in self._parent!' 1283 1284 def left_sibling(self): 1285 """The left sibling of this tree, or None if it has none.""" 1286 parent_index = self.parent_index() 1287 if self._parent and parent_index > 0: 1288 return self._parent[parent_index - 1] 1289 return None # no left sibling 1290 1291 def right_sibling(self): 1292 """The right sibling of this tree, or None if it has none.""" 1293 parent_index = self.parent_index() 1294 if self._parent and parent_index < (len(self._parent) - 1): 1295 return self._parent[parent_index + 1] 1296 return None # no right sibling 1297 1298 def root(self): 1299 """ 1300 The root of this tree. I.e., the unique ancestor of this tree 1301 whose parent is None. If ``ptree.parent()`` is None, then 1302 ``ptree`` is its own root. 1303 """ 1304 root = self 1305 while root.parent() is not None: 1306 root = root.parent() 1307 return root 1308 1309 def treeposition(self): 1310 """ 1311 The tree position of this tree, relative to the root of the 1312 tree. I.e., ``ptree.root[ptree.treeposition] is ptree``. 1313 """ 1314 if self.parent() is None: 1315 return () 1316 else: 1317 return self.parent().treeposition() + (self.parent_index(),) 1318 1319 # ///////////////////////////////////////////////////////////////// 1320 # Parent Management 1321 # ///////////////////////////////////////////////////////////////// 1322 1323 def _delparent(self, child, index): 1324 # Sanity checks 1325 assert isinstance(child, ParentedTree) 1326 assert self[index] is child 1327 assert child._parent is self 1328 1329 # Delete child's parent pointer. 1330 child._parent = None 1331 1332 def _setparent(self, child, index, dry_run=False): 1333 # If the child's type is incorrect, then complain. 1334 if not isinstance(child, ParentedTree): 1335 raise TypeError( 1336 'Can not insert a non-ParentedTree ' + 'into a ParentedTree' 1337 ) 1338 1339 # If child already has a parent, then complain. 1340 if child._parent is not None: 1341 raise ValueError('Can not insert a subtree that already ' 'has a parent.') 1342 1343 # Set child's parent pointer & index. 1344 if not dry_run: 1345 child._parent = self 1346 1347 1348class MultiParentedTree(AbstractParentedTree): 1349 """ 1350 A ``Tree`` that automatically maintains parent pointers for 1351 multi-parented trees. The following are methods for querying the 1352 structure of a multi-parented tree: ``parents()``, ``parent_indices()``, 1353 ``left_siblings()``, ``right_siblings()``, ``roots``, ``treepositions``. 1354 1355 Each ``MultiParentedTree`` may have zero or more parents. In 1356 particular, subtrees may be shared. If a single 1357 ``MultiParentedTree`` is used as multiple children of the same 1358 parent, then that parent will appear multiple times in its 1359 ``parents()`` method. 1360 1361 ``MultiParentedTrees`` should never be used in the same tree as 1362 ``Trees`` or ``ParentedTrees``. Mixing tree implementations may 1363 result in incorrect parent pointers and in ``TypeError`` exceptions. 1364 """ 1365 1366 def __init__(self, node, children=None): 1367 self._parents = [] 1368 """A list of this tree's parents. This list should not 1369 contain duplicates, even if a parent contains this tree 1370 multiple times.""" 1371 super(MultiParentedTree, self).__init__(node, children) 1372 if children is None: 1373 # If children is None, the tree is read from node. 1374 # After parsing, the parent(s) of the immediate children 1375 # will point to an intermediate tree, not self. 1376 # We fix this by brute force: 1377 for i, child in enumerate(self): 1378 if isinstance(child, Tree): 1379 child._parents = [] 1380 self._setparent(child, i) 1381 1382 def _frozen_class(self): 1383 return ImmutableMultiParentedTree 1384 1385 # ///////////////////////////////////////////////////////////////// 1386 # Methods 1387 # ///////////////////////////////////////////////////////////////// 1388 1389 def parents(self): 1390 """ 1391 The set of parents of this tree. If this tree has no parents, 1392 then ``parents`` is the empty set. To check if a tree is used 1393 as multiple children of the same parent, use the 1394 ``parent_indices()`` method. 1395 1396 :type: list(MultiParentedTree) 1397 """ 1398 return list(self._parents) 1399 1400 def left_siblings(self): 1401 """ 1402 A list of all left siblings of this tree, in any of its parent 1403 trees. A tree may be its own left sibling if it is used as 1404 multiple contiguous children of the same parent. A tree may 1405 appear multiple times in this list if it is the left sibling 1406 of this tree with respect to multiple parents. 1407 1408 :type: list(MultiParentedTree) 1409 """ 1410 return [ 1411 parent[index - 1] 1412 for (parent, index) in self._get_parent_indices() 1413 if index > 0 1414 ] 1415 1416 def right_siblings(self): 1417 """ 1418 A list of all right siblings of this tree, in any of its parent 1419 trees. A tree may be its own right sibling if it is used as 1420 multiple contiguous children of the same parent. A tree may 1421 appear multiple times in this list if it is the right sibling 1422 of this tree with respect to multiple parents. 1423 1424 :type: list(MultiParentedTree) 1425 """ 1426 return [ 1427 parent[index + 1] 1428 for (parent, index) in self._get_parent_indices() 1429 if index < (len(parent) - 1) 1430 ] 1431 1432 def _get_parent_indices(self): 1433 return [ 1434 (parent, index) 1435 for parent in self._parents 1436 for index, child in enumerate(parent) 1437 if child is self 1438 ] 1439 1440 def roots(self): 1441 """ 1442 The set of all roots of this tree. This set is formed by 1443 tracing all possible parent paths until trees with no parents 1444 are found. 1445 1446 :type: list(MultiParentedTree) 1447 """ 1448 return list(self._get_roots_helper({}).values()) 1449 1450 def _get_roots_helper(self, result): 1451 if self._parents: 1452 for parent in self._parents: 1453 parent._get_roots_helper(result) 1454 else: 1455 result[id(self)] = self 1456 return result 1457 1458 def parent_indices(self, parent): 1459 """ 1460 Return a list of the indices where this tree occurs as a child 1461 of ``parent``. If this child does not occur as a child of 1462 ``parent``, then the empty list is returned. The following is 1463 always true:: 1464 1465 for parent_index in ptree.parent_indices(parent): 1466 parent[parent_index] is ptree 1467 """ 1468 if parent not in self._parents: 1469 return [] 1470 else: 1471 return [index for (index, child) in enumerate(parent) if child is self] 1472 1473 def treepositions(self, root): 1474 """ 1475 Return a list of all tree positions that can be used to reach 1476 this multi-parented tree starting from ``root``. I.e., the 1477 following is always true:: 1478 1479 for treepos in ptree.treepositions(root): 1480 root[treepos] is ptree 1481 """ 1482 if self is root: 1483 return [()] 1484 else: 1485 return [ 1486 treepos + (index,) 1487 for parent in self._parents 1488 for treepos in parent.treepositions(root) 1489 for (index, child) in enumerate(parent) 1490 if child is self 1491 ] 1492 1493 # ///////////////////////////////////////////////////////////////// 1494 # Parent Management 1495 # ///////////////////////////////////////////////////////////////// 1496 1497 def _delparent(self, child, index): 1498 # Sanity checks 1499 assert isinstance(child, MultiParentedTree) 1500 assert self[index] is child 1501 assert len([p for p in child._parents if p is self]) == 1 1502 1503 # If the only copy of child in self is at index, then delete 1504 # self from child's parent list. 1505 for i, c in enumerate(self): 1506 if c is child and i != index: 1507 break 1508 else: 1509 child._parents.remove(self) 1510 1511 def _setparent(self, child, index, dry_run=False): 1512 # If the child's type is incorrect, then complain. 1513 if not isinstance(child, MultiParentedTree): 1514 raise TypeError( 1515 'Can not insert a non-MultiParentedTree ' + 'into a MultiParentedTree' 1516 ) 1517 1518 # Add self as a parent pointer if it's not already listed. 1519 if not dry_run: 1520 for parent in child._parents: 1521 if parent is self: 1522 break 1523 else: 1524 child._parents.append(self) 1525 1526 1527class ImmutableParentedTree(ImmutableTree, ParentedTree): 1528 pass 1529 1530 1531class ImmutableMultiParentedTree(ImmutableTree, MultiParentedTree): 1532 pass 1533 1534 1535###################################################################### 1536## Probabilistic trees 1537###################################################################### 1538 1539 1540@python_2_unicode_compatible 1541class ProbabilisticTree(Tree, ProbabilisticMixIn): 1542 def __init__(self, node, children=None, **prob_kwargs): 1543 Tree.__init__(self, node, children) 1544 ProbabilisticMixIn.__init__(self, **prob_kwargs) 1545 1546 # We have to patch up these methods to make them work right: 1547 def _frozen_class(self): 1548 return ImmutableProbabilisticTree 1549 1550 def __repr__(self): 1551 return '%s (p=%r)' % (Tree.unicode_repr(self), self.prob()) 1552 1553 def __str__(self): 1554 return '%s (p=%.6g)' % (self.pformat(margin=60), self.prob()) 1555 1556 def copy(self, deep=False): 1557 if not deep: 1558 return type(self)(self._label, self, prob=self.prob()) 1559 else: 1560 return type(self).convert(self) 1561 1562 @classmethod 1563 def convert(cls, val): 1564 if isinstance(val, Tree): 1565 children = [cls.convert(child) for child in val] 1566 if isinstance(val, ProbabilisticMixIn): 1567 return cls(val._label, children, prob=val.prob()) 1568 else: 1569 return cls(val._label, children, prob=1.0) 1570 else: 1571 return val 1572 1573 def __eq__(self, other): 1574 return self.__class__ is other.__class__ and ( 1575 self._label, 1576 list(self), 1577 self.prob(), 1578 ) == (other._label, list(other), other.prob()) 1579 1580 def __lt__(self, other): 1581 if not isinstance(other, Tree): 1582 raise_unorderable_types("<", self, other) 1583 if self.__class__ is other.__class__: 1584 return (self._label, list(self), self.prob()) < ( 1585 other._label, 1586 list(other), 1587 other.prob(), 1588 ) 1589 else: 1590 return self.__class__.__name__ < other.__class__.__name__ 1591 1592 1593@python_2_unicode_compatible 1594class ImmutableProbabilisticTree(ImmutableTree, ProbabilisticMixIn): 1595 def __init__(self, node, children=None, **prob_kwargs): 1596 ImmutableTree.__init__(self, node, children) 1597 ProbabilisticMixIn.__init__(self, **prob_kwargs) 1598 self._hash = hash((self._label, tuple(self), self.prob())) 1599 1600 # We have to patch up these methods to make them work right: 1601 def _frozen_class(self): 1602 return ImmutableProbabilisticTree 1603 1604 def __repr__(self): 1605 return '%s [%s]' % (Tree.unicode_repr(self), self.prob()) 1606 1607 def __str__(self): 1608 return '%s [%s]' % (self.pformat(margin=60), self.prob()) 1609 1610 def copy(self, deep=False): 1611 if not deep: 1612 return type(self)(self._label, self, prob=self.prob()) 1613 else: 1614 return type(self).convert(self) 1615 1616 @classmethod 1617 def convert(cls, val): 1618 if isinstance(val, Tree): 1619 children = [cls.convert(child) for child in val] 1620 if isinstance(val, ProbabilisticMixIn): 1621 return cls(val._label, children, prob=val.prob()) 1622 else: 1623 return cls(val._label, children, prob=1.0) 1624 else: 1625 return val 1626 1627 1628def _child_names(tree): 1629 names = [] 1630 for child in tree: 1631 if isinstance(child, Tree): 1632 names.append(Nonterminal(child._label)) 1633 else: 1634 names.append(child) 1635 return names 1636 1637 1638###################################################################### 1639## Parsing 1640###################################################################### 1641 1642 1643def bracket_parse(s): 1644 """ 1645 Use Tree.read(s, remove_empty_top_bracketing=True) instead. 1646 """ 1647 raise NameError("Use Tree.read(s, remove_empty_top_bracketing=True) instead.") 1648 1649 1650def sinica_parse(s): 1651 """ 1652 Parse a Sinica Treebank string and return a tree. Trees are represented as nested brackettings, 1653 as shown in the following example (X represents a Chinese character): 1654 S(goal:NP(Head:Nep:XX)|theme:NP(Head:Nhaa:X)|quantity:Dab:X|Head:VL2:X)#0(PERIODCATEGORY) 1655 1656 :return: A tree corresponding to the string representation. 1657 :rtype: Tree 1658 :param s: The string to be converted 1659 :type s: str 1660 """ 1661 tokens = re.split(r'([()| ])', s) 1662 for i in range(len(tokens)): 1663 if tokens[i] == '(': 1664 tokens[i - 1], tokens[i] = ( 1665 tokens[i], 1666 tokens[i - 1], 1667 ) # pull nonterminal inside parens 1668 elif ':' in tokens[i]: 1669 fields = tokens[i].split(':') 1670 if len(fields) == 2: # non-terminal 1671 tokens[i] = fields[1] 1672 else: 1673 tokens[i] = "(" + fields[-2] + " " + fields[-1] + ")" 1674 elif tokens[i] == '|': 1675 tokens[i] = '' 1676 1677 treebank_string = " ".join(tokens) 1678 return Tree.fromstring(treebank_string, remove_empty_top_bracketing=True) 1679 1680 1681# s = re.sub(r'^#[^\s]*\s', '', s) # remove leading identifier 1682# s = re.sub(r'\w+:', '', s) # remove role tags 1683 1684# return s 1685 1686###################################################################### 1687## Demonstration 1688###################################################################### 1689 1690 1691def demo(): 1692 """ 1693 A demonstration showing how Trees and Trees can be 1694 used. This demonstration creates a Tree, and loads a 1695 Tree from the Treebank corpus, 1696 and shows the results of calling several of their methods. 1697 """ 1698 1699 from nltk import Tree, ProbabilisticTree 1700 1701 # Demonstrate tree parsing. 1702 s = '(S (NP (DT the) (NN cat)) (VP (VBD ate) (NP (DT a) (NN cookie))))' 1703 t = Tree.fromstring(s) 1704 print("Convert bracketed string into tree:") 1705 print(t) 1706 print(t.__repr__()) 1707 1708 print("Display tree properties:") 1709 print(t.label()) # tree's constituent type 1710 print(t[0]) # tree's first child 1711 print(t[1]) # tree's second child 1712 print(t.height()) 1713 print(t.leaves()) 1714 print(t[1]) 1715 print(t[1, 1]) 1716 print(t[1, 1, 0]) 1717 1718 # Demonstrate tree modification. 1719 the_cat = t[0] 1720 the_cat.insert(1, Tree.fromstring('(JJ big)')) 1721 print("Tree modification:") 1722 print(t) 1723 t[1, 1, 1] = Tree.fromstring('(NN cake)') 1724 print(t) 1725 print() 1726 1727 # Tree transforms 1728 print("Collapse unary:") 1729 t.collapse_unary() 1730 print(t) 1731 print("Chomsky normal form:") 1732 t.chomsky_normal_form() 1733 print(t) 1734 print() 1735 1736 # Demonstrate probabilistic trees. 1737 pt = ProbabilisticTree('x', ['y', 'z'], prob=0.5) 1738 print("Probabilistic Tree:") 1739 print(pt) 1740 print() 1741 1742 # Demonstrate parsing of treebank output format. 1743 t = Tree.fromstring(t.pformat()) 1744 print("Convert tree to bracketed string and back again:") 1745 print(t) 1746 print() 1747 1748 # Demonstrate LaTeX output 1749 print("LaTeX output:") 1750 print(t.pformat_latex_qtree()) 1751 print() 1752 1753 # Demonstrate Productions 1754 print("Production output:") 1755 print(t.productions()) 1756 print() 1757 1758 # Demonstrate tree nodes containing objects other than strings 1759 t.set_label(('test', 3)) 1760 print(t) 1761 1762 1763__all__ = [ 1764 'ImmutableProbabilisticTree', 1765 'ImmutableTree', 1766 'ProbabilisticMixIn', 1767 'ProbabilisticTree', 1768 'Tree', 1769 'bracket_parse', 1770 'sinica_parse', 1771 'ParentedTree', 1772 'MultiParentedTree', 1773 'ImmutableParentedTree', 1774 'ImmutableMultiParentedTree', 1775] 1776