1# -*- coding: utf-8 -*-
2# Natural Language Toolkit: Text Trees
3#
4# Copyright (C) 2001-2019 NLTK Project
5# Author: Edward Loper <edloper@gmail.com>
6#         Steven Bird <stevenbird1@gmail.com>
7#         Peter Ljunglöf <peter.ljunglof@gu.se>
8#         Nathan Bodenstab <bodenstab@cslu.ogi.edu> (tree transforms)
9# URL: <http://nltk.org/>
10# For license information, see LICENSE.TXT
11
12"""
13Class for representing hierarchical language structures, such as
14syntax trees and morphological trees.
15"""
16from __future__ import print_function, unicode_literals
17
18import re
19from abc import ABCMeta, abstractmethod
20
21from six import string_types, add_metaclass
22
23from nltk.grammar import Production, Nonterminal
24from nltk.probability import ProbabilisticMixIn
25from nltk.util import slice_bounds
26from nltk.compat import python_2_unicode_compatible, unicode_repr
27from nltk.internals import raise_unorderable_types
28
29# TODO: add LabelledTree (can be used for dependency trees)
30
31######################################################################
32## Trees
33######################################################################
34
35
36@python_2_unicode_compatible
37class Tree(list):
38    """
39    A Tree represents a hierarchical grouping of leaves and subtrees.
40    For example, each constituent in a syntax tree is represented by a single Tree.
41
42    A tree's children are encoded as a list of leaves and subtrees,
43    where a leaf is a basic (non-tree) value; and a subtree is a
44    nested Tree.
45
46        >>> from nltk.tree import Tree
47        >>> print(Tree(1, [2, Tree(3, [4]), 5]))
48        (1 2 (3 4) 5)
49        >>> vp = Tree('VP', [Tree('V', ['saw']),
50        ...                  Tree('NP', ['him'])])
51        >>> s = Tree('S', [Tree('NP', ['I']), vp])
52        >>> print(s)
53        (S (NP I) (VP (V saw) (NP him)))
54        >>> print(s[1])
55        (VP (V saw) (NP him))
56        >>> print(s[1,1])
57        (NP him)
58        >>> t = Tree.fromstring("(S (NP I) (VP (V saw) (NP him)))")
59        >>> s == t
60        True
61        >>> t[1][1].set_label('X')
62        >>> t[1][1].label()
63        'X'
64        >>> print(t)
65        (S (NP I) (VP (V saw) (X him)))
66        >>> t[0], t[1,1] = t[1,1], t[0]
67        >>> print(t)
68        (S (X him) (VP (V saw) (NP I)))
69
70    The length of a tree is the number of children it has.
71
72        >>> len(t)
73        2
74
75    The set_label() and label() methods allow individual constituents
76    to be labeled.  For example, syntax trees use this label to specify
77    phrase tags, such as "NP" and "VP".
78
79    Several Tree methods use "tree positions" to specify
80    children or descendants of a tree.  Tree positions are defined as
81    follows:
82
83      - The tree position *i* specifies a Tree's *i*\ th child.
84      - The tree position ``()`` specifies the Tree itself.
85      - If *p* is the tree position of descendant *d*, then
86        *p+i* specifies the *i*\ th child of *d*.
87
88    I.e., every tree position is either a single index *i*,
89    specifying ``tree[i]``; or a sequence *i1, i2, ..., iN*,
90    specifying ``tree[i1][i2]...[iN]``.
91
92    Construct a new tree.  This constructor can be called in one
93    of two ways:
94
95    - ``Tree(label, children)`` constructs a new tree with the
96        specified label and list of children.
97
98    - ``Tree.fromstring(s)`` constructs a new tree by parsing the string ``s``.
99    """
100
101    def __init__(self, node, children=None):
102        if children is None:
103            raise TypeError(
104                "%s: Expected a node value and child list " % type(self).__name__
105            )
106        elif isinstance(children, string_types):
107            raise TypeError(
108                "%s() argument 2 should be a list, not a "
109                "string" % type(self).__name__
110            )
111        else:
112            list.__init__(self, children)
113            self._label = node
114
115    # ////////////////////////////////////////////////////////////
116    # Comparison operators
117    # ////////////////////////////////////////////////////////////
118
119    def __eq__(self, other):
120        return self.__class__ is other.__class__ and (self._label, list(self)) == (
121            other._label,
122            list(other),
123        )
124
125    def __lt__(self, other):
126        if not isinstance(other, Tree):
127            # raise_unorderable_types("<", self, other)
128            # Sometimes children can be pure strings,
129            # so we need to be able to compare with non-trees:
130            return self.__class__.__name__ < other.__class__.__name__
131        elif self.__class__ is other.__class__:
132            return (self._label, list(self)) < (other._label, list(other))
133        else:
134            return self.__class__.__name__ < other.__class__.__name__
135
136    # @total_ordering doesn't work here, since the class inherits from a builtin class
137    __ne__ = lambda self, other: not self == other
138    __gt__ = lambda self, other: not (self < other or self == other)
139    __le__ = lambda self, other: self < other or self == other
140    __ge__ = lambda self, other: not self < other
141
142    # ////////////////////////////////////////////////////////////
143    # Disabled list operations
144    # ////////////////////////////////////////////////////////////
145
146    def __mul__(self, v):
147        raise TypeError('Tree does not support multiplication')
148
149    def __rmul__(self, v):
150        raise TypeError('Tree does not support multiplication')
151
152    def __add__(self, v):
153        raise TypeError('Tree does not support addition')
154
155    def __radd__(self, v):
156        raise TypeError('Tree does not support addition')
157
158    # ////////////////////////////////////////////////////////////
159    # Indexing (with support for tree positions)
160    # ////////////////////////////////////////////////////////////
161
162    def __getitem__(self, index):
163        if isinstance(index, (int, slice)):
164            return list.__getitem__(self, index)
165        elif isinstance(index, (list, tuple)):
166            if len(index) == 0:
167                return self
168            elif len(index) == 1:
169                return self[index[0]]
170            else:
171                return self[index[0]][index[1:]]
172        else:
173            raise TypeError(
174                "%s indices must be integers, not %s"
175                % (type(self).__name__, type(index).__name__)
176            )
177
178    def __setitem__(self, index, value):
179        if isinstance(index, (int, slice)):
180            return list.__setitem__(self, index, value)
181        elif isinstance(index, (list, tuple)):
182            if len(index) == 0:
183                raise IndexError('The tree position () may not be ' 'assigned to.')
184            elif len(index) == 1:
185                self[index[0]] = value
186            else:
187                self[index[0]][index[1:]] = value
188        else:
189            raise TypeError(
190                "%s indices must be integers, not %s"
191                % (type(self).__name__, type(index).__name__)
192            )
193
194    def __delitem__(self, index):
195        if isinstance(index, (int, slice)):
196            return list.__delitem__(self, index)
197        elif isinstance(index, (list, tuple)):
198            if len(index) == 0:
199                raise IndexError('The tree position () may not be deleted.')
200            elif len(index) == 1:
201                del self[index[0]]
202            else:
203                del self[index[0]][index[1:]]
204        else:
205            raise TypeError(
206                "%s indices must be integers, not %s"
207                % (type(self).__name__, type(index).__name__)
208            )
209
210    # ////////////////////////////////////////////////////////////
211    # Basic tree operations
212    # ////////////////////////////////////////////////////////////
213
214    def _get_node(self):
215        """Outdated method to access the node value; use the label() method instead."""
216        raise NotImplementedError("Use label() to access a node label.")
217
218    def _set_node(self, value):
219        """Outdated method to set the node value; use the set_label() method instead."""
220        raise NotImplementedError("Use set_label() method to set a node label.")
221
222    node = property(_get_node, _set_node)
223
224    def label(self):
225        """
226        Return the node label of the tree.
227
228            >>> t = Tree.fromstring('(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))')
229            >>> t.label()
230            'S'
231
232        :return: the node label (typically a string)
233        :rtype: any
234        """
235        return self._label
236
237    def set_label(self, label):
238        """
239        Set the node label of the tree.
240
241            >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
242            >>> t.set_label("T")
243            >>> print(t)
244            (T (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))
245
246        :param label: the node label (typically a string)
247        :type label: any
248        """
249        self._label = label
250
251    def leaves(self):
252        """
253        Return the leaves of the tree.
254
255            >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
256            >>> t.leaves()
257            ['the', 'dog', 'chased', 'the', 'cat']
258
259        :return: a list containing this tree's leaves.
260            The order reflects the order of the
261            leaves in the tree's hierarchical structure.
262        :rtype: list
263        """
264        leaves = []
265        for child in self:
266            if isinstance(child, Tree):
267                leaves.extend(child.leaves())
268            else:
269                leaves.append(child)
270        return leaves
271
272    def flatten(self):
273        """
274        Return a flat version of the tree, with all non-root non-terminals removed.
275
276            >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
277            >>> print(t.flatten())
278            (S the dog chased the cat)
279
280        :return: a tree consisting of this tree's root connected directly to
281            its leaves, omitting all intervening non-terminal nodes.
282        :rtype: Tree
283        """
284        return Tree(self.label(), self.leaves())
285
286    def height(self):
287        """
288        Return the height of the tree.
289
290            >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
291            >>> t.height()
292            5
293            >>> print(t[0,0])
294            (D the)
295            >>> t[0,0].height()
296            2
297
298        :return: The height of this tree.  The height of a tree
299            containing no children is 1; the height of a tree
300            containing only leaves is 2; and the height of any other
301            tree is one plus the maximum of its children's
302            heights.
303        :rtype: int
304        """
305        max_child_height = 0
306        for child in self:
307            if isinstance(child, Tree):
308                max_child_height = max(max_child_height, child.height())
309            else:
310                max_child_height = max(max_child_height, 1)
311        return 1 + max_child_height
312
313    def treepositions(self, order='preorder'):
314        """
315            >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
316            >>> t.treepositions() # doctest: +ELLIPSIS
317            [(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1,), (1, 0), (1, 0, 0), ...]
318            >>> for pos in t.treepositions('leaves'):
319            ...     t[pos] = t[pos][::-1].upper()
320            >>> print(t)
321            (S (NP (D EHT) (N GOD)) (VP (V DESAHC) (NP (D EHT) (N TAC))))
322
323        :param order: One of: ``preorder``, ``postorder``, ``bothorder``,
324            ``leaves``.
325        """
326        positions = []
327        if order in ('preorder', 'bothorder'):
328            positions.append(())
329        for i, child in enumerate(self):
330            if isinstance(child, Tree):
331                childpos = child.treepositions(order)
332                positions.extend((i,) + p for p in childpos)
333            else:
334                positions.append((i,))
335        if order in ('postorder', 'bothorder'):
336            positions.append(())
337        return positions
338
339    def subtrees(self, filter=None):
340        """
341        Generate all the subtrees of this tree, optionally restricted
342        to trees matching the filter function.
343
344            >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
345            >>> for s in t.subtrees(lambda t: t.height() == 2):
346            ...     print(s)
347            (D the)
348            (N dog)
349            (V chased)
350            (D the)
351            (N cat)
352
353        :type filter: function
354        :param filter: the function to filter all local trees
355        """
356        if not filter or filter(self):
357            yield self
358        for child in self:
359            if isinstance(child, Tree):
360                for subtree in child.subtrees(filter):
361                    yield subtree
362
363    def productions(self):
364        """
365        Generate the productions that correspond to the non-terminal nodes of the tree.
366        For each subtree of the form (P: C1 C2 ... Cn) this produces a production of the
367        form P -> C1 C2 ... Cn.
368
369            >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
370            >>> t.productions()
371            [S -> NP VP, NP -> D N, D -> 'the', N -> 'dog', VP -> V NP, V -> 'chased',
372            NP -> D N, D -> 'the', N -> 'cat']
373
374        :rtype: list(Production)
375        """
376
377        if not isinstance(self._label, string_types):
378            raise TypeError(
379                'Productions can only be generated from trees having node labels that are strings'
380            )
381
382        prods = [Production(Nonterminal(self._label), _child_names(self))]
383        for child in self:
384            if isinstance(child, Tree):
385                prods += child.productions()
386        return prods
387
388    def pos(self):
389        """
390        Return a sequence of pos-tagged words extracted from the tree.
391
392            >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
393            >>> t.pos()
394            [('the', 'D'), ('dog', 'N'), ('chased', 'V'), ('the', 'D'), ('cat', 'N')]
395
396        :return: a list of tuples containing leaves and pre-terminals (part-of-speech tags).
397            The order reflects the order of the leaves in the tree's hierarchical structure.
398        :rtype: list(tuple)
399        """
400        pos = []
401        for child in self:
402            if isinstance(child, Tree):
403                pos.extend(child.pos())
404            else:
405                pos.append((child, self._label))
406        return pos
407
408    def leaf_treeposition(self, index):
409        """
410        :return: The tree position of the ``index``-th leaf in this
411            tree.  I.e., if ``tp=self.leaf_treeposition(i)``, then
412            ``self[tp]==self.leaves()[i]``.
413
414        :raise IndexError: If this tree contains fewer than ``index+1``
415            leaves, or if ``index<0``.
416        """
417        if index < 0:
418            raise IndexError('index must be non-negative')
419
420        stack = [(self, ())]
421        while stack:
422            value, treepos = stack.pop()
423            if not isinstance(value, Tree):
424                if index == 0:
425                    return treepos
426                else:
427                    index -= 1
428            else:
429                for i in range(len(value) - 1, -1, -1):
430                    stack.append((value[i], treepos + (i,)))
431
432        raise IndexError('index must be less than or equal to len(self)')
433
434    def treeposition_spanning_leaves(self, start, end):
435        """
436        :return: The tree position of the lowest descendant of this
437            tree that dominates ``self.leaves()[start:end]``.
438        :raise ValueError: if ``end <= start``
439        """
440        if end <= start:
441            raise ValueError('end must be greater than start')
442        # Find the tree positions of the start & end leaves, and
443        # take the longest common subsequence.
444        start_treepos = self.leaf_treeposition(start)
445        end_treepos = self.leaf_treeposition(end - 1)
446        # Find the first index where they mismatch:
447        for i in range(len(start_treepos)):
448            if i == len(end_treepos) or start_treepos[i] != end_treepos[i]:
449                return start_treepos[:i]
450        return start_treepos
451
452    # ////////////////////////////////////////////////////////////
453    # Transforms
454    # ////////////////////////////////////////////////////////////
455
456    def chomsky_normal_form(
457        self,
458        factor="right",
459        horzMarkov=None,
460        vertMarkov=0,
461        childChar="|",
462        parentChar="^",
463    ):
464        """
465        This method can modify a tree in three ways:
466
467          1. Convert a tree into its Chomsky Normal Form (CNF)
468             equivalent -- Every subtree has either two non-terminals
469             or one terminal as its children.  This process requires
470             the creation of more"artificial" non-terminal nodes.
471          2. Markov (vertical) smoothing of children in new artificial
472             nodes
473          3. Horizontal (parent) annotation of nodes
474
475        :param factor: Right or left factoring method (default = "right")
476        :type  factor: str = [left|right]
477        :param horzMarkov: Markov order for sibling smoothing in artificial nodes (None (default) = include all siblings)
478        :type  horzMarkov: int | None
479        :param vertMarkov: Markov order for parent smoothing (0 (default) = no vertical annotation)
480        :type  vertMarkov: int | None
481        :param childChar: A string used in construction of the artificial nodes, separating the head of the
482                          original subtree from the child nodes that have yet to be expanded (default = "|")
483        :type  childChar: str
484        :param parentChar: A string used to separate the node representation from its vertical annotation
485        :type  parentChar: str
486        """
487        from nltk.treetransforms import chomsky_normal_form
488
489        chomsky_normal_form(self, factor, horzMarkov, vertMarkov, childChar, parentChar)
490
491    def un_chomsky_normal_form(
492        self, expandUnary=True, childChar="|", parentChar="^", unaryChar="+"
493    ):
494        """
495        This method modifies the tree in three ways:
496
497          1. Transforms a tree in Chomsky Normal Form back to its
498             original structure (branching greater than two)
499          2. Removes any parent annotation (if it exists)
500          3. (optional) expands unary subtrees (if previously
501             collapsed with collapseUnary(...) )
502
503        :param expandUnary: Flag to expand unary or not (default = True)
504        :type  expandUnary: bool
505        :param childChar: A string separating the head node from its children in an artificial node (default = "|")
506        :type  childChar: str
507        :param parentChar: A sting separating the node label from its parent annotation (default = "^")
508        :type  parentChar: str
509        :param unaryChar: A string joining two non-terminals in a unary production (default = "+")
510        :type  unaryChar: str
511        """
512        from nltk.treetransforms import un_chomsky_normal_form
513
514        un_chomsky_normal_form(self, expandUnary, childChar, parentChar, unaryChar)
515
516    def collapse_unary(self, collapsePOS=False, collapseRoot=False, joinChar="+"):
517        """
518        Collapse subtrees with a single child (ie. unary productions)
519        into a new non-terminal (Tree node) joined by 'joinChar'.
520        This is useful when working with algorithms that do not allow
521        unary productions, and completely removing the unary productions
522        would require loss of useful information.  The Tree is modified
523        directly (since it is passed by reference) and no value is returned.
524
525        :param collapsePOS: 'False' (default) will not collapse the parent of leaf nodes (ie.
526                            Part-of-Speech tags) since they are always unary productions
527        :type  collapsePOS: bool
528        :param collapseRoot: 'False' (default) will not modify the root production
529                             if it is unary.  For the Penn WSJ treebank corpus, this corresponds
530                             to the TOP -> productions.
531        :type collapseRoot: bool
532        :param joinChar: A string used to connect collapsed node values (default = "+")
533        :type  joinChar: str
534        """
535        from nltk.treetransforms import collapse_unary
536
537        collapse_unary(self, collapsePOS, collapseRoot, joinChar)
538
539    # ////////////////////////////////////////////////////////////
540    # Convert, copy
541    # ////////////////////////////////////////////////////////////
542
543    @classmethod
544    def convert(cls, tree):
545        """
546        Convert a tree between different subtypes of Tree.  ``cls`` determines
547        which class will be used to encode the new tree.
548
549        :type tree: Tree
550        :param tree: The tree that should be converted.
551        :return: The new Tree.
552        """
553        if isinstance(tree, Tree):
554            children = [cls.convert(child) for child in tree]
555            return cls(tree._label, children)
556        else:
557            return tree
558
559    def copy(self, deep=False):
560        if not deep:
561            return type(self)(self._label, self)
562        else:
563            return type(self).convert(self)
564
565    def _frozen_class(self):
566        return ImmutableTree
567
568    def freeze(self, leaf_freezer=None):
569        frozen_class = self._frozen_class()
570        if leaf_freezer is None:
571            newcopy = frozen_class.convert(self)
572        else:
573            newcopy = self.copy(deep=True)
574            for pos in newcopy.treepositions('leaves'):
575                newcopy[pos] = leaf_freezer(newcopy[pos])
576            newcopy = frozen_class.convert(newcopy)
577        hash(newcopy)  # Make sure the leaves are hashable.
578        return newcopy
579
580    # ////////////////////////////////////////////////////////////
581    # Parsing
582    # ////////////////////////////////////////////////////////////
583
584    @classmethod
585    def fromstring(
586        cls,
587        s,
588        brackets='()',
589        read_node=None,
590        read_leaf=None,
591        node_pattern=None,
592        leaf_pattern=None,
593        remove_empty_top_bracketing=False,
594    ):
595        """
596        Read a bracketed tree string and return the resulting tree.
597        Trees are represented as nested brackettings, such as::
598
599          (S (NP (NNP John)) (VP (V runs)))
600
601        :type s: str
602        :param s: The string to read
603
604        :type brackets: str (length=2)
605        :param brackets: The bracket characters used to mark the
606            beginning and end of trees and subtrees.
607
608        :type read_node: function
609        :type read_leaf: function
610        :param read_node, read_leaf: If specified, these functions
611            are applied to the substrings of ``s`` corresponding to
612            nodes and leaves (respectively) to obtain the values for
613            those nodes and leaves.  They should have the following
614            signature:
615
616               read_node(str) -> value
617
618            For example, these functions could be used to process nodes
619            and leaves whose values should be some type other than
620            string (such as ``FeatStruct``).
621            Note that by default, node strings and leaf strings are
622            delimited by whitespace and brackets; to override this
623            default, use the ``node_pattern`` and ``leaf_pattern``
624            arguments.
625
626        :type node_pattern: str
627        :type leaf_pattern: str
628        :param node_pattern, leaf_pattern: Regular expression patterns
629            used to find node and leaf substrings in ``s``.  By
630            default, both nodes patterns are defined to match any
631            sequence of non-whitespace non-bracket characters.
632
633        :type remove_empty_top_bracketing: bool
634        :param remove_empty_top_bracketing: If the resulting tree has
635            an empty node label, and is length one, then return its
636            single child instead.  This is useful for treebank trees,
637            which sometimes contain an extra level of bracketing.
638
639        :return: A tree corresponding to the string representation ``s``.
640            If this class method is called using a subclass of Tree,
641            then it will return a tree of that type.
642        :rtype: Tree
643        """
644        if not isinstance(brackets, string_types) or len(brackets) != 2:
645            raise TypeError('brackets must be a length-2 string')
646        if re.search('\s', brackets):
647            raise TypeError('whitespace brackets not allowed')
648        # Construct a regexp that will tokenize the string.
649        open_b, close_b = brackets
650        open_pattern, close_pattern = (re.escape(open_b), re.escape(close_b))
651        if node_pattern is None:
652            node_pattern = '[^\s%s%s]+' % (open_pattern, close_pattern)
653        if leaf_pattern is None:
654            leaf_pattern = '[^\s%s%s]+' % (open_pattern, close_pattern)
655        token_re = re.compile(
656            '%s\s*(%s)?|%s|(%s)'
657            % (open_pattern, node_pattern, close_pattern, leaf_pattern)
658        )
659        # Walk through each token, updating a stack of trees.
660        stack = [(None, [])]  # list of (node, children) tuples
661        for match in token_re.finditer(s):
662            token = match.group()
663            # Beginning of a tree/subtree
664            if token[0] == open_b:
665                if len(stack) == 1 and len(stack[0][1]) > 0:
666                    cls._parse_error(s, match, 'end-of-string')
667                label = token[1:].lstrip()
668                if read_node is not None:
669                    label = read_node(label)
670                stack.append((label, []))
671            # End of a tree/subtree
672            elif token == close_b:
673                if len(stack) == 1:
674                    if len(stack[0][1]) == 0:
675                        cls._parse_error(s, match, open_b)
676                    else:
677                        cls._parse_error(s, match, 'end-of-string')
678                label, children = stack.pop()
679                stack[-1][1].append(cls(label, children))
680            # Leaf node
681            else:
682                if len(stack) == 1:
683                    cls._parse_error(s, match, open_b)
684                if read_leaf is not None:
685                    token = read_leaf(token)
686                stack[-1][1].append(token)
687
688        # check that we got exactly one complete tree.
689        if len(stack) > 1:
690            cls._parse_error(s, 'end-of-string', close_b)
691        elif len(stack[0][1]) == 0:
692            cls._parse_error(s, 'end-of-string', open_b)
693        else:
694            assert stack[0][0] is None
695            assert len(stack[0][1]) == 1
696        tree = stack[0][1][0]
697
698        # If the tree has an extra level with node='', then get rid of
699        # it.  E.g.: "((S (NP ...) (VP ...)))"
700        if remove_empty_top_bracketing and tree._label == '' and len(tree) == 1:
701            tree = tree[0]
702        # return the tree.
703        return tree
704
705    @classmethod
706    def _parse_error(cls, s, match, expecting):
707        """
708        Display a friendly error message when parsing a tree string fails.
709        :param s: The string we're parsing.
710        :param match: regexp match of the problem token.
711        :param expecting: what we expected to see instead.
712        """
713        # Construct a basic error message
714        if match == 'end-of-string':
715            pos, token = len(s), 'end-of-string'
716        else:
717            pos, token = match.start(), match.group()
718        msg = '%s.read(): expected %r but got %r\n%sat index %d.' % (
719            cls.__name__,
720            expecting,
721            token,
722            ' ' * 12,
723            pos,
724        )
725        # Add a display showing the error token itsels:
726        s = s.replace('\n', ' ').replace('\t', ' ')
727        offset = pos
728        if len(s) > pos + 10:
729            s = s[: pos + 10] + '...'
730        if pos > 10:
731            s = '...' + s[pos - 10 :]
732            offset = 13
733        msg += '\n%s"%s"\n%s^' % (' ' * 16, s, ' ' * (17 + offset))
734        raise ValueError(msg)
735
736    # ////////////////////////////////////////////////////////////
737    # Visualization & String Representation
738    # ////////////////////////////////////////////////////////////
739
740    def draw(self):
741        """
742        Open a new window containing a graphical diagram of this tree.
743        """
744        from nltk.draw.tree import draw_trees
745
746        draw_trees(self)
747
748    def pretty_print(self, sentence=None, highlight=(), stream=None, **kwargs):
749        """
750        Pretty-print this tree as ASCII or Unicode art.
751        For explanation of the arguments, see the documentation for
752        `nltk.treeprettyprinter.TreePrettyPrinter`.
753        """
754        from nltk.treeprettyprinter import TreePrettyPrinter
755
756        print(TreePrettyPrinter(self, sentence, highlight).text(**kwargs), file=stream)
757
758    def __repr__(self):
759        childstr = ", ".join(unicode_repr(c) for c in self)
760        return '%s(%s, [%s])' % (
761            type(self).__name__,
762            unicode_repr(self._label),
763            childstr,
764        )
765
766    def _repr_png_(self):
767        """
768        Draws and outputs in PNG for ipython.
769        PNG is used instead of PDF, since it can be displayed in the qt console and
770        has wider browser support.
771        """
772        import os
773        import base64
774        import subprocess
775        import tempfile
776        from nltk.draw.tree import tree_to_treesegment
777        from nltk.draw.util import CanvasFrame
778        from nltk.internals import find_binary
779
780        _canvas_frame = CanvasFrame()
781        widget = tree_to_treesegment(_canvas_frame.canvas(), self)
782        _canvas_frame.add_widget(widget)
783        x, y, w, h = widget.bbox()
784        # print_to_file uses scrollregion to set the width and height of the pdf.
785        _canvas_frame.canvas()['scrollregion'] = (0, 0, w, h)
786        with tempfile.NamedTemporaryFile() as file:
787            in_path = '{0:}.ps'.format(file.name)
788            out_path = '{0:}.png'.format(file.name)
789            _canvas_frame.print_to_file(in_path)
790            _canvas_frame.destroy_widget(widget)
791            subprocess.call(
792                [
793                    find_binary(
794                        'gs',
795                        binary_names=['gswin32c.exe', 'gswin64c.exe'],
796                        env_vars=['PATH'],
797                        verbose=False,
798                    )
799                ]
800                + '-q -dEPSCrop -sDEVICE=png16m -r90 -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -dSAFER -dBATCH -dNOPAUSE -sOutputFile={0:} {1:}'.format(
801                    out_path, in_path
802                ).split()
803            )
804            with open(out_path, 'rb') as sr:
805                res = sr.read()
806            os.remove(in_path)
807            os.remove(out_path)
808            return base64.b64encode(res).decode()
809
810    def __str__(self):
811        return self.pformat()
812
813    def pprint(self, **kwargs):
814        """
815        Print a string representation of this Tree to 'stream'
816        """
817
818        if "stream" in kwargs:
819            stream = kwargs["stream"]
820            del kwargs["stream"]
821        else:
822            stream = None
823        print(self.pformat(**kwargs), file=stream)
824
825    def pformat(self, margin=70, indent=0, nodesep='', parens='()', quotes=False):
826        """
827        :return: A pretty-printed string representation of this tree.
828        :rtype: str
829        :param margin: The right margin at which to do line-wrapping.
830        :type margin: int
831        :param indent: The indentation level at which printing
832            begins.  This number is used to decide how far to indent
833            subsequent lines.
834        :type indent: int
835        :param nodesep: A string that is used to separate the node
836            from the children.  E.g., the default value ``':'`` gives
837            trees like ``(S: (NP: I) (VP: (V: saw) (NP: it)))``.
838        """
839
840        # Try writing it on one line.
841        s = self._pformat_flat(nodesep, parens, quotes)
842        if len(s) + indent < margin:
843            return s
844
845        # If it doesn't fit on one line, then write it on multi-lines.
846        if isinstance(self._label, string_types):
847            s = '%s%s%s' % (parens[0], self._label, nodesep)
848        else:
849            s = '%s%s%s' % (parens[0], unicode_repr(self._label), nodesep)
850        for child in self:
851            if isinstance(child, Tree):
852                s += (
853                    '\n'
854                    + ' ' * (indent + 2)
855                    + child.pformat(margin, indent + 2, nodesep, parens, quotes)
856                )
857            elif isinstance(child, tuple):
858                s += '\n' + ' ' * (indent + 2) + "/".join(child)
859            elif isinstance(child, string_types) and not quotes:
860                s += '\n' + ' ' * (indent + 2) + '%s' % child
861            else:
862                s += '\n' + ' ' * (indent + 2) + unicode_repr(child)
863        return s + parens[1]
864
865    def pformat_latex_qtree(self):
866        r"""
867        Returns a representation of the tree compatible with the
868        LaTeX qtree package. This consists of the string ``\Tree``
869        followed by the tree represented in bracketed notation.
870
871        For example, the following result was generated from a parse tree of
872        the sentence ``The announcement astounded us``::
873
874          \Tree [.I'' [.N'' [.D The ] [.N' [.N announcement ] ] ]
875              [.I' [.V'' [.V' [.V astounded ] [.N'' [.N' [.N us ] ] ] ] ] ] ]
876
877        See http://www.ling.upenn.edu/advice/latex.html for the LaTeX
878        style file for the qtree package.
879
880        :return: A latex qtree representation of this tree.
881        :rtype: str
882        """
883        reserved_chars = re.compile('([#\$%&~_\{\}])')
884
885        pformat = self.pformat(indent=6, nodesep='', parens=('[.', ' ]'))
886        return r'\Tree ' + re.sub(reserved_chars, r'\\\1', pformat)
887
888    def _pformat_flat(self, nodesep, parens, quotes):
889        childstrs = []
890        for child in self:
891            if isinstance(child, Tree):
892                childstrs.append(child._pformat_flat(nodesep, parens, quotes))
893            elif isinstance(child, tuple):
894                childstrs.append("/".join(child))
895            elif isinstance(child, string_types) and not quotes:
896                childstrs.append('%s' % child)
897            else:
898                childstrs.append(unicode_repr(child))
899        if isinstance(self._label, string_types):
900            return '%s%s%s %s%s' % (
901                parens[0],
902                self._label,
903                nodesep,
904                " ".join(childstrs),
905                parens[1],
906            )
907        else:
908            return '%s%s%s %s%s' % (
909                parens[0],
910                unicode_repr(self._label),
911                nodesep,
912                " ".join(childstrs),
913                parens[1],
914            )
915
916
917class ImmutableTree(Tree):
918    def __init__(self, node, children=None):
919        super(ImmutableTree, self).__init__(node, children)
920        # Precompute our hash value.  This ensures that we're really
921        # immutable.  It also means we only have to calculate it once.
922        try:
923            self._hash = hash((self._label, tuple(self)))
924        except (TypeError, ValueError):
925            raise ValueError(
926                "%s: node value and children " "must be immutable" % type(self).__name__
927            )
928
929    def __setitem__(self, index, value):
930        raise ValueError('%s may not be modified' % type(self).__name__)
931
932    def __setslice__(self, i, j, value):
933        raise ValueError('%s may not be modified' % type(self).__name__)
934
935    def __delitem__(self, index):
936        raise ValueError('%s may not be modified' % type(self).__name__)
937
938    def __delslice__(self, i, j):
939        raise ValueError('%s may not be modified' % type(self).__name__)
940
941    def __iadd__(self, other):
942        raise ValueError('%s may not be modified' % type(self).__name__)
943
944    def __imul__(self, other):
945        raise ValueError('%s may not be modified' % type(self).__name__)
946
947    def append(self, v):
948        raise ValueError('%s may not be modified' % type(self).__name__)
949
950    def extend(self, v):
951        raise ValueError('%s may not be modified' % type(self).__name__)
952
953    def pop(self, v=None):
954        raise ValueError('%s may not be modified' % type(self).__name__)
955
956    def remove(self, v):
957        raise ValueError('%s may not be modified' % type(self).__name__)
958
959    def reverse(self):
960        raise ValueError('%s may not be modified' % type(self).__name__)
961
962    def sort(self):
963        raise ValueError('%s may not be modified' % type(self).__name__)
964
965    def __hash__(self):
966        return self._hash
967
968    def set_label(self, value):
969        """
970        Set the node label.  This will only succeed the first time the
971        node label is set, which should occur in ImmutableTree.__init__().
972        """
973        if hasattr(self, '_label'):
974            raise ValueError('%s may not be modified' % type(self).__name__)
975        self._label = value
976
977
978######################################################################
979## Parented trees
980######################################################################
981@add_metaclass(ABCMeta)
982class AbstractParentedTree(Tree):
983    """
984    An abstract base class for a ``Tree`` that automatically maintains
985    pointers to parent nodes.  These parent pointers are updated
986    whenever any change is made to a tree's structure.  Two subclasses
987    are currently defined:
988
989      - ``ParentedTree`` is used for tree structures where each subtree
990        has at most one parent.  This class should be used in cases
991        where there is no"sharing" of subtrees.
992
993      - ``MultiParentedTree`` is used for tree structures where a
994        subtree may have zero or more parents.  This class should be
995        used in cases where subtrees may be shared.
996
997    Subclassing
998    ===========
999    The ``AbstractParentedTree`` class redefines all operations that
1000    modify a tree's structure to call two methods, which are used by
1001    subclasses to update parent information:
1002
1003      - ``_setparent()`` is called whenever a new child is added.
1004      - ``_delparent()`` is called whenever a child is removed.
1005    """
1006
1007    def __init__(self, node, children=None):
1008        super(AbstractParentedTree, self).__init__(node, children)
1009        # If children is None, the tree is read from node, and
1010        # all parents will be set during parsing.
1011        if children is not None:
1012            # Otherwise we have to set the parent of the children.
1013            # Iterate over self, and *not* children, because children
1014            # might be an iterator.
1015            for i, child in enumerate(self):
1016                if isinstance(child, Tree):
1017                    self._setparent(child, i, dry_run=True)
1018            for i, child in enumerate(self):
1019                if isinstance(child, Tree):
1020                    self._setparent(child, i)
1021
1022    # ////////////////////////////////////////////////////////////
1023    # Parent management
1024    # ////////////////////////////////////////////////////////////
1025    @abstractmethod
1026    def _setparent(self, child, index, dry_run=False):
1027        """
1028        Update the parent pointer of ``child`` to point to ``self``.  This
1029        method is only called if the type of ``child`` is ``Tree``;
1030        i.e., it is not called when adding a leaf to a tree.  This method
1031        is always called before the child is actually added to the
1032        child list of ``self``.
1033
1034        :type child: Tree
1035        :type index: int
1036        :param index: The index of ``child`` in ``self``.
1037        :raise TypeError: If ``child`` is a tree with an impropriate
1038            type.  Typically, if ``child`` is a tree, then its type needs
1039            to match the type of ``self``.  This prevents mixing of
1040            different tree types (single-parented, multi-parented, and
1041            non-parented).
1042        :param dry_run: If true, the don't actually set the child's
1043            parent pointer; just check for any error conditions, and
1044            raise an exception if one is found.
1045        """
1046
1047    @abstractmethod
1048    def _delparent(self, child, index):
1049        """
1050        Update the parent pointer of ``child`` to not point to self.  This
1051        method is only called if the type of ``child`` is ``Tree``; i.e., it
1052        is not called when removing a leaf from a tree.  This method
1053        is always called before the child is actually removed from the
1054        child list of ``self``.
1055
1056        :type child: Tree
1057        :type index: int
1058        :param index: The index of ``child`` in ``self``.
1059        """
1060
1061    # ////////////////////////////////////////////////////////////
1062    # Methods that add/remove children
1063    # ////////////////////////////////////////////////////////////
1064    # Every method that adds or removes a child must make
1065    # appropriate calls to _setparent() and _delparent().
1066
1067    def __delitem__(self, index):
1068        # del ptree[start:stop]
1069        if isinstance(index, slice):
1070            start, stop, step = slice_bounds(self, index, allow_step=True)
1071            # Clear all the children pointers.
1072            for i in range(start, stop, step):
1073                if isinstance(self[i], Tree):
1074                    self._delparent(self[i], i)
1075            # Delete the children from our child list.
1076            super(AbstractParentedTree, self).__delitem__(index)
1077
1078        # del ptree[i]
1079        elif isinstance(index, int):
1080            if index < 0:
1081                index += len(self)
1082            if index < 0:
1083                raise IndexError('index out of range')
1084            # Clear the child's parent pointer.
1085            if isinstance(self[index], Tree):
1086                self._delparent(self[index], index)
1087            # Remove the child from our child list.
1088            super(AbstractParentedTree, self).__delitem__(index)
1089
1090        elif isinstance(index, (list, tuple)):
1091            # del ptree[()]
1092            if len(index) == 0:
1093                raise IndexError('The tree position () may not be deleted.')
1094            # del ptree[(i,)]
1095            elif len(index) == 1:
1096                del self[index[0]]
1097            # del ptree[i1, i2, i3]
1098            else:
1099                del self[index[0]][index[1:]]
1100
1101        else:
1102            raise TypeError(
1103                "%s indices must be integers, not %s"
1104                % (type(self).__name__, type(index).__name__)
1105            )
1106
1107    def __setitem__(self, index, value):
1108        # ptree[start:stop] = value
1109        if isinstance(index, slice):
1110            start, stop, step = slice_bounds(self, index, allow_step=True)
1111            # make a copy of value, in case it's an iterator
1112            if not isinstance(value, (list, tuple)):
1113                value = list(value)
1114            # Check for any error conditions, so we can avoid ending
1115            # up in an inconsistent state if an error does occur.
1116            for i, child in enumerate(value):
1117                if isinstance(child, Tree):
1118                    self._setparent(child, start + i * step, dry_run=True)
1119            # clear the child pointers of all parents we're removing
1120            for i in range(start, stop, step):
1121                if isinstance(self[i], Tree):
1122                    self._delparent(self[i], i)
1123            # set the child pointers of the new children.  We do this
1124            # after clearing *all* child pointers, in case we're e.g.
1125            # reversing the elements in a tree.
1126            for i, child in enumerate(value):
1127                if isinstance(child, Tree):
1128                    self._setparent(child, start + i * step)
1129            # finally, update the content of the child list itself.
1130            super(AbstractParentedTree, self).__setitem__(index, value)
1131
1132        # ptree[i] = value
1133        elif isinstance(index, int):
1134            if index < 0:
1135                index += len(self)
1136            if index < 0:
1137                raise IndexError('index out of range')
1138            # if the value is not changing, do nothing.
1139            if value is self[index]:
1140                return
1141            # Set the new child's parent pointer.
1142            if isinstance(value, Tree):
1143                self._setparent(value, index)
1144            # Remove the old child's parent pointer
1145            if isinstance(self[index], Tree):
1146                self._delparent(self[index], index)
1147            # Update our child list.
1148            super(AbstractParentedTree, self).__setitem__(index, value)
1149
1150        elif isinstance(index, (list, tuple)):
1151            # ptree[()] = value
1152            if len(index) == 0:
1153                raise IndexError('The tree position () may not be assigned to.')
1154            # ptree[(i,)] = value
1155            elif len(index) == 1:
1156                self[index[0]] = value
1157            # ptree[i1, i2, i3] = value
1158            else:
1159                self[index[0]][index[1:]] = value
1160
1161        else:
1162            raise TypeError(
1163                "%s indices must be integers, not %s"
1164                % (type(self).__name__, type(index).__name__)
1165            )
1166
1167    def append(self, child):
1168        if isinstance(child, Tree):
1169            self._setparent(child, len(self))
1170        super(AbstractParentedTree, self).append(child)
1171
1172    def extend(self, children):
1173        for child in children:
1174            if isinstance(child, Tree):
1175                self._setparent(child, len(self))
1176            super(AbstractParentedTree, self).append(child)
1177
1178    def insert(self, index, child):
1179        # Handle negative indexes.  Note that if index < -len(self),
1180        # we do *not* raise an IndexError, unlike __getitem__.  This
1181        # is done for consistency with list.__getitem__ and list.index.
1182        if index < 0:
1183            index += len(self)
1184        if index < 0:
1185            index = 0
1186        # Set the child's parent, and update our child list.
1187        if isinstance(child, Tree):
1188            self._setparent(child, index)
1189        super(AbstractParentedTree, self).insert(index, child)
1190
1191    def pop(self, index=-1):
1192        if index < 0:
1193            index += len(self)
1194        if index < 0:
1195            raise IndexError('index out of range')
1196        if isinstance(self[index], Tree):
1197            self._delparent(self[index], index)
1198        return super(AbstractParentedTree, self).pop(index)
1199
1200    # n.b.: like `list`, this is done by equality, not identity!
1201    # To remove a specific child, use del ptree[i].
1202    def remove(self, child):
1203        index = self.index(child)
1204        if isinstance(self[index], Tree):
1205            self._delparent(self[index], index)
1206        super(AbstractParentedTree, self).remove(child)
1207
1208    # We need to implement __getslice__ and friends, even though
1209    # they're deprecated, because otherwise list.__getslice__ will get
1210    # called (since we're subclassing from list).  Just delegate to
1211    # __getitem__ etc., but use max(0, start) and max(0, stop) because
1212    # because negative indices are already handled *before*
1213    # __getslice__ is called; and we don't want to double-count them.
1214    if hasattr(list, '__getslice__'):
1215
1216        def __getslice__(self, start, stop):
1217            return self.__getitem__(slice(max(0, start), max(0, stop)))
1218
1219        def __delslice__(self, start, stop):
1220            return self.__delitem__(slice(max(0, start), max(0, stop)))
1221
1222        def __setslice__(self, start, stop, value):
1223            return self.__setitem__(slice(max(0, start), max(0, stop)), value)
1224
1225
1226class ParentedTree(AbstractParentedTree):
1227    """
1228    A ``Tree`` that automatically maintains parent pointers for
1229    single-parented trees.  The following are methods for querying
1230    the structure of a parented tree: ``parent``, ``parent_index``,
1231    ``left_sibling``, ``right_sibling``, ``root``, ``treeposition``.
1232
1233    Each ``ParentedTree`` may have at most one parent.  In
1234    particular, subtrees may not be shared.  Any attempt to reuse a
1235    single ``ParentedTree`` as a child of more than one parent (or
1236    as multiple children of the same parent) will cause a
1237    ``ValueError`` exception to be raised.
1238
1239    ``ParentedTrees`` should never be used in the same tree as ``Trees``
1240    or ``MultiParentedTrees``.  Mixing tree implementations may result
1241    in incorrect parent pointers and in ``TypeError`` exceptions.
1242    """
1243
1244    def __init__(self, node, children=None):
1245        self._parent = None
1246        """The parent of this Tree, or None if it has no parent."""
1247        super(ParentedTree, self).__init__(node, children)
1248        if children is None:
1249            # If children is None, the tree is read from node.
1250            # After parsing, the parent of the immediate children
1251            # will point to an intermediate tree, not self.
1252            # We fix this by brute force:
1253            for i, child in enumerate(self):
1254                if isinstance(child, Tree):
1255                    child._parent = None
1256                    self._setparent(child, i)
1257
1258    def _frozen_class(self):
1259        return ImmutableParentedTree
1260
1261    # /////////////////////////////////////////////////////////////////
1262    # Methods
1263    # /////////////////////////////////////////////////////////////////
1264
1265    def parent(self):
1266        """The parent of this tree, or None if it has no parent."""
1267        return self._parent
1268
1269    def parent_index(self):
1270        """
1271        The index of this tree in its parent.  I.e.,
1272        ``ptree.parent()[ptree.parent_index()] is ptree``.  Note that
1273        ``ptree.parent_index()`` is not necessarily equal to
1274        ``ptree.parent.index(ptree)``, since the ``index()`` method
1275        returns the first child that is equal to its argument.
1276        """
1277        if self._parent is None:
1278            return None
1279        for i, child in enumerate(self._parent):
1280            if child is self:
1281                return i
1282        assert False, 'expected to find self in self._parent!'
1283
1284    def left_sibling(self):
1285        """The left sibling of this tree, or None if it has none."""
1286        parent_index = self.parent_index()
1287        if self._parent and parent_index > 0:
1288            return self._parent[parent_index - 1]
1289        return None  # no left sibling
1290
1291    def right_sibling(self):
1292        """The right sibling of this tree, or None if it has none."""
1293        parent_index = self.parent_index()
1294        if self._parent and parent_index < (len(self._parent) - 1):
1295            return self._parent[parent_index + 1]
1296        return None  # no right sibling
1297
1298    def root(self):
1299        """
1300        The root of this tree.  I.e., the unique ancestor of this tree
1301        whose parent is None.  If ``ptree.parent()`` is None, then
1302        ``ptree`` is its own root.
1303        """
1304        root = self
1305        while root.parent() is not None:
1306            root = root.parent()
1307        return root
1308
1309    def treeposition(self):
1310        """
1311        The tree position of this tree, relative to the root of the
1312        tree.  I.e., ``ptree.root[ptree.treeposition] is ptree``.
1313        """
1314        if self.parent() is None:
1315            return ()
1316        else:
1317            return self.parent().treeposition() + (self.parent_index(),)
1318
1319    # /////////////////////////////////////////////////////////////////
1320    # Parent Management
1321    # /////////////////////////////////////////////////////////////////
1322
1323    def _delparent(self, child, index):
1324        # Sanity checks
1325        assert isinstance(child, ParentedTree)
1326        assert self[index] is child
1327        assert child._parent is self
1328
1329        # Delete child's parent pointer.
1330        child._parent = None
1331
1332    def _setparent(self, child, index, dry_run=False):
1333        # If the child's type is incorrect, then complain.
1334        if not isinstance(child, ParentedTree):
1335            raise TypeError(
1336                'Can not insert a non-ParentedTree ' + 'into a ParentedTree'
1337            )
1338
1339        # If child already has a parent, then complain.
1340        if child._parent is not None:
1341            raise ValueError('Can not insert a subtree that already ' 'has a parent.')
1342
1343        # Set child's parent pointer & index.
1344        if not dry_run:
1345            child._parent = self
1346
1347
1348class MultiParentedTree(AbstractParentedTree):
1349    """
1350    A ``Tree`` that automatically maintains parent pointers for
1351    multi-parented trees.  The following are methods for querying the
1352    structure of a multi-parented tree: ``parents()``, ``parent_indices()``,
1353    ``left_siblings()``, ``right_siblings()``, ``roots``, ``treepositions``.
1354
1355    Each ``MultiParentedTree`` may have zero or more parents.  In
1356    particular, subtrees may be shared.  If a single
1357    ``MultiParentedTree`` is used as multiple children of the same
1358    parent, then that parent will appear multiple times in its
1359    ``parents()`` method.
1360
1361    ``MultiParentedTrees`` should never be used in the same tree as
1362    ``Trees`` or ``ParentedTrees``.  Mixing tree implementations may
1363    result in incorrect parent pointers and in ``TypeError`` exceptions.
1364    """
1365
1366    def __init__(self, node, children=None):
1367        self._parents = []
1368        """A list of this tree's parents.  This list should not
1369           contain duplicates, even if a parent contains this tree
1370           multiple times."""
1371        super(MultiParentedTree, self).__init__(node, children)
1372        if children is None:
1373            # If children is None, the tree is read from node.
1374            # After parsing, the parent(s) of the immediate children
1375            # will point to an intermediate tree, not self.
1376            # We fix this by brute force:
1377            for i, child in enumerate(self):
1378                if isinstance(child, Tree):
1379                    child._parents = []
1380                    self._setparent(child, i)
1381
1382    def _frozen_class(self):
1383        return ImmutableMultiParentedTree
1384
1385    # /////////////////////////////////////////////////////////////////
1386    # Methods
1387    # /////////////////////////////////////////////////////////////////
1388
1389    def parents(self):
1390        """
1391        The set of parents of this tree.  If this tree has no parents,
1392        then ``parents`` is the empty set.  To check if a tree is used
1393        as multiple children of the same parent, use the
1394        ``parent_indices()`` method.
1395
1396        :type: list(MultiParentedTree)
1397        """
1398        return list(self._parents)
1399
1400    def left_siblings(self):
1401        """
1402        A list of all left siblings of this tree, in any of its parent
1403        trees.  A tree may be its own left sibling if it is used as
1404        multiple contiguous children of the same parent.  A tree may
1405        appear multiple times in this list if it is the left sibling
1406        of this tree with respect to multiple parents.
1407
1408        :type: list(MultiParentedTree)
1409        """
1410        return [
1411            parent[index - 1]
1412            for (parent, index) in self._get_parent_indices()
1413            if index > 0
1414        ]
1415
1416    def right_siblings(self):
1417        """
1418        A list of all right siblings of this tree, in any of its parent
1419        trees.  A tree may be its own right sibling if it is used as
1420        multiple contiguous children of the same parent.  A tree may
1421        appear multiple times in this list if it is the right sibling
1422        of this tree with respect to multiple parents.
1423
1424        :type: list(MultiParentedTree)
1425        """
1426        return [
1427            parent[index + 1]
1428            for (parent, index) in self._get_parent_indices()
1429            if index < (len(parent) - 1)
1430        ]
1431
1432    def _get_parent_indices(self):
1433        return [
1434            (parent, index)
1435            for parent in self._parents
1436            for index, child in enumerate(parent)
1437            if child is self
1438        ]
1439
1440    def roots(self):
1441        """
1442        The set of all roots of this tree.  This set is formed by
1443        tracing all possible parent paths until trees with no parents
1444        are found.
1445
1446        :type: list(MultiParentedTree)
1447        """
1448        return list(self._get_roots_helper({}).values())
1449
1450    def _get_roots_helper(self, result):
1451        if self._parents:
1452            for parent in self._parents:
1453                parent._get_roots_helper(result)
1454        else:
1455            result[id(self)] = self
1456        return result
1457
1458    def parent_indices(self, parent):
1459        """
1460        Return a list of the indices where this tree occurs as a child
1461        of ``parent``.  If this child does not occur as a child of
1462        ``parent``, then the empty list is returned.  The following is
1463        always true::
1464
1465          for parent_index in ptree.parent_indices(parent):
1466              parent[parent_index] is ptree
1467        """
1468        if parent not in self._parents:
1469            return []
1470        else:
1471            return [index for (index, child) in enumerate(parent) if child is self]
1472
1473    def treepositions(self, root):
1474        """
1475        Return a list of all tree positions that can be used to reach
1476        this multi-parented tree starting from ``root``.  I.e., the
1477        following is always true::
1478
1479          for treepos in ptree.treepositions(root):
1480              root[treepos] is ptree
1481        """
1482        if self is root:
1483            return [()]
1484        else:
1485            return [
1486                treepos + (index,)
1487                for parent in self._parents
1488                for treepos in parent.treepositions(root)
1489                for (index, child) in enumerate(parent)
1490                if child is self
1491            ]
1492
1493    # /////////////////////////////////////////////////////////////////
1494    # Parent Management
1495    # /////////////////////////////////////////////////////////////////
1496
1497    def _delparent(self, child, index):
1498        # Sanity checks
1499        assert isinstance(child, MultiParentedTree)
1500        assert self[index] is child
1501        assert len([p for p in child._parents if p is self]) == 1
1502
1503        # If the only copy of child in self is at index, then delete
1504        # self from child's parent list.
1505        for i, c in enumerate(self):
1506            if c is child and i != index:
1507                break
1508        else:
1509            child._parents.remove(self)
1510
1511    def _setparent(self, child, index, dry_run=False):
1512        # If the child's type is incorrect, then complain.
1513        if not isinstance(child, MultiParentedTree):
1514            raise TypeError(
1515                'Can not insert a non-MultiParentedTree ' + 'into a MultiParentedTree'
1516            )
1517
1518        # Add self as a parent pointer if it's not already listed.
1519        if not dry_run:
1520            for parent in child._parents:
1521                if parent is self:
1522                    break
1523            else:
1524                child._parents.append(self)
1525
1526
1527class ImmutableParentedTree(ImmutableTree, ParentedTree):
1528    pass
1529
1530
1531class ImmutableMultiParentedTree(ImmutableTree, MultiParentedTree):
1532    pass
1533
1534
1535######################################################################
1536## Probabilistic trees
1537######################################################################
1538
1539
1540@python_2_unicode_compatible
1541class ProbabilisticTree(Tree, ProbabilisticMixIn):
1542    def __init__(self, node, children=None, **prob_kwargs):
1543        Tree.__init__(self, node, children)
1544        ProbabilisticMixIn.__init__(self, **prob_kwargs)
1545
1546    # We have to patch up these methods to make them work right:
1547    def _frozen_class(self):
1548        return ImmutableProbabilisticTree
1549
1550    def __repr__(self):
1551        return '%s (p=%r)' % (Tree.unicode_repr(self), self.prob())
1552
1553    def __str__(self):
1554        return '%s (p=%.6g)' % (self.pformat(margin=60), self.prob())
1555
1556    def copy(self, deep=False):
1557        if not deep:
1558            return type(self)(self._label, self, prob=self.prob())
1559        else:
1560            return type(self).convert(self)
1561
1562    @classmethod
1563    def convert(cls, val):
1564        if isinstance(val, Tree):
1565            children = [cls.convert(child) for child in val]
1566            if isinstance(val, ProbabilisticMixIn):
1567                return cls(val._label, children, prob=val.prob())
1568            else:
1569                return cls(val._label, children, prob=1.0)
1570        else:
1571            return val
1572
1573    def __eq__(self, other):
1574        return self.__class__ is other.__class__ and (
1575            self._label,
1576            list(self),
1577            self.prob(),
1578        ) == (other._label, list(other), other.prob())
1579
1580    def __lt__(self, other):
1581        if not isinstance(other, Tree):
1582            raise_unorderable_types("<", self, other)
1583        if self.__class__ is other.__class__:
1584            return (self._label, list(self), self.prob()) < (
1585                other._label,
1586                list(other),
1587                other.prob(),
1588            )
1589        else:
1590            return self.__class__.__name__ < other.__class__.__name__
1591
1592
1593@python_2_unicode_compatible
1594class ImmutableProbabilisticTree(ImmutableTree, ProbabilisticMixIn):
1595    def __init__(self, node, children=None, **prob_kwargs):
1596        ImmutableTree.__init__(self, node, children)
1597        ProbabilisticMixIn.__init__(self, **prob_kwargs)
1598        self._hash = hash((self._label, tuple(self), self.prob()))
1599
1600    # We have to patch up these methods to make them work right:
1601    def _frozen_class(self):
1602        return ImmutableProbabilisticTree
1603
1604    def __repr__(self):
1605        return '%s [%s]' % (Tree.unicode_repr(self), self.prob())
1606
1607    def __str__(self):
1608        return '%s [%s]' % (self.pformat(margin=60), self.prob())
1609
1610    def copy(self, deep=False):
1611        if not deep:
1612            return type(self)(self._label, self, prob=self.prob())
1613        else:
1614            return type(self).convert(self)
1615
1616    @classmethod
1617    def convert(cls, val):
1618        if isinstance(val, Tree):
1619            children = [cls.convert(child) for child in val]
1620            if isinstance(val, ProbabilisticMixIn):
1621                return cls(val._label, children, prob=val.prob())
1622            else:
1623                return cls(val._label, children, prob=1.0)
1624        else:
1625            return val
1626
1627
1628def _child_names(tree):
1629    names = []
1630    for child in tree:
1631        if isinstance(child, Tree):
1632            names.append(Nonterminal(child._label))
1633        else:
1634            names.append(child)
1635    return names
1636
1637
1638######################################################################
1639## Parsing
1640######################################################################
1641
1642
1643def bracket_parse(s):
1644    """
1645    Use Tree.read(s, remove_empty_top_bracketing=True) instead.
1646    """
1647    raise NameError("Use Tree.read(s, remove_empty_top_bracketing=True) instead.")
1648
1649
1650def sinica_parse(s):
1651    """
1652    Parse a Sinica Treebank string and return a tree.  Trees are represented as nested brackettings,
1653    as shown in the following example (X represents a Chinese character):
1654    S(goal:NP(Head:Nep:XX)|theme:NP(Head:Nhaa:X)|quantity:Dab:X|Head:VL2:X)#0(PERIODCATEGORY)
1655
1656    :return: A tree corresponding to the string representation.
1657    :rtype: Tree
1658    :param s: The string to be converted
1659    :type s: str
1660    """
1661    tokens = re.split(r'([()| ])', s)
1662    for i in range(len(tokens)):
1663        if tokens[i] == '(':
1664            tokens[i - 1], tokens[i] = (
1665                tokens[i],
1666                tokens[i - 1],
1667            )  # pull nonterminal inside parens
1668        elif ':' in tokens[i]:
1669            fields = tokens[i].split(':')
1670            if len(fields) == 2:  # non-terminal
1671                tokens[i] = fields[1]
1672            else:
1673                tokens[i] = "(" + fields[-2] + " " + fields[-1] + ")"
1674        elif tokens[i] == '|':
1675            tokens[i] = ''
1676
1677    treebank_string = " ".join(tokens)
1678    return Tree.fromstring(treebank_string, remove_empty_top_bracketing=True)
1679
1680
1681#    s = re.sub(r'^#[^\s]*\s', '', s)  # remove leading identifier
1682#    s = re.sub(r'\w+:', '', s)       # remove role tags
1683
1684#    return s
1685
1686######################################################################
1687## Demonstration
1688######################################################################
1689
1690
1691def demo():
1692    """
1693    A demonstration showing how Trees and Trees can be
1694    used.  This demonstration creates a Tree, and loads a
1695    Tree from the Treebank corpus,
1696    and shows the results of calling several of their methods.
1697    """
1698
1699    from nltk import Tree, ProbabilisticTree
1700
1701    # Demonstrate tree parsing.
1702    s = '(S (NP (DT the) (NN cat)) (VP (VBD ate) (NP (DT a) (NN cookie))))'
1703    t = Tree.fromstring(s)
1704    print("Convert bracketed string into tree:")
1705    print(t)
1706    print(t.__repr__())
1707
1708    print("Display tree properties:")
1709    print(t.label())  # tree's constituent type
1710    print(t[0])  # tree's first child
1711    print(t[1])  # tree's second child
1712    print(t.height())
1713    print(t.leaves())
1714    print(t[1])
1715    print(t[1, 1])
1716    print(t[1, 1, 0])
1717
1718    # Demonstrate tree modification.
1719    the_cat = t[0]
1720    the_cat.insert(1, Tree.fromstring('(JJ big)'))
1721    print("Tree modification:")
1722    print(t)
1723    t[1, 1, 1] = Tree.fromstring('(NN cake)')
1724    print(t)
1725    print()
1726
1727    # Tree transforms
1728    print("Collapse unary:")
1729    t.collapse_unary()
1730    print(t)
1731    print("Chomsky normal form:")
1732    t.chomsky_normal_form()
1733    print(t)
1734    print()
1735
1736    # Demonstrate probabilistic trees.
1737    pt = ProbabilisticTree('x', ['y', 'z'], prob=0.5)
1738    print("Probabilistic Tree:")
1739    print(pt)
1740    print()
1741
1742    # Demonstrate parsing of treebank output format.
1743    t = Tree.fromstring(t.pformat())
1744    print("Convert tree to bracketed string and back again:")
1745    print(t)
1746    print()
1747
1748    # Demonstrate LaTeX output
1749    print("LaTeX output:")
1750    print(t.pformat_latex_qtree())
1751    print()
1752
1753    # Demonstrate Productions
1754    print("Production output:")
1755    print(t.productions())
1756    print()
1757
1758    # Demonstrate tree nodes containing objects other than strings
1759    t.set_label(('test', 3))
1760    print(t)
1761
1762
1763__all__ = [
1764    'ImmutableProbabilisticTree',
1765    'ImmutableTree',
1766    'ProbabilisticMixIn',
1767    'ProbabilisticTree',
1768    'Tree',
1769    'bracket_parse',
1770    'sinica_parse',
1771    'ParentedTree',
1772    'MultiParentedTree',
1773    'ImmutableParentedTree',
1774    'ImmutableMultiParentedTree',
1775]
1776