1# #START_LICENSE###########################################################
2#
3#
4# This file is part of the Environment for Tree Exploration program
5# (ETE).  http://etetoolkit.org
6#
7# ETE is free software: you can redistribute it and/or modify it
8# under the terms of the GNU General Public License as published by
9# the Free Software Foundation, either version 3 of the License, or
10# (at your option) any later version.
11#
12# ETE is distributed in the hope that it will be useful, but WITHOUT
13# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
15# License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with ETE.  If not, see <http://www.gnu.org/licenses/>.
19#
20#
21#                     ABOUT THE ETE PACKAGE
22#                     =====================
23#
24# ETE is distributed under the GPL copyleft license (2008-2015).
25#
26# If you make use of ETE in published work, please cite:
27#
28# Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon.
29# ETE: a python Environment for Tree Exploration. Jaime BMC
30# Bioinformatics 2010,:24doi:10.1186/1471-2105-11-24
31#
32# Note that extra references to the specific methods implemented in
33# the toolkit may be available in the documentation.
34#
35# More info at http://etetoolkit.org. Contact: huerta@embl.de
36#
37#
38# #END_LICENSE#############################################################
39
40
41"""
42This module defines the PhyloNode dataytype to manage phylogenetic
43trees. It inheritates the coretype TreeNode and add some special
44features to the the node instances.
45"""
46from __future__ import absolute_import
47from __future__ import print_function
48
49import sys
50import os
51import re
52import itertools
53from collections import defaultdict
54from .. import TreeNode, SeqGroup, NCBITaxa
55from .reconciliation import get_reconciled_tree
56from . import spoverlap
57
58__all__ = ["PhyloNode", "PhyloTree"]
59
60def _parse_species(name):
61    return name[:3]
62
63def is_dup(n):
64    return getattr(n, "evoltype", None) == "D"
65
66def get_subtrees(tree, full_copy=False, features=None, newick_only=False):
67    """Calculate all possible species trees within a gene tree. I
68    tested several recursive and iterative approaches to do it and
69    this is the most efficient way I found. The method is now fast and
70    light enough to deal with very large gene trees, and it scales
71    linearly instead of exponentially. For instance, a tree with ~8000
72    nodes, ~100 species and ~400 duplications returns ~10,000 sptrees
73    that could be loaded in few minutes.
74
75    To avoid memory overloads, this function returns a tuple containing the
76    total number of trees, number of duplication events, and an iterator for the
77    species trees. Real trees are not actually computed until the iterator is
78    first accessed. This allows to filter out cases producing astronomic numbers
79    of sptrees.
80
81    """
82    ntrees, ndups = calc_subtrees(tree)
83    return ntrees, ndups, _get_subtrees(tree, full_copy, features, newick_only)
84
85def _get_subtrees(tree, full_copy=False, features=None, newick_only=False):
86    # First I need to precalculate all the species trees in tuple (newick) format
87    nid = 0
88    n2nid = {}
89    nid2node = {}
90    n2subtrees = defaultdict(list)
91    for n in tree.traverse("postorder"):
92        n2nid[n] = nid
93        nid2node[nid] = n
94        nid += 1
95        if n.children:
96            if is_dup(n):
97                subtrees = []
98                for ch in n.children:
99                    subtrees.extend(n2subtrees[n2nid[ch]])
100            else:
101                subtrees = tuple([val for val in
102                                  itertools.product(n2subtrees[n2nid[n.children[0]]],
103                                                    n2subtrees[n2nid[n.children[1]]])])
104        else:
105            subtrees = tuple([n2nid[n]])
106
107        n2subtrees[n2nid[n]] = subtrees
108        for ch in n.children:
109            del n2subtrees[n2nid[ch]]
110
111    sp_trees = n2subtrees[n2nid[tree]]
112
113    # Second, I yield a tree per iteration in newick or ETE format
114    features = set(features) if features else set()
115    features.update(["name"])
116
117    def _nodereplacer(match):
118        pre, b, post =  match.groups()
119        pre = '' if not pre else pre
120        post = '' if not post else post
121        node = nid2node[int(b)]
122        fstring = ""
123        if features:
124            fstring = "".join(["[&&NHX:",
125                               ':'.join(["%s=%s" %(f, getattr(node, f))
126                                         for f in features if hasattr(node, f)])
127                               , "]"])
128
129        return ''.join([pre, node.name, fstring, post])
130
131    if newick_only:
132        id_match = re.compile("([^0-9])?(\d+)([^0-9])?")
133        for nw in sp_trees:
134            yield re.sub(id_match, _nodereplacer, str(nw)+";")
135    else:
136        for nw in sp_trees:
137            # I take advantage from the fact that I generated the subtrees
138            # using tuples, so str representation is actually a newick :)
139            t = PhyloTree(str(nw)+";")
140            # Map features from original tree
141            for leaf in t.iter_leaves():
142                _nid = int(leaf.name)
143                for f in features:
144                    leaf.add_feature(f, getattr(nid2node[_nid], f))
145            yield t
146
147def calc_subtrees(tree):
148    '''
149    Computes the total number of species trees that TreeKO algorithm would produce for a given gene tree
150
151    returns: ntrees, ndups
152    '''
153    n2subtrees = {}
154    dups = 0
155    for n in tree.traverse("postorder"):
156        if n.children:
157            if is_dup(n):
158                dups += 1
159                subtrees = 0
160                for ch in n.children:
161                    subtrees += n2subtrees[ch]
162            else:
163                subtrees = n2subtrees[n.children[0]] * n2subtrees[n.children[1]]
164        else:
165            subtrees = 1
166        n2subtrees[n] = subtrees
167    return n2subtrees[tree], dups
168
169def iter_sptrees(sptrees, nid2node, features=None, newick_only=False):
170    """ Loads and map the species trees returned by get_subtrees"""
171
172    features = set(features) if features else set()
173    features.update(["name"])
174
175    def _nodereplacer(match):
176        pre, b, post =  match.groups()
177        node = nid2node[int(b)]
178        fstring = ""
179        if features:
180            fstring = "".join(["[&&NHX:",
181                               ','.join(["%s=%s" %(f, getattr(node, f))
182                                         for f in features if hasattr(node, f)])
183                               , "]"])
184
185        return ''.join([pre, node.name, fstring, post])
186
187    if newick_only:
188        id_match = re.compile("([^0-9])(\d+)([^0-9])")
189        for nw in sptrees:
190            yield re.sub(id_match, _nodereplacer, str(nw)+";")
191    else:
192        for nw in sptrees:
193            # I take advantage from the fact that I generated the subtrees
194            # using tuples, so str representation is actually a newick :)
195            t = PhyloTree(str(nw)+";")
196            # Map features from original tree
197            for leaf in t.iter_leaves():
198                _nid = int(leaf.name)
199                for f in features:
200                    leaf.add_feature(f, getattr(nid2node[_nid], f))
201            yield t
202
203def _get_subtrees_recursive(node, full_copy=True):
204    if is_dup(node):
205        sp_trees = []
206        for ch in node.children:
207            sp_trees.extend(_get_subtrees_recursive(ch, full_copy=full_copy))
208        return sp_trees
209
210    # saves a list of duplication nodes under current node
211    dups = []
212    for _n in node.iter_leaves(is_leaf_fn=is_dup):
213        if is_dup(_n):
214            dups.append(_n)
215
216    if dups:
217        # detach inner duplication nodes and stores their anchor point
218        subtrees = []
219        for dp in dups:
220            # The real node to attach sibling subtress
221            anchor = dp.up
222            dp.detach()
223
224            duptrees = []
225            #get all sibling sptrees in each side of the
226            #duplication. Each subtree is pointed to its anchor
227            for ch in dp.children:
228                for subt in _get_subtrees_recursive(ch, full_copy=full_copy):
229                    if not full_copy:
230                        subt = node.__class__(subt)
231                    subt.up = anchor
232                    duptrees.append(subt)
233
234            #all posible sptrees under this duplication are stored
235            subtrees.append(duptrees)
236
237        # Generates all combinations of subtrees in sibling duplications
238        sp_trees = []
239        for comb in itertools.product(*subtrees):
240            #each subtree is attached to its anchor point and make a copy
241            #of the final sp tree
242            for subt in comb:
243                #anchor = subt2anchor[subt]
244                if subt.up:
245                    subt.up.children.append(subt)
246                    #print subt.up
247                else:
248                    sp_trees.append(subt)
249            if full_copy:
250                 back_up = node.up
251                 node.up = None
252                 _node = node.copy()
253                 node.up = back_up
254            else:
255                _node = node.write(format=9, features=["name", "evoltype"])
256            sp_trees.append(_node)
257            # Clear current node
258            for subt in comb:
259                subt.up.children.pop(-1)
260    else:
261        if full_copy:
262            back_up = node.up
263            node.up = None
264            _node = node.copy()
265            node.up = back_up
266        else:
267            _node = node.write(format=9, features=["name", "evoltype"])
268        #node.detach()
269        sp_trees = [_node]
270
271    return sp_trees
272
273def get_subparts(n):
274    def is_dup(n):
275        return getattr(n, "evoltype", None) == "D"
276
277    subtrees = []
278    if is_dup(n):
279        for ch in n.get_children():
280            ch.detach()
281            subtrees.extend(get_subparts(ch))
282    else:
283        to_visit = []
284        for _n in n.iter_leaves(is_leaf_fn=is_dup):
285            if is_dup(_n):
286                to_visit.append(_n)
287
288        for _n in to_visit:
289            _n.detach()
290
291        freaks = [_n for _n in n.iter_descendants() if
292                  len(_n.children)==1 or (not hasattr(_n, "_leaf") and not _n.children)]
293        for s in freaks:
294            s.delete(prevent_nondicotomic=True)
295
296        # Clean node structure to prevent nodes with only one child
297        while len(n.children) == 1:
298            n = n.children[0]
299            n.detach()
300
301        if not n.children and not hasattr(n, "_leaf"):
302            pass
303        else:
304            subtrees.append(n)
305
306        for _n in to_visit:
307            subtrees.extend(get_subparts(_n))
308
309    return subtrees
310
311
312class PhyloNode(TreeNode):
313    """
314    .. currentmodule:: ete3
315    Extends the standard :class:`TreeNode` instance. It adds
316    specific attributes and methods to work with phylogentic trees.
317
318    :argument newick: Path to the file containing the tree or, alternatively,
319      the text string containing the same information.
320
321    :argument alignment: file containing a multiple sequence alignment.
322
323    :argument alg_format:  "fasta", "phylip" or "iphylip" (interleaved)
324
325    :argument format: sub-newick format
326
327      .. table::
328
329          ======  ==============================================
330          FORMAT  DESCRIPTION
331          ======  ==============================================
332          0        flexible with support values
333          1        flexible with internal node names
334          2        all branches + leaf names + internal supports
335          3        all branches + all names
336          4        leaf branches + leaf names
337          5        internal and leaf branches + leaf names
338          6        internal branches + leaf names
339          7        leaf branches + all names
340          8        all names
341          9        leaf names
342          100      topology only
343          ======  ==============================================
344
345    :argument sp_naming_function: Pointer to a parsing python
346       function that receives nodename as first argument and returns
347       the species name (see
348       :func:`PhyloNode.set_species_naming_function`. By default, the
349       3 first letter of nodes will be used as species identifiers.
350
351
352
353    :returns: a tree node object which represents the base of the tree.
354    """
355
356    def _get_species(self):
357        if self._speciesFunction:
358            try:
359                return self._speciesFunction(self.name)
360            except:
361                return self._speciesFunction(self)
362        else:
363            return self._species
364
365    def _set_species(self, value):
366        if self._speciesFunction:
367            pass
368        else:
369            self._species = value
370
371    # This tweak overwrites the native 'name' attribute to create a
372    # property that updates the species code every time name is
373    # changed
374
375    #: .. currentmodule:: ete3
376    #:
377    #Species code associated to the node. This property can be
378    #automatically extracted from the TreeNode.name attribute or
379    #manually set (see :func:`PhyloNode.set_species_naming_function`).
380    species = property(fget = _get_species, fset = _set_species)
381
382    def __init__(self, newick=None, alignment=None, alg_format="fasta", \
383                 sp_naming_function=_parse_species, format=0, **kargs):
384
385        # _update names?
386        self._name = "NoName"
387        self._species = "Unknown"
388        self._speciesFunction = None
389        # Caution! native __init__ has to be called after setting
390        # _speciesFunction to None!!
391        TreeNode.__init__(self, newick=newick, format=format, **kargs)
392
393        # This will be only executed after reading the whole tree,
394        # because the argument 'alignment' is not passed to the
395        # PhyloNode constructor during parsing
396        if alignment:
397            self.link_to_alignment(alignment, alg_format)
398        if newick:
399            self.set_species_naming_function(sp_naming_function)
400
401    def __repr__(self):
402        return "PhyloTree node '%s' (%s)" %(self.name, hex(self.__hash__()))
403
404    def set_species_naming_function(self, fn):
405        """
406        Sets the parsing function used to extract species name from a
407        node's name.
408
409        :argument fn: Pointer to a parsing python function that
410          receives nodename as first argument and returns the species
411          name.
412
413        ::
414
415          # Example of a parsing function to extract species names for
416          # all nodes in a given tree.
417          def parse_sp_name(node_name):
418              return node_name.split("_")[1]
419          tree.set_species_naming_function(parse_sp_name)
420
421        """
422        if fn:
423            for n in self.traverse():
424                n._speciesFunction = fn
425                if n.is_leaf():
426                    n.features.add("species")
427
428    def link_to_alignment(self, alignment, alg_format="fasta", **kwargs):
429        missing_leaves = []
430        missing_internal = []
431        if type(alignment) == SeqGroup:
432            alg = alignment
433        else:
434            alg = SeqGroup(alignment, format=alg_format, **kwargs)
435        # sets the seq of
436        for n in self.traverse():
437            try:
438                n.add_feature("sequence",alg.get_seq(n.name))
439            except KeyError:
440                if n.is_leaf():
441                    missing_leaves.append(n.name)
442                else:
443                    missing_internal.append(n.name)
444        if len(missing_leaves)>0:
445            print("Warnning: [%d] terminal nodes could not be found in the alignment." %\
446                len(missing_leaves), file=sys.stderr)
447        # Show warning of not associated internal nodes.
448        # if len(missing_internal)>0:
449        #     print >>sys.stderr, \
450        #       "Warnning: [%d] internal nodes could not be found in the alignment." %\
451        #       len(missing_leaves)
452
453    def get_species(self):
454        """ Returns the set of species covered by its partition. """
455        return set([l.species for l in self.iter_leaves()])
456
457    def iter_species(self):
458        """ Returns an iterator over the species grouped by this node. """
459        spcs = set([])
460        for l in self.iter_leaves():
461            if l.species not in spcs:
462                spcs.add(l.species)
463                yield l.species
464
465    def get_age(self, species2age):
466        """
467        Implements the phylostratigrafic method described in:
468
469        Huerta-Cepas, J., & Gabaldon, T. (2011). Assigning duplication events to
470        relative temporal scales in genome-wide studies. Bioinformatics, 27(1),
471        38-45.
472        """
473        return max([species2age[sp] for sp in self.get_species()])
474
475    def reconcile(self, species_tree):
476        """ Returns the reconcilied topology with the provided species
477        tree, and a list of evolutionary events inferred from such
478        reconciliation. """
479        return get_reconciled_tree(self, species_tree, [])
480
481    def get_my_evol_events(self, sos_thr=0.0):
482        """ Returns a list of duplication and speciation events in
483        which the current node has been involved. Scanned nodes are
484        also labeled internally as dup=True|False. You can access this
485        labels using the 'node.dup' sintaxis.
486
487        Method: the algorithm scans all nodes from the given leafName to
488        the root. Nodes are assumed to be duplications when a species
489        overlap is found between its child linages. Method is described
490        more detail in:
491
492        "The Human Phylome." Huerta-Cepas J, Dopazo H, Dopazo J, Gabaldon
493        T. Genome Biol. 2007;8(6):R109.
494        """
495        return spoverlap.get_evol_events_from_leaf(self, sos_thr=sos_thr)
496
497    def get_descendant_evol_events(self, sos_thr=0.0):
498        """ Returns a list of **all** duplication and speciation
499        events detected after this node. Nodes are assumed to be
500        duplications when a species overlap is found between its child
501        linages. Method is described more detail in:
502
503        "The Human Phylome." Huerta-Cepas J, Dopazo H, Dopazo J, Gabaldon
504        T. Genome Biol. 2007;8(6):R109.
505        """
506        return spoverlap.get_evol_events_from_root(self, sos_thr=sos_thr)
507
508    def get_farthest_oldest_leaf(self, species2age, is_leaf_fn=None):
509        """ Returns the farthest oldest leaf to the current
510        one. It requires an species2age dictionary with the age
511        estimation for all species.
512
513        :argument None is_leaf_fn: A pointer to a function that
514          receives a node instance as unique argument and returns True
515          or False. It can be used to dynamically collapse nodes, so
516          they are seen as leaves.
517
518        """
519
520        root = self.get_tree_root()
521        outgroup_dist  = 0
522        outgroup_node  = self
523        outgroup_age = 0 # self.get_age(species2age)
524
525        for leaf in root.iter_leaves(is_leaf_fn=is_leaf_fn):
526            if leaf.get_age(species2age) > outgroup_age:
527                outgroup_dist = leaf.get_distance(self)
528                outgroup_node = leaf
529                outgroup_age = species2age[leaf.get_species().pop()]
530            elif leaf.get_age(species2age) == outgroup_age:
531                dist = leaf.get_distance(self)
532                if dist>outgroup_dist:
533                    outgroup_dist  = leaf.get_distance(self)
534                    outgroup_node  = leaf
535                    outgroup_age = species2age[leaf.get_species().pop()]
536        return outgroup_node
537
538    def get_farthest_oldest_node(self, species2age):
539        """
540        .. versionadded:: 2.1
541
542        Returns the farthest oldest node (leaf or internal). The
543        difference with get_farthest_oldest_leaf() is that in this
544        function internal nodes grouping seqs from the same species
545        are collapsed.
546        """
547
548        # I use a custom is_leaf() function to collapse nodes groups
549        # seqs from the same species
550        is_leaf = lambda node: len(node.get_species())==1
551        return self.get_farthest_oldest_leaf(species2age, is_leaf_fn=is_leaf)
552
553    def get_age_balanced_outgroup(self, species2age):
554        """
555        .. versionadded:: 2.2
556
557        Returns the node better balance current tree structure
558        according to the topological age of the different leaves and
559        internal node sizes.
560
561        :param species2age: A dictionary translating from leaf names
562          into a topological age.
563
564        .. warning: This is currently an experimental method!!
565
566        """
567        root = self
568        all_seqs = set(self.get_leaf_names())
569        outgroup_dist  = 0
570        best_balance = max(species2age.values())
571        outgroup_node  = self
572        outgroup_size = 0
573
574        for leaf in root.iter_descendants():
575            leaf_seqs = set(leaf.get_leaf_names())
576            size = len(leaf_seqs)
577
578            leaf_species =[self._speciesFunction(s) for s in leaf_seqs]
579            out_species = [self._speciesFunction(s) for s in all_seqs-leaf_seqs]
580
581            leaf_age_min = min([species2age[sp] for sp in leaf_species])
582            out_age_min = min([species2age[sp] for sp in out_species])
583            leaf_age_max = max([species2age[sp] for sp in leaf_species])
584            out_age_max = max([species2age[sp] for sp in out_species])
585            leaf_age = leaf_age_max - leaf_age_min
586            out_age = out_age_max - out_age_min
587
588            age_inbalance = abs(out_age - leaf_age)
589
590            # DEBUG ONLY
591            # leaf.add_features(age_inbalance = age_inbalance, age=leaf_age)
592
593            update = False
594            if age_inbalance < best_balance:
595                update = True
596            elif age_inbalance == best_balance:
597                if size > outgroup_size:
598                    update = True
599                elif size == outgroup_size:
600                    dist = self.get_distance(leaf)
601                    outgroup_dist = self.get_distance(outgroup_node)
602                    if dist > outgroup_dist:
603                        update = True
604
605            if update:
606                best_balance = age_inbalance
607                outgroup_node = leaf
608                outgroup_size = size
609
610        return outgroup_node
611
612    def get_speciation_trees(self, map_features=None, autodetect_duplications=True,
613                             newick_only=False, target_attr='species'):
614        """
615        .. versionadded: 2.2
616
617        Calculates all possible species trees contained within a
618        duplicated gene family tree as described in `Treeko
619        <http://treeko.cgenomics.org>`_ (see `Marcet and Gabaldon,
620        2011 <http://www.ncbi.nlm.nih.gov/pubmed/21335609>`_ ).
621
622
623        :argument True autodetect_duplications: If True, duplication
624        nodes will be automatically detected using the Species Overlap
625        algorithm (:func:`PhyloNode.get_descendants_evol_events`. If
626        False, duplication nodes within the original tree are expected
627        to contain the feature "evoltype=D".
628
629        :argument None features: A list of features that should be
630        mapped from the original gene family tree to each species
631        tree subtree.
632
633        :returns: (number_of_sptrees, number_of_dups, species_tree_iterator)
634
635        """
636        t = self
637        if autodetect_duplications:
638            #n2content, n2species = t.get_node2species()
639            n2content = t.get_cached_content()
640            n2species = t.get_cached_content(store_attr=target_attr)
641            for node in n2content:
642                sp_subtotal = sum([len(n2species[_ch]) for _ch in node.children])
643                if len(n2species[node]) > 1 and len(n2species[node]) != sp_subtotal:
644                    node.add_features(evoltype="D")
645
646        sp_trees = get_subtrees(t, features=map_features, newick_only=newick_only)
647
648        return sp_trees
649
650    def __get_speciation_trees_recursive(self):
651        """ experimental and testing """
652        t = self.copy()
653        if autodetect_duplications:
654            dups = 0
655            #n2content, n2species = t.get_node2species()
656            n2content = t.get_cached_content()
657            n2species = t.get_cached_content(store_attr="species")
658
659            #print "Detecting dups"
660            for node in n2content:
661                sp_subtotal = sum([len(n2species[_ch]) for _ch in node.children])
662                if  len(n2species[node]) > 1 and len(n2species[node]) != sp_subtotal:
663                    node.add_features(evoltype="D")
664                    dups += 1
665                elif node.is_leaf():
666                    node._leaf = True
667            #print dups
668        else:
669            for node in t.iter_leaves():
670                node._leaf = True
671        subtrees = _get_subtrees_recursive(t)
672        return len(subtrees), 0, subtrees
673
674    def split_by_dups(self, autodetect_duplications=True):
675        """
676        .. versionadded: 2.2
677
678        Returns the list of all subtrees resulting from splitting
679        current tree by its duplication nodes.
680
681        :argument True autodetect_duplications: If True, duplication
682        nodes will be automatically detected using the Species Overlap
683        algorithm (:func:`PhyloNode.get_descendants_evol_events`. If
684        False, duplication nodes within the original tree are expected
685        to contain the feature "evoltype=D".
686
687        :returns: species_trees
688        """
689        try:
690            t = self.copy()
691        except Exception:
692            t = self.copy("deepcopy")
693
694        if autodetect_duplications:
695            dups = 0
696            #n2content, n2species = t.get_node2species()
697            n2content = t.get_cached_content()
698            n2species = t.get_cached_content(store_attr="species")
699
700            #print "Detecting dups"
701            for node in n2content:
702                sp_subtotal = sum([len(n2species[_ch]) for _ch in node.children])
703                if  len(n2species[node]) > 1 and len(n2species[node]) != sp_subtotal:
704                    node.add_features(evoltype="D")
705                    dups += 1
706                elif node.is_leaf():
707                    node._leaf = True
708            #print dups
709        else:
710            for node in t.iter_leaves():
711                node._leaf = True
712        sp_trees = get_subparts(t)
713        return sp_trees
714
715    def collapse_lineage_specific_expansions(self, species=None, return_copy=True):
716        """ Converts lineage specific expansion nodes into a single
717        tip node (randomly chosen from tips within the expansion).
718
719        :param None species: If supplied, only expansions matching the
720           species criteria will be pruned. When None, all expansions
721           within the tree will be processed.
722
723        """
724        if species and isinstance(species, (list, tuple)):
725            species = set(species)
726        elif species and (not isinstance(species, (set, frozenset))):
727            raise TypeError("species argument should be a set (preferred), list or tuple")
728
729        prunned = self.copy("deepcopy") if return_copy else self
730        n2sp = prunned.get_cached_content(store_attr="species")
731        n2leaves = prunned.get_cached_content()
732        is_expansion = lambda n: (len(n2sp[n])==1 and len(n2leaves[n])>1
733                                  and (species is None or species & n2sp[n]))
734        for n in prunned.get_leaves(is_leaf_fn=is_expansion):
735            repre = list(n2leaves[n])[0]
736            repre.detach()
737            if n is not prunned:
738                n.up.add_child(repre)
739                n.detach()
740            else:
741                return repre
742
743        return prunned
744
745
746    def annotate_ncbi_taxa(self, taxid_attr='species', tax2name=None, tax2track=None, tax2rank=None, dbfile=None):
747        """Add NCBI taxonomy annotation to all descendant nodes. Leaf nodes are
748        expected to contain a feature (name, by default) encoding a valid taxid
749        number.
750
751        All descendant nodes (including internal nodes) are annotated with the
752        following new features:
753
754        `Node.spname`: scientific spcies name as encoded in the NCBI taxonomy database
755
756        `Node.named_lineage`: the NCBI lineage track using scientific names
757
758        `Node.taxid`: NCBI taxid number
759
760        `Node.lineage`: same as named_lineage but using taxid codes.
761
762
763        Note that for internal nodes, NCBI information will refer to the first
764        common lineage of the grouped species.
765
766        :param name taxid_attr: the name of the feature that should be used to access the taxid number associated to each node.
767
768        :param None tax2name: A dictionary where keys are taxid
769            numbers and values are their translation into NCBI
770            scientific name. Its use is optional and allows to avoid
771            database queries when annotating many trees containing the
772            same set of taxids.
773
774        :param None tax2track: A dictionary where keys are taxid
775            numbers and values are their translation into NCBI lineage
776            tracks (taxids). Its use is optional and allows to avoid
777            database queries when annotating many trees containing the
778            same set of taxids.
779
780        :param None tax2rank: A dictionary where keys are taxid
781            numbers and values are their translation into NCBI rank
782            name. Its use is optional and allows to avoid database
783            queries when annotating many trees containing the same set
784            of taxids.
785
786        :param None dbfile : If provided, the provided file will be
787            used as a local copy of the NCBI taxonomy database.
788
789        :returns: tax2name (a dictionary translating taxid numbers
790            into scientific name), tax2lineage (a dictionary
791            translating taxid numbers into their corresponding NCBI
792            lineage track) and tax2rank (a dictionary translating
793            taxid numbers into rank names).
794
795        """
796
797        ncbi = NCBITaxa(dbfile=dbfile)
798        return ncbi.annotate_tree(self, taxid_attr=taxid_attr, tax2name=tax2name, tax2track=tax2track, tax2rank=tax2rank)
799
800
801    def ncbi_compare(self, autodetect_duplications=True, cached_content=None):
802        if not cached_content:
803            cached_content = self.get_cached_content()
804        cached_species = set([n.species for n in cached_content[self]])
805
806        if len(cached_species) != len(cached_content[self]):
807            print(cached_species)
808            ntrees, ndups, target_trees = self.get_speciation_trees(autodetect_duplications=autodetect_duplications, map_features=["taxid"])
809        else:
810            target_trees = [self]
811
812
813        ncbi = NCBITaxa()
814        for t in target_trees:
815            ncbi.get_broken_branches(t, cached_content)
816
817
818
819#: .. currentmodule:: ete3
820#
821PhyloTree = PhyloNode
822