1# #START_LICENSE########################################################### 2# 3# 4# This file is part of the Environment for Tree Exploration program 5# (ETE). http://etetoolkit.org 6# 7# ETE is free software: you can redistribute it and/or modify it 8# under the terms of the GNU General Public License as published by 9# the Free Software Foundation, either version 3 of the License, or 10# (at your option) any later version. 11# 12# ETE is distributed in the hope that it will be useful, but WITHOUT 13# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 14# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public 15# License for more details. 16# 17# You should have received a copy of the GNU General Public License 18# along with ETE. If not, see <http://www.gnu.org/licenses/>. 19# 20# 21# ABOUT THE ETE PACKAGE 22# ===================== 23# 24# ETE is distributed under the GPL copyleft license (2008-2015). 25# 26# If you make use of ETE in published work, please cite: 27# 28# Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon. 29# ETE: a python Environment for Tree Exploration. Jaime BMC 30# Bioinformatics 2010,:24doi:10.1186/1471-2105-11-24 31# 32# Note that extra references to the specific methods implemented in 33# the toolkit may be available in the documentation. 34# 35# More info at http://etetoolkit.org. Contact: huerta@embl.de 36# 37# 38# #END_LICENSE############################################################# 39 40 41""" 42This module defines the PhyloNode dataytype to manage phylogenetic 43trees. It inheritates the coretype TreeNode and add some special 44features to the the node instances. 45""" 46from __future__ import absolute_import 47from __future__ import print_function 48 49import sys 50import os 51import re 52import itertools 53from collections import defaultdict 54from .. import TreeNode, SeqGroup, NCBITaxa 55from .reconciliation import get_reconciled_tree 56from . import spoverlap 57 58__all__ = ["PhyloNode", "PhyloTree"] 59 60def _parse_species(name): 61 return name[:3] 62 63def is_dup(n): 64 return getattr(n, "evoltype", None) == "D" 65 66def get_subtrees(tree, full_copy=False, features=None, newick_only=False): 67 """Calculate all possible species trees within a gene tree. I 68 tested several recursive and iterative approaches to do it and 69 this is the most efficient way I found. The method is now fast and 70 light enough to deal with very large gene trees, and it scales 71 linearly instead of exponentially. For instance, a tree with ~8000 72 nodes, ~100 species and ~400 duplications returns ~10,000 sptrees 73 that could be loaded in few minutes. 74 75 To avoid memory overloads, this function returns a tuple containing the 76 total number of trees, number of duplication events, and an iterator for the 77 species trees. Real trees are not actually computed until the iterator is 78 first accessed. This allows to filter out cases producing astronomic numbers 79 of sptrees. 80 81 """ 82 ntrees, ndups = calc_subtrees(tree) 83 return ntrees, ndups, _get_subtrees(tree, full_copy, features, newick_only) 84 85def _get_subtrees(tree, full_copy=False, features=None, newick_only=False): 86 # First I need to precalculate all the species trees in tuple (newick) format 87 nid = 0 88 n2nid = {} 89 nid2node = {} 90 n2subtrees = defaultdict(list) 91 for n in tree.traverse("postorder"): 92 n2nid[n] = nid 93 nid2node[nid] = n 94 nid += 1 95 if n.children: 96 if is_dup(n): 97 subtrees = [] 98 for ch in n.children: 99 subtrees.extend(n2subtrees[n2nid[ch]]) 100 else: 101 subtrees = tuple([val for val in 102 itertools.product(n2subtrees[n2nid[n.children[0]]], 103 n2subtrees[n2nid[n.children[1]]])]) 104 else: 105 subtrees = tuple([n2nid[n]]) 106 107 n2subtrees[n2nid[n]] = subtrees 108 for ch in n.children: 109 del n2subtrees[n2nid[ch]] 110 111 sp_trees = n2subtrees[n2nid[tree]] 112 113 # Second, I yield a tree per iteration in newick or ETE format 114 features = set(features) if features else set() 115 features.update(["name"]) 116 117 def _nodereplacer(match): 118 pre, b, post = match.groups() 119 pre = '' if not pre else pre 120 post = '' if not post else post 121 node = nid2node[int(b)] 122 fstring = "" 123 if features: 124 fstring = "".join(["[&&NHX:", 125 ':'.join(["%s=%s" %(f, getattr(node, f)) 126 for f in features if hasattr(node, f)]) 127 , "]"]) 128 129 return ''.join([pre, node.name, fstring, post]) 130 131 if newick_only: 132 id_match = re.compile("([^0-9])?(\d+)([^0-9])?") 133 for nw in sp_trees: 134 yield re.sub(id_match, _nodereplacer, str(nw)+";") 135 else: 136 for nw in sp_trees: 137 # I take advantage from the fact that I generated the subtrees 138 # using tuples, so str representation is actually a newick :) 139 t = PhyloTree(str(nw)+";") 140 # Map features from original tree 141 for leaf in t.iter_leaves(): 142 _nid = int(leaf.name) 143 for f in features: 144 leaf.add_feature(f, getattr(nid2node[_nid], f)) 145 yield t 146 147def calc_subtrees(tree): 148 ''' 149 Computes the total number of species trees that TreeKO algorithm would produce for a given gene tree 150 151 returns: ntrees, ndups 152 ''' 153 n2subtrees = {} 154 dups = 0 155 for n in tree.traverse("postorder"): 156 if n.children: 157 if is_dup(n): 158 dups += 1 159 subtrees = 0 160 for ch in n.children: 161 subtrees += n2subtrees[ch] 162 else: 163 subtrees = n2subtrees[n.children[0]] * n2subtrees[n.children[1]] 164 else: 165 subtrees = 1 166 n2subtrees[n] = subtrees 167 return n2subtrees[tree], dups 168 169def iter_sptrees(sptrees, nid2node, features=None, newick_only=False): 170 """ Loads and map the species trees returned by get_subtrees""" 171 172 features = set(features) if features else set() 173 features.update(["name"]) 174 175 def _nodereplacer(match): 176 pre, b, post = match.groups() 177 node = nid2node[int(b)] 178 fstring = "" 179 if features: 180 fstring = "".join(["[&&NHX:", 181 ','.join(["%s=%s" %(f, getattr(node, f)) 182 for f in features if hasattr(node, f)]) 183 , "]"]) 184 185 return ''.join([pre, node.name, fstring, post]) 186 187 if newick_only: 188 id_match = re.compile("([^0-9])(\d+)([^0-9])") 189 for nw in sptrees: 190 yield re.sub(id_match, _nodereplacer, str(nw)+";") 191 else: 192 for nw in sptrees: 193 # I take advantage from the fact that I generated the subtrees 194 # using tuples, so str representation is actually a newick :) 195 t = PhyloTree(str(nw)+";") 196 # Map features from original tree 197 for leaf in t.iter_leaves(): 198 _nid = int(leaf.name) 199 for f in features: 200 leaf.add_feature(f, getattr(nid2node[_nid], f)) 201 yield t 202 203def _get_subtrees_recursive(node, full_copy=True): 204 if is_dup(node): 205 sp_trees = [] 206 for ch in node.children: 207 sp_trees.extend(_get_subtrees_recursive(ch, full_copy=full_copy)) 208 return sp_trees 209 210 # saves a list of duplication nodes under current node 211 dups = [] 212 for _n in node.iter_leaves(is_leaf_fn=is_dup): 213 if is_dup(_n): 214 dups.append(_n) 215 216 if dups: 217 # detach inner duplication nodes and stores their anchor point 218 subtrees = [] 219 for dp in dups: 220 # The real node to attach sibling subtress 221 anchor = dp.up 222 dp.detach() 223 224 duptrees = [] 225 #get all sibling sptrees in each side of the 226 #duplication. Each subtree is pointed to its anchor 227 for ch in dp.children: 228 for subt in _get_subtrees_recursive(ch, full_copy=full_copy): 229 if not full_copy: 230 subt = node.__class__(subt) 231 subt.up = anchor 232 duptrees.append(subt) 233 234 #all posible sptrees under this duplication are stored 235 subtrees.append(duptrees) 236 237 # Generates all combinations of subtrees in sibling duplications 238 sp_trees = [] 239 for comb in itertools.product(*subtrees): 240 #each subtree is attached to its anchor point and make a copy 241 #of the final sp tree 242 for subt in comb: 243 #anchor = subt2anchor[subt] 244 if subt.up: 245 subt.up.children.append(subt) 246 #print subt.up 247 else: 248 sp_trees.append(subt) 249 if full_copy: 250 back_up = node.up 251 node.up = None 252 _node = node.copy() 253 node.up = back_up 254 else: 255 _node = node.write(format=9, features=["name", "evoltype"]) 256 sp_trees.append(_node) 257 # Clear current node 258 for subt in comb: 259 subt.up.children.pop(-1) 260 else: 261 if full_copy: 262 back_up = node.up 263 node.up = None 264 _node = node.copy() 265 node.up = back_up 266 else: 267 _node = node.write(format=9, features=["name", "evoltype"]) 268 #node.detach() 269 sp_trees = [_node] 270 271 return sp_trees 272 273def get_subparts(n): 274 def is_dup(n): 275 return getattr(n, "evoltype", None) == "D" 276 277 subtrees = [] 278 if is_dup(n): 279 for ch in n.get_children(): 280 ch.detach() 281 subtrees.extend(get_subparts(ch)) 282 else: 283 to_visit = [] 284 for _n in n.iter_leaves(is_leaf_fn=is_dup): 285 if is_dup(_n): 286 to_visit.append(_n) 287 288 for _n in to_visit: 289 _n.detach() 290 291 freaks = [_n for _n in n.iter_descendants() if 292 len(_n.children)==1 or (not hasattr(_n, "_leaf") and not _n.children)] 293 for s in freaks: 294 s.delete(prevent_nondicotomic=True) 295 296 # Clean node structure to prevent nodes with only one child 297 while len(n.children) == 1: 298 n = n.children[0] 299 n.detach() 300 301 if not n.children and not hasattr(n, "_leaf"): 302 pass 303 else: 304 subtrees.append(n) 305 306 for _n in to_visit: 307 subtrees.extend(get_subparts(_n)) 308 309 return subtrees 310 311 312class PhyloNode(TreeNode): 313 """ 314 .. currentmodule:: ete3 315 Extends the standard :class:`TreeNode` instance. It adds 316 specific attributes and methods to work with phylogentic trees. 317 318 :argument newick: Path to the file containing the tree or, alternatively, 319 the text string containing the same information. 320 321 :argument alignment: file containing a multiple sequence alignment. 322 323 :argument alg_format: "fasta", "phylip" or "iphylip" (interleaved) 324 325 :argument format: sub-newick format 326 327 .. table:: 328 329 ====== ============================================== 330 FORMAT DESCRIPTION 331 ====== ============================================== 332 0 flexible with support values 333 1 flexible with internal node names 334 2 all branches + leaf names + internal supports 335 3 all branches + all names 336 4 leaf branches + leaf names 337 5 internal and leaf branches + leaf names 338 6 internal branches + leaf names 339 7 leaf branches + all names 340 8 all names 341 9 leaf names 342 100 topology only 343 ====== ============================================== 344 345 :argument sp_naming_function: Pointer to a parsing python 346 function that receives nodename as first argument and returns 347 the species name (see 348 :func:`PhyloNode.set_species_naming_function`. By default, the 349 3 first letter of nodes will be used as species identifiers. 350 351 352 353 :returns: a tree node object which represents the base of the tree. 354 """ 355 356 def _get_species(self): 357 if self._speciesFunction: 358 try: 359 return self._speciesFunction(self.name) 360 except: 361 return self._speciesFunction(self) 362 else: 363 return self._species 364 365 def _set_species(self, value): 366 if self._speciesFunction: 367 pass 368 else: 369 self._species = value 370 371 # This tweak overwrites the native 'name' attribute to create a 372 # property that updates the species code every time name is 373 # changed 374 375 #: .. currentmodule:: ete3 376 #: 377 #Species code associated to the node. This property can be 378 #automatically extracted from the TreeNode.name attribute or 379 #manually set (see :func:`PhyloNode.set_species_naming_function`). 380 species = property(fget = _get_species, fset = _set_species) 381 382 def __init__(self, newick=None, alignment=None, alg_format="fasta", \ 383 sp_naming_function=_parse_species, format=0, **kargs): 384 385 # _update names? 386 self._name = "NoName" 387 self._species = "Unknown" 388 self._speciesFunction = None 389 # Caution! native __init__ has to be called after setting 390 # _speciesFunction to None!! 391 TreeNode.__init__(self, newick=newick, format=format, **kargs) 392 393 # This will be only executed after reading the whole tree, 394 # because the argument 'alignment' is not passed to the 395 # PhyloNode constructor during parsing 396 if alignment: 397 self.link_to_alignment(alignment, alg_format) 398 if newick: 399 self.set_species_naming_function(sp_naming_function) 400 401 def __repr__(self): 402 return "PhyloTree node '%s' (%s)" %(self.name, hex(self.__hash__())) 403 404 def set_species_naming_function(self, fn): 405 """ 406 Sets the parsing function used to extract species name from a 407 node's name. 408 409 :argument fn: Pointer to a parsing python function that 410 receives nodename as first argument and returns the species 411 name. 412 413 :: 414 415 # Example of a parsing function to extract species names for 416 # all nodes in a given tree. 417 def parse_sp_name(node_name): 418 return node_name.split("_")[1] 419 tree.set_species_naming_function(parse_sp_name) 420 421 """ 422 if fn: 423 for n in self.traverse(): 424 n._speciesFunction = fn 425 if n.is_leaf(): 426 n.features.add("species") 427 428 def link_to_alignment(self, alignment, alg_format="fasta", **kwargs): 429 missing_leaves = [] 430 missing_internal = [] 431 if type(alignment) == SeqGroup: 432 alg = alignment 433 else: 434 alg = SeqGroup(alignment, format=alg_format, **kwargs) 435 # sets the seq of 436 for n in self.traverse(): 437 try: 438 n.add_feature("sequence",alg.get_seq(n.name)) 439 except KeyError: 440 if n.is_leaf(): 441 missing_leaves.append(n.name) 442 else: 443 missing_internal.append(n.name) 444 if len(missing_leaves)>0: 445 print("Warnning: [%d] terminal nodes could not be found in the alignment." %\ 446 len(missing_leaves), file=sys.stderr) 447 # Show warning of not associated internal nodes. 448 # if len(missing_internal)>0: 449 # print >>sys.stderr, \ 450 # "Warnning: [%d] internal nodes could not be found in the alignment." %\ 451 # len(missing_leaves) 452 453 def get_species(self): 454 """ Returns the set of species covered by its partition. """ 455 return set([l.species for l in self.iter_leaves()]) 456 457 def iter_species(self): 458 """ Returns an iterator over the species grouped by this node. """ 459 spcs = set([]) 460 for l in self.iter_leaves(): 461 if l.species not in spcs: 462 spcs.add(l.species) 463 yield l.species 464 465 def get_age(self, species2age): 466 """ 467 Implements the phylostratigrafic method described in: 468 469 Huerta-Cepas, J., & Gabaldon, T. (2011). Assigning duplication events to 470 relative temporal scales in genome-wide studies. Bioinformatics, 27(1), 471 38-45. 472 """ 473 return max([species2age[sp] for sp in self.get_species()]) 474 475 def reconcile(self, species_tree): 476 """ Returns the reconcilied topology with the provided species 477 tree, and a list of evolutionary events inferred from such 478 reconciliation. """ 479 return get_reconciled_tree(self, species_tree, []) 480 481 def get_my_evol_events(self, sos_thr=0.0): 482 """ Returns a list of duplication and speciation events in 483 which the current node has been involved. Scanned nodes are 484 also labeled internally as dup=True|False. You can access this 485 labels using the 'node.dup' sintaxis. 486 487 Method: the algorithm scans all nodes from the given leafName to 488 the root. Nodes are assumed to be duplications when a species 489 overlap is found between its child linages. Method is described 490 more detail in: 491 492 "The Human Phylome." Huerta-Cepas J, Dopazo H, Dopazo J, Gabaldon 493 T. Genome Biol. 2007;8(6):R109. 494 """ 495 return spoverlap.get_evol_events_from_leaf(self, sos_thr=sos_thr) 496 497 def get_descendant_evol_events(self, sos_thr=0.0): 498 """ Returns a list of **all** duplication and speciation 499 events detected after this node. Nodes are assumed to be 500 duplications when a species overlap is found between its child 501 linages. Method is described more detail in: 502 503 "The Human Phylome." Huerta-Cepas J, Dopazo H, Dopazo J, Gabaldon 504 T. Genome Biol. 2007;8(6):R109. 505 """ 506 return spoverlap.get_evol_events_from_root(self, sos_thr=sos_thr) 507 508 def get_farthest_oldest_leaf(self, species2age, is_leaf_fn=None): 509 """ Returns the farthest oldest leaf to the current 510 one. It requires an species2age dictionary with the age 511 estimation for all species. 512 513 :argument None is_leaf_fn: A pointer to a function that 514 receives a node instance as unique argument and returns True 515 or False. It can be used to dynamically collapse nodes, so 516 they are seen as leaves. 517 518 """ 519 520 root = self.get_tree_root() 521 outgroup_dist = 0 522 outgroup_node = self 523 outgroup_age = 0 # self.get_age(species2age) 524 525 for leaf in root.iter_leaves(is_leaf_fn=is_leaf_fn): 526 if leaf.get_age(species2age) > outgroup_age: 527 outgroup_dist = leaf.get_distance(self) 528 outgroup_node = leaf 529 outgroup_age = species2age[leaf.get_species().pop()] 530 elif leaf.get_age(species2age) == outgroup_age: 531 dist = leaf.get_distance(self) 532 if dist>outgroup_dist: 533 outgroup_dist = leaf.get_distance(self) 534 outgroup_node = leaf 535 outgroup_age = species2age[leaf.get_species().pop()] 536 return outgroup_node 537 538 def get_farthest_oldest_node(self, species2age): 539 """ 540 .. versionadded:: 2.1 541 542 Returns the farthest oldest node (leaf or internal). The 543 difference with get_farthest_oldest_leaf() is that in this 544 function internal nodes grouping seqs from the same species 545 are collapsed. 546 """ 547 548 # I use a custom is_leaf() function to collapse nodes groups 549 # seqs from the same species 550 is_leaf = lambda node: len(node.get_species())==1 551 return self.get_farthest_oldest_leaf(species2age, is_leaf_fn=is_leaf) 552 553 def get_age_balanced_outgroup(self, species2age): 554 """ 555 .. versionadded:: 2.2 556 557 Returns the node better balance current tree structure 558 according to the topological age of the different leaves and 559 internal node sizes. 560 561 :param species2age: A dictionary translating from leaf names 562 into a topological age. 563 564 .. warning: This is currently an experimental method!! 565 566 """ 567 root = self 568 all_seqs = set(self.get_leaf_names()) 569 outgroup_dist = 0 570 best_balance = max(species2age.values()) 571 outgroup_node = self 572 outgroup_size = 0 573 574 for leaf in root.iter_descendants(): 575 leaf_seqs = set(leaf.get_leaf_names()) 576 size = len(leaf_seqs) 577 578 leaf_species =[self._speciesFunction(s) for s in leaf_seqs] 579 out_species = [self._speciesFunction(s) for s in all_seqs-leaf_seqs] 580 581 leaf_age_min = min([species2age[sp] for sp in leaf_species]) 582 out_age_min = min([species2age[sp] for sp in out_species]) 583 leaf_age_max = max([species2age[sp] for sp in leaf_species]) 584 out_age_max = max([species2age[sp] for sp in out_species]) 585 leaf_age = leaf_age_max - leaf_age_min 586 out_age = out_age_max - out_age_min 587 588 age_inbalance = abs(out_age - leaf_age) 589 590 # DEBUG ONLY 591 # leaf.add_features(age_inbalance = age_inbalance, age=leaf_age) 592 593 update = False 594 if age_inbalance < best_balance: 595 update = True 596 elif age_inbalance == best_balance: 597 if size > outgroup_size: 598 update = True 599 elif size == outgroup_size: 600 dist = self.get_distance(leaf) 601 outgroup_dist = self.get_distance(outgroup_node) 602 if dist > outgroup_dist: 603 update = True 604 605 if update: 606 best_balance = age_inbalance 607 outgroup_node = leaf 608 outgroup_size = size 609 610 return outgroup_node 611 612 def get_speciation_trees(self, map_features=None, autodetect_duplications=True, 613 newick_only=False, target_attr='species'): 614 """ 615 .. versionadded: 2.2 616 617 Calculates all possible species trees contained within a 618 duplicated gene family tree as described in `Treeko 619 <http://treeko.cgenomics.org>`_ (see `Marcet and Gabaldon, 620 2011 <http://www.ncbi.nlm.nih.gov/pubmed/21335609>`_ ). 621 622 623 :argument True autodetect_duplications: If True, duplication 624 nodes will be automatically detected using the Species Overlap 625 algorithm (:func:`PhyloNode.get_descendants_evol_events`. If 626 False, duplication nodes within the original tree are expected 627 to contain the feature "evoltype=D". 628 629 :argument None features: A list of features that should be 630 mapped from the original gene family tree to each species 631 tree subtree. 632 633 :returns: (number_of_sptrees, number_of_dups, species_tree_iterator) 634 635 """ 636 t = self 637 if autodetect_duplications: 638 #n2content, n2species = t.get_node2species() 639 n2content = t.get_cached_content() 640 n2species = t.get_cached_content(store_attr=target_attr) 641 for node in n2content: 642 sp_subtotal = sum([len(n2species[_ch]) for _ch in node.children]) 643 if len(n2species[node]) > 1 and len(n2species[node]) != sp_subtotal: 644 node.add_features(evoltype="D") 645 646 sp_trees = get_subtrees(t, features=map_features, newick_only=newick_only) 647 648 return sp_trees 649 650 def __get_speciation_trees_recursive(self): 651 """ experimental and testing """ 652 t = self.copy() 653 if autodetect_duplications: 654 dups = 0 655 #n2content, n2species = t.get_node2species() 656 n2content = t.get_cached_content() 657 n2species = t.get_cached_content(store_attr="species") 658 659 #print "Detecting dups" 660 for node in n2content: 661 sp_subtotal = sum([len(n2species[_ch]) for _ch in node.children]) 662 if len(n2species[node]) > 1 and len(n2species[node]) != sp_subtotal: 663 node.add_features(evoltype="D") 664 dups += 1 665 elif node.is_leaf(): 666 node._leaf = True 667 #print dups 668 else: 669 for node in t.iter_leaves(): 670 node._leaf = True 671 subtrees = _get_subtrees_recursive(t) 672 return len(subtrees), 0, subtrees 673 674 def split_by_dups(self, autodetect_duplications=True): 675 """ 676 .. versionadded: 2.2 677 678 Returns the list of all subtrees resulting from splitting 679 current tree by its duplication nodes. 680 681 :argument True autodetect_duplications: If True, duplication 682 nodes will be automatically detected using the Species Overlap 683 algorithm (:func:`PhyloNode.get_descendants_evol_events`. If 684 False, duplication nodes within the original tree are expected 685 to contain the feature "evoltype=D". 686 687 :returns: species_trees 688 """ 689 try: 690 t = self.copy() 691 except Exception: 692 t = self.copy("deepcopy") 693 694 if autodetect_duplications: 695 dups = 0 696 #n2content, n2species = t.get_node2species() 697 n2content = t.get_cached_content() 698 n2species = t.get_cached_content(store_attr="species") 699 700 #print "Detecting dups" 701 for node in n2content: 702 sp_subtotal = sum([len(n2species[_ch]) for _ch in node.children]) 703 if len(n2species[node]) > 1 and len(n2species[node]) != sp_subtotal: 704 node.add_features(evoltype="D") 705 dups += 1 706 elif node.is_leaf(): 707 node._leaf = True 708 #print dups 709 else: 710 for node in t.iter_leaves(): 711 node._leaf = True 712 sp_trees = get_subparts(t) 713 return sp_trees 714 715 def collapse_lineage_specific_expansions(self, species=None, return_copy=True): 716 """ Converts lineage specific expansion nodes into a single 717 tip node (randomly chosen from tips within the expansion). 718 719 :param None species: If supplied, only expansions matching the 720 species criteria will be pruned. When None, all expansions 721 within the tree will be processed. 722 723 """ 724 if species and isinstance(species, (list, tuple)): 725 species = set(species) 726 elif species and (not isinstance(species, (set, frozenset))): 727 raise TypeError("species argument should be a set (preferred), list or tuple") 728 729 prunned = self.copy("deepcopy") if return_copy else self 730 n2sp = prunned.get_cached_content(store_attr="species") 731 n2leaves = prunned.get_cached_content() 732 is_expansion = lambda n: (len(n2sp[n])==1 and len(n2leaves[n])>1 733 and (species is None or species & n2sp[n])) 734 for n in prunned.get_leaves(is_leaf_fn=is_expansion): 735 repre = list(n2leaves[n])[0] 736 repre.detach() 737 if n is not prunned: 738 n.up.add_child(repre) 739 n.detach() 740 else: 741 return repre 742 743 return prunned 744 745 746 def annotate_ncbi_taxa(self, taxid_attr='species', tax2name=None, tax2track=None, tax2rank=None, dbfile=None): 747 """Add NCBI taxonomy annotation to all descendant nodes. Leaf nodes are 748 expected to contain a feature (name, by default) encoding a valid taxid 749 number. 750 751 All descendant nodes (including internal nodes) are annotated with the 752 following new features: 753 754 `Node.spname`: scientific spcies name as encoded in the NCBI taxonomy database 755 756 `Node.named_lineage`: the NCBI lineage track using scientific names 757 758 `Node.taxid`: NCBI taxid number 759 760 `Node.lineage`: same as named_lineage but using taxid codes. 761 762 763 Note that for internal nodes, NCBI information will refer to the first 764 common lineage of the grouped species. 765 766 :param name taxid_attr: the name of the feature that should be used to access the taxid number associated to each node. 767 768 :param None tax2name: A dictionary where keys are taxid 769 numbers and values are their translation into NCBI 770 scientific name. Its use is optional and allows to avoid 771 database queries when annotating many trees containing the 772 same set of taxids. 773 774 :param None tax2track: A dictionary where keys are taxid 775 numbers and values are their translation into NCBI lineage 776 tracks (taxids). Its use is optional and allows to avoid 777 database queries when annotating many trees containing the 778 same set of taxids. 779 780 :param None tax2rank: A dictionary where keys are taxid 781 numbers and values are their translation into NCBI rank 782 name. Its use is optional and allows to avoid database 783 queries when annotating many trees containing the same set 784 of taxids. 785 786 :param None dbfile : If provided, the provided file will be 787 used as a local copy of the NCBI taxonomy database. 788 789 :returns: tax2name (a dictionary translating taxid numbers 790 into scientific name), tax2lineage (a dictionary 791 translating taxid numbers into their corresponding NCBI 792 lineage track) and tax2rank (a dictionary translating 793 taxid numbers into rank names). 794 795 """ 796 797 ncbi = NCBITaxa(dbfile=dbfile) 798 return ncbi.annotate_tree(self, taxid_attr=taxid_attr, tax2name=tax2name, tax2track=tax2track, tax2rank=tax2rank) 799 800 801 def ncbi_compare(self, autodetect_duplications=True, cached_content=None): 802 if not cached_content: 803 cached_content = self.get_cached_content() 804 cached_species = set([n.species for n in cached_content[self]]) 805 806 if len(cached_species) != len(cached_content[self]): 807 print(cached_species) 808 ntrees, ndups, target_trees = self.get_speciation_trees(autodetect_duplications=autodetect_duplications, map_features=["taxid"]) 809 else: 810 target_trees = [self] 811 812 813 ncbi = NCBITaxa() 814 for t in target_trees: 815 ncbi.get_broken_branches(t, cached_content) 816 817 818 819#: .. currentmodule:: ete3 820# 821PhyloTree = PhyloNode 822