1from __future__ import absolute_import 2# #START_LICENSE########################################################### 3# 4# 5# This file is part of the Environment for Tree Exploration program 6# (ETE). http://etetoolkit.org 7# 8# ETE is free software: you can redistribute it and/or modify it 9# under the terms of the GNU General Public License as published by 10# the Free Software Foundation, either version 3 of the License, or 11# (at your option) any later version. 12# 13# ETE is distributed in the hope that it will be useful, but WITHOUT 14# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 15# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public 16# License for more details. 17# 18# You should have received a copy of the GNU General Public License 19# along with ETE. If not, see <http://www.gnu.org/licenses/>. 20# 21# 22# ABOUT THE ETE PACKAGE 23# ===================== 24# 25# ETE is distributed under the GPL copyleft license (2008-2015). 26# 27# If you make use of ETE in published work, please cite: 28# 29# Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon. 30# ETE: a python Environment for Tree Exploration. Jaime BMC 31# Bioinformatics 2010,:24doi:10.1186/1471-2105-11-24 32# 33# Note that extra references to the specific methods implemented in 34# the toolkit may be available in the documentation. 35# 36# More info at http://etetoolkit.org. Contact: huerta@embl.de 37# 38# 39# #END_LICENSE############################################################# 40 41from .evolevents import EvolEvent 42 43__all__ = ["get_evol_events_from_leaf", "get_evol_events_from_root"] 44 45def get_evol_events_from_leaf(node, sos_thr=0.0): 46 """ Returns a list of duplication and speciation events in 47 which the current node has been involved. Scanned nodes are 48 also labeled internally as dup=True|False. You can access this 49 labels using the 'node.dup' sintaxis. 50 51 Method: the algorithm scans all nodes from the given leafName to 52 the root. Nodes are assumed to be duplications when a species 53 overlap is found between its child linages. Method is described 54 more detail in: 55 56 "The Human Phylome." Huerta-Cepas J, Dopazo H, Dopazo J, Gabaldon 57 T. Genome Biol. 2007;8(6):R109. 58 """ 59 # Get the tree's root 60 root = node.get_tree_root() 61 62 # Checks that is actually rooted 63 outgroups = root.get_children() 64 if len(outgroups) != 2: 65 raise TypeError("Tree is not rooted") 66 67 # Cautch the smaller outgroup (will be stored as the tree 68 # outgroup) 69 o1 = set([n.name for n in outgroups[0].get_leaves()]) 70 o2 = set([n.name for n in outgroups[1].get_leaves()]) 71 72 if len(o2)<len(o1): 73 smaller_outg = outgroups[1] 74 else: 75 smaller_outg = outgroups[0] 76 77 78 # Prepare to browse tree from leaf to root 79 all_events = [] 80 current = node 81 ref_spcs = node.species 82 sister_leaves = set([]) 83 browsed_spcs = set([current.species]) 84 browsed_leaves = set([current]) 85 # get family Size 86 fSize = len([n for n in root.get_leaves() if n.species == ref_spcs]) 87 88 # Clean previous analysis 89 for n in root.get_descendants()+[root]: 90 n.del_feature("evoltype") 91 92 while current.up: 93 # distances control (0.0 distance check) 94 d = 0 95 for s in current.get_sisters(): 96 for leaf in s.get_leaves(): 97 d += current.get_distance(leaf) 98 sister_leaves.add(leaf) 99 # Process sister node only if there is any new sequence. 100 # (previene dupliaciones por nombres repetidos) 101 sister_leaves = sister_leaves.difference(browsed_leaves) 102 if len(sister_leaves)==0: 103 current = current.up 104 continue 105 # Gets species at both sides of event 106 sister_spcs = set([n.species for n in sister_leaves]) 107 overlaped_spces = browsed_spcs & sister_spcs 108 all_spcs = browsed_spcs | sister_spcs 109 score = float(len(overlaped_spces))/len(all_spcs) 110 # Creates a new evolEvent 111 event = EvolEvent() 112 event.fam_size = fSize 113 event.seed = node.name 114 # event.e_newick = current.up.get_newick() # high mem usage!! 115 event.sos = score 116 event.outgroup = smaller_outg.name 117 # event.allseqs = set(current.up.get_leaf_names()) 118 event.in_seqs = set([n.name for n in browsed_leaves]) 119 event.out_seqs = set([n.name for n in sister_leaves]) 120 event.inparalogs = set([n.name for n in browsed_leaves if n.species == ref_spcs]) 121 122 # If species overlap: duplication 123 if score > sos_thr:# and d > 0.0: Removed branch control. 124 event.node = current.up 125 event.etype = "D" 126 event.outparalogs = set([n.name for n in sister_leaves if n.species == ref_spcs]) 127 event.orthologs = set([]) 128 current.up.add_feature("evoltype","D") 129 all_events.append(event) 130 131 # If NO species overlap: speciation 132 elif score <= sos_thr: 133 event.node = current.up 134 event.etype = "S" 135 event.orthologs = set([n.name for n in sister_leaves if n.species != ref_spcs]) 136 event.outparalogs = set([]) 137 current.up.add_feature("evoltype","S") 138 all_events.append(event) 139 140 # Updates browsed species 141 browsed_spcs |= sister_spcs 142 browsed_leaves |= sister_leaves 143 sister_leaves = set([]) 144 # And keep ascending 145 current = current.up 146 return all_events 147 148def get_evol_events_from_root(node, sos_thr): 149 """ Returns a list of **all** duplication and speciation 150 events detected after this node. Nodes are assumed to be 151 duplications when a species overlap is found between its child 152 linages. Method is described more detail in: 153 154 "The Human Phylome." Huerta-Cepas J, Dopazo H, Dopazo J, Gabaldon 155 T. Genome Biol. 2007;8(6):R109. 156 """ 157 158 # Get the tree's root 159 root = node.get_tree_root() 160 161 # Checks that is actually rooted 162 outgroups = root.get_children() 163 if len(outgroups) != 2: 164 raise TypeError("Tree is not rooted") 165 166 # Cautch the smaller outgroup (will be stored as the tree outgroup) 167 o1 = set([n.name for n in outgroups[0].get_leaves()]) 168 o2 = set([n.name for n in outgroups[1].get_leaves()]) 169 170 171 if len(o2)<len(o1): 172 smaller_outg = outgroups[1] 173 else: 174 smaller_outg = outgroups[0] 175 176 # Get family size 177 fSize = len( [n for n in root.get_leaves()] ) 178 179 # Clean data from previous analyses 180 for n in root.get_descendants()+[root]: 181 n.del_feature("evoltype") 182 183 # Gets Prepared to browse the tree from root to leaves 184 to_visit = [] 185 current = root 186 all_events = [] 187 while current: 188 # Gets childs and appends them to the To_visit list 189 childs = current.get_children() 190 to_visit += childs 191 if len(childs)>2: 192 raise TypeError("nodes are expected to have two childs.") 193 elif len(childs)==0: 194 pass # leaf 195 else: 196 # Get leaves and species at both sides of event 197 sideA_leaves= set([n for n in childs[0].get_leaves()]) 198 sideB_leaves= set([n for n in childs[1].get_leaves()]) 199 sideA_spcs = set([n.species for n in childs[0].get_leaves()]) 200 sideB_spcs = set([n.species for n in childs[1].get_leaves()]) 201 # Calculates species overlap 202 overlaped_spcs = sideA_spcs & sideB_spcs 203 all_spcs = sideA_spcs | sideB_spcs 204 score = float(len(overlaped_spcs))/len(all_spcs) 205 206 # Creates a new evolEvent 207 event = EvolEvent() 208 event.fam_size = fSize 209 event.branch_supports = [current.support, current.children[0].support, current.children[1].support] 210 # event.seed = leafName 211 # event.e_newick = current.up.get_newick() # high mem usage!! 212 event.sos = score 213 event.outgroup_spcs = smaller_outg.get_species() 214 event.in_seqs = set([n.name for n in sideA_leaves]) 215 event.out_seqs = set([n.name for n in sideB_leaves]) 216 event.inparalogs = set([n.name for n in sideA_leaves]) 217 # If species overlap: duplication 218 if score >sos_thr: 219 event.node = current 220 event.etype = "D" 221 event.outparalogs = set([n.name for n in sideB_leaves]) 222 event.orthologs = set([]) 223 current.add_feature("evoltype","D") 224 # If NO species overlap: speciation 225 else: 226 event.node = current 227 event.etype = "S" 228 event.orthologs = set([n.name for n in sideB_leaves]) 229 event.outparalogs = set([]) 230 current.add_feature("evoltype","S") 231 232 all_events.append(event) 233 # Keep visiting nodes 234 try: 235 current = to_visit.pop(0) 236 except IndexError: 237 current = None 238 return all_events 239