1from __future__ import absolute_import
2# #START_LICENSE###########################################################
3#
4#
5# This file is part of the Environment for Tree Exploration program
6# (ETE).  http://etetoolkit.org
7#
8# ETE is free software: you can redistribute it and/or modify it
9# under the terms of the GNU General Public License as published by
10# the Free Software Foundation, either version 3 of the License, or
11# (at your option) any later version.
12#
13# ETE is distributed in the hope that it will be useful, but WITHOUT
14# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
16# License for more details.
17#
18# You should have received a copy of the GNU General Public License
19# along with ETE.  If not, see <http://www.gnu.org/licenses/>.
20#
21#
22#                     ABOUT THE ETE PACKAGE
23#                     =====================
24#
25# ETE is distributed under the GPL copyleft license (2008-2015).
26#
27# If you make use of ETE in published work, please cite:
28#
29# Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon.
30# ETE: a python Environment for Tree Exploration. Jaime BMC
31# Bioinformatics 2010,:24doi:10.1186/1471-2105-11-24
32#
33# Note that extra references to the specific methods implemented in
34# the toolkit may be available in the documentation.
35#
36# More info at http://etetoolkit.org. Contact: huerta@embl.de
37#
38#
39# #END_LICENSE#############################################################
40
41from .evolevents import EvolEvent
42
43__all__ = ["get_evol_events_from_leaf", "get_evol_events_from_root"]
44
45def get_evol_events_from_leaf(node, sos_thr=0.0):
46    """ Returns a list of duplication and speciation events in
47    which the current node has been involved. Scanned nodes are
48    also labeled internally as dup=True|False. You can access this
49    labels using the 'node.dup' sintaxis.
50
51    Method: the algorithm scans all nodes from the given leafName to
52    the root. Nodes are assumed to be duplications when a species
53    overlap is found between its child linages. Method is described
54    more detail in:
55
56    "The Human Phylome." Huerta-Cepas J, Dopazo H, Dopazo J, Gabaldon
57    T. Genome Biol. 2007;8(6):R109.
58    """
59    # Get the tree's root
60    root = node.get_tree_root()
61
62    # Checks that is actually rooted
63    outgroups = root.get_children()
64    if len(outgroups) != 2:
65        raise TypeError("Tree is not rooted")
66
67    # Cautch the smaller outgroup (will be stored as the tree
68    # outgroup)
69    o1 = set([n.name for n in outgroups[0].get_leaves()])
70    o2 = set([n.name for n in outgroups[1].get_leaves()])
71
72    if len(o2)<len(o1):
73        smaller_outg = outgroups[1]
74    else:
75        smaller_outg = outgroups[0]
76
77
78    # Prepare to browse tree from leaf to root
79    all_events = []
80    current  = node
81    ref_spcs = node.species
82    sister_leaves  = set([])
83    browsed_spcs   = set([current.species])
84    browsed_leaves = set([current])
85    # get family Size
86    fSize =  len([n for n in root.get_leaves() if n.species == ref_spcs])
87
88    # Clean previous analysis
89    for n in root.get_descendants()+[root]:
90        n.del_feature("evoltype")
91
92    while current.up:
93        # distances control (0.0 distance check)
94        d = 0
95        for s in current.get_sisters():
96            for leaf in s.get_leaves():
97                d += current.get_distance(leaf)
98                sister_leaves.add(leaf)
99        # Process sister node only if there is any new sequence.
100        # (previene dupliaciones por nombres repetidos)
101        sister_leaves = sister_leaves.difference(browsed_leaves)
102        if len(sister_leaves)==0:
103            current = current.up
104            continue
105        # Gets species at both sides of event
106        sister_spcs     = set([n.species for n in sister_leaves])
107        overlaped_spces = browsed_spcs & sister_spcs
108        all_spcs        = browsed_spcs | sister_spcs
109        score = float(len(overlaped_spces))/len(all_spcs)
110        # Creates a new evolEvent
111        event = EvolEvent()
112        event.fam_size   = fSize
113        event.seed      = node.name
114        # event.e_newick  = current.up.get_newick()  # high mem usage!!
115        event.sos = score
116        event.outgroup  = smaller_outg.name
117        # event.allseqs   = set(current.up.get_leaf_names())
118        event.in_seqs = set([n.name for n in browsed_leaves])
119        event.out_seqs = set([n.name for n in sister_leaves])
120        event.inparalogs  = set([n.name for n in browsed_leaves if n.species == ref_spcs])
121
122        # If species overlap: duplication
123        if score > sos_thr:# and d > 0.0: Removed branch control.
124            event.node = current.up
125            event.etype = "D"
126            event.outparalogs = set([n.name for n in sister_leaves  if n.species == ref_spcs])
127            event.orthologs   = set([])
128            current.up.add_feature("evoltype","D")
129            all_events.append(event)
130
131        # If NO species overlap: speciation
132        elif score <= sos_thr:
133            event.node = current.up
134            event.etype = "S"
135            event.orthologs = set([n.name for n in sister_leaves if n.species != ref_spcs])
136            event.outparalogs = set([])
137            current.up.add_feature("evoltype","S")
138            all_events.append(event)
139
140        # Updates browsed species
141        browsed_spcs   |= sister_spcs
142        browsed_leaves |= sister_leaves
143        sister_leaves  = set([])
144        # And keep ascending
145        current = current.up
146    return all_events
147
148def get_evol_events_from_root(node, sos_thr):
149    """ Returns a list of **all** duplication and speciation
150    events detected after this node. Nodes are assumed to be
151    duplications when a species overlap is found between its child
152    linages. Method is described more detail in:
153
154    "The Human Phylome." Huerta-Cepas J, Dopazo H, Dopazo J, Gabaldon
155    T. Genome Biol. 2007;8(6):R109.
156    """
157
158    # Get the tree's root
159    root = node.get_tree_root()
160
161    # Checks that is actually rooted
162    outgroups = root.get_children()
163    if len(outgroups) != 2:
164        raise TypeError("Tree is not rooted")
165
166    # Cautch the smaller outgroup (will be stored as the tree outgroup)
167    o1 = set([n.name for n in outgroups[0].get_leaves()])
168    o2 = set([n.name for n in outgroups[1].get_leaves()])
169
170
171    if len(o2)<len(o1):
172        smaller_outg = outgroups[1]
173    else:
174        smaller_outg = outgroups[0]
175
176    # Get family size
177    fSize = len( [n for n in root.get_leaves()] )
178
179    # Clean data from previous analyses
180    for n in root.get_descendants()+[root]:
181        n.del_feature("evoltype")
182
183    # Gets Prepared to browse the tree from root to leaves
184    to_visit = []
185    current = root
186    all_events = []
187    while current:
188        # Gets childs and appends them to the To_visit list
189        childs = current.get_children()
190        to_visit += childs
191        if len(childs)>2:
192            raise TypeError("nodes are expected to have two childs.")
193        elif len(childs)==0:
194            pass # leaf
195        else:
196            # Get leaves and species at both sides of event
197            sideA_leaves= set([n for n in childs[0].get_leaves()])
198            sideB_leaves= set([n for n in childs[1].get_leaves()])
199            sideA_spcs  = set([n.species for n in childs[0].get_leaves()])
200            sideB_spcs  = set([n.species for n in childs[1].get_leaves()])
201            # Calculates species overlap
202            overlaped_spcs = sideA_spcs & sideB_spcs
203            all_spcs       = sideA_spcs | sideB_spcs
204            score = float(len(overlaped_spcs))/len(all_spcs)
205
206            # Creates a new evolEvent
207            event = EvolEvent()
208            event.fam_size   = fSize
209            event.branch_supports = [current.support, current.children[0].support, current.children[1].support]
210            # event.seed      = leafName
211            # event.e_newick  = current.up.get_newick()  # high mem usage!!
212            event.sos = score
213            event.outgroup_spcs  = smaller_outg.get_species()
214            event.in_seqs = set([n.name for n in sideA_leaves])
215            event.out_seqs = set([n.name for n in sideB_leaves])
216            event.inparalogs  = set([n.name for n in sideA_leaves])
217            # If species overlap: duplication
218            if score >sos_thr:
219                event.node = current
220                event.etype = "D"
221                event.outparalogs = set([n.name for n in sideB_leaves])
222                event.orthologs   = set([])
223                current.add_feature("evoltype","D")
224            # If NO species overlap: speciation
225            else:
226                event.node = current
227                event.etype = "S"
228                event.orthologs = set([n.name for n in sideB_leaves])
229                event.outparalogs = set([])
230                current.add_feature("evoltype","S")
231
232            all_events.append(event)
233        # Keep visiting nodes
234        try:
235            current = to_visit.pop(0)
236        except IndexError:
237            current = None
238    return all_events
239