# #START_LICENSE###########################################################
#
#
# This file is part of the Environment for Tree Exploration program
# (ETE).  http://etetoolkit.org
#
# ETE is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# ETE is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
# License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ETE.  If not, see <http://www.gnu.org/licenses/>.
#
#
#                     ABOUT THE ETE PACKAGE
#                     =====================
#
# ETE is distributed under the GPL copyleft license (2008-2015).
#
# If you make use of ETE in published work, please cite:
#
# Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon.
# ETE: a python Environment for Tree Exploration. Jaime BMC
# Bioinformatics 2010,:24doi:10.1186/1471-2105-11-24
#
# Note that extra references to the specific methods implemented in
# the toolkit may be available in the documentation.
#
# More info at http://etetoolkit.org. Contact: huerta@embl.de
#
#
# #END_LICENSE#############################################################
from __future__ import absolute_import
from __future__ import print_function

from sys import stderr
from . import clustvalidation
from ..coretype.tree import _translate_nodes
from .. import TreeNode, ArrayTable
from .. import numpy
from six.moves import range

__all__ = ["ClusterNode", "ClusterTree"]


class ClusterNode(TreeNode):
    """ Creates a new Cluster Tree object, which is a collection
    of ClusterNode instances connected in a hierarchical way, and
    representing a clustering result.

    A newick file or string can be passed as the first argument. An
    ArrayTable file or instance can be passed as a second argument.

    Examples:
        t1 = ClusterTree() # creates an empty tree
        t2 = ClusterTree( '(A:1,(B:1,(C:1,D:1):0.5):0.5);' )
        t3 = ClusterTree( '/home/user/myNewickFile.txt' )
    """

    def _set_forbidden(self, value):
        """ Setter used by read-only properties: always raises
        ValueError. """
        raise ValueError("This attribute can not be manually set.")

    # The three silhouette-related getters share one cache: all values
    # are filled in together by get_silhouette(), so "_silhouette is
    # None" is used as the single "not computed yet" marker.

    def _get_intra(self):
        """ Returns the intracluster distance, computing the
        silhouette values first if they are not cached. """
        if self._silhouette is None:
            self.get_silhouette()
        return self._intracluster_dist

    def _get_inter(self):
        """ Returns the intercluster distance, computing the
        silhouette values first if they are not cached. """
        if self._silhouette is None:
            self.get_silhouette()
        return self._intercluster_dist

    def _get_silh(self):
        """ Returns the silhouette value, computing it first if it
        is not cached. """
        if self._silhouette is None:
            self.get_silhouette()
        return self._silhouette

    def _get_prof(self):
        """ Returns the node profile, computing the average profile
        first if none is stored. """
        if self._profile is None:
            self._calculate_avg_profile()
        return self._profile

    def _get_std(self):
        """ Returns the deviation profile, computing the average
        profile first if none is stored. """
        if self._std_profile is None:
            self._calculate_avg_profile()
        return self._std_profile

    def _set_profile(self, value):
        """ Stores `value` as the node profile. """
        self._profile = value

    intracluster_dist = property(fget=_get_intra, fset=_set_forbidden)
    intercluster_dist = property(fget=_get_inter, fset=_set_forbidden)
    silhouette = property(fget=_get_silh, fset=_set_forbidden)
    profile = property(fget=_get_prof, fset=_set_profile)
    deviation = property(fget=_get_std, fset=_set_forbidden)

    def __init__(self, newick=None, text_array=None,
                 fdist=clustvalidation.default_dist):
        """ Builds a cluster node.

        ARGUMENTS:

          newick: a newick file or string (optional).

          text_array: an ArrayTable instance, or anything accepted by
            the ArrayTable constructor (optional). When given, it is
            linked to the tree through link_to_arraytable().

          fdist: distance function applied to every node under this
            one. It is only propagated when a newick is provided.
        """
        # Default dist is spearman_dist when scipy module is loaded
        # otherwise, it is set to euclidean_dist.

        # Initialize basic tree features and loads the newick (if any)
        TreeNode.__init__(self, newick)
        self._fdist = None
        self._silhouette = None
        self._intercluster_dist = None
        self._intracluster_dist = None
        self._profile = None
        self._std_profile = None

        # Cluster specific features
        self.features.add("intercluster_dist")
        self.features.add("intracluster_dist")
        self.features.add("silhouette")
        self.features.add("profile")
        self.features.add("deviation")

        # Initialize tree with array data
        if text_array:
            self.link_to_arraytable(text_array)

        if newick:
            self.set_distance_function(fdist)

    def __repr__(self):
        return "ClusterTree node (%s)" % hex(self.__hash__())

    def set_distance_function(self, fn):
        """ Sets the distance function used to calculate cluster
        distances and silhouette index.

        Cached silhouette and inter/intra-cluster distances of every
        node under this one are discarded, so they are recomputed
        with the new function on next access.

        ARGUMENTS:

          fn: a python function accepting two arrays (numpy) as
            arguments.

        EXAMPLE:

          # A simple custom distance
          my_dist_fn = lambda x,y: abs(x-y)
          tree.set_distance_function(my_dist_fn)

        """
        for n in self.traverse():
            n._fdist = fn
            n._silhouette = None
            n._intercluster_dist = None
            n._intracluster_dist = None

    def link_to_arraytable(self, arraytbl):
        """ Allows to link a given arraytable object to the tree
        structure under this node. Row names in the arraytable object
        are expected to match leaf names.

        ARGUMENTS:

          arraytbl: an ArrayTable instance (or subclass instance), or
            any value accepted by the ArrayTable constructor.

        Returns the list of leaf nodes for which no profile could be
        found in the arraytable. Such leaves get an all-NaN profile.

        """
        # isinstance (rather than an exact type comparison) so that
        # ArrayTable subclasses are used as they are instead of being
        # passed again through the ArrayTable constructor.
        if isinstance(arraytbl, ArrayTable):
            array = arraytbl
        else:
            array = ArrayTable(arraytbl)

        # Global range of the matrix, ignoring NaN/inf cells.
        matrix_values = [value for row in array.matrix
                         for value in row if numpy.isfinite(value)]
        array._matrix_min = min(matrix_values)
        array._matrix_max = max(matrix_values)

        missing_leaves = []
        for n in self.traverse():
            n.arraytable = array
            if not n.is_leaf():
                continue
            if n.name in array.rowNames:
                n._profile = array.get_row_vector(n.name)
            else:
                n._profile = [numpy.nan] * len(array.colNames)
                missing_leaves.append(n)

        if missing_leaves:
            print("""[%d] leaf names could not be mapped to the matrix rows.""" %
                  len(missing_leaves), file=stderr)

        self.arraytable = array
        return missing_leaves

    def iter_leaf_profiles(self):
        """ Returns an iterator over all the profiles associated to
        the leaves under this node."""
        # Profiles are exposed through the `profile` property; this
        # class defines no get_profile() method.
        for leaf in self.iter_leaves():
            yield leaf.profile

    def get_leaf_profiles(self):
        """ Returns the list of all the profiles associated to the
        leaves under this node."""
        # Profiles are exposed through the `profile` property; this
        # class defines no get_profile() method.
        return [leaf.profile for leaf in self.iter_leaves()]

    def get_silhouette(self, fdist=None):
        """ Calculates the node's silhouette value by using a given
        distance function. When `fdist` is None, the distance
        function previously assigned to this node is used (see
        set_distance_function). It also calculates the
        inter/intra-cluster distances.

        It updates the following features of the analyzed node:
           - node.intracluster_dist
           - node.intercluster_dist
           - node.silhouette

        and returns them as a (silhouette, intracluster_dist,
        intercluster_dist) tuple.

        intracluster distances a(i) are calculated as the Centroid
        Diameter

        intercluster distances b(i) are calculated as the Centroid
        linkage distance

        ** Rousseeuw, P.J. (1987) Silhouettes: A graphical aid to the
        interpretation and validation of cluster analysis.
        J. Comput. Appl. Math., 20, 53-65.

        """
        if fdist is None:
            fdist = self._fdist

        # Updates internal values
        self._silhouette, self._intracluster_dist, self._intercluster_dist = \
            clustvalidation.get_silhouette_width(fdist, self)
        # And returns them
        return self._silhouette, self._intracluster_dist, self._intercluster_dist

    def get_dunn(self, clusters, fdist=None):
        """ Calculates the Dunn index for the given set of descendant
        nodes.

        ARGUMENTS:

          clusters: the descendant nodes to evaluate; they are
            resolved through _translate_nodes() before use.

          fdist: distance function. When None, the distance function
            previously assigned to this node is used.
        """
        if fdist is None:
            fdist = self._fdist
        nodes = _translate_nodes(self, *clusters)
        return clustvalidation.get_dunn_index(fdist, *nodes)

    def _calculate_avg_profile(self):
        """ This internal function updates the mean profile
        associated to an internal node. """

        # Updates internal values
        self._profile, self._std_profile = clustvalidation.get_avg_profile(self)


# cosmetic alias
#: .. currentmodule:: ete3
#
ClusterTree = ClusterNode