1# #START_LICENSE###########################################################
2#
3#
4# This file is part of the Environment for Tree Exploration program
5# (ETE).  http://etetoolkit.org
6#
7# ETE is free software: you can redistribute it and/or modify it
8# under the terms of the GNU General Public License as published by
9# the Free Software Foundation, either version 3 of the License, or
10# (at your option) any later version.
11#
12# ETE is distributed in the hope that it will be useful, but WITHOUT
13# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
15# License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with ETE.  If not, see <http://www.gnu.org/licenses/>.
19#
20#
21#                     ABOUT THE ETE PACKAGE
22#                     =====================
23#
24# ETE is distributed under the GPL copyleft license (2008-2015).
25#
26# If you make use of ETE in published work, please cite:
27#
28# Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon.
# ETE: a python Environment for Tree Exploration. BMC
# Bioinformatics 2010, 11:24. doi:10.1186/1471-2105-11-24
31#
32# Note that extra references to the specific methods implemented in
33# the toolkit may be available in the documentation.
34#
35# More info at http://etetoolkit.org. Contact: huerta@embl.de
36#
37#
38# #END_LICENSE#############################################################
39from __future__ import absolute_import
40from __future__ import print_function
41
42from sys import stderr
43from . import clustvalidation
44from ..coretype.tree import _translate_nodes
45from .. import TreeNode, ArrayTable
46from .. import numpy
47from six.moves import range
48
# Public names exported by this module: the node class and its
# tree-flavoured alias (both names are bound at module level below).
__all__ = ["ClusterNode", "ClusterTree"]
50
class ClusterNode(TreeNode):
    """ Creates a new Cluster Tree object, which is a collection
    of ClusterNode instances connected in a hierarchical way, and
    representing a clustering result.

    A newick file or string can be passed as the first argument. An
    ArrayTable file or instance can be passed as a second argument.

    Examples:
      t1 = ClusterTree() # creates an empty tree
      t2 = ClusterTree( '(A:1,(B:1,(C:1,D:1):0.5):0.5);' )
      t3 = ClusterTree( '/home/user/myNewickFile.txt' )
    """

    def _set_forbidden(self, value):
        """ Shared setter for read-only properties: always raises
        ValueError. """
        raise ValueError("This attribute can not be manually set.")

    def _get_intra(self):
        """ Returns the intracluster distance, computing the silhouette
        values first if they have not been calculated yet. """
        if self._silhouette is None:
            self.get_silhouette()
        return self._intracluster_dist

    def _get_inter(self):
        """ Returns the intercluster distance, computing the silhouette
        values first if they have not been calculated yet. """
        if self._silhouette is None:
            self.get_silhouette()
        return self._intercluster_dist

    def _get_silh(self):
        """ Returns the silhouette value, computing it on first access. """
        if self._silhouette is None:
            self.get_silhouette()
        return self._silhouette

    def _get_prof(self):
        """ Returns the node profile, computing the average profile
        first if none has been assigned yet. """
        if self._profile is None:
            self._calculate_avg_profile()
        return self._profile

    def _get_std(self):
        """ Returns the deviation profile, computing it (together with
        the average profile) on first access. """
        if self._std_profile is None:
            self._calculate_avg_profile()
        return self._std_profile

    def _set_profile(self, value):
        """ Manually sets the profile vector associated to this node. """
        self._profile = value

    # Lazily computed cluster features. Only "profile" accepts manual
    # assignment; the rest raise ValueError when set.
    intracluster_dist = property(fget=_get_intra, fset=_set_forbidden)
    intercluster_dist = property(fget=_get_inter, fset=_set_forbidden)
    silhouette = property(fget=_get_silh, fset=_set_forbidden)
    profile = property(fget=_get_prof, fset=_set_profile)
    deviation = property(fget=_get_std, fset=_set_forbidden)

    def __init__(self, newick=None, text_array=None,
                 fdist=clustvalidation.default_dist):
        """ Builds a cluster node.

        ARGUMENTS:

          newick: newick string or file, handed over to TreeNode.

          text_array: ArrayTable instance (or anything accepted by the
          ArrayTable constructor) to be linked to the tree.

          fdist: distance function used for cluster validation values.
        """
        # Default dist is spearman_dist when scipy module is loaded
        # otherwise, it is set to euclidean_dist.

        # Initialize basic tree features and loads the newick (if any)
        TreeNode.__init__(self, newick)

        # The distance function is always stored, so nodes created
        # without a newick do not silently discard the fdist argument.
        self._fdist = fdist
        self._silhouette = None
        self._intercluster_dist = None
        self._intracluster_dist = None
        self._profile = None
        self._std_profile = None

        # Cluster specific features
        self.features.add("intercluster_dist")
        self.features.add("intracluster_dist")
        self.features.add("silhouette")
        self.features.add("profile")
        self.features.add("deviation")

        # Initialize tree with array data
        if text_array:
            self.link_to_arraytable(text_array)

        # Propagate the distance function to every node loaded from the
        # newick.
        if newick:
            self.set_distance_function(fdist)

    def __repr__(self):
        return "ClusterTree node (%s)" % hex(self.__hash__())

    def set_distance_function(self, fn):
        """ Sets the distance function used to calculate cluster
        distances and silhouette index. The function is assigned to
        every node traversed from this one, and their cached
        silhouette and inter/intra-cluster distances are discarded.

        ARGUMENTS:

          fn: a pointer to python function accepting two arrays (numpy) as
          arguments.

        EXAMPLE:

          # A simple absolute-difference distance
          my_dist_fn = lambda x,y: abs(x-y)
          tree.set_distance_function(my_dist_fn)

          """
        for node in self.traverse():
            node._fdist = fn
            node._silhouette = None
            node._intercluster_dist = None
            node._intracluster_dist = None

    def link_to_arraytable(self, arraytbl):
        """ Allows to link a given arraytable object to the tree
        structure under this node. Row names in the arraytable object
        are expected to match leaf names.

        ARGUMENTS:

          arraytbl: an ArrayTable instance, or any value accepted by
          the ArrayTable constructor.

        Returns the list of leaf nodes for which profiles could not be
        found in the arraytable. Such leaves receive an all-NaN profile
        and a warning with their count is printed to stderr.

        """

        # Initialize tree with array data
        # isinstance (rather than an exact type comparison) lets
        # ArrayTable subclasses be linked directly.
        if isinstance(arraytbl, ArrayTable):
            array = arraytbl
        else:
            array = ArrayTable(arraytbl)

        # Global range of the finite matrix values, stored on the array.
        matrix_values = [value for row in array.matrix
                         for value in row if numpy.isfinite(value)]
        array._matrix_min = min(matrix_values)
        array._matrix_max = max(matrix_values)

        missing_leaves = []
        for node in self.traverse():
            node.arraytable = array
            if not node.is_leaf():
                continue
            if node.name in array.rowNames:
                node._profile = array.get_row_vector(node.name)
            else:
                # Unmapped leaves get a placeholder profile of NaNs with
                # one entry per column.
                node._profile = [numpy.nan] * len(array.colNames)
                missing_leaves.append(node)

        if missing_leaves:
            print("""[%d] leaf names could not be mapped to the matrix rows.""" %
                  len(missing_leaves), file=stderr)

        self.arraytable = array
        return missing_leaves

    def iter_leaf_profiles(self):
        """ Returns an iterator over all the profiles associated to
        the leaves under this node."""
        # NOTE(review): this used to call l.get_profile()[0], but no
        # get_profile method is defined in this class; the "profile"
        # property is the accessor available here. Confirm that
        # TreeNode does not provide get_profile.
        for leaf in self.iter_leaves():
            yield leaf.profile

    def get_leaf_profiles(self):
        """ Returns the list of all the profiles associated to the
        leaves under this node."""
        # Uses the "profile" property for the same reason given in
        # iter_leaf_profiles.
        return [leaf.profile for leaf in self.iter_leaves()]

    def get_silhouette(self, fdist=None):
        """ Calculates the node's silhouette value by using a given
        distance function. When no function is given, the one
        associated to this node is used. It also calculates the
        inter/intra-cluster distances.

        It updates the following attributes of the analyzed node:
           - node.intracluster_dist
           - node.intercluster_dist
           - node.silhouette

        and returns them as a (silhouette, intracluster_dist,
        intercluster_dist) tuple.

        intracluster distances a(i) are calculated as the Centroid
        Diameter

        intercluster distances b(i) are calculated as the Centroid linkage distance

        ** Rousseeuw, P.J. (1987) Silhouettes: A graphical aid to the
        interpretation and validation of cluster analysis.
        J. Comput. Appl. Math., 20, 53-65.

        """
        if fdist is None:
            fdist = self._fdist

        # Updates internal values
        self._silhouette, self._intracluster_dist, self._intercluster_dist = \
            clustvalidation.get_silhouette_width(fdist, self)
        # And returns them
        return self._silhouette, self._intracluster_dist, self._intercluster_dist

    def get_dunn(self, clusters, fdist=None):
        """ Calculates the Dunn index for the given set of descendant
        nodes.

        ARGUMENTS:

          clusters: iterable with the cluster nodes to evaluate. Each
          item is resolved through _translate_nodes.

          fdist: distance function. When None, the one associated to
          this node is used.
        """

        if fdist is None:
            fdist = self._fdist
        nodes = _translate_nodes(self, *clusters)
        return clustvalidation.get_dunn_index(fdist, *nodes)

    def _calculate_avg_profile(self):
        """ This internal function updates the mean profile and the
        deviation profile associated to this node. """

        # Updates internal values
        self._profile, self._std_profile = clustvalidation.get_avg_profile(self)
252
# Cosmetic alias: ClusterTree is bound to the very same class object as
# ClusterNode, so either name can be used to build a tree.
#: .. currentmodule:: ete3
#
ClusterTree = ClusterNode
257