1# This code is part of the Biopython distribution and governed by its
2# license.  Please see the LICENSE file that should have been included
3# as part of this package.
4#
5"""Cluster Analysis.
6
7The Bio.Cluster provides commonly used clustering algorithms and was
8designed with the application to gene expression data in mind. However,
9this module can also be used for cluster analysis of other types of data.
10
11Bio.Cluster and the underlying C Clustering Library is described in
12M. de Hoon et al. (2004) https://doi.org/10.1093/bioinformatics/bth078
13"""
14
15import numbers
16
17try:
18    import numpy
19except ImportError:
20    from Bio import MissingPythonDependencyError
21
22    raise MissingPythonDependencyError(
23        "Please install numpy if you want to use Bio.Cluster. "
24        "See http://www.numpy.org/"
25    ) from None
26
27from . import _cluster
28
29__all__ = (
30    "Node",
31    "Tree",
32    "kcluster",
33    "kmedoids",
34    "treecluster",
35    "somcluster",
36    "clusterdistance",
37    "clustercentroids",
38    "distancematrix",
39    "pca",
40    "Record",
41    "read",
42)
43
44
45__version__ = _cluster.version()
46
47
48class Node(_cluster.Node):
49    """Element of a hierarchical clustering tree.
50
51    A node contains items or other Nodes(sub-nodes).
52    """
53
54    __doc__ = _cluster.Node.__doc__
55
56
57class Tree(_cluster.Tree):
58    """Hierarchical clustering tree.
59
60    A Tree consists of Nodes.
61    """
62
63    def sort(self, order=None):
64        """Sort the hierarchical clustering tree.
65
66        Sort the hierarchical clustering tree by switching the left and
67        right subnode of nodes such that the elements in the left-to-right
68        order of the tree tend to have increasing order values.
69
70        Return the indices of the elements in the left-to-right order in
71        the hierarchical clustering tree, such that the element with index
72        indices[i] occurs at position i in the dendrogram.
73
74        """
75        n = len(self) + 1
76        indices = numpy.ones(n, dtype="intc")
77        if order is None:
78            order = numpy.ones(n, dtype="d")
79        elif isinstance(order, numpy.ndarray):
80            order = numpy.require(order, dtype="d", requirements="C")
81        else:
82            order = numpy.array(order, dtype="d")
83        _cluster.Tree.sort(self, indices, order)
84        return indices
85
86    def cut(self, nclusters=None):
87        """Create clusters by cutting the hierarchical clustering tree.
88
89        Divide the elements in a hierarchical clustering result mytree
90        into clusters, and return an array with the number of the cluster
91        to which each element was assigned.
92
93        Keyword arguments:
94         - nclusters: The desired number of clusters.
95        """
96        n = len(self) + 1
97        indices = numpy.ones(n, dtype="intc")
98        if nclusters is None:
99            nclusters = n
100        _cluster.Tree.cut(self, indices, nclusters)
101        return indices
102
103
104def kcluster(
105    data,
106    nclusters=2,
107    mask=None,
108    weight=None,
109    transpose=False,
110    npass=1,
111    method="a",
112    dist="e",
113    initialid=None,
114):
115    """Perform k-means clustering.
116
117    This function performs k-means clustering on the values in data, and
118    returns the cluster assignments, the within-cluster sum of distances
119    of the optimal k-means clustering solution, and the number of times
120    the optimal solution was found.
121
122    Keyword arguments:
123     - data: nrows x ncolumns array containing the data values.
124     - nclusters: number of clusters (the 'k' in k-means).
125     - mask: nrows x ncolumns array of integers, showing which data
126       are missing. If mask[i,j]==0, then data[i,j] is missing.
127     - weight: the weights to be used when calculating distances
128     - transpose:
129       - if False: rows are clustered;
130       - if True: columns are clustered.
131     - npass: number of times the k-means clustering algorithm is
132       performed, each time with a different (random) initial
133       condition.
134     - method: specifies how the center of a cluster is found:
135       - method == 'a': arithmetic mean;
136       - method == 'm': median.
137     - dist: specifies the distance function to be used:
138       - dist == 'e': Euclidean distance;
139       - dist == 'b': City Block distance;
140       - dist == 'c': Pearson correlation;
141       - dist == 'a': absolute value of the correlation;
142       - dist == 'u': uncentered correlation;
143       - dist == 'x': absolute uncentered correlation;
144       - dist == 's': Spearman's rank correlation;
145       - dist == 'k': Kendall's tau.
146     - initialid: the initial clustering from which the algorithm
147       should start.
148       If initialid is None, the routine carries out npass
149       repetitions of the EM algorithm, each time starting from a
150       different random initial clustering. If initialid is given,
151       the routine carries out the EM algorithm only once, starting
152       from the given initial clustering and without randomizing the
153       order in which items are assigned to clusters (i.e., using
154       the same order as in the data matrix). In that case, the
155       k-means algorithm is fully deterministic.
156
157    Return values:
158     - clusterid: array containing the number of the cluster to which each
159       item was assigned in the best k-means clustering solution that was
160       found in the npass runs;
161     - error: the within-cluster sum of distances for the returned k-means
162       clustering solution;
163     - nfound: the number of times this solution was found.
164    """
165    data = __check_data(data)
166    shape = data.shape
167    if transpose:
168        ndata, nitems = shape
169    else:
170        nitems, ndata = shape
171    mask = __check_mask(mask, shape)
172    weight = __check_weight(weight, ndata)
173    clusterid, npass = __check_initialid(initialid, npass, nitems)
174    error, nfound = _cluster.kcluster(
175        data, nclusters, mask, weight, transpose, npass, method, dist, clusterid
176    )
177    return clusterid, error, nfound
178
179
180def kmedoids(distance, nclusters=2, npass=1, initialid=None):
181    """Perform k-medoids clustering.
182
183    This function performs k-medoids clustering, and returns the cluster
184    assignments, the within-cluster sum of distances of the optimal
185    k-medoids clustering solution, and the number of times the optimal
186    solution was found.
187
188    Keyword arguments:
189     - distance: The distance matrix between the items. There are three
190       ways in which you can pass a distance matrix:
191       1. a 2D Numerical Python array (in which only the left-lower
192       part of the array will be accessed);
193       2. a 1D Numerical Python array containing the distances
194       consecutively;
195       3. a list of rows containing the lower-triangular part of
196       the distance matrix.
197
198       Examples are:
199
200           >>> from numpy import array
201           >>> # option 1:
202           >>> distance = array([[0.0, 1.1, 2.3],
203           ...                   [1.1, 0.0, 4.5],
204           ...                   [2.3, 4.5, 0.0]])
205           >>> # option 2:
206           >>> distance = array([1.1, 2.3, 4.5])
207           >>> # option 3:
208           >>> distance = [array([]),
209           ...             array([1.1]),
210           ...             array([2.3, 4.5])]
211
212
213       These three correspond to the same distance matrix.
214     - nclusters: number of clusters (the 'k' in k-medoids)
215     - npass: the number of times the k-medoids clustering algorithm
216       is performed, each time with a different (random) initial
217       condition.
218     - initialid: the initial clustering from which the algorithm should start.
219       If initialid is not given, the routine carries out npass
220       repetitions of the EM algorithm, each time starting from a
221       different random initial clustering. If initialid is given,
222       the routine carries out the EM algorithm only once, starting
223       from the initial clustering specified by initialid and
224       without randomizing the order in which items are assigned to
225       clusters (i.e., using the same order as in the data matrix).
226       In that case, the k-medoids algorithm is fully deterministic.
227
228    Return values:
229     - clusterid: array containing the number of the cluster to which each
230       item was assigned in the best k-means clustering solution that was
231       found in the npass runs;
232     - error: the within-cluster sum of distances for the returned k-means
233       clustering solution;
234     - nfound: the number of times this solution was found.
235    """
236    distance = __check_distancematrix(distance)
237    nitems = len(distance)
238    clusterid, npass = __check_initialid(initialid, npass, nitems)
239    error, nfound = _cluster.kmedoids(distance, nclusters, npass, clusterid)
240    return clusterid, error, nfound
241
242
243def treecluster(
244    data,
245    mask=None,
246    weight=None,
247    transpose=False,
248    method="m",
249    dist="e",
250    distancematrix=None,
251):
252    """Perform hierarchical clustering, and return a Tree object.
253
254    This function implements the pairwise single, complete, centroid, and
255    average linkage hierarchical clustering methods.
256
257    Keyword arguments:
258     - data: nrows x ncolumns array containing the data values.
259     - mask: nrows x ncolumns array of integers, showing which data are
260       missing. If mask[i][j]==0, then data[i][j] is missing.
261     - weight: the weights to be used when calculating distances.
262     - transpose:
263       - if False, rows are clustered;
264       - if True, columns are clustered.
265     - dist: specifies the distance function to be used:
266       - dist == 'e': Euclidean distance
267       - dist == 'b': City Block distance
268       - dist == 'c': Pearson correlation
269       - dist == 'a': absolute value of the correlation
270       - dist == 'u': uncentered correlation
271       - dist == 'x': absolute uncentered correlation
272       - dist == 's': Spearman's rank correlation
273       - dist == 'k': Kendall's tau
274     - method: specifies which linkage method is used:
275       - method == 's': Single pairwise linkage
276       - method == 'm': Complete (maximum) pairwise linkage (default)
277       - method == 'c': Centroid linkage
278       - method == 'a': Average pairwise linkage
279     - distancematrix:  The distance matrix between the items. There are
280       three ways in which you can pass a distance matrix:
281       1. a 2D Numerical Python array (in which only the left-lower
282       part of the array will be accessed);
283       2. a 1D Numerical Python array containing the distances
284       consecutively;
285       3. a list of rows containing the lower-triangular part of
286       the distance matrix.
287
288       Examples are:
289
290           >>> from numpy import array
291           >>> # option 1:
292           >>> distance = array([[0.0, 1.1, 2.3],
293           ...                   [1.1, 0.0, 4.5],
294           ...                   [2.3, 4.5, 0.0]])
295           >>> # option 2:
296           >>> distance = array([1.1, 2.3, 4.5])
297           >>> # option 3:
298           >>> distance = [array([]),
299           ...             array([1.1]),
300           ...             array([2.3, 4.5])]
301
302       These three correspond to the same distance matrix.
303
304       PLEASE NOTE:
305       As the treecluster routine may shuffle the values in the
306       distance matrix as part of the clustering algorithm, be sure
307       to save this array in a different variable before calling
308       treecluster if you need it later.
309
310    Either data or distancematrix should be None. If distancematrix is None,
311    the hierarchical clustering solution is calculated from the values stored
312    in the argument data. If data is None, the hierarchical clustering solution
313    is instead calculated from the distance matrix. Pairwise centroid-linkage
314    clustering can be performed only from the data values and not from the
315    distance matrix. Pairwise single-, maximum-, and average-linkage clustering
316    can be calculated from the data values or from the distance matrix.
317
318    Return value:
319    treecluster returns a Tree object describing the hierarchical clustering
320    result. See the description of the Tree class for more information.
321    """
322    if data is None and distancematrix is None:
323        raise ValueError("use either data or distancematrix")
324    if data is not None and distancematrix is not None:
325        raise ValueError("use either data or distancematrix; do not use both")
326    if data is not None:
327        data = __check_data(data)
328        shape = data.shape
329        ndata = shape[0] if transpose else shape[1]
330        mask = __check_mask(mask, shape)
331        weight = __check_weight(weight, ndata)
332    if distancematrix is not None:
333        distancematrix = __check_distancematrix(distancematrix)
334        if mask is not None:
335            raise ValueError("mask is ignored if distancematrix is used")
336        if weight is not None:
337            raise ValueError("weight is ignored if distancematrix is used")
338    tree = Tree()
339    _cluster.treecluster(
340        tree, data, mask, weight, transpose, method, dist, distancematrix
341    )
342    return tree
343
344
345def somcluster(
346    data,
347    mask=None,
348    weight=None,
349    transpose=False,
350    nxgrid=2,
351    nygrid=1,
352    inittau=0.02,
353    niter=1,
354    dist="e",
355):
356    """Calculate a Self-Organizing Map.
357
358    This function implements a Self-Organizing Map on a rectangular grid.
359
360    Keyword arguments:
361     - data: nrows x ncolumns array containing the data values;
362     - mask: nrows x ncolumns array of integers, showing which data are
363       missing. If mask[i][j]==0, then data[i][j] is missing.
364     - weight: the weights to be used when calculating distances
365     - transpose:
366       - if False: rows are clustered;
367       - if True: columns are clustered.
368     - nxgrid: the horizontal dimension of the rectangular SOM map
369     - nygrid: the vertical dimension of the rectangular SOM map
370     - inittau: the initial value of tau (the neighborbood function)
371     - niter: the number of iterations
372     - dist: specifies the distance function to be used:
373       - dist == 'e': Euclidean distance
374       - dist == 'b': City Block distance
375       - dist == 'c': Pearson correlation
376       - dist == 'a': absolute value of the correlation
377       - dist == 'u': uncentered correlation
378       - dist == 'x': absolute uncentered correlation
379       - dist == 's': Spearman's rank correlation
380       - dist == 'k': Kendall's tau
381
382    Return values:
383
384     - clusterid: array with two columns, with the number of rows equal to
385       the items that are being clustered. Each row in the array contains
386       the x and y coordinates of the cell in the rectangular SOM grid to
387       which the item was assigned.
388     - celldata:  an array with dimensions [nxgrid, nygrid, number of columns]
389       if rows are being clustered, or [nxgrid, nygrid, number of rows) if
390       columns are being clustered.
391       Each element [ix, iy] of this array is a 1D vector containing the
392       data values for the centroid of the cluster in the SOM grid cell
393       with coordinates [ix, iy].
394    """
395    if transpose:
396        ndata, nitems = data.shape
397    else:
398        nitems, ndata = data.shape
399    data = __check_data(data)
400    shape = data.shape
401    mask = __check_mask(mask, shape)
402    weight = __check_weight(weight, ndata)
403    if nxgrid < 1:
404        raise ValueError("nxgrid should be a positive integer (default is 2)")
405    if nygrid < 1:
406        raise ValueError("nygrid should be a positive integer (default is 1)")
407    clusterids = numpy.ones((nitems, 2), dtype="intc")
408    celldata = numpy.empty((nxgrid, nygrid, ndata), dtype="d")
409    _cluster.somcluster(
410        clusterids, celldata, data, mask, weight, transpose, inittau, niter, dist
411    )
412    return clusterids, celldata
413
414
415def clusterdistance(
416    data,
417    mask=None,
418    weight=None,
419    index1=None,
420    index2=None,
421    method="a",
422    dist="e",
423    transpose=False,
424):
425    """Calculate and return the distance between two clusters.
426
427    Keyword arguments:
428     - data: nrows x ncolumns array containing the data values.
429     - mask: nrows x ncolumns array of integers, showing which data are
430       missing. If mask[i, j]==0, then data[i, j] is missing.
431     - weight: the weights to be used when calculating distances
432     - index1: 1D array identifying which items belong to the
433       first cluster. If the cluster contains only one item, then
434       index1 can also be written as a single integer.
435     - index2: 1D array identifying which items belong to the
436       second cluster. If the cluster contains only one item, then
437       index2 can also be written as a single integer.
438     - dist: specifies the distance function to be used:
439       - dist == 'e': Euclidean distance
440       - dist == 'b': City Block distance
441       - dist == 'c': Pearson correlation
442       - dist == 'a': absolute value of the correlation
443       - dist == 'u': uncentered correlation
444       - dist == 'x': absolute uncentered correlation
445       - dist == 's': Spearman's rank correlation
446       - dist == 'k': Kendall's tau
447     - method: specifies how the distance between two clusters is defined:
448       - method == 'a': the distance between the arithmetic means
449       of the two clusters
450       - method == 'm': the distance between the medians of the two clusters
451       - method == 's': the smallest pairwise distance between members
452       of the two clusters
453       - method == 'x': the largest pairwise distance between members
454       of the two clusters
455       - method == 'v': average of the pairwise distances between members
456       of the two clusters
457     - transpose:
458       - if False: clusters of rows are considered;
459       - if True: clusters of columns are considered.
460    """
461    data = __check_data(data)
462    shape = data.shape
463    ndata = shape[0] if transpose else shape[1]
464    mask = __check_mask(mask, shape)
465    weight = __check_weight(weight, ndata)
466    index1 = __check_index(index1)
467    index2 = __check_index(index2)
468    return _cluster.clusterdistance(
469        data, mask, weight, index1, index2, method, dist, transpose
470    )
471
472
473def clustercentroids(data, mask=None, clusterid=None, method="a", transpose=False):
474    """Calculate and return the centroid of each cluster.
475
476    The clustercentroids routine calculates the cluster centroids, given to
477    which cluster each item belongs. The centroid is defined as either
478    the mean or the median over all items for each dimension.
479
480    Keyword arguments:
481     - data: nrows x ncolumns array containing the data values.
482     - mask: nrows x ncolumns array of integers, showing which data are
483       missing. If mask[i, j]==0, then data[i, j] is missing.
484     - clusterid: array containing the cluster number for each item.
485       The cluster number should be non-negative.
486     - method: specifies whether the centroid is calculated from the
487       arithmetic mean (method == 'a', default) or the median (method == 'm')
488       over each dimension.
489     - transpose: if False, each row contains the data for one item;
490                  if True, each column contains the data for one item.
491
492    Return values:
493     - cdata: 2D array containing the cluster centroids.
494       If transpose is False, then the dimensions of cdata are
495       nclusters x ncolumns.
496       If transpose is True, then the dimensions of cdata are
497       nrows x nclusters.
498     - cmask: 2D array of integers describing which items in cdata,
499       if any, are missing.
500    """
501    data = __check_data(data)
502    mask = __check_mask(mask, data.shape)
503    nrows, ncolumns = data.shape
504    if clusterid is None:
505        n = ncolumns if transpose else nrows
506        clusterid = numpy.zeros(n, dtype="intc")
507        nclusters = 1
508    else:
509        clusterid = numpy.require(clusterid, dtype="intc", requirements="C")
510        nclusters = max(clusterid + 1)
511    if transpose:
512        shape = (nrows, nclusters)
513    else:
514        shape = (nclusters, ncolumns)
515    cdata = numpy.zeros(shape, dtype="d")
516    cmask = numpy.zeros(shape, dtype="intc")
517    _cluster.clustercentroids(data, mask, clusterid, method, transpose, cdata, cmask)
518    return cdata, cmask
519
520
521def distancematrix(data, mask=None, weight=None, transpose=False, dist="e"):
522    """Calculate and return a distance matrix from the data.
523
524    This function returns the distance matrix calculated from the data.
525
526    Keyword arguments:
527     - data: nrows x ncolumns array containing the data values.
528     - mask: nrows x ncolumns array of integers, showing which data are
529       missing. If mask[i, j]==0, then data[i, j] is missing.
530     - weight: the weights to be used when calculating distances.
531     - transpose: if False: the distances between rows are calculated;
532                  if True:  the distances between columns are calculated.
533     - dist: specifies the distance function to be used:
534       - dist == 'e': Euclidean distance
535       - dist == 'b': City Block distance
536       - dist == 'c': Pearson correlation
537       - dist == 'a': absolute value of the correlation
538       - dist == 'u': uncentered correlation
539       - dist == 'x': absolute uncentered correlation
540       - dist == 's': Spearman's rank correlation
541       - dist == 'k': Kendall's tau
542
543    Return value:
544    The distance matrix is returned as a list of 1D arrays containing the
545    distance matrix calculated from the data. The number of columns in eac
546    row is equal to the row number. Hence, the first row has zero length.
547    For example:
548
549    >>> from numpy import array
550    >>> from Bio.Cluster import distancematrix
551    >>> data = array([[0, 1,  2,  3],
552    ...               [4, 5,  6,  7],
553    ...               [8, 9, 10, 11],
554    ...               [1, 2,  3,  4]])
555    >>> distances = distancematrix(data, dist='e')
556    >>> distances
557    [array([], dtype=float64), array([ 16.]), array([ 64.,  16.]), array([  1.,   9.,  49.])]
558
559    which can be rewritten as
560       distances = [array([], dtype=float64),
561                    array([ 16.]),
562                    array([ 64.,  16.]),
563                    array([  1.,   9.,  49.])]
564
565    This corresponds to the distance matrix:
566
567        [ 0., 16., 64.,  1.]
568        [16.,  0., 16.,  9.]
569        [64., 16.,  0., 49.]
570        [ 1.,  9., 49.,  0.]
571    """
572    data = __check_data(data)
573    shape = data.shape
574    mask = __check_mask(mask, shape)
575    if transpose:
576        ndata, nitems = shape
577    else:
578        nitems, ndata = shape
579    weight = __check_weight(weight, ndata)
580    matrix = [numpy.empty(i, dtype="d") for i in range(nitems)]
581    _cluster.distancematrix(data, mask, weight, transpose, dist, matrix)
582    return matrix
583
584
585def pca(data):
586    """Perform principal component analysis.
587
588    Keyword arguments:
589     - data: nrows x ncolumns array containing the data values.
590
591    Return value:
592    This function returns an array containing the mean of each column, the
593    principal components as an nmin x ncolumns array, as well as the
594    coordinates (an nrows x nmin array) of the data along the principal
595    components, and the associated eigenvalues. The principal components, the
596    coordinates, and the eigenvalues are sorted by the magnitude of the
597    eigenvalue, with the largest eigenvalues appearing first. Here, nmin is
598    the smaller of nrows and ncolumns.
599    Adding the column means to the dot product of the coordinates and the
600    principal components recreates the data matrix:
601
602    >>> from numpy import array, dot, amax, amin
603    >>> from Bio.Cluster import pca
604    >>> matrix = array([[ 0.,  0.,  0.],
605    ...                 [ 1.,  0.,  0.],
606    ...                 [ 7.,  3.,  0.],
607    ...                 [ 4.,  2.,  6.]])
608    >>> columnmean, coordinates, pc, _ = pca(matrix)
609    >>> m = matrix - (columnmean + dot(coordinates, pc))
610    >>> amax(m) < 1e-12 and amin(m) > -1e-12
611    True
612
613    """
614    data = __check_data(data)
615    nrows, ncols = data.shape
616    nmin = min(nrows, ncols)
617    columnmean = numpy.empty(ncols, dtype="d")
618    pc = numpy.empty((nmin, ncols), dtype="d")
619    coordinates = numpy.empty((nrows, nmin), dtype="d")
620    eigenvalues = numpy.empty(nmin, dtype="d")
621    _cluster.pca(data, columnmean, coordinates, pc, eigenvalues)
622    return columnmean, coordinates, pc, eigenvalues
623
624
625class Record:
626    """Store gene expression data.
627
628    A Record stores the gene expression data and related information contained
629    in a data file following the file format defined for Michael Eisen's
630    Cluster/TreeView program.
631
632    Attributes:
633     - data: a matrix containing the gene expression data
634     - mask: a matrix containing only 1's and 0's, denoting which values
635       are present (1) or missing (0). If all items of mask are
636       one (no missing data), then mask is set to None.
637     - geneid: a list containing a unique identifier for each gene
638       (e.g., ORF name)
639     - genename: a list containing an additional description for each gene
640       (e.g., gene name)
641     - gweight: the weight to be used for each gene when calculating the
642       distance
643     - gorder: an array of real numbers indicating the preferred order of the
644       genes in the output file
645     - expid: a list containing a unique identifier for each sample.
646     - eweight: the weight to be used for each sample when calculating the
647       distance
648     - eorder: an array of real numbers indication the preferred order of the
649       samples in the output file
650     - uniqid: the string that was used instead of UNIQID in the input file.
651
652    """
653
654    def __init__(self, handle=None):
655        """Read gene expression data from the file handle and return a Record.
656
657        The file should be in the format defined for Michael Eisen's
658        Cluster/TreeView program.
659        """
660        self.data = None
661        self.mask = None
662        self.geneid = None
663        self.genename = None
664        self.gweight = None
665        self.gorder = None
666        self.expid = None
667        self.eweight = None
668        self.eorder = None
669        self.uniqid = None
670        if not handle:
671            return
672        line = handle.readline().strip("\r\n").split("\t")
673        n = len(line)
674        self.uniqid = line[0]
675        self.expid = []
676        cols = {0: "GENEID"}
677        for word in line[1:]:
678            if word == "NAME":
679                cols[line.index(word)] = word
680                self.genename = []
681            elif word == "GWEIGHT":
682                cols[line.index(word)] = word
683                self.gweight = []
684            elif word == "GORDER":
685                cols[line.index(word)] = word
686                self.gorder = []
687            else:
688                self.expid.append(word)
689        self.geneid = []
690        self.data = []
691        self.mask = []
692        needmask = 0
693        for line in handle:
694            line = line.strip("\r\n").split("\t")
695            if len(line) != n:
696                raise ValueError(
697                    "Line with %d columns found (expected %d)" % (len(line), n)
698                )
699            if line[0] == "EWEIGHT":
700                i = max(cols) + 1
701                self.eweight = numpy.array(line[i:], float)
702                continue
703            if line[0] == "EORDER":
704                i = max(cols) + 1
705                self.eorder = numpy.array(line[i:], float)
706                continue
707            rowdata = []
708            rowmask = []
709            n = len(line)
710            for i in range(n):
711                word = line[i]
712                if i in cols:
713                    if cols[i] == "GENEID":
714                        self.geneid.append(word)
715                    if cols[i] == "NAME":
716                        self.genename.append(word)
717                    if cols[i] == "GWEIGHT":
718                        self.gweight.append(float(word))
719                    if cols[i] == "GORDER":
720                        self.gorder.append(float(word))
721                    continue
722                if not word:
723                    rowdata.append(0.0)
724                    rowmask.append(0)
725                    needmask = 1
726                else:
727                    rowdata.append(float(word))
728                    rowmask.append(1)
729            self.data.append(rowdata)
730            self.mask.append(rowmask)
731        self.data = numpy.array(self.data)
732        if needmask:
733            self.mask = numpy.array(self.mask, int)
734        else:
735            self.mask = None
736        if self.gweight:
737            self.gweight = numpy.array(self.gweight)
738        if self.gorder:
739            self.gorder = numpy.array(self.gorder)
740
741    def treecluster(self, transpose=False, method="m", dist="e"):
742        """Apply hierarchical clustering and return a Tree object.
743
744        The pairwise single, complete, centroid, and average linkage
745        hierarchical clustering methods are available.
746
747        Keyword arguments:
748         - transpose: if False: rows are clustered;
749                      if True: columns are clustered.
750         - dist: specifies the distance function to be used:
751           - dist == 'e': Euclidean distance
752           - dist == 'b': City Block distance
753           - dist == 'c': Pearson correlation
754           - dist == 'a': absolute value of the correlation
755           - dist == 'u': uncentered correlation
756           - dist == 'x': absolute uncentered correlation
757           - dist == 's': Spearman's rank correlation
758           - dist == 'k': Kendall's tau
759         - method: specifies which linkage method is used:
760           - method == 's': Single pairwise linkage
761           - method == 'm': Complete (maximum) pairwise linkage (default)
762           - method == 'c': Centroid linkage
763           - method == 'a': Average pairwise linkage
764
765        See the description of the Tree class for more information about
766        the Tree object returned by this method.
767        """
768        if transpose:
769            weight = self.gweight
770        else:
771            weight = self.eweight
772        return treecluster(self.data, self.mask, weight, transpose, method, dist)
773
774    def kcluster(
775        self,
776        nclusters=2,
777        transpose=False,
778        npass=1,
779        method="a",
780        dist="e",
781        initialid=None,
782    ):
783        """Apply k-means or k-median clustering.
784
785        This method returns a tuple (clusterid, error, nfound).
786
787        Keyword arguments:
788         - nclusters: number of clusters (the 'k' in k-means)
789         - transpose: if False, genes (rows) are clustered;
790                      if True, samples (columns) are clustered.
791         - npass: number of times the k-means clustering algorithm is
792           performed, each time with a different (random) initial condition.
793         - method: specifies how the center of a cluster is found:
794           - method == 'a': arithmetic mean
795           - method == 'm': median
796         - dist: specifies the distance function to be used:
797           - dist == 'e': Euclidean distance
798           - dist == 'b': City Block distance
799           - dist == 'c': Pearson correlation
800           - dist == 'a': absolute value of the correlation
801           - dist == 'u': uncentered correlation
802           - dist == 'x': absolute uncentered correlation
803           - dist == 's': Spearman's rank correlation
804           - dist == 'k': Kendall's tau
805         - initialid: the initial clustering from which the algorithm should
806           start. If initialid is None, the routine carries out npass
807           repetitions of the EM algorithm, each time starting from a different
808           random initial clustering. If initialid is given, the routine
809           carries out the EM algorithm only once, starting from the given
810           initial clustering and without randomizing the order in which items
811           are assigned to clusters (i.e., using the same order as in the data
812           matrix). In that case, the k-means algorithm is fully deterministic.
813
814        Return values:
815         - clusterid: array containing the number of the cluster to which each
816           gene/sample was assigned in the best k-means clustering
817           solution that was found in the npass runs;
818         - error: the within-cluster sum of distances for the returned
819           k-means clustering solution;
820         - nfound: the number of times this solution was found.
821        """
822        if transpose:
823            weight = self.gweight
824        else:
825            weight = self.eweight
826        return kcluster(
827            self.data,
828            nclusters,
829            self.mask,
830            weight,
831            transpose,
832            npass,
833            method,
834            dist,
835            initialid,
836        )
837
838    def somcluster(
839        self, transpose=False, nxgrid=2, nygrid=1, inittau=0.02, niter=1, dist="e"
840    ):
841        """Calculate a self-organizing map on a rectangular grid.
842
843        The somcluster method returns a tuple (clusterid, celldata).
844
845        Keyword arguments:
846         - transpose: if False, genes (rows) are clustered;
847                      if True,  samples (columns) are clustered.
848         - nxgrid: the horizontal dimension of the rectangular SOM map
849         - nygrid: the vertical dimension of the rectangular SOM map
850         - inittau: the initial value of tau (the neighborbood function)
851         - niter: the number of iterations
852         - dist: specifies the distance function to be used:
853           - dist == 'e': Euclidean distance
854           - dist == 'b': City Block distance
855           - dist == 'c': Pearson correlation
856           - dist == 'a': absolute value of the correlation
857           - dist == 'u': uncentered correlation
858           - dist == 'x': absolute uncentered correlation
859           - dist == 's': Spearman's rank correlation
860           - dist == 'k': Kendall's tau
861
862        Return values:
863         - clusterid: array with two columns, while the number of rows is equal
864           to the number of genes or the number of samples depending on
865           whether genes or samples are being clustered. Each row in
866           the array contains the x and y coordinates of the cell in the
867           rectangular SOM grid to which the gene or samples was assigned.
868         - celldata: an array with dimensions (nxgrid, nygrid, number of
869           samples) if genes are being clustered, or (nxgrid, nygrid,
870           number of genes) if samples are being clustered. Each item
871           [ix, iy] of this array is a 1D vector containing the gene
872           expression data for the centroid of the cluster in the SOM grid
873           cell with coordinates [ix, iy].
874        """
875        if transpose:
876            weight = self.gweight
877        else:
878            weight = self.eweight
879        return somcluster(
880            self.data,
881            self.mask,
882            weight,
883            transpose,
884            nxgrid,
885            nygrid,
886            inittau,
887            niter,
888            dist,
889        )
890
891    def clustercentroids(self, clusterid=None, method="a", transpose=False):
892        """Calculate the cluster centroids and return a tuple (cdata, cmask).
893
894        The centroid is defined as either the mean or the median over all
895        items for each dimension.
896
897        Keyword arguments:
898         - data: nrows x ncolumns array containing the expression data
899         - mask: nrows x ncolumns array of integers, showing which data
900           are missing. If mask[i, j]==0, then data[i, j] is missing.
901         - transpose: if False, gene (row) clusters are considered;
902                      if True, sample (column) clusters are considered.
903         - clusterid: array containing the cluster number for each gene or
904           sample. The cluster number should be non-negative.
905         - method: specifies how the centroid is calculated:
906           - method == 'a': arithmetic mean over each dimension. (default)
907           - method == 'm': median over each dimension.
908
909        Return values:
910         - cdata: 2D array containing the cluster centroids. If transpose
911           is False, then the dimensions of cdata are nclusters x ncolumns.
912           If transpose is True, then the dimensions of cdata are nrows x
913           nclusters.
914         - cmask: 2D array of integers describing which items in cdata,
915           if any, are missing.
916        """
917        return clustercentroids(self.data, self.mask, clusterid, method, transpose)
918
919    def clusterdistance(
920        self, index1=0, index2=0, method="a", dist="e", transpose=False
921    ):
922        """Calculate the distance between two clusters.
923
924        Keyword arguments:
925         - index1: 1D array identifying which genes/samples belong to the
926           first cluster. If the cluster contains only one gene, then
927           index1 can also be written as a single integer.
928         - index2: 1D array identifying which genes/samples belong to the
929           second cluster. If the cluster contains only one gene, then
930           index2 can also be written as a single integer.
931         - transpose: if False, genes (rows) are clustered;
932                      if True, samples (columns) are clustered.
933         - dist: specifies the distance function to be used:
934           - dist == 'e': Euclidean distance
935           - dist == 'b': City Block distance
936           - dist == 'c': Pearson correlation
937           - dist == 'a': absolute value of the correlation
938           - dist == 'u': uncentered correlation
939           - dist == 'x': absolute uncentered correlation
940           - dist == 's': Spearman's rank correlation
941           - dist == 'k': Kendall's tau
942         - method: specifies how the distance between two clusters is defined:
943           - method == 'a': the distance between the arithmetic means
944           of the two clusters
945           - method == 'm': the distance between the medians of the
946           two clusters
947           - method == 's': the smallest pairwise distance between members
948           of the two clusters
949           - method == 'x': the largest pairwise distance between members
950           of the two clusters
951           - method == 'v': average of the pairwise distances between members
952           of the two clusters
953         - transpose: if False: clusters of rows are considered;
954                      if True: clusters of columns are considered.
955        """
956        if transpose:
957            weight = self.gweight
958        else:
959            weight = self.eweight
960        return clusterdistance(
961            self.data, self.mask, weight, index1, index2, method, dist, transpose
962        )
963
964    def distancematrix(self, transpose=False, dist="e"):
965        """Calculate the distance matrix and return it as a list of arrays.
966
967        Keyword arguments:
968         - transpose:
969             if False: calculate the distances between genes (rows);
970             if True: calculate the distances between samples (columns).
971         - dist: specifies the distance function to be used:
972           - dist == 'e': Euclidean distance
973           - dist == 'b': City Block distance
974           - dist == 'c': Pearson correlation
975           - dist == 'a': absolute value of the correlation
976           - dist == 'u': uncentered correlation
977           - dist == 'x': absolute uncentered correlation
978           - dist == 's': Spearman's rank correlation
979           - dist == 'k': Kendall's tau
980
981        Return value:
982
983        The distance matrix is returned as a list of 1D arrays containing the
984        distance matrix between the gene expression data. The number of columns
985        in each row is equal to the row number. Hence, the first row has zero
986        length. An example of the return value is:
987
988            matrix = [[],
989                      array([1.]),
990                      array([7., 3.]),
991                      array([4., 2., 6.])]
992
993        This corresponds to the distance matrix:
994
995            [0., 1., 7., 4.]
996            [1., 0., 3., 2.]
997            [7., 3., 0., 6.]
998            [4., 2., 6., 0.]
999
1000        """
1001        if transpose:
1002            weight = self.gweight
1003        else:
1004            weight = self.eweight
1005        return distancematrix(self.data, self.mask, weight, transpose, dist)
1006
1007    def save(self, jobname, geneclusters=None, expclusters=None):
1008        """Save the clustering results.
1009
1010        The saved files follow the convention for the Java TreeView program,
1011        which can therefore be used to view the clustering result.
1012
1013        Keyword arguments:
1014         - jobname: The base name of the files to be saved. The filenames
1015           are jobname.cdt, jobname.gtr, and jobname.atr for hierarchical
1016           clustering, and jobname-K*.cdt, jobname-K*.kgg, jobname-K*.kag
1017           for k-means clustering results.
1018         - geneclusters: For hierarchical clustering results, geneclusters
1019           is a Tree object as returned by the treecluster method. For k-means
1020           clustering results, geneclusters is a vector containing ngenes
1021           integers, describing to which cluster a given gene belongs. This
1022           vector can be calculated by kcluster.
1023         - expclusters: For hierarchical clustering results, expclusters
1024           is a Tree object as returned by the treecluster method. For k-means
1025           clustering results, expclusters is a vector containing nexps
1026           integers, describing to which cluster a given sample belongs. This
1027           vector can be calculated by kcluster.
1028        """
1029        (ngenes, nexps) = numpy.shape(self.data)
1030        if self.gorder is None:
1031            gorder = numpy.arange(ngenes)
1032        else:
1033            gorder = self.gorder
1034        if self.eorder is None:
1035            eorder = numpy.arange(nexps)
1036        else:
1037            eorder = self.eorder
1038        if (
1039            geneclusters is not None
1040            and expclusters is not None
1041            and type(geneclusters) != type(expclusters)
1042        ):
1043            raise ValueError(
1044                "found one k-means and one hierarchical "
1045                "clustering solution in geneclusters and "
1046                "expclusters"
1047            )
1048        gid = 0
1049        aid = 0
1050        filename = jobname
1051        postfix = ""
1052        if isinstance(geneclusters, Tree):
1053            # This is a hierarchical clustering result.
1054            geneindex = self._savetree(jobname, geneclusters, gorder, False)
1055            gid = 1
1056        elif geneclusters is not None:
1057            # This is a k-means clustering result.
1058            filename = jobname + "_K"
1059            k = max(geneclusters) + 1
1060            kggfilename = "%s_K_G%d.kgg" % (jobname, k)
1061            geneindex = self._savekmeans(kggfilename, geneclusters, gorder, False)
1062            postfix = "_G%d" % k
1063        else:
1064            geneindex = numpy.argsort(gorder)
1065        if isinstance(expclusters, Tree):
1066            # This is a hierarchical clustering result.
1067            expindex = self._savetree(jobname, expclusters, eorder, True)
1068            aid = 1
1069        elif expclusters is not None:
1070            # This is a k-means clustering result.
1071            filename = jobname + "_K"
1072            k = max(expclusters) + 1
1073            kagfilename = "%s_K_A%d.kag" % (jobname, k)
1074            expindex = self._savekmeans(kagfilename, expclusters, eorder, True)
1075            postfix += "_A%d" % k
1076        else:
1077            expindex = numpy.argsort(eorder)
1078        filename = filename + postfix
1079        self._savedata(filename, gid, aid, geneindex, expindex)
1080
1081    def _savetree(self, jobname, tree, order, transpose):
1082        """Save the hierarchical clustering solution (PRIVATE)."""
1083        if transpose:
1084            extension = ".atr"
1085            keyword = "ARRY"
1086        else:
1087            extension = ".gtr"
1088            keyword = "GENE"
1089        index = tree.sort(order)
1090        nnodes = len(tree)
1091        with open(jobname + extension, "w") as outputfile:
1092            nodeID = [""] * nnodes
1093            nodedist = numpy.array([node.distance for node in tree[:]])
1094            for nodeindex in range(nnodes):
1095                min1 = tree[nodeindex].left
1096                min2 = tree[nodeindex].right
1097                nodeID[nodeindex] = "NODE%dX" % (nodeindex + 1)
1098                outputfile.write(nodeID[nodeindex])
1099                outputfile.write("\t")
1100                if min1 < 0:
1101                    index1 = -min1 - 1
1102                    outputfile.write(nodeID[index1] + "\t")
1103                    nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index1])
1104                else:
1105                    outputfile.write("%s%dX\t" % (keyword, min1))
1106                if min2 < 0:
1107                    index2 = -min2 - 1
1108                    outputfile.write(nodeID[index2] + "\t")
1109                    nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index2])
1110                else:
1111                    outputfile.write("%s%dX\t" % (keyword, min2))
1112                outputfile.write(str(1.0 - nodedist[nodeindex]))
1113                outputfile.write("\n")
1114        return index
1115
1116    def _savekmeans(self, filename, clusterids, order, transpose):
1117        """Save the k-means clustering solution (PRIVATE)."""
1118        if transpose:
1119            label = "ARRAY"
1120            names = self.expid
1121        else:
1122            label = self.uniqid
1123            names = self.geneid
1124        with open(filename, "w") as outputfile:
1125            outputfile.write(label + "\tGROUP\n")
1126            index = numpy.argsort(order)
1127            n = len(names)
1128            sortedindex = numpy.zeros(n, int)
1129            counter = 0
1130            cluster = 0
1131            while counter < n:
1132                for j in index:
1133                    if clusterids[j] == cluster:
1134                        outputfile.write("%s\t%s\n" % (names[j], cluster))
1135                        sortedindex[counter] = j
1136                        counter += 1
1137                cluster += 1
1138        return sortedindex
1139
1140    def _savedata(self, jobname, gid, aid, geneindex, expindex):
1141        """Save the clustered data (PRIVATE)."""
1142        if self.genename is None:
1143            genename = self.geneid
1144        else:
1145            genename = self.genename
1146        (ngenes, nexps) = numpy.shape(self.data)
1147        with open(jobname + ".cdt", "w") as outputfile:
1148            if self.mask is not None:
1149                mask = self.mask
1150            else:
1151                mask = numpy.ones((ngenes, nexps), int)
1152            if self.gweight is not None:
1153                gweight = self.gweight
1154            else:
1155                gweight = numpy.ones(ngenes)
1156            if self.eweight is not None:
1157                eweight = self.eweight
1158            else:
1159                eweight = numpy.ones(nexps)
1160            if gid:
1161                outputfile.write("GID\t")
1162            outputfile.write(self.uniqid)
1163            outputfile.write("\tNAME\tGWEIGHT")
1164            # Now add headers for data columns.
1165            for j in expindex:
1166                outputfile.write("\t%s" % self.expid[j])
1167            outputfile.write("\n")
1168            if aid:
1169                outputfile.write("AID")
1170                if gid:
1171                    outputfile.write("\t")
1172                outputfile.write("\t\t")
1173                for j in expindex:
1174                    outputfile.write("\tARRY%dX" % j)
1175                outputfile.write("\n")
1176            outputfile.write("EWEIGHT")
1177            if gid:
1178                outputfile.write("\t")
1179            outputfile.write("\t\t")
1180            for j in expindex:
1181                outputfile.write("\t%f" % eweight[j])
1182            outputfile.write("\n")
1183            for i in geneindex:
1184                if gid:
1185                    outputfile.write("GENE%dX\t" % i)
1186                outputfile.write(
1187                    "%s\t%s\t%f" % (self.geneid[i], genename[i], gweight[i])
1188                )
1189                for j in expindex:
1190                    outputfile.write("\t")
1191                    if mask[i, j]:
1192                        outputfile.write(str(self.data[i, j]))
1193                outputfile.write("\n")
1194
1195
1196def read(handle):
1197    """Read gene expression data from the file handle and return a Record.
1198
1199    The file should be in the file format defined for Michael Eisen's
1200    Cluster/TreeView program.
1201    """
1202    return Record(handle)
1203
1204
1205# Everything below is private
1206#
1207
1208
1209def __check_data(data):
1210    if isinstance(data, numpy.ndarray):
1211        data = numpy.require(data, dtype="d", requirements="C")
1212    else:
1213        data = numpy.array(data, dtype="d")
1214    if data.ndim != 2:
1215        raise ValueError("data should be 2-dimensional")
1216    if numpy.isnan(data).any():
1217        raise ValueError("data contains NaN values")
1218    return data
1219
1220
1221def __check_mask(mask, shape):
1222    if mask is None:
1223        return numpy.ones(shape, dtype="intc")
1224    elif isinstance(mask, numpy.ndarray):
1225        return numpy.require(mask, dtype="intc", requirements="C")
1226    else:
1227        return numpy.array(mask, dtype="intc")
1228
1229
1230def __check_weight(weight, ndata):
1231    if weight is None:
1232        return numpy.ones(ndata, dtype="d")
1233    if isinstance(weight, numpy.ndarray):
1234        weight = numpy.require(weight, dtype="d", requirements="C")
1235    else:
1236        weight = numpy.array(weight, dtype="d")
1237    if numpy.isnan(weight).any():
1238        raise ValueError("weight contains NaN values")
1239    return weight
1240
1241
1242def __check_initialid(initialid, npass, nitems):
1243    if initialid is None:
1244        if npass <= 0:
1245            raise ValueError("npass should be a positive integer")
1246        clusterid = numpy.empty(nitems, dtype="intc")
1247    else:
1248        npass = 0
1249        clusterid = numpy.array(initialid, dtype="intc")
1250    return clusterid, npass
1251
1252
1253def __check_index(index):
1254    if index is None:
1255        return numpy.zeros(1, dtype="intc")
1256    elif isinstance(index, numbers.Integral):
1257        return numpy.array([index], dtype="intc")
1258    elif isinstance(index, numpy.ndarray):
1259        return numpy.require(index, dtype="intc", requirements="C")
1260    else:
1261        return numpy.array(index, dtype="intc")
1262
1263
1264def __check_distancematrix(distancematrix):
1265    if distancematrix is None:
1266        return distancematrix
1267    if isinstance(distancematrix, numpy.ndarray):
1268        distancematrix = numpy.require(distancematrix, dtype="d", requirements="C")
1269    else:
1270        try:
1271            distancematrix = numpy.array(distancematrix, dtype="d")
1272        except ValueError:
1273            n = len(distancematrix)
1274            d = [None] * n
1275            for i, row in enumerate(distancematrix):
1276                if isinstance(row, numpy.ndarray):
1277                    row = numpy.require(row, dtype="d", requirements="C")
1278                else:
1279                    row = numpy.array(row, dtype="d")
1280                if row.ndim != 1:
1281                    raise ValueError("row %d is not one-dimensional" % i) from None
1282                m = len(row)
1283                if m != i:
1284                    raise ValueError(
1285                        "row %d has incorrect size (%d, expected %d)" % (i, m, i)
1286                    ) from None
1287                if numpy.isnan(row).any():
1288                    raise ValueError("distancematrix contains NaN values") from None
1289                d[i] = row
1290            return d
1291    if numpy.isnan(distancematrix).any():
1292        raise ValueError("distancematrix contains NaN values")
1293    return distancematrix
1294