1# This code is part of the Biopython distribution and governed by its 2# license. Please see the LICENSE file that should have been included 3# as part of this package. 4# 5"""Cluster Analysis. 6 7The Bio.Cluster provides commonly used clustering algorithms and was 8designed with the application to gene expression data in mind. However, 9this module can also be used for cluster analysis of other types of data. 10 11Bio.Cluster and the underlying C Clustering Library is described in 12M. de Hoon et al. (2004) https://doi.org/10.1093/bioinformatics/bth078 13""" 14 15import numbers 16 17try: 18 import numpy 19except ImportError: 20 from Bio import MissingPythonDependencyError 21 22 raise MissingPythonDependencyError( 23 "Please install numpy if you want to use Bio.Cluster. " 24 "See http://www.numpy.org/" 25 ) from None 26 27from . import _cluster 28 29__all__ = ( 30 "Node", 31 "Tree", 32 "kcluster", 33 "kmedoids", 34 "treecluster", 35 "somcluster", 36 "clusterdistance", 37 "clustercentroids", 38 "distancematrix", 39 "pca", 40 "Record", 41 "read", 42) 43 44 45__version__ = _cluster.version() 46 47 48class Node(_cluster.Node): 49 """Element of a hierarchical clustering tree. 50 51 A node contains items or other Nodes(sub-nodes). 52 """ 53 54 __doc__ = _cluster.Node.__doc__ 55 56 57class Tree(_cluster.Tree): 58 """Hierarchical clustering tree. 59 60 A Tree consists of Nodes. 61 """ 62 63 def sort(self, order=None): 64 """Sort the hierarchical clustering tree. 65 66 Sort the hierarchical clustering tree by switching the left and 67 right subnode of nodes such that the elements in the left-to-right 68 order of the tree tend to have increasing order values. 69 70 Return the indices of the elements in the left-to-right order in 71 the hierarchical clustering tree, such that the element with index 72 indices[i] occurs at position i in the dendrogram. 73 74 """ 75 n = len(self) + 1 76 indices = numpy.ones(n, dtype="intc") 77 if order is None: 78 order = numpy.ones(n, dtype="d") 79 elif isinstance(order, numpy.ndarray): 80 order = numpy.require(order, dtype="d", requirements="C") 81 else: 82 order = numpy.array(order, dtype="d") 83 _cluster.Tree.sort(self, indices, order) 84 return indices 85 86 def cut(self, nclusters=None): 87 """Create clusters by cutting the hierarchical clustering tree. 88 89 Divide the elements in a hierarchical clustering result mytree 90 into clusters, and return an array with the number of the cluster 91 to which each element was assigned. 92 93 Keyword arguments: 94 - nclusters: The desired number of clusters. 95 """ 96 n = len(self) + 1 97 indices = numpy.ones(n, dtype="intc") 98 if nclusters is None: 99 nclusters = n 100 _cluster.Tree.cut(self, indices, nclusters) 101 return indices 102 103 104def kcluster( 105 data, 106 nclusters=2, 107 mask=None, 108 weight=None, 109 transpose=False, 110 npass=1, 111 method="a", 112 dist="e", 113 initialid=None, 114): 115 """Perform k-means clustering. 116 117 This function performs k-means clustering on the values in data, and 118 returns the cluster assignments, the within-cluster sum of distances 119 of the optimal k-means clustering solution, and the number of times 120 the optimal solution was found. 121 122 Keyword arguments: 123 - data: nrows x ncolumns array containing the data values. 124 - nclusters: number of clusters (the 'k' in k-means). 125 - mask: nrows x ncolumns array of integers, showing which data 126 are missing. If mask[i,j]==0, then data[i,j] is missing. 127 - weight: the weights to be used when calculating distances 128 - transpose: 129 - if False: rows are clustered; 130 - if True: columns are clustered. 131 - npass: number of times the k-means clustering algorithm is 132 performed, each time with a different (random) initial 133 condition. 134 - method: specifies how the center of a cluster is found: 135 - method == 'a': arithmetic mean; 136 - method == 'm': median. 137 - dist: specifies the distance function to be used: 138 - dist == 'e': Euclidean distance; 139 - dist == 'b': City Block distance; 140 - dist == 'c': Pearson correlation; 141 - dist == 'a': absolute value of the correlation; 142 - dist == 'u': uncentered correlation; 143 - dist == 'x': absolute uncentered correlation; 144 - dist == 's': Spearman's rank correlation; 145 - dist == 'k': Kendall's tau. 146 - initialid: the initial clustering from which the algorithm 147 should start. 148 If initialid is None, the routine carries out npass 149 repetitions of the EM algorithm, each time starting from a 150 different random initial clustering. If initialid is given, 151 the routine carries out the EM algorithm only once, starting 152 from the given initial clustering and without randomizing the 153 order in which items are assigned to clusters (i.e., using 154 the same order as in the data matrix). In that case, the 155 k-means algorithm is fully deterministic. 156 157 Return values: 158 - clusterid: array containing the number of the cluster to which each 159 item was assigned in the best k-means clustering solution that was 160 found in the npass runs; 161 - error: the within-cluster sum of distances for the returned k-means 162 clustering solution; 163 - nfound: the number of times this solution was found. 164 """ 165 data = __check_data(data) 166 shape = data.shape 167 if transpose: 168 ndata, nitems = shape 169 else: 170 nitems, ndata = shape 171 mask = __check_mask(mask, shape) 172 weight = __check_weight(weight, ndata) 173 clusterid, npass = __check_initialid(initialid, npass, nitems) 174 error, nfound = _cluster.kcluster( 175 data, nclusters, mask, weight, transpose, npass, method, dist, clusterid 176 ) 177 return clusterid, error, nfound 178 179 180def kmedoids(distance, nclusters=2, npass=1, initialid=None): 181 """Perform k-medoids clustering. 182 183 This function performs k-medoids clustering, and returns the cluster 184 assignments, the within-cluster sum of distances of the optimal 185 k-medoids clustering solution, and the number of times the optimal 186 solution was found. 187 188 Keyword arguments: 189 - distance: The distance matrix between the items. There are three 190 ways in which you can pass a distance matrix: 191 1. a 2D Numerical Python array (in which only the left-lower 192 part of the array will be accessed); 193 2. a 1D Numerical Python array containing the distances 194 consecutively; 195 3. a list of rows containing the lower-triangular part of 196 the distance matrix. 197 198 Examples are: 199 200 >>> from numpy import array 201 >>> # option 1: 202 >>> distance = array([[0.0, 1.1, 2.3], 203 ... [1.1, 0.0, 4.5], 204 ... [2.3, 4.5, 0.0]]) 205 >>> # option 2: 206 >>> distance = array([1.1, 2.3, 4.5]) 207 >>> # option 3: 208 >>> distance = [array([]), 209 ... array([1.1]), 210 ... array([2.3, 4.5])] 211 212 213 These three correspond to the same distance matrix. 214 - nclusters: number of clusters (the 'k' in k-medoids) 215 - npass: the number of times the k-medoids clustering algorithm 216 is performed, each time with a different (random) initial 217 condition. 218 - initialid: the initial clustering from which the algorithm should start. 219 If initialid is not given, the routine carries out npass 220 repetitions of the EM algorithm, each time starting from a 221 different random initial clustering. If initialid is given, 222 the routine carries out the EM algorithm only once, starting 223 from the initial clustering specified by initialid and 224 without randomizing the order in which items are assigned to 225 clusters (i.e., using the same order as in the data matrix). 226 In that case, the k-medoids algorithm is fully deterministic. 227 228 Return values: 229 - clusterid: array containing the number of the cluster to which each 230 item was assigned in the best k-means clustering solution that was 231 found in the npass runs; 232 - error: the within-cluster sum of distances for the returned k-means 233 clustering solution; 234 - nfound: the number of times this solution was found. 235 """ 236 distance = __check_distancematrix(distance) 237 nitems = len(distance) 238 clusterid, npass = __check_initialid(initialid, npass, nitems) 239 error, nfound = _cluster.kmedoids(distance, nclusters, npass, clusterid) 240 return clusterid, error, nfound 241 242 243def treecluster( 244 data, 245 mask=None, 246 weight=None, 247 transpose=False, 248 method="m", 249 dist="e", 250 distancematrix=None, 251): 252 """Perform hierarchical clustering, and return a Tree object. 253 254 This function implements the pairwise single, complete, centroid, and 255 average linkage hierarchical clustering methods. 256 257 Keyword arguments: 258 - data: nrows x ncolumns array containing the data values. 259 - mask: nrows x ncolumns array of integers, showing which data are 260 missing. If mask[i][j]==0, then data[i][j] is missing. 261 - weight: the weights to be used when calculating distances. 262 - transpose: 263 - if False, rows are clustered; 264 - if True, columns are clustered. 265 - dist: specifies the distance function to be used: 266 - dist == 'e': Euclidean distance 267 - dist == 'b': City Block distance 268 - dist == 'c': Pearson correlation 269 - dist == 'a': absolute value of the correlation 270 - dist == 'u': uncentered correlation 271 - dist == 'x': absolute uncentered correlation 272 - dist == 's': Spearman's rank correlation 273 - dist == 'k': Kendall's tau 274 - method: specifies which linkage method is used: 275 - method == 's': Single pairwise linkage 276 - method == 'm': Complete (maximum) pairwise linkage (default) 277 - method == 'c': Centroid linkage 278 - method == 'a': Average pairwise linkage 279 - distancematrix: The distance matrix between the items. There are 280 three ways in which you can pass a distance matrix: 281 1. a 2D Numerical Python array (in which only the left-lower 282 part of the array will be accessed); 283 2. a 1D Numerical Python array containing the distances 284 consecutively; 285 3. a list of rows containing the lower-triangular part of 286 the distance matrix. 287 288 Examples are: 289 290 >>> from numpy import array 291 >>> # option 1: 292 >>> distance = array([[0.0, 1.1, 2.3], 293 ... [1.1, 0.0, 4.5], 294 ... [2.3, 4.5, 0.0]]) 295 >>> # option 2: 296 >>> distance = array([1.1, 2.3, 4.5]) 297 >>> # option 3: 298 >>> distance = [array([]), 299 ... array([1.1]), 300 ... array([2.3, 4.5])] 301 302 These three correspond to the same distance matrix. 303 304 PLEASE NOTE: 305 As the treecluster routine may shuffle the values in the 306 distance matrix as part of the clustering algorithm, be sure 307 to save this array in a different variable before calling 308 treecluster if you need it later. 309 310 Either data or distancematrix should be None. If distancematrix is None, 311 the hierarchical clustering solution is calculated from the values stored 312 in the argument data. If data is None, the hierarchical clustering solution 313 is instead calculated from the distance matrix. Pairwise centroid-linkage 314 clustering can be performed only from the data values and not from the 315 distance matrix. Pairwise single-, maximum-, and average-linkage clustering 316 can be calculated from the data values or from the distance matrix. 317 318 Return value: 319 treecluster returns a Tree object describing the hierarchical clustering 320 result. See the description of the Tree class for more information. 321 """ 322 if data is None and distancematrix is None: 323 raise ValueError("use either data or distancematrix") 324 if data is not None and distancematrix is not None: 325 raise ValueError("use either data or distancematrix; do not use both") 326 if data is not None: 327 data = __check_data(data) 328 shape = data.shape 329 ndata = shape[0] if transpose else shape[1] 330 mask = __check_mask(mask, shape) 331 weight = __check_weight(weight, ndata) 332 if distancematrix is not None: 333 distancematrix = __check_distancematrix(distancematrix) 334 if mask is not None: 335 raise ValueError("mask is ignored if distancematrix is used") 336 if weight is not None: 337 raise ValueError("weight is ignored if distancematrix is used") 338 tree = Tree() 339 _cluster.treecluster( 340 tree, data, mask, weight, transpose, method, dist, distancematrix 341 ) 342 return tree 343 344 345def somcluster( 346 data, 347 mask=None, 348 weight=None, 349 transpose=False, 350 nxgrid=2, 351 nygrid=1, 352 inittau=0.02, 353 niter=1, 354 dist="e", 355): 356 """Calculate a Self-Organizing Map. 357 358 This function implements a Self-Organizing Map on a rectangular grid. 359 360 Keyword arguments: 361 - data: nrows x ncolumns array containing the data values; 362 - mask: nrows x ncolumns array of integers, showing which data are 363 missing. If mask[i][j]==0, then data[i][j] is missing. 364 - weight: the weights to be used when calculating distances 365 - transpose: 366 - if False: rows are clustered; 367 - if True: columns are clustered. 368 - nxgrid: the horizontal dimension of the rectangular SOM map 369 - nygrid: the vertical dimension of the rectangular SOM map 370 - inittau: the initial value of tau (the neighborbood function) 371 - niter: the number of iterations 372 - dist: specifies the distance function to be used: 373 - dist == 'e': Euclidean distance 374 - dist == 'b': City Block distance 375 - dist == 'c': Pearson correlation 376 - dist == 'a': absolute value of the correlation 377 - dist == 'u': uncentered correlation 378 - dist == 'x': absolute uncentered correlation 379 - dist == 's': Spearman's rank correlation 380 - dist == 'k': Kendall's tau 381 382 Return values: 383 384 - clusterid: array with two columns, with the number of rows equal to 385 the items that are being clustered. Each row in the array contains 386 the x and y coordinates of the cell in the rectangular SOM grid to 387 which the item was assigned. 388 - celldata: an array with dimensions [nxgrid, nygrid, number of columns] 389 if rows are being clustered, or [nxgrid, nygrid, number of rows) if 390 columns are being clustered. 391 Each element [ix, iy] of this array is a 1D vector containing the 392 data values for the centroid of the cluster in the SOM grid cell 393 with coordinates [ix, iy]. 394 """ 395 if transpose: 396 ndata, nitems = data.shape 397 else: 398 nitems, ndata = data.shape 399 data = __check_data(data) 400 shape = data.shape 401 mask = __check_mask(mask, shape) 402 weight = __check_weight(weight, ndata) 403 if nxgrid < 1: 404 raise ValueError("nxgrid should be a positive integer (default is 2)") 405 if nygrid < 1: 406 raise ValueError("nygrid should be a positive integer (default is 1)") 407 clusterids = numpy.ones((nitems, 2), dtype="intc") 408 celldata = numpy.empty((nxgrid, nygrid, ndata), dtype="d") 409 _cluster.somcluster( 410 clusterids, celldata, data, mask, weight, transpose, inittau, niter, dist 411 ) 412 return clusterids, celldata 413 414 415def clusterdistance( 416 data, 417 mask=None, 418 weight=None, 419 index1=None, 420 index2=None, 421 method="a", 422 dist="e", 423 transpose=False, 424): 425 """Calculate and return the distance between two clusters. 426 427 Keyword arguments: 428 - data: nrows x ncolumns array containing the data values. 429 - mask: nrows x ncolumns array of integers, showing which data are 430 missing. If mask[i, j]==0, then data[i, j] is missing. 431 - weight: the weights to be used when calculating distances 432 - index1: 1D array identifying which items belong to the 433 first cluster. If the cluster contains only one item, then 434 index1 can also be written as a single integer. 435 - index2: 1D array identifying which items belong to the 436 second cluster. If the cluster contains only one item, then 437 index2 can also be written as a single integer. 438 - dist: specifies the distance function to be used: 439 - dist == 'e': Euclidean distance 440 - dist == 'b': City Block distance 441 - dist == 'c': Pearson correlation 442 - dist == 'a': absolute value of the correlation 443 - dist == 'u': uncentered correlation 444 - dist == 'x': absolute uncentered correlation 445 - dist == 's': Spearman's rank correlation 446 - dist == 'k': Kendall's tau 447 - method: specifies how the distance between two clusters is defined: 448 - method == 'a': the distance between the arithmetic means 449 of the two clusters 450 - method == 'm': the distance between the medians of the two clusters 451 - method == 's': the smallest pairwise distance between members 452 of the two clusters 453 - method == 'x': the largest pairwise distance between members 454 of the two clusters 455 - method == 'v': average of the pairwise distances between members 456 of the two clusters 457 - transpose: 458 - if False: clusters of rows are considered; 459 - if True: clusters of columns are considered. 460 """ 461 data = __check_data(data) 462 shape = data.shape 463 ndata = shape[0] if transpose else shape[1] 464 mask = __check_mask(mask, shape) 465 weight = __check_weight(weight, ndata) 466 index1 = __check_index(index1) 467 index2 = __check_index(index2) 468 return _cluster.clusterdistance( 469 data, mask, weight, index1, index2, method, dist, transpose 470 ) 471 472 473def clustercentroids(data, mask=None, clusterid=None, method="a", transpose=False): 474 """Calculate and return the centroid of each cluster. 475 476 The clustercentroids routine calculates the cluster centroids, given to 477 which cluster each item belongs. The centroid is defined as either 478 the mean or the median over all items for each dimension. 479 480 Keyword arguments: 481 - data: nrows x ncolumns array containing the data values. 482 - mask: nrows x ncolumns array of integers, showing which data are 483 missing. If mask[i, j]==0, then data[i, j] is missing. 484 - clusterid: array containing the cluster number for each item. 485 The cluster number should be non-negative. 486 - method: specifies whether the centroid is calculated from the 487 arithmetic mean (method == 'a', default) or the median (method == 'm') 488 over each dimension. 489 - transpose: if False, each row contains the data for one item; 490 if True, each column contains the data for one item. 491 492 Return values: 493 - cdata: 2D array containing the cluster centroids. 494 If transpose is False, then the dimensions of cdata are 495 nclusters x ncolumns. 496 If transpose is True, then the dimensions of cdata are 497 nrows x nclusters. 498 - cmask: 2D array of integers describing which items in cdata, 499 if any, are missing. 500 """ 501 data = __check_data(data) 502 mask = __check_mask(mask, data.shape) 503 nrows, ncolumns = data.shape 504 if clusterid is None: 505 n = ncolumns if transpose else nrows 506 clusterid = numpy.zeros(n, dtype="intc") 507 nclusters = 1 508 else: 509 clusterid = numpy.require(clusterid, dtype="intc", requirements="C") 510 nclusters = max(clusterid + 1) 511 if transpose: 512 shape = (nrows, nclusters) 513 else: 514 shape = (nclusters, ncolumns) 515 cdata = numpy.zeros(shape, dtype="d") 516 cmask = numpy.zeros(shape, dtype="intc") 517 _cluster.clustercentroids(data, mask, clusterid, method, transpose, cdata, cmask) 518 return cdata, cmask 519 520 521def distancematrix(data, mask=None, weight=None, transpose=False, dist="e"): 522 """Calculate and return a distance matrix from the data. 523 524 This function returns the distance matrix calculated from the data. 525 526 Keyword arguments: 527 - data: nrows x ncolumns array containing the data values. 528 - mask: nrows x ncolumns array of integers, showing which data are 529 missing. If mask[i, j]==0, then data[i, j] is missing. 530 - weight: the weights to be used when calculating distances. 531 - transpose: if False: the distances between rows are calculated; 532 if True: the distances between columns are calculated. 533 - dist: specifies the distance function to be used: 534 - dist == 'e': Euclidean distance 535 - dist == 'b': City Block distance 536 - dist == 'c': Pearson correlation 537 - dist == 'a': absolute value of the correlation 538 - dist == 'u': uncentered correlation 539 - dist == 'x': absolute uncentered correlation 540 - dist == 's': Spearman's rank correlation 541 - dist == 'k': Kendall's tau 542 543 Return value: 544 The distance matrix is returned as a list of 1D arrays containing the 545 distance matrix calculated from the data. The number of columns in eac 546 row is equal to the row number. Hence, the first row has zero length. 547 For example: 548 549 >>> from numpy import array 550 >>> from Bio.Cluster import distancematrix 551 >>> data = array([[0, 1, 2, 3], 552 ... [4, 5, 6, 7], 553 ... [8, 9, 10, 11], 554 ... [1, 2, 3, 4]]) 555 >>> distances = distancematrix(data, dist='e') 556 >>> distances 557 [array([], dtype=float64), array([ 16.]), array([ 64., 16.]), array([ 1., 9., 49.])] 558 559 which can be rewritten as 560 distances = [array([], dtype=float64), 561 array([ 16.]), 562 array([ 64., 16.]), 563 array([ 1., 9., 49.])] 564 565 This corresponds to the distance matrix: 566 567 [ 0., 16., 64., 1.] 568 [16., 0., 16., 9.] 569 [64., 16., 0., 49.] 570 [ 1., 9., 49., 0.] 571 """ 572 data = __check_data(data) 573 shape = data.shape 574 mask = __check_mask(mask, shape) 575 if transpose: 576 ndata, nitems = shape 577 else: 578 nitems, ndata = shape 579 weight = __check_weight(weight, ndata) 580 matrix = [numpy.empty(i, dtype="d") for i in range(nitems)] 581 _cluster.distancematrix(data, mask, weight, transpose, dist, matrix) 582 return matrix 583 584 585def pca(data): 586 """Perform principal component analysis. 587 588 Keyword arguments: 589 - data: nrows x ncolumns array containing the data values. 590 591 Return value: 592 This function returns an array containing the mean of each column, the 593 principal components as an nmin x ncolumns array, as well as the 594 coordinates (an nrows x nmin array) of the data along the principal 595 components, and the associated eigenvalues. The principal components, the 596 coordinates, and the eigenvalues are sorted by the magnitude of the 597 eigenvalue, with the largest eigenvalues appearing first. Here, nmin is 598 the smaller of nrows and ncolumns. 599 Adding the column means to the dot product of the coordinates and the 600 principal components recreates the data matrix: 601 602 >>> from numpy import array, dot, amax, amin 603 >>> from Bio.Cluster import pca 604 >>> matrix = array([[ 0., 0., 0.], 605 ... [ 1., 0., 0.], 606 ... [ 7., 3., 0.], 607 ... [ 4., 2., 6.]]) 608 >>> columnmean, coordinates, pc, _ = pca(matrix) 609 >>> m = matrix - (columnmean + dot(coordinates, pc)) 610 >>> amax(m) < 1e-12 and amin(m) > -1e-12 611 True 612 613 """ 614 data = __check_data(data) 615 nrows, ncols = data.shape 616 nmin = min(nrows, ncols) 617 columnmean = numpy.empty(ncols, dtype="d") 618 pc = numpy.empty((nmin, ncols), dtype="d") 619 coordinates = numpy.empty((nrows, nmin), dtype="d") 620 eigenvalues = numpy.empty(nmin, dtype="d") 621 _cluster.pca(data, columnmean, coordinates, pc, eigenvalues) 622 return columnmean, coordinates, pc, eigenvalues 623 624 625class Record: 626 """Store gene expression data. 627 628 A Record stores the gene expression data and related information contained 629 in a data file following the file format defined for Michael Eisen's 630 Cluster/TreeView program. 631 632 Attributes: 633 - data: a matrix containing the gene expression data 634 - mask: a matrix containing only 1's and 0's, denoting which values 635 are present (1) or missing (0). If all items of mask are 636 one (no missing data), then mask is set to None. 637 - geneid: a list containing a unique identifier for each gene 638 (e.g., ORF name) 639 - genename: a list containing an additional description for each gene 640 (e.g., gene name) 641 - gweight: the weight to be used for each gene when calculating the 642 distance 643 - gorder: an array of real numbers indicating the preferred order of the 644 genes in the output file 645 - expid: a list containing a unique identifier for each sample. 646 - eweight: the weight to be used for each sample when calculating the 647 distance 648 - eorder: an array of real numbers indication the preferred order of the 649 samples in the output file 650 - uniqid: the string that was used instead of UNIQID in the input file. 651 652 """ 653 654 def __init__(self, handle=None): 655 """Read gene expression data from the file handle and return a Record. 656 657 The file should be in the format defined for Michael Eisen's 658 Cluster/TreeView program. 659 """ 660 self.data = None 661 self.mask = None 662 self.geneid = None 663 self.genename = None 664 self.gweight = None 665 self.gorder = None 666 self.expid = None 667 self.eweight = None 668 self.eorder = None 669 self.uniqid = None 670 if not handle: 671 return 672 line = handle.readline().strip("\r\n").split("\t") 673 n = len(line) 674 self.uniqid = line[0] 675 self.expid = [] 676 cols = {0: "GENEID"} 677 for word in line[1:]: 678 if word == "NAME": 679 cols[line.index(word)] = word 680 self.genename = [] 681 elif word == "GWEIGHT": 682 cols[line.index(word)] = word 683 self.gweight = [] 684 elif word == "GORDER": 685 cols[line.index(word)] = word 686 self.gorder = [] 687 else: 688 self.expid.append(word) 689 self.geneid = [] 690 self.data = [] 691 self.mask = [] 692 needmask = 0 693 for line in handle: 694 line = line.strip("\r\n").split("\t") 695 if len(line) != n: 696 raise ValueError( 697 "Line with %d columns found (expected %d)" % (len(line), n) 698 ) 699 if line[0] == "EWEIGHT": 700 i = max(cols) + 1 701 self.eweight = numpy.array(line[i:], float) 702 continue 703 if line[0] == "EORDER": 704 i = max(cols) + 1 705 self.eorder = numpy.array(line[i:], float) 706 continue 707 rowdata = [] 708 rowmask = [] 709 n = len(line) 710 for i in range(n): 711 word = line[i] 712 if i in cols: 713 if cols[i] == "GENEID": 714 self.geneid.append(word) 715 if cols[i] == "NAME": 716 self.genename.append(word) 717 if cols[i] == "GWEIGHT": 718 self.gweight.append(float(word)) 719 if cols[i] == "GORDER": 720 self.gorder.append(float(word)) 721 continue 722 if not word: 723 rowdata.append(0.0) 724 rowmask.append(0) 725 needmask = 1 726 else: 727 rowdata.append(float(word)) 728 rowmask.append(1) 729 self.data.append(rowdata) 730 self.mask.append(rowmask) 731 self.data = numpy.array(self.data) 732 if needmask: 733 self.mask = numpy.array(self.mask, int) 734 else: 735 self.mask = None 736 if self.gweight: 737 self.gweight = numpy.array(self.gweight) 738 if self.gorder: 739 self.gorder = numpy.array(self.gorder) 740 741 def treecluster(self, transpose=False, method="m", dist="e"): 742 """Apply hierarchical clustering and return a Tree object. 743 744 The pairwise single, complete, centroid, and average linkage 745 hierarchical clustering methods are available. 746 747 Keyword arguments: 748 - transpose: if False: rows are clustered; 749 if True: columns are clustered. 750 - dist: specifies the distance function to be used: 751 - dist == 'e': Euclidean distance 752 - dist == 'b': City Block distance 753 - dist == 'c': Pearson correlation 754 - dist == 'a': absolute value of the correlation 755 - dist == 'u': uncentered correlation 756 - dist == 'x': absolute uncentered correlation 757 - dist == 's': Spearman's rank correlation 758 - dist == 'k': Kendall's tau 759 - method: specifies which linkage method is used: 760 - method == 's': Single pairwise linkage 761 - method == 'm': Complete (maximum) pairwise linkage (default) 762 - method == 'c': Centroid linkage 763 - method == 'a': Average pairwise linkage 764 765 See the description of the Tree class for more information about 766 the Tree object returned by this method. 767 """ 768 if transpose: 769 weight = self.gweight 770 else: 771 weight = self.eweight 772 return treecluster(self.data, self.mask, weight, transpose, method, dist) 773 774 def kcluster( 775 self, 776 nclusters=2, 777 transpose=False, 778 npass=1, 779 method="a", 780 dist="e", 781 initialid=None, 782 ): 783 """Apply k-means or k-median clustering. 784 785 This method returns a tuple (clusterid, error, nfound). 786 787 Keyword arguments: 788 - nclusters: number of clusters (the 'k' in k-means) 789 - transpose: if False, genes (rows) are clustered; 790 if True, samples (columns) are clustered. 791 - npass: number of times the k-means clustering algorithm is 792 performed, each time with a different (random) initial condition. 793 - method: specifies how the center of a cluster is found: 794 - method == 'a': arithmetic mean 795 - method == 'm': median 796 - dist: specifies the distance function to be used: 797 - dist == 'e': Euclidean distance 798 - dist == 'b': City Block distance 799 - dist == 'c': Pearson correlation 800 - dist == 'a': absolute value of the correlation 801 - dist == 'u': uncentered correlation 802 - dist == 'x': absolute uncentered correlation 803 - dist == 's': Spearman's rank correlation 804 - dist == 'k': Kendall's tau 805 - initialid: the initial clustering from which the algorithm should 806 start. If initialid is None, the routine carries out npass 807 repetitions of the EM algorithm, each time starting from a different 808 random initial clustering. If initialid is given, the routine 809 carries out the EM algorithm only once, starting from the given 810 initial clustering and without randomizing the order in which items 811 are assigned to clusters (i.e., using the same order as in the data 812 matrix). In that case, the k-means algorithm is fully deterministic. 813 814 Return values: 815 - clusterid: array containing the number of the cluster to which each 816 gene/sample was assigned in the best k-means clustering 817 solution that was found in the npass runs; 818 - error: the within-cluster sum of distances for the returned 819 k-means clustering solution; 820 - nfound: the number of times this solution was found. 821 """ 822 if transpose: 823 weight = self.gweight 824 else: 825 weight = self.eweight 826 return kcluster( 827 self.data, 828 nclusters, 829 self.mask, 830 weight, 831 transpose, 832 npass, 833 method, 834 dist, 835 initialid, 836 ) 837 838 def somcluster( 839 self, transpose=False, nxgrid=2, nygrid=1, inittau=0.02, niter=1, dist="e" 840 ): 841 """Calculate a self-organizing map on a rectangular grid. 842 843 The somcluster method returns a tuple (clusterid, celldata). 844 845 Keyword arguments: 846 - transpose: if False, genes (rows) are clustered; 847 if True, samples (columns) are clustered. 848 - nxgrid: the horizontal dimension of the rectangular SOM map 849 - nygrid: the vertical dimension of the rectangular SOM map 850 - inittau: the initial value of tau (the neighborbood function) 851 - niter: the number of iterations 852 - dist: specifies the distance function to be used: 853 - dist == 'e': Euclidean distance 854 - dist == 'b': City Block distance 855 - dist == 'c': Pearson correlation 856 - dist == 'a': absolute value of the correlation 857 - dist == 'u': uncentered correlation 858 - dist == 'x': absolute uncentered correlation 859 - dist == 's': Spearman's rank correlation 860 - dist == 'k': Kendall's tau 861 862 Return values: 863 - clusterid: array with two columns, while the number of rows is equal 864 to the number of genes or the number of samples depending on 865 whether genes or samples are being clustered. Each row in 866 the array contains the x and y coordinates of the cell in the 867 rectangular SOM grid to which the gene or samples was assigned. 868 - celldata: an array with dimensions (nxgrid, nygrid, number of 869 samples) if genes are being clustered, or (nxgrid, nygrid, 870 number of genes) if samples are being clustered. Each item 871 [ix, iy] of this array is a 1D vector containing the gene 872 expression data for the centroid of the cluster in the SOM grid 873 cell with coordinates [ix, iy]. 874 """ 875 if transpose: 876 weight = self.gweight 877 else: 878 weight = self.eweight 879 return somcluster( 880 self.data, 881 self.mask, 882 weight, 883 transpose, 884 nxgrid, 885 nygrid, 886 inittau, 887 niter, 888 dist, 889 ) 890 891 def clustercentroids(self, clusterid=None, method="a", transpose=False): 892 """Calculate the cluster centroids and return a tuple (cdata, cmask). 893 894 The centroid is defined as either the mean or the median over all 895 items for each dimension. 896 897 Keyword arguments: 898 - data: nrows x ncolumns array containing the expression data 899 - mask: nrows x ncolumns array of integers, showing which data 900 are missing. If mask[i, j]==0, then data[i, j] is missing. 901 - transpose: if False, gene (row) clusters are considered; 902 if True, sample (column) clusters are considered. 903 - clusterid: array containing the cluster number for each gene or 904 sample. The cluster number should be non-negative. 905 - method: specifies how the centroid is calculated: 906 - method == 'a': arithmetic mean over each dimension. (default) 907 - method == 'm': median over each dimension. 908 909 Return values: 910 - cdata: 2D array containing the cluster centroids. If transpose 911 is False, then the dimensions of cdata are nclusters x ncolumns. 912 If transpose is True, then the dimensions of cdata are nrows x 913 nclusters. 914 - cmask: 2D array of integers describing which items in cdata, 915 if any, are missing. 916 """ 917 return clustercentroids(self.data, self.mask, clusterid, method, transpose) 918 919 def clusterdistance( 920 self, index1=0, index2=0, method="a", dist="e", transpose=False 921 ): 922 """Calculate the distance between two clusters. 923 924 Keyword arguments: 925 - index1: 1D array identifying which genes/samples belong to the 926 first cluster. If the cluster contains only one gene, then 927 index1 can also be written as a single integer. 928 - index2: 1D array identifying which genes/samples belong to the 929 second cluster. If the cluster contains only one gene, then 930 index2 can also be written as a single integer. 931 - transpose: if False, genes (rows) are clustered; 932 if True, samples (columns) are clustered. 933 - dist: specifies the distance function to be used: 934 - dist == 'e': Euclidean distance 935 - dist == 'b': City Block distance 936 - dist == 'c': Pearson correlation 937 - dist == 'a': absolute value of the correlation 938 - dist == 'u': uncentered correlation 939 - dist == 'x': absolute uncentered correlation 940 - dist == 's': Spearman's rank correlation 941 - dist == 'k': Kendall's tau 942 - method: specifies how the distance between two clusters is defined: 943 - method == 'a': the distance between the arithmetic means 944 of the two clusters 945 - method == 'm': the distance between the medians of the 946 two clusters 947 - method == 's': the smallest pairwise distance between members 948 of the two clusters 949 - method == 'x': the largest pairwise distance between members 950 of the two clusters 951 - method == 'v': average of the pairwise distances between members 952 of the two clusters 953 - transpose: if False: clusters of rows are considered; 954 if True: clusters of columns are considered. 955 """ 956 if transpose: 957 weight = self.gweight 958 else: 959 weight = self.eweight 960 return clusterdistance( 961 self.data, self.mask, weight, index1, index2, method, dist, transpose 962 ) 963 964 def distancematrix(self, transpose=False, dist="e"): 965 """Calculate the distance matrix and return it as a list of arrays. 966 967 Keyword arguments: 968 - transpose: 969 if False: calculate the distances between genes (rows); 970 if True: calculate the distances between samples (columns). 971 - dist: specifies the distance function to be used: 972 - dist == 'e': Euclidean distance 973 - dist == 'b': City Block distance 974 - dist == 'c': Pearson correlation 975 - dist == 'a': absolute value of the correlation 976 - dist == 'u': uncentered correlation 977 - dist == 'x': absolute uncentered correlation 978 - dist == 's': Spearman's rank correlation 979 - dist == 'k': Kendall's tau 980 981 Return value: 982 983 The distance matrix is returned as a list of 1D arrays containing the 984 distance matrix between the gene expression data. The number of columns 985 in each row is equal to the row number. Hence, the first row has zero 986 length. An example of the return value is: 987 988 matrix = [[], 989 array([1.]), 990 array([7., 3.]), 991 array([4., 2., 6.])] 992 993 This corresponds to the distance matrix: 994 995 [0., 1., 7., 4.] 996 [1., 0., 3., 2.] 997 [7., 3., 0., 6.] 998 [4., 2., 6., 0.] 999 1000 """ 1001 if transpose: 1002 weight = self.gweight 1003 else: 1004 weight = self.eweight 1005 return distancematrix(self.data, self.mask, weight, transpose, dist) 1006 1007 def save(self, jobname, geneclusters=None, expclusters=None): 1008 """Save the clustering results. 1009 1010 The saved files follow the convention for the Java TreeView program, 1011 which can therefore be used to view the clustering result. 1012 1013 Keyword arguments: 1014 - jobname: The base name of the files to be saved. The filenames 1015 are jobname.cdt, jobname.gtr, and jobname.atr for hierarchical 1016 clustering, and jobname-K*.cdt, jobname-K*.kgg, jobname-K*.kag 1017 for k-means clustering results. 1018 - geneclusters: For hierarchical clustering results, geneclusters 1019 is a Tree object as returned by the treecluster method. For k-means 1020 clustering results, geneclusters is a vector containing ngenes 1021 integers, describing to which cluster a given gene belongs. This 1022 vector can be calculated by kcluster. 1023 - expclusters: For hierarchical clustering results, expclusters 1024 is a Tree object as returned by the treecluster method. For k-means 1025 clustering results, expclusters is a vector containing nexps 1026 integers, describing to which cluster a given sample belongs. This 1027 vector can be calculated by kcluster. 1028 """ 1029 (ngenes, nexps) = numpy.shape(self.data) 1030 if self.gorder is None: 1031 gorder = numpy.arange(ngenes) 1032 else: 1033 gorder = self.gorder 1034 if self.eorder is None: 1035 eorder = numpy.arange(nexps) 1036 else: 1037 eorder = self.eorder 1038 if ( 1039 geneclusters is not None 1040 and expclusters is not None 1041 and type(geneclusters) != type(expclusters) 1042 ): 1043 raise ValueError( 1044 "found one k-means and one hierarchical " 1045 "clustering solution in geneclusters and " 1046 "expclusters" 1047 ) 1048 gid = 0 1049 aid = 0 1050 filename = jobname 1051 postfix = "" 1052 if isinstance(geneclusters, Tree): 1053 # This is a hierarchical clustering result. 1054 geneindex = self._savetree(jobname, geneclusters, gorder, False) 1055 gid = 1 1056 elif geneclusters is not None: 1057 # This is a k-means clustering result. 1058 filename = jobname + "_K" 1059 k = max(geneclusters) + 1 1060 kggfilename = "%s_K_G%d.kgg" % (jobname, k) 1061 geneindex = self._savekmeans(kggfilename, geneclusters, gorder, False) 1062 postfix = "_G%d" % k 1063 else: 1064 geneindex = numpy.argsort(gorder) 1065 if isinstance(expclusters, Tree): 1066 # This is a hierarchical clustering result. 1067 expindex = self._savetree(jobname, expclusters, eorder, True) 1068 aid = 1 1069 elif expclusters is not None: 1070 # This is a k-means clustering result. 1071 filename = jobname + "_K" 1072 k = max(expclusters) + 1 1073 kagfilename = "%s_K_A%d.kag" % (jobname, k) 1074 expindex = self._savekmeans(kagfilename, expclusters, eorder, True) 1075 postfix += "_A%d" % k 1076 else: 1077 expindex = numpy.argsort(eorder) 1078 filename = filename + postfix 1079 self._savedata(filename, gid, aid, geneindex, expindex) 1080 1081 def _savetree(self, jobname, tree, order, transpose): 1082 """Save the hierarchical clustering solution (PRIVATE).""" 1083 if transpose: 1084 extension = ".atr" 1085 keyword = "ARRY" 1086 else: 1087 extension = ".gtr" 1088 keyword = "GENE" 1089 index = tree.sort(order) 1090 nnodes = len(tree) 1091 with open(jobname + extension, "w") as outputfile: 1092 nodeID = [""] * nnodes 1093 nodedist = numpy.array([node.distance for node in tree[:]]) 1094 for nodeindex in range(nnodes): 1095 min1 = tree[nodeindex].left 1096 min2 = tree[nodeindex].right 1097 nodeID[nodeindex] = "NODE%dX" % (nodeindex + 1) 1098 outputfile.write(nodeID[nodeindex]) 1099 outputfile.write("\t") 1100 if min1 < 0: 1101 index1 = -min1 - 1 1102 outputfile.write(nodeID[index1] + "\t") 1103 nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index1]) 1104 else: 1105 outputfile.write("%s%dX\t" % (keyword, min1)) 1106 if min2 < 0: 1107 index2 = -min2 - 1 1108 outputfile.write(nodeID[index2] + "\t") 1109 nodedist[nodeindex] = max(nodedist[nodeindex], nodedist[index2]) 1110 else: 1111 outputfile.write("%s%dX\t" % (keyword, min2)) 1112 outputfile.write(str(1.0 - nodedist[nodeindex])) 1113 outputfile.write("\n") 1114 return index 1115 1116 def _savekmeans(self, filename, clusterids, order, transpose): 1117 """Save the k-means clustering solution (PRIVATE).""" 1118 if transpose: 1119 label = "ARRAY" 1120 names = self.expid 1121 else: 1122 label = self.uniqid 1123 names = self.geneid 1124 with open(filename, "w") as outputfile: 1125 outputfile.write(label + "\tGROUP\n") 1126 index = numpy.argsort(order) 1127 n = len(names) 1128 sortedindex = numpy.zeros(n, int) 1129 counter = 0 1130 cluster = 0 1131 while counter < n: 1132 for j in index: 1133 if clusterids[j] == cluster: 1134 outputfile.write("%s\t%s\n" % (names[j], cluster)) 1135 sortedindex[counter] = j 1136 counter += 1 1137 cluster += 1 1138 return sortedindex 1139 1140 def _savedata(self, jobname, gid, aid, geneindex, expindex): 1141 """Save the clustered data (PRIVATE).""" 1142 if self.genename is None: 1143 genename = self.geneid 1144 else: 1145 genename = self.genename 1146 (ngenes, nexps) = numpy.shape(self.data) 1147 with open(jobname + ".cdt", "w") as outputfile: 1148 if self.mask is not None: 1149 mask = self.mask 1150 else: 1151 mask = numpy.ones((ngenes, nexps), int) 1152 if self.gweight is not None: 1153 gweight = self.gweight 1154 else: 1155 gweight = numpy.ones(ngenes) 1156 if self.eweight is not None: 1157 eweight = self.eweight 1158 else: 1159 eweight = numpy.ones(nexps) 1160 if gid: 1161 outputfile.write("GID\t") 1162 outputfile.write(self.uniqid) 1163 outputfile.write("\tNAME\tGWEIGHT") 1164 # Now add headers for data columns. 1165 for j in expindex: 1166 outputfile.write("\t%s" % self.expid[j]) 1167 outputfile.write("\n") 1168 if aid: 1169 outputfile.write("AID") 1170 if gid: 1171 outputfile.write("\t") 1172 outputfile.write("\t\t") 1173 for j in expindex: 1174 outputfile.write("\tARRY%dX" % j) 1175 outputfile.write("\n") 1176 outputfile.write("EWEIGHT") 1177 if gid: 1178 outputfile.write("\t") 1179 outputfile.write("\t\t") 1180 for j in expindex: 1181 outputfile.write("\t%f" % eweight[j]) 1182 outputfile.write("\n") 1183 for i in geneindex: 1184 if gid: 1185 outputfile.write("GENE%dX\t" % i) 1186 outputfile.write( 1187 "%s\t%s\t%f" % (self.geneid[i], genename[i], gweight[i]) 1188 ) 1189 for j in expindex: 1190 outputfile.write("\t") 1191 if mask[i, j]: 1192 outputfile.write(str(self.data[i, j])) 1193 outputfile.write("\n") 1194 1195 1196def read(handle): 1197 """Read gene expression data from the file handle and return a Record. 1198 1199 The file should be in the file format defined for Michael Eisen's 1200 Cluster/TreeView program. 1201 """ 1202 return Record(handle) 1203 1204 1205# Everything below is private 1206# 1207 1208 1209def __check_data(data): 1210 if isinstance(data, numpy.ndarray): 1211 data = numpy.require(data, dtype="d", requirements="C") 1212 else: 1213 data = numpy.array(data, dtype="d") 1214 if data.ndim != 2: 1215 raise ValueError("data should be 2-dimensional") 1216 if numpy.isnan(data).any(): 1217 raise ValueError("data contains NaN values") 1218 return data 1219 1220 1221def __check_mask(mask, shape): 1222 if mask is None: 1223 return numpy.ones(shape, dtype="intc") 1224 elif isinstance(mask, numpy.ndarray): 1225 return numpy.require(mask, dtype="intc", requirements="C") 1226 else: 1227 return numpy.array(mask, dtype="intc") 1228 1229 1230def __check_weight(weight, ndata): 1231 if weight is None: 1232 return numpy.ones(ndata, dtype="d") 1233 if isinstance(weight, numpy.ndarray): 1234 weight = numpy.require(weight, dtype="d", requirements="C") 1235 else: 1236 weight = numpy.array(weight, dtype="d") 1237 if numpy.isnan(weight).any(): 1238 raise ValueError("weight contains NaN values") 1239 return weight 1240 1241 1242def __check_initialid(initialid, npass, nitems): 1243 if initialid is None: 1244 if npass <= 0: 1245 raise ValueError("npass should be a positive integer") 1246 clusterid = numpy.empty(nitems, dtype="intc") 1247 else: 1248 npass = 0 1249 clusterid = numpy.array(initialid, dtype="intc") 1250 return clusterid, npass 1251 1252 1253def __check_index(index): 1254 if index is None: 1255 return numpy.zeros(1, dtype="intc") 1256 elif isinstance(index, numbers.Integral): 1257 return numpy.array([index], dtype="intc") 1258 elif isinstance(index, numpy.ndarray): 1259 return numpy.require(index, dtype="intc", requirements="C") 1260 else: 1261 return numpy.array(index, dtype="intc") 1262 1263 1264def __check_distancematrix(distancematrix): 1265 if distancematrix is None: 1266 return distancematrix 1267 if isinstance(distancematrix, numpy.ndarray): 1268 distancematrix = numpy.require(distancematrix, dtype="d", requirements="C") 1269 else: 1270 try: 1271 distancematrix = numpy.array(distancematrix, dtype="d") 1272 except ValueError: 1273 n = len(distancematrix) 1274 d = [None] * n 1275 for i, row in enumerate(distancematrix): 1276 if isinstance(row, numpy.ndarray): 1277 row = numpy.require(row, dtype="d", requirements="C") 1278 else: 1279 row = numpy.array(row, dtype="d") 1280 if row.ndim != 1: 1281 raise ValueError("row %d is not one-dimensional" % i) from None 1282 m = len(row) 1283 if m != i: 1284 raise ValueError( 1285 "row %d has incorrect size (%d, expected %d)" % (i, m, i) 1286 ) from None 1287 if numpy.isnan(row).any(): 1288 raise ValueError("distancematrix contains NaN values") from None 1289 d[i] = row 1290 return d 1291 if numpy.isnan(distancematrix).any(): 1292 raise ValueError("distancematrix contains NaN values") 1293 return distancematrix 1294