1 /* The C clustering library.
2 * Copyright (C) 2002 Michiel Jan Laurens de Hoon.
3 *
4 * This library was written at the Laboratory of DNA Information Analysis,
5 * Human Genome Center, Institute of Medical Science, University of Tokyo,
6 * 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
7 * Contact: mdehoon 'AT' gsc.riken.jp
8 *
9 * Permission to use, copy, modify, and distribute this software and its
10 * documentation with or without modifications and for any purpose and
11 * without fee is hereby granted, provided that any copyright notices
12 * appear in all copies and that both those copyright notices and this
13 * permission notice appear in supporting documentation, and that the
14 * names of the contributors or copyright holders not be used in
15 * advertising or publicity pertaining to distribution of the software
16 * without specific prior permission.
17 *
18 * THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
19 * WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
20 * WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
21 * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
22 * OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
23 * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
24 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
25 * OR PERFORMANCE OF THIS SOFTWARE.
26 *
27 */
28
29 #include <time.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <math.h>
33 #include <float.h>
34 #include <limits.h>
35 #include <string.h>
36 #include "cluster_floatNOMASK.h"
37 #ifdef WINDOWS
38 # include <windows.h>
39 #endif
40
41 /* ************************************************************************ */
42
43 #ifdef WINDOWS
44 /* Then we make a Windows DLL */
45 int WINAPI
clusterdll_init(HANDLE h,DWORD reason,void * foo)46 clusterdll_init (HANDLE h, DWORD reason, void* foo)
47 {
48 return 1;
49 }
50 #endif
51
52 /* ************************************************************************ */
53
mean(int n,float x[])54 float CALL mean(int n, float x[])
55 { float result = 0.;
56 int i;
57 for (i = 0; i < n; i++) result += x[i];
58 result /= n;
59 return result;
60 }
61
62 /* ************************************************************************ */
63
median(int n,float x[])64 float CALL median (int n, float x[])
65 /*
66 Find the median of X(1), ... , X(N), using as much of the quicksort
67 algorithm as is needed to isolate it.
68 N.B. On exit, the array X is partially ordered.
69 Based on Alan J. Miller's median.f90 routine.
70 */
71
72 { int i, j;
73 int nr = n / 2;
74 int nl = nr - 1;
75 int even = 0;
76 /* hi & lo are position limits encompassing the median. */
77 int lo = 0;
78 int hi = n-1;
79
80 if (n==2*nr) even = 1;
81 if (n<3)
82 { if (n<1) return 0.;
83 if (n == 1) return x[0];
84 return 0.5*(x[0]+x[1]);
85 }
86
87 /* Find median of 1st, middle & last values. */
88 do
89 { int loop;
90 int mid = (lo + hi)/2;
91 float result = x[mid];
92 float xlo = x[lo];
93 float xhi = x[hi];
94 if (xhi<xlo)
95 { float temp = xlo;
96 xlo = xhi;
97 xhi = temp;
98 }
99 if (result>xhi) result = xhi;
100 else if (result<xlo) result = xlo;
101 /* The basic quicksort algorithm to move all values <= the sort key (XMED)
102 * to the left-hand end, and all higher values to the other end.
103 */
104 i = lo;
105 j = hi;
106 do
107 { while (x[i]<result) i++;
108 while (x[j]>result) j--;
109 loop = 0;
110 if (i<j)
111 { float temp = x[i];
112 x[i] = x[j];
113 x[j] = temp;
114 i++;
115 j--;
116 if (i<=j) loop = 1;
117 }
118 } while (loop); /* Decide which half the median is in. */
119
120 if (even)
121 { if (j==nl && i==nr)
122 /* Special case, n even, j = n/2 & i = j + 1, so the median is
123 * between the two halves of the series. Find max. of the first
124 * half & min. of the second half, then average.
125 */
126 { int k;
127 float xmax = x[0];
128 float xmin = x[n-1];
129 for (k = lo; k <= j; k++) xmax = max(xmax,x[k]);
130 for (k = i; k <= hi; k++) xmin = min(xmin,x[k]);
131 return 0.5*(xmin + xmax);
132 }
133 if (j<nl) lo = i;
134 if (i>nr) hi = j;
135 if (i==j)
136 { if (i==nl) lo = nl;
137 if (j==nr) hi = nr;
138 }
139 }
140 else
141 { if (j<nr) lo = i;
142 if (i>nr) hi = j;
143 /* Test whether median has been isolated. */
144 if (i==j && i==nr) return result;
145 }
146 }
147 while (lo<hi-1);
148
149 if (even) return (0.5*(x[nl]+x[nr]));
150 if (x[lo]>x[hi])
151 { float temp = x[lo];
152 x[lo] = x[hi];
153 x[hi] = temp;
154 }
155 return x[nr];
156 }
157
158 /* ********************************************************************** */
159
160 static const float* sortdata = NULL; /* used in the quicksort algorithm */
161
162 /* ---------------------------------------------------------------------- */
163
164 static
compare(const void * a,const void * b)165 int compare(const void* a, const void* b)
166 /* Helper function for sort. Previously, this was a nested function under
167 * sort, which is not allowed under ANSI C.
168 */
169 { const int i1 = *(const int*)a;
170 const int i2 = *(const int*)b;
171 const float term1 = sortdata[i1];
172 const float term2 = sortdata[i2];
173 if (term1 < term2) return -1;
174 if (term1 > term2) return +1;
175 return 0;
176 }
177
178 /* ---------------------------------------------------------------------- */
179
sort(int n,const float data[],int index[])180 void CALL sort(int n, const float data[], int index[])
181 /* Sets up an index table given the data, such that data[index[]] is in
182 * increasing order. Sorting is done on the indices; the array data
183 * is unchanged.
184 */
185 { int i;
186 sortdata = data;
187 for (i = 0; i < n; i++) index[i] = i;
188 qsort(index, n, sizeof(int), compare);
189 }
190
191 /* ********************************************************************** */
192
getrank(int n,float data[])193 static float* getrank (int n, float data[])
194 /* Calculates the ranks of the elements in the array data. Two elements with
195 * the same value get the same rank, equal to the average of the ranks had the
196 * elements different values. The ranks are returned as a newly allocated
197 * array that should be freed by the calling routine. If getrank fails due to
198 * a memory allocation error, it returns NULL.
199 */
200 { int i;
201 float* rank;
202 int* index;
203 rank = malloc(n*sizeof(float));
204 if (!rank) return NULL;
205 index = malloc(n*sizeof(int));
206 if (!index)
207 { free(rank);
208 return NULL;
209 }
210 /* Call sort to get an index table */
211 sort (n, data, index);
212 /* Build a rank table */
213 for (i = 0; i < n; i++) rank[index[i]] = i;
214 /* Fix for equal ranks */
215 i = 0;
216 while (i < n)
217 { int m;
218 float value = data[index[i]];
219 int j = i + 1;
220 while (j < n && data[index[j]] == value) j++;
221 m = j - i; /* number of equal ranks found */
222 value = rank[index[i]] + (m-1)/2.;
223 for (j = i; j < i + m; j++) rank[index[j]] = value;
224 i += m;
225 }
226 free (index);
227 return rank;
228 }
229
230 /* ---------------------------------------------------------------------- */
231
232 static int
makedatamask(int nrows,int ncols,float *** pdata)233 makedatamask(int nrows, int ncols, float*** pdata)
234 { int i;
235 float** data;
236 int** mask;
237 data = malloc(nrows*sizeof(float*));
238 if(!data) return 0;
239 mask = malloc(nrows*sizeof(int*));
240 if(!mask)
241 { free(data);
242 return 0;
243 }
244 for (i = 0; i < nrows; i++)
245 { data[i] = malloc(ncols*sizeof(float));
246 if(!data[i]) break;
247 mask[i] = malloc(ncols*sizeof(int));
248 if(!mask[i])
249 { free(data[i]);
250 break;
251 }
252 }
253 if (i==nrows) /* break not encountered */
254 { *pdata = data;
255 /* *pmask = mask;*/
256 return 1;
257 }
258 *pdata = NULL;
259 /* *pmask = NULL;*/
260 nrows = i;
261 for (i = 0; i < nrows; i++)
262 { free(data[i]);
263 free(mask[i]);
264 }
265 free(data);
266 free(mask);
267 return 0;
268 }
269
270 /* ---------------------------------------------------------------------- */
271
272 static void
freedatamask(int n,float ** data)273 freedatamask(int n, float** data)
274 { int i;
275 for (i = 0; i < n; i++)
276 {
277 free(data[i]);
278 }
279
280 free(data);
281 }
282
283 /* ---------------------------------------------------------------------- */
284
285 static
find_closest_pair(int n,float ** distmatrix,int * ip,int * jp)286 float find_closest_pair(int n, float** distmatrix, int* ip, int* jp)
287 /*
288 This function searches the distance matrix to find the pair with the shortest
289 distance between them. The indices of the pair are returned in ip and jp; the
290 distance itself is returned by the function.
291
292 n (input) int
293 The number of elements in the distance matrix.
294
295 distmatrix (input) float**
296 A ragged array containing the distance matrix. The number of columns in each
297 row is one less than the row index.
298
299 ip (output) int*
300 A pointer to the integer that is to receive the first index of the pair with
301 the shortest distance.
302
303 jp (output) int*
304 A pointer to the integer that is to receive the second index of the pair with
305 the shortest distance.
306 */
307 { int i, j;
308 float temp;
309 float distance = distmatrix[1][0];
310 *ip = 1;
311 *jp = 0;
312 for (i = 1; i < n; i++)
313 { for (j = 0; j < i; j++)
314 { temp = distmatrix[i][j];
315 if (temp<distance)
316 { distance = temp;
317 *ip = i;
318 *jp = j;
319 }
320 }
321 }
322 return distance;
323 }
324
325 /* ********************************************************************* */
326
svd(int m,int n,float ** u,float w[],float ** v,int * ierr)327 void CALL svd(int m, int n, float** u, float w[], float** v, int* ierr)
328 /*
329 * This subroutine is a translation of the Algol procedure svd,
330 * Num. Math. 14, 403-420(1970) by Golub and Reinsch.
331 * Handbook for Auto. Comp., Vol II-Linear Algebra, 134-151(1971).
332 *
333 * This subroutine determines the singular value decomposition
334 * t
335 * A=usv of a real m by n rectangular matrix, where m is greater
336 * then or equal to n. Householder bidiagonalization and a variant
337 * of the QR algorithm are used.
338 *
339 *
340 * On input.
341 *
342 * m is the number of rows of A (and u).
343 *
344 * n is the number of columns of A (and u) and the order of v.
345 *
346 * u contains the rectangular input matrix A to be decomposed.
347 *
348 * On output.
349 *
350 * w contains the n (non-negative) singular values of a (the
351 * diagonal elements of s). they are unordered. if an
352 * error exit is made, the singular values should be correct
353 * for indices ierr+1,ierr+2,...,n.
354 *
355 * u contains the matrix u (orthogonal column vectors) of the
356 * decomposition.
357 * if an error exit is made, the columns of u corresponding
358 * to indices of correct singular values should be correct.
359 *
360 * v contains the matrix v (orthogonal) of the decomposition.
361 * if an error exit is made, the columns of v corresponding
362 * to indices of correct singular values should be correct.
363 *
364 * ierr is set to
365 * zero for normal return,
366 * k if the k-th singular value has not been
367 * determined after 30 iterations,
368 * -1 if memory allocation fails.
369 *
370 * Questions and comments should be directed to B. S. Garbow,
371 * Applied Mathematics division, Argonne National Laboratory
372 *
373 * Modified to eliminate machep
374 *
375 * Translated to C by Michiel de Hoon, Human Genome Center,
376 * University of Tokyo, for inclusion in the C Clustering Library.
377 * This routine is less general than the original svd routine, as
378 * it focuses on the singular value decomposition as needed for
379 * clustering. In particular,
380 * - We require m >= n
381 * - We calculate both u and v in all cases
382 * - We pass the input array A via u; this array is subsequently
383 * overwritten.
384 * - We allocate for the array rv1, used as a working space,
385 * internally in this routine, instead of passing it as an
386 * argument. If the allocation fails, svd sets *ierr to -1
387 * and returns.
388 * 2003.06.05
389 */
390 { int i, j, k, i1, k1, l1, its;
391 float c,f,h,s,x,y,z;
392 int l = 0;
393 float g = 0.0;
394 float scale = 0.0;
395 float anorm = 0.0;
396 float* rv1 = malloc(n*sizeof(float));
397 if (!rv1)
398 { *ierr = -1;
399 return;
400 }
401 *ierr = 0;
402 /* Householder reduction to bidiagonal form */
403 for (i = 0; i < n; i++)
404 { l = i + 1;
405 rv1[i] = scale * g;
406 g = 0.0;
407 s = 0.0;
408 scale = 0.0;
409 for (k = i; k < m; k++) scale += fabs(u[k][i]);
410 if (scale != 0.0)
411 { for (k = i; k < m; k++)
412 { u[k][i] /= scale;
413 s += u[k][i]*u[k][i];
414 }
415 f = u[i][i];
416 g = (f >= 0) ? -sqrt(s) : sqrt(s);
417 h = f * g - s;
418 u[i][i] = f - g;
419 if (i < n-1)
420 { for (j = l; j < n; j++)
421 { s = 0.0;
422 for (k = i; k < m; k++) s += u[k][i] * u[k][j];
423 f = s / h;
424 for (k = i; k < m; k++) u[k][j] += f * u[k][i];
425 }
426 }
427 for (k = i; k < m; k++) u[k][i] *= scale;
428 }
429 w[i] = scale * g;
430 g = 0.0;
431 s = 0.0;
432 scale = 0.0;
433 if (i<n-1)
434 { for (k = l; k < n; k++) scale += fabs(u[i][k]);
435 if (scale != 0.0)
436 { for (k = l; k < n; k++)
437 { u[i][k] /= scale;
438 s += u[i][k] * u[i][k];
439 }
440 f = u[i][l];
441 g = (f >= 0) ? -sqrt(s) : sqrt(s);
442 h = f * g - s;
443 u[i][l] = f - g;
444 for (k = l; k < n; k++) rv1[k] = u[i][k] / h;
445 for (j = l; j < m; j++)
446 { s = 0.0;
447 for (k = l; k < n; k++) s += u[j][k] * u[i][k];
448 for (k = l; k < n; k++) u[j][k] += s * rv1[k];
449 }
450 for (k = l; k < n; k++) u[i][k] *= scale;
451 }
452 }
453 anorm = max(anorm,fabs(w[i])+fabs(rv1[i]));
454 }
455 /* accumulation of right-hand transformations */
456 for (i = n-1; i>=0; i--)
457 { if (i < n-1)
458 { if (g != 0.0)
459 { for (j = l; j < n; j++) v[j][i] = (u[i][j] / u[i][l]) / g;
460 /* float division avoids possible underflow */
461 for (j = l; j < n; j++)
462 { s = 0.0;
463 for (k = l; k < n; k++) s += u[i][k] * v[k][j];
464 for (k = l; k < n; k++) v[k][j] += s * v[k][i];
465 }
466 }
467 }
468 for (j = l; j < n; j++)
469 { v[i][j] = 0.0;
470 v[j][i] = 0.0;
471 }
472 v[i][i] = 1.0;
473 g = rv1[i];
474 l = i;
475 }
476 /* accumulation of left-hand transformations */
477 for (i = n-1; i >= 0; i--)
478 { l = i + 1;
479 g = w[i];
480 if (i!=n-1)
481 for (j = l; j < n; j++) u[i][j] = 0.0;
482 if (g!=0.0)
483 { if (i!=n-1)
484 { for (j = l; j < n; j++)
485 { s = 0.0;
486 for (k = l; k < m; k++) s += u[k][i] * u[k][j];
487 /* float division avoids possible underflow */
488 f = (s / u[i][i]) / g;
489 for (k = i; k < m; k++) u[k][j] += f * u[k][i];
490 }
491 }
492 for (j = i; j < m; j++) u[j][i] /= g;
493 }
494 else
495 for (j = i; j < m; j++) u[j][i] = 0.0;
496 u[i][i] += 1.0;
497 }
498 /* diagonalization of the bidiagonal form */
499 for (k = n-1; k >= 0; k--)
500 { k1 = k-1;
501 its = 0;
502 while(1)
503 /* test for splitting */
504 { for (l = k; l >= 0; l--)
505 { l1 = l-1;
506 if (fabs(rv1[l]) + anorm == anorm) break;
507 /* rv1[0] is always zero, so there is no exit
508 * through the bottom of the loop */
509 if (fabs(w[l1]) + anorm == anorm)
510 /* cancellation of rv1[l] if l greater than 0 */
511 { c = 0.0;
512 s = 1.0;
513 for (i = l; i <= k; i++)
514 { f = s * rv1[i];
515 rv1[i] *= c;
516 if (fabs(f) + anorm == anorm) break;
517 g = w[i];
518 h = sqrt(f*f+g*g);
519 w[i] = h;
520 c = g / h;
521 s = -f / h;
522 for (j = 0; j < m; j++)
523 { y = u[j][l1];
524 z = u[j][i];
525 u[j][l1] = y * c + z * s;
526 u[j][i] = -y * s + z * c;
527 }
528 }
529 break;
530 }
531 }
532 /* test for convergence */
533 z = w[k];
534 if (l==k) /* convergence */
535 { if (z < 0.0)
536 /* w[k] is made non-negative */
537 { w[k] = -z;
538 for (j = 0; j < n; j++) v[j][k] = -v[j][k];
539 }
540 break;
541 }
542 else if (its==30)
543 { *ierr = k;
544 break;
545 }
546 else
547 /* shift from bottom 2 by 2 minor */
548 { its++;
549 x = w[l];
550 y = w[k1];
551 g = rv1[k1];
552 h = rv1[k];
553 f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
554 g = sqrt(f*f+1.0);
555 f = ((x - z) * (x + z) + h * (y / (f + (f >= 0 ? g : -g)) - h)) / x;
556 /* next qr transformation */
557 c = 1.0;
558 s = 1.0;
559 for (i1 = l; i1 <= k1; i1++)
560 { i = i1 + 1;
561 g = rv1[i];
562 y = w[i];
563 h = s * g;
564 g = c * g;
565 z = sqrt(f*f+h*h);
566 rv1[i1] = z;
567 c = f / z;
568 s = h / z;
569 f = x * c + g * s;
570 g = -x * s + g * c;
571 h = y * s;
572 y = y * c;
573 for (j = 0; j < n; j++)
574 { x = v[j][i1];
575 z = v[j][i];
576 v[j][i1] = x * c + z * s;
577 v[j][i] = -x * s + z * c;
578 }
579 z = sqrt(f*f+h*h);
580 w[i1] = z;
581 /* rotation can be arbitrary if z is zero */
582 if (z!=0.0)
583 { c = f / z;
584 s = h / z;
585 }
586 f = c * g + s * y;
587 x = -s * g + c * y;
588 for (j = 0; j < m; j++)
589 { y = u[j][i1];
590 z = u[j][i];
591 u[j][i1] = y * c + z * s;
592 u[j][i] = -y * s + z * c;
593 }
594 }
595 rv1[l] = 0.0;
596 rv1[k] = f;
597 w[k] = x;
598 }
599 }
600 }
601 free(rv1);
602 return;
603 }
604
605 /* ********************************************************************* */
606
607 static
euclid(int n,float ** data1,float ** data2,const float weight[],int index1,int index2,int transpose)608 float euclid (int n, float** data1, float** data2,
609 const float weight[], int index1, int index2, int transpose)
610
611 /*
612 Purpose
613 =======
614
615 The euclid routine calculates the weighted Euclidean distance between two
616 rows or columns in a matrix.
617
618 Arguments
619 =========
620
621 n (input) int
622 The number of elements in a row or column. If transpose==0, then n is the number
623 of columns; otherwise, n is the number of rows.
624
625 data1 (input) float array
626 The data array containing the first vector.
627
628 data2 (input) float array
629 The data array containing the second vector.
630
631 mask1 (input) int array
632 This array which elements in data1 are missing. If mask1[i][j]==0, then
633 data1[i][j] is missing.
634
635 mask2 (input) int array
636 This array which elements in data2 are missing. If mask2[i][j]==0, then
637 data2[i][j] is missing.
638
639 weight (input) float[n]
640 The weights that are used to calculate the distance.
641
642 index1 (input) int
643 Index of the first row or column.
644
645 index2 (input) int
646 Index of the second row or column.
647
648 transpose (input) int
649 If transpose==0, the distance between two rows in the matrix is calculated.
650 Otherwise, the distance between two columns in the matrix is calculated.
651
652 ============================================================================
653 */
654 { float result = 0.;
655 float tweight = 0;
656 int i;
657 if (transpose==0) /* Calculate the distance between two rows */
658 { for (i = 0; i < n; i++)
659 { float term = data1[index1][i] - data2[index2][i];
660 result += weight[i]*term*term;
661 tweight += weight[i];
662
663 }
664 }
665 else
666 { for (i = 0; i < n; i++)
667 { float term = data1[i][index1] - data2[i][index2];
668 result += weight[i]*term*term;
669 tweight += weight[i];
670 }
671
672 }
673 if (!tweight) return 0; /* usually due to empty clusters */
674 result /= tweight;
675 return result;
676 }
677
678 /* ********************************************************************* */
679
680 static
cityblock(int n,float ** data1,float ** data2,const float weight[],int index1,int index2,int transpose)681 float cityblock (int n, float** data1, float** data2, const float weight[], int index1, int index2, int transpose)
682
683 /*
684 Purpose
685 =======
686
687 The cityblock routine calculates the weighted "City Block" distance between
688 two rows or columns in a matrix. City Block distance is defined as the
689 absolute value of X1-X2 plus the absolute value of Y1-Y2 plus..., which is
690 equivalent to taking an "up and over" path.
691
692 Arguments
693 =========
694
695 n (input) int
696 The number of elements in a row or column. If transpose==0, then n is the number
697 of columns; otherwise, n is the number of rows.
698
699 data1 (input) float array
700 The data array containing the first vector.
701
702 data2 (input) float array
703 The data array containing the second vector.
704
705 mask1 (input) int array
706 This array which elements in data1 are missing. If mask1[i][j]==0, then
707 data1[i][j] is missing.
708
709 mask2 (input) int array
710 This array which elements in data2 are missing. If mask2[i][j]==0, then
711 data2[i][j] is missing.
712
713 weight (input) float[n]
714 The weights that are used to calculate the distance.
715
716 index1 (input) int
717 Index of the first row or column.
718
719 index2 (input) int
720 Index of the second row or column.
721
722 transpose (input) int
723 If transpose==0, the distance between two rows in the matrix is calculated.
724 Otherwise, the distance between two columns in the matrix is calculated.
725
726 ============================================================================ */
727 { float result = 0.;
728 float tweight = 0;
729 int i;
730 if (transpose==0) /* Calculate the distance between two rows */
731 { for (i = 0; i < n; i++)
732 {
733 float term = data1[index1][i] - data2[index2][i];
734 result = result + weight[i]*fabs(term);
735 tweight += weight[i];
736
737 }
738 }
739 else
740 { for (i = 0; i < n; i++)
741 {
742 float term = data1[i][index1] - data2[i][index2];
743 result = result + weight[i]*fabs(term);
744 tweight += weight[i];
745
746 }
747 }
748 if (!tweight) return 0; /* usually due to empty clusters */
749 result /= tweight;
750 return result;
751 }
752
753 /* ********************************************************************* */
754
755 static
correlation(int n,float ** data1,float ** data2,const float weight[],int index1,int index2,int transpose)756 float correlation (int n, float** data1, float** data2,
757 const float weight[], int index1, int index2, int transpose)
758 /*
759 Purpose
760 =======
761
762 The correlation routine calculates the weighted Pearson distance between two
763 rows or columns in a matrix. We define the Pearson distance as one minus the
764 Pearson correlation.
765 This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
766 but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
767 (e.g., choose b = a + c).
768
769 Arguments
770 =========
771
772 n (input) int
773 The number of elements in a row or column. If transpose==0, then n is the number
774 of columns; otherwise, n is the number of rows.
775
776 data1 (input) float array
777 The data array containing the first vector.
778
779 data2 (input) float array
780 The data array containing the second vector.
781
782 mask1 (input) int array
783 This array which elements in data1 are missing. If mask1[i][j]==0, then
784 data1[i][j] is missing.
785
786 mask2 (input) int array
787 This array which elements in data2 are missing. If mask2[i][j]==0, then
788 data2[i][j] is missing.
789
790 weight (input) float[n]
791 The weights that are used to calculate the distance.
792
793 index1 (input) int
794 Index of the first row or column.
795
796 index2 (input) int
797 Index of the second row or column.
798
799 transpose (input) int
800 If transpose==0, the distance between two rows in the matrix is calculated.
801 Otherwise, the distance between two columns in the matrix is calculated.
802 ============================================================================
803 */
804 { float result = 0.;
805 float sum1 = 0.;
806 float sum2 = 0.;
807 float denom1 = 0.;
808 float denom2 = 0.;
809 float tweight = 0.;
810 if (transpose==0) /* Calculate the distance between two rows */
811 { int i;
812 for (i = 0; i < n; i++)
813 {
814 float term1 = data1[index1][i];
815 float term2 = data2[index2][i];
816 float w = weight[i];
817 sum1 += w*term1;
818 sum2 += w*term2;
819 result += w*term1*term2;
820 denom1 += w*term1*term1;
821 denom2 += w*term2*term2;
822 tweight += w;
823
824 }
825 }
826 else
827 { int i;
828 for (i = 0; i < n; i++)
829 {
830 float term1 = data1[i][index1];
831 float term2 = data2[i][index2];
832 float w = weight[i];
833 sum1 += w*term1;
834 sum2 += w*term2;
835 result += w*term1*term2;
836 denom1 += w*term1*term1;
837 denom2 += w*term2*term2;
838 tweight += w;
839
840 }
841 }
842 if (!tweight) return 0; /* usually due to empty clusters */
843 result -= sum1 * sum2 / tweight;
844 denom1 -= sum1 * sum1 / tweight;
845 denom2 -= sum2 * sum2 / tweight;
846 if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
847 if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
848 result = result / sqrt(denom1*denom2);
849 result = 1. - result;
850 return result;
851 }
852
853 /* ********************************************************************* */
854
855 static
acorrelation(int n,float ** data1,float ** data2,const float weight[],int index1,int index2,int transpose)856 float acorrelation (int n, float** data1, float** data2, const float weight[], int index1, int index2, int transpose)
857 /*
858 Purpose
859 =======
860
861 The acorrelation routine calculates the weighted Pearson distance between two
862 rows or columns, using the absolute value of the correlation.
863 This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
864 but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
865 (e.g., choose b = a + c).
866
867 Arguments
868 =========
869
870 n (input) int
871 The number of elements in a row or column. If transpose==0, then n is the number
872 of columns; otherwise, n is the number of rows.
873
874 data1 (input) float array
875 The data array containing the first vector.
876
877 data2 (input) float array
878 The data array containing the second vector.
879
880 mask1 (input) int array
881 This array which elements in data1 are missing. If mask1[i][j]==0, then
882 data1[i][j] is missing.
883
884 mask2 (input) int array
885 This array which elements in data2 are missing. If mask2[i][j]==0, then
886 data2[i][j] is missing.
887
888 weight (input) float[n]
889 The weights that are used to calculate the distance.
890
891 index1 (input) int
892 Index of the first row or column.
893
894 index2 (input) int
895 Index of the second row or column.
896
897 transpose (input) int
898 If transpose==0, the distance between two rows in the matrix is calculated.
899 Otherwise, the distance between two columns in the matrix is calculated.
900 ============================================================================
901 */
902 { float result = 0.;
903 float sum1 = 0.;
904 float sum2 = 0.;
905 float denom1 = 0.;
906 float denom2 = 0.;
907 float tweight = 0.;
908 if (transpose==0) /* Calculate the distance between two rows */
909 { int i;
910 for (i = 0; i < n; i++)
911 {
912 float term1 = data1[index1][i];
913 float term2 = data2[index2][i];
914 float w = weight[i];
915 sum1 += w*term1;
916 sum2 += w*term2;
917 result += w*term1*term2;
918 denom1 += w*term1*term1;
919 denom2 += w*term2*term2;
920 tweight += w;
921
922 }
923 }
924 else
925 { int i;
926 for (i = 0; i < n; i++)
927 {
928 float term1 = data1[i][index1];
929 float term2 = data2[i][index2];
930 float w = weight[i];
931 sum1 += w*term1;
932 sum2 += w*term2;
933 result += w*term1*term2;
934 denom1 += w*term1*term1;
935 denom2 += w*term2*term2;
936 tweight += w;
937
938 }
939 }
940 if (!tweight) return 0; /* usually due to empty clusters */
941 result -= sum1 * sum2 / tweight;
942 denom1 -= sum1 * sum1 / tweight;
943 denom2 -= sum2 * sum2 / tweight;
944 if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
945 if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
946 result = fabs(result) / sqrt(denom1*denom2);
947 result = 1. - result;
948 return result;
949 }
950
951 /* ********************************************************************* */
952
953 static
ucorrelation(int n,float ** data1,float ** data2,const float weight[],int index1,int index2,int transpose)954 float ucorrelation (int n, float** data1, float** data2,
955 const float weight[], int index1, int index2, int transpose)
956 /*
957 Purpose
958 =======
959
960 The ucorrelation routine calculates the weighted Pearson distance between two
961 rows or columns, using the uncentered version of the Pearson correlation. In the
962 uncentered Pearson correlation, a zero mean is used for both vectors even if
963 the actual mean is nonzero.
964 This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
965 but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
966 (e.g., choose b = a + c).
967
968 Arguments
969 =========
970
971 n (input) int
972 The number of elements in a row or column. If transpose==0, then n is the number
973 of columns; otherwise, n is the number of rows.
974
975 data1 (input) float array
976 The data array containing the first vector.
977
978 data2 (input) float array
979 The data array containing the second vector.
980
981 mask1 (input) int array
982 This array which elements in data1 are missing. If mask1[i][j]==0, then
983 data1[i][j] is missing.
984
985 mask2 (input) int array
986 This array which elements in data2 are missing. If mask2[i][j]==0, then
987 data2[i][j] is missing.
988
989 weight (input) float[n]
990 The weights that are used to calculate the distance.
991
992 index1 (input) int
993 Index of the first row or column.
994
995 index2 (input) int
996 Index of the second row or column.
997
998 transpose (input) int
999 If transpose==0, the distance between two rows in the matrix is calculated.
1000 Otherwise, the distance between two columns in the matrix is calculated.
1001 ============================================================================
1002 */
1003 { float result = 0.;
1004 float denom1 = 0.;
1005 float denom2 = 0.;
1006 int flag = 0;
1007 /* flag will remain zero if no nonzero combinations of mask1 and mask2 are
1008 * found.
1009 */
1010 if (transpose==0) /* Calculate the distance between two rows */
1011 { int i;
1012 for (i = 0; i < n; i++)
1013 {
1014 float term1 = data1[index1][i];
1015 float term2 = data2[index2][i];
1016 float w = weight[i];
1017 result += w*term1*term2;
1018 denom1 += w*term1*term1;
1019 denom2 += w*term2*term2;
1020 flag = 1;
1021
1022 }
1023 }
1024 else
1025 { int i;
1026 for (i = 0; i < n; i++)
1027 {
1028 float term1 = data1[i][index1];
1029 float term2 = data2[i][index2];
1030 float w = weight[i];
1031 result += w*term1*term2;
1032 denom1 += w*term1*term1;
1033 denom2 += w*term2*term2;
1034 flag = 1;
1035
1036 }
1037 }
1038 if (!flag) return 0.;
1039 if (denom1==0.) return 1.;
1040 if (denom2==0.) return 1.;
1041 result = result / sqrt(denom1*denom2);
1042 result = 1. - result;
1043 return result;
1044 }
1045
1046 /* ********************************************************************* */
1047
1048 static
uacorrelation(int n,float ** data1,float ** data2,const float weight[],int index1,int index2,int transpose)1049 float uacorrelation (int n, float** data1, float** data2,
1050 const float weight[], int index1, int index2, int transpose)
1051 /*
1052 Purpose
1053 =======
1054
1055 The uacorrelation routine calculates the weighted Pearson distance between two
1056 rows or columns, using the absolute value of the uncentered version of the
1057 Pearson correlation. In the uncentered Pearson correlation, a zero mean is used
1058 for both vectors even if the actual mean is nonzero.
1059 This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
1060 but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
1061 (e.g., choose b = a + c).
1062
1063 Arguments
1064 =========
1065
1066 n (input) int
1067 The number of elements in a row or column. If transpose==0, then n is the number
1068 of columns; otherwise, n is the number of rows.
1069
1070 data1 (input) float array
1071 The data array containing the first vector.
1072
1073 data2 (input) float array
1074 The data array containing the second vector.
1075
1076 mask1 (input) int array
1077 This array which elements in data1 are missing. If mask1[i][j]==0, then
1078 data1[i][j] is missing.
1079
1080 mask2 (input) int array
1081 This array which elements in data2 are missing. If mask2[i][j]==0, then
1082 data2[i][j] is missing.
1083
1084 weight (input) float[n]
1085 The weights that are used to calculate the distance.
1086
1087 index1 (input) int
1088 Index of the first row or column.
1089
1090 index2 (input) int
1091 Index of the second row or column.
1092
1093 transpose (input) int
1094 If transpose==0, the distance between two rows in the matrix is calculated.
1095 Otherwise, the distance between two columns in the matrix is calculated.
1096 ============================================================================
1097 */
1098 { float result = 0.;
1099 float denom1 = 0.;
1100 float denom2 = 0.;
1101 int flag = 0;
1102 /* flag will remain zero if no nonzero combinations of mask1 and mask2 are
1103 * found.
1104 */
1105 if (transpose==0) /* Calculate the distance between two rows */
1106 { int i;
1107 for (i = 0; i < n; i++)
1108 {
1109 float term1 = data1[index1][i];
1110 float term2 = data2[index2][i];
1111 float w = weight[i];
1112 result += w*term1*term2;
1113 denom1 += w*term1*term1;
1114 denom2 += w*term2*term2;
1115 flag = 1;
1116
1117 }
1118 }
1119 else
1120 { int i;
1121 for (i = 0; i < n; i++)
1122 {
1123 float term1 = data1[i][index1];
1124 float term2 = data2[i][index2];
1125 float w = weight[i];
1126 result += w*term1*term2;
1127 denom1 += w*term1*term1;
1128 denom2 += w*term2*term2;
1129 flag = 1;
1130
1131 }
1132 }
1133 if (!flag) return 0.;
1134 if (denom1==0.) return 1.;
1135 if (denom2==0.) return 1.;
1136 result = fabs(result) / sqrt(denom1*denom2);
1137 result = 1. - result;
1138 return result;
1139 }
1140
1141 /* ********************************************************************* */
1142
1143 static
spearman(int n,float ** data1,float ** data2,const float weight[],int index1,int index2,int transpose)1144 float spearman (int n, float** data1, float** data2,
1145 const float weight[], int index1, int index2, int transpose)
1146 /*
1147 Purpose
1148 =======
1149
1150 The spearman routine calculates the Spearman distance between two rows or
1151 columns. The Spearman distance is defined as one minus the Spearman rank
1152 correlation.
1153
1154 Arguments
1155 =========
1156
1157 n (input) int
1158 The number of elements in a row or column. If transpose==0, then n is the number
1159 of columns; otherwise, n is the number of rows.
1160
1161 data1 (input) float array
1162 The data array containing the first vector.
1163
1164 data2 (input) float array
1165 The data array containing the second vector.
1166
1167 mask1 (input) int array
1168 This array which elements in data1 are missing. If mask1[i][j]==0, then
1169 data1[i][j] is missing.
1170
1171 mask2 (input) int array
1172 This array which elements in data2 are missing. If mask2[i][j]==0, then
1173 data2[i][j] is missing.
1174
1175 weight (input) float[n]
1176 These weights are ignored, but included for consistency with other distance
1177 measures.
1178
1179 index1 (input) int
1180 Index of the first row or column.
1181
1182 index2 (input) int
1183 Index of the second row or column.
1184
1185 transpose (input) int
1186 If transpose==0, the distance between two rows in the matrix is calculated.
1187 Otherwise, the distance between two columns in the matrix is calculated.
1188 ============================================================================
1189 */
1190 { int i;
1191 int m = 0;
1192 float* rank1;
1193 float* rank2;
1194 float result = 0.;
1195 float denom1 = 0.;
1196 float denom2 = 0.;
1197 float avgrank;
1198 float* tdata1;
1199 float* tdata2;
1200 tdata1 = malloc(n*sizeof(float));
1201 if(!tdata1) return 0.0; /* Memory allocation error */
1202 tdata2 = malloc(n*sizeof(float));
1203 if(!tdata2) /* Memory allocation error */
1204 { free(tdata1);
1205 return 0.0;
1206 }
1207 if (transpose==0)
1208 { for (i = 0; i < n; i++)
1209 { tdata1[m] = data1[index1][i];
1210 tdata2[m] = data2[index2][i];
1211 m++;
1212
1213 }
1214 }
1215 else
1216 { for (i = 0; i < n; i++)
1217 {
1218 tdata1[m] = data1[i][index1];
1219 tdata2[m] = data2[i][index2];
1220 m++;
1221
1222 }
1223 }
1224 if (m==0) return 0;
1225 rank1 = getrank(m, tdata1);
1226 free(tdata1);
1227 if(!rank1) return 0.0; /* Memory allocation error */
1228 rank2 = getrank(m, tdata2);
1229 free(tdata2);
1230 if(!rank2) /* Memory allocation error */
1231 { free(rank1);
1232 return 0.0;
1233 }
1234 avgrank = 0.5*(m-1); /* Average rank */
1235 for (i = 0; i < m; i++)
1236 { const float value1 = rank1[i];
1237 const float value2 = rank2[i];
1238 result += value1 * value2;
1239 denom1 += value1 * value1;
1240 denom2 += value2 * value2;
1241 }
1242 /* Note: denom1 and denom2 cannot be calculated directly from the number
1243 * of elements. If two elements have the same rank, the squared sum of
1244 * their ranks will change.
1245 */
1246 free(rank1);
1247 free(rank2);
1248 result /= m;
1249 denom1 /= m;
1250 denom2 /= m;
1251 result -= avgrank * avgrank;
1252 denom1 -= avgrank * avgrank;
1253 denom2 -= avgrank * avgrank;
1254 if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
1255 if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
1256 result = result / sqrt(denom1*denom2);
1257 result = 1. - result;
1258 return result;
1259 }
1260
1261 /* ********************************************************************* */
1262
1263 static
kendall(int n,float ** data1,float ** data2,const float weight[],int index1,int index2,int transpose)1264 float kendall (int n, float** data1, float** data2,
1265 const float weight[], int index1, int index2, int transpose)
1266 /*
1267 Purpose
1268 =======
1269
1270 The kendall routine calculates the Kendall distance between two
1271 rows or columns. The Kendall distance is defined as one minus Kendall's tau.
1272
1273 Arguments
1274 =========
1275
1276 n (input) int
1277 The number of elements in a row or column. If transpose==0, then n is the number
1278 of columns; otherwise, n is the number of rows.
1279
1280 data1 (input) float array
1281 The data array containing the first vector.
1282
1283 data2 (input) float array
1284 The data array containing the second vector.
1285
1286 mask1 (input) int array
1287 This array which elements in data1 are missing. If mask1[i][j]==0, then
1288 data1[i][j] is missing.
1289
1290 mask2 (input) int array
1291 This array which elements in data2 are missing. If mask2[i][j]==0, then
1292 data2[i][j] is missing.
1293
1294 weight (input) float[n]
1295 These weights are ignored, but included for consistency with other distance
1296 measures.
1297
1298 index1 (input) int
1299 Index of the first row or column.
1300
1301 index2 (input) int
1302 Index of the second row or column.
1303
1304 transpose (input) int
1305 If transpose==0, the distance between two rows in the matrix is calculated.
1306 Otherwise, the distance between two columns in the matrix is calculated.
1307 ============================================================================
1308 */
1309 { int con = 0;
1310 int dis = 0;
1311 int exx = 0;
1312 int exy = 0;
1313 int flag = 0;
1314 /* flag will remain zero if no nonzero combinations of mask1 and mask2 are
1315 * found.
1316 */
1317 float denomx;
1318 float denomy;
1319 float tau;
1320 int i, j;
1321 if (transpose==0)
1322 { for (i = 0; i < n; i++)
1323
1324 { for (j = 0; j < i; j++)
1325
1326 { float x1 = data1[index1][i];
1327 float x2 = data1[index1][j];
1328 float y1 = data2[index2][i];
1329 float y2 = data2[index2][j];
1330 if (x1 < x2 && y1 < y2) con++;
1331 if (x1 > x2 && y1 > y2) con++;
1332 if (x1 < x2 && y1 > y2) dis++;
1333 if (x1 > x2 && y1 < y2) dis++;
1334 if (x1 == x2 && y1 != y2) exx++;
1335 if (x1 != x2 && y1 == y2) exy++;
1336 flag = 1;
1337 }
1338
1339 }
1340
1341 }
1342 else
1343 { for (i = 0; i < n; i++)
1344
1345 { for (j = 0; j < i; j++)
1346
1347 { float x1 = data1[i][index1];
1348 float x2 = data1[j][index1];
1349 float y1 = data2[i][index2];
1350 float y2 = data2[j][index2];
1351 if (x1 < x2 && y1 < y2) con++;
1352 if (x1 > x2 && y1 > y2) con++;
1353 if (x1 < x2 && y1 > y2) dis++;
1354 if (x1 > x2 && y1 < y2) dis++;
1355 if (x1 == x2 && y1 != y2) exx++;
1356 if (x1 != x2 && y1 == y2) exy++;
1357 flag = 1;
1358 }
1359
1360 }
1361
1362 }
1363 if (!flag) return 0.;
1364 denomx = con + dis + exx;
1365 denomy = con + dis + exy;
1366 if (denomx==0) return 1;
1367 if (denomy==0) return 1;
1368 tau = (con-dis)/sqrt(denomx*denomy);
1369 return 1.-tau;
1370 }
1371
1372 /* ********************************************************************* */
1373
setmetric(char dist)1374 float(*setmetric(char dist))
1375 (int, float**, float**, const float[], int, int, int)
1376 { switch(dist)
1377 { case 'e': return &euclid;
1378 case 'b': return &cityblock;
1379 case 'c': return &correlation;
1380 case 'a': return &acorrelation;
1381 case 'u': return &ucorrelation;
1382 case 'x': return &uacorrelation;
1383 case 's': return &spearman;
1384 case 'k': return &kendall;
1385 default: return &euclid;
1386 }
1387 return NULL; /* Never get here */
1388 }
1389
1390 /* ********************************************************************* */
1391
1392 /* if seed = 0, return preset CLUST_SEED
1393 < 0, set CLUST_SEED to time (0) and return it
1394 > 0, set CLUST_SEED to seed and return it*/
clust_seed(int seed)1395 unsigned int clust_seed(int seed)
1396 {
1397 static int isset = 0;
1398 static unsigned int CLUST_SEED = 0;
1399
1400 if (seed < 0) { /* User wants time seed */
1401 CLUST_SEED = (unsigned int)time(0);
1402 isset = 1;
1403 } else if (seed > 0) {
1404 /* user is providing a seed, use it */
1405 CLUST_SEED = (unsigned int)seed;
1406 isset = 1;
1407 } else { /* user wants preset seed */
1408 if (!isset) { /* not set already, do it (could recurse...) */
1409 CLUST_SEED = (unsigned int)time(0);
1410 isset = 1;
1411 } else {
1412 /* nothing to do, let it return */
1413 }
1414 }
1415 return (CLUST_SEED);
1416 }
uniform(void)1417 static float uniform(void)
1418 /*
1419 Purpose
1420 =======
1421
1422 This routine returns a uniform random number between 0.0 and 1.0. Both 0.0
1423 and 1.0 are excluded. This random number generator is described in:
1424
1425 Pierre l'Ecuyer
1426 Efficient and Portable Combined Random Number Generators
1427 Communications of the ACM, Volume 31, Number 6, June 1988, pages 742-749,774.
1428
1429 The first time this routine is called, it initializes the random number
1430 generator using the current time. First, the current epoch time in seconds is
1431 used as a seed for the random number generator in the C library. The first two
1432 random numbers generated by this generator are used to initialize the random
1433 number generator implemented in this routine.
1434
1435
1436 Arguments
1437 =========
1438
1439 None.
1440
1441
1442 Return value
1443 ============
1444
1445 A float-precison number between 0.0 and 1.0.
1446 ============================================================================
1447 */
1448 { int z;
1449 static const int m1 = 2147483563;
1450 static const int m2 = 2147483399;
1451 const float scale = 1.0/m1;
1452
1453 static int s1 = 0;
1454 static int s2 = 0;
1455
1456 if (s1==0 || s2==0) /* initialize */
1457 { unsigned int initseed = clust_seed(0);
1458 srand(initseed);
1459 s1 = rand();
1460 s2 = rand();
1461 }
1462
1463 do
1464 { int k;
1465 k = s1/53668;
1466 s1 = 40014*(s1-k*53668)-k*12211;
1467 if (s1 < 0) s1+=m1;
1468 k = s2/52774;
1469 s2 = 40692*(s2-k*52774)-k*3791;
1470 if(s2 < 0) s2+=m2;
1471 z = s1-s2;
1472 if(z < 1) z+=(m1-1);
1473 } while (z==m1); /* To avoid returning 1.0 */
1474
1475 return z*scale;
1476 }
1477
1478 /* ************************************************************************ */
1479
binomial(int n,float p)1480 static int binomial(int n, float p)
1481 /*
1482 Purpose
1483 =======
1484
1485 This routine generates a random number between 0 and n inclusive, following
1486 the binomial distribution with probability p and n trials. The routine is
1487 based on the BTPE algorithm, described in:
1488
1489 Voratas Kachitvichyanukul and Bruce W. Schmeiser:
1490 Binomial Random Variate Generation
1491 Communications of the ACM, Volume 31, Number 2, February 1988, pages 216-222.
1492
1493
1494 Arguments
1495 =========
1496
1497 p (input) float
1498 The probability of a single event. This probability should be less than or
1499 equal to 0.5.
1500
1501 n (input) int
1502 The number of trials.
1503
1504
1505 Return value
1506 ============
1507
1508 An integer drawn from a binomial distribution with parameters (p, n).
1509
1510 ============================================================================
1511 */
1512 { const float q = 1 - p;
1513 if (n*p < 30.0) /* Algorithm BINV */
1514 { const float s = p/q;
1515 const float a = (n+1)*s;
1516 float r = exp(n*log(q)); /* pow() causes a crash on AIX */
1517 int x = 0;
1518 float u = uniform();
1519 while(1)
1520 { if (u < r) return x;
1521 u-=r;
1522 x++;
1523 r *= (a/x)-s;
1524 }
1525 }
1526 else /* Algorithm BTPE */
1527 { /* Step 0 */
1528 const float fm = n*p + p;
1529 const int m = (int) fm;
1530 const float p1 = floor(2.195*sqrt(n*p*q) -4.6*q) + 0.5;
1531 const float xm = m + 0.5;
1532 const float xl = xm - p1;
1533 const float xr = xm + p1;
1534 const float c = 0.134 + 20.5/(15.3+m);
1535 const float a = (fm-xl)/(fm-xl*p);
1536 const float b = (xr-fm)/(xr*q);
1537 const float lambdal = a*(1.0+0.5*a);
1538 const float lambdar = b*(1.0+0.5*b);
1539 const float p2 = p1*(1+2*c);
1540 const float p3 = p2 + c/lambdal;
1541 const float p4 = p3 + c/lambdar;
1542 while (1)
1543 { /* Step 1 */
1544 int y;
1545 int k;
1546 float u = uniform();
1547 float v = uniform();
1548 u *= p4;
1549 if (u <= p1) return (int)(xm-p1*v+u);
1550 /* Step 2 */
1551 if (u > p2)
1552 { /* Step 3 */
1553 if (u > p3)
1554 { /* Step 4 */
1555 y = (int)(xr-log(v)/lambdar);
1556 if (y > n) continue;
1557 /* Go to step 5 */
1558 v = v*(u-p3)*lambdar;
1559 }
1560 else
1561 { y = (int)(xl+log(v)/lambdal);
1562 if (y < 0) continue;
1563 /* Go to step 5 */
1564 v = v*(u-p2)*lambdal;
1565 }
1566 }
1567 else
1568 { const float x = xl + (u-p1)/c;
1569 v = v*c + 1.0 - fabs(m-x+0.5)/p1;
1570 if (v > 1) continue;
1571 /* Go to step 5 */
1572 y = (int)x;
1573 }
1574 /* Step 5 */
1575 /* Step 5.0 */
1576 k = abs(y-m);
1577 if (k > 20 && k < 0.5*n*p*q-1.0)
1578 { /* Step 5.2 */
1579 float rho = (k/(n*p*q))*((k*(k/3.0 + 0.625) + 0.1666666666666)/(n*p*q)+0.5);
1580 float t = -k*k/(2*n*p*q);
1581 float A = log(v);
1582 if (A < t-rho) return y;
1583 else if (A > t+rho) continue;
1584 else
1585 { /* Step 5.3 */
1586 float x1 = y+1;
1587 float f1 = m+1;
1588 float z = n+1-m;
1589 float w = n-y+1;
1590 float x2 = x1*x1;
1591 float f2 = f1*f1;
1592 float z2 = z*z;
1593 float w2 = w*w;
1594 if (A > xm * log(f1/x1) + (n-m+0.5)*log(z/w)
1595 + (y-m)*log(w*p/(x1*q))
1596 + (13860.-(462.-(132.-(99.-140./f2)/f2)/f2)/f2)/f1/166320.
1597 + (13860.-(462.-(132.-(99.-140./z2)/z2)/z2)/z2)/z/166320.
1598 + (13860.-(462.-(132.-(99.-140./x2)/x2)/x2)/x2)/x1/166320.
1599 + (13860.-(462.-(132.-(99.-140./w2)/w2)/w2)/w2)/w/166320.)
1600 continue;
1601 return y;
1602 }
1603 }
1604 else
1605 { /* Step 5.1 */
1606 int i;
1607 const float s = p/q;
1608 const float aa = s*(n+1);
1609 float f = 1.0;
1610 for (i = m; i < y; f *= (aa/(++i)-s));
1611 for (i = y; i < m; f /= (aa/(++i)-s));
1612 if (v > f) continue;
1613 return y;
1614 }
1615 }
1616 }
1617 /* Never get here */
1618 return -1;
1619 }
1620
1621 /* ************************************************************************ */
1622
randomassign(int nclusters,int nelements,int clusterid[])1623 static void randomassign (int nclusters, int nelements, int clusterid[])
1624 /*
1625 Purpose
1626 =======
1627
1628 The randomassign routine performs an initial random clustering, needed for
1629 k-means or k-median clustering. Elements (genes or microarrays) are randomly
1630 assigned to clusters. The number of elements in each cluster is chosen
1631 randomly, making sure that each cluster will receive at least one element.
1632
1633
1634 Arguments
1635 =========
1636
1637 nclusters (input) int
1638 The number of clusters.
1639
1640 nelements (input) int
1641 The number of elements to be clustered (i.e., the number of genes or microarrays
1642 to be clustered).
1643
1644 clusterid (output) int[nelements]
1645 The cluster number to which an element was assigned.
1646
1647 ============================================================================
1648 */
1649 { int i, j;
1650 int k = 0;
1651 float p;
1652 int n = nelements-nclusters;
1653 /* Draw the number of elements in each cluster from a multinomial
1654 * distribution, reserving ncluster elements to set independently
1655 * in order to guarantee that none of the clusters are empty.
1656 */
1657
1658 for (i = 0; i < nclusters-1; i++)
1659 { p = 1.0/(nclusters-i);
1660 j = binomial(n, p);
1661 n -= j;
1662 j += k+1; /* Assign at least one element to cluster i */
1663 for ( ; k < j; k++) clusterid[k] = i;
1664 }
1665 /* Assign the remaining elements to the last cluster */
1666 for ( ; k < nelements; k++) clusterid[k] = i;
1667
1668
1669 /* Create a random permutation of the cluster assignments */
1670 for (i = 0; i < nelements; i++)
1671 { j = (int) (i + (nelements-i)*uniform());
1672 if (j>=nelements) j=nelements-1; /* ZSS: This was causing memory
1673 corruption, j can be equal to
1674 nelements and that is illegal. */
1675 k = clusterid[j];
1676 clusterid[j] = clusterid[i];
1677 clusterid[i] = k;
1678 }
1679
1680 return;
1681 }
1682
1683 /* ********************************************************************* */
1684
getclustermeans(int nclusters,int nrows,int ncolumns,float ** data,int clusterid[],float ** cdata,int transpose)1685 static void getclustermeans(int nclusters, int nrows, int ncolumns,
1686 float** data, int clusterid[], float** cdata,
1687 int transpose)
1688 /*
1689 Purpose
1690 =======
1691
1692 The getclustermeans routine calculates the cluster centroids, given to which
1693 cluster each element belongs. The centroid is defined as the mean over all
1694 elements for each dimension.
1695
1696 Arguments
1697 =========
1698
1699 nclusters (input) int
1700 The number of clusters.
1701
1702 nrows (input) int
1703 The number of rows in the gene expression data matrix, equal to the number of
1704 genes.
1705
1706 ncolumns (input) int
1707 The number of columns in the gene expression data matrix, equal to the number of
1708 microarrays.
1709
1710 data (input) float[nrows][ncolumns]
1711 The array containing the gene expression data.
1712
1713 mask (input) int[nrows][ncolumns]
1714 This array shows which data values are missing. If mask[i][j]==0, then
1715 data[i][j] is missing.
1716
1717 clusterid (output) int[nrows] if transpose==0
1718 int[ncolumns] if transpose==1
1719 The cluster number to which each element belongs. If transpose==0, then the
1720 dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
1721 is equal to ncolumns (the number of microarrays).
1722
1723 cdata (output) float[nclusters][ncolumns] if transpose==0
1724 float[nrows][nclusters] if transpose==1
1725 On exit of getclustermeans, this array contains the cluster centroids.
1726
1727 cmask (output) int[nclusters][ncolumns] if transpose==0
1728 int[nrows][nclusters] if transpose==1
1729 This array shows which data values of are missing for each centroid. If
1730 cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for
1731 a centroid if all corresponding data values of the cluster members are missing.
1732
1733 transpose (input) int
1734 If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of
1735 columns (microarrays) are specified.
1736
1737 ========================================================================
1738 */
1739 { int i, j, k;
1740 int** voxlcounter;
1741 voxlcounter = malloc(nclusters*sizeof(int*));
1742 for (i = 0; i < nclusters; i++)
1743 { voxlcounter[i] = malloc(ncolumns*sizeof(int));
1744 }
1745
1746 if (transpose==0)
1747 { for (i = 0; i < nclusters; i++)
1748 { for (j = 0; j < ncolumns; j++)
1749 { voxlcounter[i][j] = 0;
1750 cdata[i][j] = 0.;
1751 }
1752 }
1753
1754 for (k = 0; k < nrows; k++)
1755 { i = clusterid[k];
1756 for (j = 0; j < ncolumns; j++)
1757 {
1758 cdata[i][j]+=data[k][j];
1759 voxlcounter[i][j]++;
1760
1761 }
1762 }
1763 for (i = 0; i < nclusters; i++)
1764 { for (j = 0; j < ncolumns; j++)
1765 { if (voxlcounter[i][j]>0)
1766 { cdata[i][j] /= voxlcounter[i][j];
1767 voxlcounter[i][j] = 1;
1768 }
1769 }
1770 }
1771
1772 }
1773 else
1774 { for (i = 0; i < nrows; i++)
1775 { for (j = 0; j < nclusters; j++)
1776 { cdata[i][j] = 0.;
1777 voxlcounter[i][j] = 0;
1778 }
1779 }
1780 for (k = 0; k < ncolumns; k++)
1781 { i = clusterid[k];
1782 for (j = 0; j < nrows; j++)
1783 { cdata[j][i]+=data[j][k];
1784 voxlcounter[j][i]++;
1785
1786 }
1787 }
1788 for (i = 0; i < nrows; i++)
1789 { for (j = 0; j < nclusters; j++)
1790 { if (voxlcounter[i][j]>0)
1791 { cdata[i][j] /= voxlcounter[i][j];
1792 voxlcounter[i][j] = 1;
1793 }
1794 }
1795 }
1796 }
1797
1798 for (i = 0; i < nclusters; i++)
1799 { free(voxlcounter[i]);
1800 }
1801 free(voxlcounter);
1802 /* It might be good to free it later?! but it is small anyway... */
1803 }
1804
1805 /* ********************************************************************* */
1806
1807 static void
getclustermedians(int nclusters,int nrows,int ncolumns,float ** data,int clusterid[],float ** cdata,int transpose,float cache[])1808 getclustermedians(int nclusters, int nrows, int ncolumns,
1809 float** data, int clusterid[], float** cdata,
1810 int transpose, float cache[])
1811 /*
1812 Purpose
1813 =======
1814
1815 The getclustermedians routine calculates the cluster centroids, given to which
1816 cluster each element belongs. The centroid is defined as the median over all
1817 elements for each dimension.
1818
1819 Arguments
1820 =========
1821
1822 nclusters (input) int
1823 The number of clusters.
1824
1825 nrows (input) int
1826 The number of rows in the gene expression data matrix, equal to the number of
1827 genes.
1828
1829 ncolumns (input) int
1830 The number of columns in the gene expression data matrix, equal to the number of
1831 microarrays.
1832
1833 data (input) float[nrows][ncolumns]
1834 The array containing the gene expression data.
1835
1836 mask (input) int[nrows][ncolumns]
1837 This array shows which data values are missing. If mask[i][j]==0, then
1838 data[i][j] is missing.
1839
1840 clusterid (output) int[nrows] if transpose==0
1841 int[ncolumns] if transpose==1
1842 The cluster number to which each element belongs. If transpose==0, then the
1843 dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
1844 is equal to ncolumns (the number of microarrays).
1845
1846 cdata (output) float[nclusters][ncolumns] if transpose==0
1847 float[nrows][nclusters] if transpose==1
1848 On exit of getclustermedians, this array contains the cluster centroids.
1849
1850 cmask (output) int[nclusters][ncolumns] if transpose==0
1851 int[nrows][nclusters] if transpose==1
1852 This array shows which data values of are missing for each centroid. If
1853 cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for
1854 a centroid if all corresponding data values of the cluster members are missing.
1855
1856 transpose (input) int
1857 If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of
1858 columns (microarrays) are specified.
1859
1860 cache (input) float[nrows] if transpose==0
1861 float[ncolumns] if transpose==1
1862 This array should be allocated before calling getclustermedians; its contents
1863 on input is not relevant. This array is used as a temporary storage space when
1864 calculating the medians.
1865
1866 ========================================================================
1867 */
1868 { int i, j, k;
1869 if (transpose==0)
1870 { for (i = 0; i < nclusters; i++)
1871 { for (j = 0; j < ncolumns; j++)
1872 { int count = 0;
1873 for (k = 0; k < nrows; k++)
1874 { if (i==clusterid[k])
1875 { cache[count] = data[k][j];
1876 count++;
1877 }
1878 }
1879 if (count>0)
1880 { cdata[i][j] = median(count,cache);
1881
1882 }
1883 else
1884 { cdata[i][j] = 0.;
1885
1886 }
1887 }
1888 }
1889 }
1890 else
1891 { for (i = 0; i < nclusters; i++)
1892 { for (j = 0; j < nrows; j++)
1893 { int count = 0;
1894 for (k = 0; k < ncolumns; k++)
1895 { if (i==clusterid[k])
1896 { cache[count] = data[j][k];
1897 count++;
1898 }
1899 }
1900 if (count>0)
1901 { cdata[j][i] = median(count,cache);
1902
1903 }
1904 else
1905 { cdata[j][i] = 0.;
1906
1907 }
1908 }
1909 }
1910 }
1911 }
1912
1913 /* ********************************************************************* */
1914
getclustercentroids(int nclusters,int nrows,int ncolumns,float ** data,int clusterid[],float ** cdata,int transpose,char method)1915 int getclustercentroids(int nclusters, int nrows, int ncolumns,
1916 float** data, int clusterid[], float** cdata,
1917 int transpose, char method)
1918 /*
1919 Purpose
1920 =======
1921
1922 The getclustercentroids routine calculates the cluster centroids, given to
1923 which cluster each element belongs. Depending on the argument method, the
1924 centroid is defined as either the mean or the median for each dimension over
1925 all elements belonging to a cluster.
1926
1927 Arguments
1928 =========
1929
1930 nclusters (input) int
1931 The number of clusters.
1932
1933 nrows (input) int
1934 The number of rows in the gene expression data matrix, equal to the number of
1935 genes.
1936
1937 ncolumns (input) int
1938 The number of columns in the gene expression data matrix, equal to the number of
1939 microarrays.
1940
1941 data (input) float[nrows][ncolumns]
1942 The array containing the gene expression data.
1943
1944 mask (input) int[nrows][ncolumns]
1945 This array shows which data values are missing. If mask[i][j]==0, then
1946 data[i][j] is missing.
1947
1948 clusterid (output) int[nrows] if transpose==0
1949 int[ncolumns] if transpose==1
1950 The cluster number to which each element belongs. If transpose==0, then the
1951 dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
1952 is equal to ncolumns (the number of microarrays).
1953
1954 cdata (output) float[nclusters][ncolumns] if transpose==0
1955 float[nrows][nclusters] if transpose==1
1956 On exit of getclustercentroids, this array contains the cluster centroids.
1957
1958 cmask (output) int[nclusters][ncolumns] if transpose==0
1959 int[nrows][nclusters] if transpose==1
1960 This array shows which data values of are missing for each centroid. If
1961 cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for
1962 a centroid if all corresponding data values of the cluster members are missing.
1963
1964 transpose (input) int
1965 If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of
1966 columns (microarrays) are specified.
1967
1968 method (input) char
1969 For method=='a', the centroid is defined as the mean over all elements
1970 belonging to a cluster for each dimension.
1971 For method=='m', the centroid is defined as the median over all elements
1972 belonging to a cluster for each dimension.
1973
1974 Return value
1975 ============
1976
1977 The function returns an integer to indicate success or failure. If a
1978 memory error occurs, or if method is not 'm' or 'a', getclustercentroids
1979 returns 0. If successful, getclustercentroids returns 1;
1980 ========================================================================
1981 */
1982 { switch(method)
1983 { case 'm':
1984 { const int nelements = (transpose==0) ? nrows : ncolumns;
1985 float* cache = malloc(nelements*sizeof(float));
1986 if (!cache) return 0;
1987 getclustermedians(nclusters, nrows, ncolumns, data, clusterid,
1988 cdata, transpose, cache);
1989 free(cache);
1990 return 1;
1991 }
1992 case 'a':
1993 { getclustermeans(nclusters, nrows, ncolumns, data, clusterid,
1994 cdata, transpose);
1995 return 1;
1996 }
1997 }
1998 return 0;
1999 }
2000
2001 /* ********************************************************************* */
2002
getclustermedoids(int nclusters,int nelements,float ** distance,int clusterid[],int centroids[],float errors[])2003 void getclustermedoids(int nclusters, int nelements, float** distance,
2004 int clusterid[], int centroids[], float errors[])
2005 /*
2006 Purpose
2007 =======
2008
2009 The getclustermedoids routine calculates the cluster centroids, given to which
2010 cluster each element belongs. The centroid is defined as the element with the
2011 smallest sum of distances to the other elements.
2012
2013 Arguments
2014 =========
2015
2016 nclusters (input) int
2017 The number of clusters.
2018
2019 nelements (input) int
2020 The total number of elements.
2021
2022 distmatrix (input) float array, ragged
2023 (number of rows is nelements, number of columns is equal to the row number)
2024 The distance matrix. To save space, the distance matrix is given in the
2025 form of a ragged array. The distance matrix is symmetric and has zeros
2026 on the diagonal. See distancematrix for a description of the content.
2027
2028 clusterid (output) int[nelements]
2029 The cluster number to which each element belongs.
2030
2031 centroid (output) int[nclusters]
2032 The index of the element that functions as the centroid for each cluster.
2033
2034 errors (output) float[nclusters]
2035 The within-cluster sum of distances between the items and the cluster
2036 centroid.
2037
2038 ========================================================================
2039 */
2040 { int i, j, k;
2041 for (j = 0; j < nclusters; j++) errors[j] = FLT_MAX;
2042 for (i = 0; i < nelements; i++)
2043 { float d = 0.0;
2044 j = clusterid[i];
2045 for (k = 0; k < nelements; k++)
2046 { if (i==k || clusterid[k]!=j) continue;
2047 d += (i < k ? distance[k][i] : distance[i][k]);
2048 if (d > errors[j]) break;
2049 }
2050 if (d < errors[j])
2051 { errors[j] = d;
2052 centroids[j] = i;
2053 }
2054 }
2055 }
2056
2057 /* ********************************************************************* */
2058
2059 static int
kmeans(int nclusters,int nrows,int ncolumns,float ** data,float weight[],int transpose,int npass,char dist,float ** cdata,int clusterid[],float * error,int tclusterid[],int counts[],int mapping[])2060 kmeans(int nclusters, int nrows, int ncolumns, float** data,
2061 float weight[], int transpose, int npass, char dist,
2062 float** cdata, int clusterid[], float* error,
2063 int tclusterid[], int counts[], int mapping[])
2064 { int i, j, k;
2065 const int nelements = (transpose==0) ? nrows : ncolumns;
2066 const int ndata = (transpose==0) ? ncolumns : nrows;
2067 int ifound = 1;
2068 int ipass = 0;
2069 /* Set the metric function as indicated by dist */
2070 float (*metric)
2071 (int, float**, float**, const float[], int, int, int) =
2072 setmetric(dist);
2073
2074 /* We save the clustering solution periodically and check if it reappears */
2075 int* saved = malloc(nelements*sizeof(int));
2076
2077 if (saved==NULL) return -1;
2078
2079 *error = FLT_MAX;
2080
2081 do
2082 { float total = FLT_MAX;
2083 int counter = 0;
2084 int period = 10;
2085
2086 /* Perform the EM algorithm. First, randomly assign elements to clusters. */
2087
2088 if (npass!=0) randomassign (nclusters, nelements, tclusterid);
2089
2090 for (i = 0; i < nclusters; i++) counts[i] = 0;
2091 for (i = 0; i < nelements; i++) counts[tclusterid[i]]++;
2092
2093 /* Start the loop */
2094 while(1)
2095 { float previous = total;
2096 total = 0.0;
2097
2098 if (counter % period == 0) /* Save the current cluster assignments */
2099 { for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
2100 if (period < INT_MAX / 2) period *= 2;
2101 }
2102 counter++;
2103
2104 /* Find the center */
2105
2106 getclustermeans(nclusters, nrows, ncolumns, data, tclusterid,
2107 cdata, transpose);
2108
2109 for (i = 0; i < nelements; i++)
2110 /* Calculate the distances */
2111 { float distance;
2112 k = tclusterid[i];
2113 if (counts[k]==1) continue;
2114 /* No reassignment if that would lead to an empty cluster */
2115 /* Treat the present cluster as a special case */
2116 distance = metric(ndata,data,cdata,weight,i,k,transpose);
2117 for (j = 0; j < nclusters; j++)
2118 { float tdistance;
2119 if (j==k) continue;
2120 tdistance = metric(ndata,data,cdata,weight,i,j,transpose);
2121 if (tdistance < distance)
2122 { distance = tdistance;
2123 counts[tclusterid[i]]--;
2124 tclusterid[i] = j;
2125 counts[j]++;
2126 }
2127 }
2128 total += distance;
2129 }
2130 if (total>=previous) break;
2131 /* total>=previous is FALSE on some machines even if total and previous
2132 * are bitwise identical. */
2133 for (i = 0; i < nelements; i++)
2134 if (saved[i]!=tclusterid[i]) break;
2135 if (i==nelements)
2136 break; /* Identical solution found; break out of this loop */
2137 }
2138
2139 if (npass<=1)
2140 { *error = total;
2141 break;
2142 }
2143
2144 for (i = 0; i < nclusters; i++) mapping[i] = -1;
2145 for (i = 0; i < nelements; i++)
2146 { j = tclusterid[i];
2147 k = clusterid[i];
2148 if (mapping[k] == -1) mapping[k] = j;
2149 else if (mapping[k] != j)
2150 { if (total < *error)
2151 { ifound = 1;
2152 *error = total;
2153 for (i = 0; i < nelements; i++) clusterid[i] = tclusterid[i];
2154 }
2155 break;
2156 }
2157 }
2158 if (i==nelements) ifound++; /* break statement not encountered */
2159
2160 } while (++ipass < npass);
2161
2162 free(saved);
2163 return ifound;
2164 }
2165
2166 /* ---------------------------------------------------------------------- */
2167
2168 static int
kmedians(int nclusters,int nrows,int ncolumns,float ** data,float weight[],int transpose,int npass,char dist,float ** cdata,int clusterid[],float * error,int tclusterid[],int counts[],int mapping[],float cache[])2169 kmedians(int nclusters, int nrows, int ncolumns, float** data,
2170 float weight[], int transpose, int npass, char dist,
2171 float** cdata, int clusterid[], float* error,
2172 int tclusterid[], int counts[], int mapping[], float cache[])
2173 { int i, j, k;
2174 const int nelements = (transpose==0) ? nrows : ncolumns;
2175 const int ndata = (transpose==0) ? ncolumns : nrows;
2176 int ifound = 1;
2177 int ipass = 0;
2178 /* Set the metric function as indicated by dist */
2179 float (*metric)
2180 (int, float**, float**, const float[], int, int, int) =
2181 setmetric(dist);
2182
2183 /* We save the clustering solution periodically and check if it reappears */
2184 int* saved = malloc(nelements*sizeof(int));
2185 if (saved==NULL) return -1;
2186
2187 *error = FLT_MAX;
2188
2189 do
2190 { float total = FLT_MAX;
2191 int counter = 0;
2192 int period = 10;
2193
2194 /* Perform the EM algorithm. First, randomly assign elements to clusters. */
2195 if (npass!=0) randomassign (nclusters, nelements, tclusterid);
2196
2197 for (i = 0; i < nclusters; i++) counts[i]=0;
2198 for (i = 0; i < nelements; i++) counts[tclusterid[i]]++;
2199
2200 /* Start the loop */
2201 while(1)
2202 { float previous = total;
2203 total = 0.0;
2204
2205 if (counter % period == 0) /* Save the current cluster assignments */
2206 { for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
2207 if (period < INT_MAX / 2) period *= 2;
2208 }
2209 counter++;
2210
2211 /* Find the center */
2212 getclustermedians(nclusters, nrows, ncolumns, data, tclusterid,
2213 cdata, transpose, cache);
2214
2215 for (i = 0; i < nelements; i++)
2216 /* Calculate the distances */
2217 { float distance;
2218 k = tclusterid[i];
2219 if (counts[k]==1) continue;
2220 /* No reassignment if that would lead to an empty cluster */
2221 /* Treat the present cluster as a special case */
2222 distance = metric(ndata,data,cdata,weight,i,k,transpose);
2223 for (j = 0; j < nclusters; j++)
2224 { float tdistance;
2225 if (j==k) continue;
2226 tdistance = metric(ndata,data,cdata,weight,i,j,transpose);
2227 if (tdistance < distance)
2228 { distance = tdistance;
2229 counts[tclusterid[i]]--;
2230 tclusterid[i] = j;
2231 counts[j]++;
2232 }
2233 }
2234 total += distance;
2235 }
2236 if (total>=previous) break;
2237 /* total>=previous is FALSE on some machines even if total and previous
2238 * are bitwise identical. */
2239 for (i = 0; i < nelements; i++)
2240 if (saved[i]!=tclusterid[i]) break;
2241 if (i==nelements)
2242 break; /* Identical solution found; break out of this loop */
2243 }
2244
2245 if (npass<=1)
2246 { *error = total;
2247 break;
2248 }
2249
2250 for (i = 0; i < nclusters; i++) mapping[i] = -1;
2251 for (i = 0; i < nelements; i++)
2252 { j = tclusterid[i];
2253 k = clusterid[i];
2254 if (mapping[k] == -1) mapping[k] = j;
2255 else if (mapping[k] != j)
2256 { if (total < *error)
2257 { ifound = 1;
2258 *error = total;
2259 for (i = 0; i < nelements; i++) clusterid[i] = tclusterid[i];
2260 }
2261 break;
2262 }
2263 }
2264 if (i==nelements) ifound++; /* break statement not encountered */
2265 } while (++ipass < npass);
2266
2267 free(saved);
2268 return ifound;
2269 }
2270
2271 /* ********************************************************************* */
2272
kcluster(int nclusters,int nrows,int ncolumns,float ** data,float weight[],int transpose,int npass,char method,char dist,int clusterid[],float * error,int * ifound)2273 void CALL kcluster (int nclusters, int nrows, int ncolumns,
2274 float** data, float weight[], int transpose,
2275 int npass, char method, char dist,
2276 int clusterid[], float* error, int* ifound)
2277 /*
2278 Purpose
2279 =======
2280
2281 The kcluster routine performs k-means or k-median clustering on a given set of
2282 elements, using the specified distance measure. The number of clusters is given
2283 by the user. Multiple passes are being made to find the optimal clustering
2284 solution, each time starting from a different initial clustering.
2285
2286
2287 Arguments
2288 =========
2289
2290 nclusters (input) int
2291 The number of clusters to be found.
2292
2293 data (input) float[nrows][ncolumns]
2294 The array containing the data of the elements to be clustered (i.e., the gene
2295 expression data).
2296
2297 mask (input) int[nrows][ncolumns]
2298 This array shows which data values are missing. If
2299 mask[i][j] == 0, then data[i][j] is missing.
2300
2301 nrows (input) int
2302 The number of rows in the data matrix, equal to the number of genes.
2303
2304 ncolumns (input) int
2305 The number of columns in the data matrix, equal to the number of microarrays.
2306
2307 weight (input) float[n]
2308 The weights that are used to calculate the distance.
2309
2310 transpose (input) int
2311 If transpose==0, the rows of the matrix are clustered. Otherwise, columns
2312 of the matrix are clustered.
2313
2314 npass (input) int
2315 The number of times clustering is performed. Clustering is performed npass
2316 times, each time starting from a different (random) initial assignment of
2317 genes to clusters. The clustering solution with the lowest within-cluster sum
2318 of distances is chosen.
2319 If npass==0, then the clustering algorithm will be run once, where the initial
2320 assignment of elements to clusters is taken from the clusterid array.
2321
2322 method (input) char
2323 Defines whether the arithmetic mean (method=='a') or the median
2324 (method=='m') is used to calculate the cluster center.
2325
2326 dist (input) char
2327 Defines which distance measure is used, as given by the table:
2328 dist=='e': Euclidean distance
2329 dist=='b': City-block distance
2330 dist=='c': correlation
2331 dist=='a': absolute value of the correlation
2332 dist=='u': uncentered correlation
2333 dist=='x': absolute uncentered correlation
2334 dist=='s': Spearman's rank correlation
2335 dist=='k': Kendall's tau
2336 For other values of dist, the default (Euclidean distance) is used.
2337
2338 clusterid (output; input) int[nrows] if transpose==0
2339 int[ncolumns] if transpose==1
2340 The cluster number to which a gene or microarray was assigned. If npass==0,
2341 then on input clusterid contains the initial clustering assignment from which
2342 the clustering algorithm starts. On output, it contains the clustering solution
2343 that was found.
2344
2345 error (output) float*
2346 The sum of distances to the cluster center of each item in the optimal k-means
2347 clustering solution that was found.
2348
2349 ifound (output) int*
2350 The number of times the optimal clustering solution was
2351 found. The value of ifound is at least 1; its maximum value is npass. If the
2352 number of clusters is larger than the number of elements being clustered,
2353 *ifound is set to 0 as an error code. If a memory allocation error occurs,
2354 *ifound is set to -1.
2355
2356 ========================================================================
2357 */
2358 { const int nelements = (transpose==0) ? nrows : ncolumns;
2359 const int ndata = (transpose==0) ? ncolumns : nrows;
2360
2361 int i;
2362 int ok;
2363 int* tclusterid;
2364 int* mapping = NULL;
2365 float** cdata;
2366
2367 int* counts;
2368 int verb = 0;
2369
2370 if (nelements < nclusters)
2371 { *ifound = 0;
2372 return;
2373 }
2374 /* More clusters asked for than elements available */
2375
2376 *ifound = -1;
2377
2378 /* This will contain the number of elements in each cluster, which is
2379 * needed to check for empty clusters. */
2380 counts = malloc(nclusters*sizeof(int));
2381 if(!counts) return;
2382
2383 /* Find out if the user specified an initial clustering */
2384 if (npass<=1) {
2385 tclusterid = clusterid;
2386 } else {
2387 tclusterid = malloc(nelements*sizeof(int));
2388 if (!tclusterid)
2389 { free(counts);
2390 return;
2391 }
2392 mapping = malloc(nclusters*sizeof(int));
2393 if (!mapping)
2394 { free(counts);
2395 free(tclusterid);
2396 return;
2397 }
2398 for (i = 0; i < nelements; i++) clusterid[i] = 0;
2399 }
2400
2401
2402 /* Allocate space to store the centroid data */
2403 if (transpose==0) ok = makedatamask(nclusters, ndata, &cdata);
2404 else ok = makedatamask(ndata, nclusters, &cdata);
2405 if(!ok)
2406 { free(counts);
2407 if(npass>1)
2408 { free(tclusterid);
2409 free(mapping);
2410 return;
2411 }
2412 }
2413
2414 if (method=='m')
2415 { float* cache = malloc(nelements*sizeof(float));
2416 if (verb) fprintf(stderr,"doing kmedians\n");
2417 if(cache)
2418 { *ifound = kmedians(nclusters, nrows, ncolumns, data, weight,
2419 transpose, npass, dist, cdata, clusterid, error,
2420 tclusterid, counts, mapping, cache);
2421 free(cache);
2422 }
2423 }
2424 else {
2425 if (verb) fprintf(stderr,"doing kmeans\n");
2426 *ifound = kmeans(nclusters, nrows, ncolumns, data, weight,
2427 transpose, npass, dist, cdata, clusterid, error,
2428 tclusterid, counts, mapping);
2429 }
2430
2431 /* remapping is done in calling function now (see thd_segtools_fNM.c)
2432 ZSS Jan 2011*/
2433
2434 /* Deallocate temporarily used space */
2435 if (npass > 1)
2436 { free(mapping);
2437 free(tclusterid);
2438 }
2439
2440 if (transpose==0) freedatamask(nclusters, cdata);
2441 else freedatamask(ndata, cdata);
2442
2443 free(counts);
2444 }
2445
2446 /* *********************************************************************** */
2447
comp(const void * p,const void * q)2448 int comp(const void *p, const void *q)
2449 {
2450 int *ptr1 = (int *)(p);
2451 int *ptr2 = (int *)(q);
2452
2453 if (*ptr1 > *ptr2)
2454 return -1;
2455 else if (*ptr1 == *ptr2)
2456 return 0;
2457 else
2458 return 1;
2459 }
2460 /* *********************************************************************** */
2461
kmedoids(int nclusters,int nelements,float ** distmatrix,int npass,int clusterid[],float * error,int * ifound)2462 void CALL kmedoids (int nclusters, int nelements, float** distmatrix,
2463 int npass, int clusterid[], float* error, int* ifound)
2464 /*
2465 Purpose
2466 =======
2467
2468 The kmedoids routine performs k-medoids clustering on a given set of elements,
2469 using the distance matrix and the number of clusters passed by the user.
2470 Multiple passes are being made to find the optimal clustering solution, each
2471 time starting from a different initial clustering.
2472
2473
2474 Arguments
2475 =========
2476
2477 nclusters (input) int
2478 The number of clusters to be found.
2479
2480 nelements (input) int
2481 The number of elements to be clustered.
2482
2483 distmatrix (input) float array, ragged
2484 (number of rows is nelements, number of columns is equal to the row number)
2485 The distance matrix. To save space, the distance matrix is given in the
2486 form of a ragged array. The distance matrix is symmetric and has zeros
2487 on the diagonal. See distancematrix for a description of the content.
2488
2489 npass (input) int
2490 The number of times clustering is performed. Clustering is performed npass
2491 times, each time starting from a different (random) initial assignment of genes
2492 to clusters. The clustering solution with the lowest within-cluster sum of
2493 distances is chosen.
2494 If npass==0, then the clustering algorithm will be run once, where the initial
2495 assignment of elements to clusters is taken from the clusterid array.
2496
2497 clusterid (output; input) int[nelements]
2498 On input, if npass==0, then clusterid contains the initial clustering assignment
2499 from which the clustering algorithm starts; all numbers in clusterid should be
2500 between zero and nelements-1 inclusive. If npass!=0, clusterid is ignored on
2501 input.
2502 On output, clusterid contains the clustering solution that was found: clusterid
2503 contains the number of the cluster to which each item was assigned. On output,
2504 the number of a cluster is defined as the item number of the centroid of the
2505 cluster.
2506
2507 error (output) float
2508 The sum of distances to the cluster center of each item in the optimal k-medoids
2509 clustering solution that was found.
2510
2511 ifound (output) int
2512 If kmedoids is successful: the number of times the optimal clustering solution
2513 was found. The value of ifound is at least 1; its maximum value is npass.
2514 If the user requested more clusters than elements available, ifound is set
2515 to 0. If kmedoids fails due to a memory allocation error, ifound is set to -1.
2516
2517 ========================================================================
2518 */
2519 { int i, j, icluster;
2520 int* tclusterid;
2521 int* saved;
2522 int* centroids;
2523 float* errors;
2524 int ipass = 0;
2525
2526 if (nelements < nclusters)
2527 { *ifound = 0;
2528 return;
2529 } /* More clusters asked for than elements available */
2530
2531 *ifound = -1;
2532
2533 /* We save the clustering solution periodically and check if it reappears */
2534 saved = malloc(nelements*sizeof(int));
2535 if (saved==NULL) return;
2536
2537 centroids = malloc(nclusters*sizeof(int));
2538 if(!centroids)
2539 { free(saved);
2540 return;
2541 }
2542
2543 errors = malloc(nclusters*sizeof(float));
2544 if(!errors)
2545 { free(saved);
2546 free(centroids);
2547 return;
2548 }
2549
2550 /* Find out if the user specified an initial clustering */
2551 if (npass<=1) tclusterid = clusterid;
2552 else
2553 { tclusterid = malloc(nelements*sizeof(int));
2554 if(!tclusterid)
2555 { free(saved);
2556 free(centroids);
2557 free(errors);
2558 return;
2559 }
2560 }
2561
2562 *error = FLT_MAX;
2563 do /* Start the loop */
2564 { float total = FLT_MAX;
2565 int counter = 0;
2566 int period = 10;
2567
2568 if (npass!=0) randomassign (nclusters, nelements, tclusterid);
2569 while(1)
2570 { float previous = total;
2571 total = 0.0;
2572
2573 if (counter % period == 0) /* Save the current cluster assignments */
2574 { for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
2575 if (period < INT_MAX / 2) period *= 2;
2576 }
2577 counter++;
2578
2579 /* Find the center */
2580 getclustermedoids(nclusters, nelements, distmatrix, tclusterid,
2581 centroids, errors);
2582
2583 for (i = 0; i < nelements; i++)
2584 /* Find the closest cluster */
2585 { float distance = FLT_MAX;
2586 for (icluster = 0; icluster < nclusters; icluster++)
2587 { float tdistance;
2588 j = centroids[icluster];
2589 if (i==j)
2590 { distance = 0.0;
2591 tclusterid[i] = icluster;
2592 break;
2593 }
2594 tdistance = (i > j) ? distmatrix[i][j] : distmatrix[j][i];
2595 if (tdistance < distance)
2596 { distance = tdistance;
2597 tclusterid[i] = icluster;
2598 }
2599 }
2600 total += distance;
2601 }
2602 if (total>=previous) break;
2603 /* total>=previous is FALSE on some machines even if total and previous
2604 * are bitwise identical. */
2605 for (i = 0; i < nelements; i++)
2606 if (saved[i]!=tclusterid[i]) break;
2607 if (i==nelements)
2608 break; /* Identical solution found; break out of this loop */
2609 }
2610
2611 for (i = 0; i < nelements; i++)
2612 { if (total < *error)
2613 { *ifound = 1;
2614 *error = total;
2615 /* Replace by the centroid in each cluster. */
2616 for (j = 0; j < nelements; j++) clusterid[j] = centroids[tclusterid[j]];
2617 break;
2618 }
2619 else if (clusterid[i]!=tclusterid[i]) break;
2620 }
2621 if (i==nelements) (*ifound)++; /* break statement not encountered */
2622 } while (++ipass < npass);
2623
2624 /* Deallocate temporarily used space */
2625 if (npass > 1) free(tclusterid);
2626
2627 free(saved);
2628 free(centroids);
2629 free(errors);
2630
2631 return;
2632 }
2633
2634 /* ******************************************************************** */
2635
distancematrix(int nrows,int ncolumns,float ** data,float weights[],char dist,int transpose)2636 float** CALL distancematrix (int nrows, int ncolumns, float** data,
2637 float weights[], char dist, int transpose)
2638 /*
2639 Purpose
2640 =======
2641
2642 The distancematrix routine calculates the distance matrix between genes or
2643 microarrays using their measured gene expression data. Several distance measures
2644 can be used. The routine returns a pointer to a ragged array containing the
2645 distances between the genes. As the distance matrix is symmetric, with zeros on
2646 the diagonal, only the lower triangular half of the distance matrix is saved.
2647 The distancematrix routine allocates space for the distance matrix. If the
2648 parameter transpose is set to a nonzero value, the distances between the columns
2649 (microarrays) are calculated, otherwise distances between the rows (genes) are
2650 calculated.
2651 If sufficient space in memory cannot be allocated to store the distance matrix,
2652 the routine returns a NULL pointer, and all memory allocated so far for the
2653 distance matrix is freed.
2654
2655
2656 Arguments
2657 =========
2658
2659 nrows (input) int
2660 The number of rows in the gene expression data matrix (i.e., the number of
2661 genes)
2662
2663 ncolumns (input) int
2664 The number of columns in the gene expression data matrix (i.e., the number of
2665 microarrays)
2666
2667 data (input) float[nrows][ncolumns]
2668 The array containing the gene expression data.
2669
2670 mask (input) int[nrows][ncolumns]
2671 This array shows which data values are missing. If mask[i][j]==0, then
2672 data[i][j] is missing.
2673
2674 weight (input) float[n]
2675 The weights that are used to calculate the distance. The length of this vector
2676 is equal to the number of columns if the distances between genes are calculated,
2677 or the number of rows if the distances between microarrays are calculated.
2678
2679 dist (input) char
2680 Defines which distance measure is used, as given by the table:
2681 dist=='e': Euclidean distance
2682 dist=='b': City-block distance
2683 dist=='c': correlation
2684 dist=='a': absolute value of the correlation
2685 dist=='u': uncentered correlation
2686 dist=='x': absolute uncentered correlation
2687 dist=='s': Spearman's rank correlation
2688 dist=='k': Kendall's tau
2689 For other values of dist, the default (Euclidean distance) is used.
2690
2691 transpose (input) int
2692 If transpose is equal to zero, the distances between the rows is
2693 calculated. Otherwise, the distances between the columns is calculated.
2694 The former is needed when genes are being clustered; the latter is used
2695 when microarrays are being clustered.
2696
2697 ========================================================================
2698 */
2699 { /* First determine the size of the distance matrix */
2700 const int n = (transpose==0) ? nrows : ncolumns;
2701 const int ndata = (transpose==0) ? ncolumns : nrows;
2702 int i,j;
2703 float** matrix;
2704
2705 /* Set the metric function as indicated by dist */
2706 float (*metric)
2707 (int, float**, float**, const float[], int, int, int) =
2708 setmetric(dist);
2709
2710 if (n < 2) return NULL;
2711
2712 /* Set up the ragged array */
2713 matrix = malloc(n*sizeof(float*));
2714 if(matrix==NULL) return NULL; /* Not enough memory available */
2715 matrix[0] = NULL;
2716 /* The zeroth row has zero columns. We allocate it anyway for convenience.*/
2717 for (i = 1; i < n; i++)
2718 { matrix[i] = malloc(i*sizeof(float));
2719 if (matrix[i]==NULL) break; /* Not enough memory available */
2720 }
2721 if (i < n) /* break condition encountered */
2722 { j = i;
2723 for (i = 1; i < j; i++) free(matrix[i]);
2724 return NULL;
2725 }
2726
2727 /* Calculate the distances and save them in the ragged array */
2728 for (i = 1; i < n; i++)
2729 for (j = 0; j < i; j++)
2730 matrix[i][j]=metric(ndata,data,data,weights,i,j,transpose);
2731
2732 return matrix;
2733 }
2734
2735 /* ******************************************************************** */
2736
calculate_weights(int nrows,int ncolumns,float ** data,float weights[],int transpose,char dist,float cutoff,float exponent)2737 float* calculate_weights(int nrows, int ncolumns, float** data,
2738 float weights[], int transpose, char dist, float cutoff, float exponent)
2739
2740 /*
2741 Purpose
2742 =======
2743
2744 This function calculates the weights using the weighting scheme proposed by
2745 Michael Eisen:
2746 w[i] = 1.0 / sum_{j where d[i][j]<cutoff} (1 - d[i][j]/cutoff)^exponent
2747 where the cutoff and the exponent are specified by the user.
2748
2749
2750 Arguments
2751 =========
2752
2753 nrows (input) int
2754 The number of rows in the gene expression data matrix, equal to the number of
2755 genes.
2756
2757 ncolumns (input) int
2758 The number of columns in the gene expression data matrix, equal to the number of
2759 microarrays.
2760
2761 data (input) float[nrows][ncolumns]
2762 The array containing the gene expression data.
2763
2764 mask (input) int[nrows][ncolumns]
2765 This array shows which data values are missing. If mask[i][j]==0, then
2766 data[i][j] is missing.
2767
2768 weight (input) int[ncolumns] if transpose==0,
2769 int[nrows] if transpose==1
2770 The weights that are used to calculate the distance. The length of this vector
2771 is ncolumns if gene weights are being clustered, and nrows if microarrays
2772 weights are being clustered.
2773
2774 transpose (input) int
2775 If transpose==0, the weights of the rows of the data matrix are calculated.
2776 Otherwise, the weights of the columns of the data matrix are calculated.
2777
2778 dist (input) char
2779 Defines which distance measure is used, as given by the table:
2780 dist=='e': Euclidean distance
2781 dist=='b': City-block distance
2782 dist=='c': correlation
2783 dist=='a': absolute value of the correlation
2784 dist=='u': uncentered correlation
2785 dist=='x': absolute uncentered correlation
2786 dist=='s': Spearman's rank correlation
2787 dist=='k': Kendall's tau
2788 For other values of dist, the default (Euclidean distance) is used.
2789
2790 cutoff (input) float
2791 The cutoff to be used to calculate the weights.
2792
2793 exponent (input) float
2794 The exponent to be used to calculate the weights.
2795
2796
2797 Return value
2798 ============
2799
2800 The function returns a pointer to a newly allocated array containing the
2801 calculated weights for the rows (if transpose==0) or columns (if
2802 transpose==1). If not enough memory could be allocated to store the
2803 weights array, the function returns NULL.
2804
2805 ========================================================================
2806 */
2807 { int i,j;
2808 const int ndata = (transpose==0) ? ncolumns : nrows;
2809 const int nelements = (transpose==0) ? nrows : ncolumns;
2810
2811 /* Set the metric function as indicated by dist */
2812 float (*metric)
2813 (int, float**, float**, const float[], int, int, int) =
2814 setmetric(dist);
2815
2816 float* result = malloc(nelements*sizeof(float));
2817 if (!result) return NULL;
2818 memset(result, 0, nelements*sizeof(float));
2819
2820 for (i = 0; i < nelements; i++)
2821 { result[i] += 1.0;
2822 for (j = 0; j < i; j++)
2823 { const float distance = metric(ndata, data, data, weights,
2824 i, j, transpose);
2825 if (distance < cutoff)
2826 { const float dweight = exp(exponent*log(1-distance/cutoff));
2827 /* pow() causes a crash on AIX */
2828 result[i] += dweight;
2829 result[j] += dweight;
2830 }
2831 }
2832 }
2833 for (i = 0; i < nelements; i++) result[i] = 1.0/result[i];
2834 return result;
2835 }
2836
2837 /* ******************************************************************** */
2838
cuttree(int nelements,Node * tree,int nclusters,int clusterid[])2839 void cuttree (int nelements, Node* tree, int nclusters, int clusterid[])
2840
2841 /*
2842 Purpose
2843 =======
2844
2845 The cuttree routine takes the output of a hierarchical clustering routine, and
2846 divides the elements in the tree structure into clusters based on the
2847 hierarchical clustering result. The number of clusters is specified by the user.
2848
2849 Arguments
2850 =========
2851
2852 nelements (input) int
2853 The number of elements that were clustered.
2854
2855 tree (input) Node[nelements-1]
2856 The clustering solution. Each node in the array describes one linking event,
2857 with tree[i].left and tree[i].right representig the elements that were joined.
2858 The original elements are numbered 0..nelements-1, nodes are numbered
2859 -1..-(nelements-1).
2860
2861 nclusters (input) int
2862 The number of clusters to be formed.
2863
2864 clusterid (output) int[nelements]
2865 The number of the cluster to which each element was assigned. Space for this
2866 array should be allocated before calling the cuttree routine. If a memory
2867 error occured, all elements in clusterid are set to -1.
2868
2869 ========================================================================
2870 */
2871 { int i, j, k;
2872 int icluster = 0;
2873 const int n = nelements-nclusters; /* number of nodes to join */
2874 int* nodeid;
2875 for (i = nelements-2; i >= n; i--)
2876 { k = tree[i].left;
2877 if (k>=0)
2878 { clusterid[k] = icluster;
2879 icluster++;
2880 }
2881 k = tree[i].right;
2882 if (k>=0)
2883 { clusterid[k] = icluster;
2884 icluster++;
2885 }
2886 }
2887 nodeid = malloc(n*sizeof(int));
2888 if(!nodeid)
2889 { for (i = 0; i < nelements; i++) clusterid[i] = -1;
2890 return;
2891 }
2892 for (i = 0; i < n; i++) nodeid[i] = -1;
2893 for (i = n-1; i >= 0; i--)
2894 { if(nodeid[i]<0)
2895 { j = icluster;
2896 nodeid[i] = j;
2897 icluster++;
2898 }
2899 else j = nodeid[i];
2900 k = tree[i].left;
2901 if (k<0) nodeid[-k-1] = j; else clusterid[k] = j;
2902 k = tree[i].right;
2903 if (k<0) nodeid[-k-1] = j; else clusterid[k] = j;
2904 }
2905 free(nodeid);
2906 return;
2907 }
2908
2909 /* ******************************************************************** */
2910
2911 static
pclcluster(int nrows,int ncolumns,float ** data,float weight[],float ** distmatrix,char dist,int transpose)2912 Node* pclcluster (int nrows, int ncolumns, float** data,
2913 float weight[], float** distmatrix, char dist, int transpose)
2914
2915 /*
2916
2917 Purpose
2918 =======
2919
2920 The pclcluster routine performs clustering using pairwise centroid-linking
2921 on a given set of gene expression data, using the distance metric given by dist.
2922
2923 Arguments
2924 =========
2925
2926 nrows (input) int
2927 The number of rows in the gene expression data matrix, equal to the number of
2928 genes.
2929
2930 ncolumns (input) int
2931 The number of columns in the gene expression data matrix, equal to the number of
2932 microarrays.
2933
2934 data (input) float[nrows][ncolumns]
2935 The array containing the gene expression data.
2936
2937 mask (input) int[nrows][ncolumns]
2938 This array shows which data values are missing. If
2939 mask[i][j] == 0, then data[i][j] is missing.
2940
2941 weight (input) float[ncolumns] if transpose==0;
2942 float[nrows] if transpose==1
2943 The weights that are used to calculate the distance. The length of this vector
2944 is ncolumns if genes are being clustered, and nrows if microarrays are being
2945 clustered.
2946
2947 transpose (input) int
2948 If transpose==0, the rows of the matrix are clustered. Otherwise, columns
2949 of the matrix are clustered.
2950
2951 dist (input) char
2952 Defines which distance measure is used, as given by the table:
2953 dist=='e': Euclidean distance
2954 dist=='b': City-block distance
2955 dist=='c': correlation
2956 dist=='a': absolute value of the correlation
2957 dist=='u': uncentered correlation
2958 dist=='x': absolute uncentered correlation
2959 dist=='s': Spearman's rank correlation
2960 dist=='k': Kendall's tau
2961 For other values of dist, the default (Euclidean distance) is used.
2962
2963 distmatrix (input) float**
2964 The distance matrix. This matrix is precalculated by the calling routine
2965 treecluster. The pclcluster routine modifies the contents of distmatrix, but
2966 does not deallocate it.
2967
2968 Return value
2969 ============
2970
2971 A pointer to a newly allocated array of Node structs, describing the
2972 hierarchical clustering solution consisting of nelements-1 nodes. Depending on
2973 whether genes (rows) or microarrays (columns) were clustered, nelements is
2974 equal to nrows or ncolumns. See src/cluster.h for a description of the Node
2975 structure.
2976 If a memory error occurs, pclcluster returns NULL.
2977 ========================================================================
2978 */
2979 { int i, j;
2980 const int nelements = (transpose==0) ? nrows : ncolumns;
2981 int inode;
2982 const int ndata = transpose ? nrows : ncolumns;
2983 const int nnodes = nelements - 1;
2984
2985 /* Set the metric function as indicated by dist */
2986 float (*metric)
2987 (int, float**, float**, const float[], int, int, int) =
2988 setmetric(dist);
2989
2990 Node* result;
2991 float** newdata;
2992
2993 int* distid = malloc(nelements*sizeof(int));
2994 if(!distid) return NULL;
2995 result = malloc(nnodes*sizeof(Node));
2996 if(!result)
2997 { free(distid);
2998 return NULL;
2999 }
3000 if(!makedatamask(nelements, ndata, &newdata))
3001 { free(result);
3002 free(distid);
3003 return NULL;
3004 }
3005
3006 for (i = 0; i < nelements; i++) distid[i] = i;
3007 /* To remember which row/column in the distance matrix contains what */
3008
3009 /* Storage for node data */
3010 if (transpose)
3011 { for (i = 0; i < nelements; i++)
3012 { for (j = 0; j < ndata; j++)
3013 { newdata[i][j] = data[j][i];
3014
3015 }
3016 }
3017 data = newdata;
3018
3019 }
3020 else
3021 { for (i = 0; i < nelements; i++)
3022 { memcpy(newdata[i], data[i], ndata*sizeof(float));
3023
3024 }
3025 data = newdata;
3026
3027 }
3028
3029 for (inode = 0; inode < nnodes; inode++)
3030 { /* Find the pair with the shortest distance */
3031 int is = 1;
3032 int js = 0;
3033 result[inode].distance = find_closest_pair(nelements-inode, distmatrix, &is, &js);
3034 result[inode].left = distid[js];
3035 result[inode].right = distid[is];
3036
3037 /* Make node js the new node */
3038 for (i = 0; i < ndata; i++)
3039 { data[js][i] = data[js][i] + data[is][i];
3040
3041
3042 }
3043 free(data[is]);
3044
3045 data[is] = data[nnodes-inode];
3046
3047
3048 /* Fix the distances */
3049 distid[is] = distid[nnodes-inode];
3050 for (i = 0; i < is; i++)
3051 distmatrix[is][i] = distmatrix[nnodes-inode][i];
3052 for (i = is + 1; i < nnodes-inode; i++)
3053 distmatrix[i][is] = distmatrix[nnodes-inode][i];
3054
3055 distid[js] = -inode-1;
3056 for (i = 0; i < js; i++)
3057 distmatrix[js][i] = metric(ndata,data,data,weight,js,i,0);
3058 for (i = js + 1; i < nnodes-inode; i++)
3059 distmatrix[i][js] = metric(ndata,data,data,weight,js,i,0);
3060 }
3061
3062 /* Free temporarily allocated space */
3063 free(data[0]);
3064
3065 free(data);
3066
3067 free(distid);
3068
3069 return result;
3070 }
3071
3072 /* ******************************************************************** */
3073
3074 static
nodecompare(const void * a,const void * b)3075 int nodecompare(const void* a, const void* b)
3076 /* Helper function for qsort. */
3077 { const Node* node1 = (const Node*)a;
3078 const Node* node2 = (const Node*)b;
3079 const float term1 = node1->distance;
3080 const float term2 = node2->distance;
3081 if (term1 < term2) return -1;
3082 if (term1 > term2) return +1;
3083 return 0;
3084 }
3085
3086 /* ---------------------------------------------------------------------- */
3087
3088 static
pslcluster(int nrows,int ncolumns,float ** data,float weight[],float ** distmatrix,char dist,int transpose)3089 Node* pslcluster (int nrows, int ncolumns, float** data,
3090 float weight[], float** distmatrix, char dist, int transpose)
3091
3092 /*
3093
3094 Purpose
3095 =======
3096
3097 The pslcluster routine performs single-linkage hierarchical clustering, using
3098 either the distance matrix directly, if available, or by calculating the
3099 distances from the data array. This implementation is based on the SLINK
3100 algorithm, described in:
3101 Sibson, R. (1973). SLINK: An optimally efficient algorithm for the single-link
3102 cluster method. The Computer Journal, 16(1): 30-34.
3103 The output of this algorithm is identical to conventional single-linkage
3104 hierarchical clustering, but is much more memory-efficient and faster. Hence,
3105 it can be applied to large data sets, for which the conventional single-
3106 linkage algorithm fails due to lack of memory.
3107
3108
3109 Arguments
3110 =========
3111
3112 nrows (input) int
3113 The number of rows in the gene expression data matrix, equal to the number of
3114 genes.
3115
3116 ncolumns (input) int
3117 The number of columns in the gene expression data matrix, equal to the number of
3118 microarrays.
3119
3120 data (input) float[nrows][ncolumns]
3121 The array containing the gene expression data.
3122
3123 mask (input) int[nrows][ncolumns]
3124 This array shows which data values are missing. If
3125 mask[i][j] == 0, then data[i][j] is missing.
3126
3127 weight (input) float[n]
3128 The weights that are used to calculate the distance. The length of this vector
3129 is ncolumns if genes are being clustered, and nrows if microarrays are being
3130 clustered.
3131
3132 transpose (input) int
3133 If transpose==0, the rows of the matrix are clustered. Otherwise, columns
3134 of the matrix are clustered.
3135
3136 dist (input) char
3137 Defines which distance measure is used, as given by the table:
3138 dist=='e': Euclidean distance
3139 dist=='b': City-block distance
3140 dist=='c': correlation
3141 dist=='a': absolute value of the correlation
3142 dist=='u': uncentered correlation
3143 dist=='x': absolute uncentered correlation
3144 dist=='s': Spearman's rank correlation
3145 dist=='k': Kendall's tau
3146 For other values of dist, the default (Euclidean distance) is used.
3147
3148 distmatrix (input) float**
3149 The distance matrix. If the distance matrix is passed by the calling routine
3150 treecluster, it is used by pslcluster to speed up the clustering calculation.
3151 The pslcluster routine does not modify the contents of distmatrix, and does
3152 not deallocate it. If distmatrix is NULL, the pairwise distances are calculated
3153 by the pslcluster routine from the gene expression data (the data and mask
3154 arrays) and stored in temporary arrays. If distmatrix is passed, the original
3155 gene expression data (specified by the data and mask arguments) are not needed
3156 and are therefore ignored.
3157
3158
3159 Return value
3160 ============
3161
3162 A pointer to a newly allocated array of Node structs, describing the
3163 hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3164 whether genes (rows) or microarrays (columns) were clustered, nelements is
3165 equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3166 structure.
3167 If a memory error occurs, pslcluster returns NULL.
3168
3169 ========================================================================
3170 */
3171 { int i, j, k;
3172 const int nelements = transpose ? ncolumns : nrows;
3173 const int nnodes = nelements - 1;
3174 int* vector;
3175 float* temp;
3176 int* index;
3177 Node* result;
3178 temp = malloc(nnodes*sizeof(float));
3179 if(!temp) return NULL;
3180 index = malloc(nelements*sizeof(int));
3181 if(!index)
3182 { free(temp);
3183 return NULL;
3184 }
3185 vector = malloc(nnodes*sizeof(int));
3186 if(!vector)
3187 { free(index);
3188 free(temp);
3189 return NULL;
3190 }
3191 result = malloc(nnodes*sizeof(Node));
3192 if(!result)
3193 { free(vector);
3194 free(index);
3195 free(temp);
3196 return NULL;
3197 }
3198
3199 for (i = 0; i < nnodes; i++)
3200 { vector[i] = i;
3201 result[i].distance = FLT_MAX;
3202 }
3203
3204 if(distmatrix)
3205 { for (i = 0; i < nrows; i++)
3206 { for (j = 0; j < i; j++) temp[j] = distmatrix[i][j];
3207 for (j = 0; j < i; j++)
3208 { k = vector[j];
3209 if (result[j].distance >= temp[j])
3210 { if (result[j].distance < temp[k]) temp[k] = result[j].distance;
3211 result[j].distance = temp[j];
3212 vector[j] = i;
3213 }
3214 else if (temp[j] < temp[k]) temp[k] = temp[j];
3215 }
3216 for (j = 0; j < i; j++)
3217 if (result[j].distance >= result[vector[j]].distance) vector[j] = i;
3218 }
3219 }
3220 else
3221 { const int ndata = transpose ? nrows : ncolumns;
3222 /* Set the metric function as indicated by dist */
3223 float (*metric)
3224 (int, float**, float**, const float[], int, int, int) =
3225 setmetric(dist);
3226
3227 for (i = 0; i < nelements; i++)
3228 { for (j = 0; j < i; j++) temp[j] =
3229 metric(ndata, data, data, weight, i, j, transpose);
3230 for (j = 0; j < i; j++)
3231 { k = vector[j];
3232 if (result[j].distance >= temp[j])
3233 { if (result[j].distance < temp[k]) temp[k] = result[j].distance;
3234 result[j].distance = temp[j];
3235 vector[j] = i;
3236 }
3237 else if (temp[j] < temp[k]) temp[k] = temp[j];
3238 }
3239 for (j = 0; j < i; j++)
3240 if (result[j].distance >= result[vector[j]].distance) vector[j] = i;
3241 }
3242 }
3243 free(temp);
3244
3245 for (i = 0; i < nnodes; i++) result[i].left = i;
3246 qsort(result, nnodes, sizeof(Node), nodecompare);
3247
3248 for (i = 0; i < nelements; i++) index[i] = i;
3249 for (i = 0; i < nnodes; i++)
3250 { j = result[i].left;
3251 k = vector[j];
3252 result[i].left = index[j];
3253 result[i].right = index[k];
3254 index[k] = -i-1;
3255 }
3256 free(vector);
3257 free(index);
3258
3259 return result;
3260 }
3261 /* ******************************************************************** */
3262
pmlcluster(int nelements,float ** distmatrix)3263 static Node* pmlcluster (int nelements, float** distmatrix)
3264 /*
3265
3266 Purpose
3267 =======
3268
3269 The pmlcluster routine performs clustering using pairwise maximum- (complete-)
3270 linking on the given distance matrix.
3271
3272 Arguments
3273 =========
3274
3275 nelements (input) int
3276 The number of elements to be clustered.
3277
3278 distmatrix (input) float**
3279 The distance matrix, with nelements rows, each row being filled up to the
3280 diagonal. The elements on the diagonal are not used, as they are assumed to be
3281 zero. The distance matrix will be modified by this routine.
3282
3283 Return value
3284 ============
3285
3286 A pointer to a newly allocated array of Node structs, describing the
3287 hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3288 whether genes (rows) or microarrays (columns) were clustered, nelements is
3289 equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3290 structure.
3291 If a memory error occurs, pmlcluster returns NULL.
3292 ========================================================================
3293 */
3294 { int j;
3295 int n;
3296 int* clusterid;
3297 Node* result;
3298
3299 clusterid = malloc(nelements*sizeof(int));
3300 if(!clusterid) return NULL;
3301 result = malloc((nelements-1)*sizeof(Node));
3302 if (!result)
3303 { free(clusterid);
3304 return NULL;
3305 }
3306
3307 /* Setup a list specifying to which cluster a gene belongs */
3308 for (j = 0; j < nelements; j++) clusterid[j] = j;
3309
3310 for (n = nelements; n > 1; n--)
3311 { int is = 1;
3312 int js = 0;
3313 result[nelements-n].distance = find_closest_pair(n, distmatrix, &is, &js);
3314
3315 /* Fix the distances */
3316 for (j = 0; j < js; j++)
3317 distmatrix[js][j] = max(distmatrix[is][j],distmatrix[js][j]);
3318 for (j = js+1; j < is; j++)
3319 distmatrix[j][js] = max(distmatrix[is][j],distmatrix[j][js]);
3320 for (j = is+1; j < n; j++)
3321 distmatrix[j][js] = max(distmatrix[j][is],distmatrix[j][js]);
3322
3323 for (j = 0; j < is; j++) distmatrix[is][j] = distmatrix[n-1][j];
3324 for (j = is+1; j < n-1; j++) distmatrix[j][is] = distmatrix[n-1][j];
3325
3326 /* Update clusterids */
3327 result[nelements-n].left = clusterid[is];
3328 result[nelements-n].right = clusterid[js];
3329 clusterid[js] = n-nelements-1;
3330 clusterid[is] = clusterid[n-1];
3331 }
3332 free(clusterid);
3333
3334 return result;
3335 }
3336
3337 /* ******************************************************************* */
3338
palcluster(int nelements,float ** distmatrix)3339 static Node* palcluster (int nelements, float** distmatrix)
3340 /*
3341 Purpose
3342 =======
3343
3344 The palcluster routine performs clustering using pairwise average
3345 linking on the given distance matrix.
3346
3347 Arguments
3348 =========
3349
3350 nelements (input) int
3351 The number of elements to be clustered.
3352
3353 distmatrix (input) float**
3354 The distance matrix, with nelements rows, each row being filled up to the
3355 diagonal. The elements on the diagonal are not used, as they are assumed to be
3356 zero. The distance matrix will be modified by this routine.
3357
3358 Return value
3359 ============
3360
3361 A pointer to a newly allocated array of Node structs, describing the
3362 hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3363 whether genes (rows) or microarrays (columns) were clustered, nelements is
3364 equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3365 structure.
3366 If a memory error occurs, palcluster returns NULL.
3367 ========================================================================
3368 */
3369 { int j;
3370 int n;
3371 int* clusterid;
3372 int* number;
3373 Node* result;
3374
3375 clusterid = malloc(nelements*sizeof(int));
3376 if(!clusterid) return NULL;
3377 number = malloc(nelements*sizeof(int));
3378 if(!number)
3379 { free(clusterid);
3380 return NULL;
3381 }
3382 result = malloc((nelements-1)*sizeof(Node));
3383 if (!result)
3384 { free(clusterid);
3385 free(number);
3386 return NULL;
3387 }
3388
3389 /* Setup a list specifying to which cluster a gene belongs, and keep track
3390 * of the number of elements in each cluster (needed to calculate the
3391 * average). */
3392 for (j = 0; j < nelements; j++)
3393 { number[j] = 1;
3394 clusterid[j] = j;
3395 }
3396
3397 for (n = nelements; n > 1; n--)
3398 { int sum;
3399 int is = 1;
3400 int js = 0;
3401 result[nelements-n].distance = find_closest_pair(n, distmatrix, &is, &js);
3402
3403 /* Save result */
3404 result[nelements-n].left = clusterid[is];
3405 result[nelements-n].right = clusterid[js];
3406
3407 /* Fix the distances */
3408 sum = number[is] + number[js];
3409 for (j = 0; j < js; j++)
3410 { distmatrix[js][j] = distmatrix[is][j]*number[is]
3411 + distmatrix[js][j]*number[js];
3412 distmatrix[js][j] /= sum;
3413 }
3414 for (j = js+1; j < is; j++)
3415 { distmatrix[j][js] = distmatrix[is][j]*number[is]
3416 + distmatrix[j][js]*number[js];
3417 distmatrix[j][js] /= sum;
3418 }
3419 for (j = is+1; j < n; j++)
3420 { distmatrix[j][js] = distmatrix[j][is]*number[is]
3421 + distmatrix[j][js]*number[js];
3422 distmatrix[j][js] /= sum;
3423 }
3424
3425 for (j = 0; j < is; j++) distmatrix[is][j] = distmatrix[n-1][j];
3426 for (j = is+1; j < n-1; j++) distmatrix[j][is] = distmatrix[n-1][j];
3427
3428 /* Update number of elements in the clusters */
3429 number[js] = sum;
3430 number[is] = number[n-1];
3431
3432 /* Update clusterids */
3433 clusterid[js] = n-nelements-1;
3434 clusterid[is] = clusterid[n-1];
3435 }
3436 free(clusterid);
3437 free(number);
3438
3439 return result;
3440 }
3441
3442 /* ******************************************************************* */
3443
treecluster(int nrows,int ncolumns,float ** data,float weight[],int transpose,char dist,char method,float ** distmatrix)3444 Node* CALL treecluster (int nrows, int ncolumns, float** data,
3445 float weight[], int transpose, char dist, char method, float** distmatrix)
3446 /*
3447 Purpose
3448 =======
3449
3450 The treecluster routine performs hierarchical clustering using pairwise
3451 single-, maximum-, centroid-, or average-linkage, as defined by method, on a
3452 given set of gene expression data, using the distance metric given by dist.
3453 If successful, the function returns a pointer to a newly allocated Tree struct
3454 containing the hierarchical clustering solution, and NULL if a memory error
3455 occurs. The pointer should be freed by the calling routine to prevent memory
3456 leaks.
3457
3458 Arguments
3459 =========
3460
3461 nrows (input) int
3462 The number of rows in the data matrix, equal to the number of genes.
3463
3464 ncolumns (input) int
3465 The number of columns in the data matrix, equal to the number of microarrays.
3466
3467 data (input) float[nrows][ncolumns]
3468 The array containing the data of the vectors to be clustered.
3469
3470 mask (input) int[nrows][ncolumns]
3471 This array shows which data values are missing. If mask[i][j]==0, then
3472 data[i][j] is missing.
3473
3474 weight (input) float array[n]
3475 The weights that are used to calculate the distance.
3476
3477 transpose (input) int
3478 If transpose==0, the rows of the matrix are clustered. Otherwise, columns
3479 of the matrix are clustered.
3480
3481 dist (input) char
3482 Defines which distance measure is used, as given by the table:
3483 dist=='e': Euclidean distance
3484 dist=='b': City-block distance
3485 dist=='c': correlation
3486 dist=='a': absolute value of the correlation
3487 dist=='u': uncentered correlation
3488 dist=='x': absolute uncentered correlation
3489 dist=='s': Spearman's rank correlation
3490 dist=='k': Kendall's tau
3491 For other values of dist, the default (Euclidean distance) is used.
3492
3493 method (input) char
3494 Defines which hierarchical clustering method is used:
3495 method=='s': pairwise single-linkage clustering
3496 method=='m': pairwise maximum- (or complete-) linkage clustering
3497 method=='a': pairwise average-linkage clustering
3498 method=='c': pairwise centroid-linkage clustering
3499 For the first three, either the distance matrix or the gene expression data is
3500 sufficient to perform the clustering algorithm. For pairwise centroid-linkage
3501 clustering, however, the gene expression data are always needed, even if the
3502 distance matrix itself is available.
3503
3504 distmatrix (input) float**
3505 The distance matrix. If the distance matrix is zero initially, the distance
3506 matrix will be allocated and calculated from the data by treecluster, and
3507 deallocated before treecluster returns. If the distance matrix is passed by the
3508 calling routine, treecluster will modify the contents of the distance matrix as
3509 part of the clustering algorithm, but will not deallocate it. The calling
3510 routine should deallocate the distance matrix after the return from treecluster.
3511
3512 Return value
3513 ============
3514
3515 A pointer to a newly allocated array of Node structs, describing the
3516 hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3517 whether genes (rows) or microarrays (columns) were clustered, nelements is
3518 equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3519 structure.
3520 If a memory error occurs, treecluster returns NULL.
3521
3522 ========================================================================
3523 */
3524 { Node* result = NULL;
3525 const int nelements = (transpose==0) ? nrows : ncolumns;
3526 const int ldistmatrix = (distmatrix==NULL && method!='s') ? 1 : 0;
3527
3528 if (nelements < 2) return NULL;
3529
3530 /* Calculate the distance matrix if the user didn't give it */
3531 if(ldistmatrix)
3532 { distmatrix =
3533 distancematrix(nrows, ncolumns, data, weight, dist, transpose);
3534 if (!distmatrix) return NULL; /* Insufficient memory */
3535 }
3536
3537 switch(method)
3538 { case 's':
3539 result = pslcluster(nrows, ncolumns, data, weight, distmatrix,
3540 dist, transpose);
3541 break;
3542 case 'm':
3543 result = pmlcluster(nelements, distmatrix);
3544 break;
3545 case 'a':
3546 result = palcluster(nelements, distmatrix);
3547 break;
3548 case 'c':
3549 result = pclcluster(nrows, ncolumns, data, weight, distmatrix,
3550 dist, transpose);
3551 break;
3552 }
3553
3554 /* Deallocate space for distance matrix, if it was allocated by treecluster */
3555 if(ldistmatrix)
3556 { int i;
3557 for (i = 1; i < nelements; i++) free(distmatrix[i]);
3558 free (distmatrix);
3559 }
3560
3561 return result;
3562 }
3563
3564 /* ******************************************************************* */
3565
3566
3567
3568 /* ******************************************************************* */
3569
3570
3571
3572 /* ******************************************************************* */
3573
3574
3575
3576 /* ******************************************************************** */
3577
clusterdistance(int nrows,int ncolumns,float ** data,float weight[],int n1,int n2,int index1[],int index2[],char dist,char method,int transpose)3578 float CALL clusterdistance (int nrows, int ncolumns, float** data,
3579 float weight[], int n1, int n2, int index1[], int index2[],
3580 char dist, char method, int transpose)
3581
3582 /*
3583 Purpose
3584 =======
3585
3586 The clusterdistance routine calculates the distance between two clusters
3587 containing genes or microarrays using the measured gene expression vectors. The
3588 distance between clusters, given the genes/microarrays in each cluster, can be
3589 defined in several ways. Several distance measures can be used.
3590
3591 The routine returns the distance in float precision.
3592 If the parameter transpose is set to a nonzero value, the clusters are
3593 interpreted as clusters of microarrays, otherwise as clusters of gene.
3594
3595 Arguments
3596 =========
3597
3598 nrows (input) int
3599 The number of rows (i.e., the number of genes) in the gene expression data
3600 matrix.
3601
3602 ncolumns (input) int
3603 The number of columns (i.e., the number of microarrays) in the gene expression
3604 data matrix.
3605
3606 data (input) float[nrows][ncolumns]
3607 The array containing the data of the vectors.
3608
3609 mask (input) int[nrows][ncolumns]
3610 This array shows which data values are missing. If mask[i][j]==0, then
3611 data[i][j] is missing.
3612
3613 weight (input) float[ncolumns] if transpose==0;
3614 float[nrows] if transpose==1
3615 The weights that are used to calculate the distance.
3616
3617 n1 (input) int
3618 The number of elements in the first cluster.
3619
3620 n2 (input) int
3621 The number of elements in the second cluster.
3622
3623 index1 (input) int[n1]
3624 Identifies which genes/microarrays belong to the first cluster.
3625
3626 index2 (input) int[n2]
3627 Identifies which genes/microarrays belong to the second cluster.
3628
3629 dist (input) char
3630 Defines which distance measure is used, as given by the table:
3631 dist=='e': Euclidean distance
3632 dist=='b': City-block distance
3633 dist=='c': correlation
3634 dist=='a': absolute value of the correlation
3635 dist=='u': uncentered correlation
3636 dist=='x': absolute uncentered correlation
3637 dist=='s': Spearman's rank correlation
3638 dist=='k': Kendall's tau
3639 For other values of dist, the default (Euclidean distance) is used.
3640
3641 method (input) char
3642 Defines how the distance between two clusters is defined, given which genes
3643 belong to which cluster:
3644 method=='a': the distance between the arithmetic means of the two clusters
3645 method=='m': the distance between the medians of the two clusters
3646 method=='s': the smallest pairwise distance between members of the two clusters
3647 method=='x': the largest pairwise distance between members of the two clusters
3648 method=='v': average of the pairwise distances between members of the clusters
3649
3650 transpose (input) int
3651 If transpose is equal to zero, the distances between the rows is
3652 calculated. Otherwise, the distances between the columns is calculated.
3653 The former is needed when genes are being clustered; the latter is used
3654 when microarrays are being clustered.
3655
3656 ========================================================================
3657 */
3658 { /* Set the metric function as indicated by dist */
3659 float (*metric)
3660 (int, float**, float**, const float[], int, int, int) =
3661 setmetric(dist);
3662
3663 /* if one or both clusters are empty, return */
3664 if (n1 < 1 || n2 < 1) return -1.0;
3665 /* Check the indices */
3666 if (transpose==0)
3667 { int i;
3668 for (i = 0; i < n1; i++)
3669 { int index = index1[i];
3670 if (index < 0 || index >= nrows) return -1.0;
3671 }
3672 for (i = 0; i < n2; i++)
3673 { int index = index2[i];
3674 if (index < 0 || index >= nrows) return -1.0;
3675 }
3676 }
3677 else
3678 { int i;
3679 for (i = 0; i < n1; i++)
3680 { int index = index1[i];
3681 if (index < 0 || index >= ncolumns) return -1.0;
3682 }
3683 for (i = 0; i < n2; i++)
3684 { int index = index2[i];
3685 if (index < 0 || index >= ncolumns) return -1.0;
3686 }
3687 }
3688
3689 switch (method)
3690 { case 'a':
3691 { /* Find the center */
3692 int i,j,k;
3693 if (transpose==0)
3694 { float distance;
3695 float* cdata[2];
3696
3697 int* count[2];
3698 count[0] = calloc(ncolumns,sizeof(int));
3699 count[1] = calloc(ncolumns,sizeof(int));
3700 cdata[0] = calloc(ncolumns,sizeof(float));
3701 cdata[1] = calloc(ncolumns,sizeof(float));
3702
3703 for (i = 0; i < n1; i++)
3704 { k = index1[i];
3705 for (j = 0; j < ncolumns; j++)
3706 { cdata[0][j] = cdata[0][j] + data[k][j];
3707 count[0][j] = count[0][j] + 1;
3708 }
3709 }
3710 for (i = 0; i < n2; i++)
3711 { k = index2[i];
3712 for (j = 0; j < ncolumns; j++)
3713 { cdata[1][j] = cdata[1][j] + data[k][j];
3714 count[1][j] = count[1][j] + 1;
3715 }
3716 }
3717 for (i = 0; i < 2; i++)
3718 for (j = 0; j < ncolumns; j++)
3719 { if (count[i][j]>0)
3720 { cdata[i][j] = cdata[i][j] / count[i][j];
3721 }
3722
3723 }
3724 distance =
3725 metric (ncolumns,cdata,cdata,weight,0,1,0);
3726 for (i = 0; i < 2; i++)
3727 { free (cdata[i]);
3728
3729 free (count[i]);
3730 }
3731 return distance;
3732 }
3733 else
3734 { float distance;
3735 int** count = malloc(nrows*sizeof(int*));
3736 float** cdata = malloc(nrows*sizeof(float*));
3737
3738 for (i = 0; i < nrows; i++)
3739 { count[i] = calloc(2,sizeof(int));
3740 cdata[i] = calloc(2,sizeof(float));
3741
3742 }
3743 for (i = 0; i < n1; i++)
3744 { k = index1[i];
3745 for (j = 0; j < nrows; j++)
3746 { cdata[j][0] = cdata[j][0] + data[j][k];
3747 count[j][0] = count[j][0] + 1;
3748
3749 }
3750 }
3751 for (i = 0; i < n2; i++)
3752 { k = index2[i];
3753 for (j = 0; j < nrows; j++)
3754 { cdata[j][1] = cdata[j][1] + data[j][k];
3755 count[j][1] = count[j][1] + 1;
3756
3757 }
3758 }
3759 for (i = 0; i < nrows; i++)
3760 for (j = 0; j < 2; j++)
3761 if (count[i][j]>0)
3762 { cdata[i][j] = cdata[i][j] / count[i][j];
3763
3764 }
3765
3766 distance = metric (nrows,cdata,cdata,weight,0,1,1);
3767 for (i = 0; i < nrows; i++)
3768 { free (count[i]);
3769 free (cdata[i]);
3770
3771 }
3772 free (count);
3773 free (cdata);
3774
3775 return distance;
3776 }
3777 }
3778 case 'm':
3779 { int i, j, k;
3780 if (transpose==0)
3781 { float distance;
3782 float* temp = malloc(nrows*sizeof(float));
3783 float* cdata[2];
3784
3785 for (i = 0; i < 2; i++)
3786 { cdata[i] = malloc(ncolumns*sizeof(float));
3787
3788 }
3789 for (j = 0; j < ncolumns; j++)
3790 { int count = 0;
3791 for (k = 0; k < n1; k++)
3792 { i = index1[k];
3793 temp[count] = data[i][j];
3794 count++;
3795
3796 }
3797 if (count>0)
3798 { cdata[0][j] = median (count,temp);
3799
3800 }
3801 else
3802 { cdata[0][j] = 0.;
3803
3804 }
3805 }
3806 for (j = 0; j < ncolumns; j++)
3807 { int count = 0;
3808 for (k = 0; k < n2; k++)
3809 { i = index2[k];
3810 temp[count] = data[i][j];
3811 count++;
3812
3813 }
3814 if (count>0)
3815 { cdata[1][j] = median (count,temp);
3816
3817 }
3818 else
3819 { cdata[1][j] = 0.;
3820
3821 }
3822 }
3823 distance = metric (ncolumns,cdata,cdata,weight,0,1,0);
3824 for (i = 0; i < 2; i++)
3825 { free (cdata[i]);
3826
3827 }
3828 free(temp);
3829 return distance;
3830 }
3831 else
3832 { float distance;
3833 float* temp = malloc(ncolumns*sizeof(float));
3834 float** cdata = malloc(nrows*sizeof(float*));
3835
3836 for (i = 0; i < nrows; i++)
3837 { cdata[i] = malloc(2*sizeof(float));
3838
3839 }
3840 for (j = 0; j < nrows; j++)
3841 { int count = 0;
3842 for (k = 0; k < n1; k++)
3843 { i = index1[k];
3844 temp[count] = data[j][i];
3845 count++;
3846
3847 }
3848 if (count>0)
3849 { cdata[j][0] = median (count,temp);
3850
3851 }
3852 else
3853 { cdata[j][0] = 0.;
3854
3855 }
3856 }
3857 for (j = 0; j < nrows; j++)
3858 { int count = 0;
3859 for (k = 0; k < n2; k++)
3860 { i = index2[k];
3861
3862 temp[count] = data[j][i];
3863 count++;
3864
3865 }
3866 if (count>0)
3867 { cdata[j][1] = median (count,temp);
3868
3869 }
3870 else
3871 { cdata[j][1] = 0.;
3872
3873 }
3874 }
3875 distance = metric (nrows,cdata,cdata,weight,0,1,1);
3876 for (i = 0; i < nrows; i++)
3877 { free (cdata[i]);
3878
3879 }
3880 free(cdata);
3881
3882 free(temp);
3883 return distance;
3884 }
3885 }
3886 case 's':
3887 { int i1, i2, j1, j2;
3888 const int n = (transpose==0) ? ncolumns : nrows;
3889 float mindistance = FLT_MAX;
3890 for (i1 = 0; i1 < n1; i1++)
3891 for (i2 = 0; i2 < n2; i2++)
3892 { float distance;
3893 j1 = index1[i1];
3894 j2 = index2[i2];
3895 distance = metric (n,data,data,weight,j1,j2,transpose);
3896 if (distance < mindistance) mindistance = distance;
3897 }
3898 return mindistance;
3899 }
3900 case 'x':
3901 { int i1, i2, j1, j2;
3902 const int n = (transpose==0) ? ncolumns : nrows;
3903 float maxdistance = 0;
3904 for (i1 = 0; i1 < n1; i1++)
3905 for (i2 = 0; i2 < n2; i2++)
3906 { float distance;
3907 j1 = index1[i1];
3908 j2 = index2[i2];
3909 distance = metric (n,data,data,weight,j1,j2,transpose);
3910 if (distance > maxdistance) maxdistance = distance;
3911 }
3912 return maxdistance;
3913 }
3914 case 'v':
3915 { int i1, i2, j1, j2;
3916 const int n = (transpose==0) ? ncolumns : nrows;
3917 float distance = 0;
3918 for (i1 = 0; i1 < n1; i1++)
3919 for (i2 = 0; i2 < n2; i2++)
3920 { j1 = index1[i1];
3921 j2 = index2[i2];
3922 distance += metric (n,data,data,weight,j1,j2,transpose);
3923 }
3924 distance /= (n1*n2);
3925 return distance;
3926 }
3927 }
3928 /* Never get here */
3929 return -2.0;
3930 }
3931