1 /* The C clustering library.
2  * Copyright (C) 2002 Michiel Jan Laurens de Hoon.
3  *
4  * This library was written at the Laboratory of DNA Information Analysis,
5  * Human Genome Center, Institute of Medical Science, University of Tokyo,
6  * 4-6-1 Shirokanedai, Minato-ku, Tokyo 108-8639, Japan.
7  * Contact: mdehoon 'AT' gsc.riken.jp
8  *
9  * Permission to use, copy, modify, and distribute this software and its
10  * documentation with or without modifications and for any purpose and
11  * without fee is hereby granted, provided that any copyright notices
12  * appear in all copies and that both those copyright notices and this
13  * permission notice appear in supporting documentation, and that the
14  * names of the contributors or copyright holders not be used in
15  * advertising or publicity pertaining to distribution of the software
16  * without specific prior permission.
17  *
18  * THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
19  * WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
20  * WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
21  * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
22  * OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
23  * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
24  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
25  * OR PERFORMANCE OF THIS SOFTWARE.
26  *
27  */
28 
29 #include <time.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <math.h>
33 #include <float.h>
34 #include <limits.h>
35 #include <string.h>
36 #include "cluster_floatNOMASK.h"
37 #ifdef WINDOWS
38 #  include <windows.h>
39 #endif
40 
41 /* ************************************************************************ */
42 
43 #ifdef WINDOWS
44 /* Then we make a Windows DLL */
45 int WINAPI
clusterdll_init(HANDLE h,DWORD reason,void * foo)46 clusterdll_init (HANDLE h, DWORD reason, void* foo)
47 {
48   return 1;
49 }
50 #endif
51 
52 /* ************************************************************************ */
53 
mean(int n,float x[])54 float CALL mean(int n, float x[])
55 { float result = 0.;
56   int i;
57   for (i = 0; i < n; i++) result += x[i];
58   result /= n;
59   return result;
60 }
61 
62 /* ************************************************************************ */
63 
median(int n,float x[])64 float CALL median (int n, float x[])
65 /*
66 Find the median of X(1), ... , X(N), using as much of the quicksort
67 algorithm as is needed to isolate it.
68 N.B. On exit, the array X is partially ordered.
69 Based on Alan J. Miller's median.f90 routine.
70 */
71 
72 { int i, j;
73   int nr = n / 2;
74   int nl = nr - 1;
75   int even = 0;
76   /* hi & lo are position limits encompassing the median. */
77   int lo = 0;
78   int hi = n-1;
79 
80   if (n==2*nr) even = 1;
81   if (n<3)
82   { if (n<1) return 0.;
83     if (n == 1) return x[0];
84     return 0.5*(x[0]+x[1]);
85   }
86 
87   /* Find median of 1st, middle & last values. */
88   do
89   { int loop;
90     int mid = (lo + hi)/2;
91     float result = x[mid];
92     float xlo = x[lo];
93     float xhi = x[hi];
94     if (xhi<xlo)
95     { float temp = xlo;
96       xlo = xhi;
97       xhi = temp;
98     }
99     if (result>xhi) result = xhi;
100     else if (result<xlo) result = xlo;
101     /* The basic quicksort algorithm to move all values <= the sort key (XMED)
102      * to the left-hand end, and all higher values to the other end.
103      */
104     i = lo;
105     j = hi;
106     do
107     { while (x[i]<result) i++;
108       while (x[j]>result) j--;
109       loop = 0;
110       if (i<j)
111       { float temp = x[i];
112         x[i] = x[j];
113         x[j] = temp;
114         i++;
115         j--;
116         if (i<=j) loop = 1;
117       }
118     } while (loop); /* Decide which half the median is in. */
119 
120     if (even)
121     { if (j==nl && i==nr)
122         /* Special case, n even, j = n/2 & i = j + 1, so the median is
123          * between the two halves of the series.   Find max. of the first
124          * half & min. of the second half, then average.
125          */
126         { int k;
127           float xmax = x[0];
128           float xmin = x[n-1];
129           for (k = lo; k <= j; k++) xmax = max(xmax,x[k]);
130           for (k = i; k <= hi; k++) xmin = min(xmin,x[k]);
131           return 0.5*(xmin + xmax);
132         }
133       if (j<nl) lo = i;
134       if (i>nr) hi = j;
135       if (i==j)
136       { if (i==nl) lo = nl;
137         if (j==nr) hi = nr;
138       }
139     }
140     else
141     { if (j<nr) lo = i;
142       if (i>nr) hi = j;
143       /* Test whether median has been isolated. */
144       if (i==j && i==nr) return result;
145     }
146   }
147   while (lo<hi-1);
148 
149   if (even) return (0.5*(x[nl]+x[nr]));
150   if (x[lo]>x[hi])
151   { float temp = x[lo];
152     x[lo] = x[hi];
153     x[hi] = temp;
154   }
155   return x[nr];
156 }
157 
158 /* ********************************************************************** */
159 
160 static const float* sortdata = NULL; /* used in the quicksort algorithm */
161 
162 /* ---------------------------------------------------------------------- */
163 
164 static
compare(const void * a,const void * b)165 int compare(const void* a, const void* b)
166 /* Helper function for sort. Previously, this was a nested function under
167  * sort, which is not allowed under ANSI C.
168  */
169 { const int i1 = *(const int*)a;
170   const int i2 = *(const int*)b;
171   const float term1 = sortdata[i1];
172   const float term2 = sortdata[i2];
173   if (term1 < term2) return -1;
174   if (term1 > term2) return +1;
175   return 0;
176 }
177 
178 /* ---------------------------------------------------------------------- */
179 
sort(int n,const float data[],int index[])180 void CALL sort(int n, const float data[], int index[])
181 /* Sets up an index table given the data, such that data[index[]] is in
182  * increasing order. Sorting is done on the indices; the array data
183  * is unchanged.
184  */
185 { int i;
186   sortdata = data;
187   for (i = 0; i < n; i++) index[i] = i;
188   qsort(index, n, sizeof(int), compare);
189 }
190 
191 /* ********************************************************************** */
192 
getrank(int n,float data[])193 static float* getrank (int n, float data[])
194 /* Calculates the ranks of the elements in the array data. Two elements with
195  * the same value get the same rank, equal to the average of the ranks had the
196  * elements different values. The ranks are returned as a newly allocated
197  * array that should be freed by the calling routine. If getrank fails due to
198  * a memory allocation error, it returns NULL.
199  */
200 { int i;
201   float* rank;
202   int* index;
203   rank = malloc(n*sizeof(float));
204   if (!rank) return NULL;
205   index = malloc(n*sizeof(int));
206   if (!index)
207   { free(rank);
208     return NULL;
209   }
210   /* Call sort to get an index table */
211   sort (n, data, index);
212   /* Build a rank table */
213   for (i = 0; i < n; i++) rank[index[i]] = i;
214   /* Fix for equal ranks */
215   i = 0;
216   while (i < n)
217   { int m;
218     float value = data[index[i]];
219     int j = i + 1;
220     while (j < n && data[index[j]] == value) j++;
221     m = j - i; /* number of equal ranks found */
222     value = rank[index[i]] + (m-1)/2.;
223     for (j = i; j < i + m; j++) rank[index[j]] = value;
224     i += m;
225   }
226   free (index);
227   return rank;
228 }
229 
230 /* ---------------------------------------------------------------------- */
231 
232 static int
makedatamask(int nrows,int ncols,float *** pdata)233 makedatamask(int nrows, int ncols, float*** pdata)
234 { int i;
235   float** data;
236   int** mask;
237   data = malloc(nrows*sizeof(float*));
238   if(!data) return 0;
239   mask = malloc(nrows*sizeof(int*));
240   if(!mask)
241   { free(data);
242     return 0;
243   }
244   for (i = 0; i < nrows; i++)
245   { data[i] = malloc(ncols*sizeof(float));
246     if(!data[i]) break;
247     mask[i] = malloc(ncols*sizeof(int));
248     if(!mask[i])
249     { free(data[i]);
250       break;
251     }
252   }
253   if (i==nrows) /* break not encountered */
254   { *pdata = data;
255     /* *pmask = mask;*/
256     return 1;
257   }
258   *pdata = NULL;
259   /*  *pmask = NULL;*/
260   nrows = i;
261   for (i = 0; i < nrows; i++)
262   { free(data[i]);
263     free(mask[i]);
264   }
265   free(data);
266   free(mask);
267   return 0;
268 }
269 
270 /* ---------------------------------------------------------------------- */
271 
272 static void
freedatamask(int n,float ** data)273 freedatamask(int n, float** data)
274 { int i;
275   for (i = 0; i < n; i++)
276   {
277     free(data[i]);
278   }
279 
280   free(data);
281 }
282 
283 /* ---------------------------------------------------------------------- */
284 
285 static
find_closest_pair(int n,float ** distmatrix,int * ip,int * jp)286 float find_closest_pair(int n, float** distmatrix, int* ip, int* jp)
287 /*
288 This function searches the distance matrix to find the pair with the shortest
289 distance between them. The indices of the pair are returned in ip and jp; the
290 distance itself is returned by the function.
291 
292 n          (input) int
293 The number of elements in the distance matrix.
294 
295 distmatrix (input) float**
296 A ragged array containing the distance matrix. The number of columns in each
297 row is one less than the row index.
298 
299 ip         (output) int*
300 A pointer to the integer that is to receive the first index of the pair with
301 the shortest distance.
302 
303 jp         (output) int*
304 A pointer to the integer that is to receive the second index of the pair with
305 the shortest distance.
306 */
307 { int i, j;
308   float temp;
309   float distance = distmatrix[1][0];
310   *ip = 1;
311   *jp = 0;
312   for (i = 1; i < n; i++)
313   { for (j = 0; j < i; j++)
314     { temp = distmatrix[i][j];
315       if (temp<distance)
316       { distance = temp;
317         *ip = i;
318         *jp = j;
319       }
320     }
321   }
322   return distance;
323 }
324 
325 /* ********************************************************************* */
326 
svd(int m,int n,float ** u,float w[],float ** v,int * ierr)327 void CALL svd(int m, int n, float** u, float w[], float** v, int* ierr)
328 /*
329  *   This subroutine is a translation of the Algol procedure svd,
330  *   Num. Math. 14, 403-420(1970) by Golub and Reinsch.
331  *   Handbook for Auto. Comp., Vol II-Linear Algebra, 134-151(1971).
332  *
333  *   This subroutine determines the singular value decomposition
334  *        t
335  *   A=usv  of a real m by n rectangular matrix, where m is greater
336  *   then or equal to n.  Householder bidiagonalization and a variant
337  *   of the QR algorithm are used.
338  *
339  *
340  *   On input.
341  *
342  *      m is the number of rows of A (and u).
343  *
344  *      n is the number of columns of A (and u) and the order of v.
345  *
346  *      u contains the rectangular input matrix A to be decomposed.
347  *
348  *   On output.
349  *
350  *      w contains the n (non-negative) singular values of a (the
351  *        diagonal elements of s).  they are unordered.  if an
352  *        error exit is made, the singular values should be correct
353  *        for indices ierr+1,ierr+2,...,n.
354  *
355  *      u contains the matrix u (orthogonal column vectors) of the
356  *        decomposition.
357  *        if an error exit is made, the columns of u corresponding
358  *        to indices of correct singular values should be correct.
359  *
360  *      v contains the matrix v (orthogonal) of the decomposition.
361  *        if an error exit is made, the columns of v corresponding
362  *        to indices of correct singular values should be correct.
363  *
364  *      ierr is set to
365  *        zero       for normal return,
366  *        k          if the k-th singular value has not been
367  *                   determined after 30 iterations,
368  *        -1         if memory allocation fails.
369  *
370  *   Questions and comments should be directed to B. S. Garbow,
371  *   Applied Mathematics division, Argonne National Laboratory
372  *
373  *   Modified to eliminate machep
374  *
375  *   Translated to C by Michiel de Hoon, Human Genome Center,
376  *   University of Tokyo, for inclusion in the C Clustering Library.
377  *   This routine is less general than the original svd routine, as
378  *   it focuses on the singular value decomposition as needed for
379  *   clustering. In particular,
380  *     - We require m >= n
381  *     - We calculate both u and v in all cases
382  *     - We pass the input array A via u; this array is subsequently
383  *       overwritten.
384  *     - We allocate for the array rv1, used as a working space,
385  *       internally in this routine, instead of passing it as an
386  *       argument. If the allocation fails, svd sets *ierr to -1
387  *       and returns.
388  *   2003.06.05
389  */
390 { int i, j, k, i1, k1, l1, its;
391   float c,f,h,s,x,y,z;
392   int l = 0;
393   float g = 0.0;
394   float scale = 0.0;
395   float anorm = 0.0;
396   float* rv1 = malloc(n*sizeof(float));
397   if (!rv1)
398   { *ierr = -1;
399     return;
400   }
401   *ierr = 0;
402   /* Householder reduction to bidiagonal form */
403   for (i = 0; i < n; i++)
404   { l = i + 1;
405     rv1[i] = scale * g;
406     g = 0.0;
407     s = 0.0;
408     scale = 0.0;
409     for (k = i; k < m; k++) scale += fabs(u[k][i]);
410     if (scale != 0.0)
411     { for (k = i; k < m; k++)
412       { u[k][i] /= scale;
413         s += u[k][i]*u[k][i];
414       }
415       f = u[i][i];
416       g = (f >= 0) ? -sqrt(s) : sqrt(s);
417       h = f * g - s;
418       u[i][i] = f - g;
419       if (i < n-1)
420       { for (j = l; j < n; j++)
421         { s = 0.0;
422           for (k = i; k < m; k++) s += u[k][i] * u[k][j];
423           f = s / h;
424           for (k = i; k < m; k++) u[k][j] += f * u[k][i];
425         }
426       }
427       for (k = i; k < m; k++) u[k][i] *= scale;
428     }
429     w[i] = scale * g;
430     g = 0.0;
431     s = 0.0;
432     scale = 0.0;
433     if (i<n-1)
434     { for (k = l; k < n; k++) scale += fabs(u[i][k]);
435       if (scale != 0.0)
436       { for (k = l; k < n; k++)
437         { u[i][k] /= scale;
438           s += u[i][k] * u[i][k];
439         }
440         f = u[i][l];
441         g = (f >= 0) ? -sqrt(s) : sqrt(s);
442         h = f * g - s;
443         u[i][l] = f - g;
444         for (k = l; k < n; k++) rv1[k] = u[i][k] / h;
445         for (j = l; j < m; j++)
446         { s = 0.0;
447           for (k = l; k < n; k++) s += u[j][k] * u[i][k];
448           for (k = l; k < n; k++) u[j][k] += s * rv1[k];
449         }
450         for (k = l; k < n; k++)  u[i][k] *= scale;
451       }
452     }
453     anorm = max(anorm,fabs(w[i])+fabs(rv1[i]));
454   }
455   /* accumulation of right-hand transformations */
456   for (i = n-1; i>=0; i--)
457   { if (i < n-1)
458     { if (g != 0.0)
459       { for (j = l; j < n; j++) v[j][i] = (u[i][j] / u[i][l]) / g;
460         /* float division avoids possible underflow */
461         for (j = l; j < n; j++)
462         { s = 0.0;
463           for (k = l; k < n; k++) s += u[i][k] * v[k][j];
464           for (k = l; k < n; k++) v[k][j] += s * v[k][i];
465         }
466       }
467     }
468     for (j = l; j < n; j++)
469     { v[i][j] = 0.0;
470       v[j][i] = 0.0;
471     }
472     v[i][i] = 1.0;
473     g = rv1[i];
474     l = i;
475   }
476   /* accumulation of left-hand transformations */
477   for (i = n-1; i >= 0; i--)
478   { l = i + 1;
479     g = w[i];
480     if (i!=n-1)
481       for (j = l; j < n; j++) u[i][j] = 0.0;
482     if (g!=0.0)
483     { if (i!=n-1)
484       { for (j = l; j < n; j++)
485         { s = 0.0;
486           for (k = l; k < m; k++) s += u[k][i] * u[k][j];
487           /* float division avoids possible underflow */
488           f = (s / u[i][i]) / g;
489           for (k = i; k < m; k++) u[k][j] += f * u[k][i];
490         }
491       }
492       for (j = i; j < m; j++) u[j][i] /= g;
493     }
494     else
495       for (j = i; j < m; j++) u[j][i] = 0.0;
496     u[i][i] += 1.0;
497   }
498   /* diagonalization of the bidiagonal form */
499   for (k = n-1; k >= 0; k--)
500   { k1 = k-1;
501     its = 0;
502     while(1)
503     /* test for splitting */
504     { for (l = k; l >= 0; l--)
505       { l1 = l-1;
506         if (fabs(rv1[l]) + anorm == anorm) break;
507         /* rv1[0] is always zero, so there is no exit
508          * through the bottom of the loop */
509         if (fabs(w[l1]) + anorm == anorm)
510         /* cancellation of rv1[l] if l greater than 0 */
511         { c = 0.0;
512           s = 1.0;
513           for (i = l; i <= k; i++)
514           { f = s * rv1[i];
515             rv1[i] *= c;
516             if (fabs(f) + anorm == anorm) break;
517             g = w[i];
518             h = sqrt(f*f+g*g);
519             w[i] = h;
520             c = g / h;
521             s = -f / h;
522             for (j = 0; j < m; j++)
523             { y = u[j][l1];
524               z = u[j][i];
525               u[j][l1] = y * c + z * s;
526               u[j][i] = -y * s + z * c;
527             }
528           }
529           break;
530         }
531       }
532       /* test for convergence */
533       z = w[k];
534       if (l==k) /* convergence */
535       { if (z < 0.0)
536         /*  w[k] is made non-negative */
537         { w[k] = -z;
538           for (j = 0; j < n; j++) v[j][k] = -v[j][k];
539         }
540         break;
541       }
542       else if (its==30)
543       { *ierr = k;
544         break;
545       }
546       else
547       /* shift from bottom 2 by 2 minor */
548       { its++;
549         x = w[l];
550         y = w[k1];
551         g = rv1[k1];
552         h = rv1[k];
553         f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
554         g = sqrt(f*f+1.0);
555         f = ((x - z) * (x + z) + h * (y / (f + (f >= 0 ? g : -g)) - h)) / x;
556         /* next qr transformation */
557         c = 1.0;
558         s = 1.0;
559         for (i1 = l; i1 <= k1; i1++)
560         { i = i1 + 1;
561           g = rv1[i];
562           y = w[i];
563           h = s * g;
564           g = c * g;
565           z = sqrt(f*f+h*h);
566           rv1[i1] = z;
567           c = f / z;
568           s = h / z;
569           f = x * c + g * s;
570           g = -x * s + g * c;
571           h = y * s;
572           y = y * c;
573           for (j = 0; j < n; j++)
574           { x = v[j][i1];
575             z = v[j][i];
576             v[j][i1] = x * c + z * s;
577             v[j][i] = -x * s + z * c;
578           }
579           z = sqrt(f*f+h*h);
580           w[i1] = z;
581           /* rotation can be arbitrary if z is zero */
582           if (z!=0.0)
583           { c = f / z;
584             s = h / z;
585           }
586           f = c * g + s * y;
587           x = -s * g + c * y;
588           for (j = 0; j < m; j++)
589           { y = u[j][i1];
590             z = u[j][i];
591             u[j][i1] = y * c + z * s;
592             u[j][i] = -y * s + z * c;
593           }
594         }
595         rv1[l] = 0.0;
596         rv1[k] = f;
597         w[k] = x;
598       }
599     }
600   }
601   free(rv1);
602   return;
603 }
604 
605 /* ********************************************************************* */
606 
607 static
euclid(int n,float ** data1,float ** data2,const float weight[],int index1,int index2,int transpose)608 float euclid (int n, float** data1, float** data2,
609   const float weight[], int index1, int index2, int transpose)
610 
611 /*
612 Purpose
613 =======
614 
615 The euclid routine calculates the weighted Euclidean distance between two
616 rows or columns in a matrix.
617 
618 Arguments
619 =========
620 
621 n      (input) int
622 The number of elements in a row or column. If transpose==0, then n is the number
623 of columns; otherwise, n is the number of rows.
624 
625 data1  (input) float array
626 The data array containing the first vector.
627 
628 data2  (input) float array
629 The data array containing the second vector.
630 
631 mask1  (input) int array
632 This array which elements in data1 are missing. If mask1[i][j]==0, then
633 data1[i][j] is missing.
634 
635 mask2  (input) int array
636 This array which elements in data2 are missing. If mask2[i][j]==0, then
637 data2[i][j] is missing.
638 
639 weight (input) float[n]
640 The weights that are used to calculate the distance.
641 
642 index1     (input) int
643 Index of the first row or column.
644 
645 index2     (input) int
646 Index of the second row or column.
647 
648 transpose (input) int
649 If transpose==0, the distance between two rows in the matrix is calculated.
650 Otherwise, the distance between two columns in the matrix is calculated.
651 
652 ============================================================================
653 */
654 { float result = 0.;
655   float tweight = 0;
656   int i;
657   if (transpose==0) /* Calculate the distance between two rows */
658   { for (i = 0; i < n; i++)
659       {  float term = data1[index1][i] - data2[index2][i];
660         result += weight[i]*term*term;
661         tweight += weight[i];
662 
663       }
664   }
665   else
666   { for (i = 0; i < n; i++)
667       {  float term = data1[i][index1] - data2[i][index2];
668         result += weight[i]*term*term;
669         tweight += weight[i];
670       }
671 
672   }
673   if (!tweight) return 0; /* usually due to empty clusters */
674   result /= tweight;
675   return result;
676 }
677 
678 /* ********************************************************************* */
679 
680 static
cityblock(int n,float ** data1,float ** data2,const float weight[],int index1,int index2,int transpose)681 float cityblock (int n, float** data1, float** data2, const float weight[], int index1, int index2, int transpose)
682 
683 /*
684 Purpose
685 =======
686 
687 The cityblock routine calculates the weighted "City Block" distance between
688 two rows or columns in a matrix. City Block distance is defined as the
689 absolute value of X1-X2 plus the absolute value of Y1-Y2 plus..., which is
690 equivalent to taking an "up and over" path.
691 
692 Arguments
693 =========
694 
695 n      (input) int
696 The number of elements in a row or column. If transpose==0, then n is the number
697 of columns; otherwise, n is the number of rows.
698 
699 data1  (input) float array
700 The data array containing the first vector.
701 
702 data2  (input) float array
703 The data array containing the second vector.
704 
705 mask1  (input) int array
706 This array which elements in data1 are missing. If mask1[i][j]==0, then
707 data1[i][j] is missing.
708 
709 mask2  (input) int array
710 This array which elements in data2 are missing. If mask2[i][j]==0, then
711 data2[i][j] is missing.
712 
713 weight (input) float[n]
714 The weights that are used to calculate the distance.
715 
716 index1     (input) int
717 Index of the first row or column.
718 
719 index2     (input) int
720 Index of the second row or column.
721 
722 transpose (input) int
723 If transpose==0, the distance between two rows in the matrix is calculated.
724 Otherwise, the distance between two columns in the matrix is calculated.
725 
726 ============================================================================ */
727 { float result = 0.;
728   float tweight = 0;
729   int i;
730   if (transpose==0) /* Calculate the distance between two rows */
731   { for (i = 0; i < n; i++)
732     {
733        float term = data1[index1][i] - data2[index2][i];
734        result = result + weight[i]*fabs(term);
735        tweight += weight[i];
736 
737     }
738   }
739   else
740   { for (i = 0; i < n; i++)
741     {
742        float term = data1[i][index1] - data2[i][index2];
743         result = result + weight[i]*fabs(term);
744         tweight += weight[i];
745 
746     }
747   }
748   if (!tweight) return 0; /* usually due to empty clusters */
749   result /= tweight;
750   return result;
751 }
752 
753 /* ********************************************************************* */
754 
755 static
correlation(int n,float ** data1,float ** data2,const float weight[],int index1,int index2,int transpose)756 float correlation (int n, float** data1, float** data2,
757   const float weight[], int index1, int index2, int transpose)
758 /*
759 Purpose
760 =======
761 
762 The correlation routine calculates the weighted Pearson distance between two
763 rows or columns in a matrix. We define the Pearson distance as one minus the
764 Pearson correlation.
765 This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
766 but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
767 (e.g., choose b = a + c).
768 
769 Arguments
770 =========
771 
772 n      (input) int
773 The number of elements in a row or column. If transpose==0, then n is the number
774 of columns; otherwise, n is the number of rows.
775 
776 data1  (input) float array
777 The data array containing the first vector.
778 
779 data2  (input) float array
780 The data array containing the second vector.
781 
782 mask1  (input) int array
783 This array which elements in data1 are missing. If mask1[i][j]==0, then
784 data1[i][j] is missing.
785 
786 mask2  (input) int array
787 This array which elements in data2 are missing. If mask2[i][j]==0, then
788 data2[i][j] is missing.
789 
790 weight (input) float[n]
791 The weights that are used to calculate the distance.
792 
793 index1     (input) int
794 Index of the first row or column.
795 
796 index2     (input) int
797 Index of the second row or column.
798 
799 transpose (input) int
800 If transpose==0, the distance between two rows in the matrix is calculated.
801 Otherwise, the distance between two columns in the matrix is calculated.
802 ============================================================================
803 */
804 { float result = 0.;
805   float sum1 = 0.;
806   float sum2 = 0.;
807   float denom1 = 0.;
808   float denom2 = 0.;
809   float tweight = 0.;
810   if (transpose==0) /* Calculate the distance between two rows */
811   { int i;
812     for (i = 0; i < n; i++)
813     {
814        float term1 = data1[index1][i];
815         float term2 = data2[index2][i];
816         float w = weight[i];
817         sum1 += w*term1;
818         sum2 += w*term2;
819         result += w*term1*term2;
820         denom1 += w*term1*term1;
821         denom2 += w*term2*term2;
822         tweight += w;
823 
824     }
825   }
826   else
827   { int i;
828     for (i = 0; i < n; i++)
829     {
830        float term1 = data1[i][index1];
831         float term2 = data2[i][index2];
832         float w = weight[i];
833         sum1 += w*term1;
834         sum2 += w*term2;
835         result += w*term1*term2;
836         denom1 += w*term1*term1;
837         denom2 += w*term2*term2;
838         tweight += w;
839 
840     }
841   }
842   if (!tweight) return 0; /* usually due to empty clusters */
843   result -= sum1 * sum2 / tweight;
844   denom1 -= sum1 * sum1 / tweight;
845   denom2 -= sum2 * sum2 / tweight;
846   if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
847   if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
848   result = result / sqrt(denom1*denom2);
849   result = 1. - result;
850   return result;
851 }
852 
853 /* ********************************************************************* */
854 
855 static
acorrelation(int n,float ** data1,float ** data2,const float weight[],int index1,int index2,int transpose)856 float acorrelation (int n, float** data1, float** data2, const float weight[], int index1, int index2, int transpose)
857 /*
858 Purpose
859 =======
860 
861 The acorrelation routine calculates the weighted Pearson distance between two
862 rows or columns, using the absolute value of the correlation.
863 This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
864 but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
865 (e.g., choose b = a + c).
866 
867 Arguments
868 =========
869 
870 n      (input) int
871 The number of elements in a row or column. If transpose==0, then n is the number
872 of columns; otherwise, n is the number of rows.
873 
874 data1  (input) float array
875 The data array containing the first vector.
876 
877 data2  (input) float array
878 The data array containing the second vector.
879 
880 mask1  (input) int array
881 This array which elements in data1 are missing. If mask1[i][j]==0, then
882 data1[i][j] is missing.
883 
884 mask2  (input) int array
885 This array which elements in data2 are missing. If mask2[i][j]==0, then
886 data2[i][j] is missing.
887 
888 weight (input) float[n]
889 The weights that are used to calculate the distance.
890 
891 index1     (input) int
892 Index of the first row or column.
893 
894 index2     (input) int
895 Index of the second row or column.
896 
897 transpose (input) int
898 If transpose==0, the distance between two rows in the matrix is calculated.
899 Otherwise, the distance between two columns in the matrix is calculated.
900 ============================================================================
901 */
902 { float result = 0.;
903   float sum1 = 0.;
904   float sum2 = 0.;
905   float denom1 = 0.;
906   float denom2 = 0.;
907   float tweight = 0.;
908   if (transpose==0) /* Calculate the distance between two rows */
909   { int i;
910     for (i = 0; i < n; i++)
911     {
912        float term1 = data1[index1][i];
913         float term2 = data2[index2][i];
914         float w = weight[i];
915         sum1 += w*term1;
916         sum2 += w*term2;
917         result += w*term1*term2;
918         denom1 += w*term1*term1;
919         denom2 += w*term2*term2;
920         tweight += w;
921 
922     }
923   }
924   else
925   { int i;
926     for (i = 0; i < n; i++)
927     {
928        float term1 = data1[i][index1];
929         float term2 = data2[i][index2];
930         float w = weight[i];
931         sum1 += w*term1;
932         sum2 += w*term2;
933         result += w*term1*term2;
934         denom1 += w*term1*term1;
935         denom2 += w*term2*term2;
936         tweight += w;
937 
938     }
939   }
940   if (!tweight) return 0; /* usually due to empty clusters */
941   result -= sum1 * sum2 / tweight;
942   denom1 -= sum1 * sum1 / tweight;
943   denom2 -= sum2 * sum2 / tweight;
944   if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
945   if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
946   result = fabs(result) / sqrt(denom1*denom2);
947   result = 1. - result;
948   return result;
949 }
950 
951 /* ********************************************************************* */
952 
953 static
ucorrelation(int n,float ** data1,float ** data2,const float weight[],int index1,int index2,int transpose)954 float ucorrelation (int n, float** data1, float** data2,
955    const float weight[], int index1, int index2, int transpose)
956 /*
957 Purpose
958 =======
959 
960 The ucorrelation routine calculates the weighted Pearson distance between two
961 rows or columns, using the uncentered version of the Pearson correlation. In the
962 uncentered Pearson correlation, a zero mean is used for both vectors even if
963 the actual mean is nonzero.
964 This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
965 but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
966 (e.g., choose b = a + c).
967 
968 Arguments
969 =========
970 
971 n      (input) int
972 The number of elements in a row or column. If transpose==0, then n is the number
973 of columns; otherwise, n is the number of rows.
974 
975 data1  (input) float array
976 The data array containing the first vector.
977 
978 data2  (input) float array
979 The data array containing the second vector.
980 
981 mask1  (input) int array
982 This array which elements in data1 are missing. If mask1[i][j]==0, then
983 data1[i][j] is missing.
984 
985 mask2  (input) int array
986 This array which elements in data2 are missing. If mask2[i][j]==0, then
987 data2[i][j] is missing.
988 
989 weight (input) float[n]
990 The weights that are used to calculate the distance.
991 
992 index1     (input) int
993 Index of the first row or column.
994 
995 index2     (input) int
996 Index of the second row or column.
997 
998 transpose (input) int
999 If transpose==0, the distance between two rows in the matrix is calculated.
1000 Otherwise, the distance between two columns in the matrix is calculated.
1001 ============================================================================
1002 */
1003 { float result = 0.;
1004   float denom1 = 0.;
1005   float denom2 = 0.;
1006   int flag = 0;
1007   /* flag will remain zero if no nonzero combinations of mask1 and mask2 are
1008    * found.
1009    */
1010   if (transpose==0) /* Calculate the distance between two rows */
1011   { int i;
1012     for (i = 0; i < n; i++)
1013     {
1014        float term1 = data1[index1][i];
1015         float term2 = data2[index2][i];
1016         float w = weight[i];
1017         result += w*term1*term2;
1018         denom1 += w*term1*term1;
1019         denom2 += w*term2*term2;
1020         flag = 1;
1021 
1022     }
1023   }
1024   else
1025   { int i;
1026     for (i = 0; i < n; i++)
1027     {
1028        float term1 = data1[i][index1];
1029         float term2 = data2[i][index2];
1030         float w = weight[i];
1031         result += w*term1*term2;
1032         denom1 += w*term1*term1;
1033         denom2 += w*term2*term2;
1034         flag = 1;
1035 
1036     }
1037   }
1038   if (!flag) return 0.;
1039   if (denom1==0.) return 1.;
1040   if (denom2==0.) return 1.;
1041   result = result / sqrt(denom1*denom2);
1042   result = 1. - result;
1043   return result;
1044 }
1045 
1046 /* ********************************************************************* */
1047 
1048 static
uacorrelation(int n,float ** data1,float ** data2,const float weight[],int index1,int index2,int transpose)1049 float uacorrelation (int n, float** data1, float** data2,
1050   const float weight[], int index1, int index2, int transpose)
1051 /*
1052 Purpose
1053 =======
1054 
1055 The uacorrelation routine calculates the weighted Pearson distance between two
1056 rows or columns, using the absolute value of the uncentered version of the
1057 Pearson correlation. In the uncentered Pearson correlation, a zero mean is used
1058 for both vectors even if the actual mean is nonzero.
1059 This definition yields a semi-metric: d(a,b) >= 0, and d(a,b) = 0 iff a = b.
1060 but the triangular inequality d(a,b) + d(b,c) >= d(a,c) does not hold
1061 (e.g., choose b = a + c).
1062 
1063 Arguments
1064 =========
1065 
1066 n      (input) int
1067 The number of elements in a row or column. If transpose==0, then n is the number
1068 of columns; otherwise, n is the number of rows.
1069 
1070 data1  (input) float array
1071 The data array containing the first vector.
1072 
1073 data2  (input) float array
1074 The data array containing the second vector.
1075 
1076 mask1  (input) int array
1077 This array which elements in data1 are missing. If mask1[i][j]==0, then
1078 data1[i][j] is missing.
1079 
1080 mask2  (input) int array
1081 This array which elements in data2 are missing. If mask2[i][j]==0, then
1082 data2[i][j] is missing.
1083 
1084 weight (input) float[n]
1085 The weights that are used to calculate the distance.
1086 
1087 index1     (input) int
1088 Index of the first row or column.
1089 
1090 index2     (input) int
1091 Index of the second row or column.
1092 
1093 transpose (input) int
1094 If transpose==0, the distance between two rows in the matrix is calculated.
1095 Otherwise, the distance between two columns in the matrix is calculated.
1096 ============================================================================
1097 */
1098 { float result = 0.;
1099   float denom1 = 0.;
1100   float denom2 = 0.;
1101   int flag = 0;
1102   /* flag will remain zero if no nonzero combinations of mask1 and mask2 are
1103    * found.
1104    */
1105   if (transpose==0) /* Calculate the distance between two rows */
1106   { int i;
1107     for (i = 0; i < n; i++)
1108     {
1109        float term1 = data1[index1][i];
1110         float term2 = data2[index2][i];
1111         float w = weight[i];
1112         result += w*term1*term2;
1113         denom1 += w*term1*term1;
1114         denom2 += w*term2*term2;
1115         flag = 1;
1116 
1117     }
1118   }
1119   else
1120   { int i;
1121     for (i = 0; i < n; i++)
1122     {
1123        float term1 = data1[i][index1];
1124         float term2 = data2[i][index2];
1125         float w = weight[i];
1126         result += w*term1*term2;
1127         denom1 += w*term1*term1;
1128         denom2 += w*term2*term2;
1129         flag = 1;
1130 
1131     }
1132   }
1133   if (!flag) return 0.;
1134   if (denom1==0.) return 1.;
1135   if (denom2==0.) return 1.;
1136   result = fabs(result) / sqrt(denom1*denom2);
1137   result = 1. - result;
1138   return result;
1139 }
1140 
1141 /* *********************************************************************  */
1142 
1143 static
spearman(int n,float ** data1,float ** data2,const float weight[],int index1,int index2,int transpose)1144 float spearman (int n, float** data1, float** data2,
1145    const float weight[], int index1, int index2, int transpose)
1146 /*
1147 Purpose
1148 =======
1149 
1150 The spearman routine calculates the Spearman distance between two rows or
1151 columns. The Spearman distance is defined as one minus the Spearman rank
1152 correlation.
1153 
1154 Arguments
1155 =========
1156 
1157 n      (input) int
1158 The number of elements in a row or column. If transpose==0, then n is the number
1159 of columns; otherwise, n is the number of rows.
1160 
1161 data1  (input) float array
1162 The data array containing the first vector.
1163 
1164 data2  (input) float array
1165 The data array containing the second vector.
1166 
1167 mask1  (input) int array
1168 This array which elements in data1 are missing. If mask1[i][j]==0, then
1169 data1[i][j] is missing.
1170 
1171 mask2  (input) int array
1172 This array which elements in data2 are missing. If mask2[i][j]==0, then
1173 data2[i][j] is missing.
1174 
1175 weight (input) float[n]
1176 These weights are ignored, but included for consistency with other distance
1177 measures.
1178 
1179 index1     (input) int
1180 Index of the first row or column.
1181 
1182 index2     (input) int
1183 Index of the second row or column.
1184 
1185 transpose (input) int
1186 If transpose==0, the distance between two rows in the matrix is calculated.
1187 Otherwise, the distance between two columns in the matrix is calculated.
1188 ============================================================================
1189 */
1190 { int i;
1191   int m = 0;
1192   float* rank1;
1193   float* rank2;
1194   float result = 0.;
1195   float denom1 = 0.;
1196   float denom2 = 0.;
1197   float avgrank;
1198   float* tdata1;
1199   float* tdata2;
1200   tdata1 = malloc(n*sizeof(float));
1201   if(!tdata1) return 0.0; /* Memory allocation error */
1202   tdata2 = malloc(n*sizeof(float));
1203   if(!tdata2) /* Memory allocation error */
1204   { free(tdata1);
1205     return 0.0;
1206   }
1207   if (transpose==0)
1208   { for (i = 0; i < n; i++)
1209       { tdata1[m] = data1[index1][i];
1210 	tdata2[m] = data2[index2][i];
1211         m++;
1212 
1213       }
1214   }
1215   else
1216   { for (i = 0; i < n; i++)
1217     {
1218        tdata1[m] = data1[i][index1];
1219         tdata2[m] = data2[i][index2];
1220         m++;
1221 
1222     }
1223   }
1224   if (m==0) return 0;
1225   rank1 = getrank(m, tdata1);
1226   free(tdata1);
1227   if(!rank1) return 0.0; /* Memory allocation error */
1228   rank2 = getrank(m, tdata2);
1229   free(tdata2);
1230   if(!rank2) /* Memory allocation error */
1231   { free(rank1);
1232     return 0.0;
1233   }
1234   avgrank = 0.5*(m-1); /* Average rank */
1235   for (i = 0; i < m; i++)
1236   { const float value1 = rank1[i];
1237     const float value2 = rank2[i];
1238     result += value1 * value2;
1239     denom1 += value1 * value1;
1240     denom2 += value2 * value2;
1241   }
1242   /* Note: denom1 and denom2 cannot be calculated directly from the number
1243    * of elements. If two elements have the same rank, the squared sum of
1244    * their ranks will change.
1245    */
1246   free(rank1);
1247   free(rank2);
1248   result /= m;
1249   denom1 /= m;
1250   denom2 /= m;
1251   result -= avgrank * avgrank;
1252   denom1 -= avgrank * avgrank;
1253   denom2 -= avgrank * avgrank;
1254   if (denom1 <= 0) return 1; /* include '<' to deal with roundoff errors */
1255   if (denom2 <= 0) return 1; /* include '<' to deal with roundoff errors */
1256   result = result / sqrt(denom1*denom2);
1257   result = 1. - result;
1258   return result;
1259 }
1260 
1261 /* *********************************************************************  */
1262 
1263 static
kendall(int n,float ** data1,float ** data2,const float weight[],int index1,int index2,int transpose)1264 float kendall (int n, float** data1, float** data2,
1265   const float weight[], int index1, int index2, int transpose)
1266 /*
1267 Purpose
1268 =======
1269 
1270 The kendall routine calculates the Kendall distance between two
1271 rows or columns. The Kendall distance is defined as one minus Kendall's tau.
1272 
1273 Arguments
1274 =========
1275 
1276 n      (input) int
1277 The number of elements in a row or column. If transpose==0, then n is the number
1278 of columns; otherwise, n is the number of rows.
1279 
1280 data1  (input) float array
1281 The data array containing the first vector.
1282 
1283 data2  (input) float array
1284 The data array containing the second vector.
1285 
1286 mask1  (input) int array
1287 This array which elements in data1 are missing. If mask1[i][j]==0, then
1288 data1[i][j] is missing.
1289 
1290 mask2  (input) int array
1291 This array which elements in data2 are missing. If mask2[i][j]==0, then
1292 data2[i][j] is missing.
1293 
1294 weight (input) float[n]
1295 These weights are ignored, but included for consistency with other distance
1296 measures.
1297 
1298 index1     (input) int
1299 Index of the first row or column.
1300 
1301 index2     (input) int
1302 Index of the second row or column.
1303 
1304 transpose (input) int
1305 If transpose==0, the distance between two rows in the matrix is calculated.
1306 Otherwise, the distance between two columns in the matrix is calculated.
1307 ============================================================================
1308 */
1309 { int con = 0;
1310   int dis = 0;
1311   int exx = 0;
1312   int exy = 0;
1313   int flag = 0;
1314   /* flag will remain zero if no nonzero combinations of mask1 and mask2 are
1315    * found.
1316    */
1317   float denomx;
1318   float denomy;
1319   float tau;
1320   int i, j;
1321   if (transpose==0)
1322   { for (i = 0; i < n; i++)
1323 
1324       { for (j = 0; j < i; j++)
1325 
1326           { float x1 = data1[index1][i];
1327             float x2 = data1[index1][j];
1328             float y1 = data2[index2][i];
1329             float y2 = data2[index2][j];
1330             if (x1 < x2 && y1 < y2) con++;
1331             if (x1 > x2 && y1 > y2) con++;
1332             if (x1 < x2 && y1 > y2) dis++;
1333             if (x1 > x2 && y1 < y2) dis++;
1334             if (x1 == x2 && y1 != y2) exx++;
1335             if (x1 != x2 && y1 == y2) exy++;
1336             flag = 1;
1337           }
1338 
1339       }
1340 
1341   }
1342   else
1343   { for (i = 0; i < n; i++)
1344 
1345       { for (j = 0; j < i; j++)
1346 
1347           { float x1 = data1[i][index1];
1348             float x2 = data1[j][index1];
1349             float y1 = data2[i][index2];
1350             float y2 = data2[j][index2];
1351             if (x1 < x2 && y1 < y2) con++;
1352             if (x1 > x2 && y1 > y2) con++;
1353             if (x1 < x2 && y1 > y2) dis++;
1354             if (x1 > x2 && y1 < y2) dis++;
1355             if (x1 == x2 && y1 != y2) exx++;
1356             if (x1 != x2 && y1 == y2) exy++;
1357             flag = 1;
1358           }
1359 
1360       }
1361 
1362   }
1363   if (!flag) return 0.;
1364   denomx = con + dis + exx;
1365   denomy = con + dis + exy;
1366   if (denomx==0) return 1;
1367   if (denomy==0) return 1;
1368   tau = (con-dis)/sqrt(denomx*denomy);
1369   return 1.-tau;
1370 }
1371 
1372 /* *********************************************************************  */
1373 
setmetric(char dist)1374 float(*setmetric(char dist))
1375   (int, float**, float**, const float[], int, int, int)
1376 { switch(dist)
1377   { case 'e': return &euclid;
1378     case 'b': return &cityblock;
1379     case 'c': return &correlation;
1380     case 'a': return &acorrelation;
1381     case 'u': return &ucorrelation;
1382     case 'x': return &uacorrelation;
1383     case 's': return &spearman;
1384     case 'k': return &kendall;
1385     default: return &euclid;
1386   }
1387   return NULL; /* Never get here */
1388 }
1389 
1390 /* *********************************************************************  */
1391 
1392 /* if seed = 0, return preset CLUST_SEED
1393            < 0, set CLUST_SEED to time (0) and return it
1394            > 0, set CLUST_SEED to seed and return it*/
clust_seed(int seed)1395 unsigned int clust_seed(int seed)
1396 {
1397    static int isset = 0;
1398    static unsigned int CLUST_SEED = 0;
1399 
1400    if (seed < 0) { /* User wants time seed */
1401       CLUST_SEED = (unsigned int)time(0);
1402       isset = 1;
1403    } else if (seed > 0) {
1404       /* user is  providing a seed, use it */
1405       CLUST_SEED = (unsigned int)seed;
1406       isset = 1;
1407    } else { /* user wants preset seed  */
1408       if (!isset) { /* not set already, do it (could recurse...) */
1409          CLUST_SEED = (unsigned int)time(0);
1410          isset = 1;
1411       } else {
1412          /* nothing to do, let it return */
1413       }
1414    }
1415    return (CLUST_SEED);
1416 }
uniform(void)1417 static float uniform(void)
1418 /*
1419 Purpose
1420 =======
1421 
1422 This routine returns a uniform random number between 0.0 and 1.0. Both 0.0
1423 and 1.0 are excluded. This random number generator is described in:
1424 
1425 Pierre l'Ecuyer
1426 Efficient and Portable Combined Random Number Generators
1427 Communications of the ACM, Volume 31, Number 6, June 1988, pages 742-749,774.
1428 
1429 The first time this routine is called, it initializes the random number
1430 generator using the current time. First, the current epoch time in seconds is
1431 used as a seed for the random number generator in the C library. The first two
1432 random numbers generated by this generator are used to initialize the random
1433 number generator implemented in this routine.
1434 
1435 
1436 Arguments
1437 =========
1438 
1439 None.
1440 
1441 
1442 Return value
1443 ============
1444 
1445 A float-precison number between 0.0 and 1.0.
1446 ============================================================================
1447 */
1448 { int z;
1449   static const int m1 = 2147483563;
1450   static const int m2 = 2147483399;
1451   const float scale = 1.0/m1;
1452 
1453   static int s1 = 0;
1454   static int s2 = 0;
1455 
1456   if (s1==0 || s2==0) /* initialize */
1457   { unsigned int initseed = clust_seed(0);
1458     srand(initseed);
1459     s1 = rand();
1460     s2 = rand();
1461   }
1462 
1463   do
1464   { int k;
1465     k = s1/53668;
1466     s1 = 40014*(s1-k*53668)-k*12211;
1467     if (s1 < 0) s1+=m1;
1468     k = s2/52774;
1469     s2 = 40692*(s2-k*52774)-k*3791;
1470     if(s2 < 0) s2+=m2;
1471     z = s1-s2;
1472     if(z < 1) z+=(m1-1);
1473   } while (z==m1); /* To avoid returning 1.0 */
1474 
1475   return z*scale;
1476 }
1477 
1478 /* ************************************************************************ */
1479 
binomial(int n,float p)1480 static int binomial(int n, float p)
1481 /*
1482 Purpose
1483 =======
1484 
1485 This routine generates a random number between 0 and n inclusive, following
1486 the binomial distribution with probability p and n trials. The routine is
1487 based on the BTPE algorithm, described in:
1488 
1489 Voratas Kachitvichyanukul and Bruce W. Schmeiser:
1490 Binomial Random Variate Generation
1491 Communications of the ACM, Volume 31, Number 2, February 1988, pages 216-222.
1492 
1493 
1494 Arguments
1495 =========
1496 
1497 p          (input) float
1498 The probability of a single event. This probability should be less than or
1499 equal to 0.5.
1500 
1501 n          (input) int
1502 The number of trials.
1503 
1504 
1505 Return value
1506 ============
1507 
1508 An integer drawn from a binomial distribution with parameters (p, n).
1509 
1510 ============================================================================
1511 */
1512 { const float q = 1 - p;
1513   if (n*p < 30.0) /* Algorithm BINV */
1514   { const float s = p/q;
1515     const float a = (n+1)*s;
1516     float r = exp(n*log(q)); /* pow() causes a crash on AIX */
1517     int x = 0;
1518     float u = uniform();
1519     while(1)
1520     { if (u < r) return x;
1521       u-=r;
1522       x++;
1523       r *= (a/x)-s;
1524     }
1525   }
1526   else /* Algorithm BTPE */
1527   { /* Step 0 */
1528     const float fm = n*p + p;
1529     const int m = (int) fm;
1530     const float p1 = floor(2.195*sqrt(n*p*q) -4.6*q) + 0.5;
1531     const float xm = m + 0.5;
1532     const float xl = xm - p1;
1533     const float xr = xm + p1;
1534     const float c = 0.134 + 20.5/(15.3+m);
1535     const float a = (fm-xl)/(fm-xl*p);
1536     const float b = (xr-fm)/(xr*q);
1537     const float lambdal = a*(1.0+0.5*a);
1538     const float lambdar = b*(1.0+0.5*b);
1539     const float p2 = p1*(1+2*c);
1540     const float p3 = p2 + c/lambdal;
1541     const float p4 = p3 + c/lambdar;
1542     while (1)
1543     { /* Step 1 */
1544       int y;
1545       int k;
1546       float u = uniform();
1547       float v = uniform();
1548       u *= p4;
1549       if (u <= p1) return (int)(xm-p1*v+u);
1550       /* Step 2 */
1551       if (u > p2)
1552       { /* Step 3 */
1553         if (u > p3)
1554         { /* Step 4 */
1555           y = (int)(xr-log(v)/lambdar);
1556           if (y > n) continue;
1557           /* Go to step 5 */
1558           v = v*(u-p3)*lambdar;
1559         }
1560         else
1561         { y = (int)(xl+log(v)/lambdal);
1562           if (y < 0) continue;
1563           /* Go to step 5 */
1564           v = v*(u-p2)*lambdal;
1565         }
1566       }
1567       else
1568       { const float x = xl + (u-p1)/c;
1569         v = v*c + 1.0 - fabs(m-x+0.5)/p1;
1570         if (v > 1) continue;
1571         /* Go to step 5 */
1572         y = (int)x;
1573       }
1574       /* Step 5 */
1575       /* Step 5.0 */
1576       k = abs(y-m);
1577       if (k > 20 && k < 0.5*n*p*q-1.0)
1578       { /* Step 5.2 */
1579         float rho = (k/(n*p*q))*((k*(k/3.0 + 0.625) + 0.1666666666666)/(n*p*q)+0.5);
1580         float t = -k*k/(2*n*p*q);
1581         float A = log(v);
1582         if (A < t-rho) return y;
1583         else if (A > t+rho) continue;
1584         else
1585         { /* Step 5.3 */
1586           float x1 = y+1;
1587           float f1 = m+1;
1588           float z = n+1-m;
1589           float w = n-y+1;
1590           float x2 = x1*x1;
1591           float f2 = f1*f1;
1592           float z2 = z*z;
1593           float w2 = w*w;
1594           if (A > xm * log(f1/x1) + (n-m+0.5)*log(z/w)
1595                 + (y-m)*log(w*p/(x1*q))
1596                 + (13860.-(462.-(132.-(99.-140./f2)/f2)/f2)/f2)/f1/166320.
1597                 + (13860.-(462.-(132.-(99.-140./z2)/z2)/z2)/z2)/z/166320.
1598                 + (13860.-(462.-(132.-(99.-140./x2)/x2)/x2)/x2)/x1/166320.
1599                 + (13860.-(462.-(132.-(99.-140./w2)/w2)/w2)/w2)/w/166320.)
1600              continue;
1601           return y;
1602         }
1603       }
1604       else
1605       { /* Step 5.1 */
1606         int i;
1607         const float s = p/q;
1608         const float aa = s*(n+1);
1609         float f = 1.0;
1610         for (i = m; i < y; f *= (aa/(++i)-s));
1611         for (i = y; i < m; f /= (aa/(++i)-s));
1612         if (v > f) continue;
1613         return y;
1614       }
1615     }
1616   }
1617   /* Never get here */
1618   return -1;
1619 }
1620 
1621 /* ************************************************************************ */
1622 
randomassign(int nclusters,int nelements,int clusterid[])1623 static void randomassign (int nclusters, int nelements, int clusterid[])
1624 /*
1625 Purpose
1626 =======
1627 
1628 The randomassign routine performs an initial random clustering, needed for
1629 k-means or k-median clustering. Elements (genes or microarrays) are randomly
1630 assigned to clusters. The number of elements in each cluster is chosen
1631 randomly, making sure that each cluster will receive at least one element.
1632 
1633 
1634 Arguments
1635 =========
1636 
1637 nclusters  (input) int
1638 The number of clusters.
1639 
1640 nelements  (input) int
1641 The number of elements to be clustered (i.e., the number of genes or microarrays
1642 to be clustered).
1643 
1644 clusterid  (output) int[nelements]
1645 The cluster number to which an element was assigned.
1646 
1647 ============================================================================
1648 */
1649 { int i, j;
1650   int k = 0;
1651   float p;
1652   int n = nelements-nclusters;
1653   /* Draw the number of elements in each cluster from a multinomial
1654    * distribution, reserving ncluster elements to set independently
1655    * in order to guarantee that none of the clusters are empty.
1656    */
1657 
1658   for (i = 0; i < nclusters-1; i++)
1659   { p = 1.0/(nclusters-i);
1660     j = binomial(n, p);
1661     n -= j;
1662     j += k+1; /* Assign at least one element to cluster i */
1663     for ( ; k < j; k++) clusterid[k] = i;
1664   }
1665   /* Assign the remaining elements to the last cluster */
1666   for ( ; k < nelements; k++) clusterid[k] = i;
1667 
1668 
1669   /* Create a random permutation of the cluster assignments */
1670   for (i = 0; i < nelements; i++)
1671   { j = (int) (i + (nelements-i)*uniform());
1672     if (j>=nelements) j=nelements-1;   /* ZSS:  This was causing memory
1673                                                 corruption, j can be equal to
1674                                                 nelements and that is illegal. */
1675     k = clusterid[j];
1676     clusterid[j] = clusterid[i];
1677     clusterid[i] = k;
1678   }
1679 
1680   return;
1681 }
1682 
1683 /* ********************************************************************* */
1684 
getclustermeans(int nclusters,int nrows,int ncolumns,float ** data,int clusterid[],float ** cdata,int transpose)1685 static void getclustermeans(int nclusters, int nrows, int ncolumns,
1686   float** data,  int clusterid[], float** cdata,
1687   int transpose)
1688 /*
1689 Purpose
1690 =======
1691 
1692 The getclustermeans routine calculates the cluster centroids, given to which
1693 cluster each element belongs. The centroid is defined as the mean over all
1694 elements for each dimension.
1695 
1696 Arguments
1697 =========
1698 
1699 nclusters  (input) int
1700 The number of clusters.
1701 
1702 nrows     (input) int
1703 The number of rows in the gene expression data matrix, equal to the number of
1704 genes.
1705 
1706 ncolumns  (input) int
1707 The number of columns in the gene expression data matrix, equal to the number of
1708 microarrays.
1709 
1710 data       (input) float[nrows][ncolumns]
1711 The array containing the gene expression data.
1712 
1713 mask       (input) int[nrows][ncolumns]
1714 This array shows which data values are missing. If mask[i][j]==0, then
1715 data[i][j] is missing.
1716 
1717 clusterid  (output) int[nrows] if transpose==0
1718                     int[ncolumns] if transpose==1
1719 The cluster number to which each element belongs. If transpose==0, then the
1720 dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
1721 is equal to ncolumns (the number of microarrays).
1722 
1723 cdata      (output) float[nclusters][ncolumns] if transpose==0
1724                     float[nrows][nclusters] if transpose==1
1725 On exit of getclustermeans, this array contains the cluster centroids.
1726 
1727 cmask      (output) int[nclusters][ncolumns] if transpose==0
1728                     int[nrows][nclusters] if transpose==1
1729 This array shows which data values of are missing for each centroid. If
1730 cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for
1731 a centroid if all corresponding data values of the cluster members are missing.
1732 
1733 transpose  (input) int
1734 If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of
1735 columns (microarrays) are specified.
1736 
1737 ========================================================================
1738 */
1739 { int i, j, k;
1740   int** voxlcounter;
1741   voxlcounter = malloc(nclusters*sizeof(int*));
1742   for (i = 0; i < nclusters; i++)
1743     { voxlcounter[i] = malloc(ncolumns*sizeof(int));
1744     }
1745 
1746   if (transpose==0)
1747   { for (i = 0; i < nclusters; i++)
1748     { for (j = 0; j < ncolumns; j++)
1749 	{ voxlcounter[i][j] = 0;
1750 	  cdata[i][j] = 0.;
1751       }
1752     }
1753 
1754     for (k = 0; k < nrows; k++)
1755     { i = clusterid[k];
1756       for (j = 0; j < ncolumns; j++)
1757       {
1758 	cdata[i][j]+=data[k][j];
1759         voxlcounter[i][j]++;
1760 
1761       }
1762     }
1763    for (i = 0; i < nclusters; i++)
1764     { for (j = 0; j < ncolumns; j++)
1765       { if (voxlcounter[i][j]>0)
1766         { cdata[i][j] /= voxlcounter[i][j];
1767           voxlcounter[i][j] = 1;
1768         }
1769       }
1770     }
1771 
1772   }
1773   else
1774   { for (i = 0; i < nrows; i++)
1775     { for (j = 0; j < nclusters; j++)
1776       { cdata[i][j] = 0.;
1777         voxlcounter[i][j] = 0;
1778       }
1779     }
1780     for (k = 0; k < ncolumns; k++)
1781     { i = clusterid[k];
1782       for (j = 0; j < nrows; j++)
1783       { cdata[j][i]+=data[j][k];
1784 	voxlcounter[j][i]++;
1785 
1786       }
1787     }
1788    for (i = 0; i < nrows; i++)
1789     { for (j = 0; j < nclusters; j++)
1790       { if (voxlcounter[i][j]>0)
1791         { cdata[i][j] /= voxlcounter[i][j];
1792           voxlcounter[i][j] = 1;
1793         }
1794       }
1795     }
1796   }
1797 
1798   for (i = 0; i < nclusters; i++)
1799     { free(voxlcounter[i]);
1800     }
1801   free(voxlcounter);
1802   /* It might be good to free it later?! but it is small anyway... */
1803 }
1804 
1805 /* ********************************************************************* */
1806 
1807 static void
getclustermedians(int nclusters,int nrows,int ncolumns,float ** data,int clusterid[],float ** cdata,int transpose,float cache[])1808 getclustermedians(int nclusters, int nrows, int ncolumns,
1809   float** data, int clusterid[], float** cdata,
1810   int transpose, float cache[])
1811 /*
1812 Purpose
1813 =======
1814 
1815 The getclustermedians routine calculates the cluster centroids, given to which
1816 cluster each element belongs. The centroid is defined as the median over all
1817 elements for each dimension.
1818 
1819 Arguments
1820 =========
1821 
1822 nclusters  (input) int
1823 The number of clusters.
1824 
1825 nrows     (input) int
1826 The number of rows in the gene expression data matrix, equal to the number of
1827 genes.
1828 
1829 ncolumns  (input) int
1830 The number of columns in the gene expression data matrix, equal to the number of
1831 microarrays.
1832 
1833 data       (input) float[nrows][ncolumns]
1834 The array containing the gene expression data.
1835 
1836 mask       (input) int[nrows][ncolumns]
1837 This array shows which data values are missing. If mask[i][j]==0, then
1838 data[i][j] is missing.
1839 
1840 clusterid  (output) int[nrows] if transpose==0
1841                     int[ncolumns] if transpose==1
1842 The cluster number to which each element belongs. If transpose==0, then the
1843 dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
1844 is equal to ncolumns (the number of microarrays).
1845 
1846 cdata      (output) float[nclusters][ncolumns] if transpose==0
1847                     float[nrows][nclusters] if transpose==1
1848 On exit of getclustermedians, this array contains the cluster centroids.
1849 
1850 cmask      (output) int[nclusters][ncolumns] if transpose==0
1851                     int[nrows][nclusters] if transpose==1
1852 This array shows which data values of are missing for each centroid. If
1853 cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for
1854 a centroid if all corresponding data values of the cluster members are missing.
1855 
1856 transpose  (input) int
1857 If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of
1858 columns (microarrays) are specified.
1859 
1860 cache      (input) float[nrows] if transpose==0
1861                    float[ncolumns] if transpose==1
1862 This array should be allocated before calling getclustermedians; its contents
1863 on input is not relevant. This array is used as a temporary storage space when
1864 calculating the medians.
1865 
1866 ========================================================================
1867 */
1868 { int i, j, k;
1869   if (transpose==0)
1870   { for (i = 0; i < nclusters; i++)
1871     { for (j = 0; j < ncolumns; j++)
1872       { int count = 0;
1873         for (k = 0; k < nrows; k++)
1874 	  { if (i==clusterid[k])
1875           { cache[count] = data[k][j];
1876             count++;
1877           }
1878         }
1879         if (count>0)
1880         { cdata[i][j] = median(count,cache);
1881 
1882         }
1883         else
1884         { cdata[i][j] = 0.;
1885 
1886         }
1887       }
1888     }
1889   }
1890   else
1891   { for (i = 0; i < nclusters; i++)
1892     { for (j = 0; j < nrows; j++)
1893       { int count = 0;
1894         for (k = 0; k < ncolumns; k++)
1895         { if (i==clusterid[k])
1896           { cache[count] = data[j][k];
1897             count++;
1898           }
1899         }
1900         if (count>0)
1901         { cdata[j][i] = median(count,cache);
1902 
1903         }
1904         else
1905         { cdata[j][i] = 0.;
1906 
1907         }
1908       }
1909     }
1910   }
1911 }
1912 
1913 /* ********************************************************************* */
1914 
getclustercentroids(int nclusters,int nrows,int ncolumns,float ** data,int clusterid[],float ** cdata,int transpose,char method)1915 int getclustercentroids(int nclusters, int nrows, int ncolumns,
1916   float** data, int clusterid[], float** cdata,
1917   int transpose, char method)
1918 /*
1919 Purpose
1920 =======
1921 
1922 The getclustercentroids routine calculates the cluster centroids, given to
1923 which cluster each element belongs. Depending on the argument method, the
1924 centroid is defined as either the mean or the median for each dimension over
1925 all elements belonging to a cluster.
1926 
1927 Arguments
1928 =========
1929 
1930 nclusters  (input) int
1931 The number of clusters.
1932 
1933 nrows     (input) int
1934 The number of rows in the gene expression data matrix, equal to the number of
1935 genes.
1936 
1937 ncolumns  (input) int
1938 The number of columns in the gene expression data matrix, equal to the number of
1939 microarrays.
1940 
1941 data       (input) float[nrows][ncolumns]
1942 The array containing the gene expression data.
1943 
1944 mask       (input) int[nrows][ncolumns]
1945 This array shows which data values are missing. If mask[i][j]==0, then
1946 data[i][j] is missing.
1947 
1948 clusterid  (output) int[nrows] if transpose==0
1949                     int[ncolumns] if transpose==1
1950 The cluster number to which each element belongs. If transpose==0, then the
1951 dimension of clusterid is equal to nrows (the number of genes). Otherwise, it
1952 is equal to ncolumns (the number of microarrays).
1953 
1954 cdata      (output) float[nclusters][ncolumns] if transpose==0
1955                     float[nrows][nclusters] if transpose==1
1956 On exit of getclustercentroids, this array contains the cluster centroids.
1957 
1958 cmask      (output) int[nclusters][ncolumns] if transpose==0
1959                     int[nrows][nclusters] if transpose==1
1960 This array shows which data values of are missing for each centroid. If
1961 cmask[i][j]==0, then cdata[i][j] is missing. A data value is missing for
1962 a centroid if all corresponding data values of the cluster members are missing.
1963 
1964 transpose  (input) int
1965 If transpose==0, clusters of rows (genes) are specified. Otherwise, clusters of
1966 columns (microarrays) are specified.
1967 
1968 method     (input) char
1969 For method=='a', the centroid is defined as the mean over all elements
1970 belonging to a cluster for each dimension.
1971 For method=='m', the centroid is defined as the median over all elements
1972 belonging to a cluster for each dimension.
1973 
1974 Return value
1975 ============
1976 
1977 The function returns an integer to indicate success or failure. If a
1978 memory error occurs, or if method is not 'm' or 'a', getclustercentroids
1979 returns 0. If successful, getclustercentroids returns 1;
1980 ========================================================================
1981 */
1982 { switch(method)
1983   { case 'm':
1984     { const int nelements = (transpose==0) ? nrows : ncolumns;
1985       float* cache = malloc(nelements*sizeof(float));
1986       if (!cache) return 0;
1987       getclustermedians(nclusters, nrows, ncolumns, data, clusterid,
1988                         cdata, transpose, cache);
1989       free(cache);
1990       return 1;
1991     }
1992     case 'a':
1993     { getclustermeans(nclusters, nrows, ncolumns, data, clusterid,
1994                       cdata, transpose);
1995       return 1;
1996     }
1997   }
1998   return 0;
1999 }
2000 
2001 /* ********************************************************************* */
2002 
getclustermedoids(int nclusters,int nelements,float ** distance,int clusterid[],int centroids[],float errors[])2003 void getclustermedoids(int nclusters, int nelements, float** distance,
2004   int clusterid[], int centroids[], float errors[])
2005 /*
2006 Purpose
2007 =======
2008 
2009 The getclustermedoids routine calculates the cluster centroids, given to which
2010 cluster each element belongs. The centroid is defined as the element with the
2011 smallest sum of distances to the other elements.
2012 
2013 Arguments
2014 =========
2015 
2016 nclusters  (input) int
2017 The number of clusters.
2018 
2019 nelements  (input) int
2020 The total number of elements.
2021 
2022 distmatrix (input) float array, ragged
2023   (number of rows is nelements, number of columns is equal to the row number)
2024 The distance matrix. To save space, the distance matrix is given in the
2025 form of a ragged array. The distance matrix is symmetric and has zeros
2026 on the diagonal. See distancematrix for a description of the content.
2027 
2028 clusterid  (output) int[nelements]
2029 The cluster number to which each element belongs.
2030 
2031 centroid   (output) int[nclusters]
2032 The index of the element that functions as the centroid for each cluster.
2033 
2034 errors     (output) float[nclusters]
2035 The within-cluster sum of distances between the items and the cluster
2036 centroid.
2037 
2038 ========================================================================
2039 */
2040 { int i, j, k;
2041   for (j = 0; j < nclusters; j++) errors[j] = FLT_MAX;
2042   for (i = 0; i < nelements; i++)
2043   { float d = 0.0;
2044     j = clusterid[i];
2045     for (k = 0; k < nelements; k++)
2046     { if (i==k || clusterid[k]!=j) continue;
2047       d += (i < k ? distance[k][i] : distance[i][k]);
2048       if (d > errors[j]) break;
2049     }
2050     if (d < errors[j])
2051     { errors[j] = d;
2052       centroids[j] = i;
2053     }
2054   }
2055 }
2056 
2057 /* ********************************************************************* */
2058 
2059 static int
kmeans(int nclusters,int nrows,int ncolumns,float ** data,float weight[],int transpose,int npass,char dist,float ** cdata,int clusterid[],float * error,int tclusterid[],int counts[],int mapping[])2060 kmeans(int nclusters, int nrows, int ncolumns, float** data,
2061   float weight[], int transpose, int npass, char dist,
2062   float** cdata, int clusterid[], float* error,
2063   int tclusterid[], int counts[], int mapping[])
2064 { int i, j, k;
2065   const int nelements = (transpose==0) ? nrows : ncolumns;
2066   const int ndata = (transpose==0) ? ncolumns : nrows;
2067   int ifound = 1;
2068   int ipass = 0;
2069   /* Set the metric function as indicated by dist */
2070   float (*metric)
2071     (int, float**, float**, const float[], int, int, int) =
2072        setmetric(dist);
2073 
2074   /* We save the clustering solution periodically and check if it reappears */
2075   int* saved = malloc(nelements*sizeof(int));
2076 
2077   if (saved==NULL) return -1;
2078 
2079   *error = FLT_MAX;
2080 
2081   do
2082   { float total = FLT_MAX;
2083     int counter = 0;
2084     int period = 10;
2085 
2086     /* Perform the EM algorithm. First, randomly assign elements to clusters. */
2087 
2088    if (npass!=0) randomassign (nclusters, nelements, tclusterid);
2089 
2090     for (i = 0; i < nclusters; i++) counts[i] = 0;
2091     for (i = 0; i < nelements; i++) counts[tclusterid[i]]++;
2092 
2093     /* Start the loop */
2094     while(1)
2095     { float previous = total;
2096       total = 0.0;
2097 
2098       if (counter % period == 0) /* Save the current cluster assignments */
2099       { for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
2100         if (period < INT_MAX / 2) period *= 2;
2101       }
2102       counter++;
2103 
2104       /* Find the center */
2105 
2106       getclustermeans(nclusters, nrows, ncolumns, data, tclusterid,
2107                       cdata, transpose);
2108 
2109       for (i = 0; i < nelements; i++)
2110       /* Calculate the distances */
2111       { float distance;
2112         k = tclusterid[i];
2113         if (counts[k]==1) continue;
2114         /* No reassignment if that would lead to an empty cluster */
2115         /* Treat the present cluster as a special case */
2116         distance = metric(ndata,data,cdata,weight,i,k,transpose);
2117         for (j = 0; j < nclusters; j++)
2118         { float tdistance;
2119           if (j==k) continue;
2120           tdistance = metric(ndata,data,cdata,weight,i,j,transpose);
2121           if (tdistance < distance)
2122           { distance = tdistance;
2123             counts[tclusterid[i]]--;
2124             tclusterid[i] = j;
2125             counts[j]++;
2126           }
2127         }
2128         total += distance;
2129       }
2130       if (total>=previous) break;
2131       /* total>=previous is FALSE on some machines even if total and previous
2132        * are bitwise identical. */
2133       for (i = 0; i < nelements; i++)
2134         if (saved[i]!=tclusterid[i]) break;
2135       if (i==nelements)
2136         break; /* Identical solution found; break out of this loop */
2137     }
2138 
2139     if (npass<=1)
2140     { *error = total;
2141       break;
2142     }
2143 
2144     for (i = 0; i < nclusters; i++) mapping[i] = -1;
2145     for (i = 0; i < nelements; i++)
2146     { j = tclusterid[i];
2147       k = clusterid[i];
2148       if (mapping[k] == -1) mapping[k] = j;
2149       else if (mapping[k] != j)
2150       { if (total < *error)
2151         { ifound = 1;
2152           *error = total;
2153           for (i = 0; i < nelements; i++) clusterid[i] = tclusterid[i];
2154         }
2155         break;
2156       }
2157     }
2158     if (i==nelements) ifound++; /* break statement not encountered */
2159 
2160   } while (++ipass < npass);
2161 
2162   free(saved);
2163   return ifound;
2164 }
2165 
2166 /* ---------------------------------------------------------------------- */
2167 
2168 static int
kmedians(int nclusters,int nrows,int ncolumns,float ** data,float weight[],int transpose,int npass,char dist,float ** cdata,int clusterid[],float * error,int tclusterid[],int counts[],int mapping[],float cache[])2169 kmedians(int nclusters, int nrows, int ncolumns, float** data,
2170   float weight[], int transpose, int npass, char dist,
2171   float** cdata, int clusterid[], float* error,
2172   int tclusterid[], int counts[], int mapping[], float cache[])
2173 { int i, j, k;
2174   const int nelements = (transpose==0) ? nrows : ncolumns;
2175   const int ndata = (transpose==0) ? ncolumns : nrows;
2176   int ifound = 1;
2177   int ipass = 0;
2178   /* Set the metric function as indicated by dist */
2179   float (*metric)
2180     (int, float**, float**, const float[], int, int, int) =
2181        setmetric(dist);
2182 
2183   /* We save the clustering solution periodically and check if it reappears */
2184   int* saved = malloc(nelements*sizeof(int));
2185   if (saved==NULL) return -1;
2186 
2187   *error = FLT_MAX;
2188 
2189   do
2190   { float total = FLT_MAX;
2191     int counter = 0;
2192     int period = 10;
2193 
2194     /* Perform the EM algorithm. First, randomly assign elements to clusters. */
2195     if (npass!=0) randomassign (nclusters, nelements, tclusterid);
2196 
2197     for (i = 0; i < nclusters; i++) counts[i]=0;
2198     for (i = 0; i < nelements; i++) counts[tclusterid[i]]++;
2199 
2200     /* Start the loop */
2201     while(1)
2202     { float previous = total;
2203       total = 0.0;
2204 
2205       if (counter % period == 0) /* Save the current cluster assignments */
2206       { for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
2207         if (period < INT_MAX / 2) period *= 2;
2208       }
2209       counter++;
2210 
2211       /* Find the center */
2212       getclustermedians(nclusters, nrows, ncolumns, data, tclusterid,
2213                         cdata, transpose, cache);
2214 
2215       for (i = 0; i < nelements; i++)
2216       /* Calculate the distances */
2217       { float distance;
2218         k = tclusterid[i];
2219         if (counts[k]==1) continue;
2220         /* No reassignment if that would lead to an empty cluster */
2221         /* Treat the present cluster as a special case */
2222         distance = metric(ndata,data,cdata,weight,i,k,transpose);
2223         for (j = 0; j < nclusters; j++)
2224         { float tdistance;
2225           if (j==k) continue;
2226           tdistance = metric(ndata,data,cdata,weight,i,j,transpose);
2227           if (tdistance < distance)
2228           { distance = tdistance;
2229             counts[tclusterid[i]]--;
2230             tclusterid[i] = j;
2231             counts[j]++;
2232           }
2233         }
2234         total += distance;
2235       }
2236       if (total>=previous) break;
2237       /* total>=previous is FALSE on some machines even if total and previous
2238        * are bitwise identical. */
2239       for (i = 0; i < nelements; i++)
2240         if (saved[i]!=tclusterid[i]) break;
2241       if (i==nelements)
2242         break; /* Identical solution found; break out of this loop */
2243     }
2244 
2245     if (npass<=1)
2246     { *error = total;
2247       break;
2248     }
2249 
2250     for (i = 0; i < nclusters; i++) mapping[i] = -1;
2251     for (i = 0; i < nelements; i++)
2252     { j = tclusterid[i];
2253       k = clusterid[i];
2254       if (mapping[k] == -1) mapping[k] = j;
2255       else if (mapping[k] != j)
2256       { if (total < *error)
2257         { ifound = 1;
2258           *error = total;
2259           for (i = 0; i < nelements; i++) clusterid[i] = tclusterid[i];
2260         }
2261         break;
2262       }
2263     }
2264     if (i==nelements) ifound++; /* break statement not encountered */
2265   } while (++ipass < npass);
2266 
2267   free(saved);
2268   return ifound;
2269 }
2270 
2271 /* ********************************************************************* */
2272 
kcluster(int nclusters,int nrows,int ncolumns,float ** data,float weight[],int transpose,int npass,char method,char dist,int clusterid[],float * error,int * ifound)2273 void CALL kcluster (int nclusters, int nrows, int ncolumns,
2274   float** data, float weight[], int transpose,
2275   int npass, char method, char dist,
2276   int clusterid[], float* error, int* ifound)
2277 /*
2278 Purpose
2279 =======
2280 
2281 The kcluster routine performs k-means or k-median clustering on a given set of
2282 elements, using the specified distance measure. The number of clusters is given
2283 by the user. Multiple passes are being made to find the optimal clustering
2284 solution, each time starting from a different initial clustering.
2285 
2286 
2287 Arguments
2288 =========
2289 
2290 nclusters  (input) int
2291 The number of clusters to be found.
2292 
2293 data       (input) float[nrows][ncolumns]
2294 The array containing the data of the elements to be clustered (i.e., the gene
2295 expression data).
2296 
2297 mask       (input) int[nrows][ncolumns]
2298 This array shows which data values are missing. If
2299 mask[i][j] == 0, then data[i][j] is missing.
2300 
2301 nrows     (input) int
2302 The number of rows in the data matrix, equal to the number of genes.
2303 
2304 ncolumns  (input) int
2305 The number of columns in the data matrix, equal to the number of microarrays.
2306 
2307 weight (input) float[n]
2308 The weights that are used to calculate the distance.
2309 
2310 transpose  (input) int
2311 If transpose==0, the rows of the matrix are clustered. Otherwise, columns
2312 of the matrix are clustered.
2313 
2314 npass      (input) int
2315 The number of times clustering is performed. Clustering is performed npass
2316 times, each time starting from a different (random) initial assignment of
2317 genes to clusters. The clustering solution with the lowest within-cluster sum
2318 of distances is chosen.
2319 If npass==0, then the clustering algorithm will be run once, where the initial
2320 assignment of elements to clusters is taken from the clusterid array.
2321 
2322 method     (input) char
2323 Defines whether the arithmetic mean (method=='a') or the median
2324 (method=='m') is used to calculate the cluster center.
2325 
2326 dist       (input) char
2327 Defines which distance measure is used, as given by the table:
2328 dist=='e': Euclidean distance
2329 dist=='b': City-block distance
2330 dist=='c': correlation
2331 dist=='a': absolute value of the correlation
2332 dist=='u': uncentered correlation
2333 dist=='x': absolute uncentered correlation
2334 dist=='s': Spearman's rank correlation
2335 dist=='k': Kendall's tau
2336 For other values of dist, the default (Euclidean distance) is used.
2337 
2338 clusterid  (output; input) int[nrows] if transpose==0
2339                            int[ncolumns] if transpose==1
2340 The cluster number to which a gene or microarray was assigned. If npass==0,
2341 then on input clusterid contains the initial clustering assignment from which
2342 the clustering algorithm starts. On output, it contains the clustering solution
2343 that was found.
2344 
2345 error      (output) float*
2346 The sum of distances to the cluster center of each item in the optimal k-means
2347 clustering solution that was found.
2348 
2349 ifound     (output) int*
2350 The number of times the optimal clustering solution was
2351 found. The value of ifound is at least 1; its maximum value is npass. If the
2352 number of clusters is larger than the number of elements being clustered,
2353 *ifound is set to 0 as an error code. If a memory allocation error occurs,
2354 *ifound is set to -1.
2355 
2356 ========================================================================
2357 */
2358 { const int nelements = (transpose==0) ? nrows : ncolumns;
2359   const int ndata = (transpose==0) ? ncolumns : nrows;
2360 
2361   int i;
2362   int ok;
2363   int* tclusterid;
2364   int* mapping = NULL;
2365   float** cdata;
2366 
2367   int* counts;
2368   int verb = 0;
2369 
2370   if (nelements < nclusters)
2371   { *ifound = 0;
2372     return;
2373   }
2374   /* More clusters asked for than elements available */
2375 
2376   *ifound = -1;
2377 
2378   /* This will contain the number of elements in each cluster, which is
2379    * needed to check for empty clusters. */
2380   counts = malloc(nclusters*sizeof(int));
2381   if(!counts) return;
2382 
2383   /* Find out if the user specified an initial clustering */
2384   if (npass<=1) {
2385    tclusterid = clusterid;
2386   } else {
2387    tclusterid = malloc(nelements*sizeof(int));
2388     if (!tclusterid)
2389     { free(counts);
2390       return;
2391     }
2392     mapping = malloc(nclusters*sizeof(int));
2393     if (!mapping)
2394     { free(counts);
2395       free(tclusterid);
2396       return;
2397     }
2398     for (i = 0; i < nelements; i++) clusterid[i] = 0;
2399   }
2400 
2401 
2402   /* Allocate space to store the centroid data */
2403   if (transpose==0) ok = makedatamask(nclusters, ndata, &cdata);
2404   else ok = makedatamask(ndata, nclusters, &cdata);
2405   if(!ok)
2406   { free(counts);
2407     if(npass>1)
2408     { free(tclusterid);
2409       free(mapping);
2410       return;
2411     }
2412   }
2413 
2414   if (method=='m')
2415   { float* cache = malloc(nelements*sizeof(float));
2416     if (verb) fprintf(stderr,"doing kmedians\n");
2417     if(cache)
2418     { *ifound = kmedians(nclusters, nrows, ncolumns, data, weight,
2419                          transpose, npass, dist, cdata, clusterid, error,
2420                          tclusterid, counts, mapping, cache);
2421       free(cache);
2422     }
2423   }
2424   else {
2425     if (verb) fprintf(stderr,"doing kmeans\n");
2426     *ifound = kmeans(nclusters, nrows, ncolumns, data, weight,
2427                      transpose, npass, dist, cdata, clusterid, error,
2428                      tclusterid, counts, mapping);
2429   }
2430 
2431   /* remapping is done in calling function now (see thd_segtools_fNM.c)
2432             ZSS Jan 2011*/
2433 
2434   /* Deallocate temporarily used space */
2435   if (npass > 1)
2436   { free(mapping);
2437     free(tclusterid);
2438   }
2439 
2440   if (transpose==0) freedatamask(nclusters, cdata);
2441   else freedatamask(ndata, cdata);
2442 
2443   free(counts);
2444 }
2445 
2446 /* *********************************************************************** */
2447 
comp(const void * p,const void * q)2448 int  comp(const void *p, const void *q)
2449 {
2450   int *ptr1 = (int *)(p);
2451   int *ptr2 = (int *)(q);
2452 
2453   if (*ptr1 > *ptr2)
2454     return -1;
2455   else if (*ptr1 == *ptr2)
2456     return 0;
2457   else
2458     return 1;
2459 }
2460 /* *********************************************************************** */
2461 
kmedoids(int nclusters,int nelements,float ** distmatrix,int npass,int clusterid[],float * error,int * ifound)2462 void CALL kmedoids (int nclusters, int nelements, float** distmatrix,
2463   int npass, int clusterid[], float* error, int* ifound)
2464 /*
2465 Purpose
2466 =======
2467 
2468 The kmedoids routine performs k-medoids clustering on a given set of elements,
2469 using the distance matrix and the number of clusters passed by the user.
2470 Multiple passes are being made to find the optimal clustering solution, each
2471 time starting from a different initial clustering.
2472 
2473 
2474 Arguments
2475 =========
2476 
2477 nclusters  (input) int
2478 The number of clusters to be found.
2479 
2480 nelements  (input) int
2481 The number of elements to be clustered.
2482 
2483 distmatrix (input) float array, ragged
2484   (number of rows is nelements, number of columns is equal to the row number)
2485 The distance matrix. To save space, the distance matrix is given in the
2486 form of a ragged array. The distance matrix is symmetric and has zeros
2487 on the diagonal. See distancematrix for a description of the content.
2488 
2489 npass      (input) int
2490 The number of times clustering is performed. Clustering is performed npass
2491 times, each time starting from a different (random) initial assignment of genes
2492 to clusters. The clustering solution with the lowest within-cluster sum of
2493 distances is chosen.
2494 If npass==0, then the clustering algorithm will be run once, where the initial
2495 assignment of elements to clusters is taken from the clusterid array.
2496 
2497 clusterid  (output; input) int[nelements]
2498 On input, if npass==0, then clusterid contains the initial clustering assignment
2499 from which the clustering algorithm starts; all numbers in clusterid should be
2500 between zero and nelements-1 inclusive. If npass!=0, clusterid is ignored on
2501 input.
2502 On output, clusterid contains the clustering solution that was found: clusterid
2503 contains the number of the cluster to which each item was assigned. On output,
2504 the number of a cluster is defined as the item number of the centroid of the
2505 cluster.
2506 
2507 error      (output) float
2508 The sum of distances to the cluster center of each item in the optimal k-medoids
2509 clustering solution that was found.
2510 
2511 ifound     (output) int
2512 If kmedoids is successful: the number of times the optimal clustering solution
2513 was found. The value of ifound is at least 1; its maximum value is npass.
2514 If the user requested more clusters than elements available, ifound is set
2515 to 0. If kmedoids fails due to a memory allocation error, ifound is set to -1.
2516 
2517 ========================================================================
2518 */
2519 { int i, j, icluster;
2520   int* tclusterid;
2521   int* saved;
2522   int* centroids;
2523   float* errors;
2524   int ipass = 0;
2525 
2526   if (nelements < nclusters)
2527   { *ifound = 0;
2528     return;
2529   } /* More clusters asked for than elements available */
2530 
2531   *ifound = -1;
2532 
2533   /* We save the clustering solution periodically and check if it reappears */
2534   saved = malloc(nelements*sizeof(int));
2535   if (saved==NULL) return;
2536 
2537   centroids = malloc(nclusters*sizeof(int));
2538   if(!centroids)
2539   { free(saved);
2540     return;
2541   }
2542 
2543   errors = malloc(nclusters*sizeof(float));
2544   if(!errors)
2545   { free(saved);
2546     free(centroids);
2547     return;
2548   }
2549 
2550   /* Find out if the user specified an initial clustering */
2551   if (npass<=1) tclusterid = clusterid;
2552   else
2553   { tclusterid = malloc(nelements*sizeof(int));
2554     if(!tclusterid)
2555     { free(saved);
2556       free(centroids);
2557       free(errors);
2558       return;
2559     }
2560   }
2561 
2562   *error = FLT_MAX;
2563   do /* Start the loop */
2564   { float total = FLT_MAX;
2565     int counter = 0;
2566     int period = 10;
2567 
2568     if (npass!=0) randomassign (nclusters, nelements, tclusterid);
2569     while(1)
2570     { float previous = total;
2571       total = 0.0;
2572 
2573       if (counter % period == 0) /* Save the current cluster assignments */
2574       { for (i = 0; i < nelements; i++) saved[i] = tclusterid[i];
2575         if (period < INT_MAX / 2) period *= 2;
2576       }
2577       counter++;
2578 
2579       /* Find the center */
2580       getclustermedoids(nclusters, nelements, distmatrix, tclusterid,
2581                         centroids, errors);
2582 
2583       for (i = 0; i < nelements; i++)
2584       /* Find the closest cluster */
2585       { float distance = FLT_MAX;
2586         for (icluster = 0; icluster < nclusters; icluster++)
2587         { float tdistance;
2588           j = centroids[icluster];
2589           if (i==j)
2590           { distance = 0.0;
2591             tclusterid[i] = icluster;
2592             break;
2593           }
2594           tdistance = (i > j) ? distmatrix[i][j] : distmatrix[j][i];
2595           if (tdistance < distance)
2596           { distance = tdistance;
2597             tclusterid[i] = icluster;
2598           }
2599         }
2600         total += distance;
2601       }
2602       if (total>=previous) break;
2603       /* total>=previous is FALSE on some machines even if total and previous
2604        * are bitwise identical. */
2605       for (i = 0; i < nelements; i++)
2606         if (saved[i]!=tclusterid[i]) break;
2607       if (i==nelements)
2608         break; /* Identical solution found; break out of this loop */
2609     }
2610 
2611     for (i = 0; i < nelements; i++)
2612     { if (total < *error)
2613       { *ifound = 1;
2614         *error = total;
2615         /* Replace by the centroid in each cluster. */
2616         for (j = 0; j < nelements; j++) clusterid[j] = centroids[tclusterid[j]];
2617         break;
2618       }
2619       else if (clusterid[i]!=tclusterid[i]) break;
2620     }
2621     if (i==nelements) (*ifound)++; /* break statement not encountered */
2622   } while (++ipass < npass);
2623 
2624   /* Deallocate temporarily used space */
2625   if (npass > 1) free(tclusterid);
2626 
2627   free(saved);
2628   free(centroids);
2629   free(errors);
2630 
2631   return;
2632 }
2633 
2634 /* ******************************************************************** */
2635 
distancematrix(int nrows,int ncolumns,float ** data,float weights[],char dist,int transpose)2636 float** CALL distancematrix (int nrows, int ncolumns, float** data,
2637    float weights[], char dist, int transpose)
2638 /*
2639 Purpose
2640 =======
2641 
2642 The distancematrix routine calculates the distance matrix between genes or
2643 microarrays using their measured gene expression data. Several distance measures
2644 can be used. The routine returns a pointer to a ragged array containing the
2645 distances between the genes. As the distance matrix is symmetric, with zeros on
2646 the diagonal, only the lower triangular half of the distance matrix is saved.
2647 The distancematrix routine allocates space for the distance matrix. If the
2648 parameter transpose is set to a nonzero value, the distances between the columns
2649 (microarrays) are calculated, otherwise distances between the rows (genes) are
2650 calculated.
2651 If sufficient space in memory cannot be allocated to store the distance matrix,
2652 the routine returns a NULL pointer, and all memory allocated so far for the
2653 distance matrix is freed.
2654 
2655 
2656 Arguments
2657 =========
2658 
2659 nrows      (input) int
2660 The number of rows in the gene expression data matrix (i.e., the number of
2661 genes)
2662 
2663 ncolumns   (input) int
2664 The number of columns in the gene expression data matrix (i.e., the number of
2665 microarrays)
2666 
2667 data       (input) float[nrows][ncolumns]
2668 The array containing the gene expression data.
2669 
2670 mask       (input) int[nrows][ncolumns]
2671 This array shows which data values are missing. If mask[i][j]==0, then
2672 data[i][j] is missing.
2673 
2674 weight (input) float[n]
2675 The weights that are used to calculate the distance. The length of this vector
2676 is equal to the number of columns if the distances between genes are calculated,
2677 or the number of rows if the distances between microarrays are calculated.
2678 
2679 dist       (input) char
2680 Defines which distance measure is used, as given by the table:
2681 dist=='e': Euclidean distance
2682 dist=='b': City-block distance
2683 dist=='c': correlation
2684 dist=='a': absolute value of the correlation
2685 dist=='u': uncentered correlation
2686 dist=='x': absolute uncentered correlation
2687 dist=='s': Spearman's rank correlation
2688 dist=='k': Kendall's tau
2689 For other values of dist, the default (Euclidean distance) is used.
2690 
2691 transpose  (input) int
2692 If transpose is equal to zero, the distances between the rows is
2693 calculated. Otherwise, the distances between the columns is calculated.
2694 The former is needed when genes are being clustered; the latter is used
2695 when microarrays are being clustered.
2696 
2697 ========================================================================
2698 */
2699 { /* First determine the size of the distance matrix */
2700   const int n = (transpose==0) ? nrows : ncolumns;
2701   const int ndata = (transpose==0) ? ncolumns : nrows;
2702   int i,j;
2703   float** matrix;
2704 
2705   /* Set the metric function as indicated by dist */
2706   float (*metric)
2707     (int, float**, float**, const float[], int, int, int) =
2708        setmetric(dist);
2709 
2710   if (n < 2) return NULL;
2711 
2712   /* Set up the ragged array */
2713   matrix = malloc(n*sizeof(float*));
2714   if(matrix==NULL) return NULL; /* Not enough memory available */
2715   matrix[0] = NULL;
2716   /* The zeroth row has zero columns. We allocate it anyway for convenience.*/
2717   for (i = 1; i < n; i++)
2718   { matrix[i] = malloc(i*sizeof(float));
2719     if (matrix[i]==NULL) break; /* Not enough memory available */
2720   }
2721   if (i < n) /* break condition encountered */
2722   { j = i;
2723     for (i = 1; i < j; i++) free(matrix[i]);
2724     return NULL;
2725   }
2726 
2727   /* Calculate the distances and save them in the ragged array */
2728   for (i = 1; i < n; i++)
2729     for (j = 0; j < i; j++)
2730       matrix[i][j]=metric(ndata,data,data,weights,i,j,transpose);
2731 
2732   return matrix;
2733 }
2734 
2735 /* ******************************************************************** */
2736 
calculate_weights(int nrows,int ncolumns,float ** data,float weights[],int transpose,char dist,float cutoff,float exponent)2737 float* calculate_weights(int nrows, int ncolumns, float** data,
2738   float weights[], int transpose, char dist, float cutoff, float exponent)
2739 
2740 /*
2741 Purpose
2742 =======
2743 
2744 This function calculates the weights using the weighting scheme proposed by
2745 Michael Eisen:
2746 w[i] = 1.0 / sum_{j where d[i][j]<cutoff} (1 - d[i][j]/cutoff)^exponent
2747 where the cutoff and the exponent are specified by the user.
2748 
2749 
2750 Arguments
2751 =========
2752 
2753 nrows      (input) int
2754 The number of rows in the gene expression data matrix, equal to the number of
2755 genes.
2756 
2757 ncolumns   (input) int
2758 The number of columns in the gene expression data matrix, equal to the number of
2759 microarrays.
2760 
2761 data       (input) float[nrows][ncolumns]
2762 The array containing the gene expression data.
2763 
2764 mask       (input) int[nrows][ncolumns]
2765 This array shows which data values are missing. If mask[i][j]==0, then
2766 data[i][j] is missing.
2767 
2768 weight     (input) int[ncolumns] if transpose==0,
2769                    int[nrows]    if transpose==1
2770 The weights that are used to calculate the distance. The length of this vector
2771 is ncolumns if gene weights are being clustered, and nrows if microarrays
2772 weights are being clustered.
2773 
2774 transpose (input) int
2775 If transpose==0, the weights of the rows of the data matrix are calculated.
2776 Otherwise, the weights of the columns of the data matrix are calculated.
2777 
2778 dist      (input) char
2779 Defines which distance measure is used, as given by the table:
2780 dist=='e': Euclidean distance
2781 dist=='b': City-block distance
2782 dist=='c': correlation
2783 dist=='a': absolute value of the correlation
2784 dist=='u': uncentered correlation
2785 dist=='x': absolute uncentered correlation
2786 dist=='s': Spearman's rank correlation
2787 dist=='k': Kendall's tau
2788 For other values of dist, the default (Euclidean distance) is used.
2789 
2790 cutoff    (input) float
2791 The cutoff to be used to calculate the weights.
2792 
2793 exponent  (input) float
2794 The exponent to be used to calculate the weights.
2795 
2796 
2797 Return value
2798 ============
2799 
2800 The function returns a pointer to a newly allocated array containing the
2801 calculated weights for the rows (if transpose==0) or columns (if
2802 transpose==1). If not enough memory could be allocated to store the
2803 weights array, the function returns NULL.
2804 
2805 ========================================================================
2806 */
2807 { int i,j;
2808   const int ndata = (transpose==0) ? ncolumns : nrows;
2809   const int nelements = (transpose==0) ? nrows : ncolumns;
2810 
2811   /* Set the metric function as indicated by dist */
2812   float (*metric)
2813     (int, float**, float**, const float[], int, int, int) =
2814        setmetric(dist);
2815 
2816   float* result = malloc(nelements*sizeof(float));
2817   if (!result) return NULL;
2818   memset(result, 0, nelements*sizeof(float));
2819 
2820   for (i = 0; i < nelements; i++)
2821   { result[i] += 1.0;
2822     for (j = 0; j < i; j++)
2823     { const float distance = metric(ndata, data, data, weights,
2824                                      i, j, transpose);
2825       if (distance < cutoff)
2826       { const float dweight = exp(exponent*log(1-distance/cutoff));
2827         /* pow() causes a crash on AIX */
2828         result[i] += dweight;
2829         result[j] += dweight;
2830       }
2831     }
2832   }
2833   for (i = 0; i < nelements; i++) result[i] = 1.0/result[i];
2834   return result;
2835 }
2836 
2837 /* ******************************************************************** */
2838 
cuttree(int nelements,Node * tree,int nclusters,int clusterid[])2839 void cuttree (int nelements, Node* tree, int nclusters, int clusterid[])
2840 
2841 /*
2842 Purpose
2843 =======
2844 
2845 The cuttree routine takes the output of a hierarchical clustering routine, and
2846 divides the elements in the tree structure into clusters based on the
2847 hierarchical clustering result. The number of clusters is specified by the user.
2848 
2849 Arguments
2850 =========
2851 
2852 nelements      (input) int
2853 The number of elements that were clustered.
2854 
2855 tree           (input) Node[nelements-1]
2856 The clustering solution. Each node in the array describes one linking event,
2857 with tree[i].left and tree[i].right representig the elements that were joined.
2858 The original elements are numbered 0..nelements-1, nodes are numbered
2859 -1..-(nelements-1).
2860 
2861 nclusters      (input) int
2862 The number of clusters to be formed.
2863 
2864 clusterid      (output) int[nelements]
2865 The number of the cluster to which each element was assigned. Space for this
2866 array should be allocated before calling the cuttree routine. If a memory
2867 error occured, all elements in clusterid are set to -1.
2868 
2869 ========================================================================
2870 */
2871 { int i, j, k;
2872   int icluster = 0;
2873   const int n = nelements-nclusters; /* number of nodes to join */
2874   int* nodeid;
2875   for (i = nelements-2; i >= n; i--)
2876   { k = tree[i].left;
2877     if (k>=0)
2878     { clusterid[k] = icluster;
2879       icluster++;
2880     }
2881     k = tree[i].right;
2882     if (k>=0)
2883     { clusterid[k] = icluster;
2884       icluster++;
2885     }
2886   }
2887   nodeid = malloc(n*sizeof(int));
2888   if(!nodeid)
2889   { for (i = 0; i < nelements; i++) clusterid[i] = -1;
2890     return;
2891   }
2892   for (i = 0; i < n; i++) nodeid[i] = -1;
2893   for (i = n-1; i >= 0; i--)
2894   { if(nodeid[i]<0)
2895     { j = icluster;
2896       nodeid[i] = j;
2897       icluster++;
2898     }
2899     else j = nodeid[i];
2900     k = tree[i].left;
2901     if (k<0) nodeid[-k-1] = j; else clusterid[k] = j;
2902     k = tree[i].right;
2903     if (k<0) nodeid[-k-1] = j; else clusterid[k] = j;
2904   }
2905   free(nodeid);
2906   return;
2907 }
2908 
2909 /* ******************************************************************** */
2910 
2911 static
pclcluster(int nrows,int ncolumns,float ** data,float weight[],float ** distmatrix,char dist,int transpose)2912 Node* pclcluster (int nrows, int ncolumns, float** data,
2913   float weight[], float** distmatrix, char dist, int transpose)
2914 
2915 /*
2916 
2917 Purpose
2918 =======
2919 
2920 The pclcluster routine performs clustering using pairwise centroid-linking
2921 on a given set of gene expression data, using the distance metric given by dist.
2922 
2923 Arguments
2924 =========
2925 
2926 nrows     (input) int
2927 The number of rows in the gene expression data matrix, equal to the number of
2928 genes.
2929 
2930 ncolumns  (input) int
2931 The number of columns in the gene expression data matrix, equal to the number of
2932 microarrays.
2933 
2934 data       (input) float[nrows][ncolumns]
2935 The array containing the gene expression data.
2936 
2937 mask       (input) int[nrows][ncolumns]
2938 This array shows which data values are missing. If
2939 mask[i][j] == 0, then data[i][j] is missing.
2940 
2941 weight     (input) float[ncolumns] if transpose==0;
2942                    float[nrows]    if transpose==1
2943 The weights that are used to calculate the distance. The length of this vector
2944 is ncolumns if genes are being clustered, and nrows if microarrays are being
2945 clustered.
2946 
2947 transpose  (input) int
2948 If transpose==0, the rows of the matrix are clustered. Otherwise, columns
2949 of the matrix are clustered.
2950 
2951 dist       (input) char
2952 Defines which distance measure is used, as given by the table:
2953 dist=='e': Euclidean distance
2954 dist=='b': City-block distance
2955 dist=='c': correlation
2956 dist=='a': absolute value of the correlation
2957 dist=='u': uncentered correlation
2958 dist=='x': absolute uncentered correlation
2959 dist=='s': Spearman's rank correlation
2960 dist=='k': Kendall's tau
2961 For other values of dist, the default (Euclidean distance) is used.
2962 
2963 distmatrix (input) float**
2964 The distance matrix. This matrix is precalculated by the calling routine
2965 treecluster. The pclcluster routine modifies the contents of distmatrix, but
2966 does not deallocate it.
2967 
2968 Return value
2969 ============
2970 
2971 A pointer to a newly allocated array of Node structs, describing the
2972 hierarchical clustering solution consisting of nelements-1 nodes. Depending on
2973 whether genes (rows) or microarrays (columns) were clustered, nelements is
2974 equal to nrows or ncolumns. See src/cluster.h for a description of the Node
2975 structure.
2976 If a memory error occurs, pclcluster returns NULL.
2977 ========================================================================
2978 */
2979 { int i, j;
2980   const int nelements = (transpose==0) ? nrows : ncolumns;
2981   int inode;
2982   const int ndata = transpose ? nrows : ncolumns;
2983   const int nnodes = nelements - 1;
2984 
2985   /* Set the metric function as indicated by dist */
2986   float (*metric)
2987     (int, float**, float**, const float[], int, int, int) =
2988        setmetric(dist);
2989 
2990   Node* result;
2991   float** newdata;
2992 
2993   int* distid = malloc(nelements*sizeof(int));
2994   if(!distid) return NULL;
2995   result = malloc(nnodes*sizeof(Node));
2996   if(!result)
2997   { free(distid);
2998     return NULL;
2999   }
3000   if(!makedatamask(nelements, ndata, &newdata))
3001   { free(result);
3002     free(distid);
3003     return NULL;
3004   }
3005 
3006   for (i = 0; i < nelements; i++) distid[i] = i;
3007   /* To remember which row/column in the distance matrix contains what */
3008 
3009   /* Storage for node data */
3010   if (transpose)
3011   { for (i = 0; i < nelements; i++)
3012     { for (j = 0; j < ndata; j++)
3013       { newdata[i][j] = data[j][i];
3014 
3015       }
3016     }
3017     data = newdata;
3018 
3019   }
3020   else
3021   { for (i = 0; i < nelements; i++)
3022     { memcpy(newdata[i], data[i], ndata*sizeof(float));
3023 
3024     }
3025     data = newdata;
3026 
3027   }
3028 
3029   for (inode = 0; inode < nnodes; inode++)
3030   { /* Find the pair with the shortest distance */
3031     int is = 1;
3032     int js = 0;
3033     result[inode].distance = find_closest_pair(nelements-inode, distmatrix, &is, &js);
3034     result[inode].left = distid[js];
3035     result[inode].right = distid[is];
3036 
3037     /* Make node js the new node */
3038     for (i = 0; i < ndata; i++)
3039     { data[js][i] = data[js][i] + data[is][i];
3040 
3041 
3042     }
3043     free(data[is]);
3044 
3045     data[is] = data[nnodes-inode];
3046 
3047 
3048     /* Fix the distances */
3049     distid[is] = distid[nnodes-inode];
3050     for (i = 0; i < is; i++)
3051       distmatrix[is][i] = distmatrix[nnodes-inode][i];
3052     for (i = is + 1; i < nnodes-inode; i++)
3053       distmatrix[i][is] = distmatrix[nnodes-inode][i];
3054 
3055     distid[js] = -inode-1;
3056     for (i = 0; i < js; i++)
3057       distmatrix[js][i] = metric(ndata,data,data,weight,js,i,0);
3058     for (i = js + 1; i < nnodes-inode; i++)
3059       distmatrix[i][js] = metric(ndata,data,data,weight,js,i,0);
3060   }
3061 
3062   /* Free temporarily allocated space */
3063   free(data[0]);
3064 
3065   free(data);
3066 
3067   free(distid);
3068 
3069   return result;
3070 }
3071 
3072 /* ******************************************************************** */
3073 
3074 static
nodecompare(const void * a,const void * b)3075 int nodecompare(const void* a, const void* b)
3076 /* Helper function for qsort. */
3077 { const Node* node1 = (const Node*)a;
3078   const Node* node2 = (const Node*)b;
3079   const float term1 = node1->distance;
3080   const float term2 = node2->distance;
3081   if (term1 < term2) return -1;
3082   if (term1 > term2) return +1;
3083   return 0;
3084 }
3085 
3086 /* ---------------------------------------------------------------------- */
3087 
3088 static
pslcluster(int nrows,int ncolumns,float ** data,float weight[],float ** distmatrix,char dist,int transpose)3089 Node* pslcluster (int nrows, int ncolumns, float** data,
3090   float weight[], float** distmatrix, char dist, int transpose)
3091 
3092 /*
3093 
3094 Purpose
3095 =======
3096 
3097 The pslcluster routine performs single-linkage hierarchical clustering, using
3098 either the distance matrix directly, if available, or by calculating the
3099 distances from the data array. This implementation is based on the SLINK
3100 algorithm, described in:
3101 Sibson, R. (1973). SLINK: An optimally efficient algorithm for the single-link
3102 cluster method. The Computer Journal, 16(1): 30-34.
3103 The output of this algorithm is identical to conventional single-linkage
3104 hierarchical clustering, but is much more memory-efficient and faster. Hence,
3105 it can be applied to large data sets, for which the conventional single-
3106 linkage algorithm fails due to lack of memory.
3107 
3108 
3109 Arguments
3110 =========
3111 
3112 nrows     (input) int
3113 The number of rows in the gene expression data matrix, equal to the number of
3114 genes.
3115 
3116 ncolumns  (input) int
3117 The number of columns in the gene expression data matrix, equal to the number of
3118 microarrays.
3119 
3120 data       (input) float[nrows][ncolumns]
3121 The array containing the gene expression data.
3122 
3123 mask       (input) int[nrows][ncolumns]
3124 This array shows which data values are missing. If
3125 mask[i][j] == 0, then data[i][j] is missing.
3126 
3127 weight (input) float[n]
3128 The weights that are used to calculate the distance. The length of this vector
3129 is ncolumns if genes are being clustered, and nrows if microarrays are being
3130 clustered.
3131 
3132 transpose  (input) int
3133 If transpose==0, the rows of the matrix are clustered. Otherwise, columns
3134 of the matrix are clustered.
3135 
3136 dist       (input) char
3137 Defines which distance measure is used, as given by the table:
3138 dist=='e': Euclidean distance
3139 dist=='b': City-block distance
3140 dist=='c': correlation
3141 dist=='a': absolute value of the correlation
3142 dist=='u': uncentered correlation
3143 dist=='x': absolute uncentered correlation
3144 dist=='s': Spearman's rank correlation
3145 dist=='k': Kendall's tau
3146 For other values of dist, the default (Euclidean distance) is used.
3147 
3148 distmatrix (input) float**
3149 The distance matrix. If the distance matrix is passed by the calling routine
3150 treecluster, it is used by pslcluster to speed up the clustering calculation.
3151 The pslcluster routine does not modify the contents of distmatrix, and does
3152 not deallocate it. If distmatrix is NULL, the pairwise distances are calculated
3153 by the pslcluster routine from the gene expression data (the data and mask
3154 arrays) and stored in temporary arrays. If distmatrix is passed, the original
3155 gene expression data (specified by the data and mask arguments) are not needed
3156 and are therefore ignored.
3157 
3158 
3159 Return value
3160 ============
3161 
3162 A pointer to a newly allocated array of Node structs, describing the
3163 hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3164 whether genes (rows) or microarrays (columns) were clustered, nelements is
3165 equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3166 structure.
3167 If a memory error occurs, pslcluster returns NULL.
3168 
3169 ========================================================================
3170 */
3171 { int i, j, k;
3172   const int nelements = transpose ? ncolumns : nrows;
3173   const int nnodes = nelements - 1;
3174   int* vector;
3175   float* temp;
3176   int* index;
3177   Node* result;
3178   temp = malloc(nnodes*sizeof(float));
3179   if(!temp) return NULL;
3180   index = malloc(nelements*sizeof(int));
3181   if(!index)
3182   { free(temp);
3183     return NULL;
3184   }
3185   vector = malloc(nnodes*sizeof(int));
3186   if(!vector)
3187   { free(index);
3188     free(temp);
3189     return NULL;
3190   }
3191   result = malloc(nnodes*sizeof(Node));
3192   if(!result)
3193   { free(vector);
3194     free(index);
3195     free(temp);
3196     return NULL;
3197   }
3198 
3199   for (i = 0; i < nnodes; i++)
3200   { vector[i] = i;
3201     result[i].distance = FLT_MAX;
3202   }
3203 
3204   if(distmatrix)
3205   { for (i = 0; i < nrows; i++)
3206     { for (j = 0; j < i; j++) temp[j] = distmatrix[i][j];
3207       for (j = 0; j < i; j++)
3208       { k = vector[j];
3209         if (result[j].distance >= temp[j])
3210         { if (result[j].distance < temp[k]) temp[k] = result[j].distance;
3211           result[j].distance = temp[j];
3212           vector[j] = i;
3213         }
3214         else if (temp[j] < temp[k]) temp[k] = temp[j];
3215       }
3216       for (j = 0; j < i; j++)
3217         if (result[j].distance >= result[vector[j]].distance) vector[j] = i;
3218     }
3219   }
3220   else
3221   { const int ndata = transpose ? nrows : ncolumns;
3222     /* Set the metric function as indicated by dist */
3223     float (*metric)
3224       (int, float**, float**, const float[], int, int, int) =
3225          setmetric(dist);
3226 
3227     for (i = 0; i < nelements; i++)
3228     { for (j = 0; j < i; j++) temp[j] =
3229         metric(ndata, data, data, weight, i, j, transpose);
3230       for (j = 0; j < i; j++)
3231       { k = vector[j];
3232         if (result[j].distance >= temp[j])
3233         { if (result[j].distance < temp[k]) temp[k] = result[j].distance;
3234           result[j].distance = temp[j];
3235           vector[j] = i;
3236         }
3237         else if (temp[j] < temp[k]) temp[k] = temp[j];
3238       }
3239       for (j = 0; j < i; j++)
3240         if (result[j].distance >= result[vector[j]].distance) vector[j] = i;
3241     }
3242   }
3243   free(temp);
3244 
3245   for (i = 0; i < nnodes; i++) result[i].left = i;
3246   qsort(result, nnodes, sizeof(Node), nodecompare);
3247 
3248   for (i = 0; i < nelements; i++) index[i] = i;
3249   for (i = 0; i < nnodes; i++)
3250   { j = result[i].left;
3251     k = vector[j];
3252     result[i].left = index[j];
3253     result[i].right = index[k];
3254     index[k] = -i-1;
3255   }
3256   free(vector);
3257   free(index);
3258 
3259   return result;
3260 }
3261 /* ******************************************************************** */
3262 
pmlcluster(int nelements,float ** distmatrix)3263 static Node* pmlcluster (int nelements, float** distmatrix)
3264 /*
3265 
3266 Purpose
3267 =======
3268 
3269 The pmlcluster routine performs clustering using pairwise maximum- (complete-)
3270 linking on the given distance matrix.
3271 
3272 Arguments
3273 =========
3274 
3275 nelements     (input) int
3276 The number of elements to be clustered.
3277 
3278 distmatrix (input) float**
3279 The distance matrix, with nelements rows, each row being filled up to the
3280 diagonal. The elements on the diagonal are not used, as they are assumed to be
3281 zero. The distance matrix will be modified by this routine.
3282 
3283 Return value
3284 ============
3285 
3286 A pointer to a newly allocated array of Node structs, describing the
3287 hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3288 whether genes (rows) or microarrays (columns) were clustered, nelements is
3289 equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3290 structure.
3291 If a memory error occurs, pmlcluster returns NULL.
3292 ========================================================================
3293 */
3294 { int j;
3295   int n;
3296   int* clusterid;
3297   Node* result;
3298 
3299   clusterid = malloc(nelements*sizeof(int));
3300   if(!clusterid) return NULL;
3301   result = malloc((nelements-1)*sizeof(Node));
3302   if (!result)
3303   { free(clusterid);
3304     return NULL;
3305   }
3306 
3307   /* Setup a list specifying to which cluster a gene belongs */
3308   for (j = 0; j < nelements; j++) clusterid[j] = j;
3309 
3310   for (n = nelements; n > 1; n--)
3311   { int is = 1;
3312     int js = 0;
3313     result[nelements-n].distance = find_closest_pair(n, distmatrix, &is, &js);
3314 
3315     /* Fix the distances */
3316     for (j = 0; j < js; j++)
3317       distmatrix[js][j] = max(distmatrix[is][j],distmatrix[js][j]);
3318     for (j = js+1; j < is; j++)
3319       distmatrix[j][js] = max(distmatrix[is][j],distmatrix[j][js]);
3320     for (j = is+1; j < n; j++)
3321       distmatrix[j][js] = max(distmatrix[j][is],distmatrix[j][js]);
3322 
3323     for (j = 0; j < is; j++) distmatrix[is][j] = distmatrix[n-1][j];
3324     for (j = is+1; j < n-1; j++) distmatrix[j][is] = distmatrix[n-1][j];
3325 
3326     /* Update clusterids */
3327     result[nelements-n].left = clusterid[is];
3328     result[nelements-n].right = clusterid[js];
3329     clusterid[js] = n-nelements-1;
3330     clusterid[is] = clusterid[n-1];
3331   }
3332   free(clusterid);
3333 
3334   return result;
3335 }
3336 
3337 /* ******************************************************************* */
3338 
palcluster(int nelements,float ** distmatrix)3339 static Node* palcluster (int nelements, float** distmatrix)
3340 /*
3341 Purpose
3342 =======
3343 
3344 The palcluster routine performs clustering using pairwise average
3345 linking on the given distance matrix.
3346 
3347 Arguments
3348 =========
3349 
3350 nelements     (input) int
3351 The number of elements to be clustered.
3352 
3353 distmatrix (input) float**
3354 The distance matrix, with nelements rows, each row being filled up to the
3355 diagonal. The elements on the diagonal are not used, as they are assumed to be
3356 zero. The distance matrix will be modified by this routine.
3357 
3358 Return value
3359 ============
3360 
3361 A pointer to a newly allocated array of Node structs, describing the
3362 hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3363 whether genes (rows) or microarrays (columns) were clustered, nelements is
3364 equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3365 structure.
3366 If a memory error occurs, palcluster returns NULL.
3367 ========================================================================
3368 */
3369 { int j;
3370   int n;
3371   int* clusterid;
3372   int* number;
3373   Node* result;
3374 
3375   clusterid = malloc(nelements*sizeof(int));
3376   if(!clusterid) return NULL;
3377   number = malloc(nelements*sizeof(int));
3378   if(!number)
3379   { free(clusterid);
3380     return NULL;
3381   }
3382   result = malloc((nelements-1)*sizeof(Node));
3383   if (!result)
3384   { free(clusterid);
3385     free(number);
3386     return NULL;
3387   }
3388 
3389   /* Setup a list specifying to which cluster a gene belongs, and keep track
3390    * of the number of elements in each cluster (needed to calculate the
3391    * average). */
3392   for (j = 0; j < nelements; j++)
3393   { number[j] = 1;
3394     clusterid[j] = j;
3395   }
3396 
3397   for (n = nelements; n > 1; n--)
3398   { int sum;
3399     int is = 1;
3400     int js = 0;
3401     result[nelements-n].distance = find_closest_pair(n, distmatrix, &is, &js);
3402 
3403     /* Save result */
3404     result[nelements-n].left = clusterid[is];
3405     result[nelements-n].right = clusterid[js];
3406 
3407     /* Fix the distances */
3408     sum = number[is] + number[js];
3409     for (j = 0; j < js; j++)
3410     { distmatrix[js][j] = distmatrix[is][j]*number[is]
3411                         + distmatrix[js][j]*number[js];
3412       distmatrix[js][j] /= sum;
3413     }
3414     for (j = js+1; j < is; j++)
3415     { distmatrix[j][js] = distmatrix[is][j]*number[is]
3416                         + distmatrix[j][js]*number[js];
3417       distmatrix[j][js] /= sum;
3418     }
3419     for (j = is+1; j < n; j++)
3420     { distmatrix[j][js] = distmatrix[j][is]*number[is]
3421                         + distmatrix[j][js]*number[js];
3422       distmatrix[j][js] /= sum;
3423     }
3424 
3425     for (j = 0; j < is; j++) distmatrix[is][j] = distmatrix[n-1][j];
3426     for (j = is+1; j < n-1; j++) distmatrix[j][is] = distmatrix[n-1][j];
3427 
3428     /* Update number of elements in the clusters */
3429     number[js] = sum;
3430     number[is] = number[n-1];
3431 
3432     /* Update clusterids */
3433     clusterid[js] = n-nelements-1;
3434     clusterid[is] = clusterid[n-1];
3435   }
3436   free(clusterid);
3437   free(number);
3438 
3439   return result;
3440 }
3441 
3442 /* ******************************************************************* */
3443 
treecluster(int nrows,int ncolumns,float ** data,float weight[],int transpose,char dist,char method,float ** distmatrix)3444 Node* CALL treecluster (int nrows, int ncolumns, float** data,
3445   float weight[], int transpose, char dist, char method, float** distmatrix)
3446 /*
3447 Purpose
3448 =======
3449 
3450 The treecluster routine performs hierarchical clustering using pairwise
3451 single-, maximum-, centroid-, or average-linkage, as defined by method, on a
3452 given set of gene expression data, using the distance metric given by dist.
3453 If successful, the function returns a pointer to a newly allocated Tree struct
3454 containing the hierarchical clustering solution, and NULL if a memory error
3455 occurs. The pointer should be freed by the calling routine to prevent memory
3456 leaks.
3457 
3458 Arguments
3459 =========
3460 
3461 nrows     (input) int
3462 The number of rows in the data matrix, equal to the number of genes.
3463 
3464 ncolumns  (input) int
3465 The number of columns in the data matrix, equal to the number of microarrays.
3466 
3467 data       (input) float[nrows][ncolumns]
3468 The array containing the data of the vectors to be clustered.
3469 
3470 mask       (input) int[nrows][ncolumns]
3471 This array shows which data values are missing. If mask[i][j]==0, then
3472 data[i][j] is missing.
3473 
3474 weight (input) float array[n]
3475 The weights that are used to calculate the distance.
3476 
3477 transpose  (input) int
3478 If transpose==0, the rows of the matrix are clustered. Otherwise, columns
3479 of the matrix are clustered.
3480 
3481 dist       (input) char
3482 Defines which distance measure is used, as given by the table:
3483 dist=='e': Euclidean distance
3484 dist=='b': City-block distance
3485 dist=='c': correlation
3486 dist=='a': absolute value of the correlation
3487 dist=='u': uncentered correlation
3488 dist=='x': absolute uncentered correlation
3489 dist=='s': Spearman's rank correlation
3490 dist=='k': Kendall's tau
3491 For other values of dist, the default (Euclidean distance) is used.
3492 
3493 method     (input) char
3494 Defines which hierarchical clustering method is used:
3495 method=='s': pairwise single-linkage clustering
3496 method=='m': pairwise maximum- (or complete-) linkage clustering
3497 method=='a': pairwise average-linkage clustering
3498 method=='c': pairwise centroid-linkage clustering
3499 For the first three, either the distance matrix or the gene expression data is
3500 sufficient to perform the clustering algorithm. For pairwise centroid-linkage
3501 clustering, however, the gene expression data are always needed, even if the
3502 distance matrix itself is available.
3503 
3504 distmatrix (input) float**
3505 The distance matrix. If the distance matrix is zero initially, the distance
3506 matrix will be allocated and calculated from the data by treecluster, and
3507 deallocated before treecluster returns. If the distance matrix is passed by the
3508 calling routine, treecluster will modify the contents of the distance matrix as
3509 part of the clustering algorithm, but will not deallocate it. The calling
3510 routine should deallocate the distance matrix after the return from treecluster.
3511 
3512 Return value
3513 ============
3514 
3515 A pointer to a newly allocated array of Node structs, describing the
3516 hierarchical clustering solution consisting of nelements-1 nodes. Depending on
3517 whether genes (rows) or microarrays (columns) were clustered, nelements is
3518 equal to nrows or ncolumns. See src/cluster.h for a description of the Node
3519 structure.
3520 If a memory error occurs, treecluster returns NULL.
3521 
3522 ========================================================================
3523 */
3524 { Node* result = NULL;
3525   const int nelements = (transpose==0) ? nrows : ncolumns;
3526   const int ldistmatrix = (distmatrix==NULL && method!='s') ? 1 : 0;
3527 
3528   if (nelements < 2) return NULL;
3529 
3530   /* Calculate the distance matrix if the user didn't give it */
3531   if(ldistmatrix)
3532   { distmatrix =
3533       distancematrix(nrows, ncolumns, data, weight, dist, transpose);
3534     if (!distmatrix) return NULL; /* Insufficient memory */
3535   }
3536 
3537   switch(method)
3538   { case 's':
3539       result = pslcluster(nrows, ncolumns, data, weight, distmatrix,
3540                           dist, transpose);
3541       break;
3542     case 'm':
3543       result = pmlcluster(nelements, distmatrix);
3544       break;
3545     case 'a':
3546       result = palcluster(nelements, distmatrix);
3547       break;
3548     case 'c':
3549       result = pclcluster(nrows, ncolumns, data, weight, distmatrix,
3550                           dist, transpose);
3551       break;
3552   }
3553 
3554   /* Deallocate space for distance matrix, if it was allocated by treecluster */
3555   if(ldistmatrix)
3556   { int i;
3557     for (i = 1; i < nelements; i++) free(distmatrix[i]);
3558     free (distmatrix);
3559   }
3560 
3561   return result;
3562 }
3563 
3564 /* ******************************************************************* */
3565 
3566 
3567 
3568 /* ******************************************************************* */
3569 
3570 
3571 
3572 /* ******************************************************************* */
3573 
3574 
3575 
3576 /* ******************************************************************** */
3577 
clusterdistance(int nrows,int ncolumns,float ** data,float weight[],int n1,int n2,int index1[],int index2[],char dist,char method,int transpose)3578 float CALL clusterdistance (int nrows, int ncolumns, float** data,
3579    float weight[], int n1, int n2, int index1[], int index2[],
3580   char dist, char method, int transpose)
3581 
3582 /*
3583 Purpose
3584 =======
3585 
3586 The clusterdistance routine calculates the distance between two clusters
3587 containing genes or microarrays using the measured gene expression vectors. The
3588 distance between clusters, given the genes/microarrays in each cluster, can be
3589 defined in several ways. Several distance measures can be used.
3590 
3591 The routine returns the distance in float precision.
3592 If the parameter transpose is set to a nonzero value, the clusters are
3593 interpreted as clusters of microarrays, otherwise as clusters of gene.
3594 
3595 Arguments
3596 =========
3597 
3598 nrows     (input) int
3599 The number of rows (i.e., the number of genes) in the gene expression data
3600 matrix.
3601 
3602 ncolumns      (input) int
3603 The number of columns (i.e., the number of microarrays) in the gene expression
3604 data matrix.
3605 
3606 data       (input) float[nrows][ncolumns]
3607 The array containing the data of the vectors.
3608 
3609 mask       (input) int[nrows][ncolumns]
3610 This array shows which data values are missing. If mask[i][j]==0, then
3611 data[i][j] is missing.
3612 
3613 weight     (input) float[ncolumns] if transpose==0;
3614                    float[nrows]    if transpose==1
3615 The weights that are used to calculate the distance.
3616 
3617 n1         (input) int
3618 The number of elements in the first cluster.
3619 
3620 n2         (input) int
3621 The number of elements in the second cluster.
3622 
3623 index1     (input) int[n1]
3624 Identifies which genes/microarrays belong to the first cluster.
3625 
3626 index2     (input) int[n2]
3627 Identifies which genes/microarrays belong to the second cluster.
3628 
3629 dist       (input) char
3630 Defines which distance measure is used, as given by the table:
3631 dist=='e': Euclidean distance
3632 dist=='b': City-block distance
3633 dist=='c': correlation
3634 dist=='a': absolute value of the correlation
3635 dist=='u': uncentered correlation
3636 dist=='x': absolute uncentered correlation
3637 dist=='s': Spearman's rank correlation
3638 dist=='k': Kendall's tau
3639 For other values of dist, the default (Euclidean distance) is used.
3640 
3641 method     (input) char
3642 Defines how the distance between two clusters is defined, given which genes
3643 belong to which cluster:
3644 method=='a': the distance between the arithmetic means of the two clusters
3645 method=='m': the distance between the medians of the two clusters
3646 method=='s': the smallest pairwise distance between members of the two clusters
3647 method=='x': the largest pairwise distance between members of the two clusters
3648 method=='v': average of the pairwise distances between members of the clusters
3649 
3650 transpose  (input) int
3651 If transpose is equal to zero, the distances between the rows is
3652 calculated. Otherwise, the distances between the columns is calculated.
3653 The former is needed when genes are being clustered; the latter is used
3654 when microarrays are being clustered.
3655 
3656 ========================================================================
3657 */
3658 { /* Set the metric function as indicated by dist */
3659   float (*metric)
3660     (int, float**, float**, const float[], int, int, int) =
3661        setmetric(dist);
3662 
3663   /* if one or both clusters are empty, return */
3664   if (n1 < 1 || n2 < 1) return -1.0;
3665   /* Check the indices */
3666   if (transpose==0)
3667   { int i;
3668     for (i = 0; i < n1; i++)
3669     { int index = index1[i];
3670       if (index < 0 || index >= nrows) return -1.0;
3671     }
3672     for (i = 0; i < n2; i++)
3673     { int index = index2[i];
3674       if (index < 0 || index >= nrows) return -1.0;
3675     }
3676   }
3677   else
3678   { int i;
3679     for (i = 0; i < n1; i++)
3680     { int index = index1[i];
3681       if (index < 0 || index >= ncolumns) return -1.0;
3682     }
3683     for (i = 0; i < n2; i++)
3684     { int index = index2[i];
3685       if (index < 0 || index >= ncolumns) return -1.0;
3686     }
3687   }
3688 
3689   switch (method)
3690   { case 'a':
3691     { /* Find the center */
3692       int i,j,k;
3693       if (transpose==0)
3694       { float distance;
3695         float* cdata[2];
3696 
3697         int* count[2];
3698         count[0] = calloc(ncolumns,sizeof(int));
3699         count[1] = calloc(ncolumns,sizeof(int));
3700         cdata[0] = calloc(ncolumns,sizeof(float));
3701         cdata[1] = calloc(ncolumns,sizeof(float));
3702 
3703         for (i = 0; i < n1; i++)
3704         { k = index1[i];
3705           for (j = 0; j < ncolumns; j++)
3706 	    { cdata[0][j] = cdata[0][j] + data[k][j];
3707               count[0][j] = count[0][j] + 1;
3708             }
3709         }
3710         for (i = 0; i < n2; i++)
3711         { k = index2[i];
3712           for (j = 0; j < ncolumns; j++)
3713 	    { cdata[1][j] = cdata[1][j] + data[k][j];
3714               count[1][j] = count[1][j] + 1;
3715             }
3716         }
3717         for (i = 0; i < 2; i++)
3718           for (j = 0; j < ncolumns; j++)
3719           { if (count[i][j]>0)
3720             { cdata[i][j] = cdata[i][j] / count[i][j];
3721 	    }
3722 
3723           }
3724         distance =
3725           metric (ncolumns,cdata,cdata,weight,0,1,0);
3726         for (i = 0; i < 2; i++)
3727         { free (cdata[i]);
3728 
3729           free (count[i]);
3730         }
3731         return distance;
3732       }
3733       else
3734       { float distance;
3735         int** count = malloc(nrows*sizeof(int*));
3736         float** cdata = malloc(nrows*sizeof(float*));
3737 
3738         for (i = 0; i < nrows; i++)
3739         { count[i] = calloc(2,sizeof(int));
3740           cdata[i] = calloc(2,sizeof(float));
3741 
3742         }
3743         for (i = 0; i < n1; i++)
3744         { k = index1[i];
3745           for (j = 0; j < nrows; j++)
3746 	    {  cdata[j][0] = cdata[j][0] + data[j][k];
3747               count[j][0] = count[j][0] + 1;
3748 
3749 	    }
3750         }
3751         for (i = 0; i < n2; i++)
3752         { k = index2[i];
3753           for (j = 0; j < nrows; j++)
3754 	    {  cdata[j][1] = cdata[j][1] + data[j][k];
3755               count[j][1] = count[j][1] + 1;
3756 
3757 	    }
3758         }
3759         for (i = 0; i < nrows; i++)
3760           for (j = 0; j < 2; j++)
3761             if (count[i][j]>0)
3762             { cdata[i][j] = cdata[i][j] / count[i][j];
3763 
3764             }
3765 
3766         distance = metric (nrows,cdata,cdata,weight,0,1,1);
3767         for (i = 0; i < nrows; i++)
3768         { free (count[i]);
3769           free (cdata[i]);
3770 
3771         }
3772         free (count);
3773         free (cdata);
3774 
3775         return distance;
3776       }
3777     }
3778     case 'm':
3779     { int i, j, k;
3780       if (transpose==0)
3781       { float distance;
3782         float* temp = malloc(nrows*sizeof(float));
3783         float* cdata[2];
3784 
3785         for (i = 0; i < 2; i++)
3786         { cdata[i] = malloc(ncolumns*sizeof(float));
3787 
3788         }
3789         for (j = 0; j < ncolumns; j++)
3790         { int count = 0;
3791           for (k = 0; k < n1; k++)
3792           { i = index1[k];
3793 	    temp[count] = data[i][j];
3794 	    count++;
3795 
3796           }
3797           if (count>0)
3798           { cdata[0][j] = median (count,temp);
3799 
3800           }
3801           else
3802           { cdata[0][j] = 0.;
3803 
3804           }
3805         }
3806         for (j = 0; j < ncolumns; j++)
3807         { int count = 0;
3808           for (k = 0; k < n2; k++)
3809           { i = index2[k];
3810             temp[count] = data[i][j];
3811               count++;
3812 
3813           }
3814           if (count>0)
3815           { cdata[1][j] = median (count,temp);
3816 
3817           }
3818           else
3819           { cdata[1][j] = 0.;
3820 
3821           }
3822         }
3823         distance = metric (ncolumns,cdata,cdata,weight,0,1,0);
3824         for (i = 0; i < 2; i++)
3825         { free (cdata[i]);
3826 
3827         }
3828         free(temp);
3829         return distance;
3830       }
3831       else
3832       { float distance;
3833         float* temp = malloc(ncolumns*sizeof(float));
3834         float** cdata = malloc(nrows*sizeof(float*));
3835 
3836         for (i = 0; i < nrows; i++)
3837         { cdata[i] = malloc(2*sizeof(float));
3838 
3839         }
3840         for (j = 0; j < nrows; j++)
3841         { int count = 0;
3842           for (k = 0; k < n1; k++)
3843           { i = index1[k];
3844 	    temp[count] = data[j][i];
3845 	    count++;
3846 
3847           }
3848           if (count>0)
3849           { cdata[j][0] = median (count,temp);
3850 
3851           }
3852           else
3853           { cdata[j][0] = 0.;
3854 
3855           }
3856         }
3857         for (j = 0; j < nrows; j++)
3858         { int count = 0;
3859           for (k = 0; k < n2; k++)
3860           { i = index2[k];
3861 
3862              temp[count] = data[j][i];
3863              count++;
3864 
3865           }
3866           if (count>0)
3867           { cdata[j][1] = median (count,temp);
3868 
3869           }
3870           else
3871           { cdata[j][1] = 0.;
3872 
3873           }
3874         }
3875         distance = metric (nrows,cdata,cdata,weight,0,1,1);
3876         for (i = 0; i < nrows; i++)
3877         { free (cdata[i]);
3878 
3879         }
3880         free(cdata);
3881 
3882         free(temp);
3883         return distance;
3884       }
3885     }
3886     case 's':
3887     { int i1, i2, j1, j2;
3888       const int n = (transpose==0) ? ncolumns : nrows;
3889       float mindistance = FLT_MAX;
3890       for (i1 = 0; i1 < n1; i1++)
3891         for (i2 = 0; i2 < n2; i2++)
3892         { float distance;
3893           j1 = index1[i1];
3894           j2 = index2[i2];
3895           distance = metric (n,data,data,weight,j1,j2,transpose);
3896           if (distance < mindistance) mindistance = distance;
3897         }
3898       return mindistance;
3899     }
3900     case 'x':
3901     { int i1, i2, j1, j2;
3902       const int n = (transpose==0) ? ncolumns : nrows;
3903       float maxdistance = 0;
3904       for (i1 = 0; i1 < n1; i1++)
3905         for (i2 = 0; i2 < n2; i2++)
3906         { float distance;
3907           j1 = index1[i1];
3908           j2 = index2[i2];
3909           distance = metric (n,data,data,weight,j1,j2,transpose);
3910           if (distance > maxdistance) maxdistance = distance;
3911         }
3912       return maxdistance;
3913     }
3914     case 'v':
3915     { int i1, i2, j1, j2;
3916       const int n = (transpose==0) ? ncolumns : nrows;
3917       float distance = 0;
3918       for (i1 = 0; i1 < n1; i1++)
3919         for (i2 = 0; i2 < n2; i2++)
3920         { j1 = index1[i1];
3921           j2 = index2[i2];
3922           distance += metric (n,data,data,weight,j1,j2,transpose);
3923         }
3924       distance /= (n1*n2);
3925       return distance;
3926     }
3927   }
3928   /* Never get here */
3929   return -2.0;
3930 }
3931